def process_data(inputs_data):
    # Tokenize each raw line into a word list (stop words removed, no stemming).
    x_test = []
    for line in inputs_data:
        line_list = datahelper.text_to_wordlist(
            line, remove_stop_words=True, stem_words=False).split(" ")
        x_test.append(line_list)
    return x_test
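# Usage sketch (illustrative, not part of the original file): process_data
# takes an iterable of raw strings and returns one token list per string.
# The sample sentence and its output are hypothetical; the actual tokens
# depend on datahelper's stop-word list.
#
#     tokens = process_data(["How do I learn Python?"])
#     # -> e.g. [["learn", "python"]]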
def text_to_wordlist(text, remove_stop_words=True, stem_words=False, lemma=True):
    # Thin wrapper around datahelper.text_to_wordlist. Forward the caller's
    # options instead of hard-coding them; `lemma` is accepted in the
    # signature but not used by the underlying helper.
    text = datahelper.text_to_wordlist(
        text, remove_stop_words=remove_stop_words, stem_words=stem_words)
    return text
def process():
    # Load the paired English/Spanish training texts and their labels.
    x_text1, x_text2, y_train = datahelper.load_data(FLAGS.en_train, FLAGS.sp_train)
    x_text = np.concatenate([x_text1, x_text2], axis=0)

    # Build the vocabulary over both sides and load pretrained embeddings.
    word2index, index2word = datahelper.create_vocabulary(x_text)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding()
    max_len = max([len(x.split(" ")) for x in x_text])

    # Map each tokenized sentence to vocabulary indices, falling back to
    # UNK_ID for out-of-vocabulary words.
    x_text1_int = []
    x_text2_int = []
    for line in x_text1:
        line_list = datahelper.text_to_wordlist(line).split(" ")
        x_text1_int.append([word2index.get(x, UNK_ID) for x in line_list])
    for line in x_text2:
        line_list = datahelper.text_to_wordlist(line).split(" ")
        x_text2_int.append([word2index.get(x, UNK_ID) for x in line_list])

    # Pad every sequence to the longest sentence length.
    x_train1 = pad_sequences(x_text1_int, max_len)
    x_train2 = pad_sequences(x_text2_int, max_len)

    # Shuffle with a fixed seed for reproducibility.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]

    # Hold out the last dev_sample_percentage of the data as the dev set.
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = x_shuffled1[:dev_sample_index], x_shuffled1[dev_sample_index:]
    x_train2, x_dev2 = x_shuffled2[:dev_sample_index], x_shuffled2[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x_text1, x_text2, x_text1_int, x_text2_int
    return x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev, word_embedding, max_len, vocab_size
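# A minimal driver sketch (assumes FLAGS, datahelper, UNK_ID, and
# pad_sequences are set up as elsewhere in this repo; none of this is from
# the original file):
if __name__ == "__main__":
    (x_train1, x_dev1, x_train2, x_dev2,
     y_train, y_dev, word_embedding, max_len, vocab_size) = process()
    print("train pairs: %d  dev pairs: %d  max_len: %d  vocab: %d"
          % (len(y_train), len(y_dev), max_len, vocab_size))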
def process_data(inputs_data):
    d2c_list = []
    for line in inputs_data:
        # line_list = [x for x in line if x not in stop_word_list]
        line = datahelper.text_to_wordlist(
            line, remove_stop_words=True, stem_words=False).split(" ")
        d2c_list.append(line)
    return d2c_list
def process():
    x_train1, x_train2, _ = datahelper.load_data(filepath_en_train, filepath_sp_train)
    # stop_word = list(open(file_stop_word, "r", encoding='UTF-8').readlines())
    # stop_word_list = [
    #     line.replace("\n", "").replace(",", "").replace(".", "").replace("?", "")
    #         .replace("¿", "").replace("!", "").replace("¡", "").lower()
    #     for line in stop_word]
    train_data = np.concatenate([x_train1, x_train2], axis=0)

    # Tokenize every sentence for Doc2Vec training.
    d2c_list = []
    for line in train_data:
        # line_list = [x for x in line if x not in stop_word_list]
        line = datahelper.text_to_wordlist(
            line, remove_stop_words=True, stem_words=False).split(" ")
        d2c_list.append(line)

    # Wrap each sentence as a tagged document ("SENT_<id>") for gensim.
    alldocuments = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for id, record in enumerate(d2c_list):
        qid = str('SENT_%s' % id)
        words_text = " ".join(record)
        words = gensim.utils.simple_preprocess(words_text)
        tags = [qid]
        alldocuments.append(analyzedDocument(words, tags))

    print("Start Training Doc2Vec Time : %s" % (str(datetime.datetime.now())))
    saved_model_name = "doc_2_vec_" + str(int(time.time()))
    # PV-DM with concatenated context windows, 300-dimensional vectors.
    model_4 = gensim.models.Doc2Vec(alldocuments, dm=1, dm_concat=1,
                                    vector_size=300, window=5,
                                    min_count=2, epochs=100)
    model_4.save("%s" % saved_model_name)
    print("model training completed : %s" % saved_model_name)
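# A sketch of querying the trained model (assumes gensim 4.x, where document
# vectors live under model.dv; the filename below is a hypothetical example
# of the timestamped name saved by process(), and the query sentence is made
# up for illustration):
if __name__ == "__main__":
    model = gensim.models.Doc2Vec.load("doc_2_vec_1500000000")  # hypothetical path
    tokens = gensim.utils.simple_preprocess("how do i learn python quickly")
    vector = model.infer_vector(tokens)               # 300-dim document vector
    print(model.dv.most_similar([vector], topn=5))    # nearest SENT_<id> tags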