def process():
    """Load the bilingual pair data, index and pad it, then build a
    reproducible shuffled train/dev split plus the padded test pairs.

    Returns:
        (x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev,
         word_embedding, max_len, vocab_size, x_test1, x_test2)
    """
    # NOTE(review): another function named `process` exists in this module
    # (the stop-word-filtering variant); whichever is defined later shadows
    # the other at import time — confirm which one callers should get.
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    # Longest whitespace-tokenized sentence in the training corpus; used as
    # the padding length for every sequence below.
    max_len = max(len(x.split(" ")) for x in x_train_reshape)
    test1, test2 = datahelper.load_testdata(filepath_test)

    def to_ids(lines):
        # Map each whitespace token to its vocabulary id (UNK_ID fallback).
        # Replaces four identical copy-pasted loops in the original.
        return [[word2index.get(tok, UNK_ID) for tok in line.split(" ")]
                for line in lines]

    x_train1 = pad_sequences(to_ids(x_text1), max_len)
    x_train2 = pad_sequences(to_ids(x_text2), max_len)
    x_test1 = pad_sequences(to_ids(test1), max_len)
    x_test2 = pad_sequences(to_ids(test2), max_len)

    # Fixed seed so the shuffle (and hence the train/dev split) is
    # reproducible across runs.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    # The last dev_sample_percentage of the shuffled data becomes the dev set.
    dev_sample_index = -1 * int(
        FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = (x_shuffled1[:dev_sample_index],
                        x_shuffled1[dev_sample_index:])
    x_train2, x_dev2 = (x_shuffled2[:dev_sample_index],
                        x_shuffled2[dev_sample_index:])
    y_train, y_dev = (y_shuffled[:dev_sample_index],
                      y_shuffled[dev_sample_index:])
    # Free the raw text early; the padded arrays are all that is needed now.
    del x_text1, x_text2
    # BUG FIX: the original returned the FULL shuffled arrays
    # (x_shuffled1, x_shuffled2, y_shuffled) alongside x_dev2 only,
    # discarding the train/dev split it had just computed and never
    # returning x_dev1 — a dev pair (x_dev1, x_dev2) is unusable without
    # both halves. Return the consistent split instead, matching the
    # sibling `process` variant in this module.
    return (x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev,
            word_embedding, max_len, vocab_size, x_test1, x_test2)
def process():
    """Load the bilingual pair data, remove stop words, index and pad it,
    then build a reproducible shuffled train/dev split.

    Returns:
        (x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev,
         word_embedding, max_len, vocab_size)
    """
    # NOTE(review): another function named `process` exists in this module
    # (the no-stop-word variant that also loads test data); whichever is
    # defined later shadows the other at import time — confirm intent.
    x_text1, x_text2, _, y_train, _, x_train_reshape = datahelper.load_data(
        FLAGS.en_train, FLAGS.sp_train)
    word2index, index2word = datahelper.create_vocabulary(x_train_reshape)
    vocab_size = len(index2word)
    word_embedding = datahelper.asign_pretrained_word_embedding(
        index2word, vocab_size, FLAGS.word2vec_model_path)
    # Longest whitespace-tokenized sentence in the training corpus; used as
    # the padding length for every sequence below.
    max_len = max(len(x.split(" ")) for x in x_train_reshape)

    # BUG FIX: the original opened the stop-word file without ever closing
    # it; a `with` block guarantees the handle is released. Each entry is
    # stripped of newline/punctuation and lowercased exactly as before.
    # A set (not a list) gives O(1) membership tests in the token loop.
    with open(FLAGS.stop_word, "r", encoding='UTF-8') as stop_file:
        stop_words = {
            line.replace("\n", "").replace(",", "").replace(".", "")
                .replace("?", "").replace("¿", "").replace("!", "")
                .replace("¡", "").lower()
            for line in stop_file
        }

    def to_ids(lines):
        # Tokenize, drop stop words, then map each remaining token to its
        # vocabulary id (UNK_ID fallback). Replaces two copy-pasted loops.
        return [[word2index.get(tok, UNK_ID)
                 for tok in line.split(" ") if tok not in stop_words]
                for line in lines]

    x_train1 = pad_sequences(to_ids(x_text1), max_len)
    x_train2 = pad_sequences(to_ids(x_text2), max_len)

    # Fixed seed so the shuffle (and hence the train/dev split) is
    # reproducible across runs.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y_train)))
    x_shuffled1 = x_train1[shuffle_indices]
    x_shuffled2 = x_train2[shuffle_indices]
    y_shuffled = y_train[shuffle_indices]
    # The last dev_sample_percentage of the shuffled data becomes the dev set.
    dev_sample_index = -1 * int(
        FLAGS.dev_sample_percentage * float(len(y_train)))
    x_train1, x_dev1 = (x_shuffled1[:dev_sample_index],
                        x_shuffled1[dev_sample_index:])
    x_train2, x_dev2 = (x_shuffled2[:dev_sample_index],
                        x_shuffled2[dev_sample_index:])
    y_train, y_dev = (y_shuffled[:dev_sample_index],
                      y_shuffled[dev_sample_index:])
    # Free the raw text early; the padded arrays are all that is needed now.
    del x_text1, x_text2
    return (x_train1, x_dev1, x_train2, x_dev2, y_train, y_dev,
            word_embedding, max_len, vocab_size)