def preprocess():
    # Load data
    print("Loading data...")
    x_text, t_text = dh.load_data(FLAGS.source_train_data,
                                  FLAGS.target_train_data)
    xv_text, tv_text = dh.load_data(FLAGS.source_dev_data,
                                    FLAGS.target_dev_data)

    print("Build vocabulary...")
    source_word_id_dict, _ = dh.buildVocab(
        x_text, FLAGS.source_vocab_size)  # apple:1017 <--> 1017:apple
    FLAGS.source_vocab_size = len(source_word_id_dict) + 4
    target_word_id_dict, _ = dh.buildVocab(t_text, FLAGS.target_vocab_size)
    # print(target_word_id_dict)
    FLAGS.target_vocab_size = len(target_word_id_dict) + 4
    print("Source language vocabulary size: ", FLAGS.source_vocab_size)
    print("Target language vocabulary size: ", FLAGS.target_vocab_size)
    print("Average length (source): ",
          sum([len(x.split(" ")) for x in x_text]) / len(x_text))
    print("Average length (target): ",
          sum([len(t.split(" ")) for t in t_text]) / len(t_text))
    # FLAGS.max_length = 2 + max([len(x.split(" ")) for x in x_text]
    #                            + [len(t.split(" ")) for t in t_text])

    # Shift every id by 4 to make room for the special tokens.
    for word in source_word_id_dict.keys():
        source_word_id_dict[word] += 4  # 0: <pad>, 1: <s>, 2: <eos>, 3: <unk>
    source_word_id_dict['<pad>'] = 0
    source_word_id_dict['<s>'] = 1
    source_word_id_dict['<eos>'] = 2
    source_word_id_dict['<unk>'] = 3  # Gooooooooood!!!!!! -> <unk>
    for word in target_word_id_dict.keys():
        target_word_id_dict[word] += 4
    target_word_id_dict['<pad>'] = 0
    target_word_id_dict['<s>'] = 1
    target_word_id_dict['<eos>'] = 2
    target_word_id_dict['<unk>'] = 3

    # Randomly shuffle the parallel data (fixed seed for reproducibility).
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(x_text)))
    x_text = x_text[shuffle_indices]
    t_text = t_text[shuffle_indices]

    x, x_lengths = dh.text_to_index(x_text, source_word_id_dict)
    t, t_lengths = dh.text_to_index_ko(t_text, target_word_id_dict)
    xv, xv_lengths = dh.text_to_index(xv_text, source_word_id_dict)
    tv, tv_lengths = dh.text_to_index_ko(tv_text, target_word_id_dict)
    print("Train/Val split: {:d}/{:d}".format(len(x), len(xv)))
    return (x, t, x_lengths, t_lengths, xv, tv, xv_lengths, tv_lengths,
            source_word_id_dict, target_word_id_dict)
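
# All three preprocess() variants rely on dh.buildVocab, whose body is not
# shown in this file. Below is a minimal sketch of what it plausibly does,
# assuming it counts whitespace-separated tokens and keeps only the
# vocab_size most frequent words; ids start at 0, which is why every caller
# shifts them by 4 before adding the special tokens. The two returned dicts
# match the "apple:1017 <--> 1017:apple" comment above.
from collections import Counter

def buildVocab(sentences, vocab_size):
    counts = Counter(w for s in sentences for w in s.split(" "))
    most_common = counts.most_common(vocab_size)
    word_id_dict = {w: i for i, (w, _) in enumerate(most_common)}  # apple -> 1017
    id_word_dict = {i: w for w, i in word_id_dict.items()}         # 1017 -> apple
    return word_id_dict, id_word_dict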
def preprocess():
    # Load data
    print("Loading data...")
    if FLAGS.task == "MR":
        x_text, y = dh.load_mr_data(FLAGS.mr_train_file_pos,
                                    FLAGS.mr_train_file_neg)
    elif FLAGS.task == "TREC":
        x_text, y = dh.load_trec_data(
            FLAGS.trec_train_file)  # TODO: implement TREC data preprocessing

    # Build vocabulary from the training corpus
    word_id_dict, _ = dh.buildVocab(x_text, FLAGS.vocab_size)
    FLAGS.vocab_size = len(word_id_dict) + 4  # e.g., 30000 + 4 special tokens
    print("vocabulary size: ", FLAGS.vocab_size)

    # Shift every id by 4 to make room for the special tokens (a: 0 -> 4).
    for word in word_id_dict.keys():
        word_id_dict[word] += 4
    word_id_dict['<pad>'] = 0  # token used for zero padding
    word_id_dict['<unk>'] = 1  # token for OOV words
    word_id_dict['<s>'] = 2    # start-of-sentence token
    word_id_dict['</s>'] = 3   # end-of-sentence token

    # Pad by (max filter size - 1) so every convolution filter fits,
    # e.g., filter_sizes "3,4,5" -> pad by 5 - 1.
    x = dh.text_to_index(
        x_text, word_id_dict,
        max(list(map(int, FLAGS.filter_sizes.split(",")))) - 1)
    x, FLAGS.max_length = dh.train_tensor(x)  # batch padded to the max sentence length

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    FLAGS.num_classes = y_train.shape[1]  # number of classes from y's shape, e.g., 2 ([0, 1])
    del x, x_text, y, x_shuffled, y_shuffled

    print(x_train)
    print(y_train)
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, word_id_dict, x_dev, y_dev
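
# A sketch of the three-argument dh.text_to_index used by the CNN variant
# above. Its body is not shown here; the assumption is that the third
# argument is the number of extra <pad> ids appended so that the widest
# filter always fits the sentence, and that out-of-vocabulary words map
# to <unk>.
def text_to_index(sentences, word_id_dict, extra_pad):
    unk = word_id_dict['<unk>']
    pad = word_id_dict['<pad>']
    indexed = []
    for s in sentences:
        ids = [word_id_dict.get(w, unk) for w in s.split(" ")]
        indexed.append(ids + [pad] * extra_pad)
    return indexed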
def preprocess():
    # Data Preparation
    # Load data
    print("Loading data...")
    x_text, y, lengths = dh.load_data(FLAGS.x_train_file, FLAGS.t_train_file)

    print("Build vocabulary...")
    word_id_dict, _ = dh.buildVocab(x_text, FLAGS.vocab_size)
    # print(word_id_dict)
    FLAGS.vocab_size = len(word_id_dict) + 4
    print("Vocabulary size: {:d}".format(FLAGS.vocab_size))

    # Shift every id by 4 to make room for the special tokens.
    for word in word_id_dict.keys():
        word_id_dict[word] += 4  # 0: <pad>, 1: <unk>, 2: <s>, 3: </s>
    word_id_dict['<pad>'] = 0
    word_id_dict['<unk>'] = 1
    word_id_dict['<s>'] = 2
    word_id_dict['</s>'] = 3

    # Randomly shuffle data (fixed seed for reproducibility).
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_text = x_text[shuffle_indices]

    print("Split train/validation set...")
    val_sample_index = -1 * int(FLAGS.val_sample_percentage * float(len(y)))
    x_train, x_val = x_text[:val_sample_index], x_text[val_sample_index:]
    x_train = dh.text_to_index(x_train, word_id_dict, 0)
    x_val = dh.text_to_index(x_val, word_id_dict, 0)

    FLAGS.num_classes = y.shape[1]
    y = y[shuffle_indices]
    lengths = lengths[shuffle_indices]
    y_train, y_val = y[:val_sample_index], y[val_sample_index:]
    lengths, lengths_val = lengths[:val_sample_index], lengths[val_sample_index:]

    print("Train/Val split: {:d}/{:d}".format(len(y_train), len(y_val)))
    return x_train, y_train, lengths, word_id_dict, x_val, y_val, lengths_val
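
# Hypothetical usage sketch for the RNN-classification preprocess() above,
# showing how its outputs would typically be consumed. The train() wrapper,
# batch_size value, and batching loop are illustrative assumptions, not part
# of this file; the sequence lengths are carried alongside each batch because
# a dynamic RNN needs them.
def train():
    (x_train, y_train, lengths, word_id_dict,
     x_val, y_val, lengths_val) = preprocess()
    batch_size = 64  # assumed hyperparameter
    for start in range(0, len(y_train), batch_size):
        x_batch = x_train[start:start + batch_size]
        y_batch = y_train[start:start + batch_size]
        len_batch = lengths[start:start + batch_size]
        # Feed x_batch / y_batch / len_batch to the model's train step here,
        # e.g., sess.run(train_op, feed_dict={...}) in a TF1-style setup.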