import numpy as np
import tensorflow as tf

import data_helpers as dh  # project-local helper module (name assumed)

FLAGS = tf.flags.FLAGS


def preprocess():
    # Load data
    print("Loading data...")
    if FLAGS.task == "MR":
        x_text, y = dh.load_mr_data(FLAGS.mr_train_file_pos,
                                    FLAGS.mr_train_file_neg)
    elif FLAGS.task == "TREC":
        x_text, y = dh.load_trec_data(FLAGS.trec_train_file)

    # Build the vocabulary from the training corpus
    word_id_dict, _ = dh.buildVocab(x_text, FLAGS.vocab_size)
    FLAGS.vocab_size = len(word_id_dict) + 4  # e.g., 30000 + 4 special tokens
    print("vocabulary size: ", FLAGS.vocab_size)

    # Shift every word id by 4 to reserve slots for the special tokens
    # (e.g., 'a': 0 -> 4)
    for word in word_id_dict.keys():
        word_id_dict[word] += 4
    word_id_dict['<pad>'] = 0   # token used for zero padding
    word_id_dict['<unk>'] = 1   # token for out-of-vocabulary words
    word_id_dict['<s>'] = 2     # start-of-sentence token
    word_id_dict['</s>'] = 3    # end-of-sentence token

    # Convert text to index sequences; the pad width is the widest filter
    # size minus one, e.g., for filter sizes "3,4,5": max([3, 4, 5]) - 1 = 4
    x = dh.text_to_index(
        x_text, word_id_dict,
        max(list(map(int, FLAGS.filter_sizes.split(",")))) - 1)
    # Pad every sentence to the maximum sentence length for batching
    x, FLAGS.max_length = dh.train_tensor(x)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    # The number of classes is read off the one-hot label shape,
    # e.g., [0, 1] -> 2
    FLAGS.num_classes = y_train.shape[1]

    del x, x_text, y, x_shuffled, y_shuffled

    print(x_train)  # debug: inspect the prepared data
    print(y_train)
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, word_id_dict, x_dev, y_dev
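# ---------------------------------------------------------------------------
# A minimal sketch of what dh.text_to_index is assumed to do (the real helper
# lives in the data_helpers module): map each tokenized sentence to word ids,
# wrap it in <s>/</s>, fall back to <unk> for OOV words, and pad with <pad>
# by (widest filter size - 1) so convolution filters can still cover words at
# the sentence boundaries. Names and padding scheme here are assumptions for
# illustration, not the repository's actual implementation.
def _text_to_index_sketch(sentences, word_id_dict, pad_width):
    indexed = []
    for sent in sentences:
        ids = [word_id_dict['<s>']]
        ids += [word_id_dict.get(w, word_id_dict['<unk>'])
                for w in sent.split()]
        ids.append(word_id_dict['</s>'])
        # pad_width <pad> tokens on each side ("wide" convolution padding)
        pad = [word_id_dict['<pad>']] * pad_width
        indexed.append(pad + ids + pad)
    return indexed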
# Evaluation script: restores the vocabulary and config saved at training time
import os
import pickle

import numpy as np
import smart_open
import tensorflow as tf

import data_helpers as dh  # project-local helper module (name assumed)

tf.flags.DEFINE_string("task", "MR", "Choose the classification task")

# Eval Parameters
tf.flags.DEFINE_string("dir", "./runs/text_cnn_nonstatic_MR",
                       "Checkpoint directory from training run")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS

if FLAGS.task == "MR":
    x_raw, y_test = dh.load_mr_data(FLAGS.mr_test_file_pos,
                                    FLAGS.mr_test_file_neg)
elif FLAGS.task == "TREC":
    x_raw, y_test = dh.load_trec_data(FLAGS.trec_test_file)
# Collapse one-hot labels to class indices, e.g., [0, 0, 0, 1] -> 3
y_test = np.argmax(y_test, axis=1)

# Map data into the vocabulary saved during training
with smart_open.smart_open(os.path.join(FLAGS.dir, "vocab"), 'rb') as f:
    word_id_dict = pickle.load(f)
with smart_open.smart_open(os.path.join(FLAGS.dir, "config"), 'rb') as f:
    config = pickle.load(f)
x_test = dh.text_to_index(
    x_raw, word_id_dict,
    max(list(map(int, config["filter_sizes"].split(",")))) - 1)
# Pad/cut test sentences to the max length recorded at training time
x_test = dh.test_tensor(x_test, config["max_length"])
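# ---------------------------------------------------------------------------
# A minimal sketch of the evaluation step that would typically follow,
# assuming the trained model was saved with tf.train.Saver under
# FLAGS.dir/checkpoints and that its placeholders/ops carry the names used
# below ("input_x", "dropout_keep_prob", "output/predictions" are
# assumptions, not taken from this file).
checkpoint_file = tf.train.latest_checkpoint(
    os.path.join(FLAGS.dir, "checkpoints"))
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Restore the saved graph structure and trained weights
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        input_x = graph.get_operation_by_name("input_x").outputs[0]
        dropout_keep_prob = graph.get_operation_by_name(
            "dropout_keep_prob").outputs[0]
        predictions = graph.get_operation_by_name(
            "output/predictions").outputs[0]

        # Disable dropout at test time by feeding keep probability 1.0
        all_predictions = sess.run(
            predictions, {input_x: x_test, dropout_keep_prob: 1.0})

correct_predictions = float(np.sum(all_predictions == y_test))
print("Total number of test examples: {}".format(len(y_test)))
print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))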