Beispiel #1
0
def preprocess():
    """Load the training corpus, index it, and split it into train/dev sets.

    Configuration is read from the module-level ``FLAGS``. As a side effect
    this function overwrites ``FLAGS.vocab_size``, ``FLAGS.max_length`` and
    ``FLAGS.num_classes`` with values derived from the loaded data.

    Returns:
        A tuple ``(x_train, y_train, word_id_dict, x_dev, y_dev)`` where
        ``word_id_dict`` maps each token (including the four special tokens)
        to its integer id.

    Raises:
        ValueError: if ``FLAGS.task`` is neither ``"MR"`` nor ``"TREC"``.
    """
    # Load data
    print("Loading data...")
    if FLAGS.task == "MR":
        x_text, y = dh.load_mr_data(FLAGS.mr_train_file_pos,
                                    FLAGS.mr_train_file_neg)
    elif FLAGS.task == "TREC":
        x_text, y = dh.load_trec_data(FLAGS.trec_train_file)
    else:
        # Fail here with a clear message instead of a NameError on x_text
        # further down.
        raise ValueError("Unsupported task: {!r}".format(FLAGS.task))

    # Build the vocabulary from the training corpus.
    word_id_dict, _ = dh.buildVocab(x_text, FLAGS.vocab_size)
    # Four ids are reserved for the special tokens added below.
    FLAGS.vocab_size = len(word_id_dict) + 4
    print("vocabulary size: ", FLAGS.vocab_size)

    # Shift every regular word id by 4 to make room for the special tokens
    # (e.g. a word that had id 0 now has id 4). Values are updated in place;
    # no keys are added or removed while iterating.
    for word in word_id_dict:
        word_id_dict[word] += 4
    word_id_dict['<pad>'] = 0  # token used for zero padding
    word_id_dict['<unk>'] = 1  # token for out-of-vocabulary words
    word_id_dict['<s>'] = 2  # start-of-sentence token
    word_id_dict['</s>'] = 3  # end-of-sentence token

    # The third argument is (largest filter size - 1), e.g. "3,4,5" -> 4.
    # NOTE(review): presumably the amount of padding per sentence -- confirm
    # against dh.text_to_index.
    max_filter_size = max(int(size) for size in FLAGS.filter_sizes.split(","))
    x = dh.text_to_index(x_text, word_id_dict, max_filter_size - 1)
    # Build the training tensor; train_tensor also reports the max length.
    x, FLAGS.max_length = dh.train_tensor(x)

    # Randomly shuffle data (fixed seed so the split is reproducible).
    # NOTE(review): fancy indexing below assumes x and y are numpy arrays.
    num_samples = len(y)
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(num_samples))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set.
    # Use an explicit non-negative split point: slicing with a negative index
    # breaks when the dev count rounds down to 0, because x[:-0] is empty and
    # x[-0:] is the whole array (everything would land in the dev set).
    num_dev = int(FLAGS.dev_sample_percentage * float(num_samples))
    num_dev = min(max(num_dev, 0), num_samples)
    split_at = num_samples - num_dev
    x_train, x_dev = x_shuffled[:split_at], x_shuffled[split_at:]
    y_train, y_dev = y_shuffled[:split_at], y_shuffled[split_at:]

    # Number of classes is taken from the second dimension of the labels
    # (e.g. 2 for rows like [0, 1]).
    FLAGS.num_classes = y_train.shape[1]

    # Drop references to the intermediate copies.
    del x, x_text, y, x_shuffled, y_shuffled
    print(x_train)
    print(y_train)

    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, word_id_dict, x_dev, y_dev
Beispiel #2
0
tf.flags.DEFINE_string("dir", "./runs/text_cnn_nonstatic_MR",
                       "Checkpoint directory from training run")

# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS

# Load the test split for the selected task.
if FLAGS.task == "MR":
    x_raw, y_test = dh.load_mr_data(FLAGS.mr_test_file_pos,
                                    FLAGS.mr_test_file_neg)
elif FLAGS.task == "TREC":
    x_raw, y_test = dh.load_trec_data(FLAGS.trec_test_file)
else:
    # Fail with a clear message instead of a NameError on y_test below.
    raise ValueError("Unsupported task: {!r}".format(FLAGS.task))
# Collapse each label row to the index of its largest entry,
# e.g. [0, 0, 0, 1] -> 3.
y_test = np.argmax(y_test, axis=1)

# Map data into vocabulary, using the vocabulary and config stored in the
# checkpoint directory of the training run.
# NOTE(security): pickle.load can execute arbitrary code; only point
# FLAGS.dir at a directory whose contents you trust.
with smart_open.smart_open(os.path.join(FLAGS.dir, "vocab"), 'rb') as f:
    word_id_dict = pickle.load(f)
with smart_open.smart_open(os.path.join(FLAGS.dir, "config"), 'rb') as f:
    config = pickle.load(f)

# Index the test text with the same (largest filter size - 1) argument and
# the max length saved in the training config. Done after the files are
# closed; nothing here needs the open handle.
max_filter_size = max(int(size) for size in config["filter_sizes"].split(","))
x_test = dh.text_to_index(x_raw, word_id_dict, max_filter_size - 1)
x_test = dh.test_tensor(x_test, config["max_length"])

print("\nEvaluating...\n")