Exemple #1
0
def preprocess():
    """Load the parallel train/dev corpora, build vocabularies and index them.

    Reads the data locations (``source_train_data``, ``target_train_data``,
    ``source_dev_data``, ``target_dev_data``) and the vocabulary size limits
    from ``FLAGS``. As a side effect, ``FLAGS.source_vocab_size`` and
    ``FLAGS.target_vocab_size`` are overwritten with the size of the built
    vocabulary plus the four reserved tokens.

    Returns:
        The 10-tuple ``(x, t, x_lengths, t_lengths, xv, tv, xv_lengths,
        tv_lengths, source_word_id_dict, target_word_id_dict)``. The first
        four entries come from the shuffled training pair, the next four
        from the dev pair, and the last two are the word -> id mappings
        including the reserved tokens.
    """
    # Reserved ids, in order: 0: <pad>, 1: <s>, 2: <eos>, 3: <unk>.
    reserved_tokens = ('<pad>', '<s>', '<eos>', '<unk>')
    n_reserved = len(reserved_tokens)

    print("Loading data...")
    train_src, train_tgt = dh.load_data(FLAGS.source_train_data,
                                        FLAGS.target_train_data)
    dev_src, dev_tgt = dh.load_data(FLAGS.source_dev_data,
                                    FLAGS.target_dev_data)

    print("Build vocabulary...")
    # Only the first value returned by buildVocab is used here; presumably it
    # is the word -> id mapping and the second its inverse -- confirm in dh.
    src_vocab, _ = dh.buildVocab(train_src, FLAGS.source_vocab_size)
    FLAGS.source_vocab_size = len(src_vocab) + n_reserved
    tgt_vocab, _ = dh.buildVocab(train_tgt, FLAGS.target_vocab_size)
    FLAGS.target_vocab_size = len(tgt_vocab) + n_reserved
    print("Source language vocabulary size: ", FLAGS.source_vocab_size)
    print("Target language vocabulary size: ", FLAGS.target_vocab_size)

    src_token_total = sum(len(sentence.split(" ")) for sentence in train_src)
    print("Average length (source): ", src_token_total / len(train_src))
    tgt_token_total = sum(len(sentence.split(" ")) for sentence in train_tgt)
    print("Average length (target): ", tgt_token_total / len(train_tgt))

    # Shift every existing id up to make room for the reserved tokens, then
    # register the reserved tokens themselves (source first, then target).
    for vocab in (src_vocab, tgt_vocab):
        for token in vocab:
            vocab[token] += n_reserved
        for reserved_id, token in enumerate(reserved_tokens):
            vocab[token] = reserved_id

    # Fixed seed so the training order is reproducible; the same permutation
    # is applied to both sides to keep sentence pairs aligned.
    np.random.seed(10)
    order = np.random.permutation(np.arange(len(train_src)))
    train_src = train_src[order]
    train_tgt = train_tgt[order]

    x, x_lengths = dh.text_to_index(train_src, src_vocab)
    t, t_lengths = dh.text_to_index_ko(train_tgt, tgt_vocab)

    xv, xv_lengths = dh.text_to_index(dev_src, src_vocab)
    tv, tv_lengths = dh.text_to_index_ko(dev_tgt, tgt_vocab)

    print("Train/Val split: {:d}/{:d}".format(len(x), len(xv)))
    return (x, t, x_lengths, t_lengths, xv, tv, xv_lengths, tv_lengths,
            src_vocab, tgt_vocab)
Exemple #2
0
def preprocess():
    """Load a classification corpus, index it and split it into train/dev.

    Reads ``FLAGS.task`` ("MR" or "TREC"), the matching data-file flags,
    ``FLAGS.vocab_size``, ``FLAGS.filter_sizes`` (comma-separated integers)
    and ``FLAGS.dev_sample_percentage``. As a side effect it overwrites
    ``FLAGS.vocab_size``, ``FLAGS.max_length`` and ``FLAGS.num_classes``.

    Returns:
        The tuple ``(x_train, y_train, word_id_dict, x_dev, y_dev)``.

    Raises:
        ValueError: if ``FLAGS.task`` is neither "MR" nor "TREC".
    """
    # Load data
    print("Loading data...")
    if FLAGS.task == "MR":
        x_text, y = dh.load_mr_data(FLAGS.mr_train_file_pos,
                                    FLAGS.mr_train_file_neg)
    elif FLAGS.task == "TREC":
        x_text, y = dh.load_trec_data(FLAGS.trec_train_file)
    else:
        # Fail here with a clear message instead of hitting an unbound
        # ``x_text`` further down.
        raise ValueError(
            "Unsupported task: {!r} (expected 'MR' or 'TREC')".format(
                FLAGS.task))

    # Build the vocabulary from the training corpus.
    word_id_dict, _ = dh.buildVocab(x_text, FLAGS.vocab_size)
    FLAGS.vocab_size = len(word_id_dict) + 4  # built words + 4 reserved ids
    print("vocabulary size: ", FLAGS.vocab_size)

    # Shift existing ids by 4 (e.g. a: 0 -> 4) to free ids 0-3.
    for word in word_id_dict:
        word_id_dict[word] += 4
    word_id_dict['<pad>'] = 0  # token used for zero padding
    word_id_dict['<unk>'] = 1  # token for out-of-vocabulary words
    word_id_dict['<s>'] = 2  # start-of-sentence token
    word_id_dict['</s>'] = 3  # end-of-sentence token

    # Third argument is (largest filter size - 1), e.g. max([3, 4, 5]) - 1;
    # presumably an amount of padding -- confirm against dh.text_to_index.
    largest_filter = max(int(size) for size in FLAGS.filter_sizes.split(","))
    x = dh.text_to_index(x_text, word_id_dict, largest_filter - 1)
    # Presumably pads the sentences to a common maximum length -- confirm
    # against dh.train_tensor.
    x, FLAGS.max_length = dh.train_tensor(x)

    # Randomly shuffle data (fixed seed for reproducibility).
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/dev set. Use an explicit non-negative split position: a
    # negative offset of -0 is just 0, which would leave the training set
    # empty and put every sample in dev whenever the dev count rounds to 0.
    total = len(y)
    dev_count = int(FLAGS.dev_sample_percentage * float(total))
    dev_count = min(max(dev_count, 0), total)
    split_at = total - dev_count
    x_train, x_dev = x_shuffled[:split_at], x_shuffled[split_at:]
    y_train, y_dev = y_shuffled[:split_at], y_shuffled[split_at:]

    # Number of classes is taken from the second dimension of the labels,
    # e.g. 2 for labels like [0, 1].
    FLAGS.num_classes = y_train.shape[1]

    del x, x_text, y, x_shuffled, y_shuffled
    print(x_train)
    print(y_train)

    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, word_id_dict, x_dev, y_dev
Exemple #3
0
def preprocess():
    """Load the corpus, build the vocabulary and split into train/validation.

    Reads ``FLAGS.x_train_file``, ``FLAGS.t_train_file``,
    ``FLAGS.vocab_size`` and ``FLAGS.val_sample_percentage``. As a side
    effect it overwrites ``FLAGS.vocab_size`` and ``FLAGS.num_classes``.

    Returns:
        The tuple ``(x_train, y_train, lengths, word_id_dict, x_val, y_val,
        lengths_val)`` where ``lengths`` and ``lengths_val`` are the train
        and validation slices of the third value returned by
        ``dh.load_data``.
    """
    # Data Preparation
    print("Loading data...")
    x_text, y, lengths = dh.load_data(FLAGS.x_train_file, FLAGS.t_train_file)

    print("Build vocabulary...")
    word_id_dict, _ = dh.buildVocab(x_text, FLAGS.vocab_size)
    FLAGS.vocab_size = len(word_id_dict) + 4  # built words + 4 reserved ids
    print("vocabulary size: ", FLAGS.vocab_size)

    # Shift existing ids by 4 to free ids 0-3 for the reserved tokens:
    # 0: <pad>, 1: <unk>, 2: <s>, 3: </s>.
    for word in word_id_dict:
        word_id_dict[word] += 4
    word_id_dict['<pad>'] = 0
    word_id_dict['<unk>'] = 1
    word_id_dict['<s>'] = 2
    word_id_dict['</s>'] = 3

    # Fixed seed so the shuffle (and therefore the split) is reproducible.
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_text = x_text[shuffle_indices]

    print("Split train/validation set...")
    # Use an explicit non-negative split position: a negative offset of -0
    # is just 0, which would leave the training set empty and put every
    # sample in validation whenever the validation count rounds to 0.
    total = len(y)
    val_count = int(FLAGS.val_sample_percentage * float(total))
    val_count = min(max(val_count, 0), total)
    split_at = total - val_count
    x_train, x_val = x_text[:split_at], x_text[split_at:]

    x_train = dh.text_to_index(x_train, word_id_dict, 0)
    x_val = dh.text_to_index(x_val, word_id_dict, 0)

    # Number of classes is taken from the second dimension of the labels.
    FLAGS.num_classes = y.shape[1]

    # Apply the same permutation to labels and lengths so they stay aligned
    # with the shuffled text.
    y = y[shuffle_indices]
    lengths = lengths[shuffle_indices]

    y_train, y_val = y[:split_at], y[split_at:]
    lengths_train, lengths_val = lengths[:split_at], lengths[split_at:]

    print("Vocabulary Size: {:d}".format(FLAGS.vocab_size))
    print("Train/Val split: {:d}/{:d}".format(len(y_train), len(y_val)))
    return (x_train, y_train, lengths_train, word_id_dict, x_val, y_val,
            lengths_val)