Example no. 1
from itertools import chain

import torch
from more_itertools import ilen
from tqdm import tqdm

# Vocabulary and PREPROCESSED_DIR are assumed to be provided by the project's
# own modules (a vocabulary helper class and the output-directory path).


def main(args):
    train_input_filepath = args.train_input
    dev_input_filepath = args.dev_input

    # Build the vocabulary from the training and dev corpora and persist it.
    vocab = Vocabulary()
    vocab.load_word_from_data(train_path=train_input_filepath,
                              dev_path=dev_input_filepath)
    vocab.save(vocab_file=PREPROCESSED_DIR + 'vocab_file.vocab')
    print("vocab size: {}".format(len(vocab)))

    # Find the longest whitespace-tokenised sentence across both files, so
    # that every example can be padded to a common length.
    max_length = -1
    with open(train_input_filepath, 'r') as f_t, open(dev_input_filepath,
                                                      'r') as f_d:
        for line in chain(f_t, f_d):
            line = line.rstrip().split()
            max_length = max(max_length, len(line))
    # max_length = 17

    # Convert each sentence to a padded index tensor and save one
    # preprocessed tensor file per split.
    for file_name in [train_input_filepath, dev_input_filepath]:
        output_file = file_name.split('/')[-1] + '.preprocessed'
        data = []
        with open(file_name, 'r') as fin:
            total_len = ilen(fin)  # count the lines for the progress bar
            fin.seek(0)            # rewind after counting
            for line in tqdm(fin, total=total_len):
                line = line.rstrip().split()
                data.append(
                    torch.LongTensor(vocab.sentence2index(line, max_length)))
        torch.save(data, PREPROCESSED_DIR + output_file)
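
The script relies on a Vocabulary helper that is not shown here. Below is a minimal sketch of the interface the preprocessing assumes, nothing more: the real class, its special tokens, and its save format may well differ. It only needs to build a word-to-index map from the two corpora, convert a tokenised sentence to a fixed-length index list, report its size, and write itself to disk.

# Hypothetical sketch of the assumed Vocabulary interface; the PAD/UNK
# tokens and the tab-separated save format are assumptions, not the
# project's actual implementation.
from typing import List

PAD_TOKEN = '<pad>'
UNK_TOKEN = '<unk>'


class Vocabulary:
    """Minimal word <-> index mapping with fixed-length padding."""

    def __init__(self):
        self.word2index = {PAD_TOKEN: 0, UNK_TOKEN: 1}

    def __len__(self):
        return len(self.word2index)

    def load_word_from_data(self, train_path, dev_path):
        # Register every whitespace-separated token seen in either file.
        for path in (train_path, dev_path):
            with open(path, 'r') as f:
                for line in f:
                    for word in line.rstrip().split():
                        if word not in self.word2index:
                            self.word2index[word] = len(self.word2index)

    def sentence2index(self, words: List[str], max_length: int) -> List[int]:
        # Map tokens to indices and right-pad to max_length.
        unk = self.word2index[UNK_TOKEN]
        indices = [self.word2index.get(w, unk) for w in words]
        pad = self.word2index[PAD_TOKEN]
        return indices + [pad] * (max_length - len(indices))

    def save(self, vocab_file):
        # One "word<TAB>index" pair per line.
        with open(vocab_file, 'w') as f:
            for word, index in self.word2index.items():
                print('{}\t{}'.format(word, index), file=f)

The tensor lists written by torch.save can later be restored in the training script with torch.load(PREPROCESSED_DIR + output_file), which returns the LongTensor list in the original line order.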