Exemple #1
0
def prep_to_entityduet_format():
    """Convert prep-format data in ``args.data_dir`` to files in ``args.out_dir``.

    Files written:
      * ``train_pair.pkl`` -- pickled list of
        ``(query_ids, doc1_ids, doc2_ids)`` tuples read from
        ``train.prep.pairwise``.
      * ``dev.pkl`` / ``test.pkl`` -- pickled lists of
        ``(query_ids, doc_ids, binary_label, qid)`` tuples read from
        ``test.prep.pointwise``.
      * ``vocab.txt`` -- ``'<PAD>'`` followed by the vocabulary words, one
        per line.
      * ``embed.txt`` -- the word-vector matrix with one all-zero row
        prepended, written with ``ndarray.dump`` (i.e. a pickle, despite the
        ``.txt`` name).

    Raises:
        ValueError: if an input file name ends with neither ``pointwise``
            nor ``pairwise``.
    """
    train_file = os.path.join(args.data_dir, 'train.prep.pairwise')
    # NOTE(review): dev reads the same file as test ('test.prep.pointwise'),
    # so dev.pkl and test.pkl end up with identical content. Kept as-is;
    # confirm whether a separate dev split was intended.
    dev_file = os.path.join(args.data_dir, 'test.prep.pointwise')
    test_file = os.path.join(args.data_dir, 'test.prep.pointwise')
    vocab_file = os.path.join(args.data_dir, 'vocab')
    emb_file = os.path.join(args.data_dir, 'w2v')
    train_file_out = os.path.join(args.out_dir, 'train_pair.pkl')
    dev_file_out = os.path.join(args.out_dir, 'dev.pkl')
    test_file_out = os.path.join(args.out_dir, 'test.pkl')
    vocab_file_out = os.path.join(args.out_dir, 'vocab.txt')
    emb_file_out = os.path.join(args.out_dir, 'embed.txt')

    def id_map_fn(ids):
        # Shift every id up by one; presumably this frees index 0 for the
        # '<PAD>' word / zero vector prepended further down.
        return [token_id + 1 for token_id in ids]

    def label_map_fn(label):
        # Collapse the label to binary: any positive value becomes 1.
        return 1 if label > 0 else 0

    # save train, dev, test data
    for in_file, out_file in [(train_file, train_file_out),
                              (dev_file, dev_file_out),
                              (test_file, test_file_out)]:
        print('transforming {} ...'.format(in_file))
        # The file suffix decides both the sample layout and the converter
        # handed to prep_file_iterator as ``func``.
        if in_file.endswith('pointwise'):
            pointwise, func = True, int
        elif in_file.endswith('pairwise'):
            pointwise, func = False, float
        else:
            # Fail loudly instead of reusing settings from a previous file.
            raise ValueError(
                'cannot infer sample format from file name: {}'.format(
                    in_file))
        samples = prep_file_iterator(in_file,
                                     method='sample',
                                     func=func,
                                     parse=True)
        if pointwise:
            transformed_data = [
                (id_map_fn(sample.query), id_map_fn(sample.doc),
                 label_map_fn(sample.label), sample.qid)
                for sample in samples]
        else:
            transformed_data = [
                (id_map_fn(sample.query), id_map_fn(sample.doc1),
                 id_map_fn(sample.doc2))
                for sample in samples]
        print('saving to {}'.format(out_file))
        with open(out_file, 'wb') as fout:
            pickle.dump(transformed_data, fout, protocol=2)
    # save vocab
    print('saving to {}'.format(vocab_file_out))
    vocab = Vocab(filepath=vocab_file, file_format=args.format)
    words = ['<PAD>'] + vocab.get_word_list()
    with open(vocab_file_out, 'w') as fout:
        fout.write('\n'.join(words) + '\n')
    # save emb
    print('saving to {}'.format(emb_file_out))
    wv = WordVector(filepath=emb_file, first_line=args.first_line)
    # Prepend one all-zero row so the matrix has a row for '<PAD>' at index 0.
    vector = np.concatenate([np.zeros_like(wv.vectors[:1]), wv.vectors],
                            axis=0)
    vector.dump(emb_file_out)
Exemple #2
0
def word_vector_transform():
    """Re-align the word vectors at ``args.word_vector_path`` to the vocab.

    Loads the vectors (with ``first_line=True``) and the ``'ir'``-format
    vocab stored in ``args.data_dir``, calls ``WordVector.transform`` with the
    vocab's word list (``oov_at_end=True``, OOV path
    ``<data_dir>/oov.txt``), and saves the result to ``<data_dir>/w2v``.
    """
    print('loading word vector ...')
    wv = WordVector(filepath=args.word_vector_path, first_line=True)
    data_dir = args.data_dir
    vocab_path = os.path.join(data_dir, 'vocab')
    vocab = Vocab(filepath=vocab_path, file_format='ir')

    print('transforming ...')
    word_list = vocab.get_word_list()
    oov_path = os.path.join(data_dir, 'oov.txt')
    wv.transform(word_list, oov_filepath=oov_path, oov_at_end=True)

    print('saving ...')
    out_path = os.path.join(data_dir, 'w2v')
    wv.save_to_file(out_path)
Exemple #3
0
def word_vector_transform():
    """Re-align the word vectors at ``args.word_vector_path`` to the vocab.

    Loads the vectors (``first_line`` taken from ``args.first_line``) and the
    vocab in ``args.data_dir`` (format taken from ``args.format``), calls
    ``WordVector.transform`` with the vocab's word list and OOV path
    ``<data_dir>/oov.txt``, and saves the result to ``<data_dir>/w2v``.
    """
    print('loading word vector ...')
    wv = WordVector(filepath=args.word_vector_path, first_line=args.first_line)
    data_dir = args.data_dir
    vocab_path = os.path.join(data_dir, 'vocab')
    vocab = Vocab(filepath=vocab_path, file_format=args.format)

    print('transforming ...')
    word_list = vocab.get_word_list()
    oov_path = os.path.join(data_dir, 'oov.txt')
    # don't use oov_at_end because it is problematic
    wv.transform(word_list, oov_filepath=oov_path, oov_at_end=False)

    print('saving ...')
    out_path = os.path.join(data_dir, 'w2v')
    wv.save_to_file(out_path)