def prep_to_entityduet_format():
    """Export the preprocessed data in ``args.data_dir`` to ``args.out_dir``.

    Inputs read from ``args.data_dir``:
        train.prep.pairwise, test.prep.pointwise, vocab, w2v
    Outputs written to ``args.out_dir``:
        train_pair.pkl, dev.pkl, test.pkl, vocab.txt, embed.txt

    Conversions applied:
      * every id in a query/doc sequence is incremented by one;
      * '<PAD>' is prepended to the vocabulary word list and an all-zero
        row is prepended to the embedding matrix, so position 0 in both
        outputs is the padding entry;
      * pointwise labels are binarised (``> 0`` becomes 1, otherwise 0);
      * pointwise samples become ``(query, doc, label, qid)`` tuples and
        pairwise samples become ``(query, doc1, doc2)`` tuples, pickled
        as one list per file with pickle protocol 2.

    Raises:
        ValueError: if an input file name ends with neither 'pointwise'
            nor 'pairwise', so the sample format cannot be chosen.
    """
    data_dir, out_dir = args.data_dir, args.out_dir

    train_file = os.path.join(data_dir, 'train.prep.pairwise')
    # NOTE(review): dev is read from the same file as test -- confirm
    # that this is intended and not a missing dev split.
    dev_file = os.path.join(data_dir, 'test.prep.pointwise')
    test_file = os.path.join(data_dir, 'test.prep.pointwise')
    vocab_file = os.path.join(data_dir, 'vocab')
    emb_file = os.path.join(data_dir, 'w2v')

    train_file_out = os.path.join(out_dir, 'train_pair.pkl')
    dev_file_out = os.path.join(out_dir, 'dev.pkl')
    test_file_out = os.path.join(out_dir, 'test.pkl')
    vocab_file_out = os.path.join(out_dir, 'vocab.txt')
    emb_file_out = os.path.join(out_dir, 'embed.txt')

    def id_map_fn(ids):
        # Shift ids by one: slot 0 is taken by '<PAD>' / the zero vector
        # that are prepended to the vocab and embedding outputs below.
        return [idx + 1 for idx in ids]

    def label_map_fn(label):
        # Collapse graded labels to a binary relevant / not-relevant flag.
        return 1 if label > 0 else 0

    def pointwise_row(sample):
        return (id_map_fn(sample.query), id_map_fn(sample.doc),
                label_map_fn(sample.label), sample.qid)

    def pairwise_row(sample):
        return (id_map_fn(sample.query), id_map_fn(sample.doc1),
                id_map_fn(sample.doc2))

    # save train, dev, test data
    for in_file, out_file in [(train_file, train_file_out),
                              (dev_file, dev_file_out),
                              (test_file, test_file_out)]:
        print('transforming {} ...'.format(in_file))
        # The file suffix selects both the `func` handed to the iterator
        # (presumably the label parser -- verify against
        # prep_file_iterator) and the shape of the emitted tuple.
        if in_file.endswith('pointwise'):
            func, to_row = int, pointwise_row
        elif in_file.endswith('pairwise'):
            func, to_row = float, pairwise_row
        else:
            # Fail loudly instead of hitting a NameError or silently
            # reusing the settings of the previously processed file.
            raise ValueError(
                'cannot infer sample format from file name: {}'.format(in_file))
        transformed_data = [
            to_row(sample)
            for sample in prep_file_iterator(
                in_file, method='sample', func=func, parse=True)
        ]
        print('saving to {}'.format(out_file))
        with open(out_file, 'wb') as fout:
            pickle.dump(transformed_data, fout, protocol=2)

    # save vocab
    print('saving to {}'.format(vocab_file_out))
    vocab = Vocab(filepath=vocab_file, file_format=args.format)
    words = ['<PAD>'] + vocab.get_word_list()
    with open(vocab_file_out, 'w') as fout:
        fout.write('\n'.join(words) + '\n')

    # save emb
    print('saving to {}'.format(emb_file_out))
    wv = WordVector(filepath=emb_file, first_line=args.first_line)
    # Prepend one all-zero row (same width/dtype as the existing rows)
    # to serve as the embedding of '<PAD>'.
    vector = np.concatenate(
        [np.zeros_like(wv.vectors[:1]), wv.vectors], axis=0)
    # NOTE: ndarray.dump writes a pickle of the array, so 'embed.txt' is
    # not a text file despite its extension.
    vector.dump(emb_file_out)
def word_vector_transform():
    """Transform the word vectors against the vocabulary and save them.

    Loads the vectors at ``args.word_vector_path`` with ``first_line=True``
    and the vocabulary at ``<args.data_dir>/vocab`` in 'ir' format, calls
    ``WordVector.transform`` on the vocabulary's word list with
    ``oov_at_end=True`` and ``<args.data_dir>/oov.txt`` as the OOV file
    path, then saves the result to ``<args.data_dir>/w2v``.

    NOTE(review): a second function with this exact name is defined later
    in this source; if both live in the same module the later definition
    replaces this one at import time -- confirm which is meant to be used.
    """
    print('loading word vector ...')
    vectors = WordVector(filepath=args.word_vector_path, first_line=True)
    base_dir = args.data_dir
    vocabulary = Vocab(filepath=os.path.join(base_dir, 'vocab'),
                       file_format='ir')

    print('transforming ...')
    word_list = vocabulary.get_word_list()
    oov_path = os.path.join(base_dir, 'oov.txt')
    vectors.transform(word_list, oov_filepath=oov_path, oov_at_end=True)

    print('saving ...')
    target_path = os.path.join(base_dir, 'w2v')
    vectors.save_to_file(target_path)
def word_vector_transform():
    """Transform the word vectors against the vocabulary and save them.

    Loads the vectors at ``args.word_vector_path`` (header handling taken
    from ``args.first_line``) and the vocabulary at ``<args.data_dir>/vocab``
    (format taken from ``args.format``), calls ``WordVector.transform`` on
    the vocabulary's word list with ``oov_at_end=False`` and
    ``<args.data_dir>/oov.txt`` as the OOV file path, then saves the result
    to ``<args.data_dir>/w2v``.

    NOTE(review): an earlier function with this exact name appears in this
    source; if both live in the same module this later definition is the
    one that takes effect.
    """
    print('loading word vector ...')
    vectors = WordVector(filepath=args.word_vector_path,
                         first_line=args.first_line)
    base_dir = args.data_dir
    vocabulary = Vocab(filepath=os.path.join(base_dir, 'vocab'),
                       file_format=args.format)

    print('transforming ...')
    word_list = vocabulary.get_word_list()
    oov_path = os.path.join(base_dir, 'oov.txt')
    # don't use oov_at_end because it is problematic
    vectors.transform(word_list, oov_filepath=oov_path, oov_at_end=False)

    print('saving ...')
    target_path = os.path.join(base_dir, 'w2v')
    vectors.save_to_file(target_path)