def prepare_s2b_dataset(data_dir, data_dict, max_src_vocab=16000, max_tgt_vocab=300, vocab_freq_cutoff=1):
    """Load the raw splits, build a vocabulary from the train split, and pickle them.

    The train/dev/test files named in ``data_dict`` are read from ``data_dir``
    with ``Dataset.from_raw_file``. A source and a target vocabulary are built
    from the ``src`` and ``tgt`` fields of the training examples, and the
    results are written into ``data_dir`` as ``train.bin``, ``dev.bin``,
    ``test.bin`` and ``vocab.bin``. If ``data_dict`` has a ``'debug'`` entry,
    that split is loaded afterwards and written to ``debug.bin``.

    Args:
        data_dir: Directory holding the raw files; also receives the outputs.
        data_dict: Mapping with the keys ``'train'``, ``'dev'``, ``'test'`` and
            optionally ``'debug'``, each giving a file name relative to
            ``data_dir``.
        max_src_vocab: Passed as ``size`` when building the source vocabulary.
        max_tgt_vocab: Passed as ``size`` when building the target vocabulary.
        vocab_freq_cutoff: Passed as ``freq_cutoff`` to both vocabulary builds.

    Raises:
        KeyError: If ``'train'``, ``'dev'`` or ``'test'`` is missing from
            ``data_dict``.
    """
    train_set = Dataset.from_raw_file(os.path.join(data_dir, data_dict['train']))
    dev_set = Dataset.from_raw_file(os.path.join(data_dir, data_dict['dev']))
    test_set = Dataset.from_raw_file(os.path.join(data_dir, data_dict['test']))

    # generate vocabulary (from the training split only)
    src_vocab = VocabEntry.from_corpus(
        [e.src for e in train_set], size=max_src_vocab, freq_cutoff=vocab_freq_cutoff)
    tgt_vocab = VocabEntry.from_corpus(
        [e.tgt for e in train_set], size=max_tgt_vocab, freq_cutoff=vocab_freq_cutoff)
    vocab = Vocab(src=src_vocab, tgt=tgt_vocab)
    print('generated vocabulary %s' % repr(vocab), file=sys.stderr)

    print("sum info: train:{},dev:{},test:{}".format(
        len(train_set),
        len(dev_set),
        len(test_set),
    ))
    detail(train_set)
    detail(dev_set)
    detail(test_set)

    # Output paths are joined the same way as the input paths above, so the
    # binaries always land next to the raw files they were built from.
    outputs = (
        ("train.bin", train_set.examples),
        ("dev.bin", dev_set.examples),
        ("test.bin", test_set.examples),
        ("vocab.bin", vocab),
    )
    for file_name, payload in outputs:
        # The context manager guarantees the file is flushed and closed even
        # if pickling fails part-way through.
        with open(os.path.join(data_dir, file_name), 'wb') as out:
            pickle.dump(payload, out)

    # The debug split is optional and is handled only after the main outputs
    # have been written.
    if 'debug' in data_dict:
        debug_set = Dataset.from_raw_file(os.path.join(data_dir, data_dict['debug']))
        with open(os.path.join(data_dir, "debug.bin"), 'wb') as out:
            pickle.dump(debug_set.examples, out)
def prepare_ptb_to_distance(data_dir, data_dict):
    """Load the train/dev/test/debug splits as ``'ptb'`` examples and pickle them.

    Each file named in ``data_dict`` is read from ``data_dir`` with
    ``Dataset.from_raw_file(..., e_type='ptb')``. The ``examples`` of each
    split are then written into ``data_dir`` as ``train.bin``, ``dev.bin``,
    ``test.bin`` and ``debug.bin``.

    Args:
        data_dir: Directory holding the raw files; also receives the outputs.
        data_dict: Mapping with the keys ``'train'``, ``'dev'``, ``'test'`` and
            ``'debug'``, each giving a file name relative to ``data_dir``.

    Raises:
        KeyError: If any of the four keys is missing from ``data_dict``.
    """
    # All four splits are loaded before anything is written, so a missing key
    # or unreadable input fails without leaving partial output behind.
    loaded = [
        (split, Dataset.from_raw_file(os.path.join(data_dir, data_dict[split]), e_type='ptb'))
        for split in ('train', 'dev', 'test', 'debug')
    ]

    for split, dataset in loaded:
        # Join output paths the same way as the input paths; the context
        # manager closes the file deterministically after the dump.
        with open(os.path.join(data_dir, split + ".bin"), 'wb') as out:
            pickle.dump(dataset.examples, out)
def prepare_raw_data(data_dir, data_dict):
    """Pickle the examples of every raw file listed in ``data_dict``.

    For each value in ``data_dict`` the file ``data_dir/<value>`` is loaded
    with ``Dataset.from_raw_file`` and its ``examples`` are written next to
    it as ``<value>.bin``.

    Args:
        data_dir: Directory holding the raw files.
        data_dict: Mapping whose values are file names relative to
            ``data_dir``; the keys are not used.
    """
    for file_name in data_dict.values():
        raw_path = os.path.join(data_dir, file_name)
        dataset = Dataset.from_raw_file(raw_path)
        # Close the output file deterministically instead of relying on
        # garbage collection to flush the pickle to disk.
        with open(raw_path + ".bin", 'wb') as out:
            pickle.dump(dataset.examples, out)