def main(opt):
    train_src, train_tgt = read_parallel_corpus(opt.train_src, opt.train_tgt,
                                                opt.max_len, opt.lower_case)
    dev_src, dev_tgt = read_parallel_corpus(opt.dev_src, opt.dev_tgt,
                                            None, opt.lower_case)

    if opt.vocab:
        # Load the saved dictionaries once instead of reading the file twice.
        saved_vocab = torch.load(opt.vocab)
        src_counter, src_word2idx, src_idx2word = saved_vocab['src_dict']
        tgt_counter, tgt_word2idx, tgt_idx2word = saved_vocab['tgt_dict']
    else:
        if opt.share_vocab:
            print('Building shared vocabulary')
            vocab_size = min(opt.src_vocab_size, opt.tgt_vocab_size) \
                if (opt.src_vocab_size is not None and opt.tgt_vocab_size is not None) \
                else None
            counter, word2idx, idx2word = build_vocab(train_src + train_tgt,
                                                      vocab_size,
                                                      opt.min_word_count,
                                                      data_utils.extra_tokens)
            src_counter, src_word2idx, src_idx2word = counter, word2idx, idx2word
            tgt_counter, tgt_word2idx, tgt_idx2word = counter, word2idx, idx2word
        else:
            src_counter, src_word2idx, src_idx2word = build_vocab(
                train_src, opt.src_vocab_size, opt.min_word_count,
                data_utils.extra_tokens)
            tgt_counter, tgt_word2idx, tgt_idx2word = build_vocab(
                train_tgt, opt.tgt_vocab_size, opt.min_word_count,
                data_utils.extra_tokens)

    train_src, train_tgt = (convert_text2idx(train_src, src_word2idx),
                            convert_text2idx(train_tgt, tgt_word2idx))
    dev_src, dev_tgt = (convert_text2idx(dev_src, src_word2idx),
                        convert_text2idx(dev_tgt, tgt_word2idx))

    # Save source/target vocabulary and train/dev data
    torch.save(
        {
            'src_dict': (src_counter, src_word2idx, src_idx2word),
            'tgt_dict': (tgt_counter, tgt_word2idx, tgt_idx2word),
            'src_path': opt.train_src,
            'tgt_path': opt.train_tgt,
            'lower_case': opt.lower_case,
        },
        '{}.dict'.format(opt.save_data))
    torch.save(
        {
            'train_src': train_src,
            'train_tgt': train_tgt,
            'dev_src': dev_src,
            'dev_tgt': dev_tgt,
            'src_dict': src_word2idx,
            'tgt_dict': tgt_word2idx,
        },
        '{}-train.t7'.format(opt.save_data))
    print('Saved the vocabulary at {}.dict'.format(opt.save_data))
    print('Saved the preprocessed train/dev data at {}-train.t7'.format(opt.save_data))
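# read_parallel_corpus, build_vocab, and convert_text2idx are helpers defined
# elsewhere in this repo. As a rough sketch of what convert_text2idx is
# assumed to do (map each token to its vocabulary index, falling back to an
# <unk> index for out-of-vocabulary words; the special-token name is an
# assumption, not taken from this file):
def convert_text2idx_sketch(sentences, word2idx, unk_token='<unk>'):
    unk_idx = word2idx[unk_token]
    return [[word2idx.get(word, unk_idx) for word in sent]
            for sent in sentences]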
def prepare_data(args, task_id):
    # get candidate responses (restaurants domain)
    candidates, candid2idx, idx2candid = data_utils.load_candidates(
        task_id=task_id, candidates_f=DATA_DIR + 'dialog-babi-candidates.txt')
    # get train, test, val data
    train, test, val = data_utils.load_dialog_task(data_dir=DATA_DIR,
                                                   task_id=task_id,
                                                   candid_dic=candid2idx,
                                                   isOOV=False)
    # get metadata
    metadata = data_utils.build_vocab(train + test + val, candidates)

    # write data to file (pickle is used here because it is fast)
    data_ = {
        'candidates': candidates,
        'train': train,
        'test': test,
        'val': val,
    }
    with open(P_DATA_DIR + str(task_id) + '.data.pkl', 'wb') as f:
        pkl.dump(data_, f)

    # add the candidate mappings to the metadata, then save it
    metadata['candid2idx'] = candid2idx
    metadata['idx2candid'] = idx2candid
    with open(P_DATA_DIR + str(task_id) + '.metadata.pkl', 'wb') as f:
        pkl.dump(metadata, f)
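# A hypothetical driver for prepare_data, assuming the six bAbI dialog tasks
# are numbered 1-6 and that DATA_DIR / P_DATA_DIR already point at the raw
# and preprocessed data directories (args is unused by the body above, so
# None is passed here):
if __name__ == '__main__':
    for task_id in range(1, 7):
        prepare_data(args=None, task_id=task_id)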
os.makedirs(original_path)

### Reformat source data
for split, file in zip(["train", "dev", "test"],
                       ["eng.train", "eng.testa", "eng.testb"]):
    words, tags, _ = parse_raw_conll(source_path + file, sep="\t")
    ### Correct IOB tags in case eng.train etc. contain only I- tags
    tags = correct_iob(tags)
    # use f for the handle so the filename variable `file` is not shadowed
    with open(original_path + "{}.words.txt".format(split), "w", encoding="utf8") as f:
        for sent in words:
            f.write("{}\n".format(" ".join(sent)))
    with open(original_path + "{}.iob.txt".format(split), "w", encoding="utf8") as f:
        for sent in tags:
            f.write("{}\n".format(" ".join(sent)))
    with open(original_path + "{}.iobes.txt".format(split), "w", encoding="utf8") as f:
        for sent in tags:
            f.write("{}\n".format(" ".join(iob2iobes(sent))))

### Compute vocabulary
build_vocab(original_path)

### Trim GloVe embeddings
word_vocab_path = original_path + "vocab.words.txt"
embedding_path = EMBEDDINGS_DIR + "glove.840B/glove.840B.300d.txt"
saving_path = original_path + "glove.840B.300d"
trim_embeddings(word_vocab_path, embedding_path, saving_path)
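# iob2iobes is assumed to implement the standard IOB -> IOBES conversion:
# a single-token entity becomes S-<type> and the last token of a multi-token
# entity becomes E-<type>. A self-contained sketch under that assumption
# (input is expected to be valid IOB, which correct_iob above ensures):
def iob2iobes_sketch(tags):
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
        if tag == 'O':
            new_tags.append(tag)
        elif tag.startswith('B-'):
            # B- stays B- when an I- of the same type follows, else S-
            new_tags.append(tag if nxt == 'I-' + tag[2:] else 'S-' + tag[2:])
        else:  # an I- tag
            # I- stays I- when an I- of the same type follows, else E-
            new_tags.append(tag if nxt == 'I-' + tag[2:] else 'E-' + tag[2:])
    return new_tags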
with open(original_path + "{}.iob.txt".format(split), "w", encoding="utf8") as f:
    for i, row in df.iterrows():
        f.write("{}\n".format(" ".join(row["ner"])))
iob = load_file(original_path + "{}.iob.txt".format(split))
with open(original_path + "{}.iobes.txt".format(split), "w", encoding="utf8") as f:
    for tag in iob:
        f.write("{}\n".format(" ".join(iob2iobes(tag))))

### Compute vocabulary
build_vocab(original_path)

### Trim GloVe embeddings
word_vocab_path = original_path + "vocab.words.txt"
embedding_path = EMBEDDINGS_DIR + "glove.840B/glove.840B.300d.txt"
saving_path = original_path + "glove.840B.300d"
trim_embeddings(word_vocab_path, embedding_path, saving_path, check_exists=False)

### Remap dataset
data = load_data(original_path, scheme="iob")
tag2idx = data["vocab"]["tag"][0]
mapping = {
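# trim_embeddings is assumed to keep only the GloVe rows whose word occurs in
# vocab.words.txt and to save them as a compressed numpy archive; a minimal
# sketch under those assumptions (the .npz layout and the dim / check_exists
# defaults are guesses, not taken from this file):
import os
import numpy as np

def trim_embeddings_sketch(word_vocab_path, embedding_path, saving_path,
                           dim=300, check_exists=True):
    if check_exists and os.path.exists(saving_path + '.npz'):
        return
    with open(word_vocab_path, encoding='utf8') as f:
        word2idx = {w.strip(): i for i, w in enumerate(f)}
    # Rows stay zero for vocabulary words missing from GloVe.
    embeddings = np.zeros((len(word2idx), dim), dtype=np.float32)
    with open(embedding_path, encoding='utf8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vec = parts[0], parts[1:]
            if word in word2idx and len(vec) == dim:
                embeddings[word2idx[word]] = np.asarray(vec, dtype=np.float32)
    np.savez_compressed(saving_path, embeddings=embeddings)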
def preprocess(opts):
    # Create output dirs if they do not exist
    ensure_dir(opts.save_dir)

    # Build vocabulary
    logger.info('Building vocabulary from %s' % opts.vocab)
    vocab = build_vocab(opts.vocab, opts.max_vocab_size)
    logger.info('Saving vocabulary of size %d to %s' %
                (len(vocab), os.path.join(opts.save_dir, 'vocab.pt')))
    torch.save(vocab, os.path.join(opts.save_dir, 'vocab.pt'))

    # Build feature extractor
    feat_ext = AudioFeatureExtractor(sample_rate=opts.sample_rate,
                                     window_size=opts.window_size,
                                     window_stride=opts.window_stride,
                                     window=opts.window,
                                     feat_type=opts.feat_type,
                                     normalize_audio=opts.normalize_audio)
    torch.save(feat_ext, os.path.join(opts.save_dir, 'feat_ext.pt'))

    # Build train shards; only the 'us' accent is treated as labeled
    for src_train, tgt_train in zip(opts.src_train, opts.tgt_train):
        accent = src_train.split('/')[-2]
        feats = {'accent': ACCENTS[accent], 'labeled': accent == 'us'}
        build_shards(src_dir=opts.src_dir,
                     save_dir=os.path.join(opts.save_dir, accent),
                     src_file=src_train,
                     tgt_file=tgt_train,
                     vocab=vocab,
                     shard_size=opts.shard_size,
                     feat_ext=feat_ext,
                     mode='train',
                     feats=feats)

    # Build validation shards
    for src_valid, tgt_valid in zip(opts.src_valid, opts.tgt_valid):
        accent = src_valid.split('/')[-2]
        feats = {'accent': ACCENTS[accent], 'labeled': True}
        build_shards(src_dir=opts.src_dir,
                     save_dir=os.path.join(opts.save_dir, accent),
                     src_file=src_valid,
                     tgt_file=tgt_valid,
                     vocab=vocab,
                     shard_size=opts.shard_size,
                     feat_ext=feat_ext,
                     mode='valid',
                     feats=feats)

    # Build test shards
    for src_test, tgt_test in zip(opts.src_test, opts.tgt_test):
        accent = src_test.split('/')[-2]
        feats = {'accent': ACCENTS[accent], 'labeled': True}
        build_shards(src_dir=opts.src_dir,
                     save_dir=os.path.join(opts.save_dir, accent),
                     src_file=src_test,
                     tgt_file=tgt_test,
                     vocab=vocab,
                     shard_size=opts.shard_size,
                     feat_ext=feat_ext,
                     mode='test',
                     feats=feats)
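# ensure_dir above is a small helper; a minimal sketch follows. Note also
# that accent = path.split('/')[-2] assumes manifest paths shaped like
# .../<accent>/<file>, e.g. 'data/us/train.csv' yields accent 'us' (an
# assumption about the data layout, not stated in this file):
import os

def ensure_dir_sketch(path):
    # Create the directory (and any missing parents) if it does not exist.
    os.makedirs(path, exist_ok=True)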