Example #1
def main(opt):
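    # Read the train and dev parallel corpora; only the training data is read with a max_len limit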
    train_src, train_tgt = read_parallel_corpus(opt.train_src, opt.train_tgt,
                                                opt.max_len, opt.lower_case)
    dev_src, dev_tgt = read_parallel_corpus(opt.dev_src, opt.dev_tgt, None,
                                            opt.lower_case)

    # Reuse a previously saved vocabulary if one is given; otherwise build it from the training data
    if opt.vocab:
        saved = torch.load(opt.vocab)
        src_counter, src_word2idx, src_idx2word = saved['src_dict']
        tgt_counter, tgt_word2idx, tgt_idx2word = saved['tgt_dict']
    else:
        if opt.share_vocab:
            print('Building shared vocabulary')
            vocab_size = min(opt.src_vocab_size, opt.tgt_vocab_size) \
                if (opt.src_vocab_size is not None and opt.tgt_vocab_size is not None) else None
            counter, word2idx, idx2word = build_vocab(train_src + train_tgt,
                                                      vocab_size,
                                                      opt.min_word_count,
                                                      data_utils.extra_tokens)
            src_counter, src_word2idx, src_idx2word = (counter, word2idx,
                                                       idx2word)
            tgt_counter, tgt_word2idx, tgt_idx2word = (counter, word2idx,
                                                       idx2word)
        else:
            src_counter, src_word2idx, src_idx2word = build_vocab(
                train_src, opt.src_vocab_size, opt.min_word_count,
                data_utils.extra_tokens)
            tgt_counter, tgt_word2idx, tgt_idx2word = build_vocab(
                train_tgt, opt.tgt_vocab_size, opt.min_word_count,
                data_utils.extra_tokens)
    # Map tokens to vocabulary indices for the train and dev sets
    train_src = convert_text2idx(train_src, src_word2idx)
    train_tgt = convert_text2idx(train_tgt, tgt_word2idx)
    dev_src = convert_text2idx(dev_src, src_word2idx)
    dev_tgt = convert_text2idx(dev_tgt, tgt_word2idx)

    # Save source/target vocabulary and train/dev data
    torch.save(
        {
            'src_dict': (src_counter, src_word2idx, src_idx2word),
            'tgt_dict': (tgt_counter, tgt_word2idx, tgt_idx2word),
            'src_path': opt.train_src,
            'tgt_path': opt.train_tgt,
            'lower_case': opt.lower_case
        }, '{}.dict'.format(opt.save_data))
    torch.save(
        {
            'train_src': train_src,
            'train_tgt': train_tgt,
            'dev_src': dev_src,
            'dev_tgt': dev_tgt,
            'src_dict': src_word2idx,
            'tgt_dict': tgt_word2idx,
        }, '{}-train.t7'.format(opt.save_data))
    print('Saved the vocabulary at {}.dict'.format(opt.save_data))
    print('Saved the preprocessed train/dev data at {}-train.t7'.format(
        opt.save_data))
Example #2
def prepare_data(args, task_id):
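    # Load candidate responses and dialogs for this bAbI task, build the vocabulary,
    # then pickle the data and the metadata into separate files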
    # get candidate responses (restaurants domain)
    candidates, candid2idx, idx2candid = data_utils.load_candidates(
        task_id=task_id, candidates_f=DATA_DIR + 'dialog-babi-candidates.txt')

    # get train, test, val data
    train, test, val = data_utils.load_dialog_task(data_dir=DATA_DIR,
                                                   task_id=task_id,
                                                   candid_dic=candid2idx,
                                                   isOOV=False)

    # get metadata
    metadata = data_utils.build_vocab(train + test + val, candidates)

    # write data to file (using pickle, which is fast)
    data_ = {
        'candidates': candidates,
        'train': train,
        'test': test,
        'val': val
    }
    with open(P_DATA_DIR + str(task_id) + '.data.pkl', 'wb') as f:
        pkl.dump(data_, f)

    # add the candidate mappings to the metadata, then save
    metadata['candid2idx'] = candid2idx
    metadata['idx2candid'] = idx2candid

    with open(P_DATA_DIR + str(task_id) + '.metadata.pkl', 'wb') as f:
        pkl.dump(metadata, f)
Example #3
        os.makedirs(original_path)

    ### Reformat Source data
    for split, fname in zip(["train", "dev", "test"],
                            ["eng.train", "eng.testa", "eng.testb"]):
        words, tags, _ = parse_raw_conll(source_path + fname, sep="\t")

        ### Correct IOB tags in case eng.train etc. contain only I- tags
        tags = correct_iob(tags)

        with open(original_path + "{}.words.txt".format(split), "w", encoding="utf8") as f:
            for sent in words:
                f.write("{}\n".format(" ".join(sent)))

        with open(original_path + "{}.iob.txt".format(split), "w", encoding="utf8") as f:
            for sent in tags:
                f.write("{}\n".format(" ".join(sent)))

        with open(original_path + "{}.iobes.txt".format(split), "w", encoding="utf8") as f:
            for sent in tags:
                f.write("{}\n".format(" ".join(iob2iobes(sent))))

    ### Compute vocabulary
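    # build_vocab writes the vocabulary files into original_path (vocab.words.txt is reused below)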
    build_vocab(original_path)

    ### Trim glove embeddings
    word_vocab_path = original_path + "vocab.words.txt"
    embedding_path = EMBEDDINGS_DIR + "glove.840B/glove.840B.300d.txt"
    saving_path = original_path + "glove.840B.300d"
    trim_embeddings(word_vocab_path, embedding_path, saving_path)
Example #4
        with open(original_path + "{}.iob.txt".format(split),
                  "w",
                  encoding="utf8") as f:
            for _, row in df.iterrows():
                f.write("{}\n".format(" ".join(row["ner"])))

        iob = load_file(original_path + "{}.iob.txt".format(split))

        with open(original_path + "{}.iobes.txt".format(split),
                  "w",
                  encoding="utf8") as f:
            for tag in iob:
                f.write("{}\n".format(" ".join(iob2iobes(tag))))

    ### Compute vocabulary
    build_vocab(original_path)

    ### Trim glove embeddings
    word_vocab_path = original_path + "vocab.words.txt"
    embedding_path = EMBEDDINGS_DIR + "glove.840B/glove.840B.300d.txt"
    saving_path = original_path + "glove.840B.300d"
    trim_embeddings(word_vocab_path,
                    embedding_path,
                    saving_path,
                    check_exists=False)

    ### Remap dataset
    data = load_data(original_path, scheme="iob")
    tag2idx = data["vocab"]["tag"][0]

    mapping = {
Example #5
def preprocess(opts):
    # Create dirs if not exist
    ensure_dir(opts.save_dir)

    # Build vocabulary
    logger.info('Building vocabulary from %s', opts.vocab)
    vocab = build_vocab(opts.vocab, opts.max_vocab_size)
    logger.info('Saving vocabulary of size %d to %s',
                len(vocab), os.path.join(opts.save_dir, 'vocab.pt'))
    torch.save(vocab, os.path.join(opts.save_dir, 'vocab.pt'))

    # Build feature extractor
    feat_ext = AudioFeatureExtractor(sample_rate=opts.sample_rate,
                                     window_size=opts.window_size,
                                     window_stride=opts.window_stride,
                                     window=opts.window,
                                     feat_type=opts.feat_type,
                                     normalize_audio=opts.normalize_audio)
    torch.save(feat_ext, os.path.join(opts.save_dir, 'feat_ext.pt'))

    # Build train shards
    for src_train, tgt_train in zip(opts.src_train, opts.tgt_train):
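        # The accent is taken from the parent directory of the source file;
        # only the 'us' accent is marked as labeled in the training shards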
        accent = src_train.split('/')[-2]
        feats = {'accent': ACCENTS[accent], 'labeled': accent == 'us'}
        build_shards(src_dir=opts.src_dir,
                     save_dir=os.path.join(opts.save_dir, accent),
                     src_file=src_train,
                     tgt_file=tgt_train,
                     vocab=vocab,
                     shard_size=opts.shard_size,
                     feat_ext=feat_ext,
                     mode='train',
                     feats=feats)

    # Build validation shards
    for src_valid, tgt_valid in zip(opts.src_valid, opts.tgt_valid):
        accent = src_valid.split('/')[-2]
        feats = {'accent': ACCENTS[accent], 'labeled': True}
        build_shards(src_dir=opts.src_dir,
                     save_dir=os.path.join(opts.save_dir, accent),
                     src_file=src_valid,
                     tgt_file=tgt_valid,
                     vocab=vocab,
                     shard_size=opts.shard_size,
                     feat_ext=feat_ext,
                     mode='valid',
                     feats=feats)

    # Build test shards
    for src_test, tgt_test in zip(opts.src_test, opts.tgt_test):
        accent = src_test.split('/')[-2]
        feats = {'accent': ACCENTS[accent], 'labeled': True}
        build_shards(src_dir=opts.src_dir,
                     save_dir=os.path.join(opts.save_dir, accent),
                     src_file=src_test,
                     tgt_file=tgt_test,
                     vocab=vocab,
                     shard_size=opts.shard_size,
                     feat_ext=feat_ext,
                     mode='test',
                     feats=feats)