    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))

        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        pool = None

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin")
        )

        def consumer(tensor):
            ds.add_item(tensor)

        stat = BinarizerDataset.export_binarized_dataset(
            input_file, vocab, consumer, add_if_not_exist=False, num_workers=num_workers,
        )

        ntok = stat['ntok']
        nseq = stat['nseq']
        nunk = stat['nunk']

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print(
            "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                nseq,
                ntok,
                100 * nunk / ntok,
                vocab.unk_word,
            )
        )
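All of the examples on this page call dataset_dest_prefix / dataset_dest_file without defining them. Below is a minimal sketch of the helpers they assume, modelled on fairseq's preprocess.py; the args attributes destdir, source_lang, target_lang and only_source are assumptions about the surrounding script, so treat the exact naming scheme as illustrative rather than definitive.

import os

def dataset_dest_prefix(args, output_prefix, lang):
    # e.g. "<destdir>/train.de-en.de" for a bilingual setup
    base = os.path.join(args.destdir, output_prefix)
    if lang is not None:
        lang_part = ".{}-{}.{}".format(args.source_lang, args.target_lang, lang)
    elif getattr(args, "only_source", False):
        lang_part = ""
    else:
        lang_part = ".{}-{}".format(args.source_lang, args.target_lang)
    return "{}{}".format(base, lang_part)

def dataset_dest_file(args, output_prefix, lang, extension):
    # append "bin" or "idx" to the prefix above
    return "{}.{}".format(dataset_dest_prefix(args, output_prefix, lang), extension)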
Example 2
def binarize(args,
             filename,
             vocab,
             output_prefix,
             lang,
             offset,
             end,
             append_eos=True,
             copy_from=None):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))
    words_list = []  # TODO: currently this cannot be passed back to the caller

    def consumer(ids, words):
        ds.add_item(ids)
        words_list.append(words)

    res = Binarizer.binarize(filename,
                             vocab,
                             consumer,
                             append_eos=append_eos,
                             offset=offset,
                             end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
Example 3
def make_binary_dataset(data_dir):
    dict = dictionary.Dictionary.load(data_dir + '/dict.en.txt')

    print('Converting utf8 files to fairseq binary')
    files = glob.glob(data_dir + '/utf8/test*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/test*.de', recursive=True)

    files += glob.glob(data_dir + '/utf8/dev*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/dev*.de', recursive=True)

    files += glob.glob(data_dir + '/utf8/train*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/train*.de', recursive=True)

    for file in files:
        print('Converting file:', file)
        ds = indexed_dataset.IndexedDatasetBuilder(file + '.bin')

        def consumer(tensor):
            ds.add_item(tensor)

        res = MockTokenizer.binarize(file, dict, consumer)

        ds.finalize(file + '.idx')
def merge_files(files, outpath):
    ds = indexed_dataset.IndexedDatasetBuilder("{}.bin".format(outpath))
    for file in files:
        ds.merge_file_(file)
        os.remove(indexed_dataset.data_file_path(file))
        os.remove(indexed_dataset.index_file_path(file))
    ds.finalize("{}.idx".format(outpath))
Example 5
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
    def make_binary_dataset(input_prefix, output_prefix, lng_pair, lang,
                            num_workers):
        if not args.joined_dictionary and lang != 'en':
            dict = dictionary.Dictionary.load(tgt_dict_path)
        else:
            dict = dictionary.Dictionary.load(dict_path)

        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = f'{input_prefix}.{lng_pair}.{lang}.tok.bpe'
        if not os.path.exists(input_file):
            input_file = f'{input_prefix}.{lng_pair}.{lang}'
            if not os.path.exists(input_file):
                print("| {} not found".format(input_file))
                return
        if args.expert:
            input_file = input_file + '.e'
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                fn_without_ext = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
                pool.apply_async(binarize,
                                 (input_file, dict, fn_without_ext,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            f"{output_prefix}.{lng_pair}.{lang}.bin")
        merge_result(
            Tokenizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(f"{output_prefix}.{lng_pair}.{lang}.idx")

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
def binarize(filename, dict, fn_without_ext, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(f"{fn_without_ext}.bin")

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(f"{fn_without_ext}.idx")
    return res
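The worker functions above receive offset/end byte positions from Tokenizer.find_offsets or Binarizer.find_offsets. A self-contained sketch of what such a helper is assumed to do: split the file into num_chunks byte ranges whose boundaries fall on line starts. Fairseq's actual implementation differs in detail (it reads in text mode and may leave the final offset as 0 to mean "to EOF"); this version uses binary mode and an explicit end offset for simplicity.

import os

def find_offsets(filename, num_chunks):
    # Byte offsets aligned to line boundaries so each worker can seek() to
    # its offset and read whole lines only.
    with open(filename, "rb") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0] * (num_chunks + 1)
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            f.readline()  # skip the partial line the seek landed in
            offsets[i] = f.tell()
        offsets[num_chunks] = size  # assumption: the last range runs to EOF
        return offsets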
Example 8
def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename, vocab, consumer, append_eos=append_eos,
                             offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
    def make_binary_proto_dataset(input_prefix, output_prefix,
                                  output_token_suffix, output_weight_suffix,
                                  soft_proto_dict, proto_k, lang, lang_name):

        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        print('| [prototype] Dictionary: {} types, {} candidates'.format(
            len(soft_proto_dict), args.proto_k))

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix,
                              lang_name + '.' + output_token_suffix, 'bin'))
        ds_w = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(
            output_prefix, lang_name + '.' + output_weight_suffix, 'bin'),
                                                     dtype=np.float32)

        def consumer(tensor):
            ds.add_item(tensor)

        def consumer_weights(tensor):
            ds_w.add_item(tensor)

        input_prefix = os.path.join(args.srcdir, input_prefix)
        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        res = TokenizerProb.binarize(input_file,
                                     dict,
                                     consumer,
                                     consumer_weights,
                                     soft_proto_dict=soft_proto_dict,
                                     proto_k=proto_k)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(
            dataset_dest_path(output_prefix,
                              lang_name + '.' + output_token_suffix, 'idx'))
        ds_w.finalize(
            dataset_dest_path(output_prefix,
                              lang_name + '.' + output_weight_suffix, 'idx'))
Example 10
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang,
                            num_workers):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin"))
        merge_result(
            Binarizer.binarize(input_file,
                               vocab,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
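For reference, a simplified, self-contained stand-in for the Binarizer.binarize / Tokenizer.binarize worker contract that merge_result() folds together. This is not fairseq's implementation: vocab here is a plain dict mapping token to id, unk is the unknown symbol, and consumer receives a Python list rather than a tensor.

from collections import Counter

def binarize_chunk(filename, vocab, consumer, offset=0, end=-1, unk="<unk>"):
    # Read the byte range starting at `offset` (a line boundary produced by a
    # find_offsets-style helper), map each token to an id through `vocab`,
    # pass every id list to `consumer`, and return the stats dict that the
    # examples above merge.
    nseq, ntok = 0, 0
    replaced = Counter()
    unk_id = vocab[unk]
    with open(filename, "rb") as f:
        f.seek(offset)
        line = f.readline()
        while line:
            if 0 < end < f.tell():
                break  # everything past `end` belongs to the next chunk
            ids = []
            for word in line.decode("utf-8").split():
                idx = vocab.get(word, unk_id)
                if idx == unk_id and word != unk:
                    replaced.update([word])  # token replaced by <unk>
                ids.append(idx)
            consumer(ids)
            nseq += 1
            ntok += len(ids)
            line = f.readline()
    return {"nseq": nseq, "ntok": ntok,
            "nunk": sum(replaced.values()), "replaced": replaced}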
Example 11
    def make_binary_dataset(input_file, output_prefix, dic, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = args.tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        prefix,
                        dic,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            output_file(args, output_prefix, "bin"))
        merge_result(
            args.tokenizer.binarize(input_file,
                                    dic,
                                    lambda t: ds.add_item(t),
                                    offset=0,
                                    end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = output_file(args, prefix, '')
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(output_file(args, output_prefix, 'idx'))

        print("| {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dic.unk_word))
Example 12
def binarize(args, input_file, output_prefix, dict, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(
        output_file(args, output_prefix, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = args.tokenizer.binarize(input_file,
                                  dict,
                                  consumer,
                                  offset=offset,
                                  end=end)
    ds.finalize(output_file(args, output_prefix, 'idx'))
    return res
Example 13
    def make_binary_dataset(input_prefix, output_prefix):
        # dict = dictionary.Dictionary.load(dict_path(lang))
        # print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix, 'bin'))

        def consumer(ids):
            ds.add_item(torch.IntTensor(ids))

        # input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        input_file = "{}/{}.bert".format(args.data, input_prefix)

        res = tokenizer.binarize(input_file, consumer)
        ds.finalize(dataset_dest_path(output_prefix, 'idx'))
Example 14
    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        print("offsets", offsets)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, dict, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, 'bin'))
        merge_result(
            Tokenizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
Example 15
def make_binary_dataset(input_prefix, output_prefix, lang, src_ids=None):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict)))
    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    res, ids = Tokenizer.binarize(input_file, dict, consumer, src_ids=src_ids)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% replaced by copy'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], dict.unk_word, 100 * res['ncopied'] / res['ntok']))
    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    return ids
Example 16
def binarize(args, filename, dict, output_prefix, lang, offset, end):

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    # {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced}
    to_print = ['nseq', 'nunk', 'ntok']
    debug_data = {}
    for k, v in res.items():
        if k in to_print:
            debug_data[k] = v
    debug_data['offset'] = offset
    debug_data['end'] = end

    print(debug_data)

    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
Example 17
    def make_binary_audio_dataset(input_prefix, output_prefix, lang):
        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file(
            args, output_prefix, lang, 'bin'),
                                                   dtype=np.float32)

        def consumer(tensor):
            ds.add_item(tensor)

        def binarize(input_file, audio_reader, consumer):
            nseq, nsamp = 0, 0
            for tensor in audio_reader(input_file):
                consumer(tensor)
                nseq += 1
                nsamp += tensor.size(0)
            return {'nseq': nseq, 'nsamp': nsamp}

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        audio_reader = get_reader(args.format)
        res = binarize(input_file, audio_reader, consumer)
        print('| [{}] {}: {} audio_seq, {} audio_samples'.format(
            lang, input_file, res['nseq'], res['nsamp']))
        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
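make_binary_audio_dataset only assumes that get_reader(args.format) returns a callable which yields one float tensor per utterance. A hypothetical reader with that shape, for a manifest file listing one .npy feature file per line; numpy_audio_reader and the manifest format are assumptions, not part of the original code.

import numpy as np
import torch

def numpy_audio_reader(input_file):
    # Yield a (frames x feature_dim) float32 tensor per line of the manifest.
    with open(input_file, "r", encoding="utf-8") as f:
        for path in f:
            path = path.strip()
            if not path:
                continue
            feats = np.load(path).astype(np.float32)
            yield torch.from_numpy(feats)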
Example 18
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        print('input_prefix', input_prefix)
        print(dict_path(lang))

        dict = roberta_dictionary.RobertaDictionary.load(dict_path(lang))
        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        from pytorch_transformers import RobertaTokenizer
        import torch

        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

        def penn_token2orig_token(sent):
            # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
            penn2orig = {"``":'"', "''": '"',
                         "-LRB-": '(', "-RRB-": ')',
                         "-LSB-":'[', "-RSB-":']',
                         "-LCB-":'{', "-RCB-":'}',
                         "-lrb-": '(', "-rrb-": ')',
                         "-lsb-": '[', "-rsb-": ']',
                         "-lcb-": '{', "-rcb-": '}',
                         }
            words = sent.strip().split()
            words = [wd if not wd in penn2orig else penn2orig[wd] for wd in words]
            return ' '.join(words)

        num_token, num_unk_token = 0, 0
        num_seq = 0
        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin")
        )
        truncated_number = 512 if lang == 'article' else 256
        CLS_TOKEN = '<s>'
        SEP_TOKEN = '</s>'
        if lang == 'article':
            for line in open(input_file, encoding='utf8'):
                article_wids = []
                min_src_sentence = 3
                max_src_sentence = 100
                max_src_ntokens_per_sent = 200
                min_src_ntokens_per_sent = 5
                sents = line.strip().split('<S_SEP>')
                sents = [sent.strip().split() for sent in sents]
                idxs = [i for i, sent in enumerate(sents) if (len(sent) > min_src_ntokens_per_sent)]
                src = [sents[i][:max_src_ntokens_per_sent] for i in idxs]
                src = src[:max_src_sentence]
                src_txt = [' '.join(sent) for sent in src]
                src_tokens = [tokenizer.tokenize(sent) for sent in src_txt]
                for i, sent in enumerate(src_tokens):
                    MAX_SENT_NTOKENS = 500
                    if len(sent) > MAX_SENT_NTOKENS:
                        sent = sent[:MAX_SENT_NTOKENS]
                    if i == 0:
                        input_text = [CLS_TOKEN] + sent + [SEP_TOKEN]
                    elif i != 0:
                        input_text = [SEP_TOKEN] + sent + [SEP_TOKEN]
                    wids = tokenizer.convert_tokens_to_ids(input_text)
                    article_wids.extend(wids)
                    for wid in wids:
                        if wid == dict.unk_index:
                            num_unk_token += 1
                        num_token += 1
                num_seq += 1
                article_wids = article_wids[:truncated_number] if len(article_wids) > truncated_number else article_wids
                if article_wids[-1] != dict.sep_index:
                    article_wids[-1] = dict.sep_index
                tensor = torch.IntTensor(article_wids)
                # print( dict.string_complete(tensor) )
                ds.add_item(tensor)
            ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
        elif lang == 'summary':
            for line in open(input_file, encoding='utf8'):
                article_wids = []
                max_tgt_ntokens = 500
                min_tgt_ntokens = 5
                sents = line.strip().split('<S_SEP>')
                sents = [tokenizer.tokenize(sent) for sent in sents]
                for i, sent in enumerate(sents):
                    # sometimes, there are too many token in one single sentence
                    # to be specific, there are 8 sentences in the training article longer than 512, so truncate them to 500
                    # MAX_SENT_LEN = 500
                    # if len(sent) > MAX_SENT_LEN:
                    #     sent = sent[:MAX_SENT_LEN]
                    if i != 0:
                        input_text = [SEP_TOKEN] + sent
                    else:
                        input_text = sent
                    wids = tokenizer.convert_tokens_to_ids(input_text)
                    # wtoks = tokenizer.convert_ids_to_tokens(wids)
                    # wstring = tokenizer.convert_tokens_to_string(wtoks)

                    # wids_vocab = [dict.index(word) for word in input_text]
                    # assert wids == wids_vocab, 'word indices should be the same!'
                    article_wids.extend(wids)
                    for wid in wids:
                        if wid == dict.unk_index:
                            num_unk_token += 1
                        num_token += 1

                num_seq += 1
                article_wids = article_wids[:truncated_number] if len(article_wids) > truncated_number else article_wids
                if article_wids[-1] == dict.sep_index:
                    article_wids = article_wids[:len(article_wids)-1]
                # print(article_wids)
                if len(article_wids) > truncated_number:
                    print('lang: %s, token len: %d, truncated len: %d' % (lang, len(article_wids), truncated_number))

                tensor = torch.IntTensor(article_wids)
                # print( dict.string_complete(tensor) )
                ds.add_item(tensor)
            ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, num_seq, num_token,
            100 * num_unk_token / num_token, dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
Example 19
def make_select_databin(data_ids,
                        prefix_dir="eval-select-bin",
                        prefix_name="eval"):
    # prefix = "valid"
    def consumer(ds, ids):
        ds.add_item(torch.IntTensor(ids))

    def consumer_float(ds, ids):
        ds.add_item(torch.FloatTensor(ids))

    query_bin = "/nas/qsj/data-bin-v2/{}/{}-query.bin".format(
        prefix_dir, prefix_name)
    query_idx = "/nas/qsj/data-bin-v2/{}/{}-query.idx".format(
        prefix_dir, prefix_name)

    passage_bin = "/nas/qsj/data-bin-v2/{}/{}-passage.bin".format(
        prefix_dir, prefix_name)
    passage_idx = "/nas/qsj/data-bin-v2/{}/{}-passage.idx".format(
        prefix_dir, prefix_name)

    target_bin = "/nas/qsj/data-bin-v2/{}/{}-target.bin".format(
        prefix_dir, prefix_name)
    target_idx = "/nas/qsj/data-bin-v2/{}/{}-target.idx".format(
        prefix_dir, prefix_name)

    query_ds = indexed_dataset.IndexedDatasetBuilder(query_bin)

    passage_ds = indexed_dataset.IndexedDatasetBuilder(passage_bin)

    target_ds = indexed_dataset.IndexedDatasetBuilder(target_bin,
                                                      dtype=np.float)

    id_to_index_file = open(
        "/nas/qsj/data-bin-v2/{}/{}-id-to-index.pk".format(
            prefix_dir, prefix_name), "wb")
    id_to_index = {}

    data_dict = pickle.load(open(data_ids, "rb"))
    id_ = 0

    above_10 = 0
    below_10 = 0
    for key, data in data_dict.items():
        id_to_index[str(id_)] = key
        id_ += 1
        ids_query = data["ids_query"]
        ids_passages = data["ids_passages"]
        ids_selected = data["selected"]
        assert len(ids_passages) == len(ids_selected)

        len_ = len(ids_passages)
        if len_ > 10:
            ids_passages = ids_passages[:10]
            ids_selected = ids_selected[:10]
            above_10 += 1
        if len_ < 10:
            for _ in range(len_, 10):
                ids_passages.append(ids_passages[-1])
                ids_selected.append(ids_selected[-1])
                below_10 += 1
        assert len(ids_passages) == 10
        assert len(ids_passages) == len(ids_selected)
        for ids_p in ids_passages:
            consumer(query_ds, ids_query)
            consumer(passage_ds, ids_p)
        if prefix_name == "train":
            total_sum = sum(ids_selected)
            if total_sum > 1:
                for i in range(10):
                    ids_selected[i] /= total_sum
        consumer_float(target_ds, ids_selected)

    query_ds.finalize(query_idx)
    passage_ds.finalize(passage_idx)
    target_ds.finalize(target_idx)

    pickle.dump(id_to_index, id_to_index_file)
    print("| above_10 {}, below_10 {}".format(above_10, below_10))
Example 20
def augument_shffle_top5(passages,
                         queries,
                         answers,
                         prefix="train",
                         dir_file="top-5-qa+nlg+shuffle"):
    def consumer(ds, ids):
        ds.add_item(torch.IntTensor(ids))

    query_bin = "/nas/qsj/data-bin-v2/{}/{}-query.bin".format(dir_file, prefix)
    query_idx = "/nas/qsj/data-bin-v2/{}/{}-query.idx".format(dir_file, prefix)

    passage_1_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-1.bin".format(
        dir_file, prefix)
    passage_1_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-1.idx".format(
        dir_file, prefix)

    passage_2_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-2.bin".format(
        dir_file, prefix)
    passage_2_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-2.idx".format(
        dir_file, prefix)

    passage_3_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-3.bin".format(
        dir_file, prefix)
    passage_3_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-3.idx".format(
        dir_file, prefix)

    passage_4_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-4.bin".format(
        dir_file, prefix)
    passage_4_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-4.idx".format(
        dir_file, prefix)

    passage_5_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-5.bin".format(
        dir_file, prefix)
    passage_5_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-5.idx".format(
        dir_file, prefix)

    target_bin = "/nas/qsj/data-bin-v2/{}/{}-target.bin".format(
        dir_file, prefix)
    target_idx = "/nas/qsj/data-bin-v2/{}/{}-target.idx".format(
        dir_file, prefix)

    query_ds = indexed_dataset.IndexedDatasetBuilder(query_bin)

    passage_1_ds = indexed_dataset.IndexedDatasetBuilder(passage_1_bin)

    passage_2_ds = indexed_dataset.IndexedDatasetBuilder(passage_2_bin)

    passage_3_ds = indexed_dataset.IndexedDatasetBuilder(passage_3_bin)

    passage_4_ds = indexed_dataset.IndexedDatasetBuilder(passage_4_bin)

    passage_5_ds = indexed_dataset.IndexedDatasetBuilder(passage_5_bin)

    target_ds = indexed_dataset.IndexedDatasetBuilder(target_bin)

    for passage, query, answer in zip(passages, queries, answers):
        passage_shuffle = passage[2:]

        consumer(query_ds, query)
        consumer(passage_1_ds, passage[0])
        consumer(passage_2_ds, passage[1])
        consumer(passage_3_ds, passage_shuffle[0])
        consumer(passage_4_ds, passage_shuffle[1])
        consumer(passage_5_ds, passage_shuffle[2])
        consumer(target_ds, answer)

        # random.shuffle(passage_shuffle)
        # consumer(query_ds, query)
        # consumer(passage_1_ds, passage[0])
        # consumer(passage_2_ds, passage[1])
        # consumer(passage_3_ds, passage_shuffle[0])
        # consumer(passage_4_ds, passage_shuffle[1])
        # consumer(passage_5_ds, passage_shuffle[2])
        # consumer(target_ds, answer)
    query_ds.finalize(query_idx)
    passage_1_ds.finalize(passage_1_idx)
    passage_2_ds.finalize(passage_2_idx)
    passage_3_ds.finalize(passage_3_idx)
    passage_4_ds.finalize(passage_4_idx)
    passage_5_ds.finalize(passage_5_idx)
    target_ds.finalize(target_idx)
Example 21
    def make_binary_dataset(vocab,
                            input_prefix,
                            output_prefix,
                            lang,
                            num_workers,
                            copy_src_words=None):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()
        copyied = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            copyied.update(worker_result["copied"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:  # todo: not support copy
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin"))
        words_list = []

        def binarize_consumer(ids, words):
            ds.add_item(ids)
            words_list.append(words)

        merge_result(
            Binarizer.binarize(input_file,
                               vocab,
                               binarize_consumer,
                               reverse_order=args.reverse_order,
                               offset=0,
                               end=offsets[1],
                               copy_ext_dict=args.copy_ext_dict,
                               copy_src_words=copy_src_words))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print(
            "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% <unk> copied from src"
            .format(lang, input_file, n_seq_tok[0], n_seq_tok[1],
                    100 * sum(replaced.values()) / n_seq_tok[1],
                    vocab.unk_word,
                    100 * sum(copyied.values()) / n_seq_tok[1]))

        return words_list
Example 22
    def make_binary_dataset(input_prefix, output_prefix, lang, append_eos=False):
        if lang == args.target_lang:
            dict = flexible_dictionary.FlexibleDictionary.load(dict_path(lang))
        else:
            # dict = bert_dictionary.BertDictionary.load(dict_path(lang))
            dict = gpt2_dictionary.GPT2Dictionary.load(dict_path(lang))

        print('| [{}] Dictionary: {} types | {} types (for real)'.format(lang, len(dict) - 1, len(dict)))

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        if lang == args.target_lang:
            res = Tokenizer.binarize(input_file, dict, consumer, append_eos=append_eos)
            print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
                lang, input_file, res['nseq'], res['ntok'],
                100 * res['nunk'] / res['ntok'], dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
        else:
            # read article
            # from pytorch_pretrained_bert.tokenization import BertTokenizer
            # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
            from pytorch_transformers import RobertaTokenizer
            tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

            def penn_token2orig_token(sent):
                # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
                '''
                penn2orig = {"``":'"', "''": '"',
                             "-LRB-": '(', "-RRB-": ')',
                             "-LSB-":'[', "-RSB-":']',
                             "-LCB-":'{', "-RCB-":'}'}
                '''
                penn2orig = {"-LRB-": '(', "-RRB-": ')',
                             "-LSB-": '[', "-RSB-": ']',
                             "-LCB-": '{', "-RCB-": '}',
                             "-lrb-": '(', "-rrb-": ')',
                             "-lsb-": '[', "-rsb-": ']',
                             "-lcb-": '{', "-rcb-": '}',}
                words = sent.strip().split()
                words = [wd if not wd in penn2orig else penn2orig[wd] for wd in words]
                return ' '.join(words)

            num_token, num_unk_token = 0, 0
            num_seq = 0
            skip_line = 0
            for line in open(input_file, encoding='utf8'):
                sents = line.strip().split('<S_SEP>')
                sents = sents[0:args.max_num_sentences]
                sents = [' '.join(sent.strip().split()[0:args.max_num_words]) for sent in sents]
                # print(sents)
                sents = [tokenizer.tokenize(penn_token2orig_token(sent)) for sent in sents]
                article_wids = []
                for i, sent in enumerate(sents):
                    # sometimes there are too many tokens
                    MAXLEN = 500
                    if len(sent) > MAXLEN:
                        # sent = sent[0:MAXLEN]
                        print(' '.join(sent))
                        skip_line += 1
                        print(skip_line)
                        continue
                    if i != 0:
                        article_wids.append( dict.sep_index )
                    wids = tokenizer.convert_tokens_to_ids(sent)
                    # wids_vocab = [dict.index(word) for word in sent]
                    # assert wids == wids_vocab, 'word indices should be the same!'
                    article_wids.extend(wids)
                    for wid in wids:
                        if wid == dict.unk_index:
                            num_unk_token += 1
                        num_token += 1

                num_seq += 1
                tensor = torch.IntTensor(article_wids)
                # print( dict.string_complete(tensor) )
                ds.add_item(tensor)

            print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
                lang, input_file, num_seq, num_token,
                100 * num_unk_token / num_token, dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))

        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
Example 23
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang,
                            num_workers):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        print('input_prefix', input_prefix)
        print(dict_path(lang))

        dict = roberta_dictionary.RobertaDictionary.load(dict_path(lang))
        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        from pytorch_transformers import RobertaTokenizer
        import torch

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        def penn_token2orig_token(sent):
            # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
            penn2orig = {
                "``": '"',
                "''": '"',
                "-LRB-": '(',
                "-RRB-": ')',
                "-LSB-": '[',
                "-RSB-": ']',
                "-LCB-": '{',
                "-RCB-": '}'
            }
            words = sent.strip().split()
            words = [
                wd if not wd in penn2orig else penn2orig[wd] for wd in words
            ]
            return ' '.join(words)

        num_token, num_unk_token = 0, 0
        num_seq = 0
        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin"))
        output_ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, 'article_next', "bin"))
        truncated_number = 512
        output_length = 256

        CLS_TOKEN = '<s>'
        SEP_TOKEN = '</s>'

        for line in open(input_file, encoding='utf8'):
            sents = line.strip().split('<S_SEP>')
            sents = [
                tokenizer.tokenize(penn_token2orig_token(sent))
                for sent in sents
            ]
            article_toks = []
            for i, sent in enumerate(sents):
                if i != 0:
                    article_toks.append(SEP_TOKEN)
                article_toks.extend(sent)
            article_segments = []
            output_segments = []
            tmp_seg = []
            for i, tok in enumerate(article_toks):
                if len(tmp_seg) == 0:
                    tmp_seg.append(CLS_TOKEN)
                tmp_seg.append(tok)
                if tok == SEP_TOKEN:
                    tmp_seg.append(tok)
                if len(tmp_seg) >= truncated_number:
                    tmp_seg = tmp_seg[:truncated_number]
                    if tmp_seg[-1] != SEP_TOKEN:
                        tmp_seg[-1] = SEP_TOKEN
                    tmp_output = article_toks[
                        i + 1:min(i + 1 + output_length, len(article_toks))]
                    if len(tmp_output) < 0.3 * output_length:
                        break
                    article_segments.append(
                        tokenizer.convert_tokens_to_ids(tmp_seg))
                    output_segments.append(
                        tokenizer.convert_tokens_to_ids(tmp_output))
                    tmp_seg = []
            assert len(article_segments) == len(output_segments)
            for i in range(len(article_segments)):
                assert len(article_segments[i]) <= truncated_number
                assert len(output_segments[i]) <= output_length and len(
                    output_segments[i]) >= 0.3 * output_length
                tensor = torch.IntTensor(article_segments[i])
                ds.add_item(tensor)
                output_tensor = torch.IntTensor(output_segments[i])
                output_ds.add_item(output_tensor)

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
        output_ds.finalize(
            dataset_dest_file(args, output_prefix, 'article_next', "idx"))
        print('done!')
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang,
                            num_workers):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        print('input_prefix', input_prefix)
        print(dict_path(lang))

        dict = bert_dictionary.BertDictionary.load(dict_path(lang))
        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        from pytorch_transformers import BertTokenizer
        import torch

        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        def penn_token2orig_token(sent):
            # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
            penn2orig = {
                "``": '"',
                "''": '"',
                "-LRB-": '(',
                "-RRB-": ')',
                "-LSB-": '[',
                "-RSB-": ']',
                "-LCB-": '{',
                "-RCB-": '}'
            }
            words = sent.strip().split()
            words = [
                wd if not wd in penn2orig else penn2orig[wd] for wd in words
            ]
            return ' '.join(words)

        num_token, num_unk_token = 0, 0
        num_seq = 0
        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin"))
        output_ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, 'article_next', "bin"))
        article_input = 511
        article_next = 256
        BERT_CLS_ID = tokenizer.convert_tokens_to_ids([BERT_CLS])[0]
        BERT_SEP_ID = tokenizer.convert_tokens_to_ids([BERT_SEP])[0]
        for line in open(input_file, encoding='utf8'):
            sents = line.strip().split('<S_SEP>')
            sents = [
                tokenizer.tokenize(penn_token2orig_token(sent))
                for sent in sents
            ]
            article_wids = []
            for i, sent in enumerate(sents):
                if i != 0:
                    article_wids.append(dict.sep_index)
                if len(sent) > article_input:

                    wids = []
                    temp_sent = [
                        sent[x:x + article_input]
                        for x in range(0, len(sent), article_input)
                    ]
                    for se in temp_sent:
                        se_ids = tokenizer.convert_tokens_to_ids(se)
                        wids.extend(se_ids)

                else:
                    wids = tokenizer.convert_tokens_to_ids(sent)
                # wids_vocab = [dict.index(word) for word in sent]
                # assert wids == wids_vocab, 'word indices should be the same!'
                article_wids.extend(wids)
                for wid in wids:
                    if wid == dict.unk_index:
                        num_unk_token += 1
                    num_token += 1

            article_segments = [
                article_wids[x:x + article_input]
                for x in range(0, len(article_wids), article_input)
            ]

            cur_position = 0
            for i in range(len(article_segments)):
                article_seq = article_segments[i]
                cur_position += len(article_seq)
                output_seg = article_wids[
                    cur_position:min(len(article_wids), cur_position +
                                     article_next)]
                if len(output_seg) < 0.3 * article_next:
                    continue
                num_seq += 1
                if len(article_seq) > article_input:
                    print('lang: %s, token len: %d, truncated len: %d' %
                          (lang, len(article_seq), article_input))
                if lang == 'article':
                    if article_seq[-1] != BERT_SEP_ID:
                        if article_seq[-2] != BERT_SEP_ID:
                            article_seq[-1] = BERT_SEP_ID
                    article_seq = [BERT_CLS_ID] + article_seq

                if len(output_seg) > article_next:
                    print(
                        'lang: article_next, token len: %d, truncated len: %d'
                        % (len(output_seg), article_next))

                tensor = torch.IntTensor(article_seq)
                ds.add_item(tensor)
                output_tensor = torch.IntTensor(output_seg)
                output_ds.add_item(output_tensor)

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
        output_ds.finalize(
            dataset_dest_file(args, output_prefix, 'article_next', "idx"))
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, num_seq, num_token,
            100 * num_unk_token / num_token,
            dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
Example 25
    def make_binary_dataset(vocab, input_prefix, output_prefix, lang,
                            num_workers):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        print('input_prefix', input_prefix)
        print(dict_path(lang))

        dict = xlnet_dictionary.XLNetDictionary.load(dict_path(lang))
        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        from pytorch_transformers import XLNetConfig, XLNetTokenizer
        import torch

        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

        def penn_token2orig_token(sent):
            # -LRB- -RRB- -LSB- -RSB- -LCB- -RCB-
            penn2orig = {
                "``": '"',
                "''": '"',
                "-LRB-": '(',
                "-RRB-": ')',
                "-LSB-": '[',
                "-RSB-": ']',
                "-LCB-": '{',
                "-RCB-": '}'
            }
            words = sent.strip().split()
            words = [
                wd if not wd in penn2orig else penn2orig[wd] for wd in words
            ]
            return ' '.join(words)

        num_token, num_unk_token = 0, 0
        num_seq = 0
        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin"))
        for line in open(input_file, encoding='utf8'):
            sents = line.strip().split('<S_SEP>')
            sents = [
                tokenizer.tokenize(penn_token2orig_token(sent))
                for sent in sents
            ]
            article_wids = []
            for i, sent in enumerate(sents):
                if i != 0:
                    article_wids.append(dict.sep_index)
                wids = tokenizer.convert_tokens_to_ids(sent)
                # wids_vocab = [dict.index(word) for word in sent]
                # assert wids == wids_vocab, 'word indices should be the same!'
                article_wids.extend(wids)
                for wid in wids:
                    if wid == dict.unk_index:
                        num_unk_token += 1
                    num_token += 1

            num_seq += 1
            tensor = torch.IntTensor(article_wids)
            # print( dict.string_complete(tensor) )
            ds.add_item(tensor)

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, num_seq, num_token,
            100 * num_unk_token / num_token,
            dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
Example 26
    def make_binary_dataset(input_prefix, output_prefix, lang):
        #debugging, do only targets
        #if 'src' in lang:
        #if 'tgt' in lang:
        #    print("skip src files...")
        #    #print("skip tgt files...")
        #    return

        dict = load_dictionary(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

        annotator = None
        aconsumer = None
        if args.addAnnotations:
            brigramFile = None
            if os.path.isfile(args.addAnnotations  + "_bigrams"):
                brigramFile = args.addAnnotations  + "_bigrams"
            if args.addAnnotations.endswith(".mallet"):
                annotator = MalletLDATopicDistAnnotator(args.addAnnotations, args.numTopics, brigramFile)
            else:
                annotator = GensimLDATopicDistAnnotator(args.addAnnotations, args.numTopics, brigramFile)
            #generate embeddings for topic keywords
            annotator.generateKeywordEmbeddings(dataset_dest_path(output_prefix+"_keyEmbeddings", None, 'txt'))
            annotator.generateKeywordDict(dataset_dest_path(output_prefix+"_keyVocab", None, 'txt'))
            ads = indexed_dataset.IndexedDatasetBuilder(
                dataset_dest_path(output_prefix+"_keys", lang, 'bin'),
                dtype=np.double)
            def aconsumer(tensor):
                ads.add_item(tensor)
            awds = indexed_dataset.IndexedDatasetBuilder(
                dataset_dest_path(output_prefix+"_keywords", lang, 'bin'))
            def awconsumer(tensor):
                awds.add_item(tensor)


        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')

        if args.singleSeq:
            if '.src' in input_file:
                res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, L=args.L,
                                                          aconsumer=aconsumer, annotator=annotator)
                TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
            elif '.tgt' in input_file:
                res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer,
                                                          aconsumer=aconsumer, annotator=annotator)
                TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
            else:
                res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, append_eos=False,
                                                          aconsumer=aconsumer, annotator=annotator)
                TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
        else:
            if '.src' in input_file:
                res = TokenizerWCSParagraph.binarize(input_file, dict, consumer,
                                                     max_chunk_length= args.src_chunk_length, L=args.L,
                                                     aconsumer=aconsumer, awconsumer=awconsumer, annotator=annotator)
                TokenizerWCSParagraph.printStats(res, lang, input_file, dict)
            else:
                res = TokenizerWCSSentence.binarize(input_file, dict, consumer, max_chunk_length=args.tgt_chunk_length,
                                                    aconsumer=aconsumer, annotator=annotator)
                TokenizerWCSSentence.printStats(res, lang, input_file, dict)

        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
        if args.addAnnotations:
            ads.finalize(dataset_dest_path(output_prefix+"_keys", lang, 'idx'))
            awds.finalize(dataset_dest_path(output_prefix+"_keywords", lang, 'idx'))
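Once a .bin/.idx pair has been finalized by any of the builders above, it can be sanity-checked by loading it back with the matching reader. A minimal sketch assuming the fairseq layout; the import path and constructor arguments may differ between fairseq versions, and "data-bin/train.article" is a placeholder prefix.

from fairseq.data import indexed_dataset

# path prefix without the .bin/.idx extension
ds = indexed_dataset.IndexedDataset("data-bin/train.article")
print(len(ds))  # number of sequences written
print(ds[0])    # the first stored id tensor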