    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # counts tokens replaced by the unknown symbol

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into byte ranges, one per worker;
        # with multi-processing, workers p1..pN-1 handle the 2nd through last chunks
        # e.g. 1.txt with 10 workers: p0 gets bytes 0-99, p1 gets bytes 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers p1..pN-1 each write their own temporary (data, index) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process (p0) handles the first chunk (up to offsets[1]);
        # without multi-processing it handles the entire file
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize_bpe(input_file,
                                   vocab,
                                   lambda t: ds.add_item(t),
                                   offset=0,
                                   end=offsets[1]))
        if num_workers > 1:
            # wait for workers p1..pN-1 to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, BPE no replaced token".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
            ))
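The `binarize` worker dispatched through `pool.apply_async` is not shown on this page. Below is a minimal sketch of what it plausibly looks like, reusing the argument order of the `apply_async` call and the same `indexed_dataset`/`Binarizer` helpers as the example; this illustrates the per-worker pattern and is not the project's actual helper.

def binarize(args, input_file, vocab, prefix, attr, offset, end):
    """Hypothetical worker: binarize one byte range into a '{prefix}.mmap'/'{prefix}.idx' pair."""
    ds = indexed_dataset.make_builder(
        "{}.mmap".format(prefix),
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab))
    # mirror the in-process Binarizer.binarize_bpe call, but over [offset, end)
    res = Binarizer.binarize_bpe(input_file,
                                 vocab,
                                 lambda t: ds.add_item(t),
                                 offset=offset,
                                 end=end)
    ds.finalize("{}.idx".format(prefix))
    # merge_result() above consumes a dict with "nseq", "ntok" and "replaced" keys
    return res

Defining such a worker at module level keeps it picklable for `multiprocessing.Pool`.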
Example #2
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # counts tokens replaced by the unknown symbol

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into byte ranges, one per worker;
        # with multi-processing, workers p1..pN-1 handle the 2nd through last chunks
        # e.g. 1.txt with 10 workers: p0 gets bytes 0-99, p1 gets bytes 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        # if num_workers > 1:
        #     # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
        #     pool = Pool(processes=num_workers - 1)
        #     for worker_id in range(1, num_workers):
        #         prefix = "{}{}".format(output_file, worker_id)
        #         pool.apply_async(
        #             binarize,
        #             (
        #                 args,
        #                 input_file,
        #                 vocab,
        #                 prefix,
        #                 offsets[worker_id],
        #                 offsets[worker_id + 1]
        #             ),
        #             callback=merge_result
        #         )
        #     pool.close()
        # the main process (p0) handles the first chunk (up to offsets[1]);
        # without multi-processing it handles the entire file
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        if 'code_tokens_wo_func' in os.path.basename(output_file):
            bin_out = Binarizer.binarize_wo_func(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.string_sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'code_tokens' in os.path.basename(input_file):
            bin_out = Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'docstring_tokens' in os.path.basename(input_file):
            bin_out = Binarizer.binarize_bpe(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.lower_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'func_name' in os.path.basename(input_file):
            bin_out = Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.func_name_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        else:
            raise NotImplementedError(os.path.basename(input_file))

        merge_result(bin_out)
        if pool is not None:  # the worker pool is disabled above, so guard on the pool itself
            # wait for workers p1..pN-1 to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
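Because the worker pool is disabled in this variant, the function runs single-process. A hypothetical call pattern exercising the file-name dispatch above, from inside the same enclosing function where `args` and `vocab` are in scope, might look like the following; the `datadir` key, the attribute list, and the single-worker call are illustrative assumptions.

# illustrative only: assumes raw 'train.<attr>' files next to a destdir laid out this way
raw_dir = args['preprocess']['datadir']    # hypothetical key for the raw data location
dest_dir = args['preprocess']['destdir']
for attr in ['code_tokens', 'docstring_tokens', 'func_name']:
    make_binary_dataset(
        vocab,
        input_file=os.path.join(raw_dir, 'train.{}'.format(attr)),
        output_file=os.path.join(dest_dir, 'train.{}'.format(attr)),
        num_workers=1,
    )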
Example #3
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, int(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform each language's code into source and target formats simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers p1..pN-1 each write their own temporary (data, index) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        src_file,
                        prefix,
                        vocab,
                        token_dict,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]

        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
Example #4
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # counts tokens replaced by the unknown symbol

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into byte ranges, one per worker;
        # with multi-processing, workers p1..pN-1 handle the 2nd through last chunks
        # e.g. 1.txt with 10 workers: p0 gets bytes 0-99, p1 gets bytes 100-199, ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers p1..pN-1 each write their own temporary (data, index) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process (p0) handles the first chunk (up to offsets[1]);
        # without multi-processing it handles the entire file
        ds_file = f'{output_file}.mmap'
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=seperate_tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # wait for workers p1..pN-1 to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(
                    indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
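As above, the worker passed to `apply_async` is not shown. A sketch under the assumption that it mirrors the in-process call, writing a per-worker '{prefix}.mmap'/'{prefix}.idx' pair plus a '{prefix}.ext' sequence builder for the start indices; illustrative, not the project's helper.

def binarize(args, input_file, vocab, prefix, offset, end):
    # hypothetical worker: one token builder plus one 'seq' builder for start indices
    ds = indexed_dataset.make_builder(
        f'{prefix}.mmap',
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab))
    ext_ds = indexed_dataset.make_builder(f'{prefix}.ext', impl='seq')

    def consumer(data, start_idx):
        ds.add_item(data)
        ext_ds.add_item(start_idx)

    res = Binarizer.binarize_seperate(
        input_file,
        vocab,
        consumer,
        tokenize=seperate_tokenize,
        offset=offset,
        end=end,
        append_eos=False,
    )
    ds.finalize(f'{prefix}.idx')
    ext_ds.finalize()
    return res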
Example #5
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            use_func, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # counts tokens replaced by the unknown symbol

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = find_offsets(input_file, num_chunks=num_workers)
        func_offsets = None
        modality = input_file.split('.')[-1]
        if modality == 'code_tokens':
            tokenizer = tokenizers.list_tokenizer
            if use_func:
                func_offsets = Binarizer.find_func_offsets(input_file,
                                                           offsets=offsets)
        elif modality == 'func_name':
            tokenizer = tokenizers.func_name_tokenizer
        elif modality == 'docstring_tokens':
            tokenizer = tokenizers.lower_tokenizer
        else:
            raise NotImplementedError(modality)

        pool = None
        if num_workers > 1:
            # workers p1..pN-1 each write their own temporary (data, index) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    tokenizer,
                    use_func and (modality == 'code_tokens'),
                    offsets[worker_id],
                    offsets[worker_id + 1],
                    func_offsets[worker_id] if func_offsets else 0,
                ),
                                 callback=merge_result)
            pool.close()
        # the main process (p0) handles the first chunk (up to offsets[1]);
        # without multi-processing it handles the entire file
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))

        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizer,
                use_func=use_func and (modality == 'code_tokens'),
                offset=offsets[0],
                end=offsets[1],
                func_offset=func_offsets[0] if func_offsets else 0,
                append_eos=False,
                min_func_len=args['preprocess']['min_func_len'],
            ))

        if num_workers > 1:
            # wait for workers p1..pN-1 to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
Example #6
    def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        aux_dict,
                        prefix,
                        lang,
                        tokenize,
                        max_path_num,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result
                )
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        if lang == 'path':
            sz_ds_file = '{}.sz.mmap'.format(output_file)
            sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                                 vocab_size=len(vocab))
        else:
            sz_ds = None

        def consumer(tensor, size=None):
            ds.add_item(tensor)
            if size is not None:
                sz_ds.add_item(size)

        if sz_ds is None:
            merge_result(
                Binarizer.binarize(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False,
                    max_path_num=max_path_num,
                )
            )
        else:
            merge_result(
                PathSummarizationBinarizer.path_binarizer(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False, type_dict=aux_dict,
                    max_path_num=max_path_num,
                )
            )
        if num_workers > 1:
            # wait for workers p1..pN-1 to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                if sz_ds is not None:
                    sz_ds.merge_file_(f"{temp_file_path}.sz")
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                if sz_ds is not None:
                    os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                    os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
        ds.finalize('{}.idx'.format(output_file))
        if sz_ds is not None:
            sz_ds.finalize('{}.sz.idx'.format(output_file))
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )
Example #7
    def make_binary_dataset(vocab, input_file, output_file, lang, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                                 callback=merge_result)
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        tokenize = string2dfs if lang == 'ast' else string2type_dfs
        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # wait for workers p1..pN-1 to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(
                    indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))
Example #8
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained(
        'microsoft/graphcodebert-base')

    file = os.path.join(args['preprocess']['destdir'], 'dfs.jsonl')
    node_dict = Dictionary.load(file)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform each language's code into source and target formats simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess'][
        'tgt_lang']

    # code tokens => code tokens
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code_tokens"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code_tokens")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers p1..pN-1 each write their own temporary (data, index) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_tokens,
                    (args, src_file, vocab, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.subtokenize(line)
                code_tokens = torch.IntTensor(
                    vocab.tokens_to_indices(code_tokens))
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            # wait for workers p1..pN-1 to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))

    # code => code
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")

        ds_file = '{}.bin'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="bin",
                                          vocab_size=len(vocab))
        with open(src_file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                ds.add_item(line)
        ds.finalize('{}.idx'.format(dst_file))

    # dfs => dfs
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.dfs"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.dfs")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers p1..pN-1 each write their own temporary (data, index) file pair
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_dfs,
                    (args, src_file, node_dict, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
                ds.add_item(dfs)
                line = reader.readline()

        if num_workers > 1:
            # wait for workers p1..pN-1 to finish
            pool.join()
            # merge each worker's data and index files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # remove the worker's temporary data and index files
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))
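The `binarize_tokens` and `binarize_dfs` workers dispatched above are not shown either. Assuming `binarize_dfs` simply repeats the main-process dfs loop over its own byte range and writes a '{prefix}.mmap'/'{prefix}.idx' pair, a sketch follows; the choice of `len(node_dict)` for `vocab_size` is a guess, since `vocab` is not passed to this worker.

def binarize_dfs(args, src_file, node_dict, prefix, offset, end):
    # hypothetical worker mirroring the dfs loop in main()
    ds = indexed_dataset.make_builder('{}.mmap'.format(prefix), impl="mmap",
                                      vocab_size=len(node_dict))
    with file_io.open(src_file, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)  # first full line of this worker's range
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
            ds.add_item(dfs)
            line = reader.readline()
    ds.finalize('{}.idx'.format(prefix))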