Code example #1
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".json"

    def bpe_dict_path(lang):
        return dest_path(lang + '.bpe', "dict") + ".json"

    target = not args['preprocess']['only_source']

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        src_dict = task.build_dictionary(
            [train_path(args['preprocess']['source_lang'])],
            tokenize_func=tokenizers.sub_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'],
        )
    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --tgtdict is not specified"
            tgt_dict = task.build_bpe_dictionary(
                [train_path(args['preprocess']['target_lang'])],
                tokenize_func=tokenizers.lower_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=0,
                nwords=args['preprocess']['nwordstgt'],
            )
    else:
        tgt_dict = None

    if args['preprocess']['source_bpe']:
        src_dict_name = bpe_dict_path(args['preprocess']['source_lang'])
    else:
        src_dict_name = dict_path(args['preprocess']['source_lang'])
    src_dict.save_json(src_dict_name)
    if target:
        if args['preprocess']['target_bpe']:
            tgt_dict_name = bpe_dict_path(args['preprocess']['target_lang'])
        else:
            tgt_dict_name = dict_path(args['preprocess']['target_lang'])
        tgt_dict.save_json(tgt_dict_name)

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count out-of-vocabulary tokens that get replaced by <unk>

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into num_workers chunks
        # with multi-processing, workers 1..N-1 handle the 2nd through the last chunk
        # e.g. 1.txt with 10 workers: p0 gets offsets 0-99, p1 gets 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        # if num_workers > 1:
        #     # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
        #     pool = Pool(processes=num_workers - 1)
        #     for worker_id in range(1, num_workers):
        #         prefix = "{}{}".format(output_file, worker_id)
        #         pool.apply_async(
        #             binarize,
        #             (
        #                 args,
        #                 input_file,
        #                 vocab,
        #                 prefix,
        #                 offsets[worker_id],
        #                 offsets[worker_id + 1]
        #             ),
        #             callback=merge_result
        #         )
        #     pool.close()
        # the main process handles the first chunk; without multi-processing it processes the whole file
        # p0 -> [0, end)
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
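        # dispatch on the modality: the wo_func variant is detected via the output name
        # because it reads the same plain code_tokens input file (see make_dataset below)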
        if 'code_tokens_wo_func' in os.path.basename(output_file):
            bin_out = Binarizer.binarize_wo_func(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.string_sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'code_tokens' in os.path.basename(input_file):
            bin_out = Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'docstring_tokens' in os.path.basename(input_file):
            bin_out = Binarizer.binarize_bpe(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.lower_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'func_name' in os.path.basename(input_file):
            bin_out = Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.func_name_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        else:
            raise NotImplementedError(os.path.basename(input_file))

        merge_result(bin_out)
        if num_workers > 1 and pool is not None:  # the worker pool above is currently commented out
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab,
                     input_prefix,
                     output_prefix,
                     lang,
                     num_workers=1,
                     is_bpe=False):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            if lang == 'code_tokens_wo_func':
                in_file = file_name(input_prefix, 'code_tokens')
                out_file = dest_path(output_prefix, lang)
                num_workers = 1  # not support multi-processing
            else:
                in_file = file_name(input_prefix, lang)
                out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            if is_bpe:
                out_file += '.bpe'
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab, is_bpe=False):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         is_bpe=is_bpe)
        if args['preprocess']['validpref']:
            make_dataset(vocab,
                         args['preprocess']['validpref'],
                         "valid",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         is_bpe=is_bpe)
        if args['preprocess']['testpref']:
            make_dataset(vocab,
                         args['preprocess']['testpref'],
                         "test",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         is_bpe=is_bpe)

    make_all(args['preprocess']['source_lang'],
             src_dict,
             is_bpe=args['preprocess']['source_bpe'])
    make_all("code_tokens_wo_func",
             src_dict,
             is_bpe=args['preprocess']['source_bpe'])  # code tokens without the function name
    if target:
        make_all(args['preprocess']['target_lang'],
                 tgt_dict,
                 is_bpe=args['preprocess']['target_bpe'])
        make_all("func_name",
                 tgt_dict,
                 is_bpe=args['preprocess']['target_bpe'])  # func_name
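
Each of these entry points reads its settings from a nested args dict. Below is a minimal sketch of the shape assumed by code example #1; the keys are the ones accessed above, while the concrete values are purely illustrative:

args = {
    'preprocess': {
        'task': 'summarization',            # hypothetical task name
        'destdir': 'data-mmap/python',      # hypothetical output directory
        'trainpref': 'python/train',        # expands to e.g. python/train.code_tokens
        'validpref': 'python/valid',
        'testpref': 'python/test',
        'source_lang': 'code_tokens',
        'target_lang': 'docstring_tokens',
        'only_source': False,
        'source_bpe': False,
        'target_bpe': True,
        'srcdict': None,                    # None: build the dictionary from trainpref
        'tgtdict': None,
        'workers': 1,
        'thresholdsrc': 0,
        'nwordssrc': 50000,
        'nwordstgt': 30000,
        'dataset_impl': 'mmap',
    }
}
main(args)
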
Code example #2
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info(
            'Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = PathManager.ls(
            train_path(args['preprocess']['source_lang']))
        if not args['preprocess']['only_train']:
            filenames.extend(
                PathManager.ls(valid_path(args['preprocess']['source_lang'])))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    # copy shared dict into each language's data directory
    for d in PathManager.ls(os.path.dirname(args['preprocess']['trainpref'])):
        lang = os.path.basename(d)
        src_dict.save(
            os.path.join(args['preprocess']['destdir'], lang,
                         f"{args['preprocess']['source_lang']}.dict.jsonl"))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, num_workers):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count out-of-vocabulary tokens that get replaced by <unk>

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into num_workers chunks
        # with multi-processing, workers 1..N-1 handle the 2nd through the last chunk
        # e.g. 1.txt with 10 workers: p0 gets offsets 0-99, p1 gets 100-199, ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk; without multi-processing it processes the whole file
        # p0 -> [0, end)
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))

        def consumer(data, _):
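            # the binarizer also passes each item's start index; it is ignored in this variant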
            ds.add_item(data)

        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=string2tokens,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            languages = [
                os.path.basename(d)
                for d in PathManager.ls(os.path.dirname(input_prefix))
            ]
            for l in languages:
                in_file = file_name(input_prefix, lang)
                in_file = str.replace(in_file, '*', l)
                out_file = dest_path(os.path.join(l, output_prefix), lang)
                PathManager.mkdir(os.path.dirname(out_file))
                make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
Code example #3
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    from dataset.codexglue.code_to_text import BPE_DIR

    source_dict_file = os.path.join(BPE_DIR, 'csn/csn.spm.vocab')
    target_dict_file = os.path.join(
        os.path.dirname(args['preprocess']['destdir']), 'dict.jsonl')
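    # convert the sentencepiece vocab (one token<TAB>score line each) into the JSONL
    # dictionary format expected by task.load_dictionary; 100 acts as a placeholder count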
    with open(source_dict_file, 'r') as reader, open(target_dict_file,
                                                     'w') as writer:
        for line in reader:
            print(json_io.json_dumps([line.split('\t')[0], 100]), file=writer)
    src_dict = tgt_dict = task.load_dictionary(target_dict_file)

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count out-of-vocabulary tokens that get replaced by <unk>

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into num_workers chunks
        # with multi-processing, workers 1..N-1 handle the 2nd through the last chunk
        # e.g. 1.txt with 10 workers: p0 gets offsets 0-99, p1 gets 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk; without multi-processing it processes the whole file
        # p0 -> [0, end)
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=True,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
Code example #4
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(modality):
        train_files = []
        for lang, value in args['preprocess']['dataprefs'].items():
            train_files.append("{}{}".format(
                args['preprocess']['dataprefs'][lang]['trainpref'],
                ("." + modality) if modality else ""))
        return train_files

    def build_dictionary(filenames, modality, src=False, tgt=False):
        """
        ['code_tokens', 'docstring_tokens', 'path', 'sbt', 'sbtao', 'binary_ast', 'traversal']
        """
        assert src ^ tgt
        if modality in ['binary_ast']:
            tokenize_func = tokenization.json_tokenizer
        elif modality in [
                'code_tokens', 'docstring_tokens', 'path', 'path.terminals',
                'sbt', 'sbtao', 'traversal'
        ]:
            tokenize_func = tokenization.json_tokenizer
        else:
            raise NotImplementedError("{}".format(modality))

        return task.build_dictionary(
            filenames,
            tokenize_func=tokenize_func,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc']
            if src else args['preprocess']['nwordstgt'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    def build_vocab_dict(args):
        """Build vocabulary (dictionary) for source and target domain"""
        LOGGER.info('Build vocabularies...')
        # task = tasks.get_task(args['preprocess']['task'])
        src_dicts = OrderedDict()

        def load_dict(modality):
            modality_dict_filename = os.path.join(
                args['preprocess']['destdir'],
                'data-{}'.format(args['preprocess']['dataset_impl']),
                '{}.dict.json'.format(modality))
            os.makedirs(os.path.dirname(modality_dict_filename), exist_ok=True)
            if os.path.exists(modality_dict_filename):
                LOGGER.info('Loading {} dict from {}'.format(
                    modality, modality_dict_filename))
                modality_dict = Dictionary.load_json(modality_dict_filename)
            else:
                modality_dict = build_dictionary(train_path(modality),
                                                 modality,
                                                 src=True)
                LOGGER.info('Saving {} dict at {}'.format(
                    modality, modality_dict_filename))
                modality_dict.save_json(modality_dict_filename)
            return modality_dict

        if args['preprocess']['joined_dictionary']:
            modalities = args['preprocess']['source_lang'] + [
                args['preprocess']['target_lang']
            ]
            modalities = sorted(
                list(
                    itertools.filterfalse(lambda modality: modality is None,
                                          modalities)))
            joined_dictionary_filename = os.path.join(
                args['preprocess']['destdir'],
                '{}.dict.txt'.format('_'.join(modalities)))
            if os.path.exists(joined_dictionary_filename):
                LOGGER.info('Loading joint dict from {}'.format(
                    joined_dictionary_filename))
                joined_dictionary = Dictionary.load_json(
                    joined_dictionary_filename)
            else:
                joined_dictionary = build_dictionary(
                    [train_path(modality) for modality in modalities],
                    modalities,
                    src=True)
                LOGGER.info('Saving joint dict at {}'.format(
                    joined_dictionary_filename))
                joined_dictionary.save_json(joined_dictionary_filename)

            for modality in modalities:
                src_dicts[modality] = joined_dictionary
            tgt_dict = joined_dictionary
        else:
            # src dict
            for modality in args['preprocess']['source_lang']:
                src_dicts[modality] = load_dict(modality)

            # tgt dict
            if args['preprocess']['target_lang']:
                tgt_dict = load_dict(args['preprocess']['target_lang'])
            else:
                tgt_dict = None

        return src_dicts, tgt_dict

    # 1. build vocabulary
    src_dicts, tgt_dict = build_vocab_dict(args)

    # 2. ***************build dataset********************
    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, impl, lang, modality):
        return os.path.join(args['preprocess']['destdir'],
                            'data-{}'.format(impl), lang,
                            file_name(prefix, modality))

    def make_binary_dataset(dict: Dictionary, input_file, output_file,
                            attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count out-of-vocabulary tokens that get replaced by <unk>

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into num_workers chunks
        # with multi-processing, workers 1..N-1 handle the 2nd through the last chunk
        # e.g. 1.txt with 10 workers: p0 gets offsets 0-99, p1 gets 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, dict, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk; without multi-processing it processes the whole file
        # p0 -> [0, end)
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(dict))
        merge_result(
            Binarizer.binarize(
                input_file,
                dict,
                lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                dict.unk_word,
            ))

    def make_graph_bin_dataset(dict: Dictionary, input_file, output_file,
                               num_workers):
        offsets = Binarizer.find_offsets(input_file, num_workers)
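        # each worker binarizes its own byte range of the AST file into a separate
        # graph shard (output_file0 .. output_fileN-1); the shards are not merged here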
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers)
            for worker_id in range(num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize_dgl,
                    (args, input_file, dict, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()
        else:
            prefix = "{}0".format(output_file)
            binarize_dgl(args, input_file, dict, prefix, 0, -1)

    def make_dataset(vocab,
                     input_prefix,
                     output_prefix,
                     lang,
                     modality,
                     num_workers=1):
        in_file = file_name(input_prefix, modality)
        out_file = dest_path(output_prefix, args['preprocess']['dataset_impl'],
                             lang, modality)
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        if args['preprocess']['dataset_impl'] == "raw":
            LOGGER.info('Copying {} into {}'.format(in_file, out_file))
            shutil.copy(src=in_file, dst=out_file)
        else:
            if modality == 'binary_ast':
                make_graph_bin_dataset(vocab, in_file, out_file, num_workers)
            else:
                make_binary_dataset(vocab, in_file, out_file, modality,
                                    num_workers)

    def make_all(modality, vocab, lang, data_prefs):
        num_workers = min(args['preprocess']['workers'], cpu_count())
        if data_prefs['trainpref']:
            make_dataset(vocab,
                         data_prefs['trainpref'],
                         "train",
                         lang,
                         modality,
                         num_workers=num_workers)
        if data_prefs['validpref']:
            for k, validpref in enumerate(data_prefs['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             modality,
                             num_workers=num_workers)
        if data_prefs['testpref']:
            for k, testpref in enumerate(data_prefs['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             modality,
                             num_workers=num_workers)

    def build_dataset(args: Dict, src_dicts: Dict[str, Dictionary],
                      tgt_dict: Dictionary):
        """build dataset for modal"""
        for modality, src_dict in src_dicts.items():
            LOGGER.info('Building dataset for {}'.format(modality))
            for lang, data_prefs in args['preprocess']['dataprefs'].items():
                make_all(modality, src_dict, lang, data_prefs)

    # 2. build dataset
    build_dataset(args, src_dicts, tgt_dict)
Code example #5
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}.{}".format(args['preprocess']['trainpref'], lang)

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        return f"{prefix}.{lang}"

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if target and not args['preprocess']['tgtdict'] and os.path.exists(
            dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info(
            'Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [
                train_path(args['preprocess']['source_lang']),
                train_path(args['preprocess']['target_lang'])
            ]
            if not args['preprocess']['only_train']:
                filenames.extend( \
                    [valid_path(args['preprocess']['source_lang']), valid_path(args['preprocess']['target_lang'])])
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=json_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                # set max len for joint dictionaries
                nwords=max(args['preprocess']['nwordssrc'],
                           args['preprocess']['nwordstgt']),
            )
        tgt_dict = src_dict

    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --srcdict is not specified"

            filenames = [train_path(args['preprocess']['source_lang'])]
            if not args['preprocess']['only_train']:
                filenames.append(valid_path(args['preprocess']['source_lang']))
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=json_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                nwords=args['preprocess']['nwordssrc'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
            )
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess'][
                    'trainpref'], "--trainpref must be set if --tgtdict is not specified"
                filenames = [train_path(args['preprocess']['target_lang'])]
                if not args['preprocess']['only_train']:
                    filenames.append(
                        valid_path(args['preprocess']['target_lang']))
                tgt_dict = task.build_dictionary(
                    filenames,
                    tokenize_func=json_tokenizer,
                    workers=args['preprocess']['workers'],
                    threshold=args['preprocess']['threshold'],
                    nwords=args['preprocess']['nwordstgt'],
                    padding_factor=args['preprocess']['padding_factor'],
                    bos=None,
                    eos=None,
                )
        else:
            tgt_dict = None

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def seperate_tokenize(line):
        line = json_io.json_loads(line)
        tokens = separate_list(line, args['preprocess']['max_len'])
        return tokens

    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count out-of-vocabulary tokens that get replaced by <unk>

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into num_workers chunks
        # with multi-processing, workers 1..N-1 handle the 2nd through the last chunk
        # e.g. 1.txt with 10 workers: p0 gets offsets 0-99, p1 gets 100-199, ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk; without multi-processing it processes the whole file
        # p0 -> [0, end)
        ds_file = f'{output_file}.mmap'
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')
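        # the companion .ext dataset records the start index of every item emitted
        # by the segment-splitting binarizer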

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=seperate_tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(
                    indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
Code example #6
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        data_files = train_path(args['preprocess']['source_lang'])
        data_files = PathManager.ls(data_files)

        src_dict = task.build_bpe_dictionary(
            data_files,
            tokenize_func=tokenizers.sub_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
            bpe_portion=args['preprocess']['source_bpe_portion'],
        )
    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            data_files = train_path(args['preprocess']['target_lang'])
            if '*' in data_files:
                data_files = glob(data_files)
            else:
                data_files = [data_files]

            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --tgtdict is not specified"
            tgt_dict = task.build_bpe_dictionary(
                data_files,
                tokenize_func=tokenizers.lower_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=0,
                nwords=args['preprocess']['nwordstgt'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
                bpe_portion=args['preprocess']['target_bpe_portion'],
            )
    else:
        tgt_dict = None

    # src_dict.save(dict_path(args['preprocess']['source_lang']))
    # tgt_dict.save(dict_path(args['preprocess']['target_lang']))
    # tgt_dict.save(dict_path("func_name"))  # save target_lang dict for func_name

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            use_func, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count out-of-vocabulary tokens that get replaced by <unk>

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = find_offsets(input_file, num_chunks=num_workers)
        func_offsets = None
        modality = input_file.split('.')[-1]
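        # the file suffix names the modality and selects the tokenizer; for code_tokens,
        # per-function offsets are also located when use_func is set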
        if modality == 'code_tokens':
            tokenizer = tokenizers.list_tokenizer
            if use_func:
                func_offsets = Binarizer.find_func_offsets(input_file,
                                                           offsets=offsets)
        elif modality == 'func_name':
            tokenizer = tokenizers.func_name_tokenizer
        elif modality == 'docstring_tokens':
            tokenizer = tokenizers.lower_tokenizer
        else:
            raise NotImplementedError(modality)

        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    tokenizer,
                    use_func and (modality == 'code_tokens'),
                    offsets[worker_id],
                    offsets[worker_id + 1],
                    func_offsets[worker_id] if func_offsets else 0,
                ),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk; without multi-processing it processes the whole file
        # p0 -> [0, end)
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))

        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizer,
                use_func=use_func and (modality == 'code_tokens'),
                offset=offsets[0],
                end=offsets[1],
                func_offset=func_offsets[0] if func_offsets else 0,
                append_eos=False,
                min_func_len=args['preprocess']['min_func_len'],
            ))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab,
                     input_prefix,
                     output_prefix,
                     lang,
                     use_func=False,
                     num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_files = file_name(input_prefix, lang)
            if '*' in in_files:
                in_files = glob(in_files)
            else:
                in_files = [in_files]
            for in_file in in_files:
                if lang == 'code_tokens' and use_func:
                    suffix = f'{lang}.wo_func'
                else:
                    suffix = lang
                out_file = dest_path(output_prefix,
                                     f'{in_file.split(os.sep)[-2]}.{suffix}')
                os.makedirs(os.path.dirname(out_file), exist_ok=True)
                make_binary_dataset(vocab, in_file, out_file, use_func,
                                    num_workers)

    def make_all(lang, vocab, use_func=False):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         use_func=use_func)
        if args['preprocess']['validpref']:
            make_dataset(vocab,
                         args['preprocess']['validpref'],
                         "valid",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         use_func=use_func)
        if args['preprocess']['testpref']:
            make_dataset(vocab,
                         args['preprocess']['testpref'],
                         "test",
                         lang,
                         num_workers=args['preprocess']['workers'],
                         use_func=use_func)

    make_all(args['preprocess']['source_lang'], src_dict)
    make_all(args['preprocess']['source_lang'], src_dict, use_func=True)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
        make_all('func_name', tgt_dict)  # func_name as query
Code example #7
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info(
            'Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=bin_ast_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
        )
        src_dict.save(dict_path(args['preprocess']['source_lang']))

    # 2. ***************build dataset********************
    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            offsets = find_offsets(in_file, num_workers)
            with Pool(num_workers) as mpool:
                results = [
                    mpool.apply_async(
                        build_dgl_graph,
                        (vocab, in_file, f'{out_file}{worker_id}.mmap',
                         offsets[worker_id], offsets[worker_id + 1]),
                    ) for worker_id in range(num_workers)
                ]
                results = [res.get() for res in results]
            graph_batch = []
            for worker_id in range(num_workers):
                sub_file = f'{out_file}{worker_id}.mmap'
                glist, _ = load_graphs(sub_file)
                graph_batch.extend(glist)
                os.remove(sub_file)
            save_graphs(f'{out_file}.mmap', graph_batch)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
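
The make_dataset helper above leans on find_offsets to carve the input file into byte ranges that each worker can consume independently. As a rough stdlib-only sketch of that idea (find_line_offsets is a hypothetical helper, not the naturalcc implementation), one can seek to evenly spaced byte positions and skip ahead to the next newline so every chunk starts on a line boundary:

import os

def find_line_offsets(path, num_chunks):
    """Sketch: split a text file into num_chunks byte ranges aligned to line starts."""
    size = os.path.getsize(path)
    step = size // num_chunks
    offsets = [0]
    with open(path, "rb") as f:
        for i in range(1, num_chunks):
            f.seek(step * i)
            f.readline()  # discard the partial line so the next chunk starts cleanly
            offsets.append(f.tell())
    offsets.append(size)
    return offsets  # len(offsets) == num_chunks + 1, like find_offsets above

Each worker then reads only the bytes between offsets[worker_id] and offsets[worker_id + 1], which is why every apply_async call in these examples passes exactly that pair.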
Code Example #8
from dataset.codexglue.code_to_text import (
    LANGUAGES,
    MODES,
)
from ncc import tasks
from ncc.data import (
    Dictionary,
    indexed_dataset,
)
from ncc.utils.file_ops.yaml_io import recursive_expanduser
from ncc.utils.file_ops import file_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    task = tasks.get_task('multilingual_denoising')
    base_dir = recursive_expanduser(
        '~/ncc_data/codexglue/code_to_text/multilingual_denoising/data-mmap')

    dict_file = os.path.join(base_dir, 'dict.jsonl')
    vocab = task.load_dictionary(dict_file)

    for mode in MODES:
        dst_file = os.path.join(base_dir, 'docstring', f"{mode}.docstring.spm")
        PathManager.mkdir(os.path.dirname(dst_file))
        # mmap
        ds = indexed_dataset.make_builder(f'{dst_file}.mmap',
                                          impl='mmap',
                                          vocab_size=len(vocab))
        for lang in LANGUAGES:
            src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
Code Example #9
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path("dict", lang) + ".txt"

    target = not args['preprocess']['only_source']

    # 1. build vocabulary from bpe directory
    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if target and not args['preprocess']['tgtdict'] and os.path.exists(
            dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            LOGGER.error(
                'Please run sentencepiece to generate the model and vocab files first.'
            )
            exit()

        tgt_dict = src_dict

        # Load sentencepiece (sp) module
        if args['preprocess']['src_sp']:
            src_sp = spm.SentencePieceProcessor()
            src_sp.load(args['preprocess']['src_sp'])
        elif args['preprocess']['tgt_sp']:
            src_sp = spm.SentencePieceProcessor()
            src_sp.load(args['preprocess']['tgt_sp'])
        else:
            LOGGER.error('Please assign the sentencepiece model path.')
            exit()
        tgt_sp = src_sp

    else:
        if args['preprocess']['srcdict'] and args['preprocess']['src_sp']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
            src_sp = spm.SentencePieceProcessor()
            src_sp.load(args['preprocess']['src_sp'])
        else:
            LOGGER.error(
                'Please run sentencepiece to generate the model and vocab files first.'
            )
            exit()

        if target:
            if args['preprocess']['tgtdict'] and args['preprocess']['tgt_sp']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
                tgt_sp = spm.SentencePieceProcessor()
                tgt_sp.load(args['preprocess']['tgt_sp'])
            else:
                # assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
                # tgt_dict = build_dictionary([train_path(args['preprocess']['target_lang'])], tgt=True)
                LOGGER.error(
                    'Please run sentencepiece to generate the model and vocab files first.'
                )
                exit()
        else:
            tgt_dict = None
            tgt_sp = None
    # exit()
    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the input file into chunks by byte offsets
        # with multi-processing, workers 1..N-1 handle the 2nd through last chunks
        # e.g. 1.txt with 10 workers -> p0 gets bytes 0-99, p1 gets 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk (or the whole file when single-threaded)
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize_bpe(input_file,
                                   vocab,
                                   lambda t: ds.add_item(t),
                                   offset=0,
                                   end=offsets[1]))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, BPE no replaced token".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
            ))

    def make_dataset(vocab,
                     sp,
                     input_prefix,
                     output_prefix,
                     lang,
                     num_workers=1):
        if args['preprocess']['dataset_impl'] == 'raw':
            with open(file_name(input_prefix, lang), 'rb') as input_file, open(
                    dest_path(output_prefix,
                              lang), 'w', encoding="utf-8") as output_file:
                for line in input_file.readlines(
                )[0:100]:  # TODO only for debug
                    line = ujson.loads(line)
                    line = normalize_program(line)
                    line = sp.EncodeAsPieces(line)
                    output_file.write(ujson.dumps(line) + '\n')
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab, sp):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         sp,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             sp,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             sp,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    # 2. build dataset
    make_all(args['preprocess']['source_lang'], src_dict, src_sp)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict, tgt_sp)
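
Code Example #9 assumes a SentencePiece model has already been trained for both the source and the target side. As a minimal, standalone sketch (the model path is hypothetical), loading such a model and splitting a code string into sub-word pieces looks like this:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("code_bpe.model")  # hypothetical model file produced by spm_train beforehand
pieces = sp.EncodeAsPieces("def add(a, b): return a + b")
ids = sp.EncodeAsIds("def add(a, b): return a + b")
print(pieces)  # sub-word pieces, roughly what the raw branch writes out as JSON lines
print(ids)     # integer ids under the SentencePiece vocabulary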
Code Example #10
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    # 1. ***************build vocabulary***************
    task = tasks.get_task(args['preprocess']['task'])

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['subtokendict']:
        subtoken_dict = task.load_dictionary(args['preprocess']['subtokendict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        subtoken_dict = task.build_dictionary(
            filenames,
            tokenize_func=subtoken_tokenize,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssubtoken'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    if args['preprocess']['typedict']:
        type_dict = task.load_dictionary(args['preprocess']['typedict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        type_dict = task.build_dictionary(
            filenames,
            tokenize_func=type_tokenize,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordstype'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None, eos=None,
        )

    if args['preprocess']['docstringdict']:
        docstring_dict = task.load_dictionary(args['preprocess']['docstringdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = [train_path(args['preprocess']['target_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['target_lang']))
        docstring_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordsdocstring'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    subtoken_dict.save(dict_path('subtoken'))
    type_dict.save(dict_path('type'))
    docstring_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        aux_dict,
                        prefix,
                        lang,
                        tokenize,
                        max_path_num,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result
                )
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        if lang == 'path':
            sz_ds_file = '{}.sz.mmap'.format(output_file)
            sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                                 vocab_size=len(vocab))
        else:
            sz_ds = None

        def consumer(tensor, size=None):
            ds.add_item(tensor)
            if size is not None:
                sz_ds.add_item(size)

        if sz_ds is None:
            merge_result(
                Binarizer.binarize(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False,
                    max_path_num=max_path_num,
                )
            )
        else:
            merge_result(
                PathSummarizationBinarizer.path_binarizer(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False, type_dict=aux_dict,
                    max_path_num=max_path_num,
                )
            )
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                if sz_ds is not None:
                    sz_ds.merge_file_(f"{temp_file_path}.sz")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                if sz_ds is not None:
                    os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                    os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
        ds.finalize('{}.idx'.format(output_file))
        if sz_ds is not None:
            sz_ds.finalize('{}.sz.idx'.format(output_file))
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )

    def make_dataset(vocab, aux_dict, input_prefix, output_prefix, lang, max_path_num, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, aux_dict, in_file, out_file, lang, max_path_num, num_workers)

    def make_all(lang, vocab, aux_dict=None):
        if args['preprocess']['trainpref']:
            max_path_num = args['preprocess']['train_path_num']
            make_dataset(vocab, aux_dict, args['preprocess']['trainpref'], "train", lang, max_path_num,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            max_path_num = args['preprocess']['eval_path_num']
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, aux_dict, validpref, outprefix, lang, max_path_num,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            max_path_num = args['preprocess']['eval_path_num']
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, aux_dict, testpref, outprefix, lang, max_path_num,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], subtoken_dict, type_dict)
    make_all(args['preprocess']['target_lang'], docstring_dict)
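
Every make_binary_dataset variant in this listing follows the same shard-then-merge discipline: workers 1..N-1 each write to a temporary file prefixed with their worker id, the parent process handles chunk 0 itself, and the temporaries are appended to the final output and removed at the end. A stdlib-only sketch of that discipline with plain byte shards (all names are illustrative, and it assumes more than one worker):

import os
from multiprocessing import Pool

def copy_chunk(src_path, out_prefix, start, end, worker_id):
    """Stand-in for binarize(): copy the byte range [start, end) into a per-worker shard."""
    shard = f"{out_prefix}{worker_id}"
    with open(src_path, "rb") as fin, open(shard, "wb") as fout:
        fin.seek(start)
        fout.write(fin.read(end - start))
    return shard

def build(src_path, out_path, offsets):
    num_workers = len(offsets) - 1  # offsets has one more entry than workers
    with Pool(num_workers - 1) as pool:
        jobs = [pool.apply_async(copy_chunk,
                                 (src_path, out_path, offsets[i], offsets[i + 1], i))
                for i in range(1, num_workers)]
        # the parent handles chunk 0 while the pool is busy, as in the examples above
        with open(out_path, "wb") as fout, open(src_path, "rb") as fin:
            fout.write(fin.read(offsets[1]))
        shards = [job.get() for job in jobs]
    with open(out_path, "ab") as fout:
        for shard in shards:  # merge shards in worker order, then delete them
            with open(shard, "rb") as fin:
                fout.write(fin.read())
            os.remove(shard)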
Code Example #11
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    # 1. build vocabulary
    LOGGER.info('Build vocabularies...')

    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if not args['preprocess']['tgtdict'] and os.path.exists(
            dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"
        src_dict = task.build_dictionary(
            [train_path(args['preprocess']['source_lang'])],
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
            unk=None,
        )

    if args['preprocess']['tgtdict']:
        tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"
        tgt_dict = task.build_dictionary(
            filenames=[],
            tokenize_func=label_tokenization,
            workers=args['preprocess']['workers'],
            padding_factor=args['preprocess']['padding_factor'],
            pad=None,
            bos=None,
            eos=None,
            unk=None,
        )
        tgt_dict.add_symbol('CPU', 0)
        tgt_dict.add_symbol('GPU', 1)

    LOGGER.info('dict_path: {}'.format(
        dict_path(args['preprocess']['source_lang'])))
    src_dict.save(dict_path(args['preprocess']['source_lang']))
    LOGGER.info('dict_path: {}'.format(
        dict_path(args['preprocess']['target_lang'])))
    tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab)))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the input file into chunks by byte offsets
        # with multi-processing, workers 1..N-1 handle the 2nd through last chunks
        # e.g. 1.txt with 10 workers -> p0 gets bytes 0-99, p1 gets 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk (or the whole file when single-threaded)
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=label_tokenization
                if attr == 'oracle' else tokenization.json_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    make_all(args['preprocess']['target_lang'], tgt_dict)

    # dump other attributes
    for lang in args['preprocess']['auxiliary_langs']:
        src_file = f"{args['preprocess']['trainpref']}.{lang}"
        tgt_file = os.path.join(args['preprocess']['destdir'], f"train.{lang}")
        with open(src_file, 'r') as reader, open(tgt_file, 'wb') as writer:
            data = [eval(line.strip()) for line in reader]
            pickle.dump(data, file=writer)
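
The loop at the end of Code Example #11 serializes the auxiliary attributes of the training split as one pickled Python list per language. A minimal sketch of reading such a dump back (the path is hypothetical and mirrors the tgt_file naming above):

import pickle

with open("data-mmap/train.aux_attr", "rb") as reader:  # hypothetical file written by pickle.dump above
    records = pickle.load(reader)
print(len(records), "auxiliary records loaded")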
Code Example #12
def main(args):
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    # 1. ***************build vocabulary***************
    task = tasks.get_task(args['preprocess']['task'])

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    def string2dfs(line):
        line = json_io.json_loads(line)
        asts = py150_util.separate_dps(line, args['preprocess']['max_len'])
        ast_dfs = [[py150_util.get_dfs(ast), ext] for ast, ext in asts
                   if len(ast) > 1]
        return ast_dfs

    def string2type_dfs(line):
        type_dfs = type_tokenize_func(line)
        type_dfs = py150_util.separate_dps(type_dfs,
                                           args['preprocess']['max_len'])
        type_dfs = [[dfs, ext] for dfs, ext in type_dfs if len(dfs) > 1]
        return type_dfs

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    target = not args['preprocess']['only_source']

    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if target and not args['preprocess']['tgtdict'] and os.path.exists(
            dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info(
            'Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenize_func,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )
    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --tgtdict is not specified"
            # code_types are from ast
            filenames = [train_path(args['preprocess']['source_lang'])]
            if not args['preprocess']['only_train']:
                filenames.append(valid_path(args['preprocess']['source_lang']))
            tgt_dict = task.build_dictionary(
                filenames,
                tokenize_func=type_tokenize_func,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                nwords=args['preprocess']['nwordstgt'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
            )
    else:
        tgt_dict = None

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, lang, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                                 callback=merge_result)
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        tokenize = string2dfs if lang == 'ast' else string2type_dfs
        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(
                    indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            # TODO: parse the json file into a txt file, one traversal per line; this could be parallelized further.
            """
            Because only one thread is allowed to write the output file, we process the data with
            multi-processing, merge each batch of results into a block, and then dump that block.
            """
            def _func(line):
                line = py150_util.separate_dps(
                    json_io.json_loads(line.strip()),
                    args['preprocess']['n_ctx'])
                line = [
                    py150_util.get_dfs(ast) + [ext] for ast, ext in line
                    if len(ast) > 1
                ]
                # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
                return line

            with PPool() as thread_pool:
                with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                    file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                    def _write(result):
                        for res in itertools.chain(*result):
                            print(json_io.json_dumps(res), file=fout)

                    batch_data = []
                    for line in f:
                        batch_data.append(line)
                        if len(batch_data) >= MAX_BATCH_SIZE:
                            result = thread_pool.feed(_func,
                                                      batch_data,
                                                      one_params=True)
                            _write(result)
                            del batch_data
                            batch_data = []

                    if len(batch_data) > 0:
                        result = thread_pool.feed(_func,
                                                  batch_data,
                                                  one_params=True)
                        _write(result)
                        del batch_data
        else:
            if lang == 'code_types':
                in_file = file_name(input_prefix, 'ast')
            else:
                in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
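
The raw branch of Code Example #12 batches input lines, maps a pure function over each batch in a worker pool, and keeps the file writing in the parent process. A stdlib-only sketch of that batching pattern (transform is a stand-in for _func above; the paths and batch size are illustrative):

import json
from multiprocessing import Pool

def transform(line):
    # stand-in for the per-line AST transform (_func) above
    record = json.loads(line)
    return json.dumps(record)

def run(in_path="train.ast", out_path="train.ast.flat", batch_size=10000, workers=4):
    with Pool(workers) as pool, open(in_path) as fin, open(out_path, "w") as fout:
        batch = []
        for line in fin:
            batch.append(line)
            if len(batch) >= batch_size:
                for res in pool.map(transform, batch):
                    print(res, file=fout)  # only the parent process writes
                batch = []
        if batch:
            for res in pool.map(transform, batch):
                print(res, file=fout)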
Code Example #13
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [
                train_path(args['preprocess']['source_lang']),
                train_path(args['preprocess']['target_lang'])
            ]
            if not args['preprocess']['only_train']:
                filenames.extend( \
                    [valid_path(args['preprocess']['source_lang']), valid_path(args['preprocess']['target_lang'])])
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                # use the larger nwords limit for the joint dictionary
                nwords=max(args['preprocess']['nwordssrc'],
                           args['preprocess']['nwordstgt']),
            )
        tgt_dict = src_dict

    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --srcdict is not specified"

            filenames = PathManager.ls(
                train_path(args['preprocess']['source_lang']))
            if not args['preprocess']['only_train']:
                filenames.extend(
                    PathManager.ls(
                        valid_path(args['preprocess']['source_lang'])))
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['thresholdsrc'],
                nwords=args['preprocess']['nwordssrc'],
                padding_factor=args['preprocess']['padding_factor'],
            )
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess'][
                    'trainpref'], "--trainpref must be set if --tgtdict is not specified"
                filenames = PathManager.ls(
                    train_path(args['preprocess']['target_lang']))
                if not args['preprocess']['only_train']:
                    filenames.extend(
                        PathManager.ls(
                            valid_path(args['preprocess']['target_lang'])))
                tgt_dict = task.build_dictionary(
                    filenames,
                    tokenize_func=tokenization.dpu_sub_tokenizer,
                    workers=args['preprocess']['workers'],
                    threshold=args['preprocess']['thresholdtgt'],
                    nwords=args['preprocess']['nwordstgt'],
                    padding_factor=args['preprocess']['padding_factor'],
                )
        else:
            tgt_dict = None

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the input file into chunks by byte offsets
        # with multi-processing, workers 1..N-1 handle the 2nd through last chunks
        # e.g. 1.txt with 10 workers -> p0 gets bytes 0-99, p1 gets 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk (or the whole file when single-threaded)
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenization.dpu_sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab,
                     input_prefix,
                     output_prefix,
                     lang,
                     out_file=None,
                     num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            if out_file is None:
                out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
            # copy the shared dict into each language's directory
            out_dir = os.path.join(args['preprocess']['destdir'], l)
            PathManager.mkdir(out_dir)
            dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
            PathManager.copy(dict_path(lang), dst_dict)

            if args['preprocess']['trainpref']:
                out_file = os.path.join(out_dir, f"train.{lang}")
                make_dataset(vocab,
                             args['preprocess']['trainpref'].replace('*', l),
                             "train",
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['validpref']:
                out_file = os.path.join(out_dir, f"valid.{lang}")
                make_dataset(vocab,
                             args['preprocess']['validpref'].replace('*', l),
                             'valid',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['testpref']:
                out_file = os.path.join(out_dir, f"test.{lang}")
                make_dataset(vocab,
                             args['preprocess']['testpref'].replace('*', l),
                             'test',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
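
make_all in Code Example #13 treats the '*' inside trainpref as a per-language placeholder: it lists the directory in front of the '*' and substitutes every entry back into the prefix before building each dataset. A small sketch of that expansion (the paths are illustrative):

import os

trainpref = "/data/retrieval/*/train"  # hypothetical pattern with a per-language wildcard
base_dir = trainpref.split("*")[0]     # "/data/retrieval/"
for lang_dir in os.listdir(base_dir):
    resolved = trainpref.replace("*", lang_dir)  # e.g. "/data/retrieval/python/train"
    print(lang_dir, "->", resolved)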
Code Example #14
File: preprocess.py  Project: CGCL-codes/naturalcc
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".json"

    def build_dictionary(filenames, modality, src=False, tgt=False):
        assert src ^ tgt
        if modality in ['binary_ast']:
            tokenize_func = tokenization.json_tokenizer
        elif modality in ['code_tokens', 'docstring_tokens', 'sbt', 'sbtao', 'path']:
            tokenize_func = tokenization.json_tokenizer

        return task.build_dictionary(
            filenames,
            tokenize_func=tokenize_func,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'] if src else args['preprocess']['nwordstgt'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    # 1. build vocabulary
    LOGGER.info('Build vocabularies...')
    target = not args['preprocess']['only_source']

    # if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
    #     raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    # if target and not args['preprocess']['tgtdict'] and os.path.exists(dict_path(args['preprocess']['target_lang'])):
    #     raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"

        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary(
                {train_path(lang) for lang in [args['preprocess']['source_lang'], args['preprocess']['target_lang']]},
                args['preprocess']['source_lang'], src=True
            )
        tgt_dict = src_dict
    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args['preprocess']['source_lang'])],
                                        args['preprocess']['source_lang'], src=True)

        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args['preprocess']['target_lang'])],
                                            args['preprocess']['target_lang'], tgt=True)
        else:
            tgt_dict = None

    LOGGER.info('dict_path: {}'.format(dict_path(args['preprocess']['source_lang'])))
    src_dict.save_json(dict_path(args['preprocess']['source_lang']))
    if target and tgt_dict is not None:
        tgt_dict.save_json(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the input file into chunks by byte offsets
        # with multi-processing, workers 1..N-1 handle the 2nd through last chunks
        # e.g. 1.txt with 10 workers -> p0 gets bytes 0-99, p1 gets 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        attr,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                    callback=merge_result
                )
            pool.close()
        # the main process handles the first chunk (or the whole file when single-threaded)
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer, offset=0, end=offsets[1], append_eos=False,
            )
        )
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the sub-processes' index and data files into the final files, then delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )

    def make_graph_binary_dataset(vocab: Dictionary, input_file, output_file):
        import torch
        from dgl.data.graph_serialize import GraphData
        from dgl.data.utils import save_graphs
        from tqdm import tqdm

        graph_batch, ids = [], []
        with open(input_file, 'r') as reader:
            num_lines = sum(1 for _ in reader)
            reader.seek(0)
            for idx, line in tqdm(enumerate(reader), total=num_lines):
                ast = ujson.loads(line)
                graph = tree2dgl(ast, vocab)
                graph = GraphData.create(graph)
                graph_batch.append(graph)
                ids.append(idx)
        graph_labels = {"glabel": torch.IntTensor(ids)}
        save_graphs(output_file + '.mmap', graph_batch, graph_labels)

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
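
make_graph_binary_dataset in Code Example #14 stores the converted ASTs as one batch of DGL graphs together with integer labels. A minimal sketch of that save/reload round trip, assuming DGL and PyTorch are installed (the graphs and the output path are toy stand-ins for the converted ASTs):

import torch
import dgl
from dgl.data.utils import save_graphs, load_graphs

# two tiny graphs standing in for the ASTs produced by tree2dgl
graphs = [dgl.graph(([0, 1], [1, 2])), dgl.graph(([0, 0], [1, 2]))]
graph_labels = {"glabel": torch.IntTensor([0, 1])}  # one id per graph, as in the example

save_graphs("train.ast.mmap", graphs, graph_labels)  # hypothetical output path
loaded_graphs, label_dict = load_graphs("train.ast.mmap")
print(len(loaded_graphs), label_dict["glabel"])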