Example #1
    def make_all(lang, vocab):
        for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
            # copy the shared dict into each language's directory
            out_dir = os.path.join(args['preprocess']['destdir'], l)
            PathManager.mkdir(out_dir)
            dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
            PathManager.copy(dict_path(lang), dst_dict)

            if args['preprocess']['trainpref']:
                out_file = os.path.join(out_dir, f"train.{lang}")
                make_dataset(vocab,
                             args['preprocess']['trainpref'].replace('*', l),
                             "train",
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['validpref']:
                out_file = os.path.join(out_dir, f"valid.{lang}")
                make_dataset(vocab,
                             args['preprocess']['validpref'].replace('*', l),
                             'valid',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['testpref']:
                out_file = os.path.join(out_dir, f"test.{lang}")
                make_dataset(vocab,
                             args['preprocess']['testpref'].replace('*', l),
                             'test',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
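A quick standalone check of the '*' placeholder handling used above; the paths are made up, and the loop stands in for os.listdir over the wildcard's parent directory:

trainpref = "/data/flatten/*/train"      # hypothetical value of args['preprocess']['trainpref']
base_dir = trainpref.split('*')[0]       # "/data/flatten/" -- the directory the real code lists
for l in ["python", "java"]:             # stands in for os.listdir(base_dir)
    print(trainpref.replace('*', l))
# /data/flatten/python/train
# /data/flatten/java/train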
Example #2
 def save_lang_dict():
     src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
     lang_dict = Dictionary.load(src_file)  # avoid shadowing the builtin dict
     tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
     PathManager.mkdir(os.path.dirname(tgt_file))
     lang_dict.save(tgt_file)
     return lang_dict
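The same "create the parent directory, then write" pattern with only the standard library, assuming PathManager.mkdir behaves like os.makedirs(..., exist_ok=True); the helper name and paths are placeholders:

import os
import shutil

def copy_shared_dict(src_file, dst_dir, name="lang.jsonl"):
    tgt_file = os.path.join(dst_dir, name)
    os.makedirs(os.path.dirname(tgt_file), exist_ok=True)  # stand-in for PathManager.mkdir
    shutil.copyfile(src_file, tgt_file)
    return tgt_file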
Example #3
    def docstring_tokens_fn(filename,
                            dest_filename,
                            idx,
                            start=0,
                            end=-1,
                            *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring_tokens = json_io.json_loads(line)
                if docstring_tokens:
                    docstring_tokens = [
                        token for token in docstring_tokens \
                        if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                    ]
                    if not all(
                            str.isascii(token) for token in docstring_tokens):
                        docstring_tokens = None
                    if (docstring_tokens is
                            None) or not (3 < len(docstring_tokens) <= 50):
                        docstring_tokens = None
                else:
                    docstring_tokens = None
                print(json_io.json_dumps(docstring_tokens), file=writer)
                line = safe_readline(reader)
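A toy run of the docstring-token filter above (regex, ASCII and length checks), with no ncc dependencies:

import re

docstring_tokens = ["Sorts", "the", "input", "list", "====", "<p>", "in", "place"]
docstring_tokens = [
    token for token in docstring_tokens
    if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
]
if not all(str.isascii(token) for token in docstring_tokens) \
        or not (3 < len(docstring_tokens) <= 50):
    docstring_tokens = None
print(docstring_tokens)  # ['Sorts', 'the', 'input', 'list', 'in', 'place']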
Example #4
 def _save(self, f, kv_iterator):
     if isinstance(f, str):
         PathManager.mkdir(os.path.dirname(f))
         with file_io.open(f, "w") as fd:
             return self.save(fd)
     for k, v in kv_iterator:
         print(json_io.json_dumps([k, v]), file=f)
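A standalone equivalent of the _save pattern above, using plain json/os in place of the ncc file_io/json_io helpers (assumed to behave similarly); the path and counts are placeholders:

import json
import os

def save_kv(path, kv_iterator):
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w") as fd:
        for k, v in kv_iterator:
            print(json.dumps([k, v]), file=fd)

save_kv("/tmp/demo/dict.jsonl", {"def": 120, "return": 98}.items())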
Example #5
    def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                code_tokens = json_io.json_loads(line)
                if code_tokens:
                    # filter comments out of code_tokens, e.g. '//...', '#...', '/* ... */'
                    code_tokens = [token for token in code_tokens
                                   if not (str.startswith(token, '//') or str.startswith(token, '#') or \
                                           (str.startswith(token, '/*') and str.endswith(token, '*/')))
                                   ]

                    if not all(str.isascii(token) for token in code_tokens):
                        code_tokens = None
                    if code_tokens is None or len(code_tokens) < 1:
                        code_tokens = None
                else:
                    code_tokens = None

                print(json_io.json_dumps(code_tokens), file=writer)
                line = safe_readline(reader)
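The comment filter above, checked on a toy token list:

code_tokens = ["def", "foo", "# a comment", "//inline", "/* block */", "return"]
code_tokens = [
    token for token in code_tokens
    if not (token.startswith('//') or token.startswith('#')
            or (token.startswith('/*') and token.endswith('*/')))
]
print(code_tokens)  # ['def', 'foo', 'return']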
Example #6
def main():
    from dataset.py150 import (RAW_DIR, ATTRIBUTES_DIR, )
    from ncc.utils.path_manager import PathManager

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default=RAW_DIR)
    parser.add_argument('--output_dir', type=str, default=ATTRIBUTES_DIR)

    parser.add_argument('--valid_p', type=float, default=0.2)
    parser.add_argument('--max_path_length', type=int, default=8)
    parser.add_argument('--max_path_width', type=int, default=2)
    # note: argparse's type=bool treats any non-empty string as True
    parser.add_argument('--use_method_name', type=bool, default=True)
    parser.add_argument('--use_nums', type=bool, default=True)
    parser.add_argument('--n_jobs', type=int, default=multiprocessing.cpu_count())
    parser.add_argument('--seed', type=int, default=239)
    args = parser.parse_args()

    np.random.seed(args.seed)

    data_dir = Path(args.data_dir)
    trains = __collect_asts(data_dir / 'python100k_train.json')
    evals = __collect_asts(data_dir / 'python50k_eval.json')

    train, valid = sklearn_model_selection.train_test_split(
        trains,
        test_size=args.valid_p,
    )
    test = evals

    output_dir = Path(args.output_dir)
    PathManager.mkdir(output_dir)
    for split_name, split in zip(('train', 'valid', 'test'), (train, valid, test)):
        output_file = output_dir / f'{split_name}.method_path'
        __collect_all_and_save(split, args, output_file)
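The split above boils down to a single scikit-learn call; a minimal reproduction with dummy data, assuming scikit-learn is installed:

import numpy as np
from sklearn import model_selection as sklearn_model_selection

np.random.seed(239)
trains = list(range(10))                      # stands in for the parsed training ASTs
train, valid = sklearn_model_selection.train_test_split(trains, test_size=0.2)
print(len(train), len(valid))                 # 8 2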
Example #7
 def make_dataset(vocab, aux_dict, input_prefix, output_prefix, lang, max_path_num, num_workers=1):
     if args['preprocess']['dataset_impl'] == "raw":
         raise NotImplementedError
     else:
         in_file = file_name(input_prefix, lang)
         out_file = dest_path(output_prefix, lang)
         PathManager.mkdir(os.path.dirname(out_file))
         make_binary_dataset(vocab, aux_dict, in_file, out_file, lang, max_path_num, num_workers)
Example #8
 def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
     if args['preprocess']['dataset_impl'] == "raw":
         in_file = file_name(input_prefix, lang)
         out_dir = args['preprocess']['destdir']
         PathManager.mkdir(out_dir)
         LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
         shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
     else:
         in_file = file_name(input_prefix, lang)
         out_file = dest_path(output_prefix, lang)
         PathManager.mkdir(os.path.dirname(out_file))
         make_binary_dataset(vocab, in_file, out_file, num_workers)
Example #9
    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            # TODO: parse the json file into a txt file, one traversal per line; parallelize this step if possible.
            """
            Because only one thread is allowed to write the output file, we use multi-processing
            to process the data, merge the workers' results into a block, and then dump that block.
            """
            def _func(line):
                line = py150_util.separate_dps(
                    json_io.json_loads(line.strip()),
                    args['preprocess']['n_ctx'])
                line = [
                    py150_util.get_dfs(ast) + [ext] for ast, ext in line
                    if len(ast) > 1
                ]
                # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
                return line

            with PPool() as thread_pool:
                with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                    file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                    def _write(result):
                        for res in itertools.chain(*result):
                            print(json_io.json_dumps(res), file=fout)

                    batch_data = []
                    for line in f:
                        batch_data.append(line)
                        if len(batch_data) >= MAX_BATCH_SIZE:
                            result = thread_pool.feed(_func,
                                                      batch_data,
                                                      one_params=True)
                            _write(result)
                            del batch_data
                            batch_data = []

                    if len(batch_data) > 0:
                        result = thread_pool.feed(_func,
                                                  batch_data,
                                                  one_params=True)
                        _write(result)
                        del batch_data
        else:
            if lang == 'code_types':
                in_file = file_name(input_prefix, 'ast')
            else:
                in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)
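The batch-and-flush write loop above, approximated with the standard concurrent.futures thread pool; PPool and its feed() method are ncc-specific, and the input lines and output path here are dummies:

import itertools
import json
from concurrent.futures import ThreadPoolExecutor

def _func(line):
    return [line.strip().upper()]

lines = [f"sample {i}" for i in range(7)]
MAX_BATCH_SIZE = 3
with ThreadPoolExecutor(max_workers=2) as pool, open("/tmp/demo.out", "w") as fout:
    batch_data = []
    for line in lines:
        batch_data.append(line)
        if len(batch_data) >= MAX_BATCH_SIZE:
            for res in itertools.chain(*pool.map(_func, batch_data)):
                print(json.dumps(res), file=fout)
            batch_data = []
    if batch_data:
        for res in itertools.chain(*pool.map(_func, batch_data)):
            print(json.dumps(res), file=fout)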
Example #10
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
Example #11
 def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
     if args['preprocess']['dataset_impl'] == "raw":
         raise NotImplementedError
     else:
         languages = [
             os.path.basename(d)
             for d in PathManager.ls(os.path.dirname(input_prefix))
         ]
         for l in languages:
             in_file = file_name(input_prefix, lang)
             in_file = str.replace(in_file, '*', l)
             out_file = dest_path(os.path.join(l, output_prefix), lang)
             PathManager.mkdir(os.path.dirname(out_file))
             make_binary_dataset(vocab, in_file, out_file, num_workers)
Example #12
    def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring = json_io.json_loads(line)
                print(json_io.json_dumps(docstring), file=writer)
                line = safe_readline(reader)
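The seek/read-until-offset pattern shared by these *_fn workers, sketched with plain file IO; safe_readline in ncc is assumed to additionally cope with seeks that land in the middle of a line:

def copy_chunk(src, dst, start=0, end=-1):
    # each worker handles roughly the byte range [start, end) of the input file
    with open(src, "r") as reader, open(dst, "w") as writer:
        reader.seek(start)
        line = reader.readline()
        while line:
            if end > 0 and reader.tell() > end:
                break
            writer.write(line)
            line = reader.readline()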
Example #13
def cast_code(raw_code_file, refined_code_file, dst_file):
    with file_io.open(raw_code_file, 'r') as raw_reader:
        raw_codes = {}
        for line in raw_reader:
            raw_code = line
            raw_code = raw_code[raw_code.find('def '):]
            func_name = raw_code[:raw_code.find('(')][4:].strip()
            raw_codes[func_name] = line.rstrip('\n')

    PathManager.mkdir(os.path.dirname(dst_file))
    with file_io.open(refined_code_file,
                      'r') as refined_reader, file_io.open(dst_file,
                                                           'w') as writer:
        for line in refined_reader:
            func_name = line[line.find('def '):].split()[1]
            raw_code = raw_codes[func_name]
            print(raw_code, file=writer)
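A toy check of the function-name extraction above:

line = "import os\ndef parse_args(argv):\n    return argv"
raw_code = line[line.find('def '):]
func_name = raw_code[:raw_code.find('(')][4:].strip()
print(func_name)  # parse_args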
Example #14
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')
    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
Example #15
def flatten_attrs(raw_file, flatten_dir, lang, mode, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        filename = filename[:str.rfind(filename, '.jsonl.gz')]
        _, _, idx = filename.split('_')
        return idx

    idx = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_dir = os.path.join(flatten_dir, lang, mode, attr)
        PathManager.mkdir(attr_dir)
        attr_file = os.path.join(attr_dir, '{}.jsonl'.format(idx))
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
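A quick check of the filename parsing in _get_file_info above; the file name is hypothetical but follows the <lang>_<mode>_<idx>.jsonl.gz shape the code expects:

import os

filename = os.path.split("/raw/python_train_12.jsonl.gz")[-1]
filename = filename[:str.rfind(filename, '.jsonl.gz')]
_, _, idx = filename.split('_')
print(idx)  # 12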
Example #16
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap",
                                               impl='mmap',
                                               vocab_size=len(vocab))
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor(
                    [dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
Example #17
)
from ncc.utils.file_ops.yaml_io import recursive_expanduser
from ncc.utils.file_ops import file_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    task = tasks.get_task('multilingual_denoising')
    base_dir = recursive_expanduser(
        '~/ncc_data/codexglue/code_to_text/multilingual_denoising/data-mmap')

    dict_file = os.path.join(base_dir, 'dict.jsonl')
    vocab = task.load_dictionary(dict_file)

    for mode in MODES:
        dst_file = os.path.join(base_dir, 'docstring', f"{mode}.docstring.spm")
        PathManager.mkdir(os.path.dirname(dst_file))
        # mmap
        ds = indexed_dataset.make_builder(f'{dst_file}.mmap',
                                          impl='mmap',
                                          vocab_size=len(vocab))
        for lang in LANGUAGES:
            src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
            ds.merge_file_(src_file)
        ds.finalize(f'{dst_file}.idx')
        # # raw
        # with file_io.open(ds, 'w') as writer:
        #     for lang in LANGUAGES:
        #         src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
        #         with open(src_file, 'r') as reader:
        #             shutil.copyfileobj(reader, writer)
Example #18
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [
                train_path(args['preprocess']['source_lang']),
                train_path(args['preprocess']['target_lang'])
            ]
            if not args['preprocess']['only_train']:
                filenames.extend( \
                    [valid_path(args['preprocess']['source_lang']), valid_path(args['preprocess']['target_lang'])])
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                # set max len for joint dictionaries
                nwords=max(args['preprocess']['nwordssrc'],
                           args['preprocess']['nwordstgt']),
            )
        tgt_dict = src_dict

    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --srcdict is not specified"

            filenames = PathManager.ls(
                train_path(args['preprocess']['source_lang']))
            if not args['preprocess']['only_train']:
                filenames.extend(
                    PathManager.ls(
                        valid_path(args['preprocess']['source_lang'])))
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['thresholdsrc'],
                nwords=args['preprocess']['nwordssrc'],
                padding_factor=args['preprocess']['padding_factor'],
            )
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess'][
                    'trainpref'], "--trainpref must be set if --tgtdict is not specified"
                filenames = PathManager.ls(
                    train_path(args['preprocess']['target_lang']))
                if not args['preprocess']['only_train']:
                    filenames.extend(
                        PathManager.ls(
                            valid_path(args['preprocess']['target_lang'])))
                tgt_dict = task.build_dictionary(
                    filenames,
                    tokenize_func=tokenization.dpu_sub_tokenizer,
                    workers=args['preprocess']['workers'],
                    threshold=args['preprocess']['thresholdtgt'],
                    nwords=args['preprocess']['nwordstgt'],
                    padding_factor=args['preprocess']['padding_factor'],
                )
        else:
            tgt_dict = None

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # counts tokens that get replaced by <unk>

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks
        # with multi-processing, workers 1..N-1 handle the 2nd through the last chunk
        # e.g. 1.txt with 10 workers: p0 covers bytes 0-99, p1 covers 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process handles the first chunk; without multi-processing it processes the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenization.dpu_sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab,
                     input_prefix,
                     output_prefix,
                     lang,
                     out_file=None,
                     num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            if out_file is None:
                out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
            # copy the shared dict into each language's directory
            out_dir = os.path.join(args['preprocess']['destdir'], l)
            PathManager.mkdir(out_dir)
            dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
            PathManager.copy(dict_path(lang), dst_dict)

            if args['preprocess']['trainpref']:
                out_file = os.path.join(out_dir, f"train.{lang}")
                make_dataset(vocab,
                             args['preprocess']['trainpref'].replace('*', l),
                             "train",
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['validpref']:
                out_file = os.path.join(out_dir, f"valid.{lang}")
                make_dataset(vocab,
                             args['preprocess']['validpref'].replace('*', l),
                             'valid',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])
            if args['preprocess']['testpref']:
                out_file = os.path.join(out_dir, f"test.{lang}")
                make_dataset(vocab,
                             args['preprocess']['testpref'].replace('*', l),
                             'test',
                             lang,
                             out_file=out_file,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
Example #19
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code, lang):
        code_tokens, dfg = extract_dataflow(code, parsers[lang], lang)
        code_tokens = vocab.subtokenize(code_tokens)

        ori2cur_pos = {}
        ori2cur_pos[-1] = (0, 0)
        for i in range(len(code_tokens)):
            ori2cur_pos[i] = (ori2cur_pos[i - 1][1], ori2cur_pos[i - 1][1] + len(code_tokens[i]))

        # truncating
        code_tokens = code_tokens[
                      :config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - 3 - min(len(dfg), config.MAX_DATA_FLOW_LEN)] \
            [:512 - 3]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        position_idx = [i + vocab.pad() + 1 for i in range(len(source_tokens))]
        dfg = dfg[:config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - len(source_tokens)]
        source_tokens += [x[0] for x in dfg]
        position_idx += [0 for _ in dfg]
        source_ids += [vocab.unk() for _ in dfg]
        padding_length = config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - len(source_ids)
        position_idx += [vocab.pad()] * padding_length
        source_ids += [vocab.pad()] * padding_length

        # reindex
        reverse_index = {}
        for idx, x in enumerate(dfg):
            reverse_index[x[1]] = idx
        for idx, x in enumerate(dfg):
            dfg[idx] = x[:-1] + ([reverse_index[i] for i in x[-1] if i in reverse_index],)
        dfg_to_dfg = [x[-1] for x in dfg]
        dfg_to_code = [ori2cur_pos[x[1]] for x in dfg]
        length = len([vocab.cls()])
        dfg_to_code = [(x[0] + length, x[1] + length) for x in dfg_to_code]
        return [source_ids, position_idx, dfg_to_code, dfg_to_dfg]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = [
                'code', 'src_tokens', 'src_positions', 'dfg2code', 'dfg2dfg',
            ]
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code, lang)
                for key, src in zip(keys, [src_code] + src_line):
                    data[key].append(src)

            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(dst_file, mode='wb', data=data)
Example #20
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    from dataset.codexglue.code_to_text import BPE_DIR

    source_dict_file = os.path.join(BPE_DIR, 'csn/csn.spm.vocab')
    target_dict_file = os.path.join(os.path.dirname(args['preprocess']['destdir']), 'dict.jsonl')
    with open(source_dict_file, 'r') as reader, open(target_dict_file, 'w') as writer:
        for line in reader:
            print(json_io.json_dumps([line.split('\t')[0], 100]), file=writer)
    src_dict = tgt_dict = task.load_dictionary(target_dict_file)

    # shared dicts for all languages
    src_dict.save(
        os.path.join(os.path.dirname(args['preprocess']['destdir']), f"{args['preprocess']['source_lang']}.jsonl")
    )
    tgt_dict.save(
        os.path.join(os.path.dirname(args['preprocess']['destdir']), f"{args['preprocess']['target_lang']}.jsonl")
    )

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # counts tokens that get replaced by <unk>

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks
        # with multi-processing, workers 1..N-1 handle the 2nd through the last chunk
        # e.g. 1.txt with 10 workers: p0 covers bytes 0-99, p1 covers 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                    callback=merge_result
                )
            pool.close()
        # the main process handles the first chunk; without multi-processing it processes the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer, offset=0, end=offsets[1], append_eos=False,
            )
        )
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                # attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            PathManager.mkdir(out_dir)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
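The valid/test output-prefix naming in make_all above, checked on a comma-separated list of hypothetical prefixes:

validpref = "data/valid_a,data/valid_b"       # hypothetical --validpref value
for k, pref in enumerate(validpref.split(",")):
    outprefix = "valid{}".format(k) if k > 0 else "valid"
    print(outprefix, '<-', pref)
# valid <- data/valid_a
# valid1 <- data/valid_b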
Example #21
            mpool.apply_async(func, (_src_filename, _tgt_filename, idx,
                                     offsets[idx], offsets[idx + 1]))
            for idx in range(num_workers)
        ]
        result = [res.get() for res in result]

    def _concate(_tgt_filename, num_workers, tgt_filename):
        src_filenames = [
            _tgt_filename + str(idx) for idx in range(num_workers)
        ]
        with file_io.open(tgt_filename, 'w') as writer:
            for _src_fl in src_filenames:
                with file_io.open(_src_fl, 'r') as reader:
                    shutil.copyfileobj(reader, writer)
                PathManager.rm(_src_fl)

    _concate(_tgt_filename, num_workers, tgt_filename)


if __name__ == '__main__':
    # old ast => new ast
    for file, mode in zip(['python100k_train.json', 'python50k_eval.json'],
                          MODES):
        file = os.path.join(RAW_DIR, file)
        PathManager.mkdir(ATTRIBUTES_DIR)
        tgt_file = os.path.join(ATTRIBUTES_DIR, f'{mode}.ast')
        process(src_filename=file,
                tgt_filename=tgt_file,
                num_workers=cpu_count(),
                func=ast_fn)
Example #22
    )
    parser.add_argument(
        "--raw_dataset_dir",
        "-r",
        default=RAW_DIR,
        type=str,
        help="raw dataset download directory",
    )
    parser.add_argument(
        "--attributes_dir",
        "-d",
        default=ATTRIBUTES_DIR,
        type=str,
        help="data directory of attributes directory",
    )
    args = parser.parse_args()
    # print(args)

    for lang, mode in itertools.product(args.languages, MODES):
        raw_file = os.path.join(args.raw_dataset_dir, f'{lang}.csv')
        dst_dir = os.path.join(args.attributes_dir, lang)
        PathManager.mkdir(dst_dir)
        flatten(raw_file, dst_dir, mode)

        code_tokenization(src_file=os.path.join(dst_dir, f'{mode}.src'))

    # xfg -> inst2vec
    xfg(src_dir=args.raw_dataset_dir,
        languages=args.languages,
        dst_dir=args.attributes_dir)
Example #23
def main(args):
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    # 1. ***************build vocabulary***************
    task = tasks.get_task(args['preprocess']['task'])

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    def string2dfs(line):
        line = json_io.json_loads(line)
        asts = py150_util.separate_dps(line, args['preprocess']['max_len'])
        ast_dfs = [[py150_util.get_dfs(ast), ext] for ast, ext in asts
                   if len(ast) > 1]
        return ast_dfs

    def string2type_dfs(line):
        type_dfs = type_tokenize_func(line)
        type_dfs = py150_util.separate_dps(type_dfs,
                                           args['preprocess']['max_len'])
        type_dfs = [[dfs, ext] for dfs, ext in type_dfs if len(dfs) > 1]
        return type_dfs

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    target = not args['preprocess']['only_source']

    if not args['preprocess']['srcdict'] and os.path.exists(
            dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if target and not args['preprocess']['tgtdict'] and os.path.exists(
            dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info(
            'Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess'][
            'trainpref'], "--trainpref must be set if --srcdict is not specified"

        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenize_func,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )
    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess'][
                'trainpref'], "--trainpref must be set if --tgtdict is not specified"
            # code_types are from ast
            filenames = [train_path(args['preprocess']['source_lang'])]
            if not args['preprocess']['only_train']:
                filenames.append(valid_path(args['preprocess']['source_lang']))
            tgt_dict = task.build_dictionary(
                filenames,
                tokenize_func=type_tokenize_func,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                nwords=args['preprocess']['nwordstgt'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
            )
    else:
        tgt_dict = None

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, lang, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                                 callback=merge_result)
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        tokenize = string2dfs if lang == 'ast' else string2type_dfs
        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(
                    indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            # TODO: parse the json file into a txt file, one traversal per line; parallelize this step if possible.
            """
            Because only one thread is allowed to write the output file, we use multi-processing
            to process the data, merge the workers' results into a block, and then dump that block.
            """
            def _func(line):
                line = py150_util.separate_dps(
                    json_io.json_loads(line.strip()),
                    args['preprocess']['n_ctx'])
                line = [
                    py150_util.get_dfs(ast) + [ext] for ast, ext in line
                    if len(ast) > 1
                ]
                # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
                return line

            with PPool() as thread_pool:
                with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                    file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                    def _write(result):
                        for res in itertools.chain(*result):
                            print(json_io.json_dumps(res), file=fout)

                    batch_data = []
                    for line in f:
                        batch_data.append(line)
                        if len(batch_data) >= MAX_BATCH_SIZE:
                            result = thread_pool.feed(_func,
                                                      batch_data,
                                                      one_params=True)
                            _write(result)
                            del batch_data
                            batch_data = []

                    if len(batch_data) > 0:
                        result = thread_pool.feed(_func,
                                                  batch_data,
                                                  one_params=True)
                        _write(result)
                        del batch_data
        else:
            if lang == 'code_types':
                in_file = file_name(input_prefix, 'ast')
            else:
                in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
Example #24
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        src_file,
                        prefix,
                        vocab,
                        token_dict,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]

        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
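A plain-Python sketch of what the offset splitting (find_offsets / Binarizer.find_offsets) is assumed to do: cut the file into roughly equal byte ranges aligned to line boundaries so each worker can seek straight to its chunk; the trailing entry stays 0, which the readers above treat as "read to end of file" via their end > 0 checks:

import os

def find_offsets_sketch(path, num_chunks):
    size = os.path.getsize(path)
    offsets = [0] * (num_chunks + 1)
    with open(path, "rb") as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()              # advance to the next full line
            offsets[i] = f.tell()
    return offsets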
Example #25
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def tokenization(tokens):
        for idx, tok in enumerate(tokens):
            if len(tok) != 0:
                tokens[idx] = vocab.encode(tok, out_type=str)
        return tokens

    def ast_to_graph(ast):
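        # Flatten a DFS-ordered AST into parallel lists:
        #   nodes     - node type per position
        #   tokens    - sub-tokenized leaf value ([] for internal nodes)
        #   adjacence - undirected parent<->child adjacency lists
        #   depth     - distance from the root (root depth = 1, 0 reserved for padding)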
        nodes, tokens, adjacence = [], [], [[] for _ in range(len(ast))]
        for idx, node in enumerate(ast):
            nodes.append(node['type'])
            if 'children' in node:
                tokens.append([])
                for child in node['children']:
                    adjacence[idx].append(child)
                    adjacence[child].append(idx)
            elif 'value' in node:
                tokens.append(node['value'])
            else:
                raise NotImplementedError

        tokens = tokenization(tokens)

        depth = {0: 1}  # root node gets depth 1; depth 0 is reserved for padding
        for idx, node in enumerate(ast[1:], start=1):
            depth[idx] = depth[node['parent']] + 1
        depth = list(depth.values())

        assert len(nodes) == len(tokens) == len(adjacence) == len(depth)
        return nodes, tokens, adjacence, depth

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, int(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    def save_node_dict():
        src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/node.jsonl")
        dict = Dictionary.load(src_file)
        tgt_file = os.path.join(args['preprocess']['destdir'], 'node.jsonl')
        PathManager.mkdir(os.path.dirname(tgt_file))
        dict.save(tgt_file)
        return dict

    node_dict = save_node_dict()

    def save_lang_dict():
        src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
        dict = Dictionary.load(src_file)
        tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
        PathManager.mkdir(os.path.dirname(tgt_file))
        dict.save(tgt_file)
        return dict

    lang_dict = save_lang_dict()

    # 2. ***************build dataset********************
    # dump each split into mmap/bin binary files
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        src_file = f"{args['preprocess'][f'{mode}pref']}.ast"

        node_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.node")
        PathManager.mkdir(os.path.dirname(node_file))
        node_dataset = indexed_dataset.make_builder(f"{node_file}.mmap", impl='mmap', vocab_size=len(node_dict))

        depth_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.depth")
        depth_dataset = indexed_dataset.make_builder(f"{depth_file}.mmap", impl='mmap')

        code_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        code_dataset = indexed_dataset.make_builder(f"{code_file}.bin", impl='bin', dtype=str)

        adjacence_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.adjacence")
        adjacence_dataset = indexed_dataset.make_builder(f"{adjacence_file}.bin", impl='bin')

        code_tokens_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        code_tokens_dataset = indexed_dataset.make_builder(f"{code_tokens_file}.bin", impl='bin')

        with file_io.open(src_file, 'r') as reader:
            for idx, line in enumerate(reader):
                line = json_io.json_loads(line)
                ast = bfs_to_dfs(line)
                nodes, tokens, adjacence, depth = ast_to_graph(ast)
                # save node into mmap dataset
                nodes = torch.IntTensor([node_dict.index(tok) for tok in nodes])
                node_dataset.add_item(nodes)
                # save depth into mmap dataset
                depth = torch.IntTensor(depth)
                depth_dataset.add_item(depth)
                # code
                code = ''.join(itertools.chain(*tokens)).replace(constants.SPM_SPACE, ' ').strip()
                code_dataset.add_item(code)
                # tokens
                tokens = [[token_dict.index(tok) for tok in toks] if len(toks) > 0 else [] for toks in tokens]
                code_tokens_dataset.add_item(tokens)
                # adjacence
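                # sanity check: with DFS numbering each adjacency list should already be
                # sorted (parent index first, then children in ascending order)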
                for adj in adjacence:
                    assert adj == sorted(adj)
                adjacence_dataset.add_item(adjacence)

        node_dataset.finalize(f"{node_file}.idx")
        depth_dataset.finalize(f"{depth_file}.idx")
        code_dataset.finalize(f"{code_file}.idx")
        code_tokens_dataset.finalize(f"{code_tokens_file}.idx")
        adjacence_dataset.finalize(f"{adjacence_file}.idx")

        # proj indices
        with file_io.open(f"{args['preprocess'][f'{mode}pref']}.proj", 'r') as reader:
            projs = [json_io.json_loads(line) for line in reader]
        proj_indices = Counter(projs)
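        # Counter maps each project id to its number of samples (insertion-ordered);
        # only the counts are kept and written as a single length vector per split.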
        proj_indices = list(proj_indices.values())
        proj_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.proj")
        proj_dataset = indexed_dataset.make_builder(f"{proj_file}.seq", impl='seq')
        proj_dataset.add_item(torch.IntTensor(proj_indices))
        proj_dataset.finalize(f"{proj_file}.idx")
Ejemplo n.º 26
0
        "--dataset_dir", "-d", default=RAW_DIR, type=str, help="raw dataset download directory",
    )
    parser.add_argument(
        "--flatten_dir", "-f", default=ATTRIBUTES_DIR, type=str,
        help="data directory of flatten attribute",
    )
    parser.add_argument(
        "--attrs", "-a",
        default=['code', 'code_tokens', 'code_types', 'ast'],
        type=str, nargs='+',
    )
    parser.add_argument(
        "--cores", "-c", default=cpu_count(), type=int, help="cpu cores for flatten raw data attributes",
    )
    args = parser.parse_args()
    # print(args)

    for mode in MODES:
        src_files = [os.path.join(args.dataset_dir, f"{mode}.{lang}") for lang in args.languages]
        src_readers = [file_io.open(file, 'r') for file in src_files]

        for lang in args.languages:
            PathManager.mkdir(os.path.join(args.flatten_dir, lang))
        dst_files = [os.path.join(args.flatten_dir, lang, f"{mode}.code") for lang in args.languages]
        dst_writers = {lang: file_io.open(file, 'w') for lang, file in zip(args.languages, dst_files)}
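        # read the per-language files in lock-step so that line i stays aligned across languages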

        for lines in zip(*src_readers):
            lines = list(map(lambda line: SPACE_SPLITTER.sub(" ", line.strip()), lines))
            for lang, line in zip(args.languages, lines):
                print(json_io.json_dumps(line.strip()), file=dst_writers[lang])
Ejemplo n.º 27
0
def main(args):
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    # 1. ***************build vocabulary***************
    task = tasks.get_task(args['preprocess']['task'])

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['subtokendict']:
        subtoken_dict = task.load_dictionary(args['preprocess']['subtokendict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --subtokendict is not specified"

        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        subtoken_dict = task.build_dictionary(
            filenames,
            tokenize_func=subtoken_tokenize,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssubtoken'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    if args['preprocess']['typedict']:
        type_dict = task.load_dictionary(args['preprocess']['typedict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --typedict is not specified"

        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        type_dict = task.build_dictionary(
            filenames,
            tokenize_func=type_tokenize,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordstype'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None, eos=None,
        )

    if args['preprocess']['docstringdict']:
        docstring_dict = task.load_dictionary(args['preprocess']['docstringdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --docstringdict is not specified"

        filenames = [train_path(args['preprocess']['target_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['target_lang']))
        docstring_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordsdocstring'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    subtoken_dict.save(dict_path('subtoken'))
    type_dict.save(dict_path('type'))
    docstring_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
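        # Binarize `input_file` into `output_file`.mmap/.idx, optionally sharded across workers.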
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        aux_dict,
                        prefix,
                        lang,
                        tokenize,
                        max_path_num,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result
                )
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        if lang == 'path':
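            # 'path' inputs keep a parallel "<output>.sz" dataset of per-sample sizes,
            # filled via consumer(tensor, size) below.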
            sz_ds_file = '{}.sz.mmap'.format(output_file)
            sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                                 vocab_size=len(vocab))
        else:
            sz_ds = None

        def consumer(tensor, size=None):
            ds.add_item(tensor)
            if size is not None:
                sz_ds.add_item(size)

        if sz_ds is None:
            merge_result(
                Binarizer.binarize(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False,
                    max_path_num=max_path_num,
                )
            )
        else:
            merge_result(
                PathSummarizationBinarizer.path_binarizer(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False, type_dict=aux_dict,
                    max_path_num=max_path_num,
                )
            )
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                if sz_ds is not None:
                    sz_ds.merge_file_(f"{temp_file_path}.sz")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                if sz_ds is not None:
                    os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                    os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
        ds.finalize('{}.idx'.format(output_file))
        if sz_ds is not None:
            sz_ds.finalize('{}.sz.idx'.format(output_file))
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )

    def make_dataset(vocab, aux_dict, input_prefix, output_prefix, lang, max_path_num, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, aux_dict, in_file, out_file, lang, max_path_num, num_workers)

    def make_all(lang, vocab, aux_dict=None):
        if args['preprocess']['trainpref']:
            max_path_num = args['preprocess']['train_path_num']
            make_dataset(vocab, aux_dict, args['preprocess']['trainpref'], "train", lang, max_path_num,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            max_path_num = args['preprocess']['eval_path_num']
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, aux_dict, validpref, outprefix, lang, max_path_num,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            max_path_num = args['preprocess']['eval_path_num']
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, aux_dict, testpref, outprefix, lang, max_path_num,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], subtoken_dict, type_dict)
    make_all(args['preprocess']['target_lang'], docstring_dict)
Ejemplo n.º 28
0
# -*- coding: utf-8 -*-

import argparse
import os

import gdown

from ncc import (
    __TREE_SITTER_LIBS_DIR__,
    LOGGER,
)
from ncc.utils.path_manager import PathManager

PathManager.mkdir(__TREE_SITTER_LIBS_DIR__)

TREE_SITTER_SO_FILE_ARCHIVE_MAP = {
    "c": "https://drive.google.com/uc?id=1Ce0Wp_IYw4a69dMAd4RbaOqRK-DD592G",
    "cpp": "https://drive.google.com/uc?id=1Ip-_lW95I7DU_wj96CR-j31VehLtJLz2",
    "csharp":
    "https://drive.google.com/uc?id=1fCnNd3WiU1aVqgYZ9ygydTgedHq09pzw",
    "go": "https://drive.google.com/uc?id=18nIHKBahzkK4Xgm5mHRCOY2npiTC2NLd",
    "java": "https://drive.google.com/uc?id=1lP-H7D0IpqijmaseigcyqkKBzxWdwmYH",
    "javascript":
    "https://drive.google.com/uc?id=1OxM0VFhDi2P8WsOuL0pKzZ8MD-CErzqP",
    "julia":
    "https://drive.google.com/uc?id=13_GehtPCUgD1Df6p1-CF0vcEfzMtBTEj",
    "nix": "https://drive.google.com/uc?id=13W5w4OgcmTEakOSOVGvtqmm97_Px6O5z",
    "php": "https://drive.google.com/uc?id=1lGzi98rQn4qRnidKpn0jchL8QyLS6gUT",
    "python":
    "https://drive.google.com/uc?id=1jhadgdOng1I95cwtmNJz2fqW-SUvhpch",
    "ruby": "https://drive.google.com/uc?id=1geDqNll4ewd8zqmvUPg9uNrCMZ1iHbQz",
Ejemplo n.º 29
0
from ncc.utils.file_ops import file_io
from ncc.utils.file_ops import json_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    from ncc.data.dictionary import TransformersDictionary

    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    for topk in [1, 3, 5]:

        attributes = ['code', 'ast', 'dfs']
        dst_dir = os.path.join(DATASET_DIR, 'codedisen', 'data')
        for lang in LANGUAGES:
            PathManager.mkdir(os.path.join(dst_dir, f"top{topk}", lang))
        for mode in MODES:
            readers = [
                file_io.open(
                    os.path.join(ATTRIBUTES_DIR, f"top{topk}", lang,
                                 f"{mode}.{attr}"), 'r') for lang in LANGUAGES
                for attr in attributes
            ]
            writers = [
                file_io.open(
                    os.path.join(dst_dir, f"top{topk}", lang,
                                 f"{mode}.{attr}"), 'w') for lang in LANGUAGES
                for attr in attributes
            ]
            writers += [
                file_io.open(
Ejemplo n.º 30
0
    #     tests: 1 X 1

    for mode in MODES:
        code_num = 0
        TOPK = 1 if mode != "train" else args.topk
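        # keep args.topk candidate solutions per problem for training, but only the
        # first candidate for valid/test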
        file = os.path.join(args.dataset_dir, f"{mode}.jsonl")

        id_file = os.path.join(args.flatten_dir, f"top{args.topk}",
                               f"{mode}.id")
        print(id_file)

        jv_code = os.path.join(args.flatten_dir, f"top{args.topk}", 'java',
                               f"{mode}.code")
        jv_raw_code = os.path.join(args.flatten_dir, f"top{args.topk}", 'java',
                                   f"{mode}.raw_code")
        PathManager.mkdir(os.path.dirname(jv_code))

        py_code = os.path.join(args.flatten_dir, f"top{args.topk}", 'python',
                               f"{mode}.code")
        py_raw_code = os.path.join(args.flatten_dir, f"top{args.topk}",
                                   'python', f"{mode}.raw_code")
        PathManager.mkdir(os.path.dirname(py_code))

        with file_io.open(file, 'r') as reader, file_io.open(id_file, 'w') as id_writer, \
            file_io.open(jv_code, 'w') as jv_code_writer, file_io.open(jv_raw_code, 'w') as jv_raw_writer, \
            file_io.open(py_code, 'w') as py_code_writer, file_io.open(py_raw_code, 'w') as py_raw_writer:
            for line in reader:
                line = json_io.json_loads(line)
                id, jv_codes, py_codes = line['id'], line['java'][:TOPK], line['python'][:TOPK]