Example #1
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask, source_size
                src_line = parse_source_input(src_code)
                # target_ids, target_mask, target_size
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
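
Both helpers above follow the same fixed-length encoding pattern: truncate so the CLS/SEP specials fit, wrap, convert to ids, then right-pad the ids and attention mask up to the configured maximum. Below is a minimal, self-contained sketch of that pattern; CLS_ID, SEP_ID, PAD_ID and MAX_LEN are made-up stand-ins for the real vocab and config values.

# Sketch of the truncate/wrap/pad logic shared by parse_source_input and parse_target_input.
CLS_ID, SEP_ID, PAD_ID, MAX_LEN = 0, 2, 1, 8

def encode_fixed_length(token_ids):
    token_ids = token_ids[:MAX_LEN - 2]   # leave room for CLS and SEP
    ids = [CLS_ID] + token_ids + [SEP_ID]
    size = len(ids)
    mask = [1] * size                     # 1 = real token, 0 = padding
    padding = MAX_LEN - size
    ids += [PAD_ID] * padding
    mask += [0] * padding
    return ids, mask, size

ids, mask, size = encode_fixed_length([11, 12, 13])
assert len(ids) == len(mask) == MAX_LEN and size == 5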
Example #2
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    # def save_token_dict():
    #     src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
    #     tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
    #     # Dictionary.text_to_jsonl(src_file, tgt_file)
    #     vocab = Dictionary()
    #     with file_io.open(src_file, 'r') as reader:
    #         for line in reader:
    #             token, num = line.strip().split()
    #             vocab.add_symbol(token, eval(num))
    #     vocab.save(tgt_file)
    #     return vocab
    #
    # token_dict = save_token_dict()
    token_dict = vocab

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        src_file,
                        prefix,
                        vocab,
                        token_dict,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]

        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                # code_tokens = vocab.encode(line, out_type=str)
                # code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                code_tokens = torch.IntTensor(vocab.string_to_indices(line))
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
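
The multi-worker path above splits the input file by byte offsets (`find_offsets`), lets each worker binarize its own slice into temporary `dst_file{worker_id}` files, and then merges the partial datasets with `merge_file_`. A rough, self-contained sketch of that chunk-by-offset idea follows; it is a simplified stand-in for illustration, not the ncc implementation.

import os

def simple_find_offsets(path, num_chunks):
    # Split a file into roughly equal byte ranges, snapped to line boundaries.
    size = os.path.getsize(path)
    offsets = [0]
    with open(path, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()              # advance to the start of the next full line
            offsets.append(f.tell())
    offsets.append(size)
    return offsets

def read_chunk(path, start, end):
    # Mirror of the reader loop above: consume only the lines inside [start, end).
    lines = []
    with open(path, 'r') as reader:
        reader.seek(start)
        line = reader.readline()
        while line:
            if end > 0 and reader.tell() > end:
                break
            lines.append(line.rstrip('\n'))
            line = reader.readline()
    return lines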
Example #3
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code, lang):
        code_tokens, dfg = extract_dataflow(code, parsers[lang], lang)
        code_tokens = vocab.subtokenize(code_tokens)

        ori2cur_pos = {}
        ori2cur_pos[-1] = (0, 0)
        for i in range(len(code_tokens)):
            ori2cur_pos[i] = (ori2cur_pos[i - 1][1], ori2cur_pos[i - 1][1] + len(code_tokens[i]))

        # truncating
        max_code_tokens = config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - 3 - min(len(dfg), config.MAX_DATA_FLOW_LEN)
        code_tokens = code_tokens[:max_code_tokens][:512 - 3]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        position_idx = [i + vocab.pad() + 1 for i in range(len(source_tokens))]
        dfg = dfg[:config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - len(source_tokens)]
        source_tokens += [x[0] for x in dfg]
        position_idx += [0 for _ in dfg]
        source_ids += [vocab.unk() for _ in dfg]
        padding_length = config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - len(source_ids)
        position_idx += [vocab.pad()] * padding_length
        source_ids += [vocab.pad()] * padding_length

        # reindex
        reverse_index = {}
        for idx, x in enumerate(dfg):
            reverse_index[x[1]] = idx
        for idx, x in enumerate(dfg):
            dfg[idx] = x[:-1] + ([reverse_index[i] for i in x[-1] if i in reverse_index],)
        dfg_to_dfg = [x[-1] for x in dfg]
        dfg_to_code = [ori2cur_pos[x[1]] for x in dfg]
        length = len([vocab.cls()])
        dfg_to_code = [(x[0] + length, x[1] + length) for x in dfg_to_code]
        return [source_ids, position_idx, dfg_to_code, dfg_to_dfg]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = [
                'code', 'src_tokens', 'src_positions', 'dfg2code', 'dfg2dfg',
            ]
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # source_ids, position_idx, dfg_to_code, dfg_to_dfg
                src_line = parse_source_input(src_code, lang)
                for key, src in zip(keys, [src_code] + src_line):
                    data[key].append(src)

            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(dst_file, mode='wb', data=data)
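
The reindex step inside `parse_source_input` is easy to miss: `reverse_index` maps an original token index (`x[1]`) to that entry's row in the truncated `dfg` list, so the predecessor lists in `dfg_to_dfg` end up pointing at dfg rows rather than source tokens, and edges into truncated-away entries are silently dropped. A toy illustration with hand-made values (not real data-flow extractor output):

# Toy data-flow entries: (token, original_token_index, predecessor_token_indices).
# Tokens 3 and 4 were truncated away, so edges that point at them disappear.
dfg = [('a', 0, []), ('b', 2, [0]), ('c', 5, [0, 2, 4])]

reverse_index = {entry[1]: row for row, entry in enumerate(dfg)}   # token idx -> dfg row
dfg = [entry[:-1] + ([reverse_index[i] for i in entry[-1] if i in reverse_index],)
       for entry in dfg]

dfg_to_dfg = [entry[-1] for entry in dfg]
assert dfg_to_dfg == [[], [0], [0, 1]]   # the edge into token 4 was dropped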
Example #4
def __init__(self, args):
    from ncc.data.dictionary import TransformersDictionary
    self.sp = TransformersDictionary.from_pretrained(
        args['sentencepiece_vocab'])
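
For context, a self-contained sketch of how such a wrapper is typically used downstream; the class name, the value passed for `sentencepiece_vocab`, and the sample code string are illustrative only.

from ncc.data.dictionary import TransformersDictionary

class SubtokenEncoder:
    # Mirrors the __init__ above: the args dict carries the pretrained tokenizer id.
    def __init__(self, args):
        self.sp = TransformersDictionary.from_pretrained(args['sentencepiece_vocab'])

encoder = SubtokenEncoder({'sentencepiece_vocab': 'microsoft/codebert-base'})
print(encoder.sp.tokenize("def add(a, b): return a + b"))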
Example #5
import itertools
import os

from dataset.avatar.translation import (
    LANGUAGES,
    DATASET_DIR,
    ATTRIBUTES_DIR,
    MODES,
)
from ncc.utils.file_ops import file_io
from ncc.utils.file_ops import json_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    from ncc.data.dictionary import TransformersDictionary

    vocab = TransformersDictionary.from_pretrained(
        'microsoft/graphcodebert-base')

    for topk in [1, 3, 5]:

        attributes = ['code', 'ast', 'dfs']
        dst_dir = os.path.join(DATASET_DIR, 'codedisen', 'data')
        for lang in LANGUAGES:
            PathManager.mkdir(os.path.join(dst_dir, f"top{topk}", lang))
        for mode in MODES:
            readers = [
                file_io.open(
                    os.path.join(ATTRIBUTES_DIR, f"top{topk}", lang,
                                 f"{mode}.{attr}"), 'r') for lang in LANGUAGES
                for attr in attributes
            ]
            writers = [
Example #6
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained(
        'microsoft/graphcodebert-base')

    file = os.path.join(args['preprocess']['destdir'], 'dfs.jsonl')
    node_dict = Dictionary.load(file)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess'][
        'tgt_lang']

    # code tokens => code tokens
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code_tokens"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code_tokens")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_tokens,
                    (args, src_file, vocab, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.subtokenize(line)
                code_tokens = torch.IntTensor(
                    vocab.tokens_to_indices(code_tokens))
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the worker processes' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))

    # code => code
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")

        ds_file = '{}.bin'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="bin",
                                          vocab_size=len(vocab))
        with open(src_file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                ds.add_item(line)
        ds.finalize('{}.idx'.format(dst_file))

    # dfs => dfs
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.dfs"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.dfs")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_dfs,
                    (args, src_file, node_dict, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
                ds.add_item(dfs)
                line = reader.readline()

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge the worker processes' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))
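
The worker entry points `binarize_tokens` and `binarize_dfs` are referenced above but not shown. In this kind of pipeline each worker typically replays the same seek/readline/encode loop over its own byte range and writes to its `prefix` files, which the parent process then merges via `merge_file_`. A hedged sketch of such a worker, assuming it mirrors the main-process loop and the .mmap/.idx naming used above (not the actual ncc implementation):

def binarize_tokens(args, filename, vocab, prefix, start, end):
    # Illustrative worker: encode the lines in [start, end) into a temporary dataset.
    ds = indexed_dataset.make_builder(f"{prefix}.mmap", impl="mmap", vocab_size=len(vocab))
    with file_io.open(filename, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            tokens = vocab.subtokenize(json_io.json_loads(line))
            ds.add_item(torch.IntTensor(vocab.tokens_to_indices(tokens)))
            line = reader.readline()
    ds.finalize(f"{prefix}.idx")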