Example 1
def binarize(args: Dict,
             filename: str,
             dict: Dictionary,
             in_file: str,
             offset: int,
             end: int,
             append_eos: bool = False):
    """binarize function for multi-processing"""
    ds_file = f'{in_file}.mmap'
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))
    ext_ds = indexed_dataset.make_builder(f"{in_file}.ext", impl='seq')

    def consumer(data, start_idx):
        ds.add_item(data)
        ext_ds.add_item(start_idx)

    def seperate_tokenize(line):
        line = json_io.json_loads(line)
        tokens = separate_list(line, args['preprocess']['max_len'])
        return tokens

    res = Binarizer.binarize_seperate(filename,
                                      dict,
                                      consumer,
                                      tokenize=seperate_tokenize,
                                      append_eos=append_eos,
                                      offset=offset,
                                      end=end)
    ds.finalize('{}.idx'.format(in_file))
    ext_ds.finalize()
    return res
Example 2
def binarize(args, filename: str, vocab, aux_dict, in_file: str, lang, tokenize, max_path_num: int,
             offset: int, end: int, append_eos: bool = False):
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    if lang == 'path':
        sz_ds_file = '{}.sz.mmap'.format(in_file)
        sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                             vocab_size=len(vocab))
    else:
        sz_ds = None

    def consumer(tensor, size=None):
        ds.add_item(tensor)
        if size is not None:
            sz_ds.add_item(size)

    if sz_ds is None:
        res = Binarizer.binarize(filename, vocab, consumer, tokenize=tokenize,
                                 append_eos=append_eos, offset=offset, end=end, )
        ds.finalize('{}.idx'.format(in_file))
    else:
        res = PathSummarizationBinarizer.path_binarizer(filename, vocab, consumer, tokenize=tokenize,
                                                        append_eos=append_eos, offset=offset, end=end,
                                                        type_dict=aux_dict, max_path_num=max_path_num, )
        ds.finalize('{}.idx'.format(in_file))
        sz_ds.finalize('{}.sz.idx'.format(in_file))
    return res
Example 3
def binarize(args: Dict,
             filename: str,
             dict: Dictionary,
             in_file: str,
             offset: int,
             end: int,
             append_eos: bool = False):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename,
                             dict,
                             consumer,
                             tokenize=tokenizers.sub_tokenizer,
                             append_eos=append_eos,
                             offset=offset,
                             end=end)
    ds.finalize('{}.idx'.format(in_file))
    return res
Example 4
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count tokens that are not in the dictionary (replaced by the unknown symbol)

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into byte-range chunks; with multi-processing, workers
        # handle the 2nd..Nth chunks while the main process handles the 1st
        # e.g. 1.txt with 10 workers: p0 reads bytes 0-99, p1 reads 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process (p0) handles the first chunk; without multi-processing it handles the whole file
        # p0 -> [0, end)
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize_bpe(input_file,
                                   vocab,
                                   lambda t: ds.add_item(t),
                                   offset=0,
                                   end=offsets[1]))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, BPE no replaced token".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
            ))
Example 5
def binarize(args,
             filename: str,
             dict,
             in_file: str,
             lang,
             offset: int,
             end: int,
             append_eos: bool = False):
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))
    ext_ds = indexed_dataset.make_builder(f'{in_file}.ext', impl='seq')

    def consumer(tensor, start_idx):
        ds.add_item(tensor)
        ext_ds.add_item(start_idx)

    def string2dfs(line):
        line = json_io.json_loads(line)
        asts = py150_util.separate_dps(line, args['preprocess']['max_len'])
        ast_dfs = [[py150_util.get_dfs(ast), ext] for ast, ext in asts
                   if len(ast) > 1]
        return ast_dfs

    def string2type_dfs(line):
        type_dfs = type_tokenize_func(line)
        type_dfs = py150_util.separate_dps(type_dfs,
                                           args['preprocess']['max_len'])
        type_dfs = [[dfs, ext] for dfs, ext in type_dfs if len(dfs) > 1]
        return type_dfs

    tokenize = string2dfs if lang == 'ast' else string2type_dfs
    res = Binarizer.binarize_seperate(filename,
                                      dict,
                                      consumer,
                                      tokenize=tokenize,
                                      append_eos=append_eos,
                                      offset=offset,
                                      end=end)
    ds.finalize('{}.idx'.format(in_file))
    ext_ds.finalize()
    return res
Example 6
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap",
                                               impl='mmap',
                                               vocab_size=len(vocab))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor(
                    [dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
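A quick sanity check for the token dataset written in Example 6 is to read it back. The snippet below is only a sketch: it assumes ncc's indexed_dataset module mirrors fairseq's and exposes make_dataset(prefix, impl=...), and it reuses the dst_file prefix from above.

# Hedged read-back sketch (assumption: make_dataset mirrors fairseq's API).
tokens_ds = indexed_dataset.make_dataset(f"{dst_file}_tokens", impl='mmap')
print(len(tokens_ds))       # number of encoded code snippets
print(tokens_ds[0][:10])    # first ten dictionary indices of the first snippet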
Example 7
def binarize(args, in_file: str, out_file: str, vocab, token_dict, offset: int, end: int):
    ds = indexed_dataset.make_builder(f"{out_file}.mmap", impl='mmap', vocab_size=len(vocab))
    with file_io.open(in_file, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            code_tokens = vocab.encode(line, out_type=str)
            code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
            ds.add_item(code_tokens)
            line = reader.readline()
    ds.finalize(f'{out_file}.idx')
Example 8
def binarize(args: Dict, filename: str, dict: Dictionary, out_file_prefix: str,
             attr: str, offset: int, end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(out_file_prefix)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize_bpe(filename,
                                 dict,
                                 consumer,
                                 offset=offset,
                                 end=end)
    ds.finalize('{}.idx'.format(out_file_prefix))
    return res
Example 9
def binarize(args, filename, dict, in_file, offset, end, append_eos=False):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))

    def consumer(data, _):
        ds.add_item(data)

    res = Binarizer.binarize_seperate(filename,
                                      dict,
                                      consumer,
                                      tokenize=string2tokens,
                                      append_eos=append_eos,
                                      offset=offset,
                                      end=end)
    ds.finalize('{}.idx'.format(in_file))
    return res
Example 10
def binarize_dfs(args, filename: str, dict, in_file: str, offset: int,
                 end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file,
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(dict))

    with file_io.open(filename, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            dfs = torch.IntTensor([dict.index(tok) for tok in line])
            ds.add_item(dfs)
            line = reader.readline()
    ds.finalize('{}.idx'.format(in_file))
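The offset/end pairs consumed by these workers come from splitting the input file into byte ranges that begin on line boundaries. Below is a simplified, stdlib-only stand-in for the find_offsets helpers used throughout this listing (an assumption based on fairseq's Binarizer.find_offsets; the real ncc helper may differ in detail).

import os

def find_offsets_sketch(filename, num_chunks):
    # split `filename` into `num_chunks` byte ranges aligned to line boundaries
    with open(filename, 'r', encoding='utf-8') as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0 for _ in range(num_chunks + 1)]
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            f.readline()  # skip the partial line so the next chunk starts on a full line
            offsets[i] = f.tell()
        # offsets[-1] stays 0: a worker treats end == 0 as "read to end of file",
        # which is why the loops above check `end > 0` before breaking.
        return offsets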
Example 11
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count tokens that are not in the dictionary (replaced by the unknown symbol)

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into byte-range chunks; with multi-processing, workers
        # handle the 2nd..Nth chunks while the main process handles the 1st
        # e.g. 1.txt with 10 workers: p0 reads bytes 0-99, p1 reads 100-199, ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        # if num_workers > 1:
        #     # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
        #     pool = Pool(processes=num_workers - 1)
        #     for worker_id in range(1, num_workers):
        #         prefix = "{}{}".format(output_file, worker_id)
        #         pool.apply_async(
        #             binarize,
        #             (
        #                 args,
        #                 input_file,
        #                 vocab,
        #                 prefix,
        #                 offsets[worker_id],
        #                 offsets[worker_id + 1]
        #             ),
        #             callback=merge_result
        #         )
        #     pool.close()
        # the main process (p0) handles the first chunk; without multi-processing it handles the whole file
        # p0 -> [0, end)
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        if 'code_tokens_wo_func' in os.path.basename(output_file):
            bin_out = Binarizer.binarize_wo_func(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.string_sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'code_tokens' in os.path.basename(input_file):
            bin_out = Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'docstring_tokens' in os.path.basename(input_file):
            bin_out = Binarizer.binarize_bpe(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.lower_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        elif 'func_name' in os.path.basename(input_file):
            bin_out = Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizers.func_name_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        else:
            raise NotImplementedError(os.path.basename(input_file))

        merge_result(bin_out)
        if pool is not None:  # the worker dispatch above is commented out, so pool stays None here
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
Example 12
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        src_file,
                        prefix,
                        vocab,
                        token_dict,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]

        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
Example 13
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def tokenization(tokens):
        for idx, tok in enumerate(tokens):
            if len(tok) != 0:
                tokens[idx] = vocab.encode(tok, out_type=str)
        return tokens

    def ast_to_graph(ast):
        nodes, tokens, adjacence = [], [], [[] for _ in range(len(ast))]
        for idx, node in enumerate(ast):
            nodes.append(node['type'])
            if 'children' in node:
                tokens.append([])
                for child in node['children']:
                    adjacence[idx].append(child)
                    adjacence[child].append(idx)
            elif 'value' in node:
                tokens.append(node['value'])
            else:
                raise NotImplementedError

        tokens = tokenization(tokens)

        depth = {0: 1}  # 0 for pad
        for idx, node in enumerate(ast[1:], start=1):
            depth[idx] = depth[node['parent']] + 1
        depth = list(depth.values())

        assert len(nodes) == len(tokens) == len(adjacence) == len(depth)
        return nodes, tokens, adjacence, depth

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    def save_node_dict():
        src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/node.jsonl")
        dict = Dictionary.load(src_file)
        tgt_file = os.path.join(args['preprocess']['destdir'], 'node.jsonl')
        PathManager.mkdir(os.path.dirname(tgt_file))
        dict.save(tgt_file)
        return dict

    node_dict = save_node_dict()

    def save_lang_dict():
        src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
        dict = Dictionary.load(src_file)
        tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
        PathManager.mkdir(os.path.dirname(tgt_file))
        dict.save(tgt_file)
        return dict

    lang_dict = save_lang_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        src_file = f"{args['preprocess'][f'{mode}pref']}.ast"

        node_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.node")
        PathManager.mkdir(os.path.dirname(node_file))
        node_dataset = indexed_dataset.make_builder(f"{node_file}.mmap", impl='mmap', vocab_size=len(node_dict))

        depth_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.depth")
        depth_dataset = indexed_dataset.make_builder(f"{depth_file}.mmap", impl='mmap')

        code_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        code_dataset = indexed_dataset.make_builder(f"{code_file}.bin", impl='bin', dtype=str)

        adjacence_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.adjacence")
        adjacence_dataset = indexed_dataset.make_builder(f"{adjacence_file}.bin", impl='bin')

        code_tokens_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        code_tokens_dataset = indexed_dataset.make_builder(f"{code_tokens_file}.bin", impl='bin')

        with file_io.open(src_file, 'r') as reader:
            for idx, line in enumerate(reader):
                line = json_io.json_loads(line)
                ast = bfs_to_dfs(line)
                nodes, tokens, adjacence, depth = ast_to_graph(ast)
                # save node into mmap dataset
                nodes = torch.IntTensor([node_dict.index(tok) for tok in nodes])
                node_dataset.add_item(nodes)
                # save depth into mmap dataset
                depth = torch.IntTensor(depth)
                depth_dataset.add_item(depth)
                # code
                code = ''.join(itertools.chain(*tokens)).replace(constants.SPM_SPACE, ' ').strip()
                code_dataset.add_item(code)
                # tokens
                tokens = [[token_dict.index(tok) for tok in toks] if len(toks) > 0 else [] for toks in tokens]
                code_tokens_dataset.add_item(tokens)
                # adjacence
                for adj in adjacence:
                    assert adj == sorted(adj)
                adjacence_dataset.add_item(adjacence)

        node_dataset.finalize(f"{node_file}.idx")
        depth_dataset.finalize(f"{depth_file}.idx")
        code_dataset.finalize(f"{code_file}.idx")
        code_tokens_dataset.finalize(f"{code_tokens_file}.idx")
        adjacence_dataset.finalize(f"{adjacence_file}.idx")

        # proj indices
        with file_io.open(f"{args['preprocess'][f'{mode}pref']}.proj", 'r') as reader:
            projs = [json_io.json_loads(line) for line in reader]
        proj_indices = Counter(projs)
        proj_indices = [proj_num for idx, proj_num in proj_indices.items()]
        proj_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.proj")
        proj_dataset = indexed_dataset.make_builder(f"{proj_file}.seq", impl='seq')
        proj_dataset.add_item(torch.IntTensor(proj_indices))
        proj_dataset.finalize(f"{proj_file}.idx")
Example 14
    def make_binary_dataset(vocab, input_file, output_file, lang, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    lang,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                                 callback=merge_result)
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        tokenize = string2dfs if lang == 'ast' else string2type_dfs
        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(
                    indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))
Example 15
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count tokens that are not in the dictionary (replaced by the unknown symbol)

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into byte-range chunks; with multi-processing, workers
        # handle the 2nd..Nth chunks while the main process handles the 1st
        # e.g. 1.txt with 10 workers: p0 reads bytes 0-99, p1 reads 100-199, ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # the main process (p0) handles the first chunk; without multi-processing it handles the whole file
        # p0 -> [0, end)
        ds_file = f'{output_file}.mmap'
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=seperate_tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(
                    indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
Example 16
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            use_func, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # count tokens that are not in the dictionary (replaced by the unknown symbol)

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = find_offsets(input_file, num_chunks=num_workers)
        func_offsets = None
        modality = input_file.split('.')[-1]
        if modality == 'code_tokens':
            tokenizer = tokenizers.list_tokenizer
            if use_func:
                func_offsets = Binarizer.find_func_offsets(input_file,
                                                           offsets=offsets)
        elif modality == 'func_name':
            tokenizer = tokenizers.func_name_tokenizer
        elif modality == 'docstring_tokens':
            tokenizer = tokenizers.lower_tokenizer
        else:
            raise NotImplementedError(modality)

        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    tokenizer,
                    use_func and (modality == 'code_tokens'),
                    offsets[worker_id],
                    offsets[worker_id + 1],
                    func_offsets[worker_id] if func_offsets else 0,
                ),
                                 callback=merge_result)
            pool.close()
        # the main process (p0) handles the first chunk; without multi-processing it handles the whole file
        # p0 -> [0, end)
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))

        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizer,
                use_func=use_func and (modality == 'code_tokens'),
                offset=offsets[0],
                end=offsets[1],
                func_offset=func_offsets[0] if func_offsets else 0,
                append_eos=False,
                min_func_len=args['preprocess']['min_func_len'],
            ))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))
Example 17
    def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        aux_dict,
                        prefix,
                        lang,
                        tokenize,
                        max_path_num,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result
                )
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        if lang == 'path':
            sz_ds_file = '{}.sz.mmap'.format(output_file)
            sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                                 vocab_size=len(vocab))
        else:
            sz_ds = None

        def consumer(tensor, size=None):
            ds.add_item(tensor)
            if size is not None:
                sz_ds.add_item(size)

        if sz_ds is None:
            merge_result(
                Binarizer.binarize(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False,
                    max_path_num=max_path_num,
                )
            )
        else:
            merge_result(
                PathSummarizationBinarizer.path_binarizer(
                    input_file, vocab, consumer,
                    tokenize=tokenize, offset=0, end=offsets[1], append_eos=False, type_dict=aux_dict,
                    max_path_num=max_path_num,
                )
            )
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                if sz_ds is not None:
                    sz_ds.merge_file_(f"{temp_file_path}.sz")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                if sz_ds is not None:
                    os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                    os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
        ds.finalize('{}.idx'.format(output_file))
        if sz_ds is not None:
            sz_ds.finalize('{}.sz.idx'.format(output_file))
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )
Example 18
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained(
        'microsoft/graphcodebert-base')

    file = os.path.join(args['preprocess']['destdir'], 'dfs.jsonl')
    node_dict = Dictionary.load(file)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess'][
        'tgt_lang']

    # code tokens => code tokens
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code_tokens"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code_tokens")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_tokens,
                    (args, src_file, vocab, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.subtokenize(line)
                code_tokens = torch.IntTensor(
                    vocab.tokens_to_indices(code_tokens))
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))

    # code => code
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.code"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.code")

        ds_file = '{}.bin'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="bin",
                                          vocab_size=len(vocab))
        with open(src_file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                ds.add_item(line)
        ds.finalize('{}.idx'.format(dst_file))

    # dfs => dfs
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = str.replace(args['preprocess'][f'{mode}pref'], '*', lang)
        src_file = f"{data_dir}.dfs"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang,
                                f"{mode}.dfs")
        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_dfs,
                    (args, src_file, node_dict, prefix, offsets[worker_id],
                     offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl="mmap",
                                          vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
                ds.add_item(dfs)
                line = reader.readline()

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))
Example 19
)
from ncc.utils.file_ops.yaml_io import recursive_expanduser
from ncc.utils.file_ops import file_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    task = tasks.get_task('multilingual_denoising')
    base_dir = recursive_expanduser(
        '~/ncc_data/codexglue/code_to_text/multilingual_denoising/data-mmap')

    dict_file = os.path.join(base_dir, 'dict.jsonl')
    vocab = task.load_dictionary(dict_file)

    for mode in MODES:
        dst_file = os.path.join(base_dir, 'docstring', f"{mode}.docstring.spm")
        PathManager.mkdir(os.path.dirname(dst_file))
        # mmap
        ds = indexed_dataset.make_builder(f'{dst_file}.mmap',
                                          impl='mmap',
                                          vocab_size=len(vocab))
        for lang in LANGUAGES:
            src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
            ds.merge_file_(src_file)
        ds.finalize(f'{dst_file}.idx')
        # # raw
        # with file_io.open(ds, 'w') as writer:
        #     for lang in LANGUAGES:
        #         src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
        #         with open(src_file, 'r') as reader:
        #             shutil.copyfileobj(reader, writer)