Example #1
    def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename_terminals, dest_filename = dest_filename + '.terminals' + str(idx), dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename_terminals, 'w') as writer_terminals, \
            file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    paths = util_path.ast_to_path(ast, MAX_PATH=PATH_NUM)
                    if paths is None:
                        paths = [[None] * 3] * PATH_NUM
                    else:
                        # duplicate/sample existing paths until their count reaches PATH_NUM
                        if len(paths) < PATH_NUM:
                            supply_ids = list(range(len(paths))) * ((PATH_NUM - len(paths)) // len(paths)) \
                                         + random.sample(range(len(paths)), ((PATH_NUM - len(paths)) % len(paths)))
                            paths.extend([paths[idx] for idx in supply_ids])
                    random.shuffle(paths)
                    assert len(paths) == PATH_NUM
                    head, body, tail = zip(*paths)
                else:
                    head, body, tail = [None] * PATH_NUM, [None] * PATH_NUM, [None] * PATH_NUM
                # terminals
                for terminal in itertools.chain(*zip(head, tail)):
                    print(json_io.json_dumps(terminal), file=writer_terminals)
                # path
                for b in body:
                    print(json_io.json_dumps(b), file=writer)
                line = safe_readline(reader)
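
These worker functions all share the (filename, dest_filename, idx, start, end, *args) signature: each process reads one byte range of the input file and writes its shard to dest_filename + str(idx). A minimal driver sketch, assuming the Binarizer.find_offsets helper used in Example #29 is importable; the run_sharded name and the pool wiring are illustrative only, not the library's actual entry point:

from multiprocessing import Pool

def run_sharded(worker_fn, filename, dest_filename, num_workers, **kwargs):
    # split the input file into num_workers byte ranges (as in Example #29)
    offsets = Binarizer.find_offsets(filename, num_workers)
    with Pool(processes=num_workers) as pool:
        jobs = [
            # each worker writes dest_filename + str(idx); the extra parameters are
            # wrapped in a list because the workers read them back via args[0][0]
            pool.apply_async(worker_fn, (filename, dest_filename, idx,
                                         offsets[idx], offsets[idx + 1], [kwargs]))
            for idx in range(num_workers)
        ]
        for job in jobs:
            job.get()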
Example #2
def flatten_attrs(raw_file, flatten_dir, mode, attrs, start=0, end=-1):
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, '{}.{}'.format(mode, attr))
        os.makedirs(os.path.dirname(attr_file), exist_ok=True)
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            filename = os.path.join(os.path.dirname(raw_file), line.strip())
            # tokens, types = parse_file(filename)
            try:
                tokens, types = parse_file(filename)
                # replace None with [PAD] for type dictionary build
                types = [PAD if t is None else t for t in types]
            except Exception as err:
                # print(err)
                # print(f'parsing {filename} error')
                line = file_io.safe_readline(reader)
                continue
            print(json_io.json_dumps(tokens), file=attr_writers['code_tokens'])
            print(json_io.json_dumps(types), file=attr_writers['code_types'])
            line = file_io.safe_readline(reader)
Example #3
def tokenization(
    in_file,
    out_file,
    lang,
    attr,
    start=0,
    end=-1,
):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file,
                                                            'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()

            if lang == 'python' and attr == 'code':
                tokens = python_code_tokenize(line)
                line = ' '.join(tokens).strip()

            if attr == 'code':
                line = normalize_program(line, remove_eol=True)
            else:
                line = normalize_docstring(line,
                                           remove_eol=True,
                                           remove_url=True)

            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
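
tokenizer.encode_as_pieces matches the SentencePiece Python API, and the csn.spm.vocab file referenced in Example #29 suggests a SentencePiece model is used. A minimal sketch of how such a tokenizer could be constructed; the model path below is hypothetical:

import sentencepiece as spm

# hypothetical model path, assumed to sit next to the csn.spm.vocab file
tokenizer = spm.SentencePieceProcessor()
tokenizer.load('csn/csn.spm.model')

tokens = tokenizer.encode_as_pieces('def add(a, b): return a + b')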
Example #4
    def binary_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    try:
                        ast = util_ast.value2children(ast)
                        ast = util_ast.remove_root_with_uni_child(ast)
                        root_idx = util_ast.get_root_idx(ast)
                        ast = util_ast.delete_node_with_uni_child(ast, idx=root_idx)
                        root_idx = util_ast.get_root_idx(ast)
                        bin_ast = util_ast.binarize_tree(ast, idx=root_idx)  # to binary ast tree
                        root_idx = util_ast.get_root_idx(ast)
                        bin_ast = util_ast.reset_indices(bin_ast, root_idx)  # reset node indices
                        bin_ast = util_ast.pad_leaf_node(bin_ast, MAX_SUB_TOKEN_LEN)
                    except RecursionError:
                        LOGGER.error('RecursionError, ignore this tree')
                        bin_ast = None
                    except Exception as err:
                        LOGGER.error(err)
                        bin_ast = None
                else:
                    bin_ast = None
                print(json_io.json_dumps(bin_ast), file=writer)
                line = safe_readline(reader)
Example #5
    def docstring_tokens_fn(filename,
                            dest_filename,
                            idx,
                            start=0,
                            end=-1,
                            *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring_tokens = json_io.json_loads(line)
                if docstring_tokens:
                    docstring_tokens = [
                        token for token in docstring_tokens \
                        if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                    ]
                    if not all(
                            str.isascii(token) for token in docstring_tokens):
                        docstring_tokens = None
                    if (docstring_tokens is
                            None) or not (3 < len(docstring_tokens) <= 50):
                        docstring_tokens = None
                else:
                    docstring_tokens = None
                print(json_io.json_dumps(docstring_tokens), file=writer)
                line = safe_readline(reader)
Example #6
    def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                code_tokens = json_io.json_loads(line)
                if code_tokens:
                    # filter comments from code_tokens, e.g. lines starting with // or #, or /* ... */ blocks
                    code_tokens = [token for token in code_tokens
                                   if not (str.startswith(token, '//') or str.startswith(token, '#') or \
                                           (str.startswith(token, '/*') and str.endswith(token, '*/')))
                                   ]

                    if not all(str.isascii(token) for token in code_tokens):
                        code_tokens = None
                    if code_tokens is None or len(code_tokens) < 1:
                        code_tokens = None
                else:
                    code_tokens = None

                print(json_io.json_dumps(code_tokens), file=writer)
                line = safe_readline(reader)
Example #7
    def _save(self, f, kv_iterator):
        if isinstance(f, str):
            PathManager.mkdir(os.path.dirname(f))
            with file_io.open(f, "w") as fd:
                return self.save(fd)
        for k, v in kv_iterator:
            print(json_io.json_dumps([k, v]), file=f)
Example #8
    def raw_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
        lang = kwargs.get('lang')
        so_dir = kwargs.get('so_dir')

        so_filename = os.path.join(os.path.expanduser(so_dir),
                                   '{}.so'.format(lang))
        parser = TreeSitterASTParser(so_filename, lang)
        dest_filename = dest_filename + str(idx)
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                code = json_io.json_loads(line)
                if code:
                    raw_ast = parser.parse_raw_ast(code)
                else:
                    raw_ast = None
                print(json_io.json_dumps(raw_ast), file=writer)
                line = safe_readline(reader)
Example #9
def flatten(raw_file, dst_dir, mode):
    """flatten attributes of raw data"""
    data_frame = pd.read_csv(raw_file)
    attrs = data_frame.columns.values.tolist()[1:-1]
    LOGGER.info('Cast attributes({}) of OpenCL-{} dataset'.format(attrs, lang))
    for attr in attrs:
        dst_file = os.path.join(dst_dir, f"{mode}.{attr}")
        data = getattr(data_frame, attr).values.tolist()
        with file_io.open(dst_file, 'w') as writer:
            for line in data:
                print(json_io.json_dumps(line), file=writer)
Example #10
def __collect_all_and_save(asts, args, output_file):
    from ncc.utils.file_ops.json_io import json_dumps
    parallel = joblib.Parallel(n_jobs=args.n_jobs)
    func = joblib.delayed(__collect_samples)

    samples = parallel(func(ast, args) for ast in tqdm.tqdm(asts))
    samples = list(itertools.chain.from_iterable(samples))

    with open(output_file, 'w') as f:
        for line_index, line in enumerate(samples):
            line = json_dumps(line)
            print(line, file=f)
Example #11
def ast_fn(filename, dest_filename, idx, start=0, end=-1):
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r",
                      encoding="UTF-8") as reader, open(dest_filename,
                                                        'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = convert(line)
            print(json_io.json_dumps(ast), file=writer)
            line = file_io.safe_readline(reader)
Example #12
    def ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
        parser = CodeParser(SO_FILE=os.path.join(kwargs['so_dir'], f"{kwargs['lang']}.so"), LANGUAGE=kwargs['lang'])

        dest_filename = f"{dest_filename}{idx}"
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                ast = parser.parse_raw_ast(code=line, MAX_AST_SIZE=99999999999, append_index=True)
                print(json_io.json_dumps(ast), file=writer)
                line = safe_readline(reader)
Example #13
    def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        """code => raw_ast"""
        kwargs = args[0][0]  # canot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        PathManager.mkdir(os.path.dirname(dest_filename))
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                docstring = json_io.json_loads(line)
                print(json_io.json_dumps(docstring), file=writer)
                line = safe_readline(reader)
Example #14
def code_tokenization(src_file):
    from clgen._atomizer import GreedyAtomizer
    from clgen._langs import Language

    with open(src_file, 'r') as reader:
        src_codes = reader.readlines()
    opencl_lang = Language.from_str('opencl')
    atomizer = GreedyAtomizer.from_text(opencl_lang, text='\n'.join(src_codes))

    dst_file = f"{src_file}_tokens"
    with open(dst_file, 'w') as writer:
        for code in src_codes:
            code = json_io.json_loads(code)
            code_tokens = atomizer.atomize(code)
            code_tokens = [atomizer.atoms[idx] for idx in code_tokens]
            print(json_io.json_dumps(code_tokens), file=writer)
Example #15
    def func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                func_name = json_io.json_loads(line)
                func = func_name.split('.')[-1]
                print(json_io.json_dumps(func), file=writer)
                line = safe_readline(reader)
Example #16
    def dfs_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = f"{dest_filename}{idx}"
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast is not None:
                    dfs, _ = ast_to_dfs(ast)
                else:
                    dfs = None
                print(json_io.json_dumps(dfs), file=writer)
                line = safe_readline(reader)
Example #17
    def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename,
                          "r") as reader, file_io.open(dest_filename,
                                                       'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    paths = util_path.ast_to_path(ast)
                    print(json_io.json_dumps(paths), file=writer)
                line = safe_readline(reader)
Example #18
    def traversal_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    ast_traversal = util_traversal.get_dfs(ast)
                else:
                    ast_traversal = None
                print(json_io.json_dumps(ast_traversal), file=writer)
                line = safe_readline(reader)
Example #19
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')
    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
Example #20
    def sbtao_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                ast = json_io.json_loads(line)
                if ast:
                    ast = util_ast.value2children(ast)
                    padded_ast = util_ast.pad_leaf_node(ast, MAX_SUB_TOKEN_LEN)
                    root_idx = util_ast.get_root_idx(padded_ast)
                    sbt = util_ast.build_sbtao_tree(padded_ast, idx=root_idx)
                else:
                    sbt = None
                print(json_io.json_dumps(sbt), file=writer)
                line = safe_readline(reader)
Example #21
def xfg(src_dir, languages, dst_dir):
    xfg_src_files = PathManager.ls(os.path.join(src_dir, "kernels_ir", '*.ll'))

    filenames = []
    ir_data = []
    for filename in xfg_src_files:
        filenames.append(os.path.basename(filename)[:-3])
        with open(filename, 'r') as reader:
            lines = reader.read().splitlines()
        ir_data.append(lines)
    # convert list to dict
    filenames = {name: idx for idx, name in enumerate(filenames)}

    processed_data, _ = inst2vec_preprocess.preprocess(ir_data)
    processed_data, _ = task_utils.inline_struct_types_txt(
        processed_data, ir_data)
    processed_data = task_utils.abstract_statements_from_identifiers_txt(
        processed_data)

    for idx, lines in enumerate(processed_data):
        processed_data[idx] = [
            line for line in lines if
            not re.match(r'((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)', line)
        ]

    for lang in languages:
        raw_file = os.path.join(src_dir, f'{lang}.csv')
        # read raw csv file to load corresponding benchmarks
        data_frame = pd.read_csv(raw_file)
        benchmarks = data_frame["benchmark"].values.tolist()
        datasets = data_frame["dataset"].values.tolist()
        del data_frame

        # write
        dst_file = os.path.join(dst_dir, lang, f'train.xfg')
        with open(dst_file, 'w') as writer:
            for idx, (bm, ds) in enumerate(zip(benchmarks, datasets)):
                if bm[:3] == "npb":
                    bm += f'_{ds}'
                xfg = processed_data[filenames[bm]]
                print(json_io.json_dumps(xfg), file=writer)
Example #22
def flatten_attrs(raw_file, flatten_dir, lang, mode, attrs):
    def _get_file_info(filename):
        """get mode and file index from file name"""
        filename = os.path.split(filename)[-1]
        filename = filename[:str.rfind(filename, '.jsonl.gz')]
        _, _, idx = filename.split('_')
        return idx

    idx = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_dir = os.path.join(flatten_dir, lang, mode, attr)
        PathManager.mkdir(attr_dir)
        attr_file = os.path.join(attr_dir, '{}.jsonl'.format(idx))
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
Example #23
    def code_wo_func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
        kwargs = args[0][0]  # cannot feed dict parameters in multi-processing

        func_filename = filename[:str.rfind(filename, '.')] + '.func'
        dest_filename = dest_filename + str(idx)
        with file_io.open(filename, "r") as reader, open(func_filename, 'r') as func_reader, \
            file_io.open(dest_filename, 'w') as writer:
            reader.seek(start)
            line = safe_readline(reader)
            func_line = safe_readline(func_reader)
            while line and func_line:
                if end > 0 and reader.tell() > end:
                    break
                code = json_io.json_loads(line)
                func_name = json_io.json_loads(func_line)
                start_idx = str.find(code, func_name)
                if start_idx != -1:
                    code_wo_func = code[:start_idx] + code[start_idx +
                                                           len(func_name):]
                else:
                    code_wo_func = None
                print(json_io.json_dumps(code_wo_func), file=writer)
                line = safe_readline(reader)
                func_line = safe_readline(func_reader)
Example #24
def tokenization(
    in_file,
    out_file,
    lang,
    attr,
    start=0,
    end=-1,
):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file,
                                                            'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()

            if lang == 'python' and attr == 'code':
                line = re.sub(r'\s+', ' ', line)

            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
Example #25
def main(args, out_file=None, **kwargs):
    assert args['eval']['path'] is not None, '--path required for evaluation!'

    LOGGER.info(args)
    # during evaluation, set fraction_using_func_name = 0, i.e., do not sample from func_name
    args['task']['fraction_using_func_name'] = 0.
    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        device = os.environ.get('CUDA_VISIBLE_DEVICES',
                                [0])[0]  # get first device as default
        torch.cuda.set_device(f'cuda:{device}')

    task = tasks.setup_task(args)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    if out_file is not None:
        writer = open(out_file, 'w')
        top1_indices = []

    for lang in deepcopy(args['dataset']['langs']):
        args['dataset']['langs'] = [lang]
        # Load dataset splits
        LOGGER.info(f'Evaluating {lang} dataset')
        task.load_dataset(args['dataset']['gen_subset'])
        dataset = task.dataset(args['dataset']['gen_subset'])

        # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
        for model in models:
            model.make_generation_fast_()
            if args['common']['fp16']:
                model.half()
            if use_cuda:
                model.cuda()

        assert len(models) > 0

        LOGGER.info('num. model params: {}'.format(
            sum(p.numel() for p in models[0].parameters())))

        itr = task.get_batch_iterator(
            dataset=dataset,
            max_tokens=args['dataset']['max_tokens'] or 36000,
            max_sentences=args['eval']['max_sentences'],
            max_positions=utils.resolve_max_positions(
                *[model.max_positions() for model in models]),
            ignore_invalid_inputs=True,
            num_shards=args['dataset']['num_shards'],
            shard_id=args['dataset']['shard_id'],
            num_workers=args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args['common']['log_format'],
            log_interval=args['common']['log_interval'],
            default_log_format=('tqdm' if not args['common']['no_progress_bar']
                                else 'none'),
        )

        code_reprs, query_reprs = [], []
        for sample in progress:
            if 'net_input' not in sample:
                continue
            sample = move_to_cuda(sample) if use_cuda else sample
            batch_code_reprs, batch_query_reprs = models[0](
                **sample['net_input'])

            if use_cuda:
                batch_code_reprs = batch_code_reprs.cpu().detach()
                batch_query_reprs = batch_query_reprs.cpu().detach()

            code_reprs.append(batch_code_reprs)
            query_reprs.append(batch_query_reprs)
        code_reprs = torch.cat(code_reprs, dim=0)
        query_reprs = torch.cat(query_reprs, dim=0)

        assert code_reprs.shape == query_reprs.shape, (code_reprs.shape,
                                                       query_reprs.shape)
        eval_size = len(
            code_reprs
        ) if args['eval']['eval_size'] == -1 else args['eval']['eval_size']

        k, MRR, topk_idx, topk_prob = 3, [], [], []
        for idx in range(len(dataset) // eval_size):
            code_emb = code_reprs[idx:idx + eval_size, :]
            query_emb = query_reprs[idx:idx + eval_size, :]

            if use_cuda:
                code_emb = code_emb.cuda()
                query_emb = query_emb.cuda()

            if args['criterion'] == 'retrieval_cosine':
                src_emb_norm = torch.norm(code_emb, dim=-1,
                                          keepdim=True) + 1e-10
                tgt_emb_norm = torch.norm(query_emb, dim=-1,
                                          keepdim=True) + 1e-10
                logits = (query_emb / tgt_emb_norm) @ (code_emb /
                                                       src_emb_norm).t()
            elif args['criterion'] == 'retrieval_softmax':
                logits = query_emb @ code_emb.t()
            else:
                raise NotImplementedError(args['criterion'])

            correct_scores = logits.diag()
            compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
            if out_file is not None:
                top1_indices.extend((logits.topk(1, dim=-1)[1].view(-1) + 1 +
                                     idx * eval_size).tolist())
            mrr = 1 / compared_scores.sum(dim=-1).float()
            MRR.extend(mrr.tolist())

        if len(dataset) % eval_size:
            code_emb = code_reprs[-eval_size:, :]
            query_emb = query_reprs[-eval_size:, :]

            if use_cuda:
                code_emb = code_emb.cuda()
                query_emb = query_emb.cuda()

            if args['criterion'] == 'retrieval_cosine':
                src_emb_norm = torch.norm(code_emb, dim=-1,
                                          keepdim=True) + 1e-10
                tgt_emb_norm = torch.norm(query_emb, dim=-1,
                                          keepdim=True) + 1e-10
                logits = (query_emb / tgt_emb_norm) @ (code_emb /
                                                       src_emb_norm).t()
            elif args['criterion'] == 'retrieval_softmax':
                logits = query_emb @ code_emb.t()
            else:
                raise NotImplementedError(args['criterion'])

            correct_scores = logits.diag()
            compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
            last_ids = len(code_reprs) % eval_size
            mrr = 1 / compared_scores.sum(dim=-1).float()[-last_ids:]
            MRR.extend(mrr.tolist())

        print('{}, mrr: {:.4f}'.format(lang, np.mean(MRR)))
        if out_file is not None:
            for idx, mrr in enumerate(MRR):
                print(
                    json_io.json_dumps({
                        "language": lang,
                        "id": idx,
                        "mrr": round(mrr, 6),
                        "topk": top1_indices[idx]
                    }),
                    file=writer,
                )
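
The ranking trick above counts, per query (row), how many candidate scores are at least as large as the diagonal (correct-pair) score; that count is the rank, and its reciprocal contributes to the MRR. A small illustration with made-up logits:

import torch

logits = torch.tensor([[0.9, 0.1, 0.3],
                       [0.2, 0.4, 0.8],
                       [0.1, 0.5, 0.6]])
correct_scores = logits.diag()                                # true-pair score per row
compared_scores = logits >= correct_scores.unsqueeze(dim=-1)  # ties or beats the true pair
ranks = compared_scores.sum(dim=-1).float()                   # ranks: [1., 2., 1.]
mrr = (1 / ranks).mean()                                      # mean of [1, 1/2, 1] ≈ 0.83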
Example #26
def cast_code_tokens(src_file, dst_file):
    with file_io.open(src_file, 'r') as reader, file_io.open(dst_file,
                                                             'w') as writer:
        for line in reader:
            print(json_io.json_dumps(line.split()), file=writer)
Example #27
def cast_docstring(src_file, dst_file):
    with file_io.open(src_file, 'r') as reader, file_io.open(dst_file,
                                                             'w') as writer:
        for line in reader:
            print(json_io.json_dumps(line.rstrip('\n')), file=writer)
Example #28
def cast_docstring_tokens(src_file, dst_file):
    with file_io.open(src_file, 'r') as reader, file_io.open(dst_file,
                                                             'w') as writer:
        for line in reader:
            docstring_tokens = line.split()
            print(json_io.json_dumps(docstring_tokens), file=writer)
Example #29
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'],
                             ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'],
                             ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'],
                            file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    from dataset.codexglue.code_to_text import BPE_DIR

    source_dict_file = os.path.join(BPE_DIR, 'csn/csn.spm.vocab')
    target_dict_file = os.path.join(
        os.path.dirname(args['preprocess']['destdir']), 'dict.jsonl')
    with open(source_dict_file, 'r') as reader, open(target_dict_file,
                                                     'w') as writer:
        for line in reader:
            print(json_io.json_dumps([line.split('\t')[0], 100]), file=writer)
    src_dict = tgt_dict = task.load_dictionary(target_dict_file)

    src_dict.save(dict_path(
        args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file,
                            num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks
        # if multi-processing is used, workers handle the 2nd through last chunks first
        # e.g. 1.txt -> 10 processes: (p0)(0-99), (p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # process the 1st chunk here; if multi-processing is unavailable, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=True,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processes' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab,
                         args['preprocess']['trainpref'],
                         "train",
                         lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(
                    args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab,
                             validpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(
                    args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab,
                             testpref,
                             outprefix,
                             lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
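
make_binary_dataset dispatches a binarize worker to each sub-process, but that worker is not shown in this example. A hypothetical reconstruction under the same conventions (shard files located via indexed_dataset.data_file_path / index_file_path, statistics dict returned to merge_result); the body below is an assumption, not the library's verbatim code:

def binarize(args, filename, vocab, output_prefix, offset, end):
    # build one shard per sub-process; the main process later merges and deletes it
    ds = indexed_dataset.make_builder(
        indexed_dataset.data_file_path(output_prefix),
        impl=args['preprocess']['dataset_impl'],
        vocab_size=len(vocab))
    res = Binarizer.binarize(
        filename,
        vocab,
        lambda t: ds.add_item(t),
        tokenize=tokenization.json_tokenizer,
        offset=offset,
        end=end,
        append_eos=True,
    )
    ds.finalize(indexed_dataset.index_file_path(output_prefix))
    # dict with "nseq", "ntok", "replaced" keys, consumed by merge_result
    return res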
Example #30
        "--dataset_dir", "-d", default=RAW_DIR, type=str, help="raw dataset download directory",
    )
    parser.add_argument(
        "--flatten_dir", "-f", default=ATTRIBUTES_DIR, type=str,
        help="data directory of flatten attribute",
    )
    parser.add_argument(
        "--attrs", "-a",
        default=['code', 'code_tokens', 'code_types', 'ast'],
        type=str, nargs='+',
    )
    parser.add_argument(
        "--cores", "-c", default=cpu_count(), type=int, help="cpu cores for flatten raw data attributes",
    )
    args = parser.parse_args()
    # print(args)

    for mode in MODES:
        src_files = [os.path.join(args.dataset_dir, f"{mode}.{lang}") for lang in args.languages]
        src_readers = [file_io.open(file, 'r') for lang, file in zip(args.languages, src_files)]

        for lang in args.languages:
            PathManager.mkdir(os.path.join(args.flatten_dir, lang))
        dst_files = [os.path.join(args.flatten_dir, lang, f"{mode}.code") for lang in args.languages]
        dst_writers = {lang: file_io.open(file, 'w') for lang, file in zip(args.languages, dst_files)}

        for lines in zip(*src_readers):
            lines = list(map(lambda line: SPACE_SPLITTER.sub(" ", line.strip()), lines))
            for lang, line in zip(args.languages, lines):
                print(json_io.json_dumps(line.strip()), file=dst_writers[lang])