def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename_terminals, dest_filename = dest_filename + '.terminals' + str(idx), dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename_terminals, 'w') as writer_terminals, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                paths = util_path.ast_to_path(ast, MAX_PATH=PATH_NUM)
                if paths is None:
                    paths = [[None] * 3] * PATH_NUM
                else:
                    # pad the path list up to PATH_NUM
                    if len(paths) < PATH_NUM:
                        supply_ids = list(range(len(paths))) * ((PATH_NUM - len(paths)) // len(paths)) \
                                     + random.sample(range(len(paths)), (PATH_NUM - len(paths)) % len(paths))
                        paths.extend([paths[supply_idx] for supply_idx in supply_ids])
                    random.shuffle(paths)
                    assert len(paths) == PATH_NUM
                head, body, tail = zip(*paths)
            else:
                head, body, tail = [None] * PATH_NUM, [None] * PATH_NUM, [None] * PATH_NUM
            # terminals
            for terminal in itertools.chain(*zip(head, tail)):
                print(json_io.json_dumps(terminal), file=writer_terminals)
            # paths
            for b in body:
                print(json_io.json_dumps(b), file=writer)
            line = safe_readline(reader)
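# Note: the padding arithmetic in path_fn is easy to misread when inlined. The sketch below is a
# minimal, standard-library-only illustration of the same "pad up to PATH_NUM" step; the helper
# name pad_to_length and the sample triples are hypothetical, not part of the original code.
import random

def pad_to_length(paths, target_len):
    """Repeat/sample existing entries until the list reaches target_len (illustrative only)."""
    assert 0 < len(paths) <= target_len
    missing = target_len - len(paths)
    supply_ids = list(range(len(paths))) * (missing // len(paths)) \
                 + random.sample(range(len(paths)), missing % len(paths))
    padded = paths + [paths[i] for i in supply_ids]
    random.shuffle(padded)
    return padded

# e.g. pad 3 (head, body, tail) triples up to 5
print(pad_to_length([("a", "p1", "b"), ("c", "p2", "d"), ("e", "p3", "f")], 5))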
def flatten_attrs(raw_file, flatten_dir, mode, attrs, start=0, end=-1):
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, '{}.{}'.format(mode, attr))
        os.makedirs(os.path.dirname(attr_file), exist_ok=True)
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            filename = os.path.join(os.path.dirname(raw_file), line.strip())
            try:
                tokens, types = parse_file(filename)
                # replace None with [PAD] for type dictionary build
                types = [PAD if t is None else t for t in types]
            except Exception:
                # skip files that cannot be parsed
                line = file_io.safe_readline(reader)
                continue
            print(json_io.json_dumps(tokens), file=attr_writers['code_tokens'])
            print(json_io.json_dumps(types), file=attr_writers['code_types'])
            line = file_io.safe_readline(reader)
def tokenization(
    in_file, out_file, lang, attr,
    start=0, end=-1,
):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()
            if lang == 'python' and attr == 'code':
                tokens = python_code_tokenize(line)
                line = ' '.join(tokens).strip()
            if attr == 'code':
                line = normalize_program(line, remove_eol=True)
            else:
                line = normalize_docstring(line, remove_eol=True, remove_url=True)
            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
def binary_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                try:
                    ast = util_ast.value2children(ast)
                    ast = util_ast.remove_root_with_uni_child(ast)
                    root_idx = util_ast.get_root_idx(ast)
                    ast = util_ast.delete_node_with_uni_child(ast, idx=root_idx)
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.binarize_tree(ast, idx=root_idx)  # to binary ast tree
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.reset_indices(bin_ast, root_idx)  # reset node indices
                    bin_ast = util_ast.pad_leaf_node(bin_ast, MAX_SUB_TOKEN_LEN)
                except RecursionError:
                    LOGGER.error('RecursionError, ignore this tree')
                    bin_ast = None
                except Exception as err:
                    LOGGER.error(err)
                    bin_ast = None
            else:
                bin_ast = None
            print(json_io.json_dumps(bin_ast), file=writer)
            line = safe_readline(reader)
def docstring_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """docstring_tokens => filtered docstring_tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring_tokens = json_io.json_loads(line)
            if docstring_tokens:
                docstring_tokens = [
                    token for token in docstring_tokens
                    if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                ]
                if not all(str.isascii(token) for token in docstring_tokens):
                    docstring_tokens = None
                if (docstring_tokens is None) or not (3 < len(docstring_tokens) <= 50):
                    docstring_tokens = None
            else:
                docstring_tokens = None
            print(json_io.json_dumps(docstring_tokens), file=writer)
            line = safe_readline(reader)
def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code_tokens => filtered code_tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code_tokens = json_io.json_loads(line)
            if code_tokens:
                # filter comments in code_tokens, e.g. //***\n, #..., /* */\n
                code_tokens = [
                    token for token in code_tokens
                    if not (str.startswith(token, '//') or str.startswith(token, '#') or
                            (str.startswith(token, '/*') and str.endswith(token, '*/')))
                ]
                if not all(str.isascii(token) for token in code_tokens):
                    code_tokens = None
                if code_tokens is None or len(code_tokens) < 1:
                    code_tokens = None
            else:
                code_tokens = None
            print(json_io.json_dumps(code_tokens), file=writer)
            line = safe_readline(reader)
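# A quick, self-contained illustration of the comment/ASCII filters used in code_tokens_fn and
# docstring_tokens_fn; the sample token list below is made up for demonstration purposes only.
sample_tokens = ["def", "add", "(", "a", ",", "b", ")", ":", "# sum two ints", "return", "a", "+", "b", "/* doc */"]
filtered = [
    token for token in sample_tokens
    if not (token.startswith('//') or token.startswith('#') or
            (token.startswith('/*') and token.endswith('*/')))
]
print(filtered)                                    # comment-like tokens removed
print(all(token.isascii() for token in filtered))  # True -> this sample would be kept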
def _save(self, f, kv_iterator):
    if isinstance(f, str):
        PathManager.mkdir(os.path.dirname(f))
        with file_io.open(f, "w") as fd:
            return self.save(fd)
    for k, v in kv_iterator:
        print(json_io.json_dumps([k, v]), file=f)
def raw_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code => raw_ast"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    lang = kwargs.get('lang')
    so_dir = kwargs.get('so_dir')

    so_filename = os.path.join(os.path.expanduser(so_dir), '{}.so'.format(lang))
    parser = TreeSitterASTParser(so_filename, lang)
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            if code:
                raw_ast = parser.parse_raw_ast(code)
            else:
                raw_ast = None
            print(json_io.json_dumps(raw_ast), file=writer)
            line = safe_readline(reader)
def flatten(raw_file, dst_dir, mode):
    """flatten attributes of raw data"""
    data_frame = pd.read_csv(raw_file)
    attrs = data_frame.columns.values.tolist()[1:-1]
    LOGGER.info('Cast attributes({}) of OpenCL-{} dataset'.format(attrs, lang))
    for attr in attrs:
        dst_file = os.path.join(dst_dir, f"{mode}.{attr}")
        data = getattr(data_frame, attr).values.tolist()
        with file_io.open(dst_file, 'w') as writer:
            for line in data:
                print(json_io.json_dumps(line), file=writer)
def __collect_all_and_save(asts, args, output_file):
    from ncc.utils.file_ops.json_io import json_dumps

    parallel = joblib.Parallel(n_jobs=args.n_jobs)
    func = joblib.delayed(__collect_samples)
    samples = parallel(func(ast, args) for ast in tqdm.tqdm(asts))
    samples = list(itertools.chain.from_iterable(samples))

    with open(output_file, 'w') as f:
        for line_index, line in enumerate(samples):
            line = json_dumps(line)
            print(line, file=f)
def ast_fn(filename, dest_filename, idx, start=0, end=-1):
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r", encoding="UTF-8") as reader, open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = convert(line)
            print(json_io.json_dumps(ast), file=writer)
            line = file_io.safe_readline(reader)
def ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    parser = CodeParser(SO_FILE=os.path.join(kwargs['so_dir'], f"{kwargs['lang']}.so"), LANGUAGE=kwargs['lang'])
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = parser.parse_raw_ast(code=line, MAX_AST_SIZE=99999999999, append_index=True)
            print(json_io.json_dumps(ast), file=writer)
            line = safe_readline(reader)
def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """docstring => docstring (pass-through copy into per-worker shard)"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring = json_io.json_loads(line)
            print(json_io.json_dumps(docstring), file=writer)
            line = safe_readline(reader)
def code_tokenization(src_file):
    from clgen._atomizer import GreedyAtomizer
    from clgen._langs import Language

    with open(src_file, 'r') as reader:
        src_codes = reader.readlines()
    opencl_lang = Language.from_str('opencl')
    atomizer = GreedyAtomizer.from_text(opencl_lang, text='\n'.join(src_codes))

    dst_file = f"{src_file}_tokens"
    with open(dst_file, 'w') as writer:
        for code in src_codes:
            code = json_io.json_loads(code)
            code_tokens = atomizer.atomize(code)
            code_tokens = [atomizer.atoms[idx] for idx in code_tokens]
            print(json_io.json_dumps(code_tokens), file=writer)
def func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            func_name = json_io.json_loads(line)
            func = func_name.split('.')[-1]
            print(json_io.json_dumps(func), file=writer)
            line = safe_readline(reader)
def dfs_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast is not None:
                dfs, _ = ast_to_dfs(ast)
            else:
                dfs = None
            print(json_io.json_dumps(dfs), file=writer)
            line = safe_readline(reader)
def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                paths = util_path.ast_to_path(ast)
                print(json_io.json_dumps(paths), file=writer)
            line = safe_readline(reader)
def traversal_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast_traversal = util_traversal.get_dfs(ast)
            else:
                ast_traversal = None
            print(json_io.json_dumps(ast_traversal), file=writer)
            line = safe_readline(reader)
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')

    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
def sbtao_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast = util_ast.value2children(ast)
                padded_ast = util_ast.pad_leaf_node(ast, MAX_SUB_TOKEN_LEN)
                root_idx = util_ast.get_root_idx(padded_ast)
                sbt = util_ast.build_sbtao_tree(padded_ast, idx=root_idx)
            else:
                sbt = None
            print(json_io.json_dumps(sbt), file=writer)
            line = safe_readline(reader)
def xfg(src_dir, languages, dst_dir):
    xfg_src_files = PathManager.ls(os.path.join(src_dir, "kernels_ir", '*.ll'))
    filenames = []
    ir_data = []
    for filename in xfg_src_files:
        filenames.append(os.path.basename(filename)[:-3])
        with open(filename, 'r') as reader:
            lines = reader.read().splitlines()
        ir_data.append(lines)
    # convert list to dict
    filenames = {name: idx for idx, name in enumerate(filenames)}

    processed_data, _ = inst2vec_preprocess.preprocess(ir_data)
    processed_data, _ = task_utils.inline_struct_types_txt(processed_data, ir_data)
    processed_data = task_utils.abstract_statements_from_identifiers_txt(processed_data)
    for idx, lines in enumerate(processed_data):
        processed_data[idx] = [
            line for line in lines
            if not re.match(r'((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)', line)
        ]

    for lang in languages:
        raw_file = os.path.join(src_dir, f'{lang}.csv')
        # read raw csv file to load corresponding benchmarks
        data_frame = pd.read_csv(raw_file)
        benchmarks = data_frame["benchmark"].values.tolist()
        datasets = data_frame["dataset"].values.tolist()
        del data_frame
        # write
        dst_file = os.path.join(dst_dir, lang, 'train.xfg')
        with open(dst_file, 'w') as writer:
            for idx, (bm, ds) in enumerate(zip(benchmarks, datasets)):
                if bm[:3] == "npb":
                    bm += f'_{ds}'
                xfg = processed_data[filenames[bm]]
                print(json_io.json_dumps(xfg), file=writer)
def flatten_attrs(raw_file, flatten_dir, lang, mode, attrs):
    def _get_file_info(filename):
        """get file index from file name"""
        filename = os.path.split(filename)[-1]
        filename = filename[:str.rfind(filename, '.jsonl.gz')]
        _, _, idx = filename.split('_')
        return idx

    idx = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_dir = os.path.join(flatten_dir, lang, mode, attr)
        PathManager.mkdir(attr_dir)
        attr_file = os.path.join(attr_dir, '{}.jsonl'.format(idx))
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
def code_wo_func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    func_filename = filename[:str.rfind(filename, '.')] + '.func'
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, open(func_filename, 'r') as func_reader, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        func_line = safe_readline(func_reader)
        while line and func_line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            func_name = json_io.json_loads(func_line)
            start_idx = str.find(code, func_name)
            if start_idx != -1:
                code_wo_func = code[:start_idx] + code[start_idx + len(func_name):]
            else:
                code_wo_func = None
            print(json_io.json_dumps(code_wo_func), file=writer)
            line = safe_readline(reader)
            func_line = safe_readline(func_reader)
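# Minimal illustration of the name-removal step in code_wo_func_fn, on a made-up code string:
# the first occurrence of the function name is located with str.find and spliced out.
code = "def add(a, b):\n    return a + b"
func_name = "add"
start_idx = code.find(func_name)
code_wo_func = code[:start_idx] + code[start_idx + len(func_name):] if start_idx != -1 else None
print(code_wo_func)  # "def (a, b):\n    return a + b"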
def tokenization(
    in_file, out_file, lang, attr,
    start=0, end=-1,
):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()
            if lang == 'python' and attr == 'code':
                line = re.sub(r'\s+', ' ', line)
            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
def main(args, out_file=None, **kwargs):
    assert args['eval']['path'] is not None, '--path required for evaluation!'
    LOGGER.info(args)
    # during evaluation, set fraction_using_func_name = 0, i.e. do not sample from func_name
    args['task']['fraction_using_func_name'] = 0.
    use_cuda = torch.cuda.is_available() and not args['common']['cpu']
    if use_cuda:
        device = os.environ.get('CUDA_VISIBLE_DEVICES', [0])[0]  # get first device as default
        torch.cuda.set_device(f'cuda:{device}')
    task = tasks.setup_task(args)

    # Load ensemble
    LOGGER.info('loading model(s) from {}'.format(args['eval']['path']))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(args['eval']['path']),
        arg_overrides=eval(args['eval']['model_overrides']),
        task=task,
    )

    if out_file is not None:
        writer = open(out_file, 'w')
    top1_indices = []

    for lang in deepcopy(args['dataset']['langs']):
        args['dataset']['langs'] = [lang]
        # Load dataset splits
        LOGGER.info(f'Evaluating {lang} dataset')
        task.load_dataset(args['dataset']['gen_subset'])
        dataset = task.dataset(args['dataset']['gen_subset'])

        # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
        for model in models:
            model.make_generation_fast_()
            if args['common']['fp16']:
                model.half()
            if use_cuda:
                model.cuda()

        assert len(models) > 0
        LOGGER.info('num. model params: {}'.format(sum(p.numel() for p in models[0].parameters())))

        itr = task.get_batch_iterator(
            dataset=dataset,
            max_tokens=args['dataset']['max_tokens'] or 36000,
            max_sentences=args['eval']['max_sentences'],
            max_positions=utils.resolve_max_positions(*[model.max_positions() for model in models]),
            ignore_invalid_inputs=True,
            num_shards=args['dataset']['num_shards'],
            shard_id=args['dataset']['shard_id'],
            num_workers=args['dataset']['num_workers'],
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=args['common']['log_format'],
            log_interval=args['common']['log_interval'],
            default_log_format=('tqdm' if not args['common']['no_progress_bar'] else 'none'),
        )

        code_reprs, query_reprs = [], []
        for sample in progress:
            if 'net_input' not in sample:
                continue
            sample = move_to_cuda(sample) if use_cuda else sample
            batch_code_reprs, batch_query_reprs = models[0](**sample['net_input'])
            if use_cuda:
                batch_code_reprs = batch_code_reprs.cpu().detach()
                batch_query_reprs = batch_query_reprs.cpu().detach()
            code_reprs.append(batch_code_reprs)
            query_reprs.append(batch_query_reprs)
        code_reprs = torch.cat(code_reprs, dim=0)
        query_reprs = torch.cat(query_reprs, dim=0)
        assert code_reprs.shape == query_reprs.shape, (code_reprs.shape, query_reprs.shape)

        eval_size = len(code_reprs) if args['eval']['eval_size'] == -1 else args['eval']['eval_size']
        k, MRR, topk_idx, topk_prob = 3, [], [], []
        for idx in range(len(dataset) // eval_size):
            code_emb = code_reprs[idx:idx + eval_size, :]
            query_emb = query_reprs[idx:idx + eval_size, :]
            if use_cuda:
                code_emb = code_emb.cuda()
                query_emb = query_emb.cuda()

            if args['criterion'] == 'retrieval_cosine':
                src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
                tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
                logits = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()
            elif args['criterion'] == 'retrieval_softmax':
                logits = query_emb @ code_emb.t()
            else:
                raise NotImplementedError(args['criterion'])

            correct_scores = logits.diag()
            compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
            if out_file is not None:
                top1_indices.extend((logits.topk(1, dim=-1)[1].view(-1) + 1 + idx * eval_size).tolist())
            mrr = 1 / compared_scores.sum(dim=-1).float()
            MRR.extend(mrr.tolist())

        if len(dataset) % eval_size:
            code_emb = code_reprs[-eval_size:, :]
            query_emb = query_reprs[-eval_size:, :]
            if use_cuda:
                code_emb = code_emb.cuda()
                query_emb = query_emb.cuda()

            if args['criterion'] == 'retrieval_cosine':
                src_emb_norm = torch.norm(code_emb, dim=-1, keepdim=True) + 1e-10
                tgt_emb_norm = torch.norm(query_emb, dim=-1, keepdim=True) + 1e-10
                logits = (query_emb / tgt_emb_norm) @ (code_emb / src_emb_norm).t()
            elif args['criterion'] == 'retrieval_softmax':
                logits = query_emb @ code_emb.t()
            else:
                raise NotImplementedError(args['criterion'])

            correct_scores = logits.diag()
            compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
            last_ids = len(code_reprs) % eval_size
            mrr = 1 / compared_scores.sum(dim=-1).float()[-last_ids:]
            MRR.extend(mrr.tolist())

        print('{}, mrr: {:.4f}'.format(lang, np.mean(MRR)))

        if out_file is not None:
            for idx, mrr in enumerate(MRR):
                print(
                    json_io.json_dumps({
                        "language": lang,
                        "id": idx,
                        "mrr": round(mrr, 6),
                        "topk": top1_indices[idx],
                    }),
                    file=writer,
                )
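# The MRR computation above boils down to counting, per row of the logits matrix, how many
# candidates score at least as high as the ground-truth (diagonal) entry. A minimal PyTorch
# sketch with a made-up 3x3 similarity matrix (rows: queries, columns: code candidates):
import torch

logits = torch.tensor([[0.9, 0.1, 0.3],
                       [0.2, 0.4, 0.8],
                       [0.1, 0.5, 0.7]])
correct_scores = logits.diag()
compared_scores = logits >= correct_scores.unsqueeze(dim=-1)
mrr = 1 / compared_scores.sum(dim=-1).float()
print(mrr.tolist())  # [1.0, 0.5, 1.0] -> mean reciprocal rank ~0.83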
def cast_code_tokens(src_file, dst_file):
    with file_io.open(src_file, 'r') as reader, file_io.open(dst_file, 'w') as writer:
        for line in reader:
            print(json_io.json_dumps(line.split()), file=writer)


def cast_docstring(src_file, dst_file):
    with file_io.open(src_file, 'r') as reader, file_io.open(dst_file, 'w') as writer:
        for line in reader:
            print(json_io.json_dumps(line.rstrip('\n')), file=writer)


def cast_docstring_tokens(src_file, dst_file):
    with file_io.open(src_file, 'r') as reader, file_io.open(dst_file, 'w') as writer:
        for line in reader:
            docstring_tokens = line.split()
            print(json_io.json_dumps(docstring_tokens), file=writer)
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    from dataset.codexglue.code_to_text import BPE_DIR

    source_dict_file = os.path.join(BPE_DIR, 'csn/csn.spm.vocab')
    target_dict_file = os.path.join(os.path.dirname(args['preprocess']['destdir']), 'dict.jsonl')
    with open(source_dict_file, 'r') as reader, open(target_dict_file, 'w') as writer:
        for line in reader:
            print(json_io.json_dumps([line.split('\t')[0], 100]), file=writer)
    src_dict = tgt_dict = task.load_dictionary(target_dict_file)

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split a file into different parts
        # if using multi-processing, we first process the 2nd to last chunks
        # 1.txt -> 10 processors, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result
                )
            pool.close()
        # process the first chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(
            ds_file,
            impl=args['preprocess']['dataset_impl'],
            vocab_size=len(vocab)
        )
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer, offset=0, end=offsets[1], append_eos=True,
            )
        )
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
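# All of the *_fn workers above share the same contract: each process receives a byte range
# [start, end) of the input file, seeks to start, and reads whole lines until it passes end.
# The helper below is an illustrative stand-in for how such line-aligned offsets can be
# computed; it is not the actual Binarizer.find_offsets implementation.
import os

def find_line_offsets(filename, num_chunks):
    """Split a file into num_chunks byte ranges aligned to line boundaries (sketch only)."""
    size = os.path.getsize(filename)
    chunk_size = size // num_chunks
    offsets = [0]
    with open(filename, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            f.readline()  # advance to the next line boundary
            offsets.append(f.tell())
    offsets.append(size)
    return offsets  # worker i handles bytes [offsets[i], offsets[i + 1])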
"--dataset_dir", "-d", default=RAW_DIR, type=str, help="raw dataset download directory", ) parser.add_argument( "--flatten_dir", "-f", default=ATTRIBUTES_DIR, type=str, help="data directory of flatten attribute", ) parser.add_argument( "--attrs", "-a", default=['code', 'code_tokens', 'code_types', 'ast'], type=str, nargs='+', ) parser.add_argument( "--cores", "-c", default=cpu_count(), type=int, help="cpu cores for flatten raw data attributes", ) args = parser.parse_args() # print(args) for mode in MODES: src_files = [os.path.join(args.dataset_dir, f"{mode}.{lang}") for lang in args.languages] src_readers = [file_io.open(file, 'r') for lang, file in zip(args.languages, src_files)] for lang in args.languages: PathManager.mkdir(os.path.join(args.flatten_dir, lang)) dst_files = [os.path.join(args.flatten_dir, lang, f"{mode}.code") for lang in args.languages] dst_writers = {lang: file_io.open(file, 'w') for lang, file in zip(args.languages, dst_files)} for lines in zip(*src_readers): lines = list(map(lambda line: SPACE_SPLITTER.sub(" ", line.strip()), lines)) for lang, line in zip(args.languages, lines): print(json_io.json_dumps(line.strip()), file=dst_writers[lang])