def tokenization(
        in_file, out_file, lang, attr,
        start=0, end=-1,
):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()
            if lang == 'python' and attr == 'code':
                tokens = python_code_tokenize(line)
                line = ' '.join(tokens).strip()
            if attr == 'code':
                line = normalize_program(line, remove_eol=True)
            else:
                line = normalize_docstring(line, remove_eol=True, remove_url=True)
            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
def binary_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                try:
                    ast = util_ast.value2children(ast)
                    ast = util_ast.remove_root_with_uni_child(ast)
                    root_idx = util_ast.get_root_idx(ast)
                    ast = util_ast.delete_node_with_uni_child(ast, idx=root_idx)
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.binarize_tree(ast, idx=root_idx)  # to binary ast tree
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.reset_indices(bin_ast, root_idx)  # reset node indices
                    bin_ast = util_ast.pad_leaf_node(bin_ast, MAX_SUB_TOKEN_LEN)
                except RecursionError:
                    LOGGER.error('RecursionError, ignore this tree')
                    bin_ast = None
                except Exception as err:
                    LOGGER.error(err)
                    bin_ast = None
            else:
                bin_ast = None
            print(json_io.json_dumps(bin_ast), file=writer)
            line = safe_readline(reader)
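# For readers unfamiliar with the binarization step: a generic sketch of
# what binarize_tree aims for. util_ast's node format and auxiliary-node
# naming are its own; this is an illustration, not its implementation.
def binarize(node):
    """Return an equivalent tree in which every node has at most two
    children; surplus children are chained under fresh 'TMP' nodes:
    (a, b, c, d) -> (a, TMP(b, TMP(c, d)))."""
    children = [binarize(child) for child in node.get('children', [])]
    while len(children) > 2:
        # fold the last two children into one auxiliary node
        children[-2:] = [{'type': 'TMP', 'children': children[-2:]}]
    new_node = dict(node)
    if children:
        new_node['children'] = children
    return new_node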
def docstring_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """filter docstring tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring_tokens = json_io.json_loads(line)
            if docstring_tokens:
                # drop separator-like tokens (e.g. "----", "****") and HTML tags
                docstring_tokens = [
                    token for token in docstring_tokens
                    if not (re.match(r'[-|*=~]{2,}', token) or re.match(r'<.*?>', token))
                ]
                if not all(str.isascii(token) for token in docstring_tokens):
                    docstring_tokens = None
                if (docstring_tokens is None) or not (3 < len(docstring_tokens) <= 50):
                    docstring_tokens = None
            else:
                docstring_tokens = None
            print(json_io.json_dumps(docstring_tokens), file=writer)
            line = safe_readline(reader)
def raw_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code => raw_ast"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    lang = kwargs.get('lang')
    so_dir = kwargs.get('so_dir')
    so_filename = os.path.join(os.path.expanduser(so_dir), '{}.so'.format(lang))
    parser = TreeSitterASTParser(so_filename, lang)
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            if code:
                raw_ast = parser.parse_raw_ast(code)
            else:
                raw_ast = None
            print(json_io.json_dumps(raw_ast), file=writer)
            line = safe_readline(reader)
def _cat_and_remove(tgt_filename, num_workers):
    with file_io.open(tgt_filename, 'w') as writer:
        for idx in range(num_workers):
            src_filename = tgt_filename + str(idx)
            with file_io.open(src_filename, 'r') as reader:
                PathManager.copyfileobj(reader, writer)
            PathManager.rm(src_filename)
def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename_terminals, dest_filename = \
        dest_filename + '.terminals' + str(idx), dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, \
            file_io.open(dest_filename_terminals, 'w') as writer_terminals, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                paths = util_path.ast_to_path(ast, MAX_PATH=PATH_NUM)
                if paths is None:
                    paths = [[None] * 3] * PATH_NUM
                else:
                    # pad paths up to PATH_NUM by re-sampling existing paths
                    if len(paths) < PATH_NUM:
                        supply_ids = list(range(len(paths))) * ((PATH_NUM - len(paths)) // len(paths)) \
                                     + random.sample(range(len(paths)), (PATH_NUM - len(paths)) % len(paths))
                        paths.extend([paths[i] for i in supply_ids])
                        random.shuffle(paths)
                    assert len(paths) == PATH_NUM
                head, body, tail = zip(*paths)
            else:
                head, body, tail = [None] * PATH_NUM, [None] * PATH_NUM, [None] * PATH_NUM
            # terminals: interleave the head/tail tokens of each path
            for terminal in itertools.chain(*zip(head, tail)):
                print(json_io.json_dumps(terminal), file=writer_terminals)
            # path bodies
            for b in body:
                print(json_io.json_dumps(b), file=writer)
            line = safe_readline(reader)
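# A self-contained sketch of the padding scheme above, with toy paths in
# place of AST paths (pad_to and the sampled values are illustrative only):
import random

def pad_to(paths, path_num):
    # e.g. len(paths)=3, path_num=8: every path is repeated once
    # ((8-3)//3 == 1), then 2 distinct extras are drawn ((8-3)%3 == 2)
    if len(paths) < path_num:
        supply_ids = list(range(len(paths))) * ((path_num - len(paths)) // len(paths)) \
                     + random.sample(range(len(paths)), (path_num - len(paths)) % len(paths))
        paths = paths + [paths[i] for i in supply_ids]
        random.shuffle(paths)
    return paths

print(pad_to([['a'], ['b'], ['c']], 8))  # 8 paths; each original appears at least twice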
def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """filter code tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code_tokens = json_io.json_loads(line)
            if code_tokens:
                # filter comments in code_tokens, e.g. //***\n, #..., /* */\n
                code_tokens = [
                    token for token in code_tokens
                    if not (token.startswith('//') or token.startswith('#') or
                            (token.startswith('/*') and token.endswith('*/')))
                ]
                if not all(str.isascii(token) for token in code_tokens):
                    code_tokens = None
                if code_tokens is None or len(code_tokens) < 1:
                    code_tokens = None
            else:
                code_tokens = None
            print(json_io.json_dumps(code_tokens), file=writer)
            line = safe_readline(reader)
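# Hypothetical before/after for the two token filters above (the token
# lists are made up for illustration):
#   code_tokens_fn:      ["def", "f", "(", ")", ":", "# add one", "return"]
#                     -> ["def", "f", "(", ")", ":", "return"]      (comment token dropped)
#   docstring_tokens_fn: ["Returns", "----", "<p>", "the", "sum"]
#                     -> ["Returns", "the", "sum"], then set to None anyway,
#                        because only 3 tokens remain and the filter requires > 3.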
def flatten_attrs(raw_file, flatten_dir, mode, attrs, start=0, end=-1):
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, '{}.{}'.format(mode, attr))
        os.makedirs(os.path.dirname(attr_file), exist_ok=True)
        attr_writers[attr] = file_io.open(attr_file, 'w')
    with file_io.open(raw_file, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            filename = os.path.join(os.path.dirname(raw_file), line.strip())
            try:
                tokens, types = parse_file(filename)
                # replace None with [PAD] for type dictionary build
                types = [PAD if t is None else t for t in types]
            except Exception:
                # skip files that fail to parse
                line = file_io.safe_readline(reader)
                continue
            print(json_io.json_dumps(tokens), file=attr_writers['code_tokens'])
            print(json_io.json_dumps(types), file=attr_writers['code_types'])
            line = file_io.safe_readline(reader)
def _concate(_tgt_filename, num_workers, tgt_filename):
    src_filenames = [
        _tgt_filename + str(idx)
        for idx in range(num_workers)
    ]
    with file_io.open(tgt_filename, 'w') as writer:
        for _src_fl in src_filenames:
            with file_io.open(_src_fl, 'r') as reader:
                shutil.copyfileobj(reader, writer)
            PathManager.rm(_src_fl)
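# How the pieces above fit together. The driver below is a hypothetical
# sketch (find_offsets and process are not part of this codebase): it cuts
# the input file into byte ranges, runs one `*_fn` worker per range using
# the (filename, dest_filename, idx, start, end, [kwargs]) convention seen
# throughout this file, then merges the per-worker shards.
import os
from multiprocessing import Pool

def find_offsets(filename, num_workers):
    # offsets may fall mid-line; safe_readline in the workers presumably
    # re-aligns to the next complete line after seek(start)
    size = os.path.getsize(filename)
    return [size * i // num_workers for i in range(num_workers + 1)]

def process(worker_fn, filename, dest_filename, num_workers, **kwargs):
    offsets = find_offsets(filename, num_workers)
    with Pool(num_workers) as pool:
        pool.starmap(worker_fn, [
            (filename, dest_filename, idx, offsets[idx], offsets[idx + 1], [kwargs])
            for idx in range(num_workers)
        ])
    _cat_and_remove(dest_filename, num_workers)  # merge shards dest0, dest1, ...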
def find_func_offsets(filename, offsets):
    """align the byte offsets of `filename` with its parallel .func_name file"""
    func_filename = filename[:filename.rfind('.')] + '.func_name'
    count = 1
    func_offsets = [0 for _ in range(len(offsets))]
    with file_io.open(filename, "r", encoding="utf-8") as f, \
            file_io.open(func_filename, "r", encoding="utf-8") as func:
        line, _ = f.readline(), func.readline()
        # guard on count so we never index past the last offset
        while line and count < len(offsets):
            if f.tell() == offsets[count]:
                func_offsets[count] = func.tell()
                count += 1
            line, _ = f.readline(), func.readline()
    return func_offsets
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        # TODO: parse json into a txt file, one traversal per line; parallelize this.
        # Only one thread may write the output file, so we process the data with
        # multi-processing, merge the per-CPU results into a block, and then dump
        # that block.
        def _func(line):
            line = py150_util.separate_dps(
                json_io.json_loads(line.strip()),
                args['preprocess']['n_ctx'],
            )
            line = [
                py150_util.get_dfs(ast) + [ext]
                for ast, ext in line
                if len(ast) > 1
            ]
            return line

        with PPool() as thread_pool:
            with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                    file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                def _write(result):
                    for res in itertools.chain(*result):
                        print(json_io.json_dumps(res), file=fout)

                batch_data = []
                for line in f:
                    batch_data.append(line)
                    if len(batch_data) >= MAX_BATCH_SIZE:
                        result = thread_pool.feed(_func, batch_data, one_params=True)
                        _write(result)
                        batch_data = []
                if len(batch_data) > 0:
                    result = thread_pool.feed(_func, batch_data, one_params=True)
                    _write(result)
    else:
        if lang == 'code_types':
            in_file = file_name(input_prefix, 'ast')
        else:
            in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, in_file, out_file, lang, num_workers)
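# Illustrative shapes only, assuming py150-style helpers (an assumption
# about py150_util, not verified here): separate_dps slices a long AST into
# (slice, ext) pairs of at most n_ctx nodes, where ext counts the nodes
# overlapping the previous slice, and get_dfs flattens a slice into the DFS
# sequence of its values/types.
#   ast = [{"type": "Module", "children": [1, 2]},
#          {"type": "NameLoad", "value": "x"},
#          {"type": "NameStore", "value": "y"}]
#   separate_dps(ast, 1000) -> [(ast, 0)]          # small tree: a single slice
#   get_dfs(ast)            -> ["Module", "x", "y"]
# so each output line holds json(get_dfs(slice) + [ext]).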
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncate, leaving room for [CLS] and [SEP]
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # source_ids, source_mask, source_size
                src_line = parse_source_input(src_code)
                # target_ids, target_mask, target_size
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
        file_io.open(dst_file, mode='wb', data=data)
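# A toy walk-through of parse_source_input with MAX_SOURCE_LENGTH = 8
# (token strings are made up; codebert's real subwords and ids differ):
#   vocab.tokenize("def f(): pass") -> ["def", "f", "()", ":", "pass"]   # 5 tokens
# truncation to 8-2 = 6 leaves them intact; wrapping gives
#   source_tokens = [<s>, def, f, (), :, pass, </s>]                     # 7 tokens
#   source_mask   = [1, 1, 1, 1, 1, 1, 1, 0]                             # padded to 8
#   source_ids    = [id(<s>), ..., id(</s>), pad_id]
# source_size (7) counts real tokens only, excluding padding.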
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, int(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap",
                                               impl='mmap', vocab_size=len(vocab))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor([dictionary.index(token) for token in code_tokens])
                dataset.add_item(code_tokens)
        dataset.finalize(f"{dst_file}_tokens.idx")
        # proj indices: one id per example
        data['proj_indices'] = [1] * len(data['code'])
        file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
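# dict.txt is assumed to follow the fairseq convention of one
# "<token> <count>" pair per line, e.g.
#   ▁def 104923
#   ▁return 88210
# save_dict() replays these pairs into a Dictionary so that token ids stay
# consistent between the SentencePiece model and the binarized dataset.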
def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """copy docstrings into per-worker shards"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring = json_io.json_loads(line)
            print(json_io.json_dumps(docstring), file=writer)
            line = safe_readline(reader)
def ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    parser = CodeParser(SO_FILE=os.path.join(kwargs['so_dir'], f"{kwargs['lang']}.so"),
                        LANGUAGE=kwargs['lang'])
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = parser.parse_raw_ast(code=line, MAX_AST_SIZE=99999999999, append_index=True)
            print(json_io.json_dumps(ast), file=writer)
            line = safe_readline(reader)
def _add_tok_to_dictionary_single_worker(
        filename: str,
        tokenize: Any,
        eos_word: Optional[str],
        worker_id: int = 0,
        num_workers: int = 1,
) -> Counter:
    counter = Counter()
    with file_io.open(filename, "r") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            safe_readline(f)  # drop first incomplete line
        line = f.readline()
        while line:
            tokens = tokenize(line)
            counter.update(tokens)
            if eos_word is not None:
                counter.update([eos_word])
            if f.tell() > end:
                break
            line = f.readline()
    return counter
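# A minimal driver for the chunked counter above (the pool size and the
# whitespace tokenizer are illustrative choices, not this codebase's API):
# each worker counts its own byte range, and the partial Counters sum into
# the final vocabulary counts.
from collections import Counter
from multiprocessing import Pool

def count_tokens(filename, num_workers=4):
    with Pool(num_workers) as pool:
        partials = pool.starmap(
            _add_tok_to_dictionary_single_worker,
            [(filename, str.split, '</s>', wid, num_workers)
             for wid in range(num_workers)],
        )
    return sum(partials, Counter())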
def func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            func_name = json_io.json_loads(line)
            func = func_name.split('.')[-1]  # keep the bare method name, drop the class path
            print(json_io.json_dumps(func), file=writer)
            line = safe_readline(reader)
def _save(self, f, kv_iterator):
    if isinstance(f, str):
        PathManager.mkdir(os.path.dirname(f))
        with file_io.open(f, "w") as fd:
            return self.save(fd)
    for k, v in kv_iterator:
        print(json_io.json_dumps([k, v]), file=f)
def __init__(self, path):
    with file_io.open(path, 'rb') as stream:
        magic_test = stream.read(9)
        assert self._HDR_MAGIC == magic_test, (
            'Index file doesn\'t match expected format. '
            'Make sure that --dataset-impl is configured properly.'
        )
        version = struct.unpack('<Q', stream.read(8))
        assert (1,) == version
        dtype_code, = struct.unpack('<B', stream.read(1))
        self._dtype = dtypes[dtype_code]
        self._dtype_size = self._dtype().itemsize
        self._len = struct.unpack('<Q', stream.read(8))[0]
        offset = stream.tell()

    _warmup_mmap_file(path)

    self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
    self._bin_buffer = memoryview(self._bin_buffer_mmap)
    self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32, count=self._len, offset=offset)
    self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._len,
                                   offset=offset + self._sizes.nbytes)
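# On-disk layout this reader expects (field widths follow from the struct
# calls above; the 9-byte magic length is implied by stream.read(9)):
#   bytes  0-8    _HDR_MAGIC     9-byte magic string
#   bytes  9-16   version        struct '<Q', must equal 1
#   byte   17     dtype code     struct '<B', index into `dtypes`
#   bytes 18-25   item count     struct '<Q'
#   then          int32 sizes[count], int64 pointers[count], raw item data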
def traversal_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast_traversal = util_traversal.get_dfs(ast)
            else:
                ast_traversal = None
            print(json_io.json_dumps(ast_traversal), file=writer)
            line = safe_readline(reader)
def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            # write one output line per input line (None for empty ASTs) so
            # the shard stays line-aligned with the other attribute files
            paths = util_path.ast_to_path(ast) if ast else None
            print(json_io.json_dumps(paths), file=writer)
            line = safe_readline(reader)
def read_data(self, path):
    with file_io.open(index_file_path(path), mode='rb') as stream:
        magic_test = stream.read(8)
        assert self._HDR_MAGIC == magic_test, (
            'Index file doesn\'t match expected format. '
            'Make sure that --dataset-impl is configured properly.'
        )
        buffer = stream.read()
        self._data = np.frombuffer(buffer, dtype=self._dtype)
def cast_code(raw_code_file, refined_code_file, dst_file):
    with file_io.open(raw_code_file, 'r') as raw_reader:
        raw_codes = {}
        for line in raw_reader:
            raw_code = line[line.find('def '):]
            # function name: the text between "def " and the first "("
            func_name = raw_code[:raw_code.find('(')][4:].strip()
            raw_codes[func_name] = line.rstrip('\n')

    PathManager.mkdir(os.path.dirname(dst_file))
    with file_io.open(refined_code_file, 'r') as refined_reader, file_io.open(dst_file, 'w') as writer:
        for line in refined_reader:
            func_name = line[line.find('def '):].split()[1]
            raw_code = raw_codes[func_name]
            print(raw_code, file=writer)
def dfs_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast is not None:
                dfs, _ = ast_to_dfs(ast)
            else:
                dfs = None
            print(json_io.json_dumps(dfs), file=writer)
            line = safe_readline(reader)
def __enter__(self):
    """entry point of the writer context manager; writes the index header"""
    # `path`, `cls`, and `dtype` are presumably closed over from an enclosing
    # classmethod; this snippet is lifted out of that context
    self._file = file_io.open(path, 'wb')
    self._file.write(cls._HDR_MAGIC)  # self-defined magic string
    self._file.write(struct.pack('<Q', 1))  # version number, 8 bytes
    self._file.write(struct.pack('<B', code(dtype)))  # data type code, 1 byte
    return self
def save_token_dict():
    src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
    tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
    vocab = Dictionary()
    with file_io.open(src_file, 'r') as reader:
        for line in reader:
            token, num = line.strip().split()
            vocab.add_symbol(token, int(num))
    vocab.save(tgt_file)
    return vocab
def read_data(self, path, dictionary, tokenizer):
    with file_io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            self.lines.append(line.strip('\n'))
            tokens = dictionary.encode_line(
                line,
                tokenizer,
                add_if_not_exist=False,
                append_eos=self.append_eos,
                reverse_order=self.reverse_order,
            ).long()
            self.tokens_list.append(tokens)
            self.sizes.append(len(tokens))
    self.sizes = np.array(self.sizes)
def sbtao_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast = util_ast.value2children(ast)
                padded_ast = util_ast.pad_leaf_node(ast, MAX_SUB_TOKEN_LEN)
                root_idx = util_ast.get_root_idx(padded_ast)
                sbt = util_ast.build_sbtao_tree(padded_ast, idx=root_idx)
            else:
                sbt = None
            print(json_io.json_dumps(sbt), file=writer)
            line = safe_readline(reader)
def flatten(raw_file, dst_dir, mode):
    """flatten attributes of raw data"""
    data_frame = pd.read_csv(raw_file)
    attrs = data_frame.columns.values.tolist()[1:-1]  # drop the first and last columns
    # note: `lang` comes from the enclosing scope
    LOGGER.info('Cast attributes({}) of OpenCL-{} dataset'.format(attrs, lang))
    for attr in attrs:
        dst_file = os.path.join(dst_dir, f"{mode}.{attr}")
        data = getattr(data_frame, attr).values.tolist()
        with file_io.open(dst_file, 'w') as writer:
            for line in data:
                print(json_io.json_dumps(line), file=writer)
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode (e.g. train/valid/test) from the file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:filename.rfind('.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
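# Example of what flatten_attrs produces (the record fields are
# illustrative, CodeSearchNet-style): given python_train.jsonl containing
#   {"code": "def f(): ...", "docstring": "...", "language": "python"}
# and attrs = ['code', 'docstring'], it writes one JSON value per line to
#   <flatten_dir>/python/python_train.code
#   <flatten_dir>/python/python_train.docstring
# keeping every attribute file line-aligned with the raw file.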