def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. build dataset: dump into a pkl file, transforming each language's code
    # into src format and tgt format simultaneously

    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncate to leave room for [CLS] and [SEP]
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        # pad ids and mask up to the fixed source length
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        # pad ids and mask up to the fixed target length
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad()] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes', 'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                src_line = parse_source_input(src_code)  # source_ids, source_mask, source_size
                tgt_line = parse_target_input(src_code)  # target_ids, target_mask, target_size
                for key, value in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(value)
            file_io.open(dst_file, mode='wb', data=data)  # pickle the dict of lists
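# Read-back sketch for the pkl dumped above. It assumes file_io.open(path,
# mode='wb', data=...) pickles `data` to disk, so plain pickle can load it;
# load_codebert_examples is an illustrative helper, not part of this repo.
import pickle

import torch


def load_codebert_examples(pkl_file):
    with open(pkl_file, 'rb') as f:
        data = pickle.load(f)
    # every example is already padded to config.MAX_SOURCE_LENGTH, so the
    # lists can be stacked directly into batch tensors
    src_tokens = torch.LongTensor(data['src_tokens'])
    src_masks = torch.LongTensor(data['src_masks'])
    return data['code'], src_tokens, src_masks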
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')
    token_dict = vocab

    # 2. build dataset: binarize each split's code tokens into an mmap indexed dataset
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers 1..N-1 binarize their own byte ranges into temporary files
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, src_file, prefix, vocab, token_dict, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        # the parent process handles the first chunk, [offsets[0], offsets[1])
        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = torch.IntTensor(vocab.string_to_indices(line))
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            # merge the workers' temporary index/data files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
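# The binarize worker dispatched above is not shown in this excerpt. A
# plausible shape, mirroring the parent process's chunk loop (each worker
# builds its own temporary dataset over its [start, end) byte range, which
# the parent later merges via merge_file_):
def binarize(args, filename, prefix, vocab, token_dict, start, end):
    ds = indexed_dataset.make_builder(f"{prefix}.mmap", impl='mmap', vocab_size=len(vocab))
    with file_io.open(filename, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ds.add_item(torch.IntTensor(vocab.string_to_indices(line)))
            line = reader.readline()
    ds.finalize(f"{prefix}.idx")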
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    # 2. build dataset: dump into a pkl file, transforming each language's code
    # into source features (tokens, positions, data flow) simultaneously

    def parse_source_input(code, lang):
        code_tokens, dfg = extract_dataflow(code, parsers[lang], lang)
        code_tokens = vocab.subtokenize(code_tokens)
        # map each original token index to its (start, end) subtoken span
        ori2cur_pos = {-1: (0, 0)}
        for i in range(len(code_tokens)):
            ori2cur_pos[i] = (ori2cur_pos[i - 1][1], ori2cur_pos[i - 1][1] + len(code_tokens[i]))
        # truncate code so code + DFG (+ 3 special tokens) fits the model budget
        code_tokens = code_tokens[:config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - 3
                                   - min(len(dfg), config.MAX_DATA_FLOW_LEN)][:512 - 3]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        # real tokens get positions >= pad + 1; DFG nodes get position 0 below
        position_idx = [i + vocab.pad() + 1 for i in range(len(source_tokens))]
        dfg = dfg[:config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - len(source_tokens)]
        source_tokens += [x[0] for x in dfg]
        position_idx += [0 for _ in dfg]
        source_ids += [vocab.unk() for _ in dfg]
        padding_length = config.MAX_TOKEN_LEN + config.MAX_DATA_FLOW_LEN - len(source_ids)
        position_idx += [vocab.pad()] * padding_length
        source_ids += [vocab.pad()] * padding_length
        # re-index DFG edges against the truncated node list
        reverse_index = {}
        for idx, x in enumerate(dfg):
            reverse_index[x[1]] = idx
        for idx, x in enumerate(dfg):
            dfg[idx] = x[:-1] + ([reverse_index[i] for i in x[-1] if i in reverse_index],)
        dfg_to_dfg = [x[-1] for x in dfg]
        dfg_to_code = [ori2cur_pos[x[1]] for x in dfg]
        length = len([vocab.cls()])  # offset for the leading [CLS]
        dfg_to_code = [(x[0] + length, x[1] + length) for x in dfg_to_code]
        return [source_ids, position_idx, dfg_to_code, dfg_to_dfg]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_positions', 'dfg2code', 'dfg2dfg']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                src_line = parse_source_input(src_code, lang)  # source_ids, position_idx, dfg_to_code, dfg_to_dfg
                for key, value in zip(keys, [src_code] + src_line):
                    data[key].append(value)
            # project id placeholder, one per example
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(dst_file, mode='wb', data=data)
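# Downstream, these four pkl fields are typically combined into a
# GraphCodeBERT-style structured attention mask. The helper below is a
# sketch after the original GraphCodeBERT collate logic, not code from this
# repo; it assumes pad_id >= 1 (RoBERTa uses 1), so code/special tokens get
# positions >= pad_id + 1, DFG nodes position 0, and padding pad_id.
import numpy as np


def build_attn_mask(source_ids, position_idx, dfg_to_code, dfg_to_dfg, cls_id, sep_id, pad_id=1):
    seq_len = len(source_ids)
    attn_mask = np.zeros((seq_len, seq_len), dtype=bool)
    node_index = sum(i > pad_id for i in position_idx)   # code + special tokens
    max_length = sum(i != pad_id for i in position_idx)  # everything but padding
    # ordinary tokens attend to all non-padding positions
    attn_mask[:node_index, :max_length] = True
    # [CLS]/[SEP] remain globally visible
    for idx, token_id in enumerate(source_ids):
        if token_id in (cls_id, sep_id):
            attn_mask[idx, :max_length] = True
    # each DFG node sees (and is seen by) the code span it was extracted from
    for idx, (a, b) in enumerate(dfg_to_code):
        if a < node_index and b < node_index:
            attn_mask[idx + node_index, a:b] = True
            attn_mask[a:b, idx + node_index] = True
    # data-flow edges between DFG nodes
    for idx, nodes in enumerate(dfg_to_dfg):
        for a in nodes:
            if a + node_index < len(position_idx):
                attn_mask[idx + node_index, a + node_index] = True
    return attn_mask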
def __init__(self, args):
    from ncc.data.dictionary import TransformersDictionary
    self.sp = TransformersDictionary.from_pretrained(args['sentencepiece_vocab'])
import itertools
import os

from dataset.avatar.translation import (
    LANGUAGES, DATASET_DIR, ATTRIBUTES_DIR, MODES,
)
from ncc.utils.file_ops import file_io
from ncc.utils.file_ops import json_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')

    for topk in [1, 3, 5]:
        attributes = ['code', 'ast', 'dfs']
        dst_dir = os.path.join(DATASET_DIR, 'codedisen', 'data')
        for lang in LANGUAGES:
            PathManager.mkdir(os.path.join(dst_dir, f"top{topk}", lang))
        for mode in MODES:
            readers = [
                file_io.open(os.path.join(ATTRIBUTES_DIR, f"top{topk}", lang, f"{mode}.{attr}"), 'r')
                for lang in LANGUAGES for attr in attributes
            ]
            writers = [
                # assumed continuation: the excerpt breaks off here; the
                # writers presumably mirror the readers on the destination side
                file_io.open(os.path.join(dst_dir, f"top{topk}", lang, f"{mode}.{attr}"), 'w')
                for lang in LANGUAGES for attr in attributes
            ]
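            # Hedged sketch of the processing loop this excerpt breaks off
            # before: zipping the parallel streams keeps line k of every
            # (lang, attr) file aligned to the same example while copying;
            # any per-line transformation would go inside the loop.
            for lines in zip(*readers):
                for writer, line in zip(writers, lines):
                    writer.write(line)
            for f in readers + writers:
                f.close()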
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    from ncc.data.dictionary import TransformersDictionary
    vocab = TransformersDictionary.from_pretrained('microsoft/graphcodebert-base')
    node_dict = Dictionary.load(os.path.join(args['preprocess']['destdir'], 'dfs.jsonl'))

    # 2. build dataset: binarize code tokens, raw code, and DFS sequences
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']

    # code tokens => code tokens
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.code_tokens"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers 1..N-1 binarize their own byte ranges into temporary files
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_tokens,
                    (args, src_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file, impl="mmap", vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.subtokenize(line)
                code_tokens = torch.IntTensor(vocab.tokens_to_indices(code_tokens))
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            # merge sub-processes' index and data files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))

    # code => code (raw strings, stored with the "bin" backend)
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.code"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        ds_file = '{}.bin'.format(dst_file)
        ds = indexed_dataset.make_builder(ds_file, impl="bin", vocab_size=len(vocab))
        with open(src_file, 'r') as reader:
            for line in reader:
                line = json_io.json_loads(line)
                ds.add_item(line)
        ds.finalize('{}.idx'.format(dst_file))

    # dfs => dfs
    for mode, lang in itertools.product(MODES, [src_lang, tgt_lang]):
        data_dir = args['preprocess'][f'{mode}pref'].replace('*', lang)
        src_file = f"{data_dir}.dfs"
        PathManager.mkdir(os.path.join(args['preprocess']['destdir'], lang))
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.dfs")

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize_dfs,
                    (args, src_file, node_dict, prefix, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        ds_file = '{}.mmap'.format(dst_file)
        # size the index by the node dictionary that actually provides the indices
        ds = indexed_dataset.make_builder(ds_file, impl="mmap", vocab_size=len(node_dict))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                dfs = torch.IntTensor([node_dict.index(tok) for tok in line])
                ds.add_item(dfs)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            # merge sub-processes' index and data files into the final files, then delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(dst_file))
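# Optional read-back check. This assumes ncc keeps fairseq's reader-side
# indexed_dataset API (an MMapIndexedDataset addressed by the same path
# prefix the builder received); the helper below is an assumption for
# illustration, not code from this repo.
def _sanity_check(dst_file, dictionary):
    ds = indexed_dataset.MMapIndexedDataset(dst_file)
    assert len(ds) > 0, f"no examples were binarized into {dst_file}"
    # every index in the first example should fall within the dictionary
    assert all(0 <= idx < len(dictionary) for idx in ds[0].tolist())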