def load_raw_data(data_dir, load_keys):
    raw_data = {}
    for mode in constants.MODES:
        for key in load_keys:
            mode_data_dir = os.path.join(data_dir, key, '{}.*'.format(mode))
            jsonl_gz_files = PathManager.ls(mode_data_dir)
            raw_data[mode] = list(load_jsonl_gzs(jsonl_gz_files))
    return raw_data
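# Hypothetical invocation sketch for load_raw_data() (illustration only, not part
# of the pipeline): the directory layout and attribute keys below are assumptions.
# It presumes shards live at `<data_dir>/<key>/<mode>.*.jsonl.gz` and that
# constants.MODES is a tuple such as ('train', 'valid', 'test').
#
#   raw_data = load_raw_data(data_dir='/path/to/raw', load_keys=['code', 'docstring'])
#   print(len(raw_data['train']))  # number of loaded train examples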
def flatten(raw_dir, lang, mode, flatten_dir, attrs, num_cores):
    """flatten attributes of raw data"""
    LOGGER.info('Cast attributes({}) of {}-{} dataset'.format(attrs, lang, mode))
    with Pool(num_cores) as mpool:
        result = [
            mpool.apply_async(flatten_attrs, (raw_file, flatten_dir, lang, mode, set(attrs)))
            for raw_file in PathManager.ls(os.path.join(raw_dir, lang, mode, '*.jsonl.gz'))
        ]
        result = [res.get() for res in result]
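# Hypothetical usage sketch for flatten() (illustration only; the paths, language
# name, and attribute list are assumptions, not repository defaults). It expects
# raw shards at `<raw_dir>/<lang>/<mode>/*.jsonl.gz` and delegates each shard to
# flatten_attrs in a worker process.
#
#   flatten(raw_dir='/path/to/raw', lang='python', mode='train',
#           flatten_dir='/path/to/flatten',
#           attrs=['code_tokens', 'docstring_tokens'], num_cores=4)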
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        raise NotImplementedError
    else:
        languages = [
            os.path.basename(d)
            for d in PathManager.ls(os.path.dirname(input_prefix))
        ]
        for l in languages:
            in_file = file_name(input_prefix, lang)
            in_file = str.replace(in_file, '*', l)
            out_file = dest_path(os.path.join(l, output_prefix), lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)
def xfg(src_dir, languages, dst_dir):
    xfg_src_files = PathManager.ls(os.path.join(src_dir, "kernels_ir", '*.ll'))
    filenames = []
    ir_data = []
    for filename in xfg_src_files:
        filenames.append(os.path.basename(filename)[:-3])
        with open(filename, 'r') as reader:
            lines = reader.read().splitlines()
            ir_data.append(lines)
    # convert list to dict
    filenames = {name: idx for idx, name in enumerate(filenames)}
    processed_data, _ = inst2vec_preprocess.preprocess(ir_data)
    processed_data, _ = task_utils.inline_struct_types_txt(processed_data, ir_data)
    processed_data = task_utils.abstract_statements_from_identifiers_txt(processed_data)
    for idx, lines in enumerate(processed_data):
        processed_data[idx] = [
            line for line in lines
            if not re.match(r'((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)', line)
        ]
    for lang in languages:
        raw_file = os.path.join(src_dir, f'{lang}.csv')
        # read raw csv file to load corresponding benchmarks
        data_frame = pd.read_csv(raw_file)
        benchmarks = data_frame["benchmark"].values.tolist()
        datasets = data_frame["dataset"].values.tolist()
        del data_frame
        # write
        dst_file = os.path.join(dst_dir, lang, f'train.xfg')
        with open(dst_file, 'w') as writer:
            for idx, (bm, ds) in enumerate(zip(benchmarks, datasets)):
                if bm[:3] == "npb":
                    bm += f'_{ds}'
                xfg = processed_data[filenames[bm]]
                print(json_io.json_dumps(xfg), file=writer)
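# Standalone illustration (never called by the pipeline) of the label-filtering
# regex used in xfg(). The sample lines are hypothetical abstracted-IR statements,
# chosen only to show which lines the filter drops.
def _demo_label_filter():
    label_re = re.compile(r'((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)')
    sample = ['<LABEL>:', '; <label>:<LABEL>', '<%ID> = add i32 <%ID>, <%ID>']
    kept = [line for line in sample if not label_re.match(line)]
    assert kept == ['<%ID> = add i32 <%ID>, <%ID>']  # the two label lines are removed
    return kept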
def merge_attr_files(flatten_dir, lang, mode, attrs):
    """shell cat"""

    def _merge_files(src_files, tgt_file):
        with file_io.open(tgt_file, 'w') as writer:
            for src_fl in src_files:
                with file_io.open(src_fl, 'r') as reader:
                    shutil.copyfileobj(reader, writer)

    def _get_file_idx(filename):
        filename = os.path.split(filename)[-1]
        idx = int(filename[:str.rfind(filename, '.json')])
        return idx

    for attr in attrs:
        attr_files = PathManager.ls(os.path.join(flatten_dir, lang, mode, attr, '*.jsonl'))
        attr_files = sorted(attr_files, key=_get_file_idx)
        assert len(attr_files) > 0, RuntimeError('Attribute({}) files do not exist.'.format(attr))
        dest_file = os.path.join(flatten_dir, lang, '{}.{}'.format(mode, attr))
        _merge_files(attr_files, dest_file)
    PathManager.rm(os.path.join(flatten_dir, lang, mode))
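# Hedged end-to-end sketch of the flatten/merge stage (paths, language, and
# attributes are hypothetical): flatten() presumably writes one index-named
# `<idx>.jsonl` file per raw shard under `<flatten_dir>/<lang>/<mode>/<attr>/`,
# and merge_attr_files() concatenates them in shard order into
# `<flatten_dir>/<lang>/<mode>.<attr>` before removing the per-shard directory.
#
#   attrs = ['code_tokens', 'docstring_tokens']
#   flatten(raw_dir='/path/to/raw', lang='python', mode='train',
#           flatten_dir='/path/to/flatten', attrs=attrs, num_cores=4)
#   merge_attr_files(flatten_dir='/path/to/flatten', lang='python',
#                    mode='train', attrs=attrs)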
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = PathManager.ls(train_path(args['preprocess']['source_lang']))
        if not args['preprocess']['only_train']:
            filenames.extend(PathManager.ls(valid_path(args['preprocess']['source_lang'])))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )
    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    # copy the shared dict into each language's data directory
    for d in PathManager.ls(os.path.dirname(args['preprocess']['trainpref'])):
        lang = os.path.basename(d)
        src_dict.save(
            os.path.join(args['preprocess']['destdir'], lang,
                         f"{args['preprocess']['source_lang']}.dict.jsonl"))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, num_workers):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing, workers handle the 2nd-to-last chunks
        # e.g. 1.txt -> 10 processors: 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # process the 1st chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl=args['preprocess']['dataset_impl'],
                                          vocab_size=len(vocab))

        def consumer(data, _):
            ds.add_item(data)

        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=string2tokens,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processes' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            languages = [
                os.path.basename(d)
                for d in PathManager.ls(os.path.dirname(input_prefix))
            ]
            for l in languages:
                in_file = file_name(input_prefix, lang)
                in_file = str.replace(in_file, '*', l)
                out_file = dest_path(os.path.join(l, output_prefix), lang)
                PathManager.mkdir(os.path.dirname(out_file))
                make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
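# Minimal invocation sketch for the main() above (illustration only). The nested
# dict mirrors the args['preprocess'][...] keys this script reads; every concrete
# value (task name, paths, language, sizes) is a hypothetical placeholder, not
# the repository's real configuration.
#
#   example_args = {
#       'preprocess': {
#           'task': 'summarization',              # hypothetical task name
#           'destdir': '/path/to/data-mmap',
#           'trainpref': '/path/to/flatten/*/train',
#           'validpref': '/path/to/flatten/*/valid',
#           'testpref': '/path/to/flatten/*/test',
#           'source_lang': 'code_tokens',
#           'srcdict': None,
#           'only_train': False,
#           'workers': 4,
#           'threshold': 0,
#           'nwordssrc': 50000,
#           'padding_factor': 8,
#           'dataset_impl': 'mmap',
#       }
#   }
#   main(example_args)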
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        data_files = train_path(args['preprocess']['source_lang'])
        data_files = PathManager.ls(data_files)
        src_dict = task.build_bpe_dictionary(
            data_files,
            tokenize_func=tokenizers.sub_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
            bpe_portion=args['preprocess']['source_bpe_portion'],
        )
    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            data_files = train_path(args['preprocess']['target_lang'])
            if '*' in data_files:
                data_files = glob(data_files)
            else:
                data_files = [data_files]
            assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
            tgt_dict = task.build_bpe_dictionary(
                data_files,
                tokenize_func=tokenizers.lower_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=0,
                nwords=args['preprocess']['nwordstgt'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
                bpe_portion=args['preprocess']['target_bpe_portion'],
            )
    else:
        tgt_dict = None

    # src_dict.save(dict_path(args['preprocess']['source_lang']))
    # tgt_dict.save(dict_path(args['preprocess']['target_lang']))
    # tgt_dict.save(dict_path("func_name"))  # save target_lang dict for func_name

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, use_func, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = find_offsets(input_file, num_chunks=num_workers)
        func_offsets = None
        modality = input_file.split('.')[-1]
        if modality == 'code_tokens':
            tokenizer = tokenizers.list_tokenizer
            if use_func:
                func_offsets = Binarizer.find_func_offsets(input_file, offsets=offsets)
        elif modality == 'func_name':
            tokenizer = tokenizers.func_name_tokenizer
        elif modality == 'docstring_tokens':
            tokenizer = tokenizers.lower_tokenizer
        else:
            raise NotImplementedError(modality)

        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize, (
                    args,
                    input_file,
                    vocab,
                    prefix,
                    tokenizer,
                    use_func and (modality == 'code_tokens'),
                    offsets[worker_id],
                    offsets[worker_id + 1],
                    func_offsets[worker_id] if func_offsets else 0,
                ),
                                 callback=merge_result)
            pool.close()
        # process the 1st chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl=args['preprocess']['dataset_impl'],
                                          vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenizer,
                use_func=use_func and (modality == 'code_tokens'),
                offset=offsets[0],
                end=offsets[1],
                func_offset=func_offsets[0] if func_offsets else 0,
                append_eos=False,
                min_func_len=args['preprocess']['min_func_len'],
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processes' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, use_func=False, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_files = file_name(input_prefix, lang)
            if '*' in in_files:
                in_files = glob(in_files)
            else:
                in_files = [in_files]
            for in_file in in_files:
                if lang == 'code_tokens':
                    out_file = dest_path(output_prefix, f'{str.split(in_file, os.sep)[-2]}.{lang + ".wo_func"}') \
                        if use_func else dest_path(output_prefix, f'{str.split(in_file, os.sep)[-2]}.{lang}')
                else:
                    out_file = dest_path(output_prefix, f'{str.split(in_file, os.sep)[-2]}.{lang}')
                os.makedirs(os.path.dirname(out_file), exist_ok=True)
                make_binary_dataset(vocab, in_file, out_file, use_func, num_workers)

    def make_all(lang, vocab, use_func=False):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'], use_func=use_func)
        if args['preprocess']['validpref']:
            make_dataset(vocab, args['preprocess']['validpref'], "valid", lang,
                         num_workers=args['preprocess']['workers'], use_func=use_func)
        if args['preprocess']['testpref']:
            make_dataset(vocab, args['preprocess']['testpref'], "test", lang,
                         num_workers=args['preprocess']['workers'], use_func=use_func)

    make_all(args['preprocess']['source_lang'], src_dict)
    make_all(args['preprocess']['source_lang'], src_dict, use_func=True)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
        make_all('func_name', tgt_dict)  # func_name as query
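# Simplified, standalone illustration of the offset-based chunking idea behind
# find_offsets/Binarizer.find_offsets used in the binarization steps above (an
# assumption about their behavior, not the repository's implementation): split a
# text file into num_chunks byte ranges aligned to line boundaries, so worker i
# can seek to offsets[i] and stop at offsets[i + 1]. Never called by the pipeline.
def _demo_find_offsets(path, num_chunks):
    size = os.path.getsize(path)
    chunk_size = size // num_chunks
    offsets = [0] * (num_chunks + 1)
    with open(path, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(i * chunk_size)
            f.readline()  # advance to the next line boundary
            offsets[i] = f.tell()
    offsets[num_chunks] = size
    return offsets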
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [
                train_path(args['preprocess']['source_lang']),
                train_path(args['preprocess']['target_lang'])
            ]
            if not args['preprocess']['only_train']:
                filenames.extend([
                    valid_path(args['preprocess']['source_lang']),
                    valid_path(args['preprocess']['target_lang'])
                ])
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                # set max len for joint dictionaries
                nwords=max(args['preprocess']['nwordssrc'], args['preprocess']['nwordstgt']),
            )
        tgt_dict = src_dict
    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = PathManager.ls(train_path(args['preprocess']['source_lang']))
            if not args['preprocess']['only_train']:
                filenames.extend(PathManager.ls(valid_path(args['preprocess']['source_lang'])))
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['thresholdsrc'],
                nwords=args['preprocess']['nwordssrc'],
                padding_factor=args['preprocess']['padding_factor'],
            )
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
                filenames = PathManager.ls(train_path(args['preprocess']['target_lang']))
                if not args['preprocess']['only_train']:
                    filenames.extend(PathManager.ls(valid_path(args['preprocess']['target_lang'])))
                tgt_dict = task.build_dictionary(
                    filenames,
                    tokenize_func=tokenization.dpu_sub_tokenizer,
                    workers=args['preprocess']['workers'],
                    threshold=args['preprocess']['thresholdtgt'],
                    nwords=args['preprocess']['nwordstgt'],
                    padding_factor=args['preprocess']['padding_factor'],
                )
        else:
            tgt_dict = None

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing, workers handle the 2nd-to-last chunks
        # e.g. 1.txt -> 10 processors: 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()
        # process the 1st chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl=args['preprocess']['dataset_impl'],
                                          vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenization.dpu_sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processes' index and data files into the final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, out_file=None, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            if out_file is None:
                out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
            # copy the shared dict into each language's directory
            out_dir = os.path.join(args['preprocess']['destdir'], l)
            PathManager.mkdir(out_dir)
            dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
            PathManager.copy(dict_path(lang), dst_dict)
            if args['preprocess']['trainpref']:
                out_file = os.path.join(out_dir, f"train.{lang}")
                make_dataset(vocab, args['preprocess']['trainpref'].replace('*', l), "train", lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])
            if args['preprocess']['validpref']:
                out_file = os.path.join(out_dir, f"valid.{lang}")
                make_dataset(vocab, args['preprocess']['validpref'].replace('*', l), 'valid', lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])
            if args['preprocess']['testpref']:
                out_file = os.path.join(out_dir, f"test.{lang}")
                make_dataset(vocab, args['preprocess']['testpref'].replace('*', l), 'test', lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
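# Sketch of the on-disk layout produced by this script, reconstructed from the
# paths built in make_all/make_binary_dataset above (the destdir, language
# directory names <l>, and modality <lang> are hypothetical placeholders):
#
#   <destdir>/<l>/<lang>.dict.jsonl   # shared dictionary copied per language directory
#   <destdir>/<l>/train.<lang>.mmap   # binarized token ids written by the dataset builder
#   <destdir>/<l>/train.<lang>.idx    # index written by ds.finalize(...)
#   ... and the same pattern for valid.<lang>.* and test.<lang>.*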