def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".json"

    def bpe_dict_path(lang):
        return dest_path(lang + '.bpe', "dict") + ".json"

    target = not args['preprocess']['only_source']

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        src_dict = task.build_dictionary(
            [train_path(args['preprocess']['source_lang'])],
            tokenize_func=tokenizers.sub_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'],
        )

    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
            tgt_dict = task.build_bpe_dictionary(
                [train_path(args['preprocess']['target_lang'])],
                tokenize_func=tokenizers.lower_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=0,
                nwords=args['preprocess']['nwordstgt'],
            )
    else:
        tgt_dict = None

    if args['preprocess']['source_bpe']:
        src_dict_name = bpe_dict_path(args['preprocess']['source_lang'])
    else:
        src_dict_name = dict_path(args['preprocess']['source_lang'])
    src_dict.save_json(src_dict_name)

    if target and tgt_dict is not None:
        if args['preprocess']['target_bpe']:
            tgt_dict_name = bpe_dict_path(args['preprocess']['target_lang'])
        else:
            tgt_dict_name = dict_path(args['preprocess']['target_lang'])
        tgt_dict.save_json(tgt_dict_name)

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing the 2nd..Nth chunks go to worker processes
        # 1.txt -> 10 processors, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        # if num_workers > 1:
        #     # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
        #     pool = Pool(processes=num_workers - 1)
        #     for worker_id in range(1, num_workers):
        #         prefix = "{}{}".format(output_file, worker_id)
        #         pool.apply_async(
        #             binarize,
        #             (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
        #             callback=merge_result
        #         )
        #     pool.close()

        # process the first chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        if 'code_tokens_wo_func' in os.path.basename(output_file):
            # the *_wo_func dataset is built from the code_tokens input but written to its own output file
            bin_out = Binarizer.binarize_wo_func(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenizers.string_sub_tokenizer,
                offset=0, end=offsets[1], append_eos=False,
            )
        elif 'code_tokens' in os.path.basename(input_file):
            bin_out = Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenizers.sub_tokenizer,
                offset=0, end=offsets[1], append_eos=False,
            )
        elif 'docstring_tokens' in os.path.basename(input_file):
            bin_out = Binarizer.binarize_bpe(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenizers.lower_tokenizer,
                offset=0, end=offsets[1], append_eos=False,
            )
        elif 'func_name' in os.path.basename(input_file):
            bin_out = Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenizers.func_name_tokenizer,
                offset=0, end=offsets[1], append_eos=False,
            )
        merge_result(bin_out)

        if pool is not None:  # guard on the pool itself: the worker branch above is currently disabled
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1, is_bpe=False):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            if lang == 'code_tokens_wo_func':
                in_file = file_name(input_prefix, 'code_tokens')
                out_file = dest_path(output_prefix, lang)
                num_workers = 1  # multi-processing is not supported here
            else:
                in_file = file_name(input_prefix, lang)
                out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            if is_bpe:
                out_file += '.bpe'
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab, is_bpe=False):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'], is_bpe=is_bpe)
        if args['preprocess']['validpref']:
            make_dataset(vocab, args['preprocess']['validpref'], "valid", lang,
                         num_workers=args['preprocess']['workers'], is_bpe=is_bpe)
        if args['preprocess']['testpref']:
            make_dataset(vocab, args['preprocess']['testpref'], "test", lang,
                         num_workers=args['preprocess']['workers'], is_bpe=is_bpe)

    make_all(args['preprocess']['source_lang'], src_dict, is_bpe=args['preprocess']['source_bpe'])
    make_all("code_tokens_wo_func", src_dict, is_bpe=args['preprocess']['source_bpe'])
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict, is_bpe=args['preprocess']['target_bpe'])
        make_all("func_name", tgt_dict, is_bpe=args['preprocess']['target_bpe'])  # func_name
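# --- Illustrative usage sketch (not part of the original script) -----------------------------
# Everything above is driven by a nested `args['preprocess']` mapping, normally produced from a
# config file. The key names below are taken from the code above; the concrete values (task
# name, paths, sizes) are hypothetical placeholders.
if __name__ == '__main__':
    example_args = {
        'preprocess': {
            'task': 'retrieval',                      # whatever tasks.get_task(...) expects
            'destdir': 'data-mmap/ruby',              # output directory for dicts + binarized data
            'trainpref': 'data-raw/ruby/train',       # split prefixes; files like train.code_tokens
            'validpref': 'data-raw/ruby/valid',
            'testpref': 'data-raw/ruby/test',
            'source_lang': 'code_tokens',
            'target_lang': 'docstring_tokens',
            'only_source': False,
            'srcdict': None, 'tgtdict': None,         # build dictionaries from trainpref when unset
            'workers': 4,
            'thresholdsrc': 0,
            'nwordssrc': 50000, 'nwordstgt': 50000,
            'source_bpe': False, 'target_bpe': True,
            'dataset_impl': 'mmap',
        }
    }
    main(example_args)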
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = PathManager.ls(train_path(args['preprocess']['source_lang']))
        if not args['preprocess']['only_train']:
            filenames.extend(PathManager.ls(valid_path(args['preprocess']['source_lang'])))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    # copy the shared dict into each language's data directory
    for d in PathManager.ls(os.path.dirname(args['preprocess']['trainpref'])):
        lang = os.path.basename(d)
        src_dict.save(os.path.join(args['preprocess']['destdir'], lang,
                                   f"{args['preprocess']['source_lang']}.dict.jsonl"))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, num_workers):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing the 2nd..Nth chunks go to worker processes
        # 1.txt -> 10 processors, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        # process the first chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))

        def consumer(data, _):
            ds.add_item(data)

        merge_result(
            Binarizer.binarize_seperate(
                input_file, vocab, consumer,
                tokenize=string2tokens,
                offset=0, end=offsets[1], append_eos=False,
            ))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            languages = [os.path.basename(d) for d in PathManager.ls(os.path.dirname(input_prefix))]
            for l in languages:
                in_file = file_name(input_prefix, lang)
                in_file = str.replace(in_file, '*', l)
                out_file = dest_path(os.path.join(l, output_prefix), lang)
                PathManager.mkdir(os.path.dirname(out_file))
                make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
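# --- Illustrative sketch (not the ncc implementation) ----------------------------------------
# Both `Binarizer.find_offsets` and `file_io.find_offsets` used throughout these scripts return
# `num_workers + 1` byte offsets that cut the input file into newline-aligned chunks, so each
# worker binarizes whole lines only. A minimal re-implementation of that contract, written here
# purely for documentation, could look like this:
import os


def find_offsets_sketch(filename, num_chunks):
    """Return num_chunks + 1 byte offsets splitting `filename` at line boundaries."""
    with open(filename, 'r', encoding='utf-8') as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0 for _ in range(num_chunks + 1)]
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            f.readline()  # advance to the end of the current line so no example is split
            offsets[i] = f.tell()
        offsets[num_chunks] = size
    return offsets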
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    from dataset.codexglue.code_to_text import BPE_DIR

    source_dict_file = os.path.join(BPE_DIR, 'csn/csn.spm.vocab')
    target_dict_file = os.path.join(os.path.dirname(args['preprocess']['destdir']), 'dict.jsonl')
    # convert the sentencepiece vocab (token<TAB>logprob per line) into the ncc dictionary format
    with open(source_dict_file, 'r') as reader, open(target_dict_file, 'w') as writer:
        for line in reader:
            print(json_io.json_dumps([line.split('\t')[0], 100]), file=writer)
    src_dict = tgt_dict = task.load_dictionary(target_dict_file)

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing the 2nd..Nth chunks go to worker processes
        # 1.txt -> 10 processors, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        # process the first chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer,
                offset=0, end=offsets[1], append_eos=True,
            ))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
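# --- Illustrative sketch (not shown in this file) ---------------------------------------------
# The pooled `pool.apply_async(binarize, ...)` calls above rely on a module-level `binarize`
# helper defined elsewhere in the repository. A plausible worker, mirroring the in-process branch
# of make_binary_dataset above (the name, signature, and tokenizer choice are assumptions), is:
def binarize(args, filename, vocab, output_prefix, offset, end, append_eos=True):
    """Binarize one byte range of `filename` into its own .mmap/.idx pair inside a worker."""
    ds = indexed_dataset.make_builder('{}.mmap'.format(output_prefix),
                                      impl=args['preprocess']['dataset_impl'],
                                      vocab_size=len(vocab))
    res = Binarizer.binarize(filename, vocab, lambda t: ds.add_item(t),
                             tokenize=tokenization.json_tokenizer,
                             offset=offset, end=end, append_eos=append_eos)
    ds.finalize('{}.idx'.format(output_prefix))
    # the returned dict ({"nseq", "ntok", "replaced", ...}) feeds merge_result via the callback
    return res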
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(modality):
        train_files = []
        for lang, value in args['preprocess']['dataprefs'].items():
            train_files.append("{}{}".format(args['preprocess']['dataprefs'][lang]['trainpref'],
                                             ("." + modality) if modality else ""))
        return train_files

    def build_dictionary(filenames, modality, src=False, tgt=False):
        """
        ['code_tokens', 'docstring_tokens', 'path', 'sbt', 'sbtao', 'binary_ast', 'traversal']
        """
        assert src ^ tgt
        if modality in ['binary_ast']:
            tokenize_func = tokenization.json_tokenizer
        elif modality in ['code_tokens', 'docstring_tokens', 'path', 'path.terminals', 'sbt', 'sbtao', 'traversal']:
            tokenize_func = tokenization.json_tokenizer
        else:
            raise NotImplementedError("{}".format(modality))
        return task.build_dictionary(
            filenames,
            tokenize_func=tokenize_func,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'] if src else args['preprocess']['nwordstgt'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    def build_vocab_dict(args):
        """Build vocabulary (dictionary) for source and target domain"""
        LOGGER.info('Build vocabularies...')
        # task = tasks.get_task(args['preprocess']['task'])
        src_dicts = OrderedDict()

        def load_dict(modality):
            modality_dict_filename = os.path.join(args['preprocess']['destdir'],
                                                  'data-{}'.format(args['preprocess']['dataset_impl']),
                                                  '{}.dict.json'.format(modality))
            os.makedirs(os.path.dirname(modality_dict_filename), exist_ok=True)
            if os.path.exists(modality_dict_filename):
                LOGGER.info('Loading {} dict from {}'.format(modality, modality_dict_filename))
                modality_dict = Dictionary.load_json(modality_dict_filename)
            else:
                modality_dict = build_dictionary(train_path(modality), modality, src=True)
                LOGGER.info('Saving {} dict at {}'.format(modality, modality_dict_filename))
                modality_dict.save_json(modality_dict_filename)
            return modality_dict

        if args['preprocess']['joined_dictionary']:
            modalities = args['preprocess']['source_lang'] + [args['preprocess']['target_lang']]
            modalities = sorted(list(itertools.filterfalse(lambda modality: modality is None, modalities)))
            joined_dictionary_filename = os.path.join(args['preprocess']['destdir'],
                                                      '{}.dict.txt'.format('_'.join(modalities)))
            if os.path.exists(joined_dictionary_filename):
                LOGGER.info('Loading joint dict from {}'.format(joined_dictionary_filename))
                joined_dictionary = Dictionary.load_json(joined_dictionary_filename)
            else:
                joined_dictionary = build_dictionary(
                    [train_path(modality) for modality in modalities], modalities, src=True)
                LOGGER.info('Saving joint dict at {}'.format(joined_dictionary_filename))
                joined_dictionary.save_json(joined_dictionary_filename)
            for modality in modalities:
                src_dicts[modality] = joined_dictionary
            tgt_dict = joined_dictionary
        else:
            # src dict
            for modality in args['preprocess']['source_lang']:
                src_dicts[modality] = load_dict(modality)
            # tgt dict
            if args['preprocess']['target_lang']:
                tgt_dict = load_dict(args['preprocess']['target_lang'])
            else:
                tgt_dict = None
        return src_dicts, tgt_dict

    # 1. build vocabulary
    src_dicts, tgt_dict = build_vocab_dict(args)

    # 2. ***************build dataset********************
    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, impl, lang, modality):
        return os.path.join(args['preprocess']['destdir'], 'data-{}'.format(impl), lang,
                            file_name(prefix, modality))

    def make_binary_dataset(dict: Dictionary, input_file, output_file, attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing the 2nd..Nth chunks go to worker processes
        # 1.txt -> 10 processors, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, dict, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        # process the first chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))
        merge_result(
            Binarizer.binarize(
                input_file, dict, lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer,
                offset=0, end=offsets[1], append_eos=False,
            ))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            attr, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word,
        ))

    def make_graph_bin_dataset(dict: Dictionary, input_file, output_file, num_workers):
        offsets = Binarizer.find_offsets(input_file, num_workers)
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers)
            for worker_id in range(num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize_dgl,
                    (args, input_file, dict, prefix, offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()
        else:
            prefix = "{}0".format(output_file)
            binarize_dgl(args, input_file, dict, prefix, 0, -1)

    def make_dataset(vocab, input_prefix, output_prefix, lang, modality, num_workers=1):
        in_file = file_name(input_prefix, modality)
        out_file = dest_path(output_prefix, args['preprocess']['dataset_impl'], lang, modality)
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        if args['preprocess']['dataset_impl'] == "raw":
            LOGGER.info('Copying {} into {}'.format(in_file, out_file))
            shutil.copy(src=in_file, dst=out_file)
        else:
            if modality == 'binary_ast':
                make_graph_bin_dataset(vocab, in_file, out_file, num_workers)
            else:
                make_binary_dataset(vocab, in_file, out_file, modality, num_workers)

    def make_all(modality, vocab, lang, data_prefs):
        num_workers = min(args['preprocess']['workers'], cpu_count())
        if data_prefs['trainpref']:
            make_dataset(vocab, data_prefs['trainpref'], "train", lang, modality, num_workers=num_workers)
        if data_prefs['validpref']:
            for k, validpref in enumerate(data_prefs['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, modality, num_workers=num_workers)
        if data_prefs['testpref']:
            for k, testpref in enumerate(data_prefs['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, modality, num_workers=num_workers)

    def build_dataset(args: Dict, src_dicts: Dict[str, Dictionary], tgt_dict: Dictionary):
        """build dataset for each modality"""
        for modality, src_dict in src_dicts.items():
            LOGGER.info('Building dataset for {}'.format(modality))
            for lang, data_prefs in args['preprocess']['dataprefs'].items():
                make_all(modality, src_dict, lang, data_prefs)

    # 2. build dataset
    build_dataset(args, src_dicts, tgt_dict)
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}.{}".format(args['preprocess']['trainpref'], lang)

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        return f"{prefix}.{lang}"

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if target and not args['preprocess']['tgtdict'] and os.path.exists(dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [train_path(args['preprocess']['source_lang']),
                         train_path(args['preprocess']['target_lang'])]
            if not args['preprocess']['only_train']:
                filenames.extend([valid_path(args['preprocess']['source_lang']),
                                  valid_path(args['preprocess']['target_lang'])])
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=json_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                # set max len for joint dictionaries
                nwords=max(args['preprocess']['nwordssrc'], args['preprocess']['nwordstgt']),
            )
        tgt_dict = src_dict
    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [train_path(args['preprocess']['source_lang'])]
            if not args['preprocess']['only_train']:
                filenames.append(valid_path(args['preprocess']['source_lang']))
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=json_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                nwords=args['preprocess']['nwordssrc'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
            )
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
                filenames = [train_path(args['preprocess']['target_lang'])]
                if not args['preprocess']['only_train']:
                    filenames.append(valid_path(args['preprocess']['target_lang']))
                tgt_dict = task.build_dictionary(
                    filenames,
                    tokenize_func=json_tokenizer,
                    workers=args['preprocess']['workers'],
                    threshold=args['preprocess']['threshold'],
                    nwords=args['preprocess']['nwordstgt'],
                    padding_factor=args['preprocess']['padding_factor'],
                    bos=None,
                    eos=None,
                )
        else:
            tgt_dict = None

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def seperate_tokenize(line):
        line = json_io.json_loads(line)
        tokens = separate_list(line, args['preprocess']['max_len'])
        return tokens

    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing the 2nd..Nth chunks go to worker processes
        # 1.txt -> 10 processors, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        # process the first chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = f'{output_file}.mmap'
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        merge_result(
            Binarizer.binarize_seperate(
                input_file, vocab, consumer,
                tokenize=seperate_tokenize,
                offset=0, end=offsets[1], append_eos=False,
            ))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.ext"))

        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        data_files = train_path(args['preprocess']['source_lang'])
        data_files = PathManager.ls(data_files)
        src_dict = task.build_bpe_dictionary(
            data_files,
            tokenize_func=tokenizers.sub_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
            bpe_portion=args['preprocess']['source_bpe_portion'],
        )

    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
            data_files = train_path(args['preprocess']['target_lang'])
            if '*' in data_files:
                data_files = glob(data_files)
            else:
                data_files = [data_files]
            tgt_dict = task.build_bpe_dictionary(
                data_files,
                tokenize_func=tokenizers.lower_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=0,
                nwords=args['preprocess']['nwordstgt'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
                bpe_portion=args['preprocess']['target_bpe_portion'],
            )
    else:
        tgt_dict = None

    # src_dict.save(dict_path(args['preprocess']['source_lang']))
    # tgt_dict.save(dict_path(args['preprocess']['target_lang']))
    # tgt_dict.save(dict_path("func_name"))  # save target_lang dict for func_name

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, use_func, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = find_offsets(input_file, num_chunks=num_workers)
        func_offsets = None
        modality = input_file.split('.')[-1]
        if modality == 'code_tokens':
            tokenizer = tokenizers.list_tokenizer
            if use_func:
                func_offsets = Binarizer.find_func_offsets(input_file, offsets=offsets)
        elif modality == 'func_name':
            tokenizer = tokenizers.func_name_tokenizer
        elif modality == 'docstring_tokens':
            tokenizer = tokenizers.lower_tokenizer
        else:
            raise NotImplementedError(modality)

        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (
                                     args,
                                     input_file,
                                     vocab,
                                     prefix,
                                     tokenizer,
                                     use_func and (modality == 'code_tokens'),
                                     offsets[worker_id],
                                     offsets[worker_id + 1],
                                     func_offsets[worker_id] if func_offsets else 0,
                                 ),
                                 callback=merge_result)
            pool.close()

        # process the first chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file, vocab, lambda t: ds.add_item(t),
                tokenize=tokenizer,
                use_func=use_func and (modality == 'code_tokens'),
                offset=offsets[0], end=offsets[1],
                func_offset=func_offsets[0] if func_offsets else 0,
                append_eos=False,
                min_func_len=args['preprocess']['min_func_len'],
            ))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, use_func=False, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_files = file_name(input_prefix, lang)
            if '*' in in_files:
                in_files = glob(in_files)
            else:
                in_files = [in_files]
            for in_file in in_files:
                if lang == 'code_tokens':
                    out_file = dest_path(output_prefix, f'{str.split(in_file, os.sep)[-2]}.{lang + ".wo_func"}') \
                        if use_func else dest_path(output_prefix, f'{str.split(in_file, os.sep)[-2]}.{lang}')
                else:
                    out_file = dest_path(output_prefix, f'{str.split(in_file, os.sep)[-2]}.{lang}')
                os.makedirs(os.path.dirname(out_file), exist_ok=True)
                make_binary_dataset(vocab, in_file, out_file, use_func, num_workers)

    def make_all(lang, vocab, use_func=False):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'], use_func=use_func)
        if args['preprocess']['validpref']:
            make_dataset(vocab, args['preprocess']['validpref'], "valid", lang,
                         num_workers=args['preprocess']['workers'], use_func=use_func)
        if args['preprocess']['testpref']:
            make_dataset(vocab, args['preprocess']['testpref'], "test", lang,
                         num_workers=args['preprocess']['workers'], use_func=use_func)

    make_all(args['preprocess']['source_lang'], src_dict)
    make_all(args['preprocess']['source_lang'], src_dict, use_func=True)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
        make_all('func_name', tgt_dict)  # func_name as query
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=bin_ast_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
        )
    src_dict.save(dict_path(args['preprocess']['source_lang']))

    # 2. ***************build dataset********************
    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            offsets = find_offsets(in_file, num_workers)
            with Pool(num_workers) as mpool:
                results = [
                    mpool.apply_async(
                        build_dgl_graph,
                        (vocab, in_file, f'{out_file}{worker_id}.mmap',
                         offsets[worker_id], offsets[worker_id + 1]),
                    )
                    for worker_id in range(num_workers)
                ]
                results = [res.get() for res in results]
            graph_batch = []
            for worker_id in range(num_workers):
                sub_file = f'{out_file}{worker_id}.mmap'
                glist, _ = load_graphs(sub_file)
                graph_batch.extend(glist)
                os.remove(sub_file)
            save_graphs(f'{out_file}.mmap', graph_batch)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
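# --- Usage note (illustrative; the path below is hypothetical) --------------------------------
# The merged file written by save_graphs above can be read back with the same dgl helper that the
# per-worker merge loop already uses:
from dgl.data.utils import load_graphs

graph_batch, _ = load_graphs('data-mmap/train.binary_ast.mmap')  # (list of DGLGraph, label dict)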
import os

from dataset.codexglue.code_to_text import (
    LANGUAGES,
    MODES,
)
from ncc import tasks
from ncc.data import (
    Dictionary,
    indexed_dataset,
)
from ncc.utils.file_ops.yaml_io import recursive_expanduser
from ncc.utils.file_ops import file_io
from ncc.utils.path_manager import PathManager

if __name__ == '__main__':
    task = tasks.get_task('multilingual_denoising')
    base_dir = recursive_expanduser('~/ncc_data/codexglue/code_to_text/multilingual_denoising/data-mmap')
    dict_file = os.path.join(base_dir, 'dict.jsonl')
    vocab = task.load_dictionary(dict_file)

    for mode in MODES:
        dst_file = os.path.join(base_dir, 'docstring', f"{mode}.docstring.spm")
        PathManager.mkdir(os.path.dirname(dst_file))
        # mmap
        ds = indexed_dataset.make_builder(f'{dst_file}.mmap', impl='mmap', vocab_size=len(vocab))
        for lang in LANGUAGES:
            src_file = os.path.join(base_dir, lang, f"{mode}.docstring.spm")
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path("dict", lang) + ".txt"

    target = not args['preprocess']['only_source']

    # 1. build vocabulary from bpe directory
    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if target and not args['preprocess']['tgtdict'] and os.path.exists(dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            LOGGER.error('Please run sentencepiece to generate the model and vocab files first.')
            exit()
        tgt_dict = src_dict

        # Load sentencepiece (sp) model
        if args['preprocess']['src_sp']:
            src_sp = spm.SentencePieceProcessor()
            src_sp.load(args['preprocess']['src_sp'])
        elif args['preprocess']['tgt_sp']:
            src_sp = spm.SentencePieceProcessor()
            src_sp.load(args['preprocess']['tgt_sp'])
        else:
            LOGGER.error('Please assign the sentencepiece model path.')
            exit()
        tgt_sp = src_sp
    else:
        if args['preprocess']['srcdict'] and args['preprocess']['src_sp']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
            src_sp = spm.SentencePieceProcessor()
            src_sp.load(args['preprocess']['src_sp'])
        else:
            LOGGER.error('Please run sentencepiece to generate the model and vocab files first.')
            exit()
        if target:
            if args['preprocess']['tgtdict'] and args['preprocess']['tgt_sp']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
                tgt_sp = spm.SentencePieceProcessor()
                tgt_sp.load(args['preprocess']['tgt_sp'])
            else:
                # assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
                # tgt_dict = build_dictionary([train_path(args['preprocess']['target_lang'])], tgt=True)
                LOGGER.error('Please run sentencepiece to generate the model and vocab files first.')
                exit()
        else:
            tgt_dict = None
            tgt_sp = None
            # exit()

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks; with multi-processing the 2nd..Nth chunks go to worker processes
        # 1.txt -> 10 processors, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, vocab, prefix, attr,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        # process the first chunk in the main process; without multi-processing, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize_bpe(input_file, vocab, lambda t: ds.add_item(t),
                                   offset=0, end=offsets[1]))

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("[{}] {}: {} sents, {} tokens, BPE no replaced token".format(
            attr, input_file, n_seq_tok[0], n_seq_tok[1],
        ))

    def make_dataset(vocab, sp, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == 'raw':
            with open(file_name(input_prefix, lang), 'rb') as input_file, \
                    open(dest_path(output_prefix, lang), 'w', encoding="utf-8") as output_file:
                for line in input_file.readlines()[0:100]:  # TODO only for debug
                    line = ujson.loads(line)
                    line = normalize_program(line)
                    line = sp.EncodeAsPieces(line)
                    output_file.write(ujson.dumps(line) + '\n')
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab, sp):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, sp, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, sp, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, sp, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    # 2. build dataset
    make_all(args['preprocess']['source_lang'], src_dict, src_sp)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict, tgt_sp)
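# --- Illustrative sketch (not part of the original script) ------------------------------------
# The script above exits unless a sentencepiece model (src_sp/tgt_sp) and a matching dictionary
# (srcdict/tgtdict) already exist. A hedged example of producing them with the standard
# sentencepiece trainer (file names and option values here are hypothetical; the .vocab output
# still has to be converted into the ncc dictionary format, e.g. as the csn.spm.vocab conversion
# earlier in this section does):
import sentencepiece as spm

spm.SentencePieceTrainer.train(
    input='data-raw/train.code',        # one normalized program per line
    model_prefix='data-raw/code.spm',   # writes code.spm.model (-> src_sp) and code.spm.vocab
    vocab_size=50000,
    model_type='unigram',
    character_coverage=0.9995,
)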
def main(args):
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    # 1. ***************build vocabulary***************
    task = tasks.get_task(args['preprocess']['task'])

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['subtokendict']:
        subtoken_dict = task.load_dictionary(args['preprocess']['subtokendict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        subtoken_dict = task.build_dictionary(
            filenames,
            tokenize_func=subtoken_tokenize,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssubtoken'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    if args['preprocess']['typedict']:
        type_dict = task.load_dictionary(args['preprocess']['typedict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        type_dict = task.build_dictionary(
            filenames,
            tokenize_func=type_tokenize,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordstype'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    if args['preprocess']['docstringdict']:
        docstring_dict = task.load_dictionary(args['preprocess']['docstringdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = [train_path(args['preprocess']['target_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['target_lang']))
        docstring_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordsdocstring'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    subtoken_dict.save(dict_path('subtoken'))
    type_dict.save(dict_path('type'))
    docstring_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, input_file, vocab, aux_dict, prefix, lang, tokenize, max_path_num,
                     offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result
                )
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        if lang == 'path':
            sz_ds_file = '{}.sz.mmap'.format(output_file)
            sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                                 vocab_size=len(vocab))
        else:
            sz_ds = None

        def consumer(tensor, size=None):
            ds.add_item(tensor)
            if size is not None:
                sz_ds.add_item(size)

        if sz_ds is None:
            merge_result(
                Binarizer.binarize(
                    input_file, vocab, consumer,
                    tokenize=tokenize,
                    offset=0, end=offsets[1], append_eos=False,
                    max_path_num=max_path_num,
                )
            )
        else:
            merge_result(
                PathSummarizationBinarizer.path_binarizer(
                    input_file, vocab, consumer,
                    tokenize=tokenize,
                    offset=0, end=offsets[1], append_eos=False,
                    type_dict=aux_dict,
                    max_path_num=max_path_num,
                )
            )

        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                if sz_ds is not None:
                    sz_ds.merge_file_(f"{temp_file_path}.sz")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                if sz_ds is not None:
                    os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                    os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))

        ds.finalize('{}.idx'.format(output_file))
        if sz_ds is not None:
            sz_ds.finalize('{}.sz.idx'.format(output_file))
        LOGGER.info("[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word,
        ))

    def make_dataset(vocab, aux_dict, input_prefix, output_prefix, lang, max_path_num, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, aux_dict, in_file, out_file, lang, max_path_num, num_workers)

    def make_all(lang, vocab, aux_dict=None):
        if args['preprocess']['trainpref']:
            max_path_num = args['preprocess']['train_path_num']
            make_dataset(vocab, aux_dict, args['preprocess']['trainpref'], "train", lang, max_path_num,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            max_path_num = args['preprocess']['eval_path_num']
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, aux_dict, validpref, outprefix, lang, max_path_num,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            max_path_num = args['preprocess']['eval_path_num']
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, aux_dict, testpref, outprefix, lang, max_path_num,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], subtoken_dict, type_dict)
    make_all(args['preprocess']['target_lang'], docstring_dict)
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    # 1. build vocabulary
    LOGGER.info('Build vocabularies...')
    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if not args['preprocess']['tgtdict'] and os.path.exists(dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        src_dict = task.build_dictionary(
            [train_path(args['preprocess']['source_lang'])],
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
            unk=None,
        )

    if args['preprocess']['tgtdict']:
        tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
        tgt_dict = task.build_dictionary(
            filenames=[],
            tokenize_func=label_tokenization,
            workers=args['preprocess']['workers'],
            padding_factor=args['preprocess']['padding_factor'],
            pad=None,
            bos=None,
            eos=None,
            unk=None,
        )
        tgt_dict.add_symbol('CPU', 0)
        tgt_dict.add_symbol('GPU', 1)

    LOGGER.info('dict_path: {}'.format(dict_path(args['preprocess']['source_lang'])))
    src_dict.save(dict_path(args['preprocess']['source_lang']))
    LOGGER.info('dict_path: {}'.format(dict_path(args['preprocess']['target_lang'])))
    tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab)))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks
        # if multi-processing is used, workers handle the 2nd through last chunks
        # 1.txt -> 10 processor, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, input_file, vocab, prefix, attr, offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result,
                )
            pool.close()

        # process the 1st chunk if multi-processing is available; otherwise process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=label_tokenization if attr == 'oracle' else tokenization.json_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:  # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    make_all(args['preprocess']['target_lang'], tgt_dict)

    # dump other attributes
    for lang in args['preprocess']['auxiliary_langs']:
        src_file = f"{args['preprocess']['trainpref']}.{lang}"
        tgt_file = os.path.join(args['preprocess']['destdir'], f"train.{lang}")
        with open(src_file, 'r') as reader, open(tgt_file, 'wb') as writer:
            data = [eval(line.strip()) for line in reader]
            pickle.dump(data, file=writer)
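# --- Illustrative sketch (not part of the original scripts) --------------------
# Every make_binary_dataset above follows the same pattern: split the input
# file into byte ranges aligned to line boundaries, let N-1 worker processes
# binarize ranges 1..N-1 into temporary shards, binarize range 0 in the main
# process, then merge the shards and the per-worker statistics. Below is a
# minimal, stdlib-only toy version of that pattern, assuming `vocab` is a plain
# dict mapping token -> id; it is not the ncc Binarizer/indexed_dataset code.
import os
from collections import Counter
from multiprocessing import Pool

def find_offsets(path, num_chunks):
    """Return num_chunks + 1 byte offsets aligned to line boundaries."""
    size = os.path.getsize(path)
    offsets = [0]
    with open(path, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()  # advance to the start of the next full line
            offsets.append(f.tell())
    offsets.append(size)
    return offsets

def binarize_chunk(path, vocab, start, end):
    """Encode tokens in [start, end) to ids; count sequences, tokens, unknowns."""
    nseq, ntok, replaced, ids = 0, 0, Counter(), []
    with open(path, 'rb') as f:
        f.seek(start)
        while f.tell() < end:
            tokens = f.readline().decode('utf-8').split()
            ids.append([vocab.get(t, 0) for t in tokens])  # id 0 stands in for <unk>
            replaced.update(t for t in tokens if t not in vocab)
            nseq, ntok = nseq + 1, ntok + len(tokens)
    return {"nseq": nseq, "ntok": ntok, "replaced": replaced, "ids": ids}

def binarize_file(path, vocab, num_workers=4):
    """Binarize chunk 0 in-process and chunks 1..N-1 in a worker pool, then merge stats."""
    offsets = find_offsets(path, num_workers)
    results = []
    if num_workers > 1:
        with Pool(processes=num_workers - 1) as pool:
            jobs = [pool.apply_async(binarize_chunk, (path, vocab, offsets[i], offsets[i + 1]))
                    for i in range(1, num_workers)]
            results.append(binarize_chunk(path, vocab, offsets[0], offsets[1]))
            results.extend(job.get() for job in jobs)
    else:
        results.append(binarize_chunk(path, vocab, offsets[0], offsets[-1]))
    replaced = sum((r["replaced"] for r in results), Counter())
    stats = {"nseq": sum(r["nseq"] for r in results),
             "ntok": sum(r["ntok"] for r in results),
             "replaced": replaced}
    return results, stats
# call binarize_file(...) under `if __name__ == "__main__":` on spawn-based platforms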
def main(args):
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    # 1. ***************build vocabulary***************
    task = tasks.get_task(args['preprocess']['task'])

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    def string2dfs(line):
        line = json_io.json_loads(line)
        asts = py150_util.separate_dps(line, args['preprocess']['max_len'])
        ast_dfs = [[py150_util.get_dfs(ast), ext] for ast, ext in asts if len(ast) > 1]
        return ast_dfs

    def string2type_dfs(line):
        type_dfs = type_tokenize_func(line)
        type_dfs = py150_util.separate_dps(type_dfs, args['preprocess']['max_len'])
        type_dfs = [[dfs, ext] for dfs, ext in type_dfs if len(dfs) > 1]
        return type_dfs

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    target = not args['preprocess']['only_source']

    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    if target and not args['preprocess']['tgtdict'] and os.path.exists(dict_path(args['preprocess']['target_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = [train_path(args['preprocess']['source_lang'])]
        if not args['preprocess']['only_train']:
            filenames.append(valid_path(args['preprocess']['source_lang']))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenize_func,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    if target:
        if args['preprocess']['tgtdict']:
            tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
            # code_types are from ast
            filenames = [train_path(args['preprocess']['source_lang'])]
            if not args['preprocess']['only_train']:
                filenames.append(valid_path(args['preprocess']['source_lang']))
            tgt_dict = task.build_dictionary(
                filenames,
                tokenize_func=type_tokenize_func,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                nwords=args['preprocess']['nwordstgt'],
                padding_factor=args['preprocess']['padding_factor'],
                bos=None,
                eos=None,
            )
    else:
        tgt_dict = None

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, lang, num_workers):
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        ext_ds = indexed_dataset.make_builder(f'{output_file}.ext', impl='seq')

        def consumer(data, start_idx):
            ds.add_item(data)
            ext_ds.add_item(start_idx)

        tokenize = string2dfs if lang == 'ast' else string2type_dfs
        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=tokenize,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:  # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                ext_ds.merge_file_(f"{temp_file_path}.ext")
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.ext"))
        ds.finalize('{}.idx'.format(output_file))
        ext_ds.finalize()
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            # TODO: parse json to txt file, one line per traversal; please help parallelize it.
            """
            Because only one thread is allowed to write the output file, we use
            multi-processing to process the data, merge the workers' results into
            a block, and then dump that block.
            """
            def _func(line):
                line = py150_util.separate_dps(json_io.json_loads(line.strip()), args['preprocess']['n_ctx'])
                line = [py150_util.get_dfs(ast) + [ext] for ast, ext in line if len(ast) > 1]
                # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
                return line

            with PPool() as thread_pool:
                with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                        file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                    def _write(result):
                        for res in itertools.chain(*result):
                            print(json_io.json_dumps(res), file=fout)

                    batch_data = []
                    for line in f:
                        batch_data.append(line)
                        if len(batch_data) >= MAX_BATCH_SIZE:
                            result = thread_pool.feed(_func, batch_data, one_params=True)
                            _write(result)
                            del batch_data
                            batch_data = []
                    if len(batch_data) > 0:
                        result = thread_pool.feed(_func, batch_data, one_params=True)
                        _write(result)
                        del batch_data
        else:
            if lang == 'code_types':
                in_file = file_name(input_prefix, 'ast')
            else:
                in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    target = not args['preprocess']['only_source']

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = [
                train_path(args['preprocess']['source_lang']),
                train_path(args['preprocess']['target_lang']),
            ]
            if not args['preprocess']['only_train']:
                filenames.extend([
                    valid_path(args['preprocess']['source_lang']),
                    valid_path(args['preprocess']['target_lang']),
                ])
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['threshold'],
                # set max len for joint dictionaries
                nwords=max(args['preprocess']['nwordssrc'], args['preprocess']['nwordstgt']),
            )
        tgt_dict = src_dict
    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            filenames = PathManager.ls(train_path(args['preprocess']['source_lang']))
            if not args['preprocess']['only_train']:
                filenames.extend(PathManager.ls(valid_path(args['preprocess']['source_lang'])))
            src_dict = task.build_dictionary(
                filenames,
                tokenize_func=tokenization.dpu_sub_tokenizer,
                workers=args['preprocess']['workers'],
                threshold=args['preprocess']['thresholdsrc'],
                nwords=args['preprocess']['nwordssrc'],
                padding_factor=args['preprocess']['padding_factor'],
            )
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
                filenames = PathManager.ls(train_path(args['preprocess']['target_lang']))
                if not args['preprocess']['only_train']:
                    filenames.extend(PathManager.ls(valid_path(args['preprocess']['target_lang'])))
                tgt_dict = task.build_dictionary(
                    filenames,
                    tokenize_func=tokenization.dpu_sub_tokenizer,
                    workers=args['preprocess']['workers'],
                    threshold=args['preprocess']['thresholdtgt'],
                    nwords=args['preprocess']['nwordstgt'],
                    padding_factor=args['preprocess']['padding_factor'],
                )
        else:
            tgt_dict = None

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, num_workers: int):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks
        # if multi-processing is used, workers handle the 2nd through last chunks
        # 1.txt -> 10 processor, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result,
                )
            pool.close()

        # process the 1st chunk if multi-processing is available; otherwise process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenization.dpu_sub_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:  # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, out_file=None, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            in_file = file_name(input_prefix, lang)
            if out_file is None:
                out_file = dest_path(output_prefix, lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
            # copy the shared dict into each language directory
            out_dir = os.path.join(args['preprocess']['destdir'], l)
            PathManager.mkdir(out_dir)
            dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
            PathManager.copy(dict_path(lang), dst_dict)
            if args['preprocess']['trainpref']:
                out_file = os.path.join(out_dir, f"train.{lang}")
                make_dataset(vocab, args['preprocess']['trainpref'].replace('*', l), "train", lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])
            if args['preprocess']['validpref']:
                out_file = os.path.join(out_dir, f"valid.{lang}")
                make_dataset(vocab, args['preprocess']['validpref'].replace('*', l), 'valid', lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])
            if args['preprocess']['testpref']:
                out_file = os.path.join(out_dir, f"test.{lang}")
                make_dataset(vocab, args['preprocess']['testpref'].replace('*', l), 'test', lang,
                             out_file=out_file, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
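# --- Illustrative sketch (not part of the original scripts) --------------------
# make_all above expands a '*' placeholder in trainpref into one sub-directory
# per language, copies the shared dictionary into each of them, and binarizes
# that language's splits there. A stdlib-only approximation of the directory
# fan-out; the names, paths, and layout below are assumptions for illustration.
import os
import shutil

def fan_out_shared_dict(trainpref, destdir, lang, dict_file):
    """Copy one shared dict into <destdir>/<language>/ for every language dir."""
    lang_root = trainpref.split('*')[0]          # e.g. 'data/*/train' -> 'data/'
    for l in sorted(os.listdir(lang_root)):
        out_dir = os.path.join(destdir, l)
        os.makedirs(out_dir, exist_ok=True)
        shutil.copy(dict_file, os.path.join(out_dir, f'{lang}.dict.jsonl'))
        yield l, os.path.join(out_dir, f'train.{lang}')  # where the binarized split would go

# for l, out_file in fan_out_shared_dict('data/*/train', 'data-bin', 'code_tokens',
#                                        'data-bin/code_tokens.dict.jsonl'):
#     print(l, out_file)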
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir for {} task'.format(args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".json"

    def build_dictionary(filenames, modality, src=False, tgt=False):
        assert src ^ tgt
        if modality in ['binary_ast']:
            tokenize_func = tokenization.json_tokenizer
        elif modality in ['code_tokens', 'docstring_tokens', 'sbt', 'sbtao', 'path']:
            tokenize_func = tokenization.json_tokenizer
        return task.build_dictionary(
            filenames,
            tokenize_func=tokenize_func,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['thresholdsrc'],
            nwords=args['preprocess']['nwordssrc'] if src else args['preprocess']['nwordstgt'],
            padding_factor=args['preprocess']['padding_factor'],
        )

    # 1. build vocabulary
    LOGGER.info('Build vocabularies...')
    target = not args['preprocess']['only_source']
    # if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
    #     raise FileExistsError(dict_path(args['preprocess']['source_lang']))
    # if target and not args['preprocess']['tgtdict'] and os.path.exists(dict_path(args['preprocess']['target_lang'])):
    #     raise FileExistsError(dict_path(args['preprocess']['target_lang']))

    if args['preprocess']['joined_dictionary']:
        assert not args['preprocess']['srcdict'] or not args['preprocess']['tgtdict'], \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        elif args['preprocess']['tgtdict']:
            src_dict = task.load_dictionary(args['preprocess']['tgtdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary(
                {train_path(lang) for lang in [args['preprocess']['source_lang'], args['preprocess']['target_lang']]},
                args['preprocess']['source_lang'],
                src=True,
            )
        tgt_dict = src_dict
    else:
        if args['preprocess']['srcdict']:
            src_dict = task.load_dictionary(args['preprocess']['srcdict'])
        else:
            assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args['preprocess']['source_lang'])],
                                        args['preprocess']['source_lang'], src=True)
        if target:
            if args['preprocess']['tgtdict']:
                tgt_dict = task.load_dictionary(args['preprocess']['tgtdict'])
            else:
                assert args['preprocess']['trainpref'], "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args['preprocess']['target_lang'])],
                                            args['preprocess']['target_lang'], tgt=True)
        else:
            tgt_dict = None

    LOGGER.info('dict_path: {}'.format(dict_path(args['preprocess']['source_lang'])))
    src_dict.save_json(dict_path(args['preprocess']['source_lang']))
    if target and tgt_dict is not None:
        tgt_dict.save_json(dict_path(args['preprocess']['target_lang']))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab: Dictionary, input_file, output_file, attr: str, num_workers: int):
        """make binary dataset"""
        LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split the file into chunks
        # if multi-processing is used, workers handle the 2nd through last chunks
        # 1.txt -> 10 processor, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        attr,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                    callback=merge_result
                )
            pool.close()

        # process the 1st chunk if multi-processing is available; otherwise process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
        merge_result(
            Binarizer.binarize(
                input_file,
                vocab,
                lambda t: ds.add_item(t),
                tokenize=tokenization.json_tokenizer,
                offset=0,
                end=offsets[1],
                append_eos=False,
            )
        )
        if num_workers > 1:  # p1-pN
            pool.join()
            # merge sub-processors' index and data files into final files and delete them.
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))
        LOGGER.info(
            "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
                attr,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
            )
        )

    def make_graph_binary_dataset(vocab: Dictionary, input_file, output_file):
        import torch
        from dgl.data.graph_serialize import GraphData
        from dgl.data.utils import save_graphs
        from tqdm import tqdm

        graph_batch, ids = [], []
        with open(input_file, 'r') as reader:
            num_lines = sum(1 for _ in reader)
            reader.seek(0)
            for idx, line in tqdm(enumerate(reader), total=num_lines):
                ast = ujson.loads(line)
                graph = tree2dgl(ast, vocab)
                graph = GraphData.create(graph)
                graph_batch.append(graph)
                ids.append(idx)
        graph_labels = {"glabel": torch.IntTensor(ids)}
        save_graphs(output_file + '.mmap', graph_batch, graph_labels)

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            in_file = file_name(input_prefix, lang)
            out_dir = args['preprocess']['destdir']
            os.makedirs(out_dir, exist_ok=True)
            LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
            shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
        else:
            in_file = file_name(input_prefix, lang)
            out_file = dest_path(output_prefix, lang)
            os.makedirs(os.path.dirname(out_file), exist_ok=True)
            make_binary_dataset(vocab, in_file, out_file, lang, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang, num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
    if target:
        make_all(args['preprocess']['target_lang'], tgt_dict)
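# --- Illustrative sketch (not part of the original scripts) --------------------
# Every non-"raw" branch ends up writing two files per split: a flat binary
# buffer of token ids ('.mmap') and an index ('.idx') recording where each
# sequence starts and how long it is, so sequences can be memory-mapped and
# sliced without re-parsing text. Below is a minimal, hypothetical version of
# that layout using struct; the real indexed_dataset builder uses its own format.
import struct

def write_indexed_dataset(sequences, prefix):
    """Write <prefix>.mmap (uint32 ids) and <prefix>.idx ((offset, length) pairs)."""
    offset = 0
    with open(prefix + '.mmap', 'wb') as data_f, open(prefix + '.idx', 'wb') as idx_f:
        idx_f.write(struct.pack('<Q', len(sequences)))          # header: number of sequences
        for ids in sequences:
            data_f.write(struct.pack(f'<{len(ids)}I', *ids))
            idx_f.write(struct.pack('<QQ', offset, len(ids)))   # start (in items) and length
            offset += len(ids)

def read_indexed_dataset(prefix):
    """Yield each stored sequence as a list of ids."""
    with open(prefix + '.idx', 'rb') as idx_f, open(prefix + '.mmap', 'rb') as data_f:
        (n,) = struct.unpack('<Q', idx_f.read(8))
        for _ in range(n):
            start, length = struct.unpack('<QQ', idx_f.read(16))
            data_f.seek(start * 4)
            yield list(struct.unpack(f'<{length}I', data_f.read(length * 4)))

# write_indexed_dataset([[1, 2, 3], [4, 5]], '/tmp/example.train')
# print(list(read_indexed_dataset('/tmp/example.train')))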