def _cat_and_remove(tgt_filename, num_workers):
    with file_io.open(tgt_filename, 'w') as writer:
        for idx in range(num_workers):
            src_filename = tgt_filename + str(idx)
            with file_io.open(src_filename, 'r') as reader:
                PathManager.copyfileobj(reader, writer)
            PathManager.rm(src_filename)
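A minimal stdlib sketch of the same shard-merge pattern, assuming plain local files in place of file_io/PathManager; merge_worker_shards is a hypothetical helper name.

import os
import shutil

def merge_worker_shards(base, num_workers):
    # each worker wrote `base + str(idx)`; concatenate shards in worker order
    with open(base, 'w') as writer:
        for idx in range(num_workers):
            shard = base + str(idx)
            with open(shard, 'r') as reader:
                shutil.copyfileobj(reader, writer)
            os.remove(shard)  # delete the shard once it has been merged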
def _save(self, f, kv_iterator):
    if isinstance(f, str):
        PathManager.mkdir(os.path.dirname(f))
        with file_io.open(f, "w") as fd:
            return self.save(fd)
    for k, v in kv_iterator:
        print(json_io.json_dumps([k, v]), file=f)
def docstring_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """docstring_tokens => filtered docstring_tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring_tokens = json_io.json_loads(line)
            if docstring_tokens:
                docstring_tokens = [
                    token for token in docstring_tokens
                    if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                ]
                if not all(str.isascii(token) for token in docstring_tokens):
                    docstring_tokens = None
                if (docstring_tokens is None) or not (3 < len(docstring_tokens) <= 50):
                    docstring_tokens = None
            else:
                docstring_tokens = None
            print(json_io.json_dumps(docstring_tokens), file=writer)
            line = safe_readline(reader)
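For reference, a self-contained check of the token filter above; keep_token and the sample tokens are illustrative only.

import re

def keep_token(token):
    # drop separator runs like '=====' and HTML-ish tags like '<p>'
    return not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))

tokens = ['Returns', 'the', 'sum', '=====', '<p>', 'of', 'two', 'numbers']
assert [t for t in tokens if keep_token(t)] == ['Returns', 'the', 'sum', 'of', 'two', 'numbers']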
def save_lang_dict():
    src_file = PathManager.expanduser("~/clcdsa/astbert/data-mmap/lang.jsonl")
    lang_dict = Dictionary.load(src_file)
    tgt_file = os.path.join(args['preprocess']['destdir'], 'lang.jsonl')
    PathManager.mkdir(os.path.dirname(tgt_file))
    lang_dict.save(tgt_file)
    return lang_dict
def main():
    from dataset.py150 import (RAW_DIR, ATTRIBUTES_DIR)
    from ncc.utils.path_manager import PathManager

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', type=str, default=RAW_DIR)
    parser.add_argument('--output_dir', type=str, default=ATTRIBUTES_DIR)
    parser.add_argument('--valid_p', type=float, default=0.2)
    parser.add_argument('--max_path_length', type=int, default=8)
    parser.add_argument('--max_path_width', type=int, default=2)
    # NOTE: type=bool does not parse strings as expected: bool("False") is True
    parser.add_argument('--use_method_name', type=bool, default=True)
    parser.add_argument('--use_nums', type=bool, default=True)
    parser.add_argument('--n_jobs', type=int, default=multiprocessing.cpu_count())
    parser.add_argument('--seed', type=int, default=239)
    args = parser.parse_args()

    np.random.seed(args.seed)

    data_dir = Path(args.data_dir)
    trains = __collect_asts(data_dir / 'python100k_train.json')
    evals = __collect_asts(data_dir / 'python50k_eval.json')

    train, valid = sklearn_model_selection.train_test_split(
        trains,
        test_size=args.valid_p,
    )
    test = evals

    output_dir = Path(args.output_dir)
    PathManager.mkdir(output_dir)
    for split_name, split in zip(('train', 'valid', 'test'), (train, valid, test)):
        output_file = output_dir / f'{split_name}.method_path'
        __collect_all_and_save(split, args, output_file)
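The type=bool caveat noted in the comment above, in one line: argparse applies bool() to the raw argument string, so any non-empty value parses as True.

assert bool("False") is True  # with type=bool, False cannot be expressed from the CLI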
def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code_tokens => filtered code_tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code_tokens = json_io.json_loads(line)
            if code_tokens:
                # filter comments in code_tokens, e.g. //***\n, /* */\n
                code_tokens = [
                    token for token in code_tokens
                    if not (str.startswith(token, '//') or str.startswith(token, '#') or
                            (str.startswith(token, '/*') and str.endswith(token, '*/')))
                ]
                if not all(str.isascii(token) for token in code_tokens):
                    code_tokens = None
                if code_tokens is None or len(code_tokens) < 1:
                    code_tokens = None
            else:
                code_tokens = None
            print(json_io.json_dumps(code_tokens), file=writer)
            line = safe_readline(reader)
def make_all(lang, vocab):
    # copy the shared dict into each language's directory
    for l in os.listdir(args['preprocess']['trainpref'].split('*')[0]):
        out_dir = os.path.join(args['preprocess']['destdir'], l)
        PathManager.mkdir(out_dir)
        dst_dict = os.path.join(out_dir, f'{lang}.dict.jsonl')
        PathManager.copy(dict_path(lang), dst_dict)

        if args['preprocess']['trainpref']:
            out_file = os.path.join(out_dir, f"train.{lang}")
            make_dataset(vocab, args['preprocess']['trainpref'].replace('*', l), "train", lang,
                         out_file=out_file, num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            out_file = os.path.join(out_dir, f"valid.{lang}")
            make_dataset(vocab, args['preprocess']['validpref'].replace('*', l), 'valid', lang,
                         out_file=out_file, num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            out_file = os.path.join(out_dir, f"test.{lang}")
            make_dataset(vocab, args['preprocess']['testpref'].replace('*', l), 'test', lang,
                         out_file=out_file, num_workers=args['preprocess']['workers'])
def make_dataset(vocab, aux_dict, input_prefix, output_prefix, lang, max_path_num, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        raise NotImplementedError
    else:
        in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, aux_dict, in_file, out_file, lang, max_path_num, num_workers)
def _concate(_tgt_filename, num_workers, tgt_filename):
    src_filenames = [_tgt_filename + str(idx) for idx in range(num_workers)]
    with file_io.open(tgt_filename, 'w') as writer:
        for _src_fl in src_filenames:
            with file_io.open(_src_fl, 'r') as reader:
                shutil.copyfileobj(reader, writer)
            PathManager.rm(_src_fl)
def recursive_expanduser(obj):
    if isinstance(obj, dict):
        for key, value in obj.items():
            obj[key] = recursive_expanduser(value)
    elif isinstance(obj, str) and obj.startswith('~/'):
        obj = PathManager.expanduser(obj)
    elif isinstance(obj, list):
        for i, val in enumerate(obj):
            if isinstance(val, str) and val.startswith('~/'):
                obj[i] = PathManager.expanduser(val)
    return obj
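A fully recursive stdlib variant for illustration, using os.path.expanduser in place of PathManager.expanduser; expand_paths is a hypothetical name, and unlike the function above it also recurses into nested list elements.

import os

def expand_paths(obj):
    if isinstance(obj, dict):
        return {k: expand_paths(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [expand_paths(v) for v in obj]
    if isinstance(obj, str) and obj.startswith('~/'):
        return os.path.expanduser(obj)
    return obj

config = {'data_dir': '~/clcdsa/raw', 'splits': ['~/a.jsonl', '~/b.jsonl'], 'workers': 4}
config = expand_paths(config)  # '~/...' strings become absolute home paths; ints are untouched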
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        in_file = file_name(input_prefix, lang)
        out_dir = args['preprocess']['destdir']
        PathManager.mkdir(out_dir)
        LOGGER.info('Copying {} into {}'.format(in_file, out_dir))
        shutil.copy(src=in_file, dst=args['preprocess']['destdir'])
    else:
        in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, in_file, out_file, num_workers)
def main(args):
    # task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = TransformersDictionary.from_pretrained('microsoft/codebert-base', do_lower_case=False)

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    def parse_source_input(code):
        code_tokens = vocab.tokenize(code)
        # truncating
        code_tokens = code_tokens[:config.MAX_SOURCE_LENGTH - 2]
        source_tokens = [vocab.cls_token] + code_tokens + [vocab.sep_token]
        source_ids = vocab.convert_tokens_to_ids(source_tokens)
        source_size = len(source_tokens)
        source_mask = [1] * source_size
        padding_length = config.MAX_SOURCE_LENGTH - len(source_ids)
        source_ids += [vocab.pad()] * padding_length
        source_mask += [0] * padding_length
        return [source_ids, source_mask, source_size]

    def parse_target_input(code):
        target_tokens = vocab.tokenize(code)[:config.MAX_TARGET_LENGTH - 2]
        target_tokens = [vocab.cls_token] + target_tokens + [vocab.sep_token]
        target_ids = vocab.convert_tokens_to_ids(target_tokens)
        target_size = len(target_ids)
        target_mask = [1] * target_size
        padding_length = config.MAX_TARGET_LENGTH - len(target_ids)
        target_ids += [vocab.pad_token_id] * padding_length
        target_mask += [0] * padding_length
        return [target_ids, target_mask, target_size]

    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.pkl")
        PathManager.mkdir(os.path.dirname(dst_file))
        with file_io.open(src_file, 'r') as reader:
            keys = ['code', 'src_tokens', 'src_masks', 'src_sizes',
                    'tgt_tokens', 'tgt_masks', 'tgt_sizes']
            data = {key: [] for key in keys}
            for line in reader:
                src_code = json_io.json_loads(line)
                # src_code = SPACE_SPLITTER.sub(" ", line)
                # source_ids, source_mask
                src_line = parse_source_input(src_code)
                # target_ids, target_mask
                tgt_line = parse_target_input(src_code)
                for key, src in zip(keys, [src_code] + src_line + tgt_line):
                    data[key].append(src)
            file_io.open(dst_file, mode='wb', data=data)
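A self-contained sketch of the CLS/SEP-plus-padding scheme used by parse_source_input above; the integer ids (CLS=0, PAD=1, SEP=2) and the max length of 8 are assumptions for illustration, not the real codebert values.

MAX_SOURCE_LENGTH = 8
CLS, PAD, SEP = 0, 1, 2

def pad_source(token_ids):
    ids = [CLS] + token_ids[:MAX_SOURCE_LENGTH - 2] + [SEP]
    mask = [1] * len(ids)                  # 1 = real token
    padding = MAX_SOURCE_LENGTH - len(ids)
    return ids + [PAD] * padding, mask + [0] * padding  # 0 = padding

ids, mask = pad_source([11, 12, 13])
assert ids == [CLS, 11, 12, 13, SEP, PAD, PAD, PAD]
assert mask == [1, 1, 1, 1, 1, 0, 0, 0]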
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        # TODO: parse json to a txt file, one traversal per line, in parallel
        """
        Because only 1 thread is allowed to write the file, we use
        multi-processing to transform the data, merge the results from the
        CPUs into a block, and then dump that block.
        """
        def _func(line):
            line = py150_util.separate_dps(
                json_io.json_loads(line.strip()), args['preprocess']['n_ctx'])
            line = [py150_util.get_dfs(ast) + [ext] for ast, ext in line if len(ast) > 1]
            # line = [json.dumps([py150_utils.get_dfs(ast), ext]) for ast, ext in line if len(ast) > 1]
            return line

        with PPool() as thread_pool:
            with file_io.open(file_name(input_prefix, lang), 'r') as f, \
                    file_io.open(dest_path(output_prefix, lang), 'w') as fout:

                def _write(result):
                    for res in itertools.chain(*result):
                        print(json_io.json_dumps(res), file=fout)

                batch_data = []
                for line in f:
                    batch_data.append(line)
                    if len(batch_data) >= MAX_BATCH_SIZE:
                        result = thread_pool.feed(_func, batch_data, one_params=True)
                        _write(result)
                        del batch_data
                        batch_data = []

                if len(batch_data) > 0:
                    result = thread_pool.feed(_func, batch_data, one_params=True)
                    _write(result)
                    del batch_data
    else:
        if lang == 'code_types':
            in_file = file_name(input_prefix, 'ast')
        else:
            in_file = file_name(input_prefix, lang)
        out_file = dest_path(output_prefix, lang)
        PathManager.mkdir(os.path.dirname(out_file))
        make_binary_dataset(vocab, in_file, out_file, lang, num_workers)
def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
    if args['preprocess']['dataset_impl'] == "raw":
        raise NotImplementedError
    else:
        languages = [
            os.path.basename(d)
            for d in PathManager.ls(os.path.dirname(input_prefix))
        ]
        for l in languages:
            in_file = file_name(input_prefix, lang)
            in_file = str.replace(in_file, '*', l)
            out_file = dest_path(os.path.join(l, output_prefix), lang)
            PathManager.mkdir(os.path.dirname(out_file))
            make_binary_dataset(vocab, in_file, out_file, num_workers)
def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """docstring => docstring (pass-through)"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring = json_io.json_loads(line)
            print(json_io.json_dumps(docstring), file=writer)
            line = safe_readline(reader)
def cast_code(raw_code_file, refined_code_file, dst_file):
    with file_io.open(raw_code_file, 'r') as raw_reader:
        raw_codes = {}
        for line in raw_reader:
            raw_code = line
            raw_code = raw_code[raw_code.find('def '):]
            func_name = raw_code[:raw_code.find('(')][4:].strip()
            raw_codes[func_name] = line.rstrip('\n')

    PathManager.mkdir(os.path.dirname(dst_file))
    with file_io.open(refined_code_file, 'r') as refined_reader, file_io.open(dst_file, 'w') as writer:
        for line in refined_reader:
            func_name = line[line.find('def '):].split()[1]
            raw_code = raw_codes[func_name]
            print(raw_code, file=writer)
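The `def`-name extraction used in cast_code, isolated for clarity; extract_func_name is a hypothetical helper, assuming one definition per line.

def extract_func_name(code_line):
    code = code_line[code_line.find('def '):]
    return code[:code.find('(')][4:].strip()  # text between 'def ' and '('

assert extract_func_name("def add(a, b): return a + b") == "add"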
def load_raw_data(data_dir, load_keys):
    raw_data = {}
    for mode in constants.MODES:
        for key in load_keys:
            mode_data_dir = os.path.join(data_dir, key, '{}.*'.format(mode))
            jsonl_gz_files = PathManager.ls(mode_data_dir)
            # NOTE: raw_data[mode] is overwritten on each iteration, so only
            # the last key in load_keys is kept
            raw_data[mode] = list(load_jsonl_gzs(jsonl_gz_files))
    return raw_data
def flatten_attrs(raw_file, flatten_dir, lang, attrs):
    def _get_file_info(filename):
        """get mode from file name"""
        filename = os.path.split(filename)[-1]
        mode = filename[:str.rfind(filename, '.jsonl')]
        return mode

    mode = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, lang, f'{mode}.{attr}')
        PathManager.mkdir(os.path.dirname(attr_file))
        attr_writers[attr] = file_io.open(attr_file, 'w')
    print('raw_file: ', raw_file)
    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
    # close the per-attribute writers explicitly
    for writer in attr_writers.values():
        writer.close()
def flatten(raw_dir, lang, mode, flatten_dir, attrs, num_cores):
    """flatten attributes of raw data"""
    LOGGER.info('Cast attributes({}) of {}-{} dataset'.format(attrs, lang, mode))
    with Pool(num_cores) as mpool:
        result = [
            mpool.apply_async(flatten_attrs, (raw_file, flatten_dir, lang, mode, set(attrs)))
            for raw_file in PathManager.ls(os.path.join(raw_dir, lang, mode, '*.jsonl.gz'))
        ]
        result = [res.get() for res in result]
def flatten_attrs(raw_file, flatten_dir, lang, mode, attrs):
    def _get_file_info(filename):
        """get the file index from the file name"""
        filename = os.path.split(filename)[-1]
        filename = filename[:str.rfind(filename, '.jsonl.gz')]
        _, _, idx = filename.split('_')
        return idx

    idx = _get_file_info(raw_file)
    attr_writers = {}
    for attr in attrs:
        attr_dir = os.path.join(flatten_dir, lang, mode, attr)
        PathManager.mkdir(attr_dir)
        attr_file = os.path.join(attr_dir, '{}.jsonl'.format(idx))
        attr_writers[attr] = file_io.open(attr_file, 'w')

    with file_io.open(raw_file, 'r') as reader:
        for line in reader:
            code_snippet = json_io.json_loads(line)
            for attr, info in code_snippet.items():
                if attr in attr_writers:
                    print(json_io.json_dumps(info), file=attr_writers[attr])
    # close the per-attribute writers explicitly
    for writer in attr_writers.values():
        writer.close()
def merge_attr_files(flatten_dir, lang, mode, attrs):
    """shell cat"""
    def _merge_files(src_files, tgt_file):
        with file_io.open(tgt_file, 'w') as writer:
            for src_fl in src_files:
                with file_io.open(src_fl, 'r') as reader:
                    shutil.copyfileobj(reader, writer)

    def _get_file_idx(filename):
        filename = os.path.split(filename)[-1]
        idx = int(filename[:str.rfind(filename, '.json')])
        return idx

    for attr in attrs:
        attr_files = PathManager.ls(os.path.join(flatten_dir, lang, mode, attr, '*.jsonl'))
        attr_files = sorted(attr_files, key=_get_file_idx)
        assert len(attr_files) > 0, RuntimeError('Attribute({}) files do not exist.'.format(attr))
        dest_file = os.path.join(flatten_dir, lang, '{}.{}'.format(mode, attr))
        _merge_files(attr_files, dest_file)
    PathManager.rm(os.path.join(flatten_dir, lang, mode))
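The numeric sort key in merge_attr_files matters because a lexicographic sort would interleave shards ('10.jsonl' before '2.jsonl'); a quick demonstration on sample names:

files = ['10.jsonl', '2.jsonl', '1.jsonl']
files.sort(key=lambda f: int(f[:f.rfind('.json')]))
assert files == ['1.jsonl', '2.jsonl', '10.jsonl']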
def initialize_from_checkpoint(args, model):
    if args['checkpoint'].get('init_checkpoint', False) and PathManager.exists(args['checkpoint']['init_checkpoint']):
        with open(args['checkpoint']['init_checkpoint'], 'rb') as reader:
            state = torch.load(reader)
        pretrained_params = state['model']
        del state
        init_params = model.state_dict()
        for module_name, module_param in pretrained_params.items():
            if module_name in init_params:
                if init_params[module_name].data.size() == module_param.data.size():
                    init_params[module_name].data.copy_(module_param.data)
                else:
                    # embedding
                    token_num = module_param.size(0)
                    # init token embedding
                    init_params[module_name].data[:token_num, ...].copy_(module_param.data[:token_num, ...])
        LOGGER.info(f"Restore parameters from {args['checkpoint']['init_checkpoint']}.")
    else:
        LOGGER.info(f"{args['checkpoint']['init_checkpoint']} does not exist.")
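A minimal sketch of the partial-restore rule above: same-shaped tensors are copied whole, while mismatched ones (e.g. a grown embedding table) have only the overlapping leading rows copied. partial_copy and the sizes are illustrative assumptions.

import torch

def partial_copy(dst, src):
    if dst.size() == src.size():
        dst.copy_(src)
    else:
        n = src.size(0)                   # number of pretrained rows
        dst[:n, ...].copy_(src[:n, ...])  # restore the overlap only

old_emb = torch.randn(100, 16)  # pretrained embedding, vocab of 100
new_emb = torch.zeros(120, 16)  # fine-tuning vocab grew to 120
partial_copy(new_emb, old_emb)  # rows 0..99 restored, 100..119 stay zero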
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    dictionary = save_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    lang = args['preprocess']['lang']
    for mode in MODES:
        file = f"{args['preprocess'][f'{mode}pref']}.code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code")
        PathManager.mkdir(os.path.dirname(dst_file))
        dataset = indexed_dataset.make_builder(f"{dst_file}_tokens.mmap",
                                               impl='mmap', vocab_size=len(vocab))
        with file_io.open(file, 'r') as reader:
            data = {'code': []}
            for line in reader:
                line = json_io.json_loads(line)
                code = SPACE_SPLITTER.sub(" ", line)
                data['code'].append(code)
                code_tokens = vocab.encode(code, out_type=str)
                code_tokens = torch.IntTensor([dictionary.index(token) for token in code_tokens])
                # code_tokens = torch.IntTensor(vocab.encode_as_ids(code))
                dataset.add_item(code_tokens)
            dataset.finalize(f"{dst_file}_tokens.idx")
            # proj indices
            # cp id
            data['proj_indices'] = [1] * len(data['code'])
            file_io.open(f"{dst_file}.pkl", mode='wb', data=data)
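A hypothetical round trip through the SentencePiece model loaded above (the model path is a placeholder); each subword piece is then mapped to an id via the Dictionary built in save_dict.

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load('sentencepiece.bpe.model')  # placeholder path
pieces = sp.encode('def add(a, b): return a + b', out_type=str)
# `pieces` is a list of subword strings; ids come from dictionary.index(piece)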
def xfg(src_dir, languages, dst_dir):
    xfg_src_files = PathManager.ls(os.path.join(src_dir, "kernels_ir", '*.ll'))
    filenames = []
    ir_data = []
    for filename in xfg_src_files:
        filenames.append(os.path.basename(filename)[:-3])
        with open(filename, 'r') as reader:
            lines = reader.read().splitlines()
            ir_data.append(lines)
    # convert list to dict
    filenames = {name: idx for idx, name in enumerate(filenames)}

    processed_data, _ = inst2vec_preprocess.preprocess(ir_data)
    processed_data, _ = task_utils.inline_struct_types_txt(processed_data, ir_data)
    processed_data = task_utils.abstract_statements_from_identifiers_txt(processed_data)
    for idx, lines in enumerate(processed_data):
        processed_data[idx] = [
            line for line in lines
            if not re.match(r'((?:<label>:)?(<LABEL>):|; <label>:<LABEL>)', line)
        ]

    for lang in languages:
        raw_file = os.path.join(src_dir, f'{lang}.csv')
        # read the raw csv file to load the corresponding benchmarks
        data_frame = pd.read_csv(raw_file)
        benchmarks = data_frame["benchmark"].values.tolist()
        datasets = data_frame["dataset"].values.tolist()
        del data_frame
        # write
        dst_file = os.path.join(dst_dir, lang, 'train.xfg')
        with open(dst_file, 'w') as writer:
            for idx, (bm, ds) in enumerate(zip(benchmarks, datasets)):
                if bm[:3] == "npb":
                    bm += f'_{ds}'
                xfg = processed_data[filenames[bm]]
                print(json_io.json_dumps(xfg), file=writer)
def __init__(self, SO_FILE, LANGUAGE, to_lower=False, operators_file=None):
    self.parser = Parser()
    # download the parser if the .so file is missing
    try:
        if not PathManager.exists(SO_FILE):
            raise FileExistsError(
                f"{SO_FILE} does not exist; automatically download TreeSitter parse file {LANGUAGE}.so."
            )
    except FileExistsError as err:
        LOGGER.warning(err)
        from ncc.hub.tree_sitter.download import download
        download(LANGUAGE)

    if LANGUAGE == 'csharp':
        LANGUAGE = 'c_sharp'
    self.parser.set_language(Language(SO_FILE, LANGUAGE))
    self.LANGUAGE = LANGUAGE
    self.to_lower = to_lower

    if operators_file is None:
        operators_file = os.path.join(os.path.dirname(__file__), 'operators.json')
    with open(operators_file, 'r') as reader:
        self.operators = json_io.json_load(reader)
def load_model_ensemble_and_task(filenames, arg_overrides=None, task=None, strict=True, suffix=''):
    from ncc import tasks

    ensemble = []
    for filename in filenames:
        filename = filename.replace(".pt", suffix + ".pt")
        if not PathManager.exists(filename):
            raise IOError("Model file not found: {}".format(filename))
        state = load_checkpoint_to_cpu(filename, arg_overrides)
        args = state["args"]
        if task is None:
            task = tasks.setup_task(args)
        # build model for ensemble
        model = task.build_model(args)
        model.load_state_dict(state["model"], strict=strict, args=args)
        ensemble.append(model)
    return ensemble, args, task
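A hypothetical call site for load_model_ensemble_and_task; the checkpoint paths are placeholders.

models, args, task = load_model_ensemble_and_task(
    ['checkpoints/run1/checkpoint_best.pt', 'checkpoints/run2/checkpoint_best.pt'],
)
for model in models:
    model.eval()  # ensembles are typically used for inference only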
def single_main(args, init_distributed=False):
    assert args['dataset']['max_tokens'] is not None or args['dataset']['max_sentences'] is not None, \
        'Must specify batch size either with --max-tokens or --max-sentences'
    metrics.reset()

    # 0. Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args['common']['cpu']:
        torch.cuda.set_device(args['distributed_training']['device_id'])
    set_seed.set_seed(args['common']['seed'])
    if init_distributed:
        args['distributed_training']['distributed_rank'] = distributed_utils.distributed_init(args)

    # Verify checkpoint directory
    if distributed_utils.is_master(args):
        save_dir = args['checkpoint']['save_dir']
        checkpoint_utils.verify_checkpoint_directory(save_dir)
        PathManager.rm(os.path.join(save_dir, '*.pt'))  # this code will remove pre-trained models

    # 1. Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # 2. Load valid dataset (we load training data below, based on the latest checkpoint)
    task.load_dataset(args['dataset']['valid_subset'], combine=False, epoch=1)

    # 3. Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    LOGGER.info(model)
    LOGGER.info('model {}, criterion {}'.format(args['model']['arch'], criterion.__class__.__name__))
    LOGGER.info('num. model params: {} (num. trained: {})'.format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))

    # 4. Build trainer
    trainer = Trainer(args, task, model, criterion)
    LOGGER.info('training on {} GPUs'.format(args['distributed_training']['distributed_world_size']))
    LOGGER.info('max tokens per GPU = {} and max sentences per GPU = {}'.format(
        args['dataset']['max_tokens'],
        args['dataset']['max_sentences'],
    ))

    # 5. Load the latest checkpoint if one is available and restore the corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer, combine=False)

    # 6. Train until the learning rate gets too small
    max_epoch = args['optimization']['max_epoch'] or math.inf
    max_update = args['optimization']['max_update'] or math.inf
    lr = trainer.get_lr()
    train_meter = meters.StopwatchMeter()
    train_meter.start()
    valid_subsets = args['dataset']['valid_subset'].split(',')
    while (lr > args['optimization']['min_lr']
           and epoch_itr.next_epoch_idx <= max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr)

        if not args['dataset']['disable_validation'] and epoch_itr.epoch % args['dataset']['validate_interval'] == 0:
            valid_losses = validate(args, trainer, task, epoch_itr, valid_subsets)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args['checkpoint']['save_interval'] == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr, valid_losses[0])

        # early stop
        if should_stop_early(args, valid_losses[0]):
            LOGGER.info('early stop since valid performance hasn\'t improved for last {} runs'
                        .format(args['checkpoint']['patience']))
            break

        epoch_itr = trainer.get_train_iterator(
            epoch_itr.next_epoch_idx,
            combine=False,  # TODO to be checked
            # sharded data: get train iterator for next epoch
            load_dataset=(os.pathsep in args['task']['data']),
        )
    train_meter.stop()
    LOGGER.info('done training in {:.1f} seconds'.format(train_meter.sum))
def main(args):
    task = tasks.get_task(args['preprocess']['task'])
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    os.makedirs(args['preprocess']['destdir'], exist_ok=True)

    def train_path(lang):
        return "{}{}".format(args['preprocess']['trainpref'], ("." + lang) if lang else "")

    def valid_path(lang):
        return "{}{}".format(args['preprocess']['validpref'], ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args['preprocess']['destdir'], file_name(prefix, lang))

    def dict_path(lang):
        return dest_path(lang, "dict") + ".jsonl"

    if not args['preprocess']['srcdict'] and os.path.exists(dict_path(args['preprocess']['source_lang'])):
        raise FileExistsError(dict_path(args['preprocess']['source_lang']))

    if args['preprocess']['only_train']:
        LOGGER.info('Generating dictionaries with Train data files.')
    else:
        LOGGER.info('Generating dictionaries with Train/Validation data files.')

    if args['preprocess']['srcdict']:
        src_dict = task.load_dictionary(args['preprocess']['srcdict'])
    else:
        assert args['preprocess']['trainpref'], "--trainpref must be set if --srcdict is not specified"
        filenames = PathManager.ls(train_path(args['preprocess']['source_lang']))
        if not args['preprocess']['only_train']:
            filenames.extend(PathManager.ls(valid_path(args['preprocess']['source_lang'])))
        src_dict = task.build_dictionary(
            filenames,
            tokenize_func=tokenization.json_tokenizer,
            workers=args['preprocess']['workers'],
            threshold=args['preprocess']['threshold'],
            nwords=args['preprocess']['nwordssrc'],
            padding_factor=args['preprocess']['padding_factor'],
            bos=None,
            eos=None,
        )

    src_dict.save(dict_path(args['preprocess']['source_lang']))  # save spm dict to ncc.dictionary
    # copy the shared dict into each language's data directory
    for d in PathManager.ls(os.path.dirname(args['preprocess']['trainpref'])):
        lang = os.path.basename(d)
        src_dict.save(os.path.join(args['preprocess']['destdir'], lang,
                                   f"{args['preprocess']['source_lang']}.dict.jsonl"))

    # 2. ***************build dataset********************
    def make_binary_dataset(vocab, input_file, output_file, num_workers):
        """make binary dataset"""
        # LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()  # save un-recorded tokens

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        # split a file into different parts
        # if using multi-processing, we first process the 2nd to last chunks
        # 1.txt -> 10 processors, 0(p0)(0-99), 100(p1)(100-199), ...
        offsets = file_io.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, input_file, vocab, prefix, offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result,
                )
            pool.close()

        # process the first chunk in the main process if multi-processing is
        # available; otherwise, process the whole file
        # p0 -> 0,end
        ds_file = '{}.mmap'.format(output_file)
        ds = indexed_dataset.make_builder(ds_file,
                                          impl=args['preprocess']['dataset_impl'],
                                          vocab_size=len(vocab))

        def consumer(data, _):
            ds.add_item(data)

        merge_result(
            Binarizer.binarize_seperate(
                input_file,
                vocab,
                consumer,
                tokenize=string2tokens,
                offset=0,
                end=offsets[1],
                append_eos=False,
            ))
        if num_workers > 1:
            # p1-pN
            pool.join()
            # merge sub-processors' index and data files into the final files and delete them
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(output_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize('{}.idx'.format(output_file))

        LOGGER.info("{}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            # attr,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        ))

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1):
        if args['preprocess']['dataset_impl'] == "raw":
            raise NotImplementedError
        else:
            languages = [
                os.path.basename(d)
                for d in PathManager.ls(os.path.dirname(input_prefix))
            ]
            for l in languages:
                in_file = file_name(input_prefix, lang)
                in_file = str.replace(in_file, '*', l)
                out_file = dest_path(os.path.join(l, output_prefix), lang)
                PathManager.mkdir(os.path.dirname(out_file))
                make_binary_dataset(vocab, in_file, out_file, num_workers)

    def make_all(lang, vocab):
        if args['preprocess']['trainpref']:
            make_dataset(vocab, args['preprocess']['trainpref'], "train", lang,
                         num_workers=args['preprocess']['workers'])
        if args['preprocess']['validpref']:
            for k, validpref in enumerate(args['preprocess']['validpref'].split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(vocab, validpref, outprefix, lang,
                             num_workers=args['preprocess']['workers'])
        if args['preprocess']['testpref']:
            for k, testpref in enumerate(args['preprocess']['testpref'].split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(vocab, testpref, outprefix, lang,
                             num_workers=args['preprocess']['workers'])

    make_all(args['preprocess']['source_lang'], src_dict)
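A stdlib sketch of the offset-splitting idea behind find_offsets above: divide the file into num_chunks byte ranges aligned to line boundaries, so each worker can seek() to its start offset and stop once tell() passes its end offset. find_line_offsets is a hypothetical stand-in, not the library implementation.

import os

def find_line_offsets(path, num_chunks):
    size = os.path.getsize(path)
    offsets = [0]
    with open(path, 'rb') as f:
        for i in range(1, num_chunks):
            f.seek(size * i // num_chunks)
            f.readline()              # advance to the next line boundary
            offsets.append(f.tell())
    offsets.append(size)              # chunk i spans offsets[i]..offsets[i+1]
    return offsets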
def save_checkpoint(args, trainer, epoch_itr, val_loss):
    from ncc import meters
    from ncc.utils import distributed_utils

    prev_best = getattr(save_checkpoint, "best", val_loss)
    if val_loss is not None:
        best_function = max if args['checkpoint']['maximize_best_checkpoint_metric'] else min
        save_checkpoint.best = best_function(val_loss, prev_best)

    if args['checkpoint']['no_save'] or not distributed_utils.is_master(args):
        return

    def is_better(a, b):
        return a >= b if args['checkpoint']['maximize_best_checkpoint_metric'] else a <= b

    write_timer = meters.StopwatchMeter()
    write_timer.start()

    epoch = epoch_itr.epoch
    end_of_epoch = epoch_itr.end_of_epoch()
    updates = trainer.get_num_updates()

    checkpoint_conds = collections.OrderedDict()
    checkpoint_conds["checkpoint{}.pt".format(epoch)] = (
        end_of_epoch and not args['checkpoint']['no_epoch_checkpoints']
        and epoch % args['checkpoint']['save_interval'] == 0)
    checkpoint_conds["checkpoint_{}_{}.pt".format(epoch, updates)] = (
        not end_of_epoch and args['checkpoint']['save_interval_updates'] > 0
        and updates % args['checkpoint']['save_interval_updates'] == 0)
    checkpoint_conds["checkpoint_best.pt"] = val_loss is not None and (
        not hasattr(save_checkpoint, "best") or is_better(val_loss, save_checkpoint.best))
    if val_loss is not None and args['checkpoint']['keep_best_checkpoints'] > 0:
        checkpoint_conds["checkpoint.best_{}_{:.2f}.pt".format(
            args['checkpoint']['best_checkpoint_metric'], val_loss)] = (
                not hasattr(save_checkpoint, "best") or is_better(val_loss, save_checkpoint.best))
    checkpoint_conds["checkpoint_last.pt"] = not args['checkpoint']['no_last_checkpoints']

    extra_state = {"train_iterator": epoch_itr.state_dict(), "val_loss": val_loss}
    if hasattr(save_checkpoint, "best"):
        extra_state.update({"best": save_checkpoint.best})

    checkpoints = [
        os.path.join(args['checkpoint']['save_dir'], fn)
        for fn, cond in checkpoint_conds.items() if cond
    ]
    if len(checkpoints) > 0:
        trainer.save_checkpoint(checkpoints[0], extra_state)
        for cp in checkpoints[1:]:
            PathManager.copy(checkpoints[0], cp)

        write_timer.stop()
        LOGGER.info("saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {:.6f} seconds)"
                    .format(checkpoints[0], epoch, updates, val_loss, write_timer.sum))

    if not end_of_epoch and args['checkpoint']['keep_interval_updates'] > 0:
        # remove old checkpoints; checkpoints are sorted in descending order
        checkpoints = checkpoint_paths(args['checkpoint']['save_dir'],
                                       pattern=r"checkpoint_\d+_(\d+)\.pt")
        for old_chk in checkpoints[args['checkpoint']['keep_interval_updates']:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)

    if args['checkpoint']['keep_last_epochs'] > 0:
        # remove old epoch checkpoints; checkpoints are sorted in descending order
        checkpoints = checkpoint_paths(args['checkpoint']['save_dir'],
                                       pattern=r"checkpoint(\d+)\.pt")
        for old_chk in checkpoints[args['checkpoint']['keep_last_epochs']:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)

    if args['checkpoint']['keep_best_checkpoints'] > 0:
        # only keep the best N checkpoints according to validation metric
        checkpoints = checkpoint_paths(
            args['checkpoint']['save_dir'],
            pattern=r"checkpoint\.best_{}_(\d+\.?\d*)\.pt".format(
                args['checkpoint']['best_checkpoint_metric']))
        if not args['checkpoint']['maximize_best_checkpoint_metric']:
            checkpoints = checkpoints[::-1]
        for old_chk in checkpoints[args['checkpoint']['keep_best_checkpoints']:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)
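A small stdlib sketch of the epoch-checkpoint pruning that keep_last_epochs triggers above: find checkpoint<N>.pt files, order by epoch descending, and delete everything past the newest K. prune_epoch_checkpoints is a hypothetical stand-in for checkpoint_paths plus the removal loop.

import os
import re

def prune_epoch_checkpoints(save_dir, keep_last):
    pattern = re.compile(r"checkpoint(\d+)\.pt")
    found = []
    for fname in os.listdir(save_dir):
        m = pattern.fullmatch(fname)
        if m:
            found.append((int(m.group(1)), fname))
    # newest epochs first; drop everything after the first `keep_last`
    for _, fname in sorted(found, reverse=True)[keep_last:]:
        os.remove(os.path.join(save_dir, fname))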
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'],
                                              args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        # Dictionary.text_to_jsonl(src_file, tgt_file)
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, eval(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # dump into pkl file
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # p1-pN -> (1 bin-txt, 1 idx), (N bin-txt, N idx)
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, src_file, prefix, vocab, token_dict,
                     offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                # idx, txt
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")