def main():
    """Entry point: parse CLI args, build the training DAG, and train.

    NOTE(review): relies on module-level helpers (``parse_args``,
    ``construct_name``, ``create_all_dags``) and imports (``os``,
    ``nemo``, ``SquareAnnealing``) defined elsewhere in this file.
    """
    args = parse_args()
    # Experiment name encodes the main hyper-parameters so runs are
    # distinguishable in the log / checkpoint directories.
    name = construct_name(
        args.exp_name,
        args.lr,
        args.batch_size,
        args.num_epochs,
        args.weight_decay,
        args.optimizer,
        args.iter_per_step,
    )
    log_dir = name
    if args.work_dir:
        log_dir = os.path.join(args.work_dir, name)

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=log_dir,
        checkpoint_dir=args.checkpoint_dir,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=args.tensorboard_dir,
    )
    # world_size reflects the number of processes in distributed runs.
    args.num_gpus = neural_factory.world_size
    checkpoint_dir = neural_factory.checkpoint_dir
    if args.local_rank is not None:
        nemo.logging.info('Doing ALL GPU')

    # build dags
    train_loss, callbacks, steps_per_epoch = create_all_dags(
        args, neural_factory)

    # train model
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        # Square-annealing LR schedule over the full training run,
        # with linear warmup for the first `warmup_steps` steps.
        lr_policy=SquareAnnealing(args.num_epochs * steps_per_epoch,
                                  warmup_steps=args.warmup_steps),
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "lr": args.lr,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        # Gradient accumulation: this many batches per optimizer step.
        batches_per_step=args.iter_per_step,
    )
# Log training loss every 50 steps; at eval time the tensors are
# summarized via compute_accuracy (helper defined elsewhere in this file).
callback = nemo.core.SimpleLossLoggerCallback(
    step_freq=50,
    tb_writer=tb_writer,
    tensor_list2str=lambda x: str(x[0].item()),
    tensor_list2str_evl=lambda x: compute_accuracy(x))

# Periodic evaluation over the held-out loss/outputs/labels tensors.
callback_eval = nemo.core.EvaluatorCallback(
    eval_tensors=[e_loss, e_outputs, e_labels],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=10000,
    tb_writer=tb_writer)

# Instantiate an optimizer to perform `train` action
optimizer = neural_factory.get_trainer(
    params={
        "optimization_params": {
            "num_epochs": num_epochs,
            "lr": learning_rate,
            "max_steps": max_steps,
            "weight_decay": weight_decay,
            "momentum": momentum
        }
    })

# Square-annealing LR decay over the whole run
# (step_per_epoch is computed elsewhere in this file).
optimizer.train(
    tensors_to_optimize=[train_loss],
    tensors_to_evaluate=[outputs, labels],
    callbacks=[callback, callback_eval],
    lr_policy=SquareAnnealing(num_epochs * step_per_epoch))
# Evaluation callback for this eval dataset; eval_iter_callback also
# receives the tokenizer to decode predictions.
callback = nemo.core.EvaluatorCallback(
    eval_tensors=all_eval_tensors[eval_dataset],
    user_iter_callback=lambda x, y: eval_iter_callback(x, y, tokenizer),
    user_epochs_done_callback=eval_epochs_done_callback_wer,
    eval_step=args.eval_freq,
    tb_writer=nf.tb_writer,
)
callbacks.append(callback)

# Save checkpoints into the working directory at a fixed step frequency.
checkpointer_callback = CheckpointCallback(
    folder=args.work_dir,
    step_freq=args.checkpoint_save_freq)
callbacks.append(checkpointer_callback)

# define learning rate decay policy
lr_policy = SquareAnnealing(total_steps=args.max_steps,
                            min_lr=1e-5,
                            warmup_steps=args.warmup_steps)

# Create trainer and execute training action
# NOTE(review): num_epochs is hard-coded to 300 while max_steps comes
# from the CLI — presumably max_steps is the real stopping criterion;
# confirm against the trainer's semantics.
nf.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    optimizer=args.optimizer,
    lr_policy=lr_policy,
    optimization_params={
        "num_epochs": 300,
        "max_steps": args.max_steps,
        "lr": args.lr,
        "weight_decay": args.weight_decay,
    },
    batches_per_step=args.iter_per_step,
def main():
    """Entry point: parse args/config, build the DAG, and train.

    NOTE(review): depends on module-level helpers (``parse_args``,
    ``parse_cfg``, ``construct_name``, ``create_dag``,
    ``process_evaluation_batch``, ``process_evaluation_epoch``) and on
    ``sss`` being defined elsewhere in this file.
    """
    # Parse args
    args = parse_args()
    cfg = parse_cfg(args)
    name = construct_name(args, cfg)

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=name,
        checkpoint_dir=args.checkpoint_dir,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=args.tensorboard_dir)
    logger = neural_factory.logger
    tb_writer = neural_factory.tb_writer
    args.checkpoint_dir = neural_factory.checkpoint_dir

    logger.info(f'Name:\n{name}')
    logger.info(f'Args to be passed to job #{args.local_rank}:')
    logger.info(pformat(vars(args)))

    # Seed every RNG source for reproducibility when a seed is given.
    if args.random_seed is not None:
        random.seed(args.random_seed)
        np.random.seed(args.random_seed)
        torch.manual_seed(args.random_seed)
        logger.info(f'Using seed {args.random_seed}')

    # Defining computational graph
    (train_loss, evals), cfg, dag_callbacks = create_dag(
        args, cfg, neural_factory.world_size)

    logger.info('Config:')
    logger.info(pformat(cfg))

    num_data = cfg['input']['train']['num_data']
    steps_per_epoch = cfg['optimization']['steps_per_epoch']
    total_steps = cfg['optimization']['total_steps']
    logger.info(f'Num data: {num_data}\n'
                f'Steps per epoch: {steps_per_epoch}\n'
                f'Total steps: {total_steps}')

    # The first DAG callback needs the TB writer attached after the
    # factory is created.
    dag_callbacks[0].tb_writer = tb_writer

    # Callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[train_loss],
        print_func=lambda x: logger.info(f"Loss: {x[0].item()}"),
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=tb_writer
    )
    log_callbacks = [train_callback]

    target = cfg['target']
    labels = target['labels']
    # Map of special-token ids (e.g. pad/bos/eos) keyed by token name.
    specials = {f'{ss.name}_id': target[f'{ss.name}_id'] for ss in sss}

    # One evaluator per eval dataset.
    # NOTE(review): the loop variable `name` shadows the experiment name
    # computed above; harmless here since `name` is not used afterwards,
    # but worth renaming.
    for name, tensors in evals:
        eval_callback = nemo.core.EvaluatorCallback(
            # TODO: Should be fixed soon, so we don't need to pass exactly list
            eval_tensors=list(tensors),
            user_iter_callback=partial(
                process_evaluation_batch,
                labels=labels,
                specials=specials,
                write_attn=False
            ),
            user_epochs_done_callback=partial(
                process_evaluation_epoch,
                tag=os.path.basename(name),
                logger=logger
            ),
            eval_step=args.eval_freq,
            tb_writer=tb_writer
        )
        log_callbacks.append(eval_callback)

    # noinspection PyTypeChecker
    callbacks = log_callbacks + dag_callbacks

    # Optimize
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        # Square-annealing schedule with warmup expressed in epochs.
        lr_policy=SquareAnnealing(
            cfg['optimization']['total_steps'],
            min_lr=cfg['optimization']['min_lr'],
            warmup_steps=(
                cfg['optimization']['warmup_epochs']
                * cfg['optimization']['steps_per_epoch']
            )
        ),
        optimizer=cfg['optimization']['optimizer'],
        optimization_params=cfg['optimization']['params'],
        batches_per_step=args.iter_per_step
    )
def test_square(self):
    """SquareAnnealing decays monotonically with shrinking step sizes."""
    policy = SquareAnnealing(100)
    early, mid, late = [policy(1e-3, step, 0) for step in (0, 10, 20)]
    # Learning rate never increases ...
    self.assertGreaterEqual(early, mid)
    self.assertGreaterEqual(mid, late)
    # ... and successive drops get no larger (convex decay).
    self.assertGreaterEqual(early - mid, mid - late)
def test_warmup(self):
    """With warmup_ratio=0.5 the LR rises during warmup, then decays."""
    policy = SquareAnnealing(100, warmup_ratio=0.5)
    start, peak, end = [policy(1e-3, step, 0) for step in (0, 50, 100)]
    self.assertLess(start, peak)   # warmup ramps the LR up
    self.assertGreater(peak, end)  # annealing brings it back down
train_data_size = len(train_data_layer)
# Effective global batch = per-GPU batch * num GPUs * accumulation steps.
steps_per_epoch = int(train_data_size /
                      (args.batch_size * args.num_gpus * args.batch_per_step))

# Run MLM evaluation once per epoch.
callback_dev = nemo.core.EvaluatorCallback(
    # eval_tensors=[dev_mlm_loss, dev_nsp_loss],
    eval_tensors=[dev_mlm_loss],
    user_iter_callback=eval_iter_callback,
    user_epochs_done_callback=eval_epochs_done_callback,
    eval_step=steps_per_epoch,
    tb_writer=tb_writer)

# define learning rate decay policy
if args.lr_decay_policy == "poly":
    lr_policy = SquareAnnealing(args.num_epochs * steps_per_epoch,
                                warmup_ratio=args.lr_warmup_proportion)
elif args.lr_decay_policy == "cosine":
    lr_policy = CosineAnnealing(args.num_epochs * steps_per_epoch,
                                warmup_ratio=args.lr_warmup_proportion)
elif args.lr_decay_policy == "noam":
    lr_policy = \
        InverseSquareRootAnnealing(args.num_epochs * steps_per_epoch,
                                   warmup_ratio=args.lr_warmup_proportion)
else:
    raise NotImplementedError

# save config file
# BUG FIX: the original `if not os.path.exists(...): os.makedirs(...)`
# is a check-then-create race (TOCTOU) — another distributed worker can
# create the directory between the two calls and makedirs then raises.
# exist_ok=True makes the creation idempotent.
os.makedirs(args.checkpoint_directory, exist_ok=True)
config_path = os.path.join(args.checkpoint_directory, "bert-config.json")
callback = nemo.core.SimpleLossLoggerCallback( tensor_list2str=lambda x: str(x[0].item()), tb_writer=tb_writer, step_freq=100) # callback which calculates evaluation loss without label smoothing # and BLEU scores between outputs of beam search and reference translations callback_dev = nemo.core.EvaluatorCallback( eval_tensors=[eval_loss], user_iter_callback=eval_iter_callback, user_epochs_done_callback=eval_epochs_done_callback, eval_step=args.eval_step_frequency, tb_writer=tb_writer) # define learning rate decay policy if args.lr_decay_policy == "poly": lr_policy = SquareAnnealing(args.max_num_steps, warmup_steps=args.warmup_steps) elif args.lr_decay_policy == "cosine": lr_policy = CosineAnnealing(args.max_num_steps, warmup_steps=args.warmup_steps) elif args.lr_decay_policy == "noam": lr_policy = InverseSquareRootAnnealing(args.max_num_steps, warmup_steps=args.warmup_steps) else: raise NotImplementedError # define and launch training algorithm (optimizer) optimizer = neural_factory.get_trainer( params={ "optimizer_kind": args.optimizer, "optimization_params": { "num_epochs": args.max_num_epochs,
} }) train_data_size = len(train_data_layer) steps_per_epoch = int(train_data_size / (args.batch_size * args.num_gpus)) print("steps_per_epoch =", steps_per_epoch) callback_eval = nemo.core.EvaluatorCallback( eval_tensors=[eval_logits, eval_seq_ids], user_iter_callback=lambda x, y: eval_iter_callback(x, y, eval_data_layer, tag_ids), user_epochs_done_callback=lambda x: eval_epochs_done_callback( x, tag_ids, args.output_filename), tb_writer=tb_writer, eval_step=steps_per_epoch) if args.lr_policy == "lr_warmup": lr_policy_func = WarmupAnnealing(args.num_epochs * steps_per_epoch, warmup_ratio=args.lr_warmup_proportion) elif args.lr_policy == "lr_poly": lr_policy_func = SquareAnnealing(args.num_epochs * steps_per_epoch) elif args.lr_policy == "lr_cosine": lr_policy_func = CosineAnnealing(args.num_epochs * steps_per_epoch) else: raise ValueError("Invalid lr_policy, must be lr_warmup or lr_poly") optimizer.train(tensors_to_optimize=[train_loss], callbacks=[callback_train, callback_eval], lr_policy=lr_policy_func)