tensors_eval, total_loss_eval, ptr_loss_eval, gate_loss_eval, steps_per_epoch_eval, data_layer_eval = create_pipeline( args.num_eval_samples, batch_size=args.eval_batch_size, num_gpus=args.num_gpus, input_dropout=0.0, data_prefix=args.eval_file_prefix, is_training=False, ) # Create callbacks for train and eval modes train_callback = nemo_core.SimpleLossLoggerCallback( tensors=[total_loss_train, gate_loss_train, ptr_loss_train], print_func=lambda x: logging.info( f'Loss:{str(np.round(x[0].item(), 3))}, ' f'Gate Loss:{str(np.round(x[1].item(), 3))}, ' f'Pointer Loss:{str(np.round(x[2].item(), 3))}' ), tb_writer=nf.tb_writer, get_tb_values=lambda x: [["loss", x[0]], ["gate_loss", x[1]], ["pointer_loss", x[2]]], step_freq=steps_per_epoch_train, ) eval_callback = nemo_core.EvaluatorCallback( eval_tensors=tensors_eval, user_iter_callback=lambda x, y: eval_iter_callback(x, y, data_desc), user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, data_desc), tb_writer=nf.tb_writer, eval_step=steps_per_epoch_train, ) ckpt_callback = nemo_core.CheckpointCallback(
# NOTE(review): fragment begins mid-call -- these are the trailing keyword
# arguments of a pipeline-construction call whose opening is not visible here.
max_seq_length=args.max_seq_length,
doc_stride=args.doc_stride,
batch_size=args.batch_size,
version_2_with_negative=args.version_2_with_negative,
num_gpus=args.num_gpus,
batches_per_step=args.batches_per_step,
mode="test",
use_data_cache=not args.no_data_cache,
)

# Training mode: periodic console/TensorBoard loss logging plus checkpointing.
if "train" in args.mode:
    logging.info(f"steps_per_epoch = {train_steps_per_epoch}")
    train_callback = nemo_core.SimpleLossLoggerCallback(
        tensors=[train_loss],
        print_func=lambda x: logging.info("Loss: {:.3f}".format(x[0].item())),
        get_tb_values=lambda x: [["loss", x[0]]],
        step_freq=args.train_step_freq,
        tb_writer=nf.tb_writer,
    )
    ckpt_callback = nemo_core.CheckpointCallback(
        folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq
    )
    callbacks = [train_callback, ckpt_callback]

# Evaluation mode: eval_iter_callback / eval_epochs_done_callback are defined
# elsewhere in this file.
# NOTE(review): fragment ends mid-call -- the remaining arguments of
# eval_epochs_done_callback (and the rest of this branch) are not visible here.
if "eval" in args.mode:
    eval_callback = nemo_core.EvaluatorCallback(
        eval_tensors=eval_output,
        user_iter_callback=lambda x, y: eval_iter_callback(x, y),
        user_epochs_done_callback=lambda x: eval_epochs_done_callback(
            x,
            eval_data_layer=eval_data_layer,
            do_lower_case=args.do_lower_case,
batch_size=args.batch_size, batches_per_step=args.batches_per_step, ) logging.info("steps per epoch", steps_per_epoch) # callback which prints training loss and perplexity once in a while if not args.only_mlm_loss: log_tensors = [train_loss, mlm_loss, nsp_loss] print_msg = "Loss: {:.3f} MLM Loss: {:.3f} NSP Loss: {:.3f}" else: log_tensors = [train_loss] print_msg = "Loss: {:.3f}" train_callback = nemo_core.SimpleLossLoggerCallback( tensors=log_tensors, step_freq=args.print_step_freq, print_func=lambda x: logging.info(print_msg.format(*[y.item() for y in x])), get_tb_values=lambda x: [["loss", x[0]]], tb_writer=nf.tb_writer, ) ckpt_callback = nemo_core.CheckpointCallback( folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, load_from_folder=args.load_dir, step_freq=args.save_step_freq, ) # define learning rate decay policy if args.lr_policy is not None: if args.max_steps < 0: lr_policy_fn = get_lr_policy(args.lr_policy,