Example #1
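# Build the evaluation pipeline (input dropout is disabled at eval time).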
tensors_eval, total_loss_eval, ptr_loss_eval, gate_loss_eval, steps_per_epoch_eval, data_layer_eval = create_pipeline(
    args.num_eval_samples,
    batch_size=args.eval_batch_size,
    num_gpus=args.num_gpus,
    input_dropout=0.0,
    data_prefix=args.eval_file_prefix,
    is_training=False,
)

# Create callbacks for train and eval modes
train_callback = nemo_core.SimpleLossLoggerCallback(
    tensors=[total_loss_train, gate_loss_train, ptr_loss_train],
    print_func=lambda x: logging.info(
        f'Loss:{np.round(x[0].item(), 3)}, '
        f'Gate Loss:{np.round(x[1].item(), 3)}, '
        f'Pointer Loss:{np.round(x[2].item(), 3)}'
    ),
    tb_writer=nf.tb_writer,
    get_tb_values=lambda x: [["loss", x[0]], ["gate_loss", x[1]], ["pointer_loss", x[2]]],
    step_freq=steps_per_epoch_train,
)

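# Evaluate once per epoch: eval_step equals the number of training steps per epoch.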
eval_callback = nemo_core.EvaluatorCallback(
    eval_tensors=tensors_eval,
    user_iter_callback=lambda x, y: eval_iter_callback(x, y, data_desc),
    user_epochs_done_callback=lambda x: eval_epochs_done_callback(x, data_desc),
    tb_writer=nf.tb_writer,
    eval_step=steps_per_epoch_train,
)

ckpt_callback = nemo_core.CheckpointCallback(
    # assumed arguments, mirroring the checkpoint callback in Example #2
    folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq
)
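
These callbacks are then handed to the training loop. A minimal sketch of that last step, assuming nf is the nemo_core.NeuralModuleFactory created earlier in the script and that args.num_epochs and args.lr are defined there:

# Hypothetical wiring (not part of the excerpt above): launch training
# with all three callbacks registered.
nf.train(
    tensors_to_optimize=[total_loss_train],
    callbacks=[train_callback, eval_callback, ckpt_callback],
    optimizer="adam",
    optimization_params={"num_epochs": args.num_epochs, "lr": args.lr},
)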
Example #2
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            batch_size=args.batch_size,
            version_2_with_negative=args.version_2_with_negative,
            num_gpus=args.num_gpus,
            batches_per_step=args.batches_per_step,
            mode="test",
            use_data_cache=not args.no_data_cache,
        )

    if "train" in args.mode:
        logging.info(f"steps_per_epoch = {train_steps_per_epoch}")
        train_callback = nemo_core.SimpleLossLoggerCallback(
            tensors=[train_loss],
            print_func=lambda x: logging.info("Loss: {:.3f}".format(x[0].item())),
            get_tb_values=lambda x: [["loss", x[0]]],
            step_freq=args.train_step_freq,
            tb_writer=nf.tb_writer,
        )
        ckpt_callback = nemo_core.CheckpointCallback(
            folder=nf.checkpoint_dir, epoch_freq=args.save_epoch_freq, step_freq=args.save_step_freq
        )
        callbacks = [train_callback, ckpt_callback]
        if "eval" in args.mode:
            eval_callback = nemo_core.EvaluatorCallback(
                eval_tensors=eval_output,
                user_iter_callback=lambda x, y: eval_iter_callback(x, y),
                user_epochs_done_callback=lambda x: eval_epochs_done_callback(
                    x,
                    eval_data_layer=eval_data_layer,
                    do_lower_case=args.do_lower_case,
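                    # Assumed continuation -- the excerpt cuts off mid-call.
                    # Close the callback and register it, mirroring the
                    # EvaluatorCallback wiring in Example #1.
                ),
                tb_writer=nf.tb_writer,
                # hypothetical flag controlling evaluation frequency
                eval_step=args.eval_step_freq,
            )
            callbacks.append(eval_callback)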
Example #3
        batch_size=args.batch_size,
        batches_per_step=args.batches_per_step,
    )

logging.info("steps per epoch", steps_per_epoch)
# callback which prints training loss and perplexity once in a while
if not args.only_mlm_loss:
    log_tensors = [train_loss, mlm_loss, nsp_loss]
    print_msg = "Loss: {:.3f} MLM Loss: {:.3f} NSP Loss: {:.3f}"
else:
    log_tensors = [train_loss]
    print_msg = "Loss: {:.3f}"
train_callback = nemo_core.SimpleLossLoggerCallback(
    tensors=log_tensors,
    step_freq=args.print_step_freq,
    print_func=lambda x: logging.info(print_msg.format(*[y.item() for y in x])),
    get_tb_values=lambda x: [["loss", x[0]]],
    tb_writer=nf.tb_writer,
)

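# Checkpoint periodically; optionally warm-start from a previous run via args.load_dir.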
ckpt_callback = nemo_core.CheckpointCallback(
    folder=nf.checkpoint_dir,
    epoch_freq=args.save_epoch_freq,
    load_from_folder=args.load_dir,
    step_freq=args.save_step_freq,
)

# define learning rate decay policy
if args.lr_policy is not None:
    if args.max_steps < 0:
        lr_policy_fn = get_lr_policy(args.lr_policy,
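                                     # assumed continuation of the truncated
                                     # call: schedule over the full run with
                                     # a warm-up ratio
                                     total_steps=args.num_epochs * steps_per_epoch,
                                     warmup_ratio=args.lr_warmup_proportion)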