Example #1
def test_warmup_hold(self):
    # 1000 total steps: 25% linear warmup, 25% hold, then polynomial decay (power=2).
    policy = PolynomialHoldDecayAnnealing(1000,
                                          warmup_ratio=0.25,
                                          hold_ratio=0.25,
                                          power=2)
    # Sample the schedule at the phase boundaries, with an initial LR of 1e-3.
    lr1, lr2, lr3, lr4 = (policy(1e-3, x, 0) for x in (0, 250, 500, 1000))
    self.assertTrue(lr1 < lr2)   # LR rises during warmup
    self.assertTrue(lr2 == lr3)  # LR is held constant after warmup
    self.assertTrue(lr4 < lr3)   # LR decays after the hold phase
    self.assertTrue(lr4 == 0.0)  # and reaches zero at the final step
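
The test above pins down the three phases of the schedule. As a quick illustration, here is a minimal sketch (not part of the original test) that samples the same policy across a full run and prints the learning rate. It assumes the policy is callable as policy(initial_lr, step, epoch), as in the test, and that PolynomialHoldDecayAnnealing can be imported from nemo.utils.lr_policies (the path may differ between NeMo versions).

# Sketch: print the warmup -> hold -> decay shape of the schedule.
# Assumes the import path below; adjust it for your NeMo version.
from nemo.utils.lr_policies import PolynomialHoldDecayAnnealing

policy = PolynomialHoldDecayAnnealing(1000,
                                      warmup_ratio=0.25,  # steps 0-250: linear warmup
                                      hold_ratio=0.25,    # steps 250-500: constant LR
                                      power=2)            # steps 500-1000: decay to zero

for step in (0, 100, 250, 400, 500, 750, 1000):
    # Same call convention as in the test: (initial_lr, step, epoch).
    print(step, policy(1e-3, step, 0))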
Example #2
def main():
    args = parse_args()
    name = construct_name(
        args.exp_name,
        args.lr,
        args.batch_size,
        args.max_steps,
        args.num_epochs,
        args.weight_decay,
        args.optimizer,
        args.iter_per_step,
    )

    # time stamp
    date_time = datetime.now().strftime("%m-%d-%Y -- %H-%M-%S")

    log_dir = name
    if args.work_dir:
        log_dir = os.path.join(args.work_dir, name)

    if args.tensorboard_dir is None:
        tensorboard_dir = os.path.join(name, 'tensorboard', date_time)
    else:
        tensorboard_dir = args.tensorboard_dir

    if args.checkpoint_dir is None:
        checkpoint_dir = os.path.join(name, date_time)
    else:
        # Reuse the given directory if it already contains *.pt checkpoints,
        # otherwise write into a time-stamped subdirectory.
        base_checkpoint_dir = args.checkpoint_dir
        if len(glob.glob(os.path.join(base_checkpoint_dir, '*.pt'))) > 0:
            checkpoint_dir = base_checkpoint_dir
        else:
            checkpoint_dir = os.path.join(args.checkpoint_dir, date_time)

    # instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        log_dir=log_dir,
        checkpoint_dir=checkpoint_dir,
        create_tb_writer=args.create_tb_writer,
        files_to_copy=[args.model_config, __file__],
        cudnn_benchmark=args.cudnn_benchmark,
        tensorboard_dir=tensorboard_dir,
    )
    args.num_gpus = neural_factory.world_size

    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    # build dags
    train_loss, callbacks, steps_per_epoch = create_all_dags(
        args, neural_factory)

    # load the model config and read the LR schedule it requests
    # (defaults to CosineAnnealing when the key is absent)
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    lr_schedule = jasper_params.get('lr_schedule', 'CosineAnnealing')

    if lr_schedule == 'CosineAnnealing':
        lr_policy = CosineAnnealing(
            total_steps=args.max_steps if args.max_steps is not None else
            args.num_epochs * steps_per_epoch,
            warmup_ratio=args.warmup_ratio,
            min_lr=args.min_lr,
        )
    elif lr_schedule == 'PolynomialDecayAnnealing':
        lr_policy = PolynomialDecayAnnealing(
            total_steps=args.max_steps if args.max_steps is not None else
            args.num_epochs * steps_per_epoch,
            warmup_ratio=args.warmup_ratio,
            min_lr=args.min_lr,
            power=2.0,
        )
    elif lr_schedule == 'PolynomialHoldDecayAnnealing':
        lr_policy = PolynomialHoldDecayAnnealing(
            total_steps=args.max_steps if args.max_steps is not None else
            args.num_epochs * steps_per_epoch,
            warmup_ratio=args.warmup_ratio,
            hold_ratio=args.hold_ratio,
            min_lr=args.min_lr,
            power=2.0,
        )
    else:
        raise ValueError(f"LR schedule '{lr_schedule}' is invalid!")

    logging.info(f"Using `{lr_policy}` Learning Rate Scheduler")

    # train model
    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        lr_policy=lr_policy,
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "max_steps": args.max_steps,
            "lr": args.lr,
            "momentum": 0.95,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
            "grad_norm_clip": None,
        },
        batches_per_step=args.iter_per_step,
    )