Example #1
    def configure_optimizers(self):
        optimizer = {
            "sgd":
            FusedSGD(self.parameters(),
                     lr=self.learning_rate,
                     momentum=self.args.momentum),
            "adam":
            FusedAdam(self.parameters(),
                      lr=self.learning_rate,
                      weight_decay=self.args.weight_decay),
        }[self.args.optimizer.lower()]

        if self.args.scheduler:
            scheduler = {
                "scheduler":
                WarmupCosineSchedule(
                    optimizer=optimizer,
                    warmup_steps=250,
                    t_total=self.args.epochs *
                    len(self.trainer.datamodule.train_dataloader()),
                ),
                "interval":
                "step",
                "frequency":
                1,
            }
            return {
                "optimizer": optimizer,
                "monitor": "val_loss",
                "lr_scheduler": scheduler
            }
        return {"optimizer": optimizer, "monitor": "val_loss"}
Example #2
from torch import optim
from torch.optim.optimizer import required


def SGD(params, lr=required, momentum=0, dampening=0, weight_decay=0, nesterov=False,
        wd_after_momentum=False, materialize_master_grads=True):
    # Prefer apex's fused SGD kernel when apex is installed; otherwise fall back
    # to the standard torch.optim.SGD with the arguments it understands.
    try:
        from apex.optimizers import FusedSGD
        return FusedSGD(params, lr, momentum, dampening, weight_decay, nesterov, wd_after_momentum,
                        materialize_master_grads)
    except ImportError:
        return optim.SGD(params, lr, momentum, dampening, weight_decay, nesterov)
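A minimal usage sketch of the wrapper above (the tiny model and hyperparameters are illustrative only, not from the original project). It runs the same way whether or not apex is installed, though the fused path requires CUDA parameters at step time:

import torch
import torch.nn as nn

model = nn.Linear(10, 2)  # move to .cuda() if the apex FusedSGD path is taken
opt = SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)

for _ in range(3):
    opt.zero_grad()
    loss = model(torch.randn(8, 10)).sum()
    loss.backward()
    opt.step()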
Example #3
    def configure_optimizers(self):
        optimizer = {
            "sgd":
            FusedSGD(self.parameters(),
                     lr=self.lr,
                     momentum=self.args.momentum),
            "adam":
            FusedAdam(self.parameters(),
                      lr=self.lr,
                      weight_decay=self.args.weight_decay),
            "adamw":
            torch.optim.AdamW(self.parameters(),
                              lr=self.lr,
                              weight_decay=self.args.weight_decay),
            "radam":
            RAdam(self.parameters(),
                  lr=self.lr,
                  weight_decay=self.args.weight_decay),
            "adabelief":
            AdaBelief(self.parameters(),
                      lr=self.lr,
                      weight_decay=self.args.weight_decay),
            "adabound":
            AdaBound(self.parameters(),
                     lr=self.lr,
                     weight_decay=self.args.weight_decay),
            "adamp":
            AdamP(self.parameters(),
                  lr=self.lr,
                  weight_decay=self.args.weight_decay),
            "novograd":
            FusedNovoGrad(self.parameters(),
                          lr=self.lr,
                          weight_decay=self.args.weight_decay),
        }[self.args.optimizer.lower()]

        if not self.args.use_scheduler:
            return optimizer

        scheduler = {
            "scheduler":
            NoamLR(
                optimizer=optimizer,
                warmup_epochs=self.args.warmup,
                total_epochs=self.args.epochs,
                steps_per_epoch=len(self.train_dataloader()) // self.args.gpus,
                init_lr=self.args.init_lr,
                max_lr=self.args.lr,
                final_lr=self.args.final_lr,
            ),
            "interval":
            "step",
            "frequency":
            1,
        }

        return {"optimizer": optimizer, "lr_scheduler": scheduler}
Example #4
def get_optimizer_for_params(cfg_opt, params):
    r"""Return the scheduler object.

    Args:
        cfg_opt (obj): Config for the specific optimization module (gen/dis).
        params (obj): Parameters to be trained by the parameters.

    Returns:
        (obj): Optimizer
    """
    # We will use fuse optimizers by default.
    fused_opt = cfg_opt.fused_opt
    if cfg_opt.type == 'adam':
        if fused_opt:
            opt = FusedAdam(params,
                            lr=cfg_opt.lr,
                            eps=cfg_opt.eps,
                            betas=(cfg_opt.adam_beta1, cfg_opt.adam_beta2))
        else:
            opt = Adam(params,
                       lr=cfg_opt.lr,
                       eps=cfg_opt.eps,
                       betas=(cfg_opt.adam_beta1, cfg_opt.adam_beta2))

    elif cfg_opt.type == 'madam':
        g_bound = getattr(cfg_opt, 'g_bound', None)
        opt = Madam(params,
                    lr=cfg_opt.lr,
                    scale=cfg_opt.scale,
                    g_bound=g_bound)
    elif cfg_opt.type == 'fromage':
        opt = Fromage(params, lr=cfg_opt.lr)
    elif cfg_opt.type == 'rmsprop':
        opt = RMSprop(params,
                      lr=cfg_opt.lr,
                      eps=cfg_opt.eps,
                      weight_decay=cfg_opt.weight_decay)
    elif cfg_opt.type == 'sgd':
        if fused_opt:
            opt = FusedSGD(params,
                           lr=cfg_opt.lr,
                           momentum=cfg_opt.momentum,
                           weight_decay=cfg_opt.weight_decay)
        else:
            opt = SGD(params,
                      lr=cfg_opt.lr,
                      momentum=cfg_opt.momentum,
                      weight_decay=cfg_opt.weight_decay)
    else:
        raise NotImplementedError(
            'Optimizer {} is not yet implemented.'.format(cfg_opt.type))
    return opt
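A hypothetical call to get_optimizer_for_params, using a SimpleNamespace as a stand-in for the real config object; the attribute names simply mirror what the function reads above and are not the project's actual config schema:

from types import SimpleNamespace

import torch.nn as nn

cfg_opt = SimpleNamespace(type='adam', fused_opt=False, lr=1e-4, eps=1e-8,
                          adam_beta1=0.9, adam_beta2=0.999)
net = nn.Linear(4, 4)
opt = get_optimizer_for_params(cfg_opt, net.parameters())  # plain torch Adam branch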
Example #5
def get_optimizer(optimizer_name: str, parameters, learning_rate: float, weight_decay=0.0, **kwargs):
    if optimizer_name.lower() == "sgd":
        return SGD(parameters, learning_rate, momentum=0.9, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "adam":
        return Adam(parameters, learning_rate, weight_decay=weight_decay, eps=1e-5, **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "rms":
        return RMSprop(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "adamw":
        return AdamW(parameters, learning_rate, weight_decay=weight_decay, eps=1e-5, **kwargs)

    if optimizer_name.lower() == "radam":
        return RAdam(parameters, learning_rate, weight_decay=weight_decay, eps=1e-5, **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "ranger":
        return Ranger(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    # if optimizer_name.lower() == "qhadamw":
    #     return QHAdamW(parameters, learning_rate, weight_decay=weight_decay,
    #                    **kwargs)
    #
    if optimizer_name.lower() == "lamb":
        return Lamb(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB

        return FusedLAMB(parameters, learning_rate, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam

        return FusedAdam(parameters, learning_rate, eps=1e-5, weight_decay=weight_decay, adam_w_mode=True, **kwargs)

    if optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD

        return FusedSGD(parameters, learning_rate, weight_decay=weight_decay, momentum=0.9, **kwargs)

    if optimizer_name.lower() == "diffgrad":
        return DiffGrad(parameters, learning_rate, eps=1e-5, weight_decay=weight_decay, **kwargs)

    if optimizer_name.lower() == "novograd":
        return Novograd(parameters, learning_rate, eps=1e-5, weight_decay=weight_decay, **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
Example #6
    def configure_optimizers(self):
        optimizer = {
            "sgd":
            FusedSGD(self.parameters(),
                     lr=self.learning_rate,
                     momentum=self.args.momentum),
            "adam":
            FusedAdam(self.parameters(),
                      lr=self.learning_rate,
                      weight_decay=self.args.weight_decay),
            "radam":
            RAdam(self.parameters(),
                  lr=self.learning_rate,
                  weight_decay=self.args.weight_decay),
        }[self.args.optimizer.lower()]

        scheduler = {
            "none":
            None,
            "multistep":
            torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                 self.args.steps,
                                                 gamma=self.args.factor),
            "cosine":
            torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                       self.args.max_epochs),
            "plateau":
            torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                factor=self.args.factor,
                patience=self.args.lr_patience),
        }[self.args.scheduler.lower()]

        opt_dict = {"optimizer": optimizer, "monitor": "val_loss"}
        if scheduler is not None:
            opt_dict.update({"lr_scheduler": scheduler})
        return opt_dict
Example #7
def create_optimizer(optimizer_config, model, master_params=None):
    """Creates optimizer and schedule from configuration

    Parameters
    ----------
    optimizer_config : dict
        Dictionary containing the configuration options for the optimizer.
    model : Model
        The network model.

    Returns
    -------
    optimizer : Optimizer
        The optimizer.
    scheduler : LRScheduler
        The learning rate scheduler.
    """
    if optimizer_config["classifier_lr"] != -1:
        # Separate classifier parameters from all others
        net_params = []
        classifier_params = []
        for k, v in model.named_parameters():
            if not v.requires_grad:
                continue
            if k.find("encoder") != -1:
                net_params.append(v)
            else:
                classifier_params.append(v)
        params = [
            {
                "params": net_params
            },
            {
                "params": classifier_params,
                "lr": optimizer_config["classifier_lr"]
            },
        ]
    else:
        if master_params:
            params = master_params
        else:
            params = model.parameters()

    if optimizer_config["type"] == "SGD":
        optimizer = optim.SGD(params,
                              lr=optimizer_config["learning_rate"],
                              momentum=optimizer_config["momentum"],
                              weight_decay=optimizer_config["weight_decay"],
                              nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "FusedSGD":
        optimizer = FusedSGD(params,
                             lr=optimizer_config["learning_rate"],
                             momentum=optimizer_config["momentum"],
                             weight_decay=optimizer_config["weight_decay"],
                             nesterov=optimizer_config["nesterov"])
    elif optimizer_config["type"] == "Adam":
        optimizer = optim.Adam(params,
                               lr=optimizer_config["learning_rate"],
                               weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "FusedAdam":
        optimizer = FusedAdam(params,
                              lr=optimizer_config["learning_rate"],
                              weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "AdamW":
        optimizer = optim.Adam(params,
                               lr=optimizer_config["learning_rate"],
                               weight_decay=optimizer_config["weight_decay"])
    elif optimizer_config["type"] == "RmsProp":
        optimizer = optim.Adam(params,
                               lr=optimizer_config["learning_rate"],
                               weight_decay=optimizer_config["weight_decay"])
    else:
        raise KeyError("unrecognized optimizer {}".format(
            optimizer_config["type"]))

    if optimizer_config["schedule"]["type"] == "step":
        scheduler = LRStepScheduler(optimizer,
                                    **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "multistep":
        scheduler = MultiStepLR(optimizer,
                                **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "exponential":
        scheduler = ExponentialLRScheduler(
            optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "poly":
        scheduler = PolyLR(optimizer, **optimizer_config["schedule"]["params"])
    elif optimizer_config["schedule"]["type"] == "constant":
        scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 1.0)
    elif optimizer_config["schedule"]["type"] == "linear":

        def linear_lr(it):
            return it * optimizer_config["schedule"]["params"][
                "alpha"] + optimizer_config["schedule"]["params"]["beta"]

        scheduler = lr_scheduler.LambdaLR(optimizer, linear_lr)
    else:
        raise KeyError("unrecognized schedule {}".format(
            optimizer_config["schedule"]["type"]))

    return optimizer, scheduler
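A hypothetical configuration for create_optimizer; the keys mirror exactly what the function reads above (type, learning_rate, momentum, weight_decay, nesterov, classifier_lr and the nested schedule block) and are not taken from any real training recipe:

import torch.nn as nn

model = nn.Linear(8, 2)
optimizer_config = {
    "type": "SGD",
    "learning_rate": 0.1,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "nesterov": True,
    "classifier_lr": -1,  # -1: do not split off classifier parameters
    "schedule": {"type": "multistep",
                 "params": {"milestones": [30, 60], "gamma": 0.1}},
}
optimizer, scheduler = create_optimizer(optimizer_config, model)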
Example #8
def get_optimizer(optimizer_name: str,
                  parameters,
                  learning_rate: float,
                  weight_decay=1e-5,
                  eps=1e-5,
                  **kwargs) -> Optimizer:
    from torch.optim import SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    if optimizer_name.lower() == "sgd":
        return SGD(parameters,
                   learning_rate,
                   momentum=0.9,
                   nesterov=True,
                   weight_decay=weight_decay,
                   **kwargs)

    if optimizer_name.lower() == "adam":
        return Adam(parameters,
                    learning_rate,
                    weight_decay=weight_decay,
                    eps=eps,
                    **kwargs)  # As Jeremy suggests

    if optimizer_name.lower() == "rms":
        return RMSprop(parameters,
                       learning_rate,
                       weight_decay=weight_decay,
                       **kwargs)

    if optimizer_name.lower() == "adamw":
        return AdamW(parameters,
                     learning_rate,
                     weight_decay=weight_decay,
                     eps=eps,
                     **kwargs)

    if optimizer_name.lower() == "radam":
        return RAdam(parameters,
                     learning_rate,
                     weight_decay=weight_decay,
                     eps=eps,
                     **kwargs)  # As Jeremy suggests

    # Optimizers from torch-optimizer
    if optimizer_name.lower() == "ranger":
        return Ranger(parameters,
                      learning_rate,
                      eps=eps,
                      weight_decay=weight_decay,
                      **kwargs)

    if optimizer_name.lower() == "lamb":
        return Lamb(parameters,
                    learning_rate,
                    eps=eps,
                    weight_decay=weight_decay,
                    **kwargs)

    if optimizer_name.lower() == "diffgrad":
        return DiffGrad(parameters,
                        learning_rate,
                        eps=eps,
                        weight_decay=weight_decay,
                        **kwargs)

    if optimizer_name.lower() == "novograd":
        return NovoGrad(parameters,
                        learning_rate,
                        eps=eps,
                        weight_decay=weight_decay,
                        **kwargs)

    # Optimizers from Apex (Fused version is faster on GPU with tensor cores)
    if optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB

        return FusedLAMB(parameters,
                         learning_rate,
                         eps=eps,
                         weight_decay=weight_decay,
                         **kwargs)

    if optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD

        return FusedSGD(parameters,
                        learning_rate,
                        momentum=0.9,
                        nesterov=True,
                        weight_decay=weight_decay,
                        **kwargs)

    if optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam

        return FusedAdam(parameters,
                         learning_rate,
                         eps=eps,
                         weight_decay=weight_decay,
                         adam_w_mode=True,
                         **kwargs)

    raise ValueError("Unsupported optimizer name " + optimizer_name)
Example #9
    def test_3models2losses2optimizers(self):
        model0 = MyModel(1)
        model1 = MyModel(2)
        model2 = MyModel(3)

        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                      {'params' : model1.parameters(), 'lr' : 1.0}],
                                     momentum=0.5)
        optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                     momentum=0.25)

        # Again, can't do this:  reference_grads = [[]]*9
        reference_grads = [[], [], [], [], [], [], [], [], []]
        final_params = [None, None, None, None, None, None, None, None, None]
        for i in range(2):
            optimizer0.zero_grad()
            optimizer1.zero_grad()
            loss0 = model0(self.x) + model1(self.x)
            loss1 = model2(self.x) + model1(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] +
                                   [param.grad.data.clone() for param in model1.parameters()])

            optimizer0.step()
            optimizer1.step()

        final_params[0] = \
            [param.data.clone() for param in model0.parameters()] + \
            [param.data.clone() for param in model1.parameters()] + \
            [param.data.clone() for param in model2.parameters()]

        def what_got_skipped(which_iter, which_backward, which_model):
            if which_iter == 0:
                if which_backward == 0:
                    if which_model == 0:
                        return 1
                    if which_model == 1:
                        return 2
                if which_backward == 1:
                    if which_model == 2:
                        return 3
                    if which_model == 1:
                        return 4
            if which_iter == 1:
                if which_backward == 0:
                    if which_model == 0:
                        return 5
                    if which_model == 1:
                        return 6
                if which_backward == 1:
                    if which_model == 2:
                        return 7
                    if which_model == 1:
                        return 8
            return 0

        for which_iter in (0,1):
            for which_backward in (0,1):
                if which_backward == 0:
                    which_models = (0,1)
                if which_backward == 1:
                    which_models = (2,1)
                for which_model in which_models:

                    model0 = MyModel(1)
                    model1 = MyModel(2)
                    model2 = MyModel(3)

                    optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                                  {'params' : model1.parameters(), 'lr' : 1.0}],
                                                 momentum=0.5)
                    optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                                 momentum=0.25)

                    for i in range(3):
                        optimizer0.zero_grad()
                        optimizer1.zero_grad()
                        loss0 = model0(self.x) + model1(self.x)
                        loss1 = model2(self.x) + model1(self.x)
                        loss0.backward()
                        loss1.backward()

                        if i != which_iter:
                            reference_grads[what_got_skipped(which_iter,
                                    which_backward, which_model)].append(
                                [param.grad.data.clone() for param in model0.parameters()] +
                                [param.grad.data.clone() for param in model1.parameters()])

                        if i == which_iter:
                            if which_backward == 0:
                                # if which_model == 0:
                                    optimizer1.step()
                                # if which_model == 1:
                                #     optimizer1.step()
                            if which_backward == 1:
                                # if which_model == 2:
                                #     optimizer0.step()
                                # if which_model == 1:
                                    continue
                        else:
                            optimizer0.step()
                            optimizer1.step()

                    final_params[what_got_skipped(which_iter, which_backward, which_model)] = \
                        [param.data.clone() for param in model0.parameters()] + \
                        [param.data.clone() for param in model1.parameters()] + \
                        [param.data.clone() for param in model2.parameters()]

        for materialize_master_grads in (False, True):
          for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
              for use_multiple_loss_scalers in (False, True):
                if opt_level == "O1" or opt_level == "O2":
                    inject_inf_iters = (-1, 0, 1)
                else:
                    inject_inf_iters = (-1,)

                for inject_inf in inject_inf_iters:
                  if inject_inf >= 0:
                     inject_inf_locs = ("fp16", "fp32")
                     which_backwards = (0, 1)
                  else:
                     inject_inf_locs = ("fdsa",)
                     which_backwards = (None,)

                  for inject_inf_loc in inject_inf_locs:
                    for which_backward in which_backwards:
                      if use_multiple_loss_scalers:
                          num_losses = 2
                          loss_ids = [0, 1]
                      else:
                          num_losses = 1
                          loss_ids = [0, 0]

                      if inject_inf >= 0:
                          iters = 3
                          if which_backward == 0:
                              which_models = (0, 1)
                          elif which_backward == 1:
                              which_models = (2, 1)
                      else:
                          iters = 2
                          which_models = (None,)

                      for which_model in which_models:
                          model0 = MyModel(1)
                          model1 = MyModel(2)
                          model2 = MyModel(3)

                          models = [model0, model1, model2]

                          optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                            {'params' : model1.parameters(), 'lr' : 1.0}],
                                            momentum=0.5, materialize_master_grads=materialize_master_grads)
                          optimizer1 = FusedSGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                                momentum=0.25, materialize_master_grads=materialize_master_grads)

                          _amp_state.allow_incoming_model_not_fp32 = True
                          [model0, model1, model2], [optimizer0, optimizer1] = amp.initialize(
                              [model0, model1, model2],
                              [optimizer0, optimizer1],
                              opt_level=opt_level,
                              verbosity=0,
                              cast_model_type=False,
                              num_losses=num_losses)
                          _amp_state.allow_incoming_model_not_fp32 = False

                          _amp_state.loss_scalers[0]._loss_scale = 4.0
                          if use_multiple_loss_scalers:
                              _amp_state.loss_scalers[1]._loss_scale = 16.0

                          unskipped = 0
                          for i in range(iters):
                              if how_to_zero == "none":
                                  for model in models:
                                      for param in model.parameters():
                                          param.grad = None
                              elif how_to_zero == "model":
                                  for model in models:
                                      model.zero_grad()
                              else:
                                  optimizer0.zero_grad()
                                  optimizer1.zero_grad()

                              loss0 = model0(self.x) + model1(self.x)
                              loss1 = model2(self.x) + model1(self.x)

                              with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss:
                                  scaled_loss.backward()
                                  if i == inject_inf and which_backward == 0:
                                      if which_model == 0:
                                          inj_model = model0
                                      elif which_model == 1:
                                          inj_model = model1
                                      else:
                                          raise RuntimeError("{} invalid for loss 0".format(which_model))
                                      if inject_inf_loc == "fp32":
                                          inj_model.weight0.grad[0] = float('inf')
                                      elif inject_inf_loc == "fp16":
                                          inj_model.weight1.grad[0] = float('inf')
                              with amp.scale_loss(loss1, [optimizer0, optimizer1], loss_id=loss_ids[1]) as scaled_loss:
                                  scaled_loss.backward()
                                  if i == inject_inf and which_backward == 1:
                                      if which_model == 2:
                                          inj_model = model2
                                      elif which_model == 1:
                                          inj_model = model1
                                      else:
                                          raise RuntimeError("{} invalid for loss 1".format(which_model))
                                      if inject_inf_loc == "fp32":
                                          inj_model.weight0.grad[0] = float('inf')
                                      elif inject_inf_loc == "fp16":
                                          inj_model.weight1.grad[0] = float('inf')

                              if i != inject_inf:
                                  master_params = list(amp.master_params(optimizer0)) + \
                                                  list(amp.master_params(optimizer1))
                                  for param, reference_grad in zip(master_params,
                                        reference_grads[what_got_skipped(inject_inf,
                                            which_backward, which_model)][unskipped]):
                                      if opt_level == "O2" and not materialize_master_grads:
                                          continue
                                      else:
                                          self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
                                  unskipped += 1

                              optimizer0.step()
                              optimizer1.step()

                          model_params = [p for p in model0.parameters()] + \
                                         [p for p in model1.parameters()] + \
                                         [p for p in model2.parameters()]
                          master_params = [p for p in amp.master_params(optimizer0)] + \
                                          [p for p in amp.master_params(optimizer1)]

                          # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {} which_model {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers, which_model))

                          for model, master, reference in zip(
                                  model_params,
                                  master_params,
                                  final_params[what_got_skipped(inject_inf, which_backward, which_model)]):
                              self.assertTrue(torch.allclose(model, reference))
                              self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                          if opt_level == "O1":
                              _amp_state.handle._deactivate()
Example #10
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if 'adamw' in opt_lower or 'radam' in opt_lower:
        # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight decay.
        # They do not appear to follow the paper or the original Torch7 impl, which schedule weight
        # decay based on the ratio of current_lr / initial_lr.
        weight_decay /= args.lr
    if weight_decay and filter_bias_and_bn:
        print("has weight decay and filter bias")
        parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.
    else:
        print("Comes here to unfrozen params inside optim")

        parameters = unfrozen_params(model)

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(
        ), 'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=weight_decay,
                              nesterov=True)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=weight_decay,
                              nesterov=False)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters,
                               lr=args.lr,
                               weight_decay=weight_decay,
                               eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters,
                                   lr=args.lr,
                                   weight_decay=weight_decay,
                                   eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters,
                                  lr=args.lr,
                                  alpha=0.9,
                                  eps=args.opt_eps,
                                  momentum=args.momentum,
                                  weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters,
                              lr=args.lr,
                              alpha=0.9,
                              eps=args.opt_eps,
                              momentum=args.momentum,
                              weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters,
                             lr=args.lr,
                             weight_decay=weight_decay,
                             eps=args.opt_eps)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters,
                               lr=args.lr,
                               weight_decay=weight_decay,
                               eps=args.opt_eps)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters,
                             lr=args.lr,
                             momentum=args.momentum,
                             weight_decay=weight_decay,
                             nesterov=True)
    elif opt_lower == 'fusedmomentum':
        print("my optimizer")
        optimizer = FusedSGD(parameters,
                             lr=args.lr,
                             momentum=args.momentum,
                             weight_decay=weight_decay,
                             nesterov=False)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters,
                              lr=args.lr,
                              adam_w_mode=False,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters,
                              lr=args.lr,
                              adam_w_mode=True,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters,
                              lr=args.lr,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusednovograd':
        optimizer = FusedNovoGrad(parameters,
                                  lr=args.lr,
                                  betas=(0.95, 0.98),
                                  weight_decay=weight_decay,
                                  eps=args.opt_eps)
    else:
        raise ValueError("Invalid optimizer: {}".format(opt_lower))

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
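An illustrative args namespace for the factory above; the attribute names (opt, lr, weight_decay, momentum, opt_eps) mirror what the function reads and are not a real command line:

from types import SimpleNamespace

import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8))
args = SimpleNamespace(opt='sgd', lr=0.05, weight_decay=1e-4,
                       momentum=0.9, opt_eps=1e-8)
optimizer = create_optimizer(args, model)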
Example #11
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if weight_decay and filter_bias_and_bn:
        skip = {}
        if hasattr(model, 'no_weight_decay'):
            skip = model.no_weight_decay()
        parameters = add_weight_decay(model, weight_decay, skip)
        weight_decay = 0.
    else:
        parameters = model.parameters()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(
        ), 'APEX and CUDA required for fused optimizers'

    opt_args = dict(lr=args.lr, weight_decay=weight_decay)
    if hasattr(args, 'opt_eps') and args.opt_eps is not None:
        opt_args['eps'] = args.opt_eps
    if hasattr(args, 'opt_betas') and args.opt_betas is not None:
        opt_args['betas'] = args.opt_betas
    if hasattr(args, 'opt_args') and args.opt_args is not None:
        opt_args.update(args.opt_args)

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters,
                              momentum=args.momentum,
                              nesterov=True,
                              **opt_args)
    elif opt_lower == 'momentum':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters,
                              momentum=args.momentum,
                              nesterov=False,
                              **opt_args)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, **opt_args)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, **opt_args)
    elif opt_lower == 'adamp':
        # ================================
        # optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args)

        print(' ')
        print('Gradient centralization is enabled for AdamP optimizer.')
        print(' ')

        optimizer = AdamP(parameters,
                          wd_ratio=0.01,
                          nesterov=True,
                          use_gc=True,
                          gc_conv_only=True,
                          gc_loc=False,
                          **opt_args)
        # ================================
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters,
                         momentum=args.momentum,
                         nesterov=True,
                         **opt_args)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, **opt_args)
    elif opt_lower == 'adafactor':
        if not args.lr:
            opt_args['lr'] = None
        optimizer = Adafactor(parameters, **opt_args)
    elif opt_lower == 'adahessian':
        optimizer = Adahessian(parameters, **opt_args)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters,
                                  alpha=0.9,
                                  momentum=args.momentum,
                                  **opt_args)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters,
                              alpha=0.9,
                              momentum=args.momentum,
                              **opt_args)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, **opt_args)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, **opt_args)
    elif opt_lower == 'fusedsgd':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters,
                             momentum=args.momentum,
                             nesterov=True,
                             **opt_args)
    elif opt_lower == 'fusedmomentum':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters,
                             momentum=args.momentum,
                             nesterov=False,
                             **opt_args)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, **opt_args)
    elif opt_lower == 'fusednovograd':
        opt_args.setdefault('betas', (0.95, 0.98))
        optimizer = FusedNovoGrad(parameters, **opt_args)
    else:
        raise ValueError("Invalid optimizer: {}".format(opt_lower))

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
Example #12
def create_optimizer_v2(
        model: nn.Module,
        optimizer_name: str = 'sgd',
        learning_rate: Optional[float] = None,
        weight_decay: float = 0.,
        momentum: float = 0.9,
        filter_bias_and_bn: bool = True,
        **kwargs):
    """ Create an optimizer.

    TODO currently the model is passed in and all parameters are selected for optimization.
    For more general use, add an interface that allows selection of parameters to optimize and lr groups, one of:
      * a filter fn interface that further breaks params into groups in a weight_decay compatible fashion
      * expose the parameters interface and leave it up to caller

    Args:
        model (nn.Module): model containing parameters to optimize
        optimizer_name: name of optimizer to create
        learning_rate: initial learning rate
        weight_decay: weight decay to apply in optimizer
        momentum:  momentum for momentum based optimizers (others may use betas via kwargs)
        filter_bias_and_bn:  filter out bias, bn and other 1d params from weight decay
        **kwargs: extra optimizer specific kwargs to pass through

    Returns:
        Optimizer
    """
    opt_lower = optimizer_name.lower()
    if weight_decay and filter_bias_and_bn:
        skip = {}
        if hasattr(model, 'no_weight_decay'):
            skip = model.no_weight_decay()
        parameters = add_weight_decay(model, weight_decay, skip)
        weight_decay = 0.
    else:
        parameters = model.parameters()
    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'

    opt_args = dict(lr=learning_rate, weight_decay=weight_decay, **kwargs)
    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args)
    elif opt_lower == 'momentum':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args) 
    elif opt_lower == 'adabelief':
        optimizer = AdaBelief(parameters, rectify = False, print_change_log = False,**opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, **opt_args)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, **opt_args)
    elif opt_lower == 'adamp':        
        optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters, momentum=momentum, nesterov=True, **opt_args)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, **opt_args)
    elif opt_lower == 'adafactor':
        if not learning_rate:
            opt_args['lr'] = None
        optimizer = Adafactor(parameters, **opt_args)
    elif opt_lower == 'adahessian':
        optimizer = Adahessian(parameters, **opt_args)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=momentum, **opt_args)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters, alpha=0.9, momentum=momentum, **opt_args)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, **opt_args)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, **opt_args)
    elif opt_lower == 'fusedsgd':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args)
    elif opt_lower == 'fusedmomentum':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, **opt_args)
    elif opt_lower == 'fusednovograd':
        opt_args.setdefault('betas', (0.95, 0.98))
        optimizer = FusedNovoGrad(parameters, **opt_args)
    else:
        raise ValueError("Invalid optimizer: {}".format(opt_lower))

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
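A hypothetical call to this create_optimizer_v2 variant; the model is a throwaway example and the argument names simply follow the signature above:

import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
optimizer = create_optimizer_v2(model, optimizer_name='adamw',
                                learning_rate=1e-3, weight_decay=0.05)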
Example #13
def create_optimizer_v2(model_or_params,
                        opt: str = 'sgd',
                        lr: Optional[float] = None,
                        weight_decay: float = 0.,
                        momentum: float = 0.9,
                        filter_bias_and_bn: bool = True,
                        layer_decay: Optional[float] = None,
                        param_group_fn: Optional[Callable] = None,
                        **kwargs):
    """ Create an optimizer.

    TODO currently the model is passed in and all parameters are selected for optimization.
    For more general use, add an interface that allows selection of parameters to optimize and lr groups, one of:
      * a filter fn interface that further breaks params into groups in a weight_decay compatible fashion
      * expose the parameters interface and leave it up to caller

    Args:
        model_or_params (nn.Module or iterable): model containing parameters to optimize, or an iterable of parameters / param groups
        opt: name of optimizer to create
        lr: initial learning rate
        weight_decay: weight decay to apply in optimizer
        momentum: momentum for momentum based optimizers (others may use betas via kwargs)
        filter_bias_and_bn: filter out bias, bn and other 1d params from weight decay
        layer_decay: layer-wise learning rate decay factor applied when building param groups
        param_group_fn: optional callable that takes the model and returns the param groups to optimize
        **kwargs: extra optimizer specific kwargs to pass through

    Returns:
        Optimizer
    """
    if isinstance(model_or_params, nn.Module):
        # a model was passed in, extract parameters and add weight decays to appropriate layers
        no_weight_decay = {}
        if hasattr(model_or_params, 'no_weight_decay'):
            no_weight_decay = model_or_params.no_weight_decay()

        if param_group_fn:
            parameters = param_group_fn(model_or_params)
        elif layer_decay is not None:
            parameters = param_groups_layer_decay(
                model_or_params,
                weight_decay=weight_decay,
                layer_decay=layer_decay,
                no_weight_decay_list=no_weight_decay)
            weight_decay = 0.
        elif weight_decay and filter_bias_and_bn:
            parameters = param_groups_weight_decay(model_or_params,
                                                   weight_decay,
                                                   no_weight_decay)
            weight_decay = 0.
        else:
            parameters = model_or_params.parameters()
    else:
        # iterable of parameters or param groups passed in
        parameters = model_or_params

    opt_lower = opt.lower()
    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(
        ), 'APEX and CUDA required for fused optimizers'

    opt_args = dict(weight_decay=weight_decay, **kwargs)
    if lr is not None:
        opt_args.setdefault('lr', lr)

    # basic SGD & related
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters,
                              momentum=momentum,
                              nesterov=True,
                              **opt_args)
    elif opt_lower == 'momentum':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters,
                              momentum=momentum,
                              nesterov=False,
                              **opt_args)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters,
                         momentum=momentum,
                         nesterov=True,
                         **opt_args)

    # adaptive
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args)
    elif opt_lower == 'nadam':
        try:
            # NOTE PyTorch >= 1.10 should have native NAdam
            optimizer = optim.NAdam(parameters, **opt_args)
        except AttributeError:
            optimizer = Nadam(parameters, **opt_args)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, **opt_args)
    elif opt_lower == 'adamax':
        optimizer = optim.Adamax(parameters, **opt_args)
    elif opt_lower == 'adabelief':
        optimizer = AdaBelief(parameters, rectify=False, **opt_args)
    elif opt_lower == 'radabelief':
        optimizer = AdaBelief(parameters, rectify=True, **opt_args)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, **opt_args)
    elif opt_lower == 'adagrad':
        opt_args.setdefault('eps', 1e-8)
        optimizer = optim.Adagrad(parameters, **opt_args)
    elif opt_lower == 'adafactor':
        optimizer = Adafactor(parameters, **opt_args)
    elif opt_lower == 'lamb':
        optimizer = Lamb(parameters, **opt_args)
    elif opt_lower == 'lambc':
        optimizer = Lamb(parameters, trust_clip=True, **opt_args)
    elif opt_lower == 'larc':
        optimizer = Lars(parameters,
                         momentum=momentum,
                         trust_clip=True,
                         **opt_args)
    elif opt_lower == 'lars':
        optimizer = Lars(parameters, momentum=momentum, **opt_args)
    elif opt_lower == 'nlarc':
        optimizer = Lars(parameters,
                         momentum=momentum,
                         trust_clip=True,
                         nesterov=True,
                         **opt_args)
    elif opt_lower == 'nlars':
        optimizer = Lars(parameters,
                         momentum=momentum,
                         nesterov=True,
                         **opt_args)
    elif opt_lower == 'madgrad':
        optimizer = MADGRAD(parameters, momentum=momentum, **opt_args)
    elif opt_lower == 'madgradw':
        optimizer = MADGRAD(parameters,
                            momentum=momentum,
                            decoupled_decay=True,
                            **opt_args)
    elif opt_lower == 'novograd' or opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, **opt_args)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters,
                                  alpha=0.9,
                                  momentum=momentum,
                                  **opt_args)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters,
                              alpha=0.9,
                              momentum=momentum,
                              **opt_args)

    # second order
    elif opt_lower == 'adahessian':
        optimizer = Adahessian(parameters, **opt_args)

    # NVIDIA fused optimizers, require APEX to be installed
    elif opt_lower == 'fusedsgd':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters,
                             momentum=momentum,
                             nesterov=True,
                             **opt_args)
    elif opt_lower == 'fusedmomentum':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters,
                             momentum=momentum,
                             nesterov=False,
                             **opt_args)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, **opt_args)
    elif opt_lower == 'fusednovograd':
        opt_args.setdefault('betas', (0.95, 0.98))
        optimizer = FusedNovoGrad(parameters, **opt_args)

    else:
        raise ValueError("Invalid optimizer: {}".format(opt_lower))

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
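Two hypothetical calls against the newer create_optimizer_v2 signature above: one passes a module (so weight-decay param groups are built internally), the other passes raw parameters and uses the 'lookahead_' prefix handled at the end of the function:

import torch.nn as nn

model = nn.Linear(32, 10)

# pass a module; bias/1d params are filtered out of weight decay internally
opt1 = create_optimizer_v2(model, opt='lamb', lr=2e-3, weight_decay=0.02)

# pass an iterable of parameters and wrap the result in Lookahead
opt2 = create_optimizer_v2(model.parameters(), opt='lookahead_adamw', lr=1e-3)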
Example #14
    def test_2models2losses1optimizer(self):
        model0 = MyModel(1)
        model1 = MyModel(2)

        optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                     {'params' : model1.parameters(), 'lr' : 0.5}],
                                    momentum=0.125)

        reference_grads = []
        for i in range(2):
            optimizer.zero_grad()
            loss0 = model0(self.x)
            loss1 = model1(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads.append([param.grad.data.clone() for param in model0.parameters()] +
                                   [param.grad.data.clone() for param in model1.parameters()])

            optimizer.step()

        final_params = [param.data.clone() for param in model0.parameters()] + \
                       [param.data.clone() for param in model1.parameters()]

        for materialize_master_grads in (False, True):
          for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
              for use_multiple_loss_scalers in (False, True):
                if opt_level == "O1" or opt_level == "O2":
                    inject_inf_iters = (-1, 0, 1)
                else:
                    inject_inf_iters = (-1,)

                for inject_inf in inject_inf_iters:
                  if inject_inf >= 0:
                     inject_inf_locs = ("fp16", "fp32")
                     which_backwards = (0, 1)
                  else:
                     inject_inf_locs = ("fdsa",)
                     which_backwards = (None,)

                  for inject_inf_loc in inject_inf_locs:
                    for which_backward in which_backwards:
                        if use_multiple_loss_scalers:
                            num_losses = 2
                            loss_ids = [0, 1]
                        else:
                            num_losses = 1
                            loss_ids = [0, 0]

                        if inject_inf >= 0:
                            iters = 3
                        else:
                            iters = 2

                        model0 = MyModel(1)
                        model1 = MyModel(2)

                        models = [model0, model1]

                        optimizer = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                              {'params' : model1.parameters(), 'lr' : 0.5}],
                                             momentum=0.125,
                                             materialize_master_grads=materialize_master_grads)

                        _amp_state.allow_incoming_model_not_fp32 = True
                        [model0, model1], optimizer = amp.initialize(
                            [model0, model1],
                            optimizer,
                            opt_level=opt_level,
                            verbosity=0,
                            cast_model_type=False,
                            num_losses=num_losses)
                        _amp_state.allow_incoming_model_not_fp32 = False

                        _amp_state.loss_scalers[0]._loss_scale = 4.0
                        if use_multiple_loss_scalers:
                            _amp_state.loss_scalers[1]._loss_scale = 16.0

                        unskipped = 0
                        for i in range(iters):
                            if how_to_zero == "none":
                                for model in models:
                                    for param in model.parameters():
                                        param.grad = None
                            elif how_to_zero == "model":
                                for model in models:
                                    model.zero_grad()
                            else:
                                optimizer.zero_grad()

                            loss0 = model0(self.x)
                            loss1 = model1(self.x)

                            with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss:
                                scaled_loss.backward()
                                if i == inject_inf and which_backward == 0:
                                    if inject_inf_loc == "fp32":
                                        model0.weight0.grad[0] = float('inf')
                                    elif inject_inf_loc == "fp16":
                                        model0.weight1.grad[0] = float('inf')
                            with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss:
                                scaled_loss.backward()
                                if i == inject_inf and which_backward == 1:
                                    if inject_inf_loc == "fp32":
                                        model1.weight0.grad[0] = float('inf')
                                    elif inject_inf_loc == "fp16":
                                        model1.weight1.grad[0] = float('inf')

                            if i != inject_inf:
                                master_params = amp.master_params(optimizer)
                                for param, reference_grad in zip(master_params, reference_grads[unskipped]):
                                    if opt_level == "O2" and not materialize_master_grads:
                                        continue
                                    else:
                                        self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()),
                                                        "opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers))
                                unskipped += 1
                            optimizer.step()

                        model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()]
                        for model, master, reference in zip(
                                model_params,
                                amp.master_params(optimizer),
                                final_params):
                            self.assertTrue(torch.allclose(model, reference))
                            self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                        if opt_level == "O1":
                            _amp_state.handle._deactivate()
Example #15
def create_optimizer_param(args, parameters):
    opt_lower = args.opt.lower()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), \
            'APEX and CUDA required for fused optimizers'

    opt_args = dict(lr=args.lr, weight_decay=args.weight_decay)
    if hasattr(args, 'opt_eps') and args.opt_eps is not None:
        opt_args['eps'] = args.opt_eps
    if hasattr(args, 'opt_betas') and args.opt_betas is not None:
        opt_args['betas'] = args.opt_betas
    if hasattr(args, 'opt_args') and args.opt_args is not None:
        opt_args.update(args.opt_args)

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters,
                              momentum=args.momentum,
                              nesterov=True,
                              **opt_args)
    elif opt_lower == 'momentum':
        opt_args.pop('eps', None)
        optimizer = optim.SGD(parameters,
                              momentum=args.momentum,
                              nesterov=False,
                              **opt_args)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters, **opt_args)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters, **opt_args)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters,
                         momentum=args.momentum,
                         nesterov=True,
                         **opt_args)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters, **opt_args)
    elif opt_lower == 'adafactor':
        if not args.lr:
            opt_args['lr'] = None
        optimizer = Adafactor(parameters, **opt_args)
    elif opt_lower == 'adahessian':
        optimizer = Adahessian(parameters, **opt_args)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters,
                                  alpha=0.9,
                                  momentum=args.momentum,
                                  **opt_args)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters,
                              alpha=0.9,
                              momentum=args.momentum,
                              **opt_args)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters, **opt_args)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters, **opt_args)
    elif opt_lower == 'fusedsgd':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters,
                             momentum=args.momentum,
                             nesterov=True,
                             **opt_args)
    elif opt_lower == 'fusedmomentum':
        opt_args.pop('eps', None)
        optimizer = FusedSGD(parameters,
                             momentum=args.momentum,
                             nesterov=False,
                             **opt_args)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, **opt_args)
    elif opt_lower == 'fusednovograd':
        opt_args.setdefault('betas', (0.95, 0.98))
        optimizer = FusedNovoGrad(parameters, **opt_args)
    else:
        assert False, "Invalid optimizer"
        raise ValueError

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
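
A minimal usage sketch for create_optimizer_param above. The argparse fields and the model are placeholders inferred from the attributes the function reads; the 'lookahead_' prefix wraps the base optimizer in Lookahead, and any 'fused*' name additionally requires apex and CUDA.

from types import SimpleNamespace
import torch.nn as nn

model = nn.Linear(16, 4)

# hypothetical args namespace; only the attributes create_optimizer_param reads are set
args = SimpleNamespace(opt='lookahead_adamw', lr=1e-3, weight_decay=0.05,
                       momentum=0.9, opt_eps=1e-8, opt_betas=None, opt_args=None)

optimizer = create_optimizer_param(args, model.parameters())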
Example #16
def create_optimizer(args, model, filter_bias_and_bn=True, freeze_stage=""):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if 'adamw' in opt_lower or 'radam' in opt_lower:
        # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay
        # I don't believe they follow the paper or original Torch7 impl which schedules weight
        # decay based on the ratio of current_lr/initial_lr
        weight_decay /= args.lr
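        # e.g. weight_decay=1e-5 with lr=1e-3 yields 1e-5 / 1e-3 = 1e-2 passed to the optimizer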
    if weight_decay and filter_bias_and_bn:
        if freeze_stage == "stage1":
            stage1_train_attn(model, layer_names=['fc'])
            print('stage1: layers frozen successfully')
        if freeze_stage == "stage2":
            stage1_train_attn(model,
                              layer_names=['layer3', 'layer4', 'se', 'fc'])
            stage2_train_layer4(model)
            print('stage2: layers frozen successfully')
        # apply weight decay only to the layers that are not frozen
        parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.
    else:
        parameters = model.parameters()

    for name, param in model.named_parameters():
        print(name, param.requires_grad)

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), \
            'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=weight_decay,
                              nesterov=True)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=weight_decay,
                              nesterov=False)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters,
                               lr=args.lr,
                               weight_decay=weight_decay,
                               eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps,
                          delta=0.1,
                          wd_ratio=0.01,
                          nesterov=True)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters,
                         lr=args.lr,
                         momentum=args.momentum,
                         weight_decay=weight_decay,
                         eps=args.opt_eps,
                         nesterov=True)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters,
                                   lr=args.lr,
                                   weight_decay=weight_decay,
                                   eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters,
                                  lr=args.lr,
                                  alpha=0.9,
                                  eps=args.opt_eps,
                                  momentum=args.momentum,
                                  weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters,
                              lr=args.lr,
                              alpha=0.9,
                              eps=args.opt_eps,
                              momentum=args.momentum,
                              weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters,
                             lr=args.lr,
                             weight_decay=weight_decay,
                             eps=args.opt_eps)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters,
                               lr=args.lr,
                               weight_decay=weight_decay,
                               eps=args.opt_eps)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters,
                             lr=args.lr,
                             momentum=args.momentum,
                             weight_decay=weight_decay,
                             nesterov=True)
    elif opt_lower == 'fusedmomentum':
        optimizer = FusedSGD(parameters,
                             lr=args.lr,
                             momentum=args.momentum,
                             weight_decay=weight_decay,
                             nesterov=False)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters,
                              lr=args.lr,
                              adam_w_mode=False,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters,
                              lr=args.lr,
                              adam_w_mode=True,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters,
                              lr=args.lr,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusednovograd':
        optimizer = FusedNovoGrad(parameters,
                                  lr=args.lr,
                                  betas=(0.95, 0.98),
                                  weight_decay=weight_decay,
                                  eps=args.opt_eps)
    else:
        assert False, "Invalid optimizer"
        raise ValueError

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
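
The add_weight_decay helper called above is not shown in this example. A minimal sketch of the timm-style grouping it appears to expect, which is also why weight_decay is reset to 0 before the optimizer is constructed:

def add_weight_decay(model, weight_decay=1e-5, skip=()):
    # biases and 1-D (norm) parameters go into a no-decay group; the decay is baked into
    # the parameter groups here, so the caller passes weight_decay=0 to the optimizer itself
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if len(param.shape) == 1 or name.endswith('.bias') or name in skip:
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {'params': no_decay, 'weight_decay': 0.},
        {'params': decay, 'weight_decay': weight_decay},
    ]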
Example #17
def create_optimizer(args, model, filter_bias_and_bn=True):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if weight_decay and filter_bias_and_bn:
        skip = {}
        if hasattr(model, 'no_weight_decay'):
            skip = model.no_weight_decay()
        parameters = add_weight_decay(model, weight_decay, skip)
        weight_decay = 0.
    else:
        parameters = model.parameters()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), \
            'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]

    opt_args = dict(lr=args.lr, weight_decay=weight_decay)
    if (hasattr(args, 'opt_eps') and args.opt_eps is not None
            and opt_lower not in ['sgd', 'momentum', 'fusedmomentum', 'fusedsgd']):
        opt_args['eps'] = args.opt_eps
    if hasattr(args, 'opt_betas') and args.opt_betas is not None:
        opt_args['betas'] = args.opt_betas

    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters,
                              momentum=args.momentum,
                              nesterov=True,
                              **opt_args)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters,
                              momentum=args.momentum,
                              nesterov=False,
                              **opt_args)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters, **opt_args)
    elif opt_lower == 'adamw':
        optimizer = optim.AdamW(parameters, **opt_args)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters,
                             momentum=args.momentum,
                             nesterov=True,
                             **opt_args)
    elif opt_lower == 'fusedmomentum':
        optimizer = FusedSGD(parameters,
                             momentum=args.momentum,
                             nesterov=False,
                             **opt_args)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters, **opt_args)
    elif opt_lower == 'fusednovograd':
        opt_args.setdefault('betas', (0.95, 0.98))
        optimizer = FusedNovoGrad(parameters, **opt_args)
    else:
        assert False, "Invalid optimizer"
        raise ValueError

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
Example #18
def get_optimizer(
    model: nn.Module,
    optimizer_name: str,
    learning_rate: float,
    weight_decay: float = 1e-5,
    no_weight_decay_on_bias: bool = False,
    eps: float = 1e-5,
    **kwargs,
) -> Optimizer:
    """
    Construct an Optimizer for given model
    Args:
        model: Model to optimize. Only parameters with requires_grad=True will be used
        optimizer_name: Name of the optimizer. Case-insensitive
        learning_rate: Target learning rate (regardless of the scheduler)
        weight_decay: Target weight decay
        no_weight_decay_on_bias: Whether to disable weight decay on bias parameters
        eps: Default epsilon for Adam-like optimizers.
        **kwargs: Additional parameters for optimizer

    Returns:
        The configured Optimizer instance.
    """
    from torch.optim import ASGD, SGD, Adam, RMSprop, AdamW
    from torch_optimizer import RAdam, Lamb, DiffGrad, NovoGrad, Ranger

    # Optimizer parameter groups
    default_pg, biases_pg = [], []

    for k, v in model.named_parameters():
        if v.requires_grad:
            if str.endswith(k, ".bias"):
                biases_pg.append(v)  # biases
            else:
                default_pg.append(v)  # all else

    if no_weight_decay_on_bias:
        parameters = default_pg
    else:
        parameters = default_pg + biases_pg

    optimizer: Optimizer = None

    if optimizer_name.lower() == "sgd":
        optimizer = SGD(
            parameters,
            lr=learning_rate,
            momentum=0.9,
            nesterov=True,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "asgd":
        optimizer = ASGD(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "adam":
        optimizer = Adam(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "rms":
        optimizer = RMSprop(parameters,
                            learning_rate,
                            weight_decay=weight_decay,
                            **kwargs)
    elif optimizer_name.lower() == "adamw":
        optimizer = AdamW(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "radam":
        optimizer = RAdam(
            parameters,
            lr=learning_rate,
            weight_decay=weight_decay,
            eps=eps,
            **kwargs,
        )
    elif optimizer_name.lower() == "ranger":
        optimizer = Ranger(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "lamb":
        optimizer = Lamb(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "diffgrad":
        optimizer = DiffGrad(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "novograd":
        optimizer = NovoGrad(
            parameters,
            lr=learning_rate,
            eps=eps,
            weight_decay=weight_decay,
            **kwargs,
        )
    elif optimizer_name.lower() == "fused_lamb":
        from apex.optimizers import FusedLAMB

        optimizer = FusedLAMB(parameters,
                              learning_rate,
                              eps=eps,
                              weight_decay=weight_decay,
                              **kwargs)
    elif optimizer_name.lower() == "fused_sgd":
        from apex.optimizers import FusedSGD

        optimizer = FusedSGD(parameters,
                             learning_rate,
                             momentum=0.9,
                             nesterov=True,
                             weight_decay=weight_decay,
                             **kwargs)
    elif optimizer_name.lower() == "fused_adam":
        from apex.optimizers import FusedAdam

        optimizer = FusedAdam(parameters,
                              learning_rate,
                              eps=eps,
                              weight_decay=weight_decay,
                              adam_w_mode=True,
                              **kwargs)
    else:
        raise KeyError(f"Cannot get optimizer by name {optimizer_name}")

    # Currently either no_wd or per-group lr
    if no_weight_decay_on_bias:
        optimizer.add_param_group({"params": biases_pg, "weight_decay": 0})

    return optimizer
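
A short usage sketch for get_optimizer above. The model is a placeholder, and the call assumes the torch_optimizer package imported inside the function is installed:

import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))

# with no_weight_decay_on_bias=True, biases land in a separate param group with weight_decay=0
optimizer = get_optimizer(model, "adamw", learning_rate=3e-4,
                          weight_decay=1e-4, no_weight_decay_on_bias=True)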
Example #19
def create_optimizer(args,
                     model,
                     filter_bias_and_bn=True,
                     classification_layer_name=None):
    opt_lower = args.opt.lower()
    weight_decay = args.weight_decay
    if 'adamw' in opt_lower or 'radam' in opt_lower:
        # Compensate for the way current AdamW and RAdam optimizers apply LR to the weight-decay
        # I don't believe they follow the paper or original Torch7 impl which schedules weight
        # decay based on the ratio of current_lr/initial_lr
        weight_decay /= args.lr

    if weight_decay and filter_bias_and_bn:  # batch norm and bias params
        if classification_layer_name is not None:
            parameters = set_lr_per_params(args, model,
                                           classification_layer_name,
                                           weight_decay)
        else:
            parameters = add_weight_decay(model, weight_decay)
        weight_decay = 0.  # reset to 0
    else:
        if classification_layer_name is not None:
            parameters = set_lr_per_params(args,
                                           model,
                                           classification_layer_name,
                                           weight_decay=0)
        else:
            parameters = model.parameters()

    if 'fused' in opt_lower:
        assert has_apex and torch.cuda.is_available(), \
            'APEX and CUDA required for fused optimizers'

    opt_split = opt_lower.split('_')
    opt_lower = opt_split[-1]
    if opt_lower == 'sgd' or opt_lower == 'nesterov':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=weight_decay,
                              nesterov=True)
    elif opt_lower == 'momentum':
        optimizer = optim.SGD(parameters,
                              lr=args.lr,
                              momentum=args.momentum,
                              weight_decay=weight_decay,
                              nesterov=False)
    elif opt_lower == 'adam':
        optimizer = optim.Adam(parameters,
                               lr=args.lr,
                               weight_decay=weight_decay,
                               eps=args.opt_eps)
    elif opt_lower == 'adamw':
        optimizer = AdamW(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'nadam':
        optimizer = Nadam(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'radam':
        optimizer = RAdam(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps)
    elif opt_lower == 'adamp':
        optimizer = AdamP(parameters,
                          lr=args.lr,
                          weight_decay=weight_decay,
                          eps=args.opt_eps,
                          delta=0.1,
                          wd_ratio=0.01,
                          nesterov=True)
    elif opt_lower == 'sgdp':
        optimizer = SGDP(parameters,
                         lr=args.lr,
                         momentum=args.momentum,
                         weight_decay=weight_decay,
                         eps=args.opt_eps,
                         nesterov=True)
    elif opt_lower == 'adadelta':
        optimizer = optim.Adadelta(parameters,
                                   lr=args.lr,
                                   weight_decay=weight_decay,
                                   eps=args.opt_eps)
    elif opt_lower == 'rmsprop':
        optimizer = optim.RMSprop(parameters,
                                  lr=args.lr,
                                  alpha=0.9,
                                  eps=args.opt_eps,
                                  momentum=args.momentum,
                                  weight_decay=weight_decay)
    elif opt_lower == 'rmsproptf':
        optimizer = RMSpropTF(parameters,
                              lr=args.lr,
                              alpha=0.9,
                              eps=args.opt_eps,
                              momentum=args.momentum,
                              weight_decay=weight_decay)
    elif opt_lower == 'novograd':
        optimizer = NovoGrad(parameters,
                             lr=args.lr,
                             weight_decay=weight_decay,
                             eps=args.opt_eps)
    elif opt_lower == 'nvnovograd':
        optimizer = NvNovoGrad(parameters,
                               lr=args.lr,
                               weight_decay=weight_decay,
                               eps=args.opt_eps)
    elif opt_lower == 'fusedsgd':
        optimizer = FusedSGD(parameters,
                             lr=args.lr,
                             momentum=args.momentum,
                             weight_decay=weight_decay,
                             nesterov=True)
    elif opt_lower == 'fusedmomentum':
        optimizer = FusedSGD(parameters,
                             lr=args.lr,
                             momentum=args.momentum,
                             weight_decay=weight_decay,
                             nesterov=False)
    elif opt_lower == 'fusedadam':
        optimizer = FusedAdam(parameters,
                              lr=args.lr,
                              adam_w_mode=False,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusedadamw':
        optimizer = FusedAdam(parameters,
                              lr=args.lr,
                              adam_w_mode=True,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusedlamb':
        optimizer = FusedLAMB(parameters,
                              lr=args.lr,
                              weight_decay=weight_decay,
                              eps=args.opt_eps)
    elif opt_lower == 'fusednovograd':
        optimizer = FusedNovoGrad(parameters,
                                  lr=args.lr,
                                  betas=(0.95, 0.98),
                                  weight_decay=weight_decay,
                                  eps=args.opt_eps)
    else:
        assert False, "Invalid optimizer"
        raise ValueError

    if len(opt_split) > 1:
        if opt_split[0] == 'lookahead':
            optimizer = Lookahead(optimizer)

    return optimizer
Example #20
    def test_2models2losses2optimizers(self):
        model0 = MyModel(1)
        model1 = MyModel(2)

        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                      momentum=0.125)
        optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                      momentum=0.25)

        # Don't initialize this as reference_grads = [[]]*5:  that creates a list of 5
        # references to the same "[]", so appending to any of them appends to all of them,
        # which multiplies the resulting size of reference_grads by 5x and makes the test fail.
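        # For example:
        #     a = [[]] * 2
        #     a[0].append(1)
        #     # a == [[1], [1]] -- both slots alias the same inner list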
        reference_grads = [[], [], [], [], []]
        final_params = [None, None, None, None, None]
        for i in range(2):
            optimizer0.zero_grad()
            optimizer1.zero_grad()
            loss0 = model0(self.x)
            loss1 = model1(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] +
                                   [param.grad.data.clone() for param in model1.parameters()])

            optimizer0.step()
            optimizer1.step()

        final_params[0] = [param.data.clone() for param in model0.parameters()] + \
                          [param.data.clone() for param in model1.parameters()]

        def what_got_skipped(which_iter, which_backward):
            if which_iter == 0 and which_backward == 0:
                return 1
            if which_iter == 0 and which_backward == 1:
                return 2
            if which_iter == 1 and which_backward == 0:
                return 3
            if which_iter == 1 and which_backward == 1:
                return 4
            return 0

        for which_iter in (0,1):
            for which_backward in (0,1):
                model0 = MyModel(1)
                model1 = MyModel(2)

                optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                              momentum=0.125)
                optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                              momentum=0.25)

                for i in range(3):
                    optimizer0.zero_grad()
                    optimizer1.zero_grad()
                    loss0 = model0(self.x)
                    loss1 = model1(self.x)
                    loss0.backward()
                    loss1.backward()

                    if i != which_iter:
                        reference_grads[what_got_skipped(which_iter, which_backward)].append(
                            [param.grad.data.clone() for param in model0.parameters()] +
                            [param.grad.data.clone() for param in model1.parameters()])

                    if i == which_iter:
                        if which_backward == 0:
                            optimizer1.step()
                        else:
                            optimizer0.step()
                    else:
                        optimizer0.step()
                        optimizer1.step()

                final_params[what_got_skipped(which_iter, which_backward)] = \
                    [param.data.clone() for param in model0.parameters()] + \
                    [param.data.clone() for param in model1.parameters()]

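        # Sweep every combination: FusedSGD master-grad materialization, amp opt_level
        # (O0-O3), how gradients are zeroed, single vs. multiple loss scalers, and, for
        # O1/O2 only, an iteration in which an inf is injected into one of the backwards.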
        for materialize_master_grads in (False, True):
          for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
              for use_multiple_loss_scalers in (False, True):
                if opt_level == "O1" or opt_level == "O2":
                    inject_inf_iters = (-1, 0, 1)
                else:
                    inject_inf_iters = (-1,)

                for inject_inf in inject_inf_iters:
                  if inject_inf >= 0:
                     inject_inf_locs = ("fp16", "fp32")
                     which_backwards = (0, 1)
                  else:
                     inject_inf_locs = ("fdsa",)
                     which_backwards = (None,)

                  for inject_inf_loc in inject_inf_locs:
                    for which_backward in which_backwards:
                        if use_multiple_loss_scalers:
                            num_losses = 2
                            loss_ids = [0, 1]
                        else:
                            num_losses = 1
                            loss_ids = [0, 0]

                        if inject_inf >= 0:
                            iters = 3
                        else:
                            iters = 2

                        model0 = MyModel(1)
                        model1 = MyModel(2)

                        models = [model0, model1]

                        optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                              momentum=0.125, materialize_master_grads=materialize_master_grads)
                        optimizer1 = FusedSGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                              momentum=0.25, materialize_master_grads=materialize_master_grads)

                        _amp_state.allow_incoming_model_not_fp32 = True
                        [model0, model1], [optimizer0, optimizer1] = amp.initialize(
                            [model0, model1],
                            [optimizer0, optimizer1],
                            opt_level=opt_level,
                            verbosity=0,
                            cast_model_type=False,
                            num_losses=num_losses)
                        _amp_state.allow_incoming_model_not_fp32 = False

                        _amp_state.loss_scalers[0]._loss_scale = 4.0
                        if use_multiple_loss_scalers:
                            _amp_state.loss_scalers[1]._loss_scale = 16.0

                        unskipped = 0
                        for i in range(iters):
                            if how_to_zero == "none":
                                for model in models:
                                    for param in model.parameters():
                                        param.grad = None
                            elif how_to_zero == "model":
                                for model in models:
                                    model.zero_grad()
                            else:
                                optimizer0.zero_grad()
                                optimizer1.zero_grad()

                            loss0 = model0(self.x)
                            loss1 = model1(self.x)

                            with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss:
                                scaled_loss.backward()
                                if i == inject_inf and which_backward == 0:
                                    if inject_inf_loc == "fp32":
                                        model0.weight0.grad[0] = float('inf')
                                    elif inject_inf_loc == "fp16":
                                        model0.weight1.grad[0] = float('inf')
                            with amp.scale_loss(loss1, optimizer1, loss_id=loss_ids[1]) as scaled_loss:
                                scaled_loss.backward()
                                if i == inject_inf and which_backward == 1:
                                    if inject_inf_loc == "fp32":
                                        model1.weight0.grad[0] = float('inf')
                                    elif inject_inf_loc == "fp16":
                                        model1.weight1.grad[0] = float('inf')

                            # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers))

                            if i != inject_inf:
                                master_params = list(amp.master_params(optimizer0)) + \
                                                list(amp.master_params(optimizer1))
                                for param, reference_grad in zip(master_params,
                                        reference_grads[what_got_skipped(inject_inf, which_backward)][unskipped]):
                                    if opt_level == "O2" and not materialize_master_grads:
                                        continue
                                    else:
                                        self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
                                unskipped += 1

                            optimizer0.step()
                            optimizer1.step()

                        model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()]
                        master_params = [p for p in amp.master_params(optimizer0)] + \
                                        [p for p in amp.master_params(optimizer1)]
                        for model, master, reference in zip(
                                model_params,
                                master_params,
                                final_params[what_got_skipped(inject_inf, which_backward)]):
                            self.assertTrue(torch.allclose(model, reference))
                            self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                        if opt_level == "O1":
                            _amp_state.handle._deactivate()