```python
def __call__(self, parameters):
    opt = optim.Momentum(
        learning_rate=self.learning_rate,
        momentum=self.momentum,
        weight_decay=self.weight_decay,
        grad_clip=self.grad_clip,
        parameters=parameters)
    return opt
```
```python
def __call__(self, model_list):
    # model_list is None in static graph
    parameters = sum([m.parameters() for m in model_list],
                     []) if model_list else None
    opt = optim.Momentum(
        learning_rate=self.learning_rate,
        momentum=self.momentum,
        weight_decay=self.weight_decay,
        grad_clip=self.grad_clip,
        multi_precision=self.multi_precision,
        parameters=parameters)
    return opt
```
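For context, the sketch below shows how a builder with such a `__call__` might be exercised in dygraph mode. The toy `backbone`/`head` modules and the hyperparameter values are assumptions, not taken from the original code; the point is the `model_list` flattening and the `multi_precision` flag, which keeps FP32 master weights when training with AMP.

```python
import paddle
from paddle import optimizer as optim

# Hypothetical sub-models standing in for a real backbone and head.
backbone = paddle.nn.Linear(16, 8)
head = paddle.nn.Linear(8, 2)
model_list = [backbone, head]

# Dygraph mode: flatten the per-model parameter lists into one list.
# In static graph mode model_list is None, parameters stays None, and the
# optimizer picks up the program's parameters by itself.
parameters = sum([m.parameters() for m in model_list],
                 []) if model_list else None

opt = optim.Momentum(
    learning_rate=0.1,
    momentum=0.9,
    weight_decay=1e-4,
    grad_clip=None,
    multi_precision=True,  # keep FP32 master weights for AMP training
    parameters=parameters)
```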
```python
DISP_FREQ = len(train_loader)  # frequency to display training loss & acc
NUM_EPOCH_WARM_UP = NUM_EPOCH // 25  # use the first 1/25 epochs to warm up
NUM_BATCH_WARM_UP = len(train_loader) * NUM_EPOCH_WARM_UP  # number of warm-up batches

scheduler = paddle.optimizer.lr.LinearWarmup(
    learning_rate=LR,
    warmup_steps=NUM_BATCH_WARM_UP,
    start_lr=LR / 2,
    end_lr=LR,
    verbose=True)
clip = paddle.nn.ClipGradByValue(min=-CLIP, max=CLIP)
strategy = fleet.DistributedStrategy()

# Parameters without batch norm are regularized with weight decay;
# batch-norm parameters get a separate optimizer without weight decay.
OPTIMIZER_decay = optim.Momentum(
    parameters=backbone_paras_wo_bn + head_paras_wo_bn,
    learning_rate=scheduler,
    weight_decay=WEIGHT_DECAY,
    momentum=MOMENTUM)
OPTIMIZER_decay = fleet.distributed_optimizer(optimizer=OPTIMIZER_decay, strategy=strategy)

OPTIMIZER = optim.Momentum(
    parameters=backbone_paras_only_bn,
    learning_rate=scheduler,
    momentum=MOMENTUM)
OPTIMIZER = fleet.distributed_optimizer(optimizer=OPTIMIZER, strategy=strategy)

BACKBONE = fleet.distributed_model(BACKBONE)
HEAD = fleet.distributed_model(HEAD)

logger.info("=" * 60)
logger.info(OPTIMIZER)
logger.info("Optimizer Generated")
logger.info("=" * 60)
```
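As a hedged sketch (not taken from the original script), the loop below illustrates how these objects would typically be driven: both fleet-wrapped optimizers step on the same loss, their gradients are cleared separately, and the warm-up scheduler advances once per iteration so both optimizers see the same warmed-up learning rate. The head's call signature and the cross-entropy loss are assumptions about the surrounding training code.

```python
import paddle.nn.functional as F

for inputs, labels in train_loader:
    features = BACKBONE(inputs)
    outputs = HEAD(features, labels)   # assumed margin-based head taking labels
    loss = F.cross_entropy(outputs, labels)

    loss.backward()
    OPTIMIZER_decay.step()             # non-BN parameters, with weight decay
    OPTIMIZER.step()                   # BN parameters, without weight decay
    OPTIMIZER_decay.clear_grad()
    OPTIMIZER.clear_grad()
    scheduler.step()                   # advance LinearWarmup once per batch
```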