def __init__(
    self,
    *,
    latent_dim,
    image_size,
    optimizer="adam",
    fmap_max=512,
    fmap_inverse_coef=12,
    transparent=False,
    disc_output_size=5,
    attn_res_layers=[],
    sle_spatial=False,
    ttur_mult=1.,
    lr=2e-4,
    rank=0,
    ddp=False
):
    super().__init__()
    self.latent_dim = latent_dim
    self.image_size = image_size

    G_kwargs = dict(
        image_size=image_size,
        latent_dim=latent_dim,
        fmap_max=fmap_max,
        fmap_inverse_coef=fmap_inverse_coef,
        transparent=transparent,
        attn_res_layers=attn_res_layers,
        sle_spatial=sle_spatial
    )

    self.G = Generator(**G_kwargs)

    self.D = Discriminator(
        image_size=image_size,
        fmap_max=fmap_max,
        fmap_inverse_coef=fmap_inverse_coef,
        transparent=transparent,
        attn_res_layers=attn_res_layers,
        disc_output_size=disc_output_size
    )

    self.ema_updater = EMA(0.995)
    self.GE = Generator(**G_kwargs)
    set_requires_grad(self.GE, False)

    if optimizer == "adam":
        self.G_opt = Adam(self.G.parameters(), lr=lr, betas=(0.5, 0.9))
        self.D_opt = Adam(self.D.parameters(), lr=lr * ttur_mult, betas=(0.5, 0.9))
    elif optimizer == "adabelief":
        self.G_opt = AdaBelief(self.G.parameters(), lr=lr, betas=(0.5, 0.9))
        self.D_opt = AdaBelief(self.D.parameters(), lr=lr * ttur_mult, betas=(0.5, 0.9))
    else:
        assert False, "No valid optimizer is given"

    self.apply(self._init_weights)
    self.reset_parameter_averaging()

    self.cuda(rank)
    self.D_aug = AugWrapper(self.D, image_size)
def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.norm_clip = args.norm_clip

    self.online_net = DQN(args, self.action_space).to(device=args.device)
    if args.model:  # Load pretrained model if provided
        if os.path.isfile(args.model):
            state_dict = torch.load(args.model, map_location='cpu')  # Always load tensors onto CPU by default, will shift to GPU if necessary
            if 'conv1.weight' in state_dict.keys():
                for old_key, new_key in (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'),
                                         ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'),
                                         ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias')):
                    state_dict[new_key] = state_dict[old_key]  # Re-map state dict for old pretrained models
                    del state_dict[old_key]  # Delete old keys for strict load_state_dict
            self.online_net.load_state_dict(state_dict)
            print("Loading pretrained model: " + args.model)
        else:  # Raise error if incorrect model path provided
            raise FileNotFoundError(args.model)

    self.online_net.train()

    self.target_net = DQN(args, self.action_space).to(device=args.device)
    self.update_target_net()
    self.target_net.train()
    for param in self.target_net.parameters():
        param.requires_grad = False

    self.optimiser = AdaBelief(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps, rectify=True)
    # self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
def select_optimizer(self):
    if self.args.optimizer == 'Adam':
        self.optimizer = optim.Adam(filter(lambda p: p.requires_grad, self.mdl.parameters()),
                                    lr=self.args.learning_rate, weight_decay=self.args.weight_decay)
    elif self.args.optimizer == 'AdaBelief':
        self.optimizer = AdaBelief(self.mdl.parameters(), weight_decay=self.args.weight_decay)
    elif self.args.optimizer == 'RMS':
        self.optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, self.mdl.parameters()),
                                       lr=self.args.learning_rate)
    elif self.args.optimizer == 'SGD':
        self.optimizer = optim.SGD(filter(lambda p: p.requires_grad, self.mdl.parameters()),
                                   lr=self.args.learning_rate, momentum=0.9)
    elif self.args.optimizer == 'Adagrad':
        self.optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, self.mdl.parameters()),
                                       lr=self.args.learning_rate)
    elif self.args.optimizer == 'Adadelta':
        self.optimizer = optim.Adadelta(filter(lambda p: p.requires_grad, self.mdl.parameters()),
                                        lr=self.args.learning_rate)
def reset(self):
    self.model.reset()
    if self.adabelief:
        if self.adabelief_args is not None:
            self.optimizer = AdaBelief(
                self.model.model.latents.parameters(),
                lr=self.adabelief_args.lr,
                betas=(self.adabelief_args.b1, self.adabelief_args.b2),
                eps=self.adabelief_args.eps,
                weight_decay=self.adabelief_args.weight_decay,
                amsgrad=self.adabelief_args.amsgrad,
                weight_decouple=self.adabelief_args.weight_decouple,
                fixed_decay=self.adabelief_args.fixed_decay,
                rectify=self.adabelief_args.rectify
            )
        else:
            self.optimizer = AdaBelief(
                self.model.model.latents.parameters(),
                lr=self.lr,
                betas=(0.5, 0.999),
                eps=1e-12,
                weight_decay=0,
                amsgrad=False,
                weight_decouple=True,
                fixed_decay=False,
                rectify=True
            )
    else:
        self.optimizer = Adam(self.model.model.latents.parameters(), self.lr)

    if self.lr_scheduling:
        # self.lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer, max_lr=self.lr, steps_per_epoch=self.iterations, epochs=self.epochs)
        self.lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.optimizer, gamma=.96)
def get_optimizer(
    name: str,
    model_params: Iterable,
    lr: float = 1e-3,
    wd: float = 0,
    lookahead: bool = False,
):
    if name == "adam":
        base_optimizer = optim.Adam(model_params, lr=lr, weight_decay=wd)
    elif name == "sgd":
        base_optimizer = optim.SGD(model_params, lr=lr, weight_decay=wd, momentum=0.9, nesterov=True)
    elif name == "radam":
        base_optimizer = RAdam(model_params, lr=lr, weight_decay=wd)
    elif name == "ralamb":
        base_optimizer = Ralamb(model_params, lr=lr, weight_decay=wd)
    elif name == "adabelief":
        base_optimizer = AdaBelief(model_params, lr=lr, weight_decay=wd)
    else:
        raise ValueError(f"Unknown optimizer name: {name}")

    # Use lookahead
    if lookahead:
        optimizer = Lookahead(base_optimizer)
    else:
        optimizer = base_optimizer

    return optimizer
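# Illustrative usage sketch for get_optimizer above (not part of the original source).
# Assumes the same imports the function relies on (optim, AdaBelief, RAdam, Ralamb, Lookahead)
# are already available; the toy model and hyperparameters below are placeholder assumptions.
import torch
import torch.nn as nn

toy_model = nn.Linear(10, 2)                      # hypothetical toy model
toy_optimizer = get_optimizer("adabelief", toy_model.parameters(), lr=1e-3, wd=1e-4, lookahead=False)

xb, yb = torch.randn(4, 10), torch.randn(4, 2)    # dummy batch
loss = nn.functional.mse_loss(toy_model(xb), yb)
toy_optimizer.zero_grad()
loss.backward()
toy_optimizer.step()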
def __init__(self, summ_model, ckpt):
    super(ParallelLoss, self).__init__()
    self.loss_mse = torch.nn.MSELoss(reduction='mean')
    self.loss = EucDistanceLoss(1)
    self.summ = summ_model
    self.summ.eval()
    self.pre_att_model = PreAttModel(layers=2, d_model=1024, num_heads=16, dff=4096, rate=0.0)
    lr = 2e-5
    try:
        self.pre_att_model.load_state_dict(torch.load(ckpt))
        print('load {}'.format(ckpt))
    except Exception:
        print('no checkpoints now!')
    self.optimizer = AdaBelief(self.pre_att_model.parameters(), lr=lr, eps=1e-16, betas=(0.9, 0.999),
                               weight_decay=1e-4, weight_decouple=True, rectify=True)
def select_optimizer(self, model):
    if self.args.optimizer == 'Adam':
        optimizer = optim.Adam(
            filter(lambda p: p.requires_grad, model.parameters()),
            lr=self.args.learning_rate,
            betas=(0.5, 0.999),
        )
    elif self.args.optimizer == 'AdaBelief':
        optimizer = AdaBelief(model.parameters(), lr=self.args.learning_rate, betas=(0.5, 0.999))
    elif self.args.optimizer == 'RMS':
        optimizer = optim.RMSprop(filter(lambda p: p.requires_grad, model.parameters()),
                                  lr=self.args.learning_rate)
    elif self.args.optimizer == 'SGD':
        optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                              lr=self.args.learning_rate, momentum=0.9)
    elif self.args.optimizer == 'Adagrad':
        optimizer = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()),
                                  lr=self.args.learning_rate)
    elif self.args.optimizer == 'Adadelta':
        optimizer = optim.Adadelta(filter(lambda p: p.requires_grad, model.parameters()),
                                   lr=self.args.learning_rate)
    else:
        raise ValueError('Unknown optimizer: {}'.format(self.args.optimizer))
    return optimizer
def configure_optimizers(self):
    # for transformer
    return AdaBelief(
        params=self.model.parameters(),
        lr=self.hparams.optimizer.lr,
        weight_decay=1.2e-6
    )
def create_optimizer(args, model_params):
    args.optim = args.optim.lower()
    if args.optim == 'sgd':
        return torch.optim.SGD(model_params, args.lr, momentum=args.momentum,
                               weight_decay=args.weight_decay)
    elif args.optim == 'adam':
        return torch.optim.Adam(model_params, args.lr, betas=(args.beta1, args.beta2),
                                weight_decay=args.weight_decay, eps=args.eps)
    elif args.optim == 'adamw':
        return torch.optim.AdamW(model_params, args.lr, betas=(args.beta1, args.beta2),
                                 weight_decay=args.weight_decay, eps=args.eps)
    elif args.optim == 'adabelief':
        return AdaBelief(model_params, args.lr, betas=(args.beta1, args.beta2),
                         weight_decay=args.weight_decay, eps=args.eps, print_change_log=False)
    else:
        raise ValueError('Optimizer not found: {}'.format(args.optim))
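# Illustrative usage sketch for create_optimizer above (not part of the original source).
# The Namespace fields mirror the attributes the function reads; the values are placeholder assumptions.
from argparse import Namespace

import torch.nn as nn

toy_args = Namespace(optim='adabelief', lr=1e-3, momentum=0.9,
                     beta1=0.9, beta2=0.999, eps=1e-16, weight_decay=1e-4)
toy_model = nn.Linear(8, 1)                       # hypothetical toy model
toy_optimizer = create_optimizer(toy_args, toy_model.parameters())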
def get_optimizer(parameters, hparams):
    name = hparams.optimizer
    if name == 'sgd':
        print('Using SGD optimizer')
        return optim.SGD(
            parameters,
            lr=hparams.lr,
            momentum=hparams.momentum,
            weight_decay=hparams.weight_decay,
            nesterov=hparams.nesterov
        )
    elif name == 'adam':
        print('Using Adam optimizer')
        return optim.Adam(
            parameters,
            lr=hparams.lr,
            betas=(hparams.beta1, hparams.beta2),
            weight_decay=hparams.weight_decay
        )
    elif name == 'ranger':
        print('Using Ranger optimizer')
        return Ranger(
            parameters,
            lr=hparams.lr,
            alpha=hparams.ranger_alpha,
            k=hparams.ranger_k,
            betas=(hparams.beta1, hparams.beta2),
            weight_decay=hparams.weight_decay,
            use_gc=hparams.ranger_gc,
        )
    elif name == 'adabelief':
        print('Using AdaBelief optimizer')
        return AdaBelief(
            parameters,
            lr=hparams.lr,
            weight_decay=hparams.weight_decay,
            eps=hparams.belief_eps,
            betas=(hparams.beta1, hparams.beta2),
            weight_decouple=hparams.belief_weight_decouple,
            rectify=hparams.belief_recitfy,  # note: this hparam name is misspelled ("recitfy") in the original config
            amsgrad=hparams.belief_amsgrad,
            fixed_decay=hparams.belief_fixed_decay
        )
    elif name == 'ranger_adabelief':
        print('Using RangerAdaBelief optimizer')
        return RangerAdaBelief(
            parameters,
            lr=hparams.lr,
            alpha=hparams.ranger_alpha,
            k=hparams.ranger_k,
            betas=(hparams.beta1, hparams.beta2),
            weight_decay=hparams.weight_decay,
            use_gc=hparams.ranger_gc,
            adabelief=True,
        )
    else:
        raise NotImplementedError(f'{name} is not an available optimizer')
def configure_optimizers(self):
    return AdaBelief(
        params=self.model.parameters(),
        lr=self.hparams.optimizer.lr,
        eps=1e-12,
        weight_decay=1.2e-6,
        weight_decouple=False,
        rectify=False,
        fixed_decay=False,
        amsgrad=False
    )
def get_optimizer(optim_params, model):
    # Optimizer
    if optim_params['optimizer'].lower() == 'adam':
        opt = Adam(model.parameters(), lr=optim_params['step_size'],
                   weight_decay=optim_params['weight_decay'],
                   betas=(optim_params['momentum'], 0.999), eps=1e-08)
    elif optim_params['optimizer'].lower() == 'adabelief':
        opt = AdaBelief(model.parameters(), lr=optim_params['step_size'],
                        eps=1e-16, betas=(0.9, 0.999),
                        weight_decouple=True, rectify=False)
    elif optim_params['optimizer'].lower() == 'sgd':
        opt = SGD(model.parameters(), lr=optim_params['step_size'],
                  weight_decay=optim_params['weight_decay'])
    elif optim_params['optimizer'].lower() == 'alig':
        opt = AliG(model.parameters(), max_lr=optim_params['step_size'])
    elif optim_params['optimizer'].lower() == 'aggmo':
        opt = AggMo(model.parameters(), lr=optim_params['step_size'],
                    momentum=optim_params['betas'])
    elif optim_params['optimizer'].lower() == 'adahessian':
        opt = Adahessian(model.parameters(), lr=optim_params['step_size'],
                         weight_decay=optim_params['weight_decay'],
                         betas=(optim_params['momentum'], 0.999), eps=1e-08)

    # Scheduler
    schedule_params = optim_params['lr_scheduler']
    if optim_params['optimizer'].lower() == 'alig':
        return opt, None
    if schedule_params['name'] == 'plateau':
        scheduler = lr_scheduler.ReduceLROnPlateau(
            opt, mode='min', patience=schedule_params['patience'], factor=schedule_params['factor'])
    elif schedule_params['name'] == 'exp':
        scheduler = lr_scheduler.ExponentialLR(opt, schedule_params['lr_decay'], schedule_params['last_epoch'])
    elif schedule_params['name'] == 'step':
        scheduler = lr_scheduler.MultiStepLR(opt, schedule_params['milestones'], schedule_params['lr_decay'])
    return opt, scheduler
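# Illustrative usage sketch for get_optimizer above (not part of the original source).
# The dict keys mirror the ones the function reads; the values are placeholder assumptions.
import torch.nn as nn

toy_optim_params = {
    'optimizer': 'adabelief',
    'step_size': 1e-3,
    'weight_decay': 1e-4,
    'momentum': 0.9,
    'betas': [0.0, 0.9, 0.99],             # only read by the 'aggmo' branch
    'lr_scheduler': {'name': 'plateau', 'patience': 5, 'factor': 0.5},
}
toy_model = nn.Linear(16, 4)                # hypothetical toy model
opt, scheduler = get_optimizer(toy_optim_params, toy_model)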
def configure_optimizers(self):
    # for transformer
    return AdaBelief(
        params=self.model.parameters(),
        lr=self.hparams.optimizer.lr,
        eps=1e-16,
        weight_decay=1e-4,
        weight_decouple=True,
        rectify=True,
        fixed_decay=False,
        amsgrad=False
    )
def configure_optimizers(self):
    opt = AdaBelief(
        self.vae.parameters(),
        lr=self.hparams.learning_rate,
        weight_decay=self.hparams.weight_decay,
        print_change_log=False
    )
    # opt = torch.optim.AdamW(
    #     self.vae.parameters(),
    #     lr=self.hparams.learning_rate,
    #     weight_decay=self.hparams.weight_decay,
    # )
    return opt
def configure_optimizers(self):
    if self.learning_params["optimizer"] == "belief":
        optimizer = AdaBelief(
            self.parameters(),
            lr=self.learning_params["lr"],
            eps=self.learning_params["eplison_belief"],  # note: key is spelled "eplison_belief" in the original config
            weight_decouple=self.learning_params["weight_decouple"],
            weight_decay=self.learning_params["weight_decay"],
            rectify=self.learning_params["rectify"])
    elif self.learning_params["optimizer"] == "ranger_belief":
        optimizer = RangerAdaBelief(
            self.parameters(),
            lr=self.learning_params["lr"],
            eps=self.learning_params["eplison_belief"],
            weight_decouple=self.learning_params["weight_decouple"],
            weight_decay=self.learning_params["weight_decay"],
        )
    elif self.learning_params["optimizer"] == "adam":
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_params["lr"])
    elif self.learning_params["optimizer"] == "adamW":
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_params["lr"])

    if self.learning_params["add_sch"]:
        lr_scheduler = {
            'scheduler': torch.optim.lr_scheduler.OneCycleLR(
                optimizer,
                max_lr=self.learning_params["lr"],
                steps_per_epoch=self.hparams.steps_per_epoch,  # int(len(train_loader))
                epochs=self.learning_params["epochs"],
                anneal_strategy='linear'),
            'name': 'lr_scheduler_lr',
            'interval': 'step',  # or 'epoch'
            'frequency': 1,
        }
        print("sch added")
        return [optimizer], [lr_scheduler]
    return optimizer
def optimizer_cls(cls, params=None, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8,
                  weight_decay=1e-5, amsgrad=False, weight_decouple=False,
                  fixed_decay=False, rectify=False):
    return AdaBelief(params=params, lr=lr, betas=(beta1, beta2), eps=eps,
                     weight_decay=weight_decay, amsgrad=amsgrad,
                     weight_decouple=weight_decouple, fixed_decay=fixed_decay,
                     rectify=rectify)
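# Illustrative usage sketch for optimizer_cls above (not part of the original source).
# The signature suggests a classmethod-style factory; here it is invoked directly with a
# placeholder `cls` argument, and the toy model is an assumption for demonstration.
import torch.nn as nn

toy_model = nn.Linear(32, 8)                # hypothetical toy model
toy_optimizer = optimizer_cls(None, params=toy_model.parameters(), lr=1e-3,
                              weight_decouple=True, rectify=True)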
def build_optimizer(parameters, config):
    if config.train.opt.type == "sgd":
        optimizer = torch.optim.SGD(
            parameters,
            lr=config.train.opt.lr,
            momentum=config.train.opt.sgd.momentum,
            weight_decay=config.train.opt.weight_decay,
            nesterov=True,
        )
    elif config.train.opt.type == "adam":
        optimizer = torch.optim.Adam(
            parameters,
            lr=config.train.opt.lr,
            weight_decay=config.train.opt.weight_decay,
        )
    elif config.train.opt.type == "ada_belief":
        optimizer = AdaBelief(
            parameters,
            lr=config.train.opt.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=config.train.opt.weight_decay,
            weight_decouple=config.train.opt.ada_belief.weight_decouple,
            rectify=False,
            fixed_decay=False,
            amsgrad=False,
        )
    else:
        raise AssertionError("invalid optimizer {}".format(config.train.opt.type))

    if config.train.opt.look_ahead is not None:
        optimizer = LookAhead(
            optimizer,
            lr=config.train.opt.look_ahead.lr,
            num_steps=config.train.opt.look_ahead.num_steps,
        )

    return optimizer
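# Illustrative usage sketch for build_optimizer above (not part of the original source).
# A nested SimpleNamespace stands in for whatever config object the project actually uses;
# all field values below are placeholder assumptions.
from types import SimpleNamespace

import torch.nn as nn

toy_config = SimpleNamespace(train=SimpleNamespace(opt=SimpleNamespace(
    type="ada_belief",
    lr=1e-3,
    weight_decay=1e-4,
    ada_belief=SimpleNamespace(weight_decouple=True),
    look_ahead=None,                        # set to a namespace with lr/num_steps to wrap with LookAhead
)))
toy_model = nn.Linear(64, 10)               # hypothetical toy model
toy_optimizer = build_optimizer(toy_model.parameters(), toy_config)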
def configure_optimizers(self):
    # optimizer = optim.SGD(self.model.parameters(), lr=self.hparams.optimizer.lr, momentum=0.9, nesterov=True)
    # scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 10, 2, eta_min=1e-6)

    # optimizer = AdaBelief(
    #     params=self.model.parameters(),
    #     lr=self.hparams.optimizer.lr,
    #     betas=(0.9, 0.999),
    #     eps=1e-16,
    #     weight_decouple=True,
    #     rectify=True,
    #     fixed_decay=False,
    #     amsgrad=False
    # )
    # scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 10, 2, eta_min=1e-6)
    # return [optimizer], [scheduler]

    return AdaBelief(
        params=self.model.parameters(),
        lr=self.hparams.optimizer.lr,
        betas=(0.9, 0.999),
        eps=1e-16,
        weight_decouple=True,
        rectify=True,
        fixed_decay=False,
        amsgrad=False
    )
def train(hyp, opt, device, tb_writer=None, wandb=None): logger.info(f'Hyperparameters {hyp}') log_dir = Path(tb_writer.log_dir) if tb_writer else Path( opt.logdir) / 'evolve' # logging directory wdir = log_dir / 'weights' # weights directory os.makedirs(wdir, exist_ok=True) last = wdir / 'last.pt' best = wdir / 'best.pt' results_file = str(log_dir / 'results.txt') epochs, batch_size, total_batch_size, weights, rank = \ opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank # Save run settings with open(log_dir / 'hyp.yaml', 'w') as f: yaml.dump(hyp, f, sort_keys=False) with open(log_dir / 'opt.yaml', 'w') as f: yaml.dump(vars(opt), f, sort_keys=False) # Configure cuda = device.type != 'cpu' init_seeds(2 + rank) with open(opt.data) as f: data_dict = yaml.load(f, Loader=yaml.FullLoader) # data dict with torch_distributed_zero_first(rank): check_dataset(data_dict) # check train_path = data_dict['train'] test_path = data_dict['val'] nc, names = (1, ['item']) if opt.single_cls else (int( data_dict['nc']), data_dict['names']) # number classes, names assert len(names) == nc, '%g names found for nc=%g dataset in %s' % ( len(names), nc, opt.data) # check # Model pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(rank): attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint if hyp.get('anchors'): ckpt['model'].yaml['anchors'] = round( hyp['anchors']) # force autoanchor model = Model(opt.cfg or ckpt['model'].yaml, ch=3, nc=nc).to(device) # create exclude = ['anchor'] if opt.cfg or hyp.get('anchors') else [ ] # exclude keys state_dict = ckpt['model'].float().state_dict() # to FP32 state_dict = intersect_dicts(state_dict, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(state_dict, strict=False) # load logger.info( 'Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights)) # report else: model = Model(opt.cfg, ch=3, nc=nc).to(device) # create # Freeze freeze = [] # parameter names to freeze (full or partial) for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): print('freezing %s' % k) v.requires_grad = False # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / total_batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= total_batch_size * accumulate / nbs # scale weight_decay pg0, pg1, pg2 = [], [], [] # optimizer parameter groups for k, v in model.named_modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): pg2.append(v.bias) # biases if isinstance(v, nn.BatchNorm2d): pg0.append(v.weight) # no decay elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): pg1.append(v.weight) # apply decay if opt.adam: optimizer = AdaBelief(model.parameters(), lr=1e-4, eps=1e-16, betas=(0.9, 0.999), weight_decouple=True, rectify=True) else: optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) # optimizer.add_param_group({'params': pg1, 'weight_decay': hyp['weight_decay']}) # add pg1 with weight_decay # optimizer.add_param_group({'params': pg2}) # add pg2 (biases) logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0))) del pg0, pg1, pg2 # Scheduler https://arxiv.org/pdf/1812.01187.pdf # https://pytorch.org/docs/stable/_modules/torch/optim/lr_scheduler.html#OneCycleLR lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[ 'lrf']) + 
hyp['lrf'] # cosine scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # Logging if wandb and wandb.run is None: id = ckpt.get('wandb_id') if 'ckpt' in locals() else None wandb_run = wandb.init(config=opt, resume="allow", project="YOLOv5", name=os.path.basename(log_dir), id=id) # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # Results if ckpt.get('training_results') is not None: with open(results_file, 'w') as file: file.write(ckpt['training_results']) # write results.txt # Epochs start_epoch = ckpt['epoch'] + 1 if opt.resume: assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % ( weights, epochs) shutil.copytree(wdir, wdir.parent / f'weights_backup_epoch{start_epoch - 1}' ) # save previous weights if epochs < start_epoch: logger.info( '%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' % (weights, ckpt['epoch'], epochs)) epochs += ckpt['epoch'] # finetune additional epochs del ckpt, state_dict # Image sizes gs = int(max(model.stride)) # grid size (max stride) imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size ] # verify imgsz are gs-multiples # DP mode if cuda and rank == -1 and torch.cuda.device_count() > 1: model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and rank != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) logger.info('Using SyncBatchNorm()') # Exponential moving average ema = ModelEMA(model) if rank in [-1, 0] else None # DDP mode if cuda and rank != -1: model = DDP(model, device_ids=[opt.local_rank], output_device=opt.local_rank) # Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size, gs, opt, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=rank, world_size=opt.world_size, workers=opt.workers) mlc = np.concatenate(dataset.labels, 0)[:, 0].max() # max label class nb = len(dataloader) # number of batches assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % ( mlc, nc, opt.data, nc - 1) # Process 0 if rank in [-1, 0]: ema.updates = start_epoch * nb // accumulate # set EMA updates testloader = create_dataloader(test_path, imgsz_test, total_batch_size, gs, opt, hyp=hyp, augment=False, cache=opt.cache_images and not opt.notest, rect=True, rank=-1, world_size=opt.world_size, workers=opt.workers)[0] # testloader if not opt.resume: labels = np.concatenate(dataset.labels, 0) c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) plot_labels(labels, save_dir=log_dir) if tb_writer: # tb_writer.add_hparams(hyp, {}) # causes duplicate https://github.com/ultralytics/yolov5/pull/384 tb_writer.add_histogram('classes', c, 0) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) # Model parameters hyp['cls'] *= nc / 80. 
# scale coco-tuned hyp['cls'] to current dataset model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.gr = 1.0 # iou loss ratio (obj_loss = 1.0 or iou) model.class_weights = labels_to_class_weights(dataset.labels, nc).to( device) # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1e3) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0 ) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) logger.info('Image sizes %g train, %g test\n' 'Using %g dataloader workers\nLogging results to %s\n' 'Starting training for %g epochs...' % (imgsz, imgsz_test, dataloader.num_workers, log_dir, epochs)) for epoch in range( start_epoch, epochs ): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional) if opt.image_weights: # Generate indices if rank in [-1, 0]: cw = model.class_weights.cpu().numpy() * ( 1 - maps)**2 # class weights iw = labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices( range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Broadcast if DDP if rank != -1: indices = (torch.tensor(dataset.indices) if rank == 0 else torch.zeros(dataset.n)).int() dist.broadcast(indices, 0) if rank != 0: dataset.indices = indices.cpu().numpy() # Update mosaic border # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses if rank != -1: dataloader.sampler.set_epoch(epoch) pbar = enumerate(dataloader) logger.info( ('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'total', 'targets', 'img_size')) if rank in [-1, 0]: pbar = tqdm(pbar, total=nb) # progress bar optimizer.zero_grad() for i, ( imgs, targets, paths, _ ) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float( ) / 255.0 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # model.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max( 1, np.interp(ni, xi, [1, nbs / total_batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [ hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch) ]) if 'momentum' in x: x['momentum'] = np.interp( ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:] ] # new shape (stretched to gs-multiple) imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss( pred, targets.to(device), model) # loss scaled by batch_size if rank != -1: loss *= opt.world_size # gradient averaged between devices in DDP mode # Backward 
scaler.scale(loss).backward() # Optimize if ni % accumulate == 0: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) # Print if rank in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1 ) # update mean losses mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0) # (GB) s = ('%10s' * 2 + '%10.4g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, targets.shape[0], imgs.shape[-1]) pbar.set_description(s) # Plot if ni < 3: f = str(log_dir / f'train_batch{ni}.jpg') # filename result = plot_images(images=imgs, targets=targets, paths=paths, fname=f) # if tb_writer and result is not None: # tb_writer.add_image(f, result, dataformats='HWC', global_step=epoch) # tb_writer.add_graph(model, imgs) # add model to tensorboard # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for tensorboard scheduler.step() # DDP process 0 or single-GPU if rank in [-1, 0]: # mAP if ema: ema.update_attr( model, include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride']) final_epoch = epoch + 1 == epochs if not opt.notest or final_epoch: # Calculate mAP results, maps, times = test.test( opt.data, batch_size=total_batch_size, imgsz=imgsz_test, model=ema.ema, single_cls=opt.single_cls, dataloader=testloader, save_dir=log_dir, plots=epoch == 0 or final_epoch, # plot first and last log_imgs=opt.log_imgs) # Write with open(results_file, 'a') as f: f.write( s + '%10.4g' * 7 % results + '\n') # P, R, [email protected], [email protected], val_loss(box, obj, cls) if len(opt.name) and opt.bucket: os.system('gsutil cp %s gs://%s/results/results%s.txt' % (results_file, opt.bucket, opt.name)) # Log tags = [ 'train/giou_loss', 'train/obj_loss', 'train/cls_loss', # train loss 'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5', 'metrics/mAP_0.5:0.95', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss', # val loss 'x/lr0', 'x/lr1', 'x/lr2' ] # params for x, tag in zip(list(mloss[:-1]) + list(results) + lr, tags): if tb_writer: tb_writer.add_scalar(tag, x, epoch) # tensorboard if wandb: wandb.log({tag: x}) # W&B # Update best mAP fi = fitness(np.array(results).reshape( 1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi # Save model save = (not opt.nosave) or (final_epoch and not opt.evolve) if save: with open(results_file, 'r') as f: # create checkpoint ckpt = { 'epoch': epoch, 'best_fitness': best_fitness, 'training_results': f.read(), 'model': ema.ema, 'optimizer': None if final_epoch else optimizer.state_dict(), 'wandb_id': wandb_run.id if wandb else None } # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) del ckpt # end epoch ---------------------------------------------------------------------------------------------------- # end training if rank in [-1, 0]: # Strip optimizers n = opt.name if opt.name.isnumeric() else '' fresults, flast, fbest = log_dir / f'results{n}.txt', wdir / f'last{n}.pt', wdir / f'best{n}.pt' for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', results_file], [flast, fbest, fresults]): if os.path.exists(f1): os.rename(f1, f2) # rename if str(f2).endswith('.pt'): # is *.pt strip_optimizer(f2) # strip optimizer os.system( 'gsutil cp %s gs://%s/weights' % (f2, opt.bucket)) if opt.bucket else None # upload # Finish if not opt.evolve: plot_results(save_dir=log_dir) # save as results.png 
logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600)) dist.destroy_process_group() if rank not in [-1, 0] else None torch.cuda.empty_cache() return results
def memo_valor(env_fn, model=MEMO, memo_kwargs=dict(), annealing_kwargs=dict(), seed=0, episodes_per_expert=40, epochs=50, # warmup=10, train_iters=5, step_size=5, memo_lr=1e-3, train_batch_size=50, eval_batch_size=200, max_ep_len=1000, logger_kwargs=dict(), config_name='standard', save_freq=10, # replay_buffers=[], memories=[]): # W&B Logging wandb.login() composite_name = 'E ' + str(epochs) + ' B ' + str(train_batch_size) + ' ENC ' + \ str(memo_kwargs['encoder_hidden']) + 'DEC ' + str(memo_kwargs['decoder_hidden']) wandb.init(project="MEMO", group='Epochs: ' + str(epochs), name=composite_name, config=locals()) assert memories != [], "No examples found! Replay/memory buffers must be set to proceed." # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Model # Create discriminator and monitor it con_dim = len(memories) memo = model(obs_dim=obs_dim[0], out_dim=act_dim[0], **memo_kwargs) # Set up model saving logger.setup_pytorch_saver([memo]) # Sync params across processes sync_params(memo) N_expert = episodes_per_expert*max_ep_len print("N Expert: ", N_expert) # Buffer # local_episodes_per_epoch = int(episodes_per_epoch / num_procs()) local_iter_per_epoch = int(train_iters / num_procs()) # Count variables var_counts = tuple(count_vars(module) for module in [memo]) logger.log('\nNumber of parameters: \t d: %d\n' % var_counts) # Optimizers # memo_optimizer = AdaBelief(memo.parameters(), lr=memo_lr, eps=1e-20, rectify=True) memo_optimizer = AdaBelief(memo.parameters(), lr=memo_lr, eps=1e-16, rectify=True) # memo_optimizer = Adam(memo.parameters(), lr=memo_lr, betas=(0.9, 0.98), eps=1e-9) start_time = time.time() # Prepare data mem = MemoryBatch(memories, step=step_size) # transition_states, pure_states, transition_actions, expert_ids = mem.collate() transition_states, pure_states, transition_actions, expert_ids = mem.collate() total_l_old, recon_l_old, context_l_old = 0, 0, 0 # Main Loop kl_beta_schedule = frange_cycle_sigmoid(epochs, **annealing_kwargs) for epoch in range(epochs): memo.train() # Select state transitions and actions at random indexes batch_indexes = torch.randint(len(transition_states), (train_batch_size,)) raw_states_batch, delta_states_batch, actions_batch, sampled_experts = \ pure_states[batch_indexes], transition_states[batch_indexes], transition_actions[batch_indexes], expert_ids[batch_indexes] for i in range(local_iter_per_epoch): # kl_beta = kl_beta_schedule[epoch] kl_beta = 1 # only take context labeling into account for first label loss, recon_loss, X, latent_labels, vq_loss = memo(raw_states_batch, delta_states_batch, actions_batch, kl_beta) memo_optimizer.zero_grad() loss.mean().backward() mpi_avg_grads(memo) memo_optimizer.step() # scheduler.step(loss.mean().data.item()) total_l_new, recon_l_new, vq_l_new = loss.mean().data.item(), recon_loss.mean().data.item(), vq_loss.mean().data.item() memo_metrics = {'MEMO Loss': total_l_new, 'Recon Loss': recon_l_new, "VQ Labeling Loss": vq_l_new, "KL Beta": kl_beta_schedule[epoch]} wandb.log(memo_metrics) logger.store(TotalLoss=total_l_new, PolicyLoss=recon_l_new, # ContextLoss=context_l_new, DeltaTotalLoss=total_l_new-total_l_old, DeltaPolicyLoss=recon_l_new-recon_l_old, ) total_l_old, 
recon_l_old = total_l_new, recon_l_new # , context_l_new if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, [memo], None) # Log logger.log_tabular('Epoch', epoch) logger.log_tabular('EpochBatchSize', train_batch_size) logger.log_tabular('TotalLoss', average_only=True) logger.log_tabular('PolicyLoss', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() print("Finished training, and detected %d contexts!" % len(memo.found_contexts)) # wandb.finish() print('memo type', memo) return memo, mem
def load_frameowrk( seed, disable_debugging_API, num_workers, config_path, checkpoint_folder, reduce_train_dataset, standing_statistics, standing_step, freeze_layers, load_current, eval_type, dataset_name, num_classes, img_size, data_path, architecture, conditional_strategy, hypersphere_dim, nonlinear_embed, normalize_embed, g_spectral_norm, d_spectral_norm, activation_fn, attention, attention_after_nth_gen_block, attention_after_nth_dis_block, z_dim, shared_dim, g_conv_dim, d_conv_dim, G_depth, D_depth, optimizer, batch_size, d_lr, g_lr, momentum, nesterov, alpha, beta1, beta2, total_step, adv_loss, cr, g_init, d_init, random_flip_preprocessing, prior, truncated_factor, ema, ema_decay, ema_start, synchronized_bn, mixed_precision, hdf5_path_train, train_config, model_config, **_): if seed == 0: cudnn.benchmark = True cudnn.deterministic = False else: fix_all_seed(seed) cudnn.benchmark = False cudnn.deterministic = True if disable_debugging_API: torch.autograd.set_detect_anomaly(False) n_gpus = torch.cuda.device_count() default_device = torch.cuda.current_device() check_flag_0(batch_size, n_gpus, standing_statistics, ema, freeze_layers, checkpoint_folder) assert batch_size % n_gpus == 0, "batch_size should be divided by the number of gpus " if n_gpus == 1: warnings.warn('You have chosen a specific GPU. This will completely ' 'disable data parallelism.') prev_ada_p, step, best_step, best_fid, best_fid_checkpoint_path = None, 0, 0, None, None standing_step = standing_step if standing_statistics is True else batch_size run_name = make_run_name(RUN_NAME_FORMAT, framework=config_path.split('/')[-1][:-5], phase='train') logger = make_logger(run_name, None) writer = SummaryWriter(log_dir=join('./logs', run_name)) logger.info('Run name : {run_name}'.format(run_name=run_name)) logger.info(train_config) logger.info(model_config) logger.info('Loading train datasets...') train_dataset = LoadDataset(dataset_name, data_path, train=True, download=True, resize_size=img_size, hdf5_path=hdf5_path_train, random_flip=random_flip_preprocessing) if reduce_train_dataset < 1.0: num_train = int(reduce_train_dataset * len(train_dataset)) train_dataset, _ = torch.utils.data.random_split( train_dataset, [num_train, len(train_dataset) - num_train]) logger.info('Train dataset size : {dataset_size}'.format( dataset_size=len(train_dataset))) logger.info('Loading {mode} datasets...'.format(mode=eval_type)) eval_mode = True if eval_type == 'train' else False eval_dataset = LoadDataset(dataset_name, data_path, train=eval_mode, download=True, resize_size=img_size, hdf5_path=None, random_flip=False) logger.info('Eval dataset size : {dataset_size}'.format( dataset_size=len(eval_dataset))) logger.info('Building model...') if architecture == "dcgan": assert img_size == 32, "Sry, StudioGAN does not support dcgan models for generation of images larger than 32 resolution." 
module = __import__( 'models.{architecture}'.format(architecture=architecture), fromlist=['something']) logger.info('Modules are located on models.{architecture}'.format( architecture=architecture)) Gen = module.Generator(z_dim, shared_dim, img_size, g_conv_dim, g_spectral_norm, attention, attention_after_nth_gen_block, activation_fn, conditional_strategy, num_classes, g_init, G_depth, mixed_precision).to(default_device) Dis = module.Discriminator(img_size, d_conv_dim, d_spectral_norm, attention, attention_after_nth_dis_block, activation_fn, conditional_strategy, hypersphere_dim, num_classes, nonlinear_embed, normalize_embed, d_init, D_depth, mixed_precision).to(default_device) if ema: print('Preparing EMA for G with decay of {}'.format(ema_decay)) Gen_copy = module.Generator( z_dim, shared_dim, img_size, g_conv_dim, g_spectral_norm, attention, attention_after_nth_gen_block, activation_fn, conditional_strategy, num_classes, initialize=False, G_depth=G_depth, mixed_precision=mixed_precision).to(default_device) Gen_ema = ema_(Gen, Gen_copy, ema_decay, ema_start) else: Gen_copy, Gen_ema = None, None logger.info(count_parameters(Gen)) logger.info(Gen) logger.info(count_parameters(Dis)) logger.info(Dis) train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=num_workers, drop_last=True) eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=num_workers, drop_last=False) G_loss = { 'vanilla': loss_dcgan_gen, 'least_square': loss_lsgan_gen, 'hinge': loss_hinge_gen, 'wasserstein': loss_wgan_gen } D_loss = { 'vanilla': loss_dcgan_dis, 'least_square': loss_lsgan_dis, 'hinge': loss_hinge_dis, 'wasserstein': loss_wgan_dis } if optimizer == "SGD": G_optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, Gen.parameters()), g_lr, momentum=momentum, nesterov=nesterov) D_optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, Dis.parameters()), d_lr, momentum=momentum, nesterov=nesterov) elif optimizer == "RMSprop": G_optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad, Gen.parameters()), g_lr, momentum=momentum, alpha=alpha) D_optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad, Dis.parameters()), d_lr, momentum=momentum, alpha=alpha) elif optimizer == "Adam": G_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, Gen.parameters()), g_lr, [beta1, beta2], eps=1e-6) D_optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, Dis.parameters()), d_lr, [beta1, beta2], eps=1e-6) elif optimizer == "AdaBelief": G_optimizer = AdaBelief(filter(lambda p: p.requires_grad, Gen.parameters()), g_lr, [beta1, beta2], eps=1e-12, rectify=False) D_optimizer = AdaBelief(filter(lambda p: p.requires_grad, Dis.parameters()), d_lr, [beta1, beta2], eps=1e-12, rectify=False) else: raise NotImplementedError if checkpoint_folder is not None: when = "current" if load_current is True else "best" if not exists(abspath(checkpoint_folder)): raise NotADirectoryError checkpoint_dir = make_checkpoint_dir(checkpoint_folder, run_name) g_checkpoint_dir = glob.glob( join(checkpoint_dir, "model=G-{when}-weights-step*.pth".format(when=when)))[0] d_checkpoint_dir = glob.glob( join(checkpoint_dir, "model=D-{when}-weights-step*.pth".format(when=when)))[0] Gen, G_optimizer, trained_seed, run_name, step, prev_ada_p = load_checkpoint( Gen, G_optimizer, g_checkpoint_dir) Dis, D_optimizer, trained_seed, run_name, step, prev_ada_p, best_step, best_fid, best_fid_checkpoint_path =\ 
load_checkpoint(Dis, D_optimizer, d_checkpoint_dir, metric=True) logger = make_logger(run_name, None) if ema: g_ema_checkpoint_dir = glob.glob( join(checkpoint_dir, "model=G_ema-{when}-weights-step*.pth".format( when=when)))[0] Gen_copy = load_checkpoint(Gen_copy, None, g_ema_checkpoint_dir, ema=True) Gen_ema.source, Gen_ema.target = Gen, Gen_copy writer = SummaryWriter(log_dir=join('./logs', run_name)) if train_config['train']: assert seed == trained_seed, "seed for sampling random numbers should be same!" logger.info('Generator checkpoint is {}'.format(g_checkpoint_dir)) logger.info('Discriminator checkpoint is {}'.format(d_checkpoint_dir)) if freeze_layers > -1: prev_ada_p, step, best_step, best_fid, best_fid_checkpoint_path = None, 0, 0, None, None else: checkpoint_dir = make_checkpoint_dir(checkpoint_folder, run_name) if n_gpus > 1: Gen = DataParallel(Gen, output_device=default_device) Dis = DataParallel(Dis, output_device=default_device) if ema: Gen_copy = DataParallel(Gen_copy, output_device=default_device) if synchronized_bn: Gen = convert_model(Gen).to(default_device) Dis = convert_model(Dis).to(default_device) if ema: Gen_copy = convert_model(Gen_copy).to(default_device) if train_config['eval']: inception_model = InceptionV3().to(default_device) if n_gpus > 1: inception_model = DataParallel(inception_model, output_device=default_device) mu, sigma = prepare_inception_moments(dataloader=eval_dataloader, generator=Gen, eval_mode=eval_type, inception_model=inception_model, splits=1, run_name=run_name, logger=logger, device=default_device) else: mu, sigma, inception_model = None, None, None train_eval = Train_Eval( run_name=run_name, best_step=best_step, dataset_name=dataset_name, eval_type=eval_type, logger=logger, writer=writer, n_gpus=n_gpus, gen_model=Gen, dis_model=Dis, inception_model=inception_model, Gen_copy=Gen_copy, Gen_ema=Gen_ema, train_dataset=train_dataset, eval_dataset=eval_dataset, train_dataloader=train_dataloader, eval_dataloader=eval_dataloader, freeze_layers=freeze_layers, conditional_strategy=conditional_strategy, pos_collected_numerator=model_config['model'] ['pos_collected_numerator'], z_dim=z_dim, num_classes=num_classes, hypersphere_dim=hypersphere_dim, d_spectral_norm=d_spectral_norm, g_spectral_norm=g_spectral_norm, G_optimizer=G_optimizer, D_optimizer=D_optimizer, batch_size=batch_size, g_steps_per_iter=model_config['optimization']['g_steps_per_iter'], d_steps_per_iter=model_config['optimization']['d_steps_per_iter'], accumulation_steps=model_config['optimization']['accumulation_steps'], total_step=total_step, G_loss=G_loss[adv_loss], D_loss=D_loss[adv_loss], contrastive_lambda=model_config['loss_function']['contrastive_lambda'], margin=model_config['loss_function']['margin'], tempering_type=model_config['loss_function']['tempering_type'], tempering_step=model_config['loss_function']['tempering_step'], start_temperature=model_config['loss_function']['start_temperature'], end_temperature=model_config['loss_function']['end_temperature'], weight_clipping_for_dis=model_config['loss_function'] ['weight_clipping_for_dis'], weight_clipping_bound=model_config['loss_function'] ['weight_clipping_bound'], gradient_penalty_for_dis=model_config['loss_function'] ['gradient_penalty_for_dis'], gradient_penalty_lambda=model_config['loss_function'] ['gradient_penalty_lambda'], deep_regret_analysis_for_dis=model_config['loss_function'] ['deep_regret_analysis_for_dis'], regret_penalty_lambda=model_config['loss_function'] ['regret_penalty_lambda'], cr=cr, 
cr_lambda=model_config['loss_function']['cr_lambda'], bcr=model_config['loss_function']['bcr'], real_lambda=model_config['loss_function']['real_lambda'], fake_lambda=model_config['loss_function']['fake_lambda'], zcr=model_config['loss_function']['zcr'], gen_lambda=model_config['loss_function']['gen_lambda'], dis_lambda=model_config['loss_function']['dis_lambda'], sigma_noise=model_config['loss_function']['sigma_noise'], diff_aug=model_config['training_and_sampling_setting']['diff_aug'], ada=model_config['training_and_sampling_setting']['ada'], prev_ada_p=prev_ada_p, ada_target=model_config['training_and_sampling_setting']['ada_target'], ada_length=model_config['training_and_sampling_setting']['ada_length'], prior=prior, truncated_factor=truncated_factor, ema=ema, latent_op=model_config['training_and_sampling_setting']['latent_op'], latent_op_rate=model_config['training_and_sampling_setting'] ['latent_op_rate'], latent_op_step=model_config['training_and_sampling_setting'] ['latent_op_step'], latent_op_step4eval=model_config['training_and_sampling_setting'] ['latent_op_step4eval'], latent_op_alpha=model_config['training_and_sampling_setting'] ['latent_op_alpha'], latent_op_beta=model_config['training_and_sampling_setting'] ['latent_op_beta'], latent_norm_reg_weight=model_config['training_and_sampling_setting'] ['latent_norm_reg_weight'], default_device=default_device, print_every=train_config['print_every'], save_every=train_config['save_every'], checkpoint_dir=checkpoint_dir, evaluate=train_config['eval'], mu=mu, sigma=sigma, best_fid=best_fid, best_fid_checkpoint_path=best_fid_checkpoint_path, mixed_precision=mixed_precision, train_config=train_config, model_config=model_config, ) if train_config['train']: step = train_eval.train(current_step=step, total_step=total_step) if train_config['eval']: is_save = train_eval.evaluation( step=step, standing_statistics=standing_statistics, standing_step=standing_step) if train_config['save_images']: train_eval.save_images(is_generate=True, png=True, npz=True, standing_statistics=standing_statistics, standing_step=standing_step) if train_config['image_visualization']: train_eval.run_image_visualization( nrow=train_config['nrow'], ncol=train_config['ncol'], standing_statistics=standing_statistics, standing_step=standing_step) if train_config['k_nearest_neighbor']: train_eval.run_nearest_neighbor( nrow=train_config['nrow'], ncol=train_config['ncol'], standing_statistics=standing_statistics, standing_step=standing_step) if train_config['interpolation']: assert architecture in [ "big_resnet", "biggan_deep" ], "Not supported except for biggan and biggan_deep." train_eval.run_linear_interpolation( nrow=train_config['nrow'], ncol=train_config['ncol'], fix_z=True, fix_y=False, standing_statistics=standing_statistics, standing_step=standing_step) train_eval.run_linear_interpolation( nrow=train_config['nrow'], ncol=train_config['ncol'], fix_z=False, fix_y=True, standing_statistics=standing_statistics, standing_step=standing_step) if train_config['frequency_analysis']: train_eval.run_frequency_analysis( num_images=len(train_dataset) // num_classes, standing_statistics=standing_statistics, standing_step=standing_step)
#
# Create capsule network.
#
network = HDGCN(nnodes=max_seq_len, nfeat=bert_dim, nhid=nhid, nclass=nclass,
                max_seq_len=max_seq_len, device=device, batch_size=batch_size,
                vocab=vocab).to(device)

optimizer = AdaBelief(network.parameters(), lr=learning_rate, eps=1e-16, betas=(0.9, 0.999),
                      weight_decouple=True, rectify=False)


# Converts batches of class indices to classes of one-hot vectors.
def to_one_hot(x, length):
    batch_size = x.size(0)
    x_one_hot = torch.zeros(batch_size, length)
    for i in range(batch_size):
        x_one_hot[i, x[i]] = 1.0
    return x_one_hot


def test():
    network.eval()
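# Illustrative usage sketch for to_one_hot above (not part of the original source).
toy_labels = torch.tensor([0, 2, 1])            # hypothetical class indices
toy_one_hot = to_one_hot(toy_labels, length=3)  # shape (3, 3), a single 1.0 per row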
                          lr=args.lr, weight_decay=args.wdecay)
if args.optimizer == 'adam':
    optimizer = torch.optim.Adam(params, lr=args.lr, weight_decay=args.wdecay)
if args.optimizer == 'fromage':
    optimizer = Fromage(params, lr=args.lr)
if args.optimizer == 'adamw':
    optimizer = AdamW(params, lr=args.lr, weight_decay=args.wdecay)
if args.optimizer == 'radam':
    optimizer = RAdam(params, lr=args.lr, weight_decay=args.wdecay)
if args.optimizer.lower() == 'adabelief':
    optimizer = AdaBelief(params, lr=args.lr, weight_decay=args.wdecay, eps=args.eps,
                          betas=(args.beta1, args.beta2))
if args.optimizer == 'adabound':
    optimizer = AdaBound(params, lr=args.lr, weight_decay=args.wdecay, final_lr=30, gamma=1e-3)
if args.optimizer == 'amsbound':
    optimizer = AdaBound(params, lr=args.lr, weight_decay=args.wdecay, final_lr=30, gamma=1e-3, amsbound=True)
def __init__(self,
             policy,
             value_fun,
             cost_fun,
             simulator,
             target_kl=1e-2,
             vf_lr=1e-2,
             cf_lr=1e-2,
             cost_lim=0.1,
             train_v_iters=5,
             train_c_iters=5,
             val_l2_reg=1e-3,
             cost_l2_reg=1e-3,
             gamma=0.995,
             cost_gamma=0.995,
             cg_damping=1e-3,
             cg_max_iters=10,
             line_search_coef=0.9,
             line_search_max_iter=10,
             line_search_accept_ratio=0.1,
             optim_mode="adam",
             optim_max_iter=25,
             model_name=None,
             continue_from_file=False,
             save_every=10,
             save_dir='trained-models-dir',
             print_updates=True):

    # Special function to avoid certain slowdowns from PyTorch + MPI combo.
    setup_pytorch_for_mpi()

    self.save_dir = save_dir
    self.mse_loss = MSELoss(reduction='mean')

    # Set policy and functions if starting from scratch
    # if continue_from_file == False:

    # Different Optimizer Modes (Think LBFGS, Adam and AdaBelief)
    if optim_mode == "adam":
        self.value_fun_optimizer = Adam(self.value_fun.parameters(), lr=vf_lr)
        self.cost_fun_optimizer = Adam(self.cost_fun.parameters(), lr=vf_lr)
    elif optim_mode == "adabelief":
        self.value_fun_optimizer = AdaBelief(self.value_fun.parameters(), betas=(0.9, 0.999), eps=1e-8)
        self.cost_fun_optimizer = AdaBelief(self.cost_fun.parameters(), betas=(0.9, 0.999), eps=1e-8)
    else:
        self.value_fun_optimizer = LBFGS(self.value_fun.parameters(), lr=vf_lr, max_iter=optim_max_iter)
        self.cost_fun_optimizer = LBFGS(self.cost_fun.parameters(), lr=cf_lr, max_iter=optim_max_iter)

    self.epoch_num = 0
    self.elapsed_time = timedelta(0)
    self.device = get_device()
    self.mean_rewards = []
    self.mean_costs = []
    self.session_cum_avg_rewards = 0
    self.session_cum_avg_costs = 0

    if not model_name and continue_from_file:
        raise Exception('Argument continue_from_file to __init__ method of '
                        'CPO case was set to True but model_name was not '
                        'specified.')

    if not model_name and save_every:
        raise Exception('Argument save_every to __init__ method of CPO '
                        'was set to a value greater than 0 but model_name '
                        'was not specified.')

    if continue_from_file:
        print("about to continue")
        self.load_session()
def vail(env_fn, actor_critic=MLPActorCritic, discrim = Discriminator, agent=PPOAgent(), ac_kwargs=dict(), seed=0, # Experience Collection steps_per_epoch=4000, epochs=50, max_ep_len=1000, # Discount factors: gamma=0.99, lam=0.97, cost_gamma=0.99, cost_lam=0.97, # Policy Learning: ent_reg=0., # Cost constraints / penalties: cost_lim=25, penalty_init=1., penalty_lr=5e-3, # KL divergence: target_kl=0.01, # Value learning: vf_lr=1e-3, train_v_iters=100, # Policy Learning: pi_lr=3e-4, train_pi_iters=100, # Discriminator Learning: discrim_lr= 1e-3, train_discrim_iters=100, # Clipping clip_ratio=0.2, logger_kwargs=dict(), # Experimenting config_name = 'standard', save_every=10): """ ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_every (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Print some params print("here are some params") print("penalty lr: ", penalty_lr) print("cost limit: ", cost_lim) print("gamma: ", gamma) print("cost gamma", cost_gamma) print("seed: ", seed) # W&B Logging wandb.login() # config_name = 'marigold-gail' config_name = 'marigold' # train_discriminator = True composite_name = 'ppo_penalized_' + config_name + '_' + str(int(steps_per_epoch/1000)) + \ 'Ks_' + str(epochs) + 'e_' + str(ac_kwargs['hidden_sizes'][0]) + 'x' + \ str(len(ac_kwargs['hidden_sizes'])) # 4 million env interactions wandb.init(project="vail-experts-1000epochs", group="full_runs", name='vail_'+composite_name) # Special function to avoid certain slowdowns from PyTorch + MPI combo. 
setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() # Paths _project_dir = '/home/tyna/Documents/openai/research-project/' _root_data_path = _project_dir + 'data/' _expert_path = _project_dir + 'expert_data/' _clone_path = _project_dir + 'clone_data/' _demo_dir = os.path.join(_expert_path, config_name + '_episodes/') # load demonstrations # expert_demo, _ = pickle.load(open('./expert_demo/expert_demo.p', "rb")) # demonstrations = np.array(expert_demo) # print("demonstrations.shape", demonstrations.shape) f = open(_demo_dir + 'sim_data_' + str(1000) + '_buffer.pkl', "rb") buffer_file = pickle.load(f) f.close() expert_demonstrations = samples_from_cpprb(npsamples=buffer_file) # Reconstruct the data, then pass it to replay buffer np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(expert_demonstrations) print("constraints in the environment") print("constrain hazards: ", env.constrain_hazards) print("hazards cost: ", env.hazards_cost) obs_dim = env.observation_space.shape act_dim = env.action_space.shape running_state = ZFilter((obs_dim[0],), clip=1) # Create actor-critic module and monitor it ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) discrim = discrim(env.observation_space, env.action_space, **ac_kwargs) # Sync params across processes sync_params(ac) # Note, also sync for Discriminator sync_params(discrim) # Count variables var_counts = tuple(count_vars(module) for module in [ac.pi, ac.v, discrim]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t discrim: %d \n' % var_counts) z_filter = False # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = CostPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam, cost_gamma, cost_lam) # Set up optimizers for policy and value function # pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) # vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # discrim_optimizer = Adam(discrim.parameters(), lr=discrim_lr) pi_optimizer = AdaBelief(ac.pi.parameters(), betas=(0.9, 0.999), eps=1e-8) vf_optimizer = AdaBelief(ac.v.parameters(), betas=(0.9, 0.999), eps=1e-8) discrim_optimizer = AdaBelief(discrim.parameters(), betas=(0.9, 0.999), eps=1e-8) # self.value_fun_optimizer = AdaBelief(self.value_fun.parameters(), betas=(0.9, 0.999), eps=1e-8) # self.cost_fun_optimizer = AdaBelief(self.cost_fun.parameters(), betas=(0.9, 0.999), eps=1e-8) penalty = np.log(max(np.exp(penalty_init)-1, 1e-8)) mov_avg_ret = 0 mov_avg_cost = 0 # Discriminator reward def get_reward(discrim, state, action): state = torch.Tensor(state) action = torch.Tensor(action) state_action = torch.cat([state, action]) with torch.no_grad(): return -math.log(discrim(state_action)[0].item()) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data['logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, 
ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up functions for computing value loss(es) def compute_loss_v(data): obs, ret, cret = data['obs'], data['ret'], data['cret'] v_loss = ((ac.v(obs) - ret) ** 2).mean() return v_loss def compute_loss_discrim(data, demonstrations, acc=False): # memory = np.array(memory) # states = np.vstack(memory[:, 0]) # actions = list(memory[:, 1]) obs = data['obs'] act = data['act'] # states = torch.Tensor(states) # actions = torch.Tensor(actions) criterion = torch.nn.BCELoss() # change demo format demonstrations = torch.Tensor(demonstrations) # Pass both expert and learner through discriminator learner = discrim(torch.cat([obs, act], dim=1)) expert = discrim(demonstrations) learner_acc = (learner > 0.5).float().mean() expert_acc = (expert < 0.5).float().mean() discrim_loss = criterion(learner, torch.ones((obs.shape[0], 1))) + \ criterion(expert, torch.zeros((demonstrations.shape[0], 1))) if acc: return discrim_loss, expert_acc, learner_acc else: return discrim_loss # Set up model saving logger.setup_pytorch_saver(ac) penalty_init_param = np.log(max(np.exp(penalty_init) - 1, 1e-8)) TRAIN_DISC = True def update(cur_penalty, TRAIN_DISC): cur_cost = logger.get_stats('EpCost')[0] cur_rew = logger.get_stats('EpRet')[0] if len(rew_mov_avg_10) >= 10: rew_mov_avg_10.pop(0) cost_mov_avg_10.pop(0) rew_mov_avg_10.append(cur_rew) cost_mov_avg_10.append(cur_cost) mov_avg_ret = np.mean(rew_mov_avg_10) mov_avg_cost = np.mean(cost_mov_avg_10) c = cur_cost - cost_lim if c > 0 and agent.cares_about_cost: logger.log('Warning! Safety constraint is already violated.', 'red') # c is the safety constraint print("current cost: ", cur_cost) data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # discrim_l_old = compute_loss_discrim(data, expert_demonstrations).item() # print("data shape") # print(data['obs'].shape) # print("states shape") # print(np_states.shape) # print("obs shape") # print(np_actions.shape) # print("combined shape") combined_expert_demos = np.concatenate((np_states, np_actions), axis=1) # print(comb.shape) # print(comb.shape) # discrim_l_old = compute_loss_discrim(data, np_states).item() discrim_l_old = compute_loss_discrim(data, combined_expert_demos, acc=False).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Discriminator learning if TRAIN_DISC: for i in range(train_discrim_iters): discrim_optimizer.zero_grad() # loss_discrim = compute_loss_discrim(data, expert_demonstrations) loss_discrim, expert_acc, learner_acc = compute_loss_discrim(data, combined_expert_demos, acc=True) print("discriminator loss: ", loss_discrim) loss_discrim.backward() mpi_avg_grads(discrim) # average grads across MPI processes discrim_optimizer.step() if expert_acc.item() > 0.99 and learner_acc.item() > 0.98: # train_discriminator = False # print("hello") TRAIN_DISC = False # expert_acc = ((discrim(combined_expert_demos) < 0.5).float()).mean() # learner_acc = ((discrim(torch.cat([data['obs'], data['act']], dim=1)) > 0.5).float()).mean() # learner = 
discrim(torch.cat([obs, act], dim=1)) # # print("expert acc: ", expert_acc.item()) # print("learning acc: ", learner_acc.item()) # expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, demonstrations, args) # print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100)) # if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen: # train_discrim_flag = False # Penalty update print("old penalty: ", cur_penalty) cur_penalty = max(0, cur_penalty + penalty_lr*(cur_cost - cost_lim)) print("new penalty: ", cur_penalty) # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, LossDiscrim=discrim_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old), # DeltaLossDiscrim=(loss_discrim.item() - discrim_l_old) ) vf_loss_avg = mpi_avg(v_l_old) pi_loss_avg = mpi_avg(pi_l_old) update_metrics = {'10p mov avg ret': mov_avg_ret, '10p mov avg cost': mov_avg_cost, 'value function loss': vf_loss_avg, 'policy loss': pi_loss_avg, 'current penalty': cur_penalty } wandb.log(update_metrics) # return cur_penalty, train_discriminator return cur_penalty, TRAIN_DISC # Prepare for interaction with environment start_time = time.time() o, r, d, c, ep_ret, ep_cost, ep_len, cum_cost, cum_reward = env.reset(), 0, False, 0, 0, 0, 0, 0, 0 rew_mov_avg_10 = [] cost_mov_avg_10 = [] cur_penalty = penalty_init_param # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): state = running_state(o) # # print("filtered observations") # print(state) # print("unfiltered observations") # print(o) if z_filter: a, v, vc, logp = ac.step(torch.as_tensor(state, dtype=torch.float32)) else: a, v, vc, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) # env.step => Take action next_o, r, d, info = env.step(a) if z_filter: next_o = running_state(next_o) irl_reward = get_reward(discrim, o, a) # Include penalty on cost c = info.get('cost', 0) # Track cumulative cost over training cum_reward += r cum_cost += c ep_ret += r ep_cost += c ep_len += 1 r_total = r - cur_penalty * c r_total /= (1 + cur_penalty) irl_updated = irl_reward - cur_penalty*c irl_updated /= (1 + cur_penalty) # buf.store(o, a, r_total, v, 0, 0, logp, info) # modify # buf.store(o, a, irl_reward, v, 0, 0, logp, info) buf.store(o, a, irl_updated, v, 0, 0, logp, info) # save and log logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not terminal: print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) # if z_filter: # _, v, _, _ = ac.step(torch.as_tensor(state, dtype=torch.float32)) # else: last_v = v last_vc = 0 else: last_v = 0 buf.finish_path(last_v, last_vc) if terminal: # only save EpRet / EpLen if trajectory finished print("end of episode return: ", ep_ret) logger.store(EpRet=ep_ret, EpLen=ep_len, EpCost=ep_cost) # average ep ret and cost avg_ep_ret = ep_ret avg_ep_cost = ep_cost episode_metrics = {'average ep ret': avg_ep_ret, 'average ep cost': avg_ep_cost} wandb.log(episode_metrics) # o, ep_ret, ep_len, ep_cost = env.reset(), 0, 0, 0 # Reset environment o, r, d, c, ep_ret, ep_len, ep_cost = env.reset(), 0, False, 0, 0, 0, 0 # Save model and save last trajectory if (epoch % save_every == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! cur_penalty, TRAIN_DISC = update(cur_penalty, TRAIN_DISC) # Cumulative cost calculations cumulative_cost = mpi_sum(cum_cost) cumulative_reward = mpi_sum(cum_reward) cost_rate = cumulative_cost / ((epoch + 1) * steps_per_epoch) reward_rate = cumulative_reward / ((epoch + 1) * steps_per_epoch) log_metrics = {'cost rate': cost_rate, 'reward rate': reward_rate} wandb.log(log_metrics) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('EpCost', with_min_and_max=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) # logger.log_tabular('LossDiscrim', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) # logger.log_tabular('DeltaLossDiscrim', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
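# The reward reshaping in the rollout loop above and the penalty update inside update()
# follow a simple Lagrangian-style rule: the penalty grows when the recent episode cost
# exceeds cost_lim, and each per-step reward is mixed with the cost before being stored
# in the buffer. A minimal sketch isolating that rule; function and variable names here
# are illustrative, not taken from the code above.

def update_penalty(penalty, ep_cost, cost_lim, penalty_lr):
    # grow the multiplier when the episode cost exceeds the limit, floored at zero
    return max(0.0, penalty + penalty_lr * (ep_cost - cost_lim))

def reshape_reward(r, c, penalty):
    # mix reward and cost with the current penalty before storing the transition
    return (r - penalty * c) / (1.0 + penalty)

# example: cost 30 against a limit of 25 with penalty_lr 0.05 raises the penalty by 0.25
penalty = update_penalty(penalty=1.0, ep_cost=30.0, cost_lim=25.0, penalty_lr=0.05)  # -> 1.25
r_total = reshape_reward(r=1.0, c=1.0, penalty=penalty)  # (1 - 1.25) / 2.25 ≈ -0.11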
def train(hyp, # path/to/hyp.yaml or hyp dictionary opt, device, callbacks ): save_dir, epochs, batch_size, weights, single_cls, evolve, data, cfg, resume, noval, nosave, workers, freeze, = \ Path(opt.save_dir), opt.epochs, opt.batch_size, opt.weights, opt.single_cls, opt.evolve, opt.data, opt.cfg, \ opt.resume, opt.noval, opt.nosave, opt.workers, opt.freeze # Directories w = save_dir / 'weights' # weights dir (w.parent if evolve else w).mkdir(parents=True, exist_ok=True) # make dir last, best = w / 'last.pt', w / 'best.pt' # Hyperparameters if isinstance(hyp, str): with open(hyp, errors='ignore') as f: hyp = yaml.safe_load(f) # load hyps dict LOGGER.info(colorstr('hyperparameters: ') + ', '.join(f'{k}={v}' for k, v in hyp.items())) # Save run settings if not evolve: with open(save_dir / 'hyp.yaml', 'w') as f: yaml.safe_dump(hyp, f, sort_keys=False) with open(save_dir / 'opt.yaml', 'w') as f: yaml.safe_dump(vars(opt), f, sort_keys=False) # Loggers data_dict = None if RANK in [-1, 0]: loggers = Loggers(save_dir, weights, opt, hyp, LOGGER) # loggers instance if loggers.wandb: data_dict = loggers.wandb.data_dict if resume: weights, epochs, hyp = opt.weights, opt.epochs, opt.hyp # Register actions for k in methods(loggers): callbacks.register_action(k, callback=getattr(loggers, k)) # Config plots = not evolve # create plots cuda = device.type != 'cpu' init_seeds(1 + RANK) with torch_distributed_zero_first(LOCAL_RANK): data_dict = data_dict or check_dataset(data) # check if None train_path, val_path = data_dict['train'], data_dict['val'] nc = 1 if single_cls else int(data_dict['nc']) # number of classes names = ['item'] if single_cls and len(data_dict['names']) != 1 else data_dict['names'] # class names assert len(names) == nc, f'{len(names)} names found for nc={nc} dataset in {data}' # check is_coco = isinstance(val_path, str) and val_path.endswith('coco/val2017.txt') # COCO dataset # Model check_suffix(weights, '.pt') # check weights pretrained = weights.endswith('.pt') if pretrained: with torch_distributed_zero_first(LOCAL_RANK): weights = attempt_download(weights) # download if not found locally ckpt = torch.load(weights, map_location=device) # load checkpoint model = Model(cfg or ckpt['model'].yaml, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create exclude = ['anchor'] if (cfg or hyp.get('anchors')) and not resume else [] # exclude keys csd = ckpt['model'].float().state_dict() # checkpoint state_dict as FP32 csd = intersect_dicts(csd, model.state_dict(), exclude=exclude) # intersect model.load_state_dict(csd, strict=False) # load LOGGER.info(f'Transferred {len(csd)}/{len(model.state_dict())} items from {weights}') # report else: model = Model(cfg, ch=3, nc=nc, anchors=hyp.get('anchors')).to(device) # create # Freeze freeze = [f'model.{x}.' 
for x in range(freeze)] # layers to freeze for k, v in model.named_parameters(): v.requires_grad = True # train all layers if any(x in k for x in freeze): LOGGER.info(f'freezing {k}') v.requires_grad = False # Image size gs = max(int(model.stride.max()), 32) # grid size (max stride) imgsz = check_img_size(opt.imgsz, gs, floor=gs * 2) # verify imgsz is gs-multiple # Batch size if RANK == -1 and batch_size == -1: # single-GPU only, estimate best batch size batch_size = check_train_batch_size(model, imgsz) # Optimizer nbs = 64 # nominal batch size accumulate = max(round(nbs / batch_size), 1) # accumulate loss before optimizing hyp['weight_decay'] *= batch_size * accumulate / nbs # scale weight_decay LOGGER.info(f"Scaled weight_decay = {hyp['weight_decay']}") g0, g1, g2 = [], [], [] # optimizer parameter groups for v in model.modules(): if hasattr(v, 'bias') and isinstance(v.bias, nn.Parameter): # bias g2.append(v.bias) if isinstance(v, nn.BatchNorm2d): # weight (no decay) g0.append(v.weight) elif hasattr(v, 'weight') and isinstance(v.weight, nn.Parameter): # weight (with decay) g1.append(v.weight) if opt.adam: optimizer = Adam(g0, lr=hyp['lr0'], betas=(hyp['momentum'], 0.999)) # adjust beta1 to momentum elif opt.adabelief: from adabelief_pytorch import AdaBelief optimizer_parameters = {'lr': hyp['lr0'], 'weight_decay': hyp['weight_decay'], 'eps': 1e-8, 'betas': (0.9, 0.999), 'weight_decouple': True, 'rectify': False, 'print_change_log': False} optimizer = AdaBelief(g0, **optimizer_parameters) else: optimizer = SGD(g0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True) optimizer.add_param_group({'params': g1, 'weight_decay': hyp['weight_decay']}) # add g1 with weight_decay optimizer.add_param_group({'params': g2}) # add g2 (biases) LOGGER.info(f"{colorstr('optimizer:')} {type(optimizer).__name__} with parameter groups " f"{len(g0)} weight, {len(g1)} weight (no decay), {len(g2)} bias") del g0, g1, g2 # Scheduler if opt.linear_lr: def lf(x): return (1 - x / (epochs - 1)) * (1.0 - hyp['lrf']) + hyp['lrf'] # linear else: lf = one_cycle(1, hyp['lrf'], epochs) # cosine 1->hyp['lrf'] scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf) # plot_lr_scheduler(optimizer, scheduler, epochs) # EMA ema = ModelEMA(model) if RANK in [-1, 0] else None # Resume start_epoch, best_fitness = 0, 0.0 if pretrained: # Optimizer if ckpt['optimizer'] is not None: optimizer.load_state_dict(ckpt['optimizer']) best_fitness = ckpt['best_fitness'] # EMA if ema and ckpt.get('ema'): ema.ema.load_state_dict(ckpt['ema'].float().state_dict()) ema.updates = ckpt['updates'] # Epochs start_epoch = ckpt['epoch'] + 1 if resume: assert start_epoch > 0, f'{weights} training to {epochs} epochs is finished, nothing to resume.' if epochs < start_epoch: LOGGER.info(f"{weights} has been trained for {ckpt['epoch']} epochs. 
Fine-tuning for {epochs} more epochs.") epochs += ckpt['epoch'] # finetune additional epochs del ckpt, csd # DP mode if cuda and RANK == -1 and torch.cuda.device_count() > 1: LOGGER.warning('WARNING: DP not recommended, use torch.distributed.run for best DDP Multi-GPU results.\n' 'See Multi-GPU Tutorial at https://github.com/ultralytics/yolov5/issues/475 to get started.') model = torch.nn.DataParallel(model) # SyncBatchNorm if opt.sync_bn and cuda and RANK != -1: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device) LOGGER.info('Using SyncBatchNorm()') # Trainloader train_loader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls, hyp=hyp, augment=True, cache=opt.cache, rect=opt.rect, rank=LOCAL_RANK, workers=workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '), shuffle=True) mlc = int(np.concatenate(dataset.labels, 0)[:, 0].max()) # max label class nb = len(train_loader) # number of batches assert mlc < nc, f'Label class {mlc} exceeds nc={nc} in {data}. Possible class labels are 0-{nc - 1}' # Process 0 if RANK in [-1, 0]: val_loader = create_dataloader(val_path, imgsz, batch_size // WORLD_SIZE * 2, gs, single_cls, hyp=hyp, cache=None if noval else opt.cache, rect=True, rank=-1, workers=workers, pad=0.5, prefix=colorstr('val: '))[0] if not resume: labels = np.concatenate(dataset.labels, 0) # c = torch.tensor(labels[:, 0]) # classes # cf = torch.bincount(c.long(), minlength=nc) + 1. # frequency # model._initialize_biases(cf.to(device)) if plots: plot_labels(labels, names, save_dir) # Anchors if not opt.noautoanchor: check_anchors(dataset, model=model, thr=hyp['anchor_t'], imgsz=imgsz) model.half().float() # pre-reduce anchor precision callbacks.run('on_pretrain_routine_end') # DDP mode if cuda and RANK != -1: model = DDP(model, device_ids=[LOCAL_RANK], output_device=LOCAL_RANK) # Model attributes nl = de_parallel(model).model[-1].nl # number of detection layers (to scale hyps) hyp['box'] *= 3 / nl # scale to layers hyp['cls'] *= nc / 80 * 3 / nl # scale to classes and layers hyp['obj'] *= (imgsz / 640) ** 2 * 3 / nl # scale to image size and layers hyp['label_smoothing'] = opt.label_smoothing model.nc = nc # attach number of classes to model model.hyp = hyp # attach hyperparameters to model model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device) * nc # attach class weights model.names = names # Start training t0 = time.time() nw = max(round(hyp['warmup_epochs'] * nb), 1000) # number of warmup iterations, max(3 epochs, 1k iterations) # nw = min(nw, (epochs - start_epoch) / 2 * nb) # limit warmup to < 1/2 of training last_opt_step = -1 maps = np.zeros(nc) # mAP per class results = (0, 0, 0, 0, 0, 0, 0) # P, R, [email protected], [email protected], val_loss(box, obj, cls) scheduler.last_epoch = start_epoch - 1 # do not move scaler = amp.GradScaler(enabled=cuda) stopper = EarlyStopping(patience=opt.patience) compute_loss = ComputeLoss(model) # init loss class LOGGER.info(f'Image sizes {imgsz} train, {imgsz} val\n' f'Using {train_loader.num_workers * WORLD_SIZE} dataloader workers\n' f"Logging results to {colorstr('bold', save_dir)}\n" f'Starting training for {epochs} epochs...') for epoch in range(start_epoch, epochs): # epoch ------------------------------------------------------------------ model.train() # Update image weights (optional, single-GPU only) if opt.image_weights: cw = model.class_weights.cpu().numpy() * (1 - maps) ** 2 / nc # class weights iw = 
labels_to_image_weights(dataset.labels, nc=nc, class_weights=cw) # image weights dataset.indices = random.choices(range(dataset.n), weights=iw, k=dataset.n) # rand weighted idx # Update mosaic border (optional) # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs) # dataset.mosaic_border = [b - imgsz, -b] # height, width borders mloss = torch.zeros(4, device=device) # mean losses # lmello changed if RANK != -1: train_loader.sampler.set_epoch(epoch) pbar = enumerate(train_loader) LOGGER.info(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'box', 'obj', 'cls', 'closs','labels', 'img_size')) if RANK in [-1, 0]: pbar = tqdm(pbar, total=nb, bar_format='{l_bar}{bar:10}{r_bar}{bar:-10b}') # progress bar optimizer.zero_grad() for i, (imgs, targets, paths, _) in pbar: # batch ------------------------------------------------------------- ni = i + nb * epoch # number integrated batches (since train start) imgs = imgs.to(device, non_blocking=True).float() / 255 # uint8 to float32, 0-255 to 0.0-1.0 # Warmup if ni <= nw: xi = [0, nw] # x interp # compute_loss.gr = np.interp(ni, xi, [0.0, 1.0]) # iou loss ratio (obj_loss = 1.0 or iou) accumulate = max(1, np.interp(ni, xi, [1, nbs / batch_size]).round()) for j, x in enumerate(optimizer.param_groups): # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0 x['lr'] = np.interp(ni, xi, [hyp['warmup_bias_lr'] if j == 2 else 0.0, x['initial_lr'] * lf(epoch)]) if 'momentum' in x: x['momentum'] = np.interp(ni, xi, [hyp['warmup_momentum'], hyp['momentum']]) # Multi-scale if opt.multi_scale: sz = random.randrange(imgsz * 0.5, imgsz * 1.5 + gs) // gs * gs # size sf = sz / max(imgs.shape[2:]) # scale factor if sf != 1: ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]] # new shape (stretched to gs-multiple) imgs = nn.functional.interpolate(imgs, size=ns, mode='bilinear', align_corners=False) # Forward with amp.autocast(enabled=cuda): pred = model(imgs) # forward loss, loss_items = compute_loss(pred, targets.to(device)) # loss scaled by batch_size if RANK != -1: loss *= WORLD_SIZE # gradient averaged between devices in DDP mode if opt.quad: loss *= 4. 
# Backward scaler.scale(loss).backward() # Optimize if ni - last_opt_step >= accumulate: scaler.step(optimizer) # optimizer.step scaler.update() optimizer.zero_grad() if ema: ema.update(model) last_opt_step = ni # Log if RANK in [-1, 0]: mloss = (mloss * i + loss_items) / (i + 1) # update mean losses mem = f'{torch.cuda.memory_reserved() / 1E9 if torch.cuda.is_available() else 0:.3g}G' # (GB) pbar.set_description(('%10s' * 2 + '%10.4g' * 6) % ( # lmello changed f'{epoch}/{epochs - 1}', mem, *mloss, targets.shape[0], imgs.shape[-1])) callbacks.run('on_train_batch_end', ni, model, imgs, targets, paths, plots, opt.sync_bn) # end batch ------------------------------------------------------------------------------------------------ # Scheduler lr = [x['lr'] for x in optimizer.param_groups] # for loggers scheduler.step() if RANK in [-1, 0]: # mAP callbacks.run('on_train_epoch_end', epoch=epoch) ema.update_attr(model, include=['yaml', 'nc', 'hyp', 'names', 'stride', 'class_weights']) final_epoch = (epoch + 1 == epochs) or stopper.possible_stop if not noval or final_epoch: # Calculate mAP results, maps, _ = val.run(data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=ema.ema, single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, plots=False, callbacks=callbacks, compute_loss=compute_loss) # Update best mAP fi = fitness(np.array(results).reshape(1, -1)) # weighted combination of [P, R, [email protected], [email protected]] if fi > best_fitness: best_fitness = fi log_vals = list(mloss) + list(results) + lr callbacks.run('on_fit_epoch_end', log_vals, epoch, best_fitness, fi) # Save model if (not nosave) or (final_epoch and not evolve): # if save ckpt = {'epoch': epoch, 'best_fitness': best_fitness, 'model': deepcopy(de_parallel(model)).half(), 'ema': deepcopy(ema.ema).half(), 'updates': ema.updates, 'optimizer': optimizer.state_dict(), 'wandb_id': loggers.wandb.wandb_run.id if loggers.wandb else None, 'date': datetime.now().isoformat()} # Save last, best and delete torch.save(ckpt, last) if best_fitness == fi: torch.save(ckpt, best) if (epoch > 0) and (opt.save_period > 0) and (epoch % opt.save_period == 0): torch.save(ckpt, w / f'epoch{epoch}.pt') del ckpt callbacks.run('on_model_save', last, epoch, final_epoch, best_fitness, fi) # Stop Single-GPU if RANK == -1 and stopper(epoch=epoch, fitness=fi): break # Stop DDP TODO: known issues shttps://github.com/ultralytics/yolov5/pull/4576 # stop = stopper(epoch=epoch, fitness=fi) # if RANK == 0: # dist.broadcast_object_list([stop], 0) # broadcast 'stop' to all ranks # Stop DPP # with torch_distributed_zero_first(RANK): # if stop: # break # must break all DDP ranks # end epoch ---------------------------------------------------------------------------------------------------- # end training ----------------------------------------------------------------------------------------------------- if RANK in [-1, 0]: LOGGER.info(f'\n{epoch - start_epoch + 1} epochs completed in {(time.time() - t0) / 3600:.3f} hours.') for f in last, best: if f.exists(): strip_optimizer(f) # strip optimizers if f is best: LOGGER.info(f'\nValidating {f}...') results, _, _ = val.run(data_dict, batch_size=batch_size // WORLD_SIZE * 2, imgsz=imgsz, model=attempt_load(f, device).half(), iou_thres=0.65 if is_coco else 0.60, # best pycocotools results at 0.65 single_cls=single_cls, dataloader=val_loader, save_dir=save_dir, save_json=is_coco, verbose=True, plots=True, callbacks=callbacks, compute_loss=compute_loss) # val best model with plots if is_coco: 
callbacks.run('on_fit_epoch_end', list(mloss) + list(results) + lr, epoch, best_fitness, fi) callbacks.run('on_train_end', last, best, plots, epoch, results) LOGGER.info(f"Results saved to {colorstr('bold', save_dir)}") torch.cuda.empty_cache() return results
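# The Backward / Optimize block in the training loop above combines AMP gradient scaling
# with gradient accumulation, so an effective batch of roughly nbs images is reached before
# each optimizer step. A minimal standalone sketch of the same pattern, assuming a CUDA
# device and a generic model and loss rather than the YOLOv5 objects:

import torch
from torch import nn
from adabelief_pytorch import AdaBelief

model = nn.Linear(10, 1).cuda()
optimizer = AdaBelief(model.parameters(), lr=1e-3, eps=1e-8, betas=(0.9, 0.999),
                      weight_decouple=True, rectify=False, print_change_log=False)
scaler = torch.cuda.amp.GradScaler(enabled=True)
accumulate = 4          # number of batches to accumulate before an optimizer step
last_opt_step = -1

for ni in range(100):   # ni plays the role of the integrated batch counter above
    x = torch.randn(16, 10, device='cuda')
    y = torch.randn(16, 1, device='cuda')
    with torch.cuda.amp.autocast(enabled=True):
        loss = nn.functional.mse_loss(model(x), y)
    scaler.scale(loss).backward()          # scaled backward pass; grads accumulate
    if ni - last_opt_step >= accumulate:   # step only every `accumulate` batches
        scaler.step(optimizer)             # unscales grads, then optimizer.step()
        scaler.update()
        optimizer.zero_grad()
        last_opt_step = ni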
def __init__(self, args, params):
    super().__init__(args)
    self._optimizer = AdaBelief(params, **self.optimizer_config)
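# The constructor above resembles a fairseq-style optimizer wrapper, in which
# optimizer_config is a property that maps parsed command-line arguments onto the
# underlying optimizer's constructor kwargs. A sketch of such a property; every
# attribute name on self.args below is an assumption for illustration only.

@property
def optimizer_config(self):
    return {
        'lr': self.args.lr[0] if isinstance(self.args.lr, (list, tuple)) else self.args.lr,
        'betas': (self.args.adam_beta1, self.args.adam_beta2),
        'eps': self.args.adam_eps,
        'weight_decay': self.args.weight_decay,
    }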
def get_optimizer_and_scheduler(net, dataloader):
    print_fn = print if not config.USE_TPU else xm.master_print
    # m = xm.xrt_world_size() if config.USE_TPU else 1
    m = 1
    print_fn(f"World Size: {m}")
    m /= config.WARMUP_FACTOR
    print_fn(f"Learning Rate Multiplier: {m}")
    print_fn(f"Start Learning Rate: {config.LEARNING_RATE * m}")

    # Optimizers
    print_fn(f"Optimizer: {config.OPTIMIZER}")
    if config.OPTIMIZER == "Adam":
        optimizer = torch.optim.Adam(
            params=net.parameters(), lr=config.LEARNING_RATE * m,
            weight_decay=1e-5, amsgrad=False)
    elif config.OPTIMIZER == "AdamW":
        optimizer = optim.AdamW(
            net.parameters(), lr=config.LEARNING_RATE * m, weight_decay=0.001)
    elif config.OPTIMIZER == "AdaBelief":
        optimizer = AdaBelief(
            net.parameters(), lr=config.LEARNING_RATE * m, eps=1e-16, betas=(0.9, 0.999),
            weight_decouple=True, rectify=False, print_change_log=False)
    elif config.OPTIMIZER == "RangerAdaBelief":
        optimizer = RangerAdaBelief(
            net.parameters(), lr=config.LEARNING_RATE * m, eps=1e-12, betas=(0.9, 0.999),
            print_change_log=False)
    elif config.OPTIMIZER == "RAdam":
        optimizer = RAdam(net.parameters(), lr=config.LEARNING_RATE * m)
    else:
        optimizer = optim.SGD(net.parameters(), lr=config.LEARNING_RATE * m)

    # Schedulers
    print_fn(f"Scheduler: {config.SCHEDULER}")
    if config.SCHEDULER == "ReduceLROnPlateau":
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, patience=0, factor=0.1, verbose=config.LEARNING_VERBOSE)
    elif config.SCHEDULER == "CosineAnnealingLR":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=5, eta_min=0)
    elif config.SCHEDULER == "OneCycleLR":
        steps_per_epoch = len(dataloader)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer=optimizer, max_lr=1e-2, epochs=config.MAX_EPOCHS,
            steps_per_epoch=steps_per_epoch, pct_start=0.25)
    elif config.SCHEDULER == "CosineAnnealingWarmRestarts":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=config.MAX_EPOCHS - config.WARMUP_EPOCHS, T_mult=1,
            eta_min=1e-6, last_epoch=-1)
    elif config.SCHEDULER == "StepLR":
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
    else:
        scheduler = None

    print_fn(f"Gradual Warmup: {config.SCHEDULER_WARMUP}")
    if config.SCHEDULER_WARMUP:
        scheduler = GradualWarmupSchedulerV2(
            optimizer, multiplier=config.WARMUP_FACTOR, total_epoch=config.WARMUP_EPOCHS,
            after_scheduler=scheduler)

    return optimizer, scheduler
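# A hedged usage sketch for the function above; net, train_loader, and compute_loss are
# assumed placeholders, and the stepping policy is an assumption. OneCycleLR is normally
# stepped once per batch, the other schedulers here once per epoch, and ReduceLROnPlateau
# additionally expects a validation metric passed to step().

optimizer, scheduler = get_optimizer_and_scheduler(net, train_loader)

for epoch in range(config.MAX_EPOCHS):
    for batch in train_loader:
        loss = compute_loss(net, batch)   # placeholder for the real forward pass and loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if scheduler is not None and config.SCHEDULER == "OneCycleLR":
            scheduler.step()              # per-batch stepping
    if scheduler is not None and config.SCHEDULER != "OneCycleLR":
        scheduler.step()                  # per-epoch stepping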
options = {}
options.update({'method': 'Dopri5'})
options.update({'h': 0.1})
options.update({'rtol': 1e-5})
options.update({'atol': 1e-6})
options.update({'print_neval': False})
options.update({'neval_max': 1000000})
options.update({'safety': None})

# create multiple-shooting instance
multi_shoot = MultipleShoot(
    ode_func=dcmfunc, observation_length=time_length, ODE_options=options,
    smooth_penalty=smooth_penalty, chunk_length=chunk_length)
multi_shoot.prepare_intermediate(input_tensor)

# create optimizer
optimizer = AdaBelief(filter(lambda p: p.requires_grad, multi_shoot.parameters()),
                      lr=lr, eps=1e-16, rectify=False, betas=(0.5, 0.9))
# optimizer = Adam(filter(lambda p: p.requires_grad, multi_shoot.parameters()),
#                  lr=lr, eps=1e-16, betas=(0.5, 0.9))

best_loss = np.inf
for _epoch in range(N_epoch):
    # adjust learning rate
    for param_group in optimizer.param_groups:
        param_group['lr'] *= gamma

    optimizer.zero_grad()

    # forward pass over all shooting chunks, then the multiple-shooting loss
    prediction_chunks, data_chunks = multi_shoot.fit_and_grad(input_tensor, time_points)
    loss = multi_shoot.get_loss(prediction_chunks, data_chunks)
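    # The snippet ends after the loss is computed; presumably the epoch loop continues
    # with a backward pass, an optimizer step, and best-loss tracking. The lines below
    # are a sketch of that continuation, not part of the original code.
    loss.backward()
    optimizer.step()

    if loss.item() < best_loss:
        best_loss = loss.item()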
print(f"Not using pretrained {args.arch}") model = models.__dict__[args.arch]() model = model.to(device) # print(model) start_epoch = 1 if args.resume == True: loc = "cuda:0" checkpoint = torch.load(args.save_path, map_location=loc) model.load_state_dict(checkpoint['net']) print(f"Done loading pretrained, ") # optimizer = optim.AdamW(model.parameters(), lr = args.lr, weight_decay = # args.weight_decay) # optimizer = AdaBelief(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, eps=1e-10, weight_decouple=True, rectify=True) # scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr = # args.max_lr,steps_per_epoch = # len(train_loader), epochs = 10) for epoch in tqdm(range(start_epoch, args.epochs + 1)): train(args, model, device, train_loader, optimizer, epoch) test(model, device, test_loader) # scheduler.step()