def lr_find(self, freeze_until=None, start_lr=1e-7, end_lr=1, num_it=100):
    """Gridsearch the optimal learning rate for the training

    Args:
        freeze_until (str, optional): last layer to freeze
        start_lr (float, optional): initial learning rate
        end_lr (float, optional): final learning rate
        num_it (int, optional): number of iterations to perform
    """
    self.model = freeze_model(self.model.train(), freeze_until)
    # Update param groups & LR
    self._reset_opt(start_lr)
    gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
    scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

    self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
    self.loss_recorder = []

    for batch_idx, (x, target) in enumerate(self.train_loader):
        x, target = self.to_cuda(x, target)

        # Forward
        batch_loss = self._get_loss(x, target)
        self._backprop_step(batch_loss)
        # Update LR
        scheduler.step()

        # Record
        self.loss_recorder.append(batch_loss.item())
        # Stop after the number of iterations
        if batch_idx + 1 == num_it:
            break
def lr_find(
    self,
    freeze_until: Optional[str] = None,
    start_lr: float = 1e-7,
    end_lr: float = 1,
    norm_weight_decay: Optional[float] = None,
    num_it: int = 100,
) -> None:
    """Gridsearch the optimal learning rate for the training

    Args:
        freeze_until (str, optional): last layer to freeze
        start_lr (float, optional): initial learning rate
        end_lr (float, optional): final learning rate
        norm_weight_decay (float, optional): weight decay to apply to normalization parameters
        num_it (int, optional): number of iterations to perform
    """
    if num_it > len(self.train_loader):
        raise ValueError("the value of `num_it` needs to be lower than the number of available batches")

    self.model = freeze_model(self.model.train(), freeze_until)
    # Update param groups & LR
    self._reset_opt(start_lr, norm_weight_decay)
    gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
    scheduler = MultiplicativeLR(self.optimizer, lambda step: gamma)

    self.lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
    self.loss_recorder = []

    if self.amp:
        self.scaler = torch.cuda.amp.GradScaler()

    for batch_idx, (x, target) in enumerate(self.train_loader):
        x, target = self.to_cuda(x, target)

        # Forward
        batch_loss = self._get_loss(x, target)
        self._backprop_step(batch_loss)
        # Update LR
        scheduler.step()

        # Record
        if torch.isnan(batch_loss) or torch.isinf(batch_loss):
            if batch_idx == 0:
                raise ValueError("loss value is NaN or inf.")
            else:
                break
        self.loss_recorder.append(batch_loss.item())
        # Stop after the number of iterations
        if batch_idx + 1 == num_it:
            break

    self.lr_recorder = self.lr_recorder[:len(self.loss_recorder)]
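
# A minimal, self-contained sketch of the exponential LR sweep that the
# `lr_find` methods above rely on: MultiplicativeLR multiplies the current LR
# by the returned factor on every `scheduler.step()`, so a constant `gamma`
# walks the LR from `start_lr` to `end_lr` in `num_it` steps. The toy model
# and random data below are illustrative assumptions, not part of the trainers.
import torch
from torch import nn, optim
from torch.optim.lr_scheduler import MultiplicativeLR

model = nn.Linear(10, 2)
start_lr, end_lr, num_it = 1e-7, 1.0, 100
optimizer = optim.SGD(model.parameters(), lr=start_lr)
gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
scheduler = MultiplicativeLR(optimizer, lambda step: gamma)

lrs, losses = [], []
for _ in range(num_it):
    x, target = torch.randn(32, 10), torch.randint(0, 2, (32,))
    loss = nn.functional.cross_entropy(model(x), target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    lrs.append(optimizer.param_groups[0]["lr"])  # LR used for this iteration
    losses.append(loss.item())
    scheduler.step()
# lrs[-1] is now ~end_lr; plotting losses against lrs on a log x-axis is the
# usual way to pick a learning rate from the sweep.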
def configure_optimizers(self):
    lr_params = self.hparams.Optim
    optim_args = lr_params["args"][lr_params["name"]]
    optimizers = {"adam": Adam, "sgd": SGD, "rmsprop": RMSprop}

    # Define optimizer
    optimizer = optimizers[lr_params["name"]](
        self.parameters(), lr=self.hparams.lr, **optim_args
    )

    # Define Learning Rate Scheduling
    def lambda1(val):
        return lambda epoch: epoch // val

    sched_params = self.hparams.Optim["Schedule"]
    sched_name = sched_params["name"]
    if not sched_name:
        return optimizer

    sched_args = sched_params["args"][sched_name]
    if sched_name == "step":
        scheduler = StepLR(optimizer, **sched_args)
    elif sched_name == "multiplicative":
        scheduler = MultiplicativeLR(
            optimizer, lr_lambda=[lambda1(sched_args["val"])]
        )
    elif sched_name == "lambda":
        scheduler = LambdaLR(optimizer, lr_lambda=[lambda1(sched_args["val"])])
    else:
        raise NotImplementedError("Unimplemented Scheduler!")

    return [optimizer], [scheduler]
def lr_range_test(model, train, test, train_loader, test_loader):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.0001)
    lmbda = lambda epoch: 1.4
    # scheduler = OneCycleLR(optimizer, max_lr=0.5, total_steps=25)
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
    learning_lr_trace = []
    for epoch in range(1, 25):
        # get_last_lr() reports the LR set by the last scheduler.step();
        # calling get_lr() directly is internal API and raises a warning
        print(f'Epoch: {epoch} Learning_Rate {scheduler.get_last_lr()}')
        learning_lr_trace.append(scheduler.get_last_lr())
        train_loss, train_acc = train(model, device, train_loader, optimizer, epoch)
        test_loss, test_acc_l1 = test(model, device, test_loader)
        scheduler.step()
    return learning_lr_trace, train_acc, test_acc_l1
def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.hparams.args.lr)  # , momentum=0.9)
    lmbda = lambda epoch: 1.05
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)
    return {
        'optimizer': optimizer,
        'lr_scheduler': scheduler,
        # 'monitor': 'train_loss'
    }
def configure_optimizers(self):
    # log config here because it's the only place where we always have the
    # logger (it's never called during inference)
    with NamedTemporaryFile(suffix=".yml") as f:
        self.config.to_yaml_file(f.name)
        self.logger.log_artifact(f.name, "config.yml")

    no_decay = {"bias", "norm.weight"}  # norm.weight only applies to nn.LayerNorm
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.config.experiment.tts_training.weight_decay,
        },
        {
            "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]

    def get_optimizer(lr):
        return AdamW(
            optimizer_grouped_parameters,
            lr=lr,
            weight_decay=self.config.experiment.tts_training.weight_decay,
        )

    if self.config.experiment.tts_training.lr_scheduler is not None:
        schedule_config = self.config.experiment.tts_training.lr_scheduler
        assert schedule_config.start_schedule_epoch >= 1, "start_schedule_epoch has to be >= 1"
        start = schedule_config.start_schedule_epoch
        end = schedule_config.end_schedule_epoch
        end = self.config.experiment.max_epochs if end is None else end
        gamma = np.log(schedule_config.initial_lr) - np.log(schedule_config.final_lr)
        gamma /= end - start

        def exp_dec(current):
            if start <= current <= end:
                a = np.exp(-1 * gamma * float(current)) * schedule_config.initial_lr
                b = np.exp(-1 * gamma * float(current - 1)) * schedule_config.initial_lr
                decay = a / b
            else:
                decay = 1
            self.logger.log_metric("learning_rate_decay", decay)
            return decay

        optimizer = get_optimizer(schedule_config.initial_lr)
        scheduler = MultiplicativeLR(optimizer, exp_dec)
        return [optimizer], [scheduler]
    else:
        optimizer = get_optimizer(self.config.experiment.tts_training.learning_rate)
        return optimizer
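
# Sanity check of the decay rule above: inside the schedule window the ratio
# a / b collapses to the constant exp(-gamma), since the initial_lr terms
# cancel, so applying it for (end - start) epochs takes the LR from initial_lr
# to final_lr. The concrete values below are illustrative assumptions.
import numpy as np

initial_lr, final_lr, start, end = 1e-3, 1e-5, 1, 101
gamma = (np.log(initial_lr) - np.log(final_lr)) / (end - start)
factor = np.exp(-gamma)  # per-epoch multiplicative factor
assert np.isclose(initial_lr * factor ** (end - start), final_lr)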
def get_generator(args):
    if args.model == 'rrdb':
        generator = RRDBNet()
        optimizer_G = torch.optim.Adam(generator.parameters(), lr=args.lr, betas=args.betas)
        if args.multistep_lr:
            scheduler_G = MultiStepLR(optimizer_G,
                                      milestones=args.multistep_milestones,
                                      gamma=args.multistep_gamma)
        else:
            # identity schedule: a constant factor of 1 keeps the LR unchanged
            lr_lambda = lambda epoch: 1
            scheduler_G = MultiplicativeLR(optimizer_G, lr_lambda)
        return generator, optimizer_G, scheduler_G
    raise NotImplementedError(str(args.model) + " is not implemented")
def get_scheduler(sched_params, optimizer):
    sched_name = sched_params["name"]
    if not sched_name:
        return optimizer

    sched_args = sched_params["args"][sched_name]
    # `lambda1` is the per-epoch factor factory defined alongside the optimizer
    # setup (see configure_optimizers above): lambda1(val) -> lambda epoch: epoch // val
    if sched_name == "step":
        scheduler = StepLR(optimizer, **sched_args)
    elif sched_name == "multiplicative":
        scheduler = MultiplicativeLR(optimizer, lr_lambda=[lambda1(sched_args["val"])])
    elif sched_name == "lambda":
        scheduler = LambdaLR(optimizer, lr_lambda=[lambda1(sched_args["val"])])
    else:
        raise NotImplementedError("Unimplemented Scheduler!")
    return [scheduler]
def get_linear_schedule_with_minlr(optimizer: Optimizer,
                                   num_warmup_steps: int,
                                   num_training_steps: int,
                                   last_epoch: int = -1,
                                   min_lr: float = 1e-07):
    """
    Creates a scheduler with a learning rate that linearly decreases but saturates at min_lr value.

    Args:
        optimizer (:class:`~torch.optim.Optimizer`):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (:obj:`int`):
            The number of steps for the warmup phase.
        num_training_steps (:obj:`int`):
            The total number of training steps.
        last_epoch (:obj:`int`, `optional`, defaults to -1):
            The index of the last epoch when resuming training.
        min_lr (:obj:`float`, `optional`, defaults to 1e-07):
            The value of minimum learning rate where it should saturate.

    Return:
        :obj:`torch.optim.lr_scheduler.MultiplicativeLR` with the appropriate schedule.
    """
    init_lr = optimizer.defaults['lr']

    def lr_lambda(current_step: int):
        steps_done = float(num_training_steps - current_step)
        if current_step > 1:
            # ratio of consecutive remaining-step counts; the cumulative
            # product of these factors telescopes into a linear decay
            mul_fac = steps_done / max(steps_done + 1, 1)
        else:
            mul_fac = steps_done / num_training_steps
        if mul_fac * init_lr > min_lr:
            return mul_fac
        else:
            # saturate: a factor of 1 holds the LR at its current value
            return 1

    return MultiplicativeLR(optimizer, lr_lambda, last_epoch)
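
# Quick check (illustrative values, using the function above) that the
# multiplicative factors telescope into a linear decay: after t scheduler
# steps the LR is init_lr * (N - t) / N, ignoring the min-lr clamp.
import torch

p = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.SGD([p], lr=0.1)
N = 10
sched = get_linear_schedule_with_minlr(opt, num_warmup_steps=0, num_training_steps=N)
for t in range(N - 1):
    opt.step()
    sched.step()
print(opt.param_groups[0]["lr"])  # ~0.01 == 0.1 * (N - t) / N with t = 9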
def init_network(params, params_gan):
    netD = None
    optimizerD = None
    train_gan = params['train_gan']
    ngpu = torch.cuda.device_count()

    modelMDE = FPNNet(num_channels=params['num_channels'])
    # save model architecture
    with open(f'{MODEL_DIR}/network_layers.txt', 'w') as f:
        print(modelMDE, file=f)

    if train_gan:
        netD = Discriminator(ngpu)
        with open(f'{MODEL_DIR}/discrim_layers.txt', 'w') as f:
            print(netD, file=f)

    # wrap into DataParallel to run in several GPUs
    if params['parallel'] and ngpu > 1:
        print(f"Using {ngpu} GPUs")
        modelMDE = nn.DataParallel(modelMDE, list(range(ngpu)))
        if train_gan:
            netD = nn.DataParallel(netD, list(range(ngpu)))

    modelMDE.to(device)
    optimizer = torch.optim.Adam(modelMDE.parameters(), lr=params['lr'], weight_decay=4e-5)
    lmbda = lambda epoch: params['lr_decay']
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)

    total_params = sum(p.numel() for p in modelMDE.parameters())
    print(f'\nNum of parameters in MDE net: {total_params}')

    if train_gan:
        netD.to(device)
        netD.apply(weights_init)
        total_params_d = sum(p.numel() for p in netD.parameters())
        print(f'\nNum of parameters in Discriminator net: {total_params_d}')
        optimizerD = torch.optim.Adam(netD.parameters(),
                                      lr=params_gan['lr'],
                                      betas=(params_gan['beta1_d'], 0.999))

    return modelMDE, netD, optimizer, optimizerD, scheduler
# Load EfficientNet Model
model = IVFEfficientNet(ARCHITECTURE)
# model.load('model/regression_epoch-18.pt')
# model.load('model/efficientnet-b4regression_epoch-20.pt')
# model.load('model/efficientnet-b4_finetune_regression_epoch-10.pt')

# loss function
criterion = nn.SmoothL1Loss()

# optimizer
# optimizer = optim.SGD(model.parameters(), lr=3e-3, momentum=0.9, nesterov=True)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=L2_COEFFICIENT)
lmbda = lambda epoch: LEARNING_RATE_DECAY
scheduler = MultiplicativeLR(optimizer, lr_lambda=lmbda)

model._train(train_loader,
             epochs=EPOCHS,
             loss_function=criterion,
             optimizer=optimizer,
             valid_loader=valid_loader,
             scheduler=scheduler,
             save_filename=MODEL_SAVE_NAME)

outputs = model(overall_loader)
outputs = scaler.inverse_transform(outputs)
outputs = pd.DataFrame(outputs).rename(columns={
    0: 'pred BS',
    1: 'pred ICM',
    2: 'pred TE',
})
def fit(
    self,
    X_train,
    y_train,
    X_validation=None,
    y_validation=None,
    loss_key="opt",
    batch_size=128,
    num_workers=0,
    learning_rate=1e-3,
    learning_rate_lambda=0.995,
    max_epoch=10000,
    early_stopping=100,
    device="cpu",
    verbose=False,
):
    """
    Train the model using gradient descent back propagation

    Parameters
    ----------
    X_train : {array-like, sparse matrix} of shape (n_samples, n_features)
        Features matrix used to train the model
    y_train : vector-like of shape (n_samples, 1)
        The target vector used to train the model
    X_validation : {array-like, sparse matrix} of shape (n_samples, n_features)
        Features matrix used for early stopping of the training
    y_validation : vector-like of shape (n_samples, 1)
        The target vector used for early stopping of the training
    loss_key : string (default = 'opt')
        Which field of the loss dictionary to optimize
    batch_size : int (default = 128)
        Batch size
    num_workers : int (default = 0)
        Number of cpus to use
    learning_rate : float (default = 1e-3)
        Gradient descent learning rate
    learning_rate_lambda : float (default = 0.995)
        The rate of decreasing learning_rate
    max_epoch : int (default = 10000)
        The maximum number of optimization epochs
    early_stopping : int (default = 100)
        The number of epochs without improving the best validation loss
        allowed before stopping
    device : 'cpu' or 'gpu' (default = 'cpu')
        Device used by pytorch for training the model and using the trained
        model for encoding/decoding
    verbose : True or False (default = False)
        Verbosity
    """
    assert X_train.shape[1] == self.input_dim
    self.to(device)
    train_loader = torch.utils.data.DataLoader(
        TensorDataset(torch.Tensor(X_train), torch.Tensor(y_train)),
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    if X_validation is not None:
        validation_loader = torch.utils.data.DataLoader(
            TensorDataset(torch.Tensor(X_validation), torch.Tensor(y_validation)),
            batch_size=batch_size,
            shuffle=True,
            num_workers=num_workers,
        )
    else:
        validation_loader = None

    optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
    scheduler = MultiplicativeLR(
        optimizer, lr_lambda=(lambda epoch: learning_rate_lambda))

    best_validation_loss = None
    iter_no_improve = 0
    for epoch in range(max_epoch):
        self.train()
        training_loss = 0
        for data in train_loader:
            Xb = data[0].to(device)
            optimizer.zero_grad()
            output = self(Xb)
            loss = self.loss(output, Xb)[loss_key]
            loss.backward()
            optimizer.step()
            training_loss += loss.detach().cpu().numpy()

        self.eval()
        validation_loss = 0
        if validation_loader:
            with torch.no_grad():
                for data in validation_loader:
                    Xb = data[0].to(device)
                    output = self(Xb)
                    loss = self.loss(output, Xb)[loss_key]
                    validation_loss += loss.detach().cpu().numpy()
            if best_validation_loss is None or validation_loss < best_validation_loss:
                best_validation_loss = validation_loss
                iter_no_improve = 0
            else:
                iter_no_improve += 1
            if iter_no_improve > early_stopping:
                if verbose:
                    print(f"Early stopping after {epoch} epochs")
                break

        scheduler.step()
        if verbose:
            print(
                f"[{epoch}] training loss={training_loss}, validation loss={validation_loss}"
            )
    return self
def set_scheduler(self, lmbda):
    self.scheduler = MultiplicativeLR(self.optimizer, lr_lambda=lmbda, verbose=False)
def collate(batch):
    return dist_custom_collate(batch, dist_bins, 64)


training_loader = DataLoader(training_data,
                             batch_size=4,
                             shuffle=True,
                             num_workers=0,
                             pin_memory=True,
                             collate_fn=collate)
accumulate = 1
torch.autograd.set_detect_anomaly(True)
optimizer = AdamW(network.parameters(), lr=1e-6)
# MLR is presumably MultiplicativeLR imported under an alias; a factor > 1
# grows the LR every step (an LR range test rather than a decay schedule)
scheduler = MLR(optimizer, lambda x: 1.1)


class FocalLoss(nn.CrossEntropyLoss):
    '''Focal loss for classification tasks on imbalanced datasets'''

    def __init__(self, gamma, alpha=None, ignore_index=-100, reduction='mean'):
        super().__init__(weight=alpha, ignore_index=ignore_index, reduction='mean')
        self.reduction = reduction
        self.gamma = gamma

    def forward(self, input_, target):
        cross_entropy = super().forward(input_, target)
        # Temporarily mask out ignore index to '0' for valid gather-indices input.
        # This won't contribute final loss as the cross_entropy contribution
    return MetaAgent([agent(params, env_arg) for env_arg in train_envs])


rho = 16
n_rules = int(sum([t.Size(s).numel() for s in param_shapes.values()]) / rho)
population = GaussianMixturePopulation(
    {k: t.Size(v[:-1]) for k, v in param_shapes.items()},
    (n_rules, 5), constructor, 0.1, device)

iterations = 500
pop_size = 500
optim = SGD(population.parameters(), lr=0.02)
lr_decay = 0.995  # t.exp(t.log(t.scalar_tensor(0.5)) / 100) # halves every 100 steps
sched = MultiplicativeLR(optim, lr_lambda=lambda step: lr_decay)
pbar = tqdm.tqdm(range(iterations))
best_so_far = -1e9
train_writer, test_writer = util.get_writers('hebbian')


def fitness_shaping(x):
    return normalize(compute_centered_ranks(x))


for i in pbar:
    optim.zero_grad()
    with Pool(cpu_count() // 2) as pool:
        raw_fitness = population.fitness_grads(pop_size, pool, fitness_shaping)
    train_writer.add_scalar('fitness', raw_fitness.mean(), i)
    train_writer.add_scalar('fitness/std', raw_fitness.std(), i)
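
# Note on the decay constant above: 0.995 per step halves the LR roughly every
# log(0.5) / log(0.995) ~ 138 steps, not every 100. The commented-out formula
# t.exp(t.log(t.scalar_tensor(0.5)) / 100) ~ 0.9931 is the factor that would
# halve it every 100 steps exactly.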
def initalize_schedulers(ae_optim, disc_optim, cfg):
    ae_sched = MultiplicativeLR(ae_optim, lr_lambda=partial(lambda_rule_ae, cfg=cfg))
    disc_sched = MultiplicativeLR(disc_optim, lr_lambda=partial(lambda_rule_disc, cfg=cfg))
    return ae_sched, disc_sched
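
# The lambda rules bound with partial() above aren't shown in this snippet.
# One plausible shape, purely as an assumption about lambda_rule_ae /
# lambda_rule_disc (the config keys below are hypothetical): a per-epoch
# factor read from the config, applied after a warmup period.
def lambda_rule_ae(epoch, cfg):
    # hold the LR flat during warmup, then decay by a constant factor per epoch
    return 1.0 if epoch < cfg["warmup_epochs"] else cfg["ae_lr_decay"]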
num_train = int(P_TRAIN * num_cars)
num_test = num_cars - num_train
train_data, test_data = random_split(dataset, [num_train, num_test])

# set up the train and test data loaders
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# load ResNet-50 with every layer frozen except for layer3-bottleneck5 and beyond,
# and a new fully-connected network which outputs a 196-dim vector
device = get_device()
model = load_resnet50_layer3_bottleneck5(num_car_models)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = MultiplicativeLR(optimizer, lr_lambda=lambda epoch: LR_DECAY)

# set up the output logger
output_dir = '/home/mchobanyan/data/research/transfer/vis/finetune-car-resnet50'
model_dir = os.path.join(output_dir, 'models')
create_folder(model_dir)
logger = TrainingLogger(filepath=os.path.join(output_dir, 'training-log.csv'))

for epoch in tqdm(range(NUM_EPOCHS)):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
    test_loss, test_acc = test_epoch(model, test_loader, criterion, device)
    scheduler.step()
    logger.add_entry(epoch, train_loss, test_loss, train_acc, test_acc)
    checkpoint(model, os.path.join(model_dir, f'model_epoch{epoch}.pt'))
def train(train_data,
          exp_dir=datetime.now().strftime("corrector_model/%Y-%m-%d_%H%M"),
          learning_rate=0.00005,
          rsize=10,
          epochs=1,
          checkpoint_path='',
          seed=6548,
          batch_size=4,
          edge_loss=False,
          model_type='cnet',
          model_cap='normal',
          optimizer_type='radam',
          reset_optimizer=True,  # if true, does not load optimizer checkpoints
          safe_descent=True,
          activation_type='mish',
          activation_args={},
          io=None,
          dynamic_lr=True,
          dropout=0,
          rotations=False,
          use_batch_norm=True,
          batch_norm_momentum=None,
          batch_norm_affine=True,
          use_gc=True,
          no_lr_schedule=False,
          diff_features_only=False):

    start_time = time.time()

    io.cprint("-------------------------------------------------------" +
              "\nexport dir = " + '/checkpoints/' + exp_dir +
              "\nbase_learning_rate = " + str(learning_rate) +
              "\nuse_batch_norm = " + str(use_batch_norm) +
              "\nbatch_norm_momentum = " + str(batch_norm_momentum) +
              "\nbatch_norm_affine = " + str(batch_norm_affine) +
              "\nno_lr_schedule = " + str(no_lr_schedule) +
              "\nuse_gc = " + str(use_gc) +
              "\nrsize = " + str(rsize) +
              "\npython_version: " + sys.version +
              "\ntorch_version: " + torch.__version__ +
              "\nnumpy_version: " + np.version.version +
              "\nmodel_type: " + model_type +
              "\nmodel_cap: " + model_cap +
              "\noptimizer: " + optimizer_type +
              "\nactivation_type: " + activation_type +
              "\nsafe_descent: " + str(safe_descent) +
              "\ndynamic_lr: " + str(dynamic_lr) +
              "\nrotations: " + str(rotations) +
              "\nepochs = " + str(epochs) +
              (("\ncheckpoint = " + checkpoint_path) if (checkpoint_path != None and checkpoint_path != '') else '') +
              "\nseed = " + str(seed) +
              "\nbatch_size = " + str(batch_size) +
              "\n#train_data = " + str(sum([bin.size(0) for bin in train_data["train_bins"]])) +
              "\n#test_data = " + str(len(train_data["test_samples"])) +
              "\n#validation_data = " + str(len(train_data["val_samples"])) +
              "\nedge_loss = " + str(edge_loss) +
              "\n-------------------------------------------------------" +
              "\nstart_time: " + datetime.now().strftime("%Y-%m-%d_%H%M%S") +
              "\n-------------------------------------------------------")

    # initialize torch & cuda ---------------------------------------------------------------------
    torch.manual_seed(seed)
    np.random.seed(seed)
    device = utils.getDevice(io)

    # extract train- & test data (and move to device) --------------------------------------------
    # train_bins = [bin.float().to(device) for bin in train_data["train_bins"]]
    # test_samples = [sample.float().to(device) for sample in train_data["test_samples"]]
    # val_samples = [sample.float().to(device) for sample in train_data["val_samples"]]
    train_bins = [bin.float() for bin in train_data["train_bins"]]
    test_samples = [sample.float() for sample in train_data["test_samples"]]
    val_samples = [sample.float() for sample in train_data["val_samples"]]

    # Initialize Model ------------------------------------------------------------------------------
    model_args = {
        'model_type': model_type,
        'model_cap': model_cap,
        'input_channels': test_samples[0].size(1),
        'output_channels': test_samples[0].size(1),
        'rsize': rsize,
        'emb_dims': 1024,
        'activation_type': activation_type,
        'activation_args': activation_args,
        'dropout': dropout,
        'batch_norm': use_batch_norm,
        'batch_norm_affine': batch_norm_affine,
        'batch_norm_momentum': batch_norm_momentum,
        'diff_features_only': diff_features_only
    }
    model = getModel(model_args).to(device)

    # init optimizer & scheduler -------------------------------------------------------------------
    lookahead_sync_period = 6
    optimizer = None
    if optimizer_type == 'radam':
        optimizer = RAdam(model.parameters(), lr=learning_rate,
                          betas=(0.9, 0.999), eps=1e-8, use_gc=use_gc)
    elif optimizer_type == 'lookahead':
        optimizer = Ranger(model.parameters(), lr=learning_rate,
                           alpha=0.9, k=lookahead_sync_period)

    # make sure that either a LR schedule is given or dynamic LR is enabled
    assert dynamic_lr or not no_lr_schedule
    scheduler = None if no_lr_schedule else MultiplicativeLR(
        optimizer, lr_lambda=MultiplicativeAnnealing(epochs))

    # set train settings & load previous model state ------------------------------------------------------------
    checkpoint = getEmptyCheckpoint()
    last_epoch = 0
    if (checkpoint_path != None and checkpoint_path != ''):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'][-1])
        if not reset_optimizer:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'][-1])
        last_epoch = len(checkpoint['model_state_dict'])
        print('> loaded checkpoint! (%d epochs)' % (last_epoch))

    checkpoint['train_settings'].append({
        'learning_rate': learning_rate,
        'scheduler': scheduler,
        'epochs': epochs,
        'seed': seed,
        'batch_size': batch_size,
        'edge_loss': edge_loss,
        'optimizer': optimizer_type,
        'safe_descent:': str(safe_descent),
        'dynamic_lr': str(dynamic_lr),
        'rotations': str(rotations),
        'train_data_count': sum([bin.size(0) for bin in train_data["train_bins"]]),
        'test_data_count': len(train_data["test_samples"]),
        'validation_data_count': len(train_data["val_samples"]),
        'model_args': model_args
    })

    # set up report interval (for logging) and batch size -------------------------------------------------------------------
    report_interval = 100
    loss_function = torch.nn.MSELoss(reduction='mean')

    # begin training ###########################################################################################################################
    io.cprint("\nBeginning Training..\n")

    for epoch in range(last_epoch + 1, last_epoch + epochs + 1):

        io.cprint("Epoch: %d ------------------------------------------------------------------------------------------" % (epoch))
        io.cprint("Current LR: %.10f" % (optimizer.param_groups[0]['lr']))

        model.train()
        optimizer.zero_grad()

        checkpoint['train_batch_loss'].append([])
        checkpoint['train_batch_N'].append([])
        checkpoint['train_batch_lr_adjust'].append([])
        checkpoint['train_batch_loss_reduction'].append([])
        checkpoint['lr'].append(optimizer.param_groups[0]['lr'])

        # draw random batches from random bins
        binbatches = utils.drawBinBatches([bin.size(0) for bin in train_bins], batchsize=batch_size)
        checkpoint['train_batch_N'][-1] = [train_bins[bin_id][batch_ids].size(1)
                                           for (bin_id, batch_ids) in binbatches]

        failed_loss_optims = 0
        cum_lr_adjust_fac = 0
        cum_loss_reduction = 0

        # pre-compute random rotations if needed
        batch_rotations = [None] * len(binbatches)
        if rotations:
            start_rotations = time.time()
            batch_rotations = torch.zeros((len(binbatches), batch_size,
                                           test_samples[0].size(1), test_samples[0].size(1)),
                                          device=device)
            for i in range(len(binbatches)):
                for j in range(batch_size):
                    batch_rotations[i, j] = utils.getRandomRotation(test_samples[0].size(1), device=device)
            print("created batch rotations (%ds)" % (time.time() - start_rotations))

        b = 0  # batch counter
        train_start = time.time()

        for (bin_id, batch_ids) in binbatches:
            b += 1
            # print("handling batch %d" % (b))

            # prediction & loss ----------------------------------------
            batch_sample = train_bins[bin_id][batch_ids].to(model.base.device)  # size: (B x N x d x 2)
            batch_loss = getBatchLoss(model, batch_sample, loss_function,
                                      edge_loss=edge_loss, rotations=batch_rotations[b - 1])
            batch_loss.backward()
            checkpoint['train_batch_loss'][-1].append(batch_loss.item())

            new_loss = 0.0
            lr_adjust = 1.0
            loss_reduction = 0.0

            # if safe descent is enabled, try to optimize the descent step so that a reduction in loss is guaranteed
            if safe_descent:

                # create backups to restore states before the optimizer step
                model_state_backup = copy.deepcopy(model.state_dict())
                opt_state_backup = copy.deepcopy(optimizer.state_dict())

                # make an optimizer step
                optimizer.step()

                # in each iteration, check if the optimizer gave an improvement
                # if not, restore the original states, reduce the learning rate and try again
                # no gradient needed for the plain loss calculation
                with torch.no_grad():
                    for i in range(10):
                        new_loss = getBatchLoss(model, batch_sample, loss_function,
                                                edge_loss=edge_loss,
                                                rotations=batch_rotations[b - 1]).item()
                        # if the model performs better now we continue, if not we try a smaller learning step
                        if (new_loss < batch_loss.item()):
                            # print("lucky! (%f -> %f) reduction: %.4f%%" % (batch_loss.item(), new_loss, 100 * (batch_loss.item()-new_loss) / batch_loss.item()))
                            break
                        else:
                            # print("try again.. (%f -> %f)" % (batch_loss.item(), new_loss))
                            model.load_state_dict(model_state_backup)
                            optimizer.load_state_dict(opt_state_backup)
                            lr_adjust *= 0.7
                            optimizer.step(lr_adjust=lr_adjust)

                loss_reduction = 100 * (batch_loss.item() - new_loss) / batch_loss.item()

                if new_loss >= batch_loss.item():
                    failed_loss_optims += 1
                else:
                    cum_lr_adjust_fac += lr_adjust
                    cum_loss_reduction += loss_reduction
            else:
                cum_lr_adjust_fac += lr_adjust
                optimizer.step()

            checkpoint['train_batch_lr_adjust'][-1].append(lr_adjust)
            checkpoint['train_batch_loss_reduction'][-1].append(loss_reduction)

            # reset gradients
            optimizer.zero_grad()

            # statistic calculation and output -------------------------
            if b % report_interval == 0:
                last_100_loss = sum(checkpoint['train_batch_loss'][-1][b - report_interval:b]) / report_interval
                improvement_indicator = '+' if epoch > 1 and last_100_loss < checkpoint['train_loss'][-1] else ''
                io.cprint(' Batch %4d to %4d | loss: %.10f%1s | av. dist. per neighbor: %.10f | E%3d | T:%5ds | Failed Optims: %3d (%05.2f%%) | Av. Adjust LR: %.6f | Av. Loss Reduction: %07.4f%%'
                          % (b - (report_interval - 1), b, last_100_loss, improvement_indicator,
                             np.sqrt(last_100_loss), epoch, time.time() - train_start,
                             failed_loss_optims, 100 * (failed_loss_optims / report_interval),
                             (cum_lr_adjust_fac / (report_interval - failed_loss_optims) if failed_loss_optims < report_interval else -1),
                             (cum_loss_reduction / (report_interval - failed_loss_optims) if failed_loss_optims < report_interval else -1)))
                failed_loss_optims = 0
                cum_lr_adjust_fac = 0
                cum_loss_reduction = 0

        checkpoint['train_loss'].append(sum(checkpoint['train_batch_loss'][-1]) / b)
        checkpoint['train_time'].append(time.time() - train_start)
        io.cprint('----\n TRN | time: %5ds | loss: %.10f| av. dist. per neighbor: %.10f'
                  % (checkpoint['train_time'][-1], checkpoint['train_loss'][-1],
                     np.sqrt(checkpoint['train_loss'][-1])))

        torch.cuda.empty_cache()

        ####################
        # Test & Validation
        ####################

        with torch.no_grad():

            if use_batch_norm:
                model.eval_bn()
                eval_bn_start = time.time()
                # run through all train samples again to accumulate layer-wise input distribution
                # statistics (mean and variance) with fixed weights
                # these statistics are later used for the BatchNorm layers during inference
                for (bin_id, batch_ids) in binbatches:
                    input = train_bins[bin_id][batch_ids][:, :, :, 0].squeeze(-1)  # size: (B x N x d)
                    model(input.transpose(1, 2).to(model.base.device)).transpose(1, 2)  # size: (B x N x d)
                io.cprint('Accumulated BN Layer statistics (%ds)' % (time.time() - eval_bn_start))

            model.eval()

            test_start = time.time()
            test_loss = getTestLoss(model, test_samples, loss_function, edge_loss=edge_loss)
            checkpoint['test_loss'].append(test_loss)
            checkpoint['test_time'].append(time.time() - test_start)
            io.cprint(' TST | time: %5ds | loss: %.10f| av. dist. per neighbor: %.10f'
                      % (checkpoint['test_time'][-1], checkpoint['test_loss'][-1],
                         np.sqrt(checkpoint['test_loss'][-1])))

            val_start = time.time()
            val_loss = getTestLoss(model, val_samples, loss_function, edge_loss=edge_loss)
            checkpoint['val_loss'].append(val_loss)
            checkpoint['val_time'].append(time.time() - val_start)
            io.cprint(' VAL | time: %5ds | loss: %.10f| av. dist. per neighbor: %.10f'
                      % (checkpoint['val_time'][-1], checkpoint['val_loss'][-1],
                         np.sqrt(checkpoint['val_loss'][-1])))

        ####################
        # Scheduler Step
        ####################

        if not no_lr_schedule:
            scheduler.step()

        if epoch > 1 and dynamic_lr and sum(checkpoint['train_batch_lr_adjust'][-1]) > 0:
            io.cprint("----\n dynamic lr adjust: %.10f"
                      % (0.5 * (1 + sum(checkpoint['train_batch_lr_adjust'][-1]) / len(checkpoint['train_batch_lr_adjust'][-1]))))
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.5 * (1 + sum(checkpoint['train_batch_lr_adjust'][-1]) / len(checkpoint['train_batch_lr_adjust'][-1]))

        # Save model and optimizer state ..
        checkpoint['model_state_dict'].append(copy.deepcopy(model.state_dict()))
        checkpoint['optimizer_state_dict'].append(copy.deepcopy(optimizer.state_dict()))
        torch.save(checkpoint, exp_dir + '/corrector_checkpoints.t7')

    io.cprint("\n-------------------------------------------------------" +
              ("\ntotal_time: %.2fh" % ((time.time() - start_time) / 3600)) +
              ("\ntrain_time: %.2fh" % (sum(checkpoint['train_time']) / 3600)) +
              ("\ntest_time: %.2fh" % (sum(checkpoint['test_time']) / 3600)) +
              ("\nval_time: %.2fh" % (sum(checkpoint['val_time']) / 3600)) +
              "\n-------------------------------------------------------" +
              "\nend_time: " + datetime.now().strftime("%Y-%m-%d_%H%M%S") +
              "\n-------------------------------------------------------")
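
# `MultiplicativeAnnealing` used in the trainer above isn't defined in the
# snippet; MultiplicativeLR only requires a callable mapping the epoch index
# to a multiplicative factor. A minimal stand-in with that interface (the
# internals are a pure assumption about the real class):
class MultiplicativeAnnealing:
    def __init__(self, epochs, final_factor=0.01):
        # per-epoch factor so the LR shrinks to `final_factor` of its
        # starting value over `epochs` epochs
        self.gamma = final_factor ** (1.0 / max(epochs, 1))

    def __call__(self, epoch):
        return self.gamma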
class CNNModel():

    def __init__(self, args={}):
        self.args = args
        self.parse_args(args)
        self.classifier = ConvNet()
        self.optimizer = optim.Adam(self.classifier.parameters(),
                                    lr=self.lr,
                                    betas=(0.9, 0.98),
                                    eps=1e-9)
        self.loss_function = nn.CrossEntropyLoss()
        lmbda = lambda epoch: self.lr_factor
        self.lr_scheduler = MultiplicativeLR(self.optimizer, lr_lambda=lmbda)

    def parse_args(self, args):
        self.lr = args['learning_rate'] if 'learning_rate' in args else 0.001
        self.max_epoch = args['max_epoch'] if 'max_epoch' in args else 100
        self.early_stop = args['early_stop'] if 'early_stop' in args else False
        self.batch_size = args['batch_size'] if 'batch_size' in args else 64
        self.shuffle = args['shuffle'] if 'shuffle' in args else False
        self.adjust_lr = args['adaptive_learning_rate'] if 'adaptive_learning_rate' in args else False
        self.early_stop_idx_limit = 10
        self.lr_factor = 0.95
        self.min_lr = 5e-6

    @staticmethod  # standalone helper, not bound to the instance
    def adjust_learning_rate(optimizer, factor=.5, min_lr=0.00001):
        for i, param_group in enumerate(optimizer.param_groups):
            old_lr = float(param_group['lr'])
            new_lr = max(old_lr * factor, min_lr)
            param_group['lr'] = new_lr
            logger.info('adjusting learning rate from %.6f to %.6f' % (old_lr, new_lr))

    def train_model(self, train_X, train_Y):
        if self.early_stop:
            best_acc = 0
            best_model = None
            early_stop_idx = 0
            train_X, dev_X = np.split(train_X, [int(len(train_X) * .8)])
            train_Y, dev_Y = np.split(train_Y, [int(len(train_Y) * .8)])
            tensor_dev_X = torch.Tensor(dev_X)
            tensor_dev_Y = torch.Tensor(dev_Y).type(torch.LongTensor)
            dev = TensorDataset(tensor_dev_X, tensor_dev_Y)
            dev_loader = DataLoader(dev, batch_size=self.batch_size, shuffle=False)

        tensor_train_X = torch.Tensor(train_X)
        tensor_train_Y = torch.Tensor(train_Y).type(torch.LongTensor)
        train = TensorDataset(tensor_train_X, tensor_train_Y)
        train_loader = DataLoader(train, batch_size=self.batch_size, shuffle=self.shuffle)

        prev_loss = np.inf
        for epoch in range(self.max_epoch):
            running_loss = 0.0
            for i, data in enumerate(train_loader):
                features, labels = data
                self.optimizer.zero_grad()
                outputs = self.classifier(features.view(features.size(0), 1, 28, 28))
                loss = self.loss_function(outputs, labels)
                loss.backward()
                self.optimizer.step()
                running_loss += loss.item()
            print("epoch: ", epoch, "training loss: ", running_loss)

            if self.adjust_lr and running_loss > prev_loss:
                old_lr = self.optimizer.param_groups[0]['lr']
                self.lr_scheduler.step()
                new_lr = self.optimizer.param_groups[0]['lr']
                print("Adjusting learning rate from %.5f to %.5f" % (old_lr, new_lr))
            prev_loss = running_loss

            if self.early_stop:
                with torch.no_grad():
                    dev_correct = 0.
                    dev_total = 0.
                    dev_loss = 0.
                    for data in dev_loader:
                        features, labels = data
                        outputs = self.classifier(features.view(features.size(0), 1, 28, 28))
                        loss = self.loss_function(outputs, labels)
                        _, predicted = torch.max(outputs.data, 1)
                        dev_total += labels.size(0)
                        dev_correct += (predicted == labels).sum().item()
                        dev_loss += loss.item()
                    current_acc = dev_correct / dev_total
                    if current_acc > best_acc:
                        print("Best dev accuracy obtained: %.3f" % current_acc)
                        best_model = copy.deepcopy(self.classifier)
                        best_acc = current_acc
                        early_stop_idx = 0
                    else:
                        early_stop_idx += 1
                    if early_stop_idx >= self.early_stop_idx_limit:
                        print("early stop triggered")
                        self.classifier = best_model
                        break
        return self

    def score(self, test_X, test_Y):
        tensor_test_X = torch.Tensor(test_X)
        tensor_test_Y = torch.Tensor(test_Y).type(torch.LongTensor)
        test = TensorDataset(tensor_test_X, tensor_test_Y)
        test_loader = DataLoader(test, batch_size=self.batch_size, shuffle=False)
        correct = 0.0
        total = 0.0
        with torch.no_grad():
            for data in test_loader:
                features, labels = data
                outputs = self.classifier(features.view(features.size(0), 1, 28, 28))
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        return correct / total

    @staticmethod
    def Name():
        return "CNN"
def train(train_data,
          exp_dir=datetime.now().strftime("detector_model/%Y-%m-%d_%H%M"),
          learning_rate=0.00005,
          rsize=10,
          epochs=1,
          checkpoint_path='',
          seed=6548,
          batch_size=4,
          model_type='cnet',
          model_cap='normal',
          optimizer='radam',
          safe_descent=True,
          activation_type='mish',
          activation_args={},
          io=None,
          dynamic_lr=True,
          dropout=0,
          rotations=False,
          use_batch_norm=True,
          batch_norm_momentum=None,
          batch_norm_affine=True,
          use_gc=True,
          no_lr_schedule=False,
          diff_features_only=False,
          scale_min=1,
          scale_max=1,
          noise=0):

    start_time = time.time()

    scale_min = scale_min if scale_min < 1 else 1
    scale_max = scale_max if scale_max > 1 else 1

    io.cprint("-------------------------------------------------------" +
              "\nexport dir = " + '/checkpoints/' + exp_dir +
              "\nbase_learning_rate = " + str(learning_rate) +
              "\nuse_batch_norm = " + str(use_batch_norm) +
              "\nbatch_norm_momentum = " + str(batch_norm_momentum) +
              "\nbatch_norm_affine = " + str(batch_norm_affine) +
              "\nno_lr_schedule = " + str(no_lr_schedule) +
              "\nuse_gc = " + str(use_gc) +
              "\nrsize = " + str(rsize) +
              "\npython_version: " + sys.version +
              "\ntorch_version: " + torch.__version__ +
              "\nnumpy_version: " + np.version.version +
              "\nmodel_type: " + model_type +
              "\nmodel_cap: " + model_cap +
              "\noptimizer: " + optimizer +
              "\nactivation_type: " + activation_type +
              "\nsafe_descent: " + str(safe_descent) +
              "\ndynamic_lr: " + str(dynamic_lr) +
              "\nrotations: " + str(rotations) +
              "\nscaling: " + str(scale_min) + " to " + str(scale_max) +
              "\nnoise: " + str(noise) +
              "\nepochs = " + str(epochs) +
              (("\ncheckpoint = " + checkpoint_path) if checkpoint_path != '' else '') +
              "\nseed = " + str(seed) +
              "\nbatch_size = " + str(batch_size) +
              "\n#train_data = " + str(sum([bin.size(0) for bin in train_data["train_bins"]])) +
              "\n#test_data = " + str(len(train_data["test_samples"])) +
              "\n#validation_data = " + str(len(train_data["val_samples"])) +
              "\n-------------------------------------------------------" +
              "\nstart_time: " + datetime.now().strftime("%Y-%m-%d_%H%M%S") +
              "\n-------------------------------------------------------")

    # initialize torch & cuda ---------------------------------------------------------------------
    torch.manual_seed(seed)
    np.random.seed(seed)
    device = utils.getDevice(io)

    # extract train- & test data (and move to device) --------------------------------------------
    pts = train_data["pts"].to(device)
    val_pts = train_data["val_pts"].to(device)
    train_bins = train_data["train_bins"]
    test_samples = train_data["test_samples"]
    val_samples = train_data["val_samples"]

    # the maximum noise offset for each point is equal to the distance to its nearest neighbor
    max_noise = torch.square(pts[train_data["knn"][:, 0]] - pts).sum(dim=1).sqrt()

    # Initialize Model ------------------------------------------------------------------------------
    model_args = {
        'model_type': model_type,
        'model_cap': model_cap,
        'input_channels': pts.size(1),
        'output_channels': 2,
        'rsize': rsize,
        'emb_dims': 1024,
        'activation_type': activation_type,
        'activation_args': activation_args,
        'dropout': dropout,
        'batch_norm': use_batch_norm,
        'batch_norm_affine': batch_norm_affine,
        'batch_norm_momentum': batch_norm_momentum,
        'diff_features_only': diff_features_only
    }
    model = getModel(model_args).to(device)

    # init optimizer & scheduler -------------------------------------------------------------------
    lookahead_sync_period = 6
    opt = None
    if optimizer == 'radam':
        opt = RAdam(model.parameters(), lr=learning_rate,
                    betas=(0.9, 0.999), eps=1e-8, use_gc=use_gc)
    elif optimizer == 'lookahead':
        opt = Ranger(model.parameters(), lr=learning_rate,
                     alpha=0.9, k=lookahead_sync_period)

    # make sure that either a LR schedule is given or dynamic LR is enabled
    assert dynamic_lr or not no_lr_schedule
    scheduler = None if no_lr_schedule else MultiplicativeLR(
        opt, lr_lambda=MultiplicativeAnnealing(epochs))

    # set train settings & load previous model state ------------------------------------------------------------
    checkpoint = getEmptyCheckpoint()
    last_epoch = 0
    if (checkpoint_path != ''):
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['model_state_dict'][-1])
        # `optimizer` holds the optimizer *name* here; the optimizer object is `opt`
        opt.load_state_dict(checkpoint['optimizer_state_dict'][-1])
        last_epoch = len(checkpoint['model_state_dict'])
        print('> loaded checkpoint! (%d epochs)' % (last_epoch))

    checkpoint['train_settings'].append({
        'learning_rate': learning_rate,
        'scheduler': scheduler,
        'epochs': epochs,
        'seed': seed,
        'batch_size': batch_size,
        'optimizer': optimizer,
        'safe_descent:': str(safe_descent),
        'dynamic_lr': str(dynamic_lr),
        'rotations': str(rotations),
        'scale_min': scale_min,
        'scale_max': scale_max,
        'noise': noise,
        'train_data_count': sum([bin.size(0) for bin in train_data["train_bins"]]),
        'test_data_count': len(train_data["test_samples"]),
        'validation_data_count': len(train_data["val_samples"]),
        'model_args': model_args
    })

    # calculate class weights ---------------------------------------------------------------------
    av_c1_freq = sum([torch.sum(bin[:, :, 1]).item() for bin in train_data["train_bins"]]) \
        / sum([bin[:, :, 1].numel() for bin in train_data["train_bins"]])
    class_weights = torch.tensor([av_c1_freq, 1 - av_c1_freq]).float().to(device)
    io.cprint("\nC0 Weight: %.4f" % (class_weights[0].item()))
    io.cprint("C1 Weight: %.4f" % (class_weights[1].item()))

    # Adjust Weights in favor of C1 (edge:true class)
    # class_weights[0] = class_weights[0] / 2
    # class_weights[1] = 1 - class_weights[0]
    # io.cprint("\nAdjusted C0 Weight: %.4f" % (class_weights[0].item()))
    # io.cprint("Adjusted C1 Weight: %.4f" % (class_weights[1].item()))

    # set up report interval (for logging) and batch size -------------------------------------------------------------------
    report_interval = 100

    # begin training ###########################################################################################################################
    io.cprint("\nBeginning Training..\n")

    for epoch in range(last_epoch + 1, last_epoch + epochs + 1):

        io.cprint("Epoch: %d ------------------------------------------------------------------------------------------" % (epoch))
        io.cprint("Current LR: %.10f" % (opt.param_groups[0]['lr']))

        model.train()
        opt.zero_grad()

        checkpoint['train_batch_loss'].append([])
        checkpoint['train_batch_N'].append([])
        checkpoint['train_batch_acc'].append([])
        checkpoint['train_batch_C0_acc'].append([])
        checkpoint['train_batch_C1_acc'].append([])
        checkpoint['train_batch_lr_adjust'].append([])
        checkpoint['train_batch_loss_reduction'].append([])
        checkpoint['lr'].append(opt.param_groups[0]['lr'])

        # draw random batches from random bins
        binbatches = utils.drawBinBatches([bin.size(0) for bin in train_bins], batchsize=batch_size)
        checkpoint['train_batch_N'][-1] = [train_bins[bin_id][batch_ids].size(1)
                                           for (bin_id, batch_ids) in binbatches]

        failed_loss_optims = 0
        cum_lr_adjust_fac = 0
        cum_loss_reduction = 0

        # pre-compute random rotations if needed
        batch_rotations = [None] * len(binbatches)
        if rotations:
            start_rotations = time.time()
            batch_rotations = torch.zeros((len(binbatches), batch_size, pts.size(1), pts.size(1)),
                                          device=device)
            for i in range(len(binbatches)):
                for j in range(batch_size):
                    batch_rotations[i, j] = utils.getRandomRotation(pts.size(1), device=device)
            print("created batch rotations (%ds)" % (time.time() - start_rotations))

        b = 0  # batch counter
        train_start = time.time()

        for (bin_id, batch_ids) in binbatches:
            b += 1

            batch_pts_ids = train_bins[bin_id][batch_ids][:, :, 0]  # size: (B x N)
            batch_input = pts[batch_pts_ids]  # size: (B x N x d)
            batch_target = train_bins[bin_id][batch_ids][:, :, 1].to(device)  # size: (B x N)

            if batch_rotations[b - 1] is not None:
                batch_input = batch_input.matmul(batch_rotations[b - 1])

            if noise > 0:
                noise_v = torch.randn(batch_input.size(), device=batch_input.device)  # size: (B x N x d)
                noise_v.div_(torch.square(noise_v).sum(dim=2).sqrt()[:, :, None])  # norm to unit vectors
                # in-place addcmul_: the original out-of-place addcmul discarded its result
                batch_input.addcmul_(noise_v, max_noise[batch_pts_ids][:, :, None], value=noise)

            if scale_min < 1 or scale_max > 1:
                # batch_scales = scale_min + torch.rand(batch_input.size(0), device=batch_input.device) * (scale_max - scale_min)
                batch_scales = torch.rand(batch_input.size(0), device=batch_input.device)
                batch_scales.mul_(scale_max - scale_min)
                batch_scales.add_(scale_min)
                # in-place mul_: the original out-of-place mul discarded its result
                batch_input.mul_(batch_scales[:, None, None])

            batch_input = batch_input.transpose(1, 2)  # size: (B x d x N)

            # prediction & loss ----------------------------------------
            batch_prediction = model(batch_input).transpose(1, 2)  # size: (B x N x 2)
            batch_loss = cross_entropy(batch_prediction.reshape(-1, 2),
                                       batch_target.view(-1),
                                       class_weights,
                                       reduction='mean')
            batch_loss.backward()
            checkpoint['train_batch_loss'][-1].append(batch_loss.item())

            new_loss = 0.0
            lr_adjust = 1.0
            loss_reduction = 0.0

            # if safe descent is enabled, try to optimize the descent step so that a reduction in loss is guaranteed
            if safe_descent:

                # create backups to restore states before the optimizer step
                model_state_backup = copy.deepcopy(model.state_dict())
                opt_state_backup = copy.deepcopy(opt.state_dict())

                # make an optimizer step
                opt.step()

                # in each iteration, check if the optimizer gave an improvement
                # if not, restore the original states, reduce the learning rate and try again
                # no gradient needed for the plain loss calculation
                with torch.no_grad():
                    for i in range(10):
                        # new_batch_prediction = model(batch_input).transpose(1,2).contiguous()
                        new_batch_prediction = model(batch_input).transpose(1, 2)
                        new_loss = cross_entropy(new_batch_prediction.reshape(-1, 2),
                                                 batch_target.view(-1),
                                                 class_weights,
                                                 reduction='mean').item()
                        # if the model performs better now we continue, if not we try a smaller learning step
                        if (new_loss < batch_loss.item()):
                            # print("lucky! (%f -> %f) reduction: %.4f%%" % (batch_loss.item(), new_loss, 100 * (batch_loss.item()-new_loss) / batch_loss.item()))
                            break
                        else:
                            # print("try again.. (%f -> %f)" % (batch_loss.item(), new_loss))
                            model.load_state_dict(model_state_backup)
                            opt.load_state_dict(opt_state_backup)
                            lr_adjust *= 0.7
                            opt.step(lr_adjust=lr_adjust)

                loss_reduction = 100 * (batch_loss.item() - new_loss) / batch_loss.item()

                if new_loss >= batch_loss.item():
                    failed_loss_optims += 1
                else:
                    cum_lr_adjust_fac += lr_adjust
                    cum_loss_reduction += loss_reduction
            else:
                cum_lr_adjust_fac += lr_adjust
                opt.step()

            checkpoint['train_batch_lr_adjust'][-1].append(lr_adjust)
            checkpoint['train_batch_loss_reduction'][-1].append(loss_reduction)

            # reset gradients
            opt.zero_grad()

            # make class prediction and save stats -----------------------
            success_vector = torch.argmax(batch_prediction, dim=2) == batch_target
            c0_idx = batch_target == 0
            c1_idx = batch_target == 1
            checkpoint['train_batch_acc'][-1].append(
                torch.sum(success_vector).item() / success_vector.numel())
            checkpoint['train_batch_C0_acc'][-1].append(
                torch.sum(success_vector[c0_idx]).item() / torch.sum(c0_idx).item())  # TODO handle division by zero
            checkpoint['train_batch_C1_acc'][-1].append(
                torch.sum(success_vector[c1_idx]).item() / torch.sum(c1_idx).item())  # TODO

            # statistic calculation and output -------------------------
            if b % report_interval == 0:
                last_100_loss = sum(checkpoint['train_batch_loss'][-1][b - report_interval:b]) / report_interval
                last_100_acc = sum(checkpoint['train_batch_acc'][-1][b - report_interval:b]) / report_interval
                last_100_acc_c0 = sum(checkpoint['train_batch_C0_acc'][-1][b - report_interval:b]) / report_interval
                last_100_acc_c1 = sum(checkpoint['train_batch_C1_acc'][-1][b - report_interval:b]) / report_interval
                io.cprint(' Batch %4d to %4d | loss: %.5f%1s| acc: %.4f%1s| C0 acc: %.4f%1s| C1 acc: %.4f%1s| E%3d | T:%5ds | Failed Optims: %3d (%05.2f%%) | Av. Adjust LR: %.6f | Av. Loss Reduction: %07.4f%%'
                          % (b - (report_interval - 1), b,
                             last_100_loss, '+' if epoch > 1 and last_100_loss < checkpoint['train_loss'][-1] else '',
                             last_100_acc, '+' if epoch > 1 and last_100_acc > checkpoint['train_acc'][-1] else '',
                             last_100_acc_c0, '+' if epoch > 1 and last_100_acc_c0 > checkpoint['train_C0_acc'][-1] else '',
                             last_100_acc_c1, '+' if epoch > 1 and last_100_acc_c1 > checkpoint['train_C1_acc'][-1] else '',
                             epoch, time.time() - train_start,
                             failed_loss_optims, 100 * (failed_loss_optims / report_interval),
                             (cum_lr_adjust_fac / (report_interval - failed_loss_optims) if failed_loss_optims < report_interval else -1),
                             (cum_loss_reduction / (report_interval - failed_loss_optims) if failed_loss_optims < report_interval else -1)))
                failed_loss_optims = 0
                cum_lr_adjust_fac = 0
                cum_loss_reduction = 0

        checkpoint['train_loss'].append(sum(checkpoint['train_batch_loss'][-1]) / b)
        checkpoint['train_acc'].append(sum(checkpoint['train_batch_acc'][-1]) / b)
        checkpoint['train_C0_acc'].append(sum(checkpoint['train_batch_C0_acc'][-1]) / b)
        checkpoint['train_C1_acc'].append(sum(checkpoint['train_batch_C1_acc'][-1]) / b)
        checkpoint['train_time'].append(time.time() - train_start)
        io.cprint('----\n TRN | time: %5ds | loss: %.10f | acc: %.4f | C0 acc: %.4f | C1 acc: %.4f'
                  % (checkpoint['train_time'][-1], checkpoint['train_loss'][-1],
                     checkpoint['train_acc'][-1], checkpoint['train_C0_acc'][-1],
                     checkpoint['train_C1_acc'][-1]))

        torch.cuda.empty_cache()

        ####################
        # Test & Validation
        ####################

        with torch.no_grad():

            if use_batch_norm:
                model.eval_bn()
                eval_bn_start = time.time()
                # run through all train samples again to accumulate layer-wise input distribution
                # statistics (mean and variance) with fixed weights
                # these statistics are later used for the BatchNorm layers during inference
                for (bin_id, batch_ids) in binbatches:
                    batch_pts_ids = train_bins[bin_id][batch_ids][:, :, 0]  # size: (B x N)
                    batch_input = pts[batch_pts_ids]  # size: (B x N x d)
                    # batch_input = batch_input.transpose(1,2).contiguous() # size: (B x d x N)
                    batch_input = batch_input.transpose(1, 2)  # size: (B x d x N)
                    model(batch_input)
                io.cprint('Accumulated BN Layer statistics (%ds)' % (time.time() - eval_bn_start))

            model.eval()

            if len(test_samples) > 0:
                test_start = time.time()
                test_loss, test_acc, test_acc_c0, test_acc_c1 = getTestLoss(
                    pts, test_samples, model, class_weights)
                checkpoint['test_loss'].append(test_loss)
                checkpoint['test_acc'].append(test_acc)
                checkpoint['test_C0_acc'].append(test_acc_c0)
                checkpoint['test_C1_acc'].append(test_acc_c1)
                checkpoint['test_time'].append(time.time() - test_start)
                io.cprint(' TST | time: %5ds | loss: %.10f | acc: %.4f | C0 acc: %.4f | C1 acc: %.4f'
                          % (checkpoint['test_time'][-1], checkpoint['test_loss'][-1],
                             checkpoint['test_acc'][-1], checkpoint['test_C0_acc'][-1],
                             checkpoint['test_C1_acc'][-1]))
            else:
                io.cprint(' TST | n/a (no samples)')

            if len(val_samples) > 0:
                val_start = time.time()
                val_loss, val_acc, val_acc_c0, val_acc_c1 = getTestLoss(
                    val_pts, val_samples, model, class_weights)
                checkpoint['val_loss'].append(val_loss)
                checkpoint['val_acc'].append(val_acc)
                checkpoint['val_C0_acc'].append(val_acc_c0)
                checkpoint['val_C1_acc'].append(val_acc_c1)
                checkpoint['val_time'].append(time.time() - val_start)
                io.cprint(' VAL | time: %5ds | loss: %.10f | acc: %.4f | C0 acc: %.4f | C1 acc: %.4f'
                          % (checkpoint['val_time'][-1], checkpoint['val_loss'][-1],
                             checkpoint['val_acc'][-1], checkpoint['val_C0_acc'][-1],
                             checkpoint['val_C1_acc'][-1]))
            else:
                io.cprint(' VAL | n/a (no samples)')

        ####################
        # Scheduler Step
        ####################

        if not no_lr_schedule:
            scheduler.step()

        if epoch > 1 and dynamic_lr and sum(checkpoint['train_batch_lr_adjust'][-1]) > 0:
            io.cprint("----\n dynamic lr adjust: %.10f"
                      % (0.5 * (1 + sum(checkpoint['train_batch_lr_adjust'][-1]) / len(checkpoint['train_batch_lr_adjust'][-1]))))
            for param_group in opt.param_groups:
                param_group['lr'] *= 0.5 * (1 + sum(checkpoint['train_batch_lr_adjust'][-1]) / len(checkpoint['train_batch_lr_adjust'][-1]))

        # Save model and optimizer state ..
        checkpoint['model_state_dict'].append(copy.deepcopy(model.state_dict()))
        checkpoint['optimizer_state_dict'].append(copy.deepcopy(opt.state_dict()))
        torch.save(checkpoint, exp_dir + '/detector_checkpoints.t7')

    io.cprint("\n-------------------------------------------------------" +
              ("\ntotal_time: %.2fh" % ((time.time() - start_time) / 3600)) +
              ("\ntrain_time: %.2fh" % (sum(checkpoint['train_time']) / 3600)) +
              ("\ntest_time: %.2fh" % (sum(checkpoint['test_time']) / 3600)) +
              ("\nval_time: %.2fh" % (sum(checkpoint['val_time']) / 3600)) +
              "\n-------------------------------------------------------" +
              "\nend_time: " + datetime.now().strftime("%Y-%m-%d_%H%M%S") +
              "\n-------------------------------------------------------")
def record_lr(
    model: torch.nn.Module,
    train_loader: DataLoader,
    batch_transforms,
    optimizer,
    start_lr: float = 1e-7,
    end_lr: float = 1,
    num_it: int = 100,
    amp: bool = False,
):
    """Gridsearch the optimal learning rate for the training.
    Adapted from https://github.com/frgfm/Holocron/blob/master/holocron/trainer/core.py
    """
    if num_it > len(train_loader):
        raise ValueError("the value of `num_it` needs to be lower than the number of available batches")

    model = model.train()
    # Update param groups & LR
    optimizer.defaults["lr"] = start_lr
    for pgroup in optimizer.param_groups:
        pgroup["lr"] = start_lr

    gamma = (end_lr / start_lr) ** (1 / (num_it - 1))
    scheduler = MultiplicativeLR(optimizer, lambda step: gamma)

    lr_recorder = [start_lr * gamma ** idx for idx in range(num_it)]
    loss_recorder = []

    if amp:
        scaler = torch.cuda.amp.GradScaler()

    for batch_idx, (images, targets) in enumerate(train_loader):
        if torch.cuda.is_available():
            images = images.cuda()
        images = batch_transforms(images)

        # Forward, Backward & update
        optimizer.zero_grad()
        if amp:
            with torch.cuda.amp.autocast():
                train_loss = model(images, targets)["loss"]
            scaler.scale(train_loss).backward()
            # Gradient clipping
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            # Update the params
            scaler.step(optimizer)
            scaler.update()
        else:
            train_loss = model(images, targets)["loss"]
            train_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
        # Update LR
        scheduler.step()

        # Record
        if not torch.isfinite(train_loss):
            if batch_idx == 0:
                raise ValueError("loss value is NaN or inf.")
            else:
                break
        loss_recorder.append(train_loss.item())
        # Stop after the number of iterations
        if batch_idx + 1 == num_it:
            break

    return lr_recorder[:len(loss_recorder)], loss_recorder
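
# One possible way to consume record_lr's output; the matplotlib plotting is
# an assumption on top of the function above, which only returns the two lists.
import matplotlib.pyplot as plt

lrs, losses = record_lr(model, train_loader, batch_transforms, optimizer)
plt.plot(lrs, losses)
plt.xscale("log")  # LR grows geometrically, so a log axis gives an even spread
plt.xlabel("learning rate")
plt.ylabel("training loss")
plt.show()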
class DeepSeqNet(Module):

    def __init__(self):
        super(DeepSeqNet, self).__init__()

    def _compile(self, optimizer, learning_rate):
        self._set_optim(optimizer, learning_rate)
        self._set_scheduler()
        self._set_criterion()

    def _set_optim(self, optimizer, learning_rate):
        optimizer = optimizer.lower()
        if optimizer == "adam":
            self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        elif optimizer == "rmsprop":
            self.optimizer = optim.RMSprop(self.parameters(), lr=learning_rate)
        else:
            self.optimizer = optim.SGD(self.parameters(), lr=learning_rate)

    def _set_scheduler(self):
        self.scheduler = MultiplicativeLR(self.optimizer, lr_lambda=(lambda x: 0.95))

    def _set_criterion(self):
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x_txt, x_num):
        txt_features = self.txt_net_forward(x_txt)
        num_features = self.num_net_forward(x_num)
        features = torch.cat((txt_features, num_features), 1)
        out_features = self.dropout(features)
        logits = self.fc(out_features)
        return logits

    def txt_net_forward(self, x_txt):
        raise NotImplementedError()

    def num_net_forward(self, x_num):
        for linear in self.linear_layers:
            x_num = self.activation_layer(linear(x_num))
        return x_num

    def fit(self, x_txt, x_num, y):
        self.train()
        self.optimizer.zero_grad()
        y_ = self.forward(x_txt, x_num)
        loss = self.criterion(y_, y)
        loss.backward()
        self.optimizer.step()
        return loss

    def evaluate(self, data_iterator):
        self.eval()
        labels, preds = [], []
        for _, (x_txt, x_num, y) in enumerate(data_iterator):
            x_txt, x_num = x_txt.t(), x_num.t()
            if torch.cuda.is_available():
                x_txt, x_num = x_txt.cuda(), x_num.cuda()
            y_ = self.forward(x_txt, x_num)
            pred = torch.argmax(y_, 1)
            preds.extend(pred.cpu().numpy())
            labels.extend(y.numpy())
        score = accuracy_score(labels, np.array(preds).flatten())
        return score

    def run_epoch(self, train_iterator, val_iterator):
        train_losses = []
        val_accuracies = []
        losses = []
        for i, (x_txt, x_num, y) in enumerate(train_iterator):
            x_txt, x_num = x_txt.t(), x_num.t()
            if torch.cuda.is_available():
                x_txt, x_num = x_txt.cuda(), x_num.cuda()
                y = y.cuda()
            loss = self.fit(x_txt, x_num, y)
            losses.append(loss.item())
            if i % 100 == 0 and i != 0:
                avg_train_loss = float(np.mean(losses))
                train_losses.append(avg_train_loss)
                losses = []
                val_accuracy = self.evaluate(val_iterator)
                print("Iteration: %4d | train loss: %3.2f | val acc.: %.2f"
                      % ((i + 1), avg_train_loss * 100, val_accuracy * 100))
                # Run the scheduler to reduce the learning rate
                # (passing epoch to step() is deprecated; the default is equivalent)
                self.scheduler.step()
        return train_losses, val_accuracies
def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
    scheduler = MultiplicativeLR(optimizer, lr_lambda=lambda epoch: self.decay)
    return [optimizer], [scheduler]
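
# Usage note for the constant-factor schedules above (parameter values here
# are illustrative): MultiplicativeLR compounds the factor, so a constant
# per-epoch factor f gives exponential decay, lr_t = lr_0 * f ** t.
import torch
from torch.optim.lr_scheduler import MultiplicativeLR

p = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.SGD([p], lr=1.0)
sched = MultiplicativeLR(opt, lr_lambda=lambda epoch: 0.95)
for _ in range(10):
    opt.step()
    sched.step()
assert abs(opt.param_groups[0]["lr"] - 0.95 ** 10) < 1e-12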