def configure_optimizers(self):
    if self.params["generator"]["optimizer"] == "adam":
        generator_optimizer = torch.optim.Adam(
            self.generator_model.parameters(),
            lr=self.params['generator']['learning_rate'],
            weight_decay=self.params['generator']['weight_decay'],
            betas=(self.params["generator"]["beta1"],
                   self.params["generator"]["beta2"]))
    elif self.params["generator"]["optimizer"] == "qhadam":
        generator_optimizer = QHAdam(self.generator_model.parameters(),
                                     nus=(0.7, 1.0),
                                     betas=(0.95, 0.998))
    else:
        raise NameError("Unknown optimizer name")

    if self.params["critic"]["optimizer"] == "adam":
        critic_optimizer = torch.optim.Adam(
            self.critic_model.parameters(),
            lr=self.params['critic']['learning_rate'],
            weight_decay=self.params['critic']['weight_decay'],
            betas=(self.params["critic"]["beta1"],
                   self.params["critic"]["beta2"]))
    elif self.params["critic"]["optimizer"] == "qhadam":
        critic_optimizer = QHAdam(self.critic_model.parameters(),
                                  nus=(0.7, 1.0),
                                  betas=(0.95, 0.998))
    else:
        raise NameError("Unknown optimizer name")

    return generator_optimizer, critic_optimizer
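An illustrative shape for the self.params dictionary this method reads; the values are placeholders, not taken from the original project. Note that the "qhadam" branches above do not pass lr, so they fall back to QHAdam's default learning rate.

# Hypothetical configuration, for illustration only.
params = {
    "generator": {"optimizer": "qhadam", "learning_rate": 1e-4,
                  "weight_decay": 0.0, "beta1": 0.5, "beta2": 0.999},
    "critic":    {"optimizer": "adam", "learning_rate": 1e-4,
                  "weight_decay": 0.0, "beta1": 0.5, "beta2": 0.999},
}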
def init_trainer_network(self):
    nus = (0.7, 1.0)  # note: unused below; every optimizer hard-codes nus=(0.7, 0.8)
    self.gen_paramas = list(self.generator.parameters())
    self.dsc_params = list(self.discriminators.parameters())

    if self.cfg.proj_lang:
        self.language_params = list(self.language_model.proj.parameters())
        self.optim_language = QHAdam(self.language_params,
                                     lr=self.cfg.lang_lr,
                                     betas=self.cfg.lang_betas,
                                     nus=(0.7, 0.8))

    self.style_params = list(self.style_model.parameters())
    self.content_params = list(self.content_model.parameters())

    self.optim_generator = QHAdam(self.gen_paramas,
                                  lr=self.cfg.gen_lr,
                                  betas=self.cfg.gen_betas,
                                  nus=(0.7, 0.8))
    self.optim_discriminator = QHAdam(self.dsc_params,
                                      lr=self.cfg.dsc_lr,
                                      betas=self.cfg.dsc_betas,
                                      nus=(0.7, 0.8))
    self.optim_style = QHAdam(self.style_params,
                              lr=self.cfg.lmf_lr,
                              betas=self.cfg.lmf_betas,
                              nus=(0.7, 0.8))
    self.optim_content = QHAdam(self.content_params,
                                lr=self.cfg.lmf_lr,
                                betas=self.cfg.lmf_betas,
                                nus=(0.7, 0.8))
def define_optimizer(model, args):
    if args.optimizer.startswith('adam'):
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                     lr=args.lr,
                                     weight_decay=args.weight_decay)
    elif args.optimizer.startswith('rmsprop'):
        optimizer = torch.optim.RMSprop(filter(lambda p: p.requires_grad, model.parameters()),
                                        lr=args.lr,
                                        weight_decay=args.weight_decay)
    elif args.optimizer.startswith('sgd'):
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),
                                    lr=args.lr,
                                    weight_decay=args.weight_decay)
    elif args.optimizer.startswith('qhadam'):
        optimizer = QHAdam(filter(lambda p: p.requires_grad, model.parameters()),
                           lr=args.lr,
                           nus=[0.7, 1.0],
                           betas=[0.995, 0.999],
                           weight_decay=args.weight_decay)
    else:
        raise ValueError('Optimizer not supported')

    print('Optimizer: ', optimizer)
    return optimizer
def qhadam_ctor(params):
    # lr, betas, l2 and eps are free variables captured from the enclosing scope
    return QHAdam(params,
                  lr=lr,
                  betas=betas,
                  weight_decay=l2 * 2.0,
                  nus=(1.0, 1.0),
                  eps=eps)
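The constructor above closes over lr, betas, l2 and eps from an enclosing scope. A minimal sketch of how such a factory could be bound; the hyperparameter values are illustrative, not taken from the original code.

from qhoptim.pyt import QHAdam

def make_qhadam_ctor(lr=1e-3, betas=(0.9, 0.999), l2=1e-4, eps=1e-8):
    # With nus=(1.0, 1.0) QHAdam reduces to plain Adam, so this constructor
    # mirrors Adam behaviour while keeping the QHAdam interface.
    def qhadam_ctor(params):
        return QHAdam(params, lr=lr, betas=betas,
                      weight_decay=l2 * 2.0, nus=(1.0, 1.0), eps=eps)
    return qhadam_ctor

# usage: optimizer = make_qhadam_ctor()(model.parameters())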
def __init__(self, data_processor, bottleneck_dim=128, num_codebooks=16, hidden_dim=512,
             decoder_layers=2, encoder_layers=2, **kwargs):
    super().__init__()
    self.data_processor = data_processor
    self.encoder1 = nn.Sequential(
        Feedforward(self.data_processor.input_dim, hidden_dim,
                    num_layers=encoder_layers, **kwargs),
        nn.Linear(hidden_dim, bottleneck_dim))
    # note: the quantizer configuration is hard-coded here; the num_codebooks
    # argument above is not forwarded
    self.quntizer = Model(input_dim=bottleneck_dim, hidden_dim=1024, bottleneck_dim=256,
                          encoder_layers=2, decoder_layers=2, Activation=nn.ReLU,
                          num_codebooks=8, codebook_size=256, initial_entropy=3.0,
                          share_codewords=True).cuda()
    self.distance = DISTANCES['euclidian_squared']
    self.triplet_delta = 5
    all_parameters = list(self.encoder1.parameters()) + list(self.quntizer.parameters())
    self.optimizer = OneCycleSchedule(
        QHAdam(all_parameters, nus=(0.8, 0.7), betas=(0.95, 0.998)),
        learning_rate_base=1e-3, warmup_steps=10000, decay_rate=0.2)
    self.experiment_path = 'logs'
    self.writer = SummaryWriter(self.experiment_path, comment='Cora')
trainloader = DataLoader(TorchDataset(data.X_train, data.y_train),
                         batch_size=BATCH_SIZE, num_workers=16, shuffle=True)
valloader = DataLoader(TorchDataset(data.X_valid, data.y_valid),
                       batch_size=BATCH_SIZE * 2, num_workers=16, shuffle=False)
testloader = DataLoader(TorchDataset(data.X_test, data.y_test),
                        batch_size=BATCH_SIZE * 2, num_workers=16, shuffle=False)

test_losses, train_time, test_time = [], [], []
for SEED in [1225, 1337, 2020, 6021991]:
    save_dir = Path("./results/tabular/") / DATA_NAME / "depth={}/reg={}/mlp-layers={}/dropout={}/seed={}".format(
        TREE_DEPTH, REG, MLP_LAYERS, DROPOUT, SEED)
    save_dir.mkdir(parents=True, exist_ok=True)

    deterministic(SEED)

    model = LTRegressor(TREE_DEPTH, in_features, out_features, reg=REG,
                        linear=LINEAR, layers=MLP_LAYERS, dropout=DROPOUT)

    # init optimizer
    optimizer = QHAdam(model.parameters(), lr=LR, nus=(0.7, 1.0), betas=(0.995, 0.998))

    # init loss
    loss = MSELoss(reduction="sum")
    criterion = lambda x, y: loss(x.float(), y.float())

    # init learning rate scheduler
    lr_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=2)

    # init train-eval monitoring
    monitor = MonitorTree(pruning, save_dir)

    state = {
        'batch-size': BATCH_SIZE,
        'loss-function': 'MSE',
        'learning-rate': LR,
from common import *
from model import vocab

option = dict(edim=256,
              epochs=1.5,
              maxgrad=1.,
              learningrate=1e-3,
              sdt_decay_step=1,
              batchsize=8,
              vocabsize=vocab,
              fp16=2,
              saveInterval=10,
              logInterval=.4)
option['loss'] = lambda opt, model, y, out, *_, rewards=[]: F.cross_entropy(
    out.transpose(-1, -2), y, reduction='none')
option['criterion'] = lambda y, out, mask, *_: (out[:, :, 1:vocab].max(-1)[1] + 1).ne(y).float() * mask.float()
option['startEnv'] = lambda x, y, l, *args: (x, y, l, *args)
option['stepEnv'] = lambda i, pred, l, *args: (
    False, 1., None, None)  # done episode, fake reward, Null next input, Null length, Null args
option['cumOut'] = False  # True to keep trajectory
option['devices'] = [0] if torch.cuda.is_available() else []  # list of GPUs
option['init_method'] = 'file:///tmp/sharedfile'  # initial configuration for multiple-GPU training

try:
    from qhoptim.pyt import QHAdam
    option['newOptimizer'] = lambda opt, params, _: QHAdam(
        params, lr=opt.learningrate, nus=(.7, .8), betas=(0.995, 0.999))
except ImportError:
    pass
def fit_model(model, X_train, y_train, batch_size=None, validation_data=tuple(), epochs=50,
              optimizer='Adam', learning_rate=1e-2, device=None, verbose=1):
    # For early stopping
    patience = int(epochs * 0.2)
    min_val_loss = float('inf')
    stop_count = 0
    # For saving tentative best model
    best_model_param = None

    # Use GPU if it is available when "device" is not specified
    if device == None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Set up parallel processing when multiple GPUs are available
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    opt = None
    if optimizer == 'SGD':
        opt = torch.optim.SGD(model.parameters(), lr=learning_rate)
    if optimizer == 'RMSprop':
        opt = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
    if optimizer == 'Adadelta':
        opt = torch.optim.Adadelta(model.parameters(), lr=learning_rate)
    if optimizer == 'Adam':
        opt = torch.optim.Adam(model.parameters(), lr=learning_rate)
    if optimizer == 'QHAdam':
        opt = QHAdam(model.parameters(), lr=learning_rate, nus=(0.7, 1.0), betas=(0.95, 0.998))

    num_unique = np.unique(np.argmax(y_train[:, -1, :], axis=1), return_counts=True)
    num_sample = num_unique[1]
    weight_list = 1 / (num_sample / num_sample.max())
    class_weight = torch.as_tensor(weight_list, device=device)
    #sample_weight = np.array([weight_list[list(num_unique[0]).index(label)] for label in np.argmax(y_train[:,-1,:], axis=1)])

    critertion = (nn.CrossEntropyLoss(weight=class_weight, reduction='none')
                  if class_weight.shape[0] > 2
                  else nn.BCELoss(weight=class_weight, reduction='none'))

    path = r'.\pytorch_check_point'
    os.makedirs(path, exist_ok=True)

    for i in range(epochs):
        # train
        model.train()
        shuffled_idx = random.sample(list(range(len(y_train))), len(y_train))
        train_loss = 0
        for st in list(range(0, len(y_train), (batch_size if batch_size != None else len(y_train)))):
            en = st + (batch_size if batch_size != None else len(y_train))
            if en > len(y_train):
                en = len(y_train)
            target_idx = shuffled_idx[st:en]
            inputs = torch.as_tensor(X_train[target_idx], device=device)
            inputs.requires_grad = True
            target = torch.as_tensor(np.argmax(y_train[target_idx], axis=2), device=device)

            # Predict the train data
            out = model(inputs)

            # Calculate the average loss of all timesteps per sample
            loss = 0
            for j in range(out.shape[0]):
                loss += critertion(torch.log(out[j]), target[j]).mean()  # Softmax is part of the model architecture
            train_loss += loss.item()

            # Get the average loss of one batch
            loss = loss.mean()
            # Initialize gradient
            opt.zero_grad()
            # Calculate gradient
            loss.backward()
            # Update the parameters
            opt.step()
            del loss, inputs, target, out

        # Get the average train loss of all samples
        train_loss /= len(y_train)
        #train_loss += model.train_on_batch(X_train[target_idx], y_train[target_idx],
        #                                   sample_weight=sample_weight[target_idx])

        # test
        if len(validation_data) > 0:
            X_test = validation_data[0]
            y_true = validation_data[1]
            val_loss = test_model(model, X_test, y_true, batch_size=batch_size,
                                  critertion=critertion, device=device)
            #val_loss += model.evaluate(X_test[target_idx], y_true[target_idx], verbose=0)
            if verbose == 1:
                print('epoch{0}:\t train_loss = {1}\t val_loss = {2}'.format(i + 1, train_loss, val_loss))

            # early stop
            if min_val_loss > val_loss:
                min_val_loss = val_loss
                best_model_param = model.state_dict()  # Save tentative best model
                stop_count = 0
            else:
                stop_count += 1
                if stop_count > patience:
                    break
        else:
            if verbose == 1:
                print('epoch{0}:\t train_loss = {1}'.format(i + 1, train_loss))

    del class_weight
    if best_model_param != None:
        model.load_state_dict(best_model_param)
        min_val_loss = val_loss

    return model
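A hypothetical call of the helper above with the QHAdam branch selected. The array names and the (samples, timesteps, classes) shape are assumptions inferred from how fit_model indexes its inputs, not values from the original project.

# Hypothetical usage; X_train/y_train and X_valid/y_valid are numpy arrays
# shaped (samples, timesteps, classes), matching the argmax over axis=2 above.
trained = fit_model(model, X_train, y_train,
                    batch_size=64,
                    validation_data=(X_valid, y_valid),
                    epochs=50,
                    optimizer='QHAdam',
                    learning_rate=1e-3)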
def objective(trial):
    TREE_DEPTH = trial.suggest_int('TREE_DEPTH', 2, 6)
    REG = trial.suggest_loguniform('REG', 1e-3, 1e3)
    print(f'TREE_DEPTH={TREE_DEPTH}, REG={REG}')

    if not LINEAR:
        MLP_LAYERS = trial.suggest_int('MLP_LAYERS', 2, 7)
        DROPOUT = trial.suggest_uniform('DROPOUT', 0.0, 0.5)
        print(f'MLP_LAYERS={MLP_LAYERS}, DROPOUT={DROPOUT}')

    pruning = REG > 0
    if LINEAR:
        save_dir = root_dir / "depth={}/reg={}/seed={}".format(TREE_DEPTH, REG, SEED)
        model = LTBinaryClassifier(TREE_DEPTH, data.X_train.shape[1], reg=REG, linear=LINEAR)
    else:
        save_dir = root_dir / "depth={}/reg={}/mlp-layers={}/dropout={}/seed={}".format(
            TREE_DEPTH, REG, MLP_LAYERS, DROPOUT, SEED)
        model = LTBinaryClassifier(TREE_DEPTH, data.X_train.shape[1], reg=REG, linear=LINEAR,
                                   layers=MLP_LAYERS, dropout=DROPOUT)
    print(model.count_parameters(), "model's parameters")

    save_dir.mkdir(parents=True, exist_ok=True)

    # init optimizer
    optimizer = QHAdam(model.parameters(), lr=LR, nus=(0.7, 1.0), betas=(0.995, 0.998))

    # init learning rate scheduler
    lr_scheduler = ReduceLROnPlateau(optimizer, 'min', factor=0.1, patience=2)

    # init loss
    criterion = BCELoss(reduction="sum")

    # evaluation criterion => error rate
    eval_criterion = lambda x, y: (x.long() != y.long()).sum()

    # init train-eval monitoring
    monitor = MonitorTree(pruning, save_dir)

    state = {
        'batch-size': BATCH_SIZE,
        'loss-function': 'BCE',
        'learning-rate': LR,
        'seed': SEED,
        'dataset': DATA_NAME,
    }

    best_val_loss = float("inf")
    best_e = -1
    no_improv = 0
    for e in range(EPOCHS):
        train_stochastic(trainloader, model, optimizer, criterion, epoch=e, monitor=monitor)

        val_loss = evaluate(valloader, model, {'ER': eval_criterion}, epoch=e, monitor=monitor)

        no_improv += 1
        if val_loss['ER'] < best_val_loss:
            best_val_loss = val_loss['ER']
            best_e = e
            no_improv = 0
            # save_model(model, optimizer, state, save_dir)

        # reduce learning rate if needed
        lr_scheduler.step(val_loss['ER'])

        monitor.write(model, e, train={"lr": optimizer.param_groups[0]['lr']})

        trial.report(val_loss['ER'], e)
        # Handle pruning based on the intermediate value.
        if trial.should_prune() or np.isnan(val_loss['ER']):
            monitor.close()
            raise optuna.TrialPruned()

        if no_improv == 10:
            break

    print("Best validation ER:", best_val_loss)
    monitor.close()

    return best_val_loss
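Since the objective reports intermediate values and raises optuna.TrialPruned, it is meant to be driven by an Optuna study. A minimal sketch of that driver; the pruner choice and trial count are assumptions, not from the original script.

import optuna

study = optuna.create_study(direction='minimize',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)
print("Best trial:", study.best_params, study.best_value)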
def trainingNetwork(images_folder_train, labels_folder_train, images_folder_val, labels_folder_val,
                    dictionary, target_classes, output_classes, save_network_as, classifier_name,
                    epochs, batch_sz, batch_mult, learning_rate, L2_penalty, validation_frequency,
                    loss_to_use, epochs_switch, epochs_transition, tversky_alpha, tversky_gamma,
                    optimiz, flag_shuffle, flag_training_accuracy, experiment_name):

    ##### DATA #####

    # setup the training dataset
    datasetTrain = CoralsDataset(images_folder_train, labels_folder_train, dictionary, target_classes)

    print("Dataset setup..", end='')
    datasetTrain.computeAverage()
    datasetTrain.computeWeights()
    print(datasetTrain.dict_target)
    print(datasetTrain.weights)
    freq = 1.0 / datasetTrain.weights
    print(freq)
    print("done.")

    save_classifier_as = save_network_as.replace(".net", ".json")
    writeClassifierInfo(save_classifier_as, classifier_name, datasetTrain, output_classes)

    datasetTrain.enableAugumentation()

    datasetVal = CoralsDataset(images_folder_val, labels_folder_val, dictionary, target_classes)
    datasetVal.dataset_average = datasetTrain.dataset_average
    datasetVal.weights = datasetTrain.weights

    # AUGMENTATION IS NOT APPLIED ON THE VALIDATION SET
    datasetVal.disableAugumentation()

    # setup the data loaders
    dataloaderTrain = DataLoader(datasetTrain, batch_size=batch_sz, shuffle=flag_shuffle,
                                 num_workers=0, drop_last=True, pin_memory=True)

    validation_batch_size = 4
    dataloaderVal = DataLoader(datasetVal, batch_size=validation_batch_size, shuffle=False,
                               num_workers=0, drop_last=True, pin_memory=True)

    training_images_number = len(datasetTrain.images_names)
    validation_images_number = len(datasetVal.images_names)

    print("NETWORK USED: DEEPLAB V3+")

    if os.path.exists(save_network_as):
        net = DeepLab(backbone='resnet', output_stride=16, num_classes=output_classes)
        net.load_state_dict(torch.load(save_network_as))
        print("Checkpoint loaded.")
    else:
        ###### SETUP THE NETWORK #####
        net = DeepLab(backbone='resnet', output_stride=16, num_classes=output_classes)
        state = torch.load("deeplab-resnet.pth.tar")
        # RE-INITIALIZE THE CLASSIFICATION LAYER WITH THE RIGHT NUMBER OF CLASSES,
        # DON'T LOAD THE WEIGHTS OF THE CLASSIFICATION LAYER
        new_dictionary = state['state_dict']
        del new_dictionary['decoder.last_conv.8.weight']
        del new_dictionary['decoder.last_conv.8.bias']
        net.load_state_dict(state['state_dict'], strict=False)

    # OPTIMIZER
    if optimiz == "SGD":
        optimizer = optim.SGD(net.parameters(), lr=learning_rate, weight_decay=L2_penalty, momentum=0.9)
    elif optimiz == "ADAM":
        optimizer = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=L2_penalty)
    elif optimiz == "QHADAM":
        optimizer = QHAdam(net.parameters(), lr=learning_rate, weight_decay=L2_penalty,
                           nus=(0.7, 1.0), betas=(0.99, 0.999))

    USE_CUDA = torch.cuda.is_available()

    if USE_CUDA:
        device = torch.device("cuda")
        net.to(device)

    ##### TRAINING LOOP #####

    # Writer will output to ./runs/ directory by default
    writer = SummaryWriter(comment=experiment_name)

    reduce_lr_patience = 2
    if loss_to_use == "DICE+BOUNDARY":
        reduce_lr_patience = 200
        print("patience increased !")

    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=reduce_lr_patience, verbose=True)

    best_accuracy = 0.0
    best_jaccard_score = 0.0

    # Crossentropy loss
    weights = datasetTrain.weights
    class_weights = torch.FloatTensor(weights).cuda()
    CEloss = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-1)

    # weights for GENERALIZED DICE LOSS (GDL)
    freq = 1.0 / datasetTrain.weights[1:]
    w = 1.0 / (freq * freq)
    w = w / w.sum() + 0.00001
    w_for_GDL = torch.from_numpy(w)
    w_for_GDL = w_for_GDL.to(device)

    # Focal Tversky loss
    focal_tversky_gamma = torch.tensor(tversky_gamma)
    focal_tversky_gamma = focal_tversky_gamma.to(device)

    tversky_loss_alpha = torch.tensor(tversky_alpha)
    tversky_loss_beta = torch.tensor(1.0 - tversky_alpha)
    tversky_loss_alpha = tversky_loss_alpha.to(device)
    tversky_loss_beta = tversky_loss_beta.to(device)

    print("Training Network")
    for epoch in range(epochs):  # loop over the dataset multiple times

        net.train()
        optimizer.zero_grad()

        writer.add_scalar('LR/train', optimizer.param_groups[0]['lr'], epoch)

        loss_values = []
        for i, minibatch in enumerate(dataloaderTrain):
            # get the inputs
            images_batch = minibatch['image']
            labels_batch = minibatch['labels']

            if USE_CUDA:
                images_batch = images_batch.to(device)
                labels_batch = labels_batch.to(device)

            # forward + loss + backward
            outputs = net(images_batch)
            loss = computeLoss(loss_to_use, CEloss, w_for_GDL, tversky_loss_alpha, tversky_loss_beta,
                               focal_tversky_gamma, epoch, epochs_switch, epochs_transition,
                               labels_batch, outputs)
            loss.backward()

            # TO AVOID MEMORY TROUBLE, UPDATE WEIGHTS EVERY BATCH SIZE X BATCH MULT
            if (i + 1) % batch_mult == 0:
                optimizer.step()
                optimizer.zero_grad()

            print(epoch, i, loss.item())
            loss_values.append(loss.item())

        mean_loss_train = sum(loss_values) / len(loss_values)
        print("Epoch: %d , Mean loss = %f" % (epoch, mean_loss_train))
        writer.add_scalar('Loss/train', mean_loss_train, epoch)

        ### VALIDATION ###
        if epoch > 0 and (epoch + 1) % validation_frequency == 0:

            print("RUNNING VALIDATION.. ", end='')

            metrics_val, mean_loss_val = evaluateNetwork(datasetVal, dataloaderVal, loss_to_use, CEloss,
                                                         w_for_GDL, tversky_loss_alpha, tversky_loss_beta,
                                                         focal_tversky_gamma, epoch, epochs_switch,
                                                         epochs_transition, output_classes, net,
                                                         flag_compute_mIoU=False)
            accuracy = metrics_val['Accuracy']
            jaccard_score = metrics_val['JaccardScore']

            scheduler.step(mean_loss_val)

            accuracy_training = 0.0
            jaccard_training = 0.0
            if flag_training_accuracy is True:
                metrics_train, mean_loss_train = evaluateNetwork(datasetTrain, dataloaderTrain, loss_to_use,
                                                                 CEloss, w_for_GDL, tversky_loss_alpha,
                                                                 tversky_loss_beta, focal_tversky_gamma,
                                                                 epoch, epochs_switch, epochs_transition,
                                                                 output_classes, net, flag_compute_mIoU=False)
                accuracy_training = metrics_train['Accuracy']
                jaccard_training = metrics_train['JaccardScore']

            #writer.add_scalar('Loss/train', mean_loss_train, epoch)
            writer.add_scalar('Loss/validation', mean_loss_val, epoch)
            writer.add_scalar('Accuracy/train', accuracy_training, epoch)
            writer.add_scalar('Accuracy/validation', accuracy, epoch)

            #if jaccard_score > best_jaccard_score:
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_jaccard_score = jaccard_score
                torch.save(net.state_dict(), save_network_as)

                # performance of the best accuracy network on the validation dataset
                metrics_filename = save_network_as[:len(save_network_as) - 4] + "-val-metrics.txt"
                saveMetrics(metrics_val, metrics_filename)

                if flag_training_accuracy is True:
                    metrics_filename = save_network_as[:len(save_network_as) - 4] + "-train-metrics.txt"
                    saveMetrics(metrics_train, metrics_filename)

            print("-> CURRENT BEST ACCURACY ", best_accuracy)

    # main loop ended - reload the saved network and evaluate mIoU
    torch.cuda.empty_cache()
    del net
    net = None

    print("Final evaluation..")
    net = DeepLab(backbone='resnet', output_stride=16, num_classes=datasetTrain.num_classes)
    net.load_state_dict(torch.load(save_network_as))

    metrics_val, mean_loss_val = evaluateNetwork(datasetVal, dataloaderVal, loss_to_use, CEloss,
                                                 w_for_GDL, tversky_loss_alpha, tversky_loss_beta,
                                                 focal_tversky_gamma, epoch, epochs_switch,
                                                 epochs_transition, datasetVal.num_classes, net,
                                                 flag_compute_mIoU=True)

    writer.add_hparams({'LR': learning_rate, 'Decay': L2_penalty, 'Loss': loss_to_use,
                        'Transition': epochs_transition, 'Gamma': tversky_gamma, 'Alpha': tversky_alpha},
                       {'hparam/Accuracy': best_accuracy, 'hparam/mIoU': best_jaccard_score})

    writer.close()

    print("***** TRAINING FINISHED *****")
    print("BEST ACCURACY REACHED ON THE VALIDATION SET: %.3f " % best_accuracy)
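For reference, a minimal standalone setup using the nus/betas combination that recurs throughout these examples; the toy model and learning rate are illustrative, not from any of the projects above.

import torch
import torch.nn as nn
from qhoptim.pyt import QHAdam

model = nn.Linear(10, 1)
optimizer = QHAdam(model.parameters(), lr=1e-3,
                   nus=(0.7, 1.0), betas=(0.995, 0.999))

# standard training step
loss = model(torch.randn(4, 10)).pow(2).mean()
optimizer.zero_grad()
loss.backward()
optimizer.step()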