def train_model_opt(parameters):
    """Objective function evaluated by the hyper-parameter optimizer.

    Builds a fresh configuration from the command-line arguments, overrides
    the learning rate and batch size with the candidate values, trains a
    VGG19 model and reports the result in a form suitable for minimization.

    Parameters
    ----------
    parameters : sequence
        Candidate hyper-parameters. ``parameters[0]`` is stored as
        ``cfg.TRAIN.LR`` and ``parameters[1]`` is cast to ``int`` and stored
        as ``cfg.TRAIN.BATCH_SIZE``.

    Returns
    -------
    The negated value returned by ``train_model``. The optimizer
    (gp_minimize) minimizes its objective, so the accuracy is negated.
    """
    banner = "Training model with hyper-parameters: {}\n\n\n"
    print(banner.format(parameters))

    # Build the configuration from the command-line arguments.
    cli_args = parse_args()
    config = load_config(cli_args)

    # Seed the run so results are reproducible.
    fix_seed(config.SEED)

    # Inject the candidate hyper-parameters into the configuration.
    config.TRAIN.LR = parameters[0]
    config.TRAIN.BATCH_SIZE = int(parameters[1])

    # Data loaders are built after the batch size override above.
    loaders = prepare_dataloaders(config)
    train_loader, valid_loader = loaders

    # Model architecture.
    model = VGG("VGG19", num_classes_length=7, num_classes_digits=10)

    # Prefer the first CUDA device when one is available.
    if torch.cuda.is_available():
        device_name = "cuda:0"
    else:
        device_name = "cpu"
    device = torch.device(device_name)
    print("Device used: ", device)

    score = train_model(
        model,
        cfg=config,
        train_loader=train_loader,
        valid_loader=valid_loader,
        device=device,
    )
    # Negate: the optimizer minimizes, we want to maximize accuracy.
    return -score
def main():
    """Configure and launch training of the wR2 model.

    Reads ``batchsize``, ``epochs`` and ``dsetconf`` from the module-level
    ``args`` mapping, optionally records the run configuration to Weights &
    Biases when ``USE_WANDB`` is set, builds the model, loss, optimizer,
    learning-rate scheduler and the train/validation data loaders, and then
    calls ``train_model``.

    The model and the loss are moved to the GPU with ``.cuda()``, so a CUDA
    device is required.
    """
    numClasses = 4
    imgSize = (480, 480)
    batchSize = args['batchsize']
    epochs = args['epochs']
    lr = 0.001
    momentum = 0.9

    if USE_WANDB:
        # Record the hyper-parameters and snapshot the source files.
        config = wandb.config
        config.imgSize = imgSize
        config.batchSize = batchSize
        config.epochs = epochs
        config.lr = lr
        config.momentum = momentum
        wandb.save('./*.py')

    model = wR2(numClasses)
    # NOTE: multi-GPU training through torch.nn.DataParallel is deliberately
    # not used here. The original author reported that it hung the node badly
    # enough that the process became unkillable and the node had to be
    # restarted, which also wiped the stopped docker container
    # (https://github.com/pytorch/pytorch/issues/24081#issuecomment-557074611).
    # Disabling IOMMU in the BIOS was not possible on the remote node, so
    # training runs on a single GPU.
    model = model.cuda()

    criterion = nn.MSELoss().cuda()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    # Multiply the learning rate by 0.1 every 5 scheduler steps.
    lrScheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)

    dset_conf = parse_dset_config(args['dsetconf'])

    # Training split (shuffled).
    trainloc = dset_conf['train']
    dst = ChaLocDataLoader(trainloc, imgSize)
    trainloader = DataLoader(dst, batch_size=batchSize, shuffle=True,
                             num_workers=4)

    # Validation split (fixed order).
    valloc = dset_conf['val']
    valdst = ChaLocDataLoader(valloc, imgSize)
    evalloader = DataLoader(valdst, batch_size=batchSize, shuffle=False,
                            num_workers=4)

    print('Starting Training...')
    train_model(model, criterion, optimizer, lrScheduler, trainloader,
                evalloader, batchSize, num_epochs=epochs,
                USE_WANDB=USE_WANDB)
lr=lr, weight_decay=l2, momentum=momentum) scheduler = ReduceLROnPlateau( optimizer, patience=cfg.TRAIN.SCHEDULER_PATIENCE) results = train_model( mdl, optimizer, scheduler, hp_opt, train_loader=train_loader, valid_loader=valid_loader, device=device, output_dir=cfg.OUTPUT_DIR, iteration=iteration, resume=resume, best_final_acc=best_final_acc, num_epochs=cfg.TRAIN.NUM_EPOCHS, lr=lr, l2=l2, momentum=momentum, track_misclassified=cfg.TRAIN.TRACK_MISCLASSIFIED) if resume: resume = False # Update optimizer with best accuracy obtained. hp_opt.tell([lr, l2, momentum], results['best_acc'])
nlayers = 1 bidirectional = False latent_dim = 16 fc_units = 10 ### Step 1: train the predictive model, here a cVAE print('STEP 1') model_dilate = cVAE(input_size, rnn_units, nlayers, bidirectional, latent_dim, fc_units, N_output, device).to(device) train_model(model_dilate, trainloader, testloader, loss_type='dilate', nsamples=10, learning_rate=0.001, device=device, epochs=1, gamma=gamma, alpha=0.5, print_every=50, eval_every=100, verbose=1) #torch.save(model_dilate.state_dict(),'save/model_dilate.pth') ### Step 2: train STRIPE-shape print('STEP 2') nshapes = 10 stripe_shape = STRIPE('shape', nshapes, latent_dim, N_output, rnn_units).to(device) train_STRIPE(cvae=model_dilate,
def fitness(learning_rate, num_dense_layers, dropout, Weigth_Decay):
    '''
    Objective function for the hyper-parameter search: build, train and
    score one model for a given hyper-parameter setting.

    Parameters
    ----------
    learning_rate: float
        The learning rate
    num_dense_layers: int
        Number of fully connected layers
    dropout: float
        Amount of Dropout
    Weigth_Decay: float
        Amount of weight decay

    Returns
    -------
    The negated accuracy returned by ``train_model``, so that a minimizer
    (scikit-optimize) ends up maximizing the accuracy.

    Side effects
    ------------
    Creates the run's output directory, opens a TensorBoard writer, and,
    when the accuracy beats the module-level ``best_accuracy``, saves the
    best model to ``path_best_model`` and updates ``best_accuracy``.
    '''
    # The running best score lives at module level and is updated below.
    global best_accuracy

    # Report the candidate hyper-parameters.
    print("............................")
    print('learning rate: {0:.1e}'.format(learning_rate))
    print('num_dense_layers:', num_dense_layers)
    print('Dropout:', dropout)
    print('Weight Decay:', Weigth_Decay)
    print()

    # Network built from the candidate architecture hyper-parameters.
    net = ConvModel(num_dense_layers=num_dense_layers, dropout=dropout)

    # Each hyper-parameter setting gets its own output directory.
    run_name = log_dir_name(learning_rate, num_dense_layers, dropout,
                            Weigth_Decay)
    output_dir = "/".join((cfg.OUTPUT_DIR, run_name))
    mkdir_p(output_dir)

    # TensorBoard logs go to the same path with "checkpoint" swapped for
    # "logs".
    tb_dir = output_dir.replace("checkpoint", "logs")
    writer = SummaryWriter(tb_dir)

    # Train with the candidate optimization hyper-parameters.
    training_result = train_model(net,
                                  train_loader=train_loader,
                                  valid_loader=valid_loader,
                                  device=device,
                                  writer=writer,
                                  num_epochs=cfg.TRAIN.NUM_EPOCHS,
                                  lr=learning_rate,
                                  weight_decay=Weigth_Decay,
                                  output_dir=output_dir)
    best_model, accuracy = training_result

    # Persist the model only when it beats the best score seen so far.
    improved = accuracy > best_accuracy
    if improved:
        print("Updating best Model")
        torch.save(best_model, path_best_model)
        best_accuracy = accuracy

    # Drop the local reference to this setting's network.
    del net

    # Scikit-optimize minimizes the objective; negating the accuracy turns
    # "highest accuracy" into "lowest fitness value".
    return -accuracy
x0=default_parameters) #Print the result of the hyperparameter search print("Best Accuracy:") print(-search_result.fun) print("Best Parameters:") dim_names = [ 'learning_rate', 'num_dense_layers', 'dropout', 'Weigth_Decay' ] print({ paramname: best_param for paramname, best_param in zip(dim_names, search_result.x) }) else: # Define model architecture model = initialize_model(cfg.CONFIG_NAME) #Create the summaryWriter for Tensorboard writer = SummaryWriter(cfg.OUTPUT_DIR.replace("checkpoint", "logs")) #Train the model train_model(model, train_loader=train_loader, valid_loader=valid_loader, device=device, writer=writer, num_epochs=cfg.TRAIN.NUM_EPOCHS, lr=cfg.TRAIN.NUM_EPOCHS, output_dir=cfg.OUTPUT_DIR)
print(-search_result.fun) print("Best Parameters:") dim_names = [ 'learning_rate', 'num_dense_layers', 'dropout', 'weight_decay' ] print({ paramname: best_param for paramname, best_param in zip(dim_names, search_result.x) }) else: # Define model architecture model = initialize_model(cfg.CONFIG_NAME) # Create the summaryWriter for Tensorboard writer = SummaryWriter(cfg.OUTPUT_DIR) # Train the model train_model( model, train_loader=train_loader, valid_loader=valid_loader, device=device, writer=writer, num_epochs=cfg.TRAIN.NUM_EPOCHS, lr=cfg.TRAIN.LR, output_dir='results/ResNet50_2019_03_13_22_13_10', checkpoint_every=10, load_model_path= 'results/ResNet50_2019_03_13_22_13_10/epoch40_checkpoint.pth')
checkpoint = CheckpointSaver(args.checkpoint_dir) # Load model from checkpoint model, cfg = checkpoint.load(args.checkpoint_name) # Make results reproducible fix_seed(cfg.SEED) else: # Load the config file cfg = load_config(args) # Make results reproducible fix_seed(cfg.SEED) # Define model architecture model = VGG('VGG19', num_classes_length=7, num_classes_digits=10) # Prepare data (train_loader, valid_loader) = prepare_dataloaders(cfg) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") print("Device used: ", device) # Start model training train_model(model, cfg=cfg, train_loader=train_loader, valid_loader=valid_loader, device=device)