Example #1
def train_model_opt(parameters):

    print("Training model with hyper-parameters: {}\n\n\n".format(parameters))

    args = parse_args()

    # Load the config file
    cfg = load_config(args)

    # Make the results reproducible
    fix_seed(cfg.SEED)

    cfg.TRAIN.LR = parameters[0]
    cfg.TRAIN.BATCH_SIZE = int(parameters[1])

    # Preparing data
    (train_loader,
     valid_loader) = prepare_dataloaders(cfg)

    # Define model architecture
    vgg19 = VGG("VGG19", num_classes_length=7, num_classes_digits=10)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Device used: ", device)

    # Return negative accuracy because gp_minimize performs minimization
    return -train_model(vgg19, cfg=cfg, train_loader=train_loader, valid_loader=valid_loader, device=device)
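Because the objective returns negative accuracy, it is meant to be handed to a minimizer. Below is a minimal sketch of how this function could be plugged into scikit-optimize's gp_minimize; the search-space bounds and call budget are assumptions, not taken from the project.

# Sketch only: bounds and n_calls are illustrative assumptions.
from skopt import gp_minimize
from skopt.space import Real, Integer

search_space = [
    Real(1e-5, 1e-1, prior="log-uniform", name="lr"),  # maps to parameters[0]
    Integer(16, 256, name="batch_size"),               # maps to parameters[1]
]

# gp_minimize minimizes its objective, which is why train_model_opt
# returns the negated validation accuracy.
result = gp_minimize(train_model_opt, search_space, n_calls=20, random_state=0)
print("Best (negated) accuracy:", result.fun)
print("Best hyper-parameters:", result.x)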
Example #2
def main():
    numClasses = 4
    imgSize = (480, 480)
    origSize = (720, 1160)
    batchSize = args['batchsize']
    epochs = args['epochs']
    lr = 0.001
    momentum = 0.9

    if USE_WANDB:
        config = wandb.config
        config.imgSize = imgSize
        config.batchSize = batchSize
        config.epochs = epochs
        config.lr = lr
        config.momentum = momentum
        wandb.save('./*.py')

    model = wR2(numClasses)
    # model = torch.nn.DataParallel(model, device_ids=range(torch.cuda.device_count()))
    # DataParallel hangs this node badly: the script process becomes unkillable and the
    # node has to be restarted, which stops the Docker container and wipes its contents
    # (https://github.com/pytorch/pytorch/issues/24081#issuecomment-557074611).
    # IOMMU cannot be disabled in the BIOS on this remote node, so training is limited
    # to a single GPU.
    model = model.cuda()
    criterion = nn.MSELoss().cuda()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    lrScheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
    dset_conf = parse_dset_config(args['dsetconf'])
    # Load the training split
    trainloc = dset_conf['train']
    dst = ChaLocDataLoader(trainloc, imgSize)
    trainloader = DataLoader(dst,
                             batch_size=batchSize,
                             shuffle=True,
                             num_workers=4)
    # Load the validation split
    valloc = dset_conf['val']
    valdst = ChaLocDataLoader(valloc, imgSize)
    evalloader = DataLoader(valdst,
                            batch_size=batchSize,
                            shuffle=False,
                            num_workers=4)
    print('Starting Training...')
    model_conv = train_model(model,
                             criterion,
                             optimizer,
                             lrScheduler,
                             trainloader,
                             evalloader,
                             batchSize,
                             num_epochs=epochs,
                             USE_WANDB=USE_WANDB)
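The wandb.config assignments and wandb.save call above only work after a run has been initialized. A minimal sketch of that missing setup; the project name is a placeholder, not taken from the source.

# Sketch only: the project name is a placeholder assumption.
import wandb

USE_WANDB = True  # in the original script this flag is defined elsewhere

if USE_WANDB:
    wandb.init(project="plate-localization")  # creates the run and exposes wandb.config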
Example #3
                                        lr=lr,
                                        weight_decay=l2,
                                        momentum=momentum)

            scheduler = ReduceLROnPlateau(
                optimizer, patience=cfg.TRAIN.SCHEDULER_PATIENCE)

        results = train_model(
            mdl,
            optimizer,
            scheduler,
            hp_opt,
            train_loader=train_loader,
            valid_loader=valid_loader,
            device=device,
            output_dir=cfg.OUTPUT_DIR,
            iteration=iteration,
            resume=resume,
            best_final_acc=best_final_acc,
            num_epochs=cfg.TRAIN.NUM_EPOCHS,
            lr=lr,
            l2=l2,
            momentum=momentum,
            track_misclassified=cfg.TRAIN.TRACK_MISCLASSIFIED)

        if resume:
            resume = False

        # Report the best accuracy obtained back to the hyper-parameter optimizer.
        hp_opt.tell([lr, l2, momentum], results['best_acc'])
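hp_opt follows scikit-optimize's ask/tell interface and is constructed elsewhere in the project. A minimal sketch of such a loop with illustrative bounds; note that skopt.Optimizer minimizes, so a loss or negated accuracy is usually what gets reported back.

# Sketch only: bounds and the iteration count are illustrative assumptions.
from skopt import Optimizer
from skopt.space import Real

hp_opt = Optimizer([
    Real(1e-5, 1e-1, prior="log-uniform", name="lr"),
    Real(1e-6, 1e-2, prior="log-uniform", name="l2"),
    Real(0.5, 0.99, name="momentum"),
])

for iteration in range(20):
    lr, l2, momentum = hp_opt.ask()             # suggest the next hyper-parameter triple
    best_acc = 0.0                              # placeholder for results['best_acc'] from training
    hp_opt.tell([lr, l2, momentum], -best_acc)  # skopt minimizes, so negate accuracy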
Example #4
nlayers = 1
bidirectional = False
latent_dim = 16
fc_units = 10

### Step 1: train the predictive model, here a cVAE
print('STEP 1')
model_dilate = cVAE(input_size, rnn_units, nlayers, bidirectional, latent_dim,
                    fc_units, N_output, device).to(device)
train_model(model_dilate,
            trainloader,
            testloader,
            loss_type='dilate',
            nsamples=10,
            learning_rate=0.001,
            device=device,
            epochs=1,
            gamma=gamma,
            alpha=0.5,
            print_every=50,
            eval_every=100,
            verbose=1)

#torch.save(model_dilate.state_dict(),'save/model_dilate.pth')
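# Sketch (assumption): if the save above is enabled, the trained cVAE can later be
# restored instead of re-running Step 1, e.g.:
#   model_dilate.load_state_dict(torch.load('save/model_dilate.pth', map_location=device))
#   model_dilate.eval()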

### Step 2: train STRIPE-shape
print('STEP 2')
nshapes = 10
stripe_shape = STRIPE('shape', nshapes, latent_dim, N_output,
                      rnn_units).to(device)
train_STRIPE(cvae=model_dilate,
Example #5
        def fitness(learning_rate, num_dense_layers, dropout, Weigth_Decay):
            '''
            Create and train the model with a specified hyper-parameter setting.
            Used for hyper-parameter optimization.

            Parameters
            ----------
            learning_rate: float
                The learning rate
            num_dense_layers: int
                Number of fully connected layers
            dropout: float
                Dropout probability
            Weigth_Decay: float
                Weight decay (L2 regularization) factor
            '''

            # Print the hyper-parameters.
            print("............................")
            print('learning rate: {0:.1e}'.format(learning_rate))
            print('num_dense_layers:', num_dense_layers)
            print('Dropout:', dropout)
            print('Weight Decay:', Weigth_Decay)
            print()

            # Create the neural network with these hyper-parameters.
            model = ConvModel(num_dense_layers=num_dense_layers,
                              dropout=dropout)

            # Dir-name for the TensorBoard log-files.
            log_dir = log_dir_name(learning_rate, num_dense_layers, dropout,
                                   Weigth_Decay)
            output_dir = cfg.OUTPUT_DIR + "/" + log_dir

            # Create the directory
            mkdir_p(output_dir)

            # Create the SummaryWriter for TensorBoard
            writer = SummaryWriter(output_dir.replace("checkpoint", "logs"))

            # Train the model.
            best_model, accuracy = train_model(model,
                                               train_loader=train_loader,
                                               valid_loader=valid_loader,
                                               device=device,
                                               writer=writer,
                                               num_epochs=cfg.TRAIN.NUM_EPOCHS,
                                               lr=learning_rate,
                                               weight_decay=Weigth_Decay,
                                               output_dir=output_dir)

            # Save the model if it improves on the best-found performance.
            # We use the global keyword so we update the variable outside
            # of this function.

            global best_accuracy

            # If the classification accuracy improves on the best found so far ...

            if accuracy > best_accuracy:

                print("Updating best Model")
                # Save the new model to harddisk.
                torch.save(best_model, path_best_model)

                # Update the best classification accuracy.
                best_accuracy = accuracy

            # Delete the model with these hyper-parameters from memory.
            del model

            # NOTE: Scikit-optimize does minimization so it tries to
            # find a set of hyper-parameters with the LOWEST fitness-value.
            # Because we are interested in the HIGHEST classification
            # accuracy, we need to negate this number so it can be minimized.
            return -accuracy
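The fitness signature above matches scikit-optimize's use_named_args pattern, where the dimension names must equal the function's keyword arguments (including the Weigth_Decay spelling used here). Below is a minimal sketch of the surrounding search; the bounds, priors, defaults, and call budget are assumptions.

# Sketch only: bounds, priors, defaults and the call budget are illustrative assumptions.
from skopt import gp_minimize
from skopt.space import Real, Integer
from skopt.utils import use_named_args

dimensions = [
    Real(1e-6, 1e-2, prior="log-uniform", name="learning_rate"),
    Integer(1, 5, name="num_dense_layers"),
    Real(0.0, 0.9, name="dropout"),
    Real(1e-6, 1e-2, prior="log-uniform", name="Weigth_Decay"),
]
default_parameters = [1e-4, 2, 0.5, 1e-4]

# use_named_args unpacks the flat parameter list gp_minimize passes into
# keyword arguments, so fitness keeps the signature defined above.
objective = use_named_args(dimensions)(fitness)

search_result = gp_minimize(objective,
                            dimensions,
                            n_calls=20,
                            x0=default_parameters)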
Example #6
            x0=default_parameters)

        # Print the result of the hyper-parameter search
        print("Best Accuracy:")
        print(-search_result.fun)
        print("Best Parameters:")
        dim_names = [
            'learning_rate', 'num_dense_layers', 'dropout', 'Weigth_Decay'
        ]
        print({
            paramname: best_param
            for paramname, best_param in zip(dim_names, search_result.x)
        })

    else:
        # Define model architecture
        model = initialize_model(cfg.CONFIG_NAME)

        # Create the SummaryWriter for TensorBoard
        writer = SummaryWriter(cfg.OUTPUT_DIR.replace("checkpoint", "logs"))

        # Train the model
        train_model(model,
                    train_loader=train_loader,
                    valid_loader=valid_loader,
                    device=device,
                    writer=writer,
                    num_epochs=cfg.TRAIN.NUM_EPOCHS,
                    lr=cfg.TRAIN.LR,
                    output_dir=cfg.OUTPUT_DIR)
Example #7
        print(-search_result.fun)
        print("Best Parameters:")
        dim_names = [
            'learning_rate', 'num_dense_layers', 'dropout', 'weight_decay'
        ]
        print({
            paramname: best_param
            for paramname, best_param in zip(dim_names, search_result.x)
        })

    else:
        # Define model architecture
        model = initialize_model(cfg.CONFIG_NAME)

        # Create the summaryWriter for Tensorboard
        writer = SummaryWriter(cfg.OUTPUT_DIR)

        # Train the model
        train_model(
            model,
            train_loader=train_loader,
            valid_loader=valid_loader,
            device=device,
            writer=writer,
            num_epochs=cfg.TRAIN.NUM_EPOCHS,
            lr=cfg.TRAIN.LR,
            output_dir='results/ResNet50_2019_03_13_22_13_10',
            checkpoint_every=10,
            load_model_path='results/ResNet50_2019_03_13_22_13_10/epoch40_checkpoint.pth')
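checkpoint_every and load_model_path are handled inside this project's train_model. A generic sketch of how such a checkpoint file could be written and read back with plain PyTorch; the dictionary layout is an assumption, not the project's actual format.

# Sketch only: the checkpoint layout is an assumption, not the project's actual format.
import torch

def save_checkpoint(model, optimizer, epoch, path):
    torch.save({'epoch': epoch,
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict()}, path)

def load_checkpoint(model, optimizer, path, device):
    ckpt = torch.load(path, map_location=device)
    model.load_state_dict(ckpt['model_state'])
    optimizer.load_state_dict(ckpt['optimizer_state'])
    return ckpt['epoch']  # epoch to resume from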
Example #8
        checkpoint = CheckpointSaver(args.checkpoint_dir)

        # Load model from checkpoint
        model, cfg = checkpoint.load(args.checkpoint_name)

        # Make results reproducible
        fix_seed(cfg.SEED)
    else:

        # Load the config file
        cfg = load_config(args)

        # Make results reproducible
        fix_seed(cfg.SEED)

        # Define model architecture
        model = VGG('VGG19', num_classes_length=7, num_classes_digits=10)

    # Prepare data
    (train_loader, valid_loader) = prepare_dataloaders(cfg)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("Device used: ", device)

    # Start model training
    train_model(model,
                cfg=cfg,
                train_loader=train_loader,
                valid_loader=valid_loader,
                device=device)
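fix_seed is a project helper; below is one common way such a helper is implemented for reproducible PyTorch runs, as a sketch rather than the project's actual version.

# Sketch only: a typical fix_seed implementation, not necessarily the project's.
import random
import numpy as np
import torch

def fix_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Deterministic cuDNN kernels trade speed for reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False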