Example 1
 @classmethod
 def from_name(cls, model_name, override_params=None):
     cls._check_model_name_is_valid(model_name)
     blocks_args, global_params = get_model_params(model_name,
                                                   override_params)
     return cls(blocks_args, global_params)
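
A minimal call-site sketch for the factory above (hypothetical: the owning class is assumed to be an EfficientNet-style model class; only from_name itself comes from the example):

# Hypothetical usage sketch -- the class name and model name are assumptions.
model = EfficientNet.from_name('efficientnet-b0', override_params={'num_classes': 10})
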
Example 2
def train(args, checkpoint, mid_checkpoint_location, final_checkpoint_location, best_checkpoint_location,
          actfun, curr_seed, outfile_path, filename, fieldnames, curr_sample_size, device, num_params,
          curr_k=2, curr_p=1, curr_g=1, perm_method='shuffle'):
    """
    Runs training session for a given randomized model
    :param args: arguments for this job
    :param checkpoint: current checkpoint
    :param checkpoint_location: output directory for checkpoints
    :param actfun: activation function currently being used
    :param curr_seed: seed being used by current job
    :param outfile_path: path to save outputs from training session
    :param fieldnames: column names for output file
    :param device: reference to CUDA device for GPU support
    :param num_params: number of parameters in the network
    :param curr_k: k value for this iteration
    :param curr_p: p value for this iteration
    :param curr_g: g value for this iteration
    :param perm_method: permutation strategy for our network
    :return:
    """

    resnet_ver = args.resnet_ver
    resnet_width = args.resnet_width
    num_epochs = args.num_epochs

    actfuns_1d = ['relu', 'abs', 'swish', 'leaky_relu', 'tanh']
    if actfun in actfuns_1d:
        curr_k = 1
    kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}

    if args.one_shot:
        util.seed_all(curr_seed)
        model_temp, _ = load_model(args.model, args.dataset, actfun, curr_k, curr_p, curr_g, num_params=num_params,
                                   perm_method=perm_method, device=device, resnet_ver=resnet_ver,
                                   resnet_width=resnet_width, verbose=args.verbose)

        util.seed_all(curr_seed)
        dataset_temp = util.load_dataset(
            args,
            args.model,
            args.dataset,
            seed=curr_seed,
            validation=True,
            batch_size=args.batch_size,
            train_sample_size=curr_sample_size,
            kwargs=kwargs)

        curr_hparams = hparams.get_hparams(args.model, args.dataset, actfun, curr_seed,
                                           num_epochs, args.search, args.hp_idx, args.one_shot)
        optimizer = optim.Adam(model_temp.parameters(),
                               betas=(curr_hparams['beta1'], curr_hparams['beta2']),
                               eps=curr_hparams['eps'],
                               weight_decay=curr_hparams['wd']
                               )

        start_time = time.time()
        oneshot_fieldnames = fieldnames if args.search else None
        oneshot_outfile_path = outfile_path if args.search else None
        lr = util.run_lr_finder(
            args,
            model_temp,
            dataset_temp[0],
            optimizer,
            nn.CrossEntropyLoss(),
            val_loader=dataset_temp[3],
            show=False,
            device=device,
            fieldnames=oneshot_fieldnames,
            outfile_path=oneshot_outfile_path,
            hparams=curr_hparams
        )
        curr_hparams = {}
        print("Time to find LR: {}\n LR found: {:3e}".format(time.time() - start_time, lr))

    else:
        curr_hparams = hparams.get_hparams(args.model, args.dataset, actfun, curr_seed,
                                           num_epochs, args.search, args.hp_idx)
        lr = curr_hparams['max_lr']

    criterion = nn.CrossEntropyLoss()
    model, model_params = load_model(args.model, args.dataset, actfun, curr_k, curr_p, curr_g,
                                     num_params=num_params, perm_method=perm_method, device=device,
                                     resnet_ver=resnet_ver, resnet_width=resnet_width, verbose=args.verbose)

    util.seed_all(curr_seed)
    model.apply(util.weights_init)

    util.seed_all(curr_seed)
    dataset = util.load_dataset(
        args,
        args.model,
        args.dataset,
        seed=curr_seed,
        validation=args.validation,
        batch_size=args.batch_size,
        train_sample_size=curr_sample_size,
        kwargs=kwargs)
    loaders = {
        'aug_train': dataset[0],
        'train': dataset[1],
        'aug_eval': dataset[2],
        'eval': dataset[3],
    }
    sample_size = dataset[4]
    batch_size = dataset[5]

    if args.one_shot:
        optimizer = optim.Adam(model_params)
        scheduler = OneCycleLR(optimizer,
                               max_lr=lr,
                               epochs=num_epochs,
                               steps_per_epoch=int(math.floor(sample_size / batch_size)),
                               cycle_momentum=False
                               )
    else:
        optimizer = optim.Adam(model_params,
                               betas=(curr_hparams['beta1'], curr_hparams['beta2']),
                               eps=curr_hparams['eps'],
                               weight_decay=curr_hparams['wd']
                               )
        scheduler = OneCycleLR(optimizer,
                               max_lr=curr_hparams['max_lr'],
                               epochs=num_epochs,
                               steps_per_epoch=int(math.floor(sample_size / batch_size)),
                               pct_start=curr_hparams['cycle_peak'],
                               cycle_momentum=False
                               )

    epoch = 1
    if checkpoint is not None:
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        epoch = checkpoint['epoch']
        model.to(device)
        print("*** LOADED CHECKPOINT ***"
              "\n{}"
              "\nSeed: {}"
              "\nEpoch: {}"
              "\nActfun: {}"
              "\nNum Params: {}"
              "\nSample Size: {}"
              "\np: {}"
              "\nk: {}"
              "\ng: {}"
              "\nperm_method: {}".format(mid_checkpoint_location, checkpoint['curr_seed'],
                                         checkpoint['epoch'], checkpoint['actfun'],
                                         checkpoint['num_params'], checkpoint['sample_size'],
                                         checkpoint['p'], checkpoint['k'], checkpoint['g'],
                                         checkpoint['perm_method']))

    util.print_exp_settings(curr_seed, args.dataset, outfile_path, args.model, actfun,
                            util.get_model_params(model), sample_size, batch_size, model.k, model.p, model.g,
                            perm_method, resnet_ver, resnet_width, args.optim, args.validation, curr_hparams)

    best_val_acc = 0

    if args.mix_pre_apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # ---- Start Training
    while epoch <= num_epochs:

        if args.check_path != '':
            torch.save({'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'curr_seed': curr_seed,
                        'epoch': epoch,
                        'actfun': actfun,
                        'num_params': num_params,
                        'sample_size': sample_size,
                        'p': curr_p, 'k': curr_k, 'g': curr_g,
                        'perm_method': perm_method
                        }, mid_checkpoint_location)

        util.seed_all((curr_seed * args.num_epochs) + epoch)
        start_time = time.time()
        if args.mix_pre:
            scaler = torch.cuda.amp.GradScaler()

        # ---- Training
        model.train()
        total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
        for batch_idx, (x, targetx) in enumerate(loaders['aug_train']):
            x, targetx = x.to(device), targetx.to(device)
            optimizer.zero_grad()
            if args.mix_pre:
                with torch.cuda.amp.autocast():
                    output = model(x)
                    train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                scaler.scale(train_loss).backward()
                scaler.step(optimizer)
                scaler.update()
            elif args.mix_pre_apex:
                output = model(x)
                train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
            else:
                output = model(x)
                train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                train_loss.backward()
                optimizer.step()
            if args.optim == 'onecycle' or args.optim == 'onecycle_sgd':
                scheduler.step()
            _, prediction = torch.max(output.data, 1)
            num_correct += torch.sum(prediction == targetx.data)
            num_total += len(prediction)
        epoch_aug_train_loss = total_train_loss / n
        epoch_aug_train_acc = num_correct * 1.0 / num_total

        alpha_primes = []
        alphas = []
        if model.actfun == 'combinact':
            for i, layer_alpha_primes in enumerate(model.all_alpha_primes):
                curr_alpha_primes = torch.mean(layer_alpha_primes, dim=0)
                curr_alphas = F.softmax(curr_alpha_primes, dim=0).data.tolist()
                curr_alpha_primes = curr_alpha_primes.tolist()
                alpha_primes.append(curr_alpha_primes)
                alphas.append(curr_alphas)

        model.eval()
        with torch.no_grad():
            total_val_loss, n, num_correct, num_total = 0, 0, 0, 0
            for batch_idx, (y, targety) in enumerate(loaders['aug_eval']):
                y, targety = y.to(device), targety.to(device)
                output = model(y)
                val_loss = criterion(output, targety)
                total_val_loss += val_loss
                n += 1
                _, prediction = torch.max(output.data, 1)
                num_correct += torch.sum(prediction == targety.data)
                num_total += len(prediction)
            epoch_aug_val_loss = total_val_loss / n
            epoch_aug_val_acc = num_correct * 1.0 / num_total

            total_val_loss, n, num_correct, num_total = 0, 0, 0, 0
            for batch_idx, (y, targety) in enumerate(loaders['eval']):
                y, targety = y.to(device), targety.to(device)
                output = model(y)
                val_loss = criterion(output, targety)
                total_val_loss += val_loss
                n += 1
                _, prediction = torch.max(output.data, 1)
                num_correct += torch.sum(prediction == targety.data)
                num_total += len(prediction)
            epoch_val_loss = total_val_loss / n
            epoch_val_acc = num_correct * 1.0 / num_total
        lr_curr = 0
        for param_group in optimizer.param_groups:
            lr_curr = param_group['lr']
        print(
            "    Epoch {}: LR {:1.5f} ||| aug_train_acc {:1.4f} | val_acc {:1.4f}, aug {:1.4f} ||| "
            "aug_train_loss {:1.4f} | val_loss {:1.4f}, aug {:1.4f} ||| time = {:1.4f}"
            .format(epoch, lr_curr, epoch_aug_train_acc, epoch_val_acc, epoch_aug_val_acc,
                    epoch_aug_train_loss, epoch_val_loss, epoch_aug_val_loss, (time.time() - start_time)), flush=True
        )

        if args.hp_idx is None:
            hp_idx = -1
        else:
            hp_idx = args.hp_idx

        epoch_train_loss = 0
        epoch_train_acc = 0
        if epoch == num_epochs:
            with torch.no_grad():
                total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
                for batch_idx, (x, targetx) in enumerate(loaders['aug_train']):
                    x, targetx = x.to(device), targetx.to(device)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                    total_train_loss += train_loss
                    n += 1
                    _, prediction = torch.max(output.data, 1)
                    num_correct += torch.sum(prediction == targetx.data)
                    num_total += len(prediction)
                epoch_aug_train_loss = total_train_loss / n
                epoch_aug_train_acc = num_correct * 1.0 / num_total

                total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
                for batch_idx, (x, targetx) in enumerate(loaders['train']):
                    x, targetx = x.to(device), targetx.to(device)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                    total_train_loss += train_loss
                    n += 1
                    _, prediction = torch.max(output.data, 1)
                    num_correct += torch.sum(prediction == targetx.data)
                    num_total += len(prediction)
                epoch_train_loss = total_train_loss / n
                epoch_train_acc = num_correct * 1.0 / num_total

        # Output data to CSV at the end of the epoch
        with open(outfile_path, mode='a') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=fieldnames, lineterminator='\n')
            writer.writerow({'dataset': args.dataset,
                             'seed': curr_seed,
                             'epoch': epoch,
                             'time': (time.time() - start_time),
                             'actfun': model.actfun,
                             'sample_size': sample_size,
                             'model': args.model,
                             'batch_size': batch_size,
                             'alpha_primes': alpha_primes,
                             'alphas': alphas,
                             'num_params': util.get_model_params(model),
                             'var_nparams': args.var_n_params,
                             'var_nsamples': args.var_n_samples,
                             'k': curr_k,
                             'p': curr_p,
                             'g': curr_g,
                             'perm_method': perm_method,
                             'gen_gap': float(epoch_val_loss - epoch_train_loss),
                             'aug_gen_gap': float(epoch_aug_val_loss - epoch_aug_train_loss),
                             'resnet_ver': resnet_ver,
                             'resnet_width': resnet_width,
                             'epoch_train_loss': float(epoch_train_loss),
                             'epoch_train_acc': float(epoch_train_acc),
                             'epoch_aug_train_loss': float(epoch_aug_train_loss),
                             'epoch_aug_train_acc': float(epoch_aug_train_acc),
                             'epoch_val_loss': float(epoch_val_loss),
                             'epoch_val_acc': float(epoch_val_acc),
                             'epoch_aug_val_loss': float(epoch_aug_val_loss),
                             'epoch_aug_val_acc': float(epoch_aug_val_acc),
                             'hp_idx': hp_idx,
                             'curr_lr': lr_curr,
                             'found_lr': lr,
                             'hparams': curr_hparams,
                             'epochs': num_epochs
                             })

        epoch += 1

        if args.optim == 'rmsprop':
            scheduler.step()

        if args.checkpoints:
            if epoch_val_acc > best_val_acc:
                best_val_acc = epoch_val_acc
                torch.save({'state_dict': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                            'scheduler': scheduler.state_dict(),
                            'curr_seed': curr_seed,
                            'epoch': epoch,
                            'actfun': actfun,
                            'num_params': num_params,
                            'sample_size': sample_size,
                            'p': curr_p, 'k': curr_k, 'g': curr_g,
                            'perm_method': perm_method
                            }, best_checkpoint_location)

            torch.save({'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'curr_seed': curr_seed,
                        'epoch': epoch,
                        'actfun': actfun,
                        'num_params': num_params,
                        'sample_size': sample_size,
                        'p': curr_p, 'k': curr_k, 'g': curr_g,
                        'perm_method': perm_method
                        }, final_checkpoint_location)
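
Example 2's mix_pre branch follows the standard torch.cuda.amp recipe: run the forward pass under autocast, then route the loss through a GradScaler for backward and the optimizer step. A minimal self-contained sketch of just that pattern (the model and data below are stand-ins, not from the example):

import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.Linear(10, 2).to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
scaler = torch.cuda.amp.GradScaler(enabled=(device.type == 'cuda'))

x = torch.randn(8, 10, device=device)
target = torch.randint(0, 2, (8,), device=device)

optimizer.zero_grad()
with torch.cuda.amp.autocast(enabled=(device.type == 'cuda')):
    output = model(x)
    loss = criterion(output, target)
scaler.scale(loss).backward()  # scale the loss to avoid fp16 gradient underflow
scaler.step(optimizer)         # unscales gradients, then runs optimizer.step()
scaler.update()                # adapt the scale factor for the next iteration
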
Example 3
def train_network(model):
    n_channels, depth, z_dim, n_hid_first, lam, L = get_model_params(model)
    batch_size, num_epochs, learning_rate = get_training_params(model)

    data = TrainData(batch_size)
    input_var = T.matrix('inputs')

    # Create VAE model
    l_z_mean, l_z_logsigma, l_x_mean_list, l_x_logsigma_list, l_x_list, l_x = \
        build_vae(input_var, n_channels=n_channels, depth=depth, z_dim=z_dim,
                  n_hid_first=n_hid_first, L=L)

    def build_loss(deterministic):
        layer_outputs = nn.layers.get_output([l_z_mean, l_z_logsigma] + l_x_mean_list
                                             + l_x_logsigma_list,
                                             deterministic=deterministic)
        z_mean = layer_outputs[0]
        z_ls = layer_outputs[1]
        x_mean = layer_outputs[2: 2 + L]
        x_logsigma = layer_outputs[2 + L : 2 + 2 * L]

        # Loss function:  - log p(x|z) + KL_div
        kl_div = lam * 0.5 * T.sum(T.exp(2 * z_ls) + T.sqr(z_mean) - 1 - 2 * z_ls)

        logpxz = sum(log_likelihood(input_var.flatten(2), mu, ls)
                     for mu, ls in zip(x_mean, x_logsigma)) / L
        prediction = x_mean[0] if deterministic else T.sum(x_mean, axis=0) / L
        loss = -logpxz + kl_div

        return loss, prediction

    loss, _ = build_loss(deterministic=False)
    test_loss, test_prediction = build_loss(deterministic=True)

    # ADAM updates
    params = nn.layers.get_all_params(l_x, trainable=True)
    updates = nn.updates.adam(loss, params, learning_rate=learning_rate)
    train_fn = theano.function([input_var], loss, updates=updates)
    val_fn = theano.function([input_var], test_loss)
    # compile the prediction function once, outside the training loop
    pred_fn = theano.function([input_var], test_prediction)

    previous_val_err_1 = float('inf')
    previous_val_err_2 = float('inf')
    for epoch in range(num_epochs):
        train_err = 0.0
        epoch_size = 0
        start_time = time.time()
        for i in range(data.train_size):
            batch = data.next_batch()
            this_err = train_fn(batch)
            train_err += this_err
            epoch_size += batch.shape[0]

        print("Epoch {} of {} took {:.3f}s".format(
            epoch + 1, num_epochs, time.time() - start_time))
        print("training loss: {:.6f}".format(train_err / epoch_size))
        val_err = 0.0
        val_size = 0
        test_data = data.validation_data()
        for i in range(data.validation_size):
            err = val_fn(test_data[i])
            val_err += err
            val_size += test_data[i].shape[0]

        print("validation loss: {:.6f}".format(val_err / val_size))

        # early stopping
        if val_err > previous_val_err_1 and val_err > previous_val_err_2:
            break
        else:
            previous_val_err_2 = previous_val_err_1
            previous_val_err_1 = val_err

        # save the parameters so they can be loaded for next time
        np.savez(model_path(model) + str(epoch), *nn.layers.get_all_param_values(l_x))

        # output samples
        samples = data.validation_samples()
        X_pred = pred_fn(samples)
        for i in range(len(samples)):
            print(samples[i] - X_pred[i])
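
Example 3 stops training once the validation error is worse than both of the two previously recorded values. Essentially the same patience-of-two rule in isolation (a sketch, not from the source):

def should_stop(history, patience=2):
    # Stop once the newest value is worse than each of the
    # `patience` values recorded before it.
    if len(history) <= patience:
        return False
    return all(history[-1] > prev for prev in history[-patience - 1:-1])

assert should_stop([3.0, 2.0, 2.5]) is False       # worse than one predecessor, not both
assert should_stop([2.0, 1.5, 1.6, 1.7]) is True   # worse than both of the last two
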
Example 4
def main():
    setup_default_logging()
    args, args_text = _parse_args()

    args.prefetcher = not args.no_prefetcher
    args.distributed = False
    args.device = 'cuda:0'
    args.world_size = 1
    args.rank = 0  # global rank

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    _logger.info('====================\n\n'
                 'Actfun: {}\n'
                 'LR: {}\n'
                 'Epochs: {}\n'
                 'p: {}\n'
                 'k: {}\n'
                 'g: {}\n'
                 'Extra channel multiplier: {}\n'
                 'Weight Init: {}\n'
                 '\n===================='.format(args.actfun, args.lr,
                                                 args.epochs, args.p, args.k,
                                                 args.g,
                                                 args.extra_channel_mult,
                                                 args.weight_init))

    # ================================================================================= Loading models
    pre_model = create_model(
        args.model,
        pretrained=True,
        actfun='swish',
        num_classes=args.num_classes,
        drop_rate=args.drop,
        drop_connect_rate=args.drop_connect,  # DEPRECATED, use drop_path
        drop_path_rate=args.drop_path,
        drop_block_rate=args.drop_block,
        global_pool=args.gp,
        bn_tf=args.bn_tf,
        bn_momentum=args.bn_momentum,
        bn_eps=args.bn_eps,
        scriptable=args.torchscript,
        checkpoint_path=args.initial_checkpoint,
        p=args.p,
        k=args.k,
        g=args.g,
        extra_channel_mult=args.extra_channel_mult,
        weight_init_name=args.weight_init,
        partial_ho_actfun=args.partial_ho_actfun)
    pre_model_layers = list(pre_model.children())
    pre_model = torch.nn.Sequential(*pre_model_layers[:-1])
    pre_model.to(device)

    model = MLP.MLP(actfun=args.actfun,
                    input_dim=1280,
                    output_dim=args.num_classes,
                    k=args.k,
                    p=args.p,
                    g=args.g,
                    num_params=1_000_000,
                    permute_type='shuffle')
    model.to(device)

    # ================================================================================= Loading dataset
    util.seed_all(args.seed)
    if args.data == 'caltech101' and not os.path.exists('caltech101'):
        dir_root = r'101_ObjectCategories'
        dir_new = r'caltech101'
        dir_new_train = os.path.join(dir_new, 'train')
        dir_new_val = os.path.join(dir_new, 'val')
        dir_new_test = os.path.join(dir_new, 'test')
        if not os.path.exists(dir_new):
            os.mkdir(dir_new)
            os.mkdir(dir_new_train)
            os.mkdir(dir_new_val)
            os.mkdir(dir_new_test)

        for dir2 in os.listdir(dir_root):
            if dir2 != 'BACKGROUND_Google':
                curr_path = os.path.join(dir_root, dir2)
                new_path_train = os.path.join(dir_new_train, dir2)
                new_path_val = os.path.join(dir_new_val, dir2)
                new_path_test = os.path.join(dir_new_test, dir2)
                if not os.path.exists(new_path_train):
                    os.mkdir(new_path_train)
                if not os.path.exists(new_path_val):
                    os.mkdir(new_path_val)
                if not os.path.exists(new_path_test):
                    os.mkdir(new_path_test)

                train_upper = int(0.8 * len(os.listdir(curr_path)))
                val_upper = int(0.9 * len(os.listdir(curr_path)))
                curr_files_all = os.listdir(curr_path)
                curr_files_train = curr_files_all[:train_upper]
                curr_files_val = curr_files_all[train_upper:val_upper]
                curr_files_test = curr_files_all[val_upper:]

                for file in curr_files_train:
                    copyfile(os.path.join(curr_path, file),
                             os.path.join(new_path_train, file))
                for file in curr_files_val:
                    copyfile(os.path.join(curr_path, file),
                             os.path.join(new_path_val, file))
                for file in curr_files_test:
                    copyfile(os.path.join(curr_path, file),
                             os.path.join(new_path_test, file))
    time.sleep(5)

    # create the train and eval datasets
    train_dir = os.path.join(args.data, 'train')
    if not os.path.exists(train_dir):
        _logger.error(
            'Training folder does not exist at: {}'.format(train_dir))
        exit(1)
    dataset_train = Dataset(train_dir)

    eval_dir = os.path.join(args.data, 'val')
    if not os.path.isdir(eval_dir):
        eval_dir = os.path.join(args.data, 'validation')
        if not os.path.isdir(eval_dir):
            _logger.error(
                'Validation folder does not exist at: {}'.format(eval_dir))
            exit(1)
    dataset_eval = Dataset(eval_dir)

    # setup augmentation batch splits for contrastive loss or split bn
    num_aug_splits = 0
    if args.aug_splits > 0:
        assert args.aug_splits > 1, 'A split of 1 makes no sense'
        num_aug_splits = args.aug_splits

    # enable split bn (separate bn stats per batch-portion)
    if args.split_bn:
        assert num_aug_splits > 1 or args.resplit
        model = convert_splitbn_model(model, max(num_aug_splits, 2))

    # setup mixup / cutmix
    collate_fn = None
    mixup_fn = None
    mixup_active = args.mixup > 0 or args.cutmix > 0. or args.cutmix_minmax is not None
    if mixup_active:
        mixup_args = dict(mixup_alpha=args.mixup,
                          cutmix_alpha=args.cutmix,
                          cutmix_minmax=args.cutmix_minmax,
                          prob=args.mixup_prob,
                          switch_prob=args.mixup_switch_prob,
                          mode=args.mixup_mode,
                          label_smoothing=args.smoothing,
                          num_classes=args.num_classes)
        if args.prefetcher:
            assert not num_aug_splits  # collate conflict (need to support deinterleaving in collate mixup)
            collate_fn = FastCollateMixup(**mixup_args)
        else:
            mixup_fn = Mixup(**mixup_args)

    # create data loaders w/ augmentation pipeline
    train_interpolation = args.train_interpolation
    data_config = resolve_data_config(vars(args),
                                      model=model,
                                      verbose=args.local_rank == 0)
    if args.no_aug or not train_interpolation:
        train_interpolation = data_config['interpolation']
    loader_train = create_loader(
        dataset_train,
        input_size=data_config['input_size'],
        batch_size=args.batch_size,
        is_training=True,
        use_prefetcher=args.prefetcher,
        no_aug=args.no_aug,
        re_prob=args.reprob,
        re_mode=args.remode,
        re_count=args.recount,
        re_split=args.resplit,
        scale=args.scale,
        ratio=args.ratio,
        hflip=args.hflip,
        vflip=args.vflip,
        color_jitter=args.color_jitter,
        auto_augment=args.aa,
        num_aug_splits=num_aug_splits,
        interpolation=train_interpolation,
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        collate_fn=collate_fn,
        pin_memory=args.pin_mem,
        use_multi_epochs_loader=args.use_multi_epochs_loader)

    loader_eval = create_loader(
        dataset_eval,
        input_size=data_config['input_size'],
        batch_size=args.validation_batch_size_multiplier * args.batch_size,
        is_training=False,
        use_prefetcher=args.prefetcher,
        interpolation=data_config['interpolation'],
        mean=data_config['mean'],
        std=data_config['std'],
        num_workers=args.workers,
        distributed=args.distributed,
        crop_pct=data_config['crop_pct'],
        pin_memory=args.pin_mem,
    )

    # ================================================================================= Optimizer / scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)
    scheduler = OneCycleLR(
        optimizer,
        max_lr=args.lr,
        epochs=args.epochs,
        steps_per_epoch=int(math.floor(len(dataset_train) / args.batch_size)),
        cycle_momentum=False)

    # ================================================================================= Save file / checkpoints
    fieldnames = [
        'dataset', 'seed', 'epoch', 'time', 'actfun', 'model', 'batch_size',
        'alpha_primes', 'alphas', 'num_params', 'k', 'p', 'g', 'perm_method',
        'gen_gap', 'epoch_train_loss', 'epoch_train_acc',
        'epoch_aug_train_loss', 'epoch_aug_train_acc', 'epoch_val_loss',
        'epoch_val_acc', 'curr_lr', 'found_lr', 'epochs'
    ]
    filename = 'out_{}_{}_{}_{}'.format(datetime.date.today(), args.actfun,
                                        args.data, args.seed)
    outfile_path = os.path.join(args.output, filename) + '.csv'
    checkpoint_path = os.path.join(args.check_path, filename) + '.pth'
    if not os.path.exists(outfile_path):
        with open(outfile_path, mode='w') as out_file:
            writer = csv.DictWriter(out_file,
                                    fieldnames=fieldnames,
                                    lineterminator='\n')
            writer.writeheader()

    epoch = 1
    checkpoint = torch.load(checkpoint_path) if os.path.exists(checkpoint_path) else None
    if checkpoint is not None:
        pre_model.load_state_dict(checkpoint['pre_model_state_dict'])
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        epoch = checkpoint['epoch']
        pre_model.to(device)
        model.to(device)
        print("*** LOADED CHECKPOINT ***"
              "\n{}"
              "\nSeed: {}"
              "\nEpoch: {}"
              "\nActfun: {}"
              "\np: {}"
              "\nk: {}"
              "\ng: {}"
              "\nperm_method: {}".format(checkpoint_path,
                                         checkpoint['curr_seed'],
                                         checkpoint['epoch'],
                                         checkpoint['actfun'], checkpoint['p'],
                                         checkpoint['k'], checkpoint['g'],
                                         checkpoint['perm_method']))

    args.mix_pre_apex = False
    if args.control_amp == 'apex':
        args.mix_pre_apex = True
        model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

    # ================================================================================= Training
    while epoch <= args.epochs:

        if args.check_path != '':
            torch.save(
                {
                    'pre_model_state_dict': pre_model.state_dict(),
                    'model_state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict(),
                    'curr_seed': args.seed,
                    'epoch': epoch,
                    'actfun': args.actfun,
                    'p': args.p,
                    'k': args.k,
                    'g': args.g,
                    'perm_method': 'shuffle'
                }, checkpoint_path)

        util.seed_all((args.seed * args.epochs) + epoch)
        start_time = time.time()
        args.mix_pre = False
        if args.control_amp == 'native':
            args.mix_pre = True
            scaler = torch.cuda.amp.GradScaler()

        # ---- Training
        model.train()
        total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
        for batch_idx, (x, targetx) in enumerate(loader_train):
            x, targetx = x.to(device), targetx.to(device)
            optimizer.zero_grad()
            if args.mix_pre:
                with torch.cuda.amp.autocast():
                    with torch.no_grad():
                        x = pre_model(x)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                scaler.scale(train_loss).backward()
                scaler.step(optimizer)
                scaler.update()
            elif args.mix_pre_apex:
                with torch.no_grad():
                    x = pre_model(x)
                output = model(x)
                train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                optimizer.step()
            else:
                with torch.no_grad():
                    x = pre_model(x)
                output = model(x)
                train_loss = criterion(output, targetx)
                total_train_loss += train_loss
                n += 1
                train_loss.backward()
                optimizer.step()
            scheduler.step()
            _, prediction = torch.max(output.data, 1)
            num_correct += torch.sum(prediction == targetx.data)
            num_total += len(prediction)
        epoch_aug_train_loss = total_train_loss / n
        epoch_aug_train_acc = num_correct * 1.0 / num_total

        alpha_primes = []
        alphas = []
        if model.actfun == 'combinact':
            for i, layer_alpha_primes in enumerate(model.all_alpha_primes):
                curr_alpha_primes = torch.mean(layer_alpha_primes, dim=0)
                curr_alphas = F.softmax(curr_alpha_primes, dim=0).data.tolist()
                curr_alpha_primes = curr_alpha_primes.tolist()
                alpha_primes.append(curr_alpha_primes)
                alphas.append(curr_alphas)

        model.eval()
        with torch.no_grad():
            total_val_loss, n, num_correct, num_total = 0, 0, 0, 0
            for batch_idx, (y, targety) in enumerate(loader_eval):
                y, targety = y.to(device), targety.to(device)
                y = pre_model(y)
                output = model(y)
                val_loss = criterion(output, targety)
                total_val_loss += val_loss
                n += 1
                _, prediction = torch.max(output.data, 1)
                num_correct += torch.sum(prediction == targety.data)
                num_total += len(prediction)
            epoch_val_loss = total_val_loss / n
            epoch_val_acc = num_correct * 1.0 / num_total
        lr_curr = 0
        for param_group in optimizer.param_groups:
            lr_curr = param_group['lr']
        print(
            "    Epoch {}: LR {:1.5f} ||| aug_train_acc {:1.4f} | val_acc {:1.4f} ||| "
            "aug_train_loss {:1.4f} | val_loss {:1.4f} ||| time = {:1.4f}".
            format(epoch, lr_curr, epoch_aug_train_acc, epoch_val_acc,
                   epoch_aug_train_loss, epoch_val_loss,
                   (time.time() - start_time)),
            flush=True)

        epoch_train_loss = 0
        epoch_train_acc = 0
        if epoch == args.epochs:
            with torch.no_grad():
                total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
                for batch_idx, (x, targetx) in enumerate(loader_train):
                    x, targetx = x.to(device), targetx.to(device)
                    x = pre_model(x)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                    total_train_loss += train_loss
                    n += 1
                    _, prediction = torch.max(output.data, 1)
                    num_correct += torch.sum(prediction == targetx.data)
                    num_total += len(prediction)
                epoch_aug_train_loss = total_train_loss / n
                epoch_aug_train_acc = num_correct * 1.0 / num_total

                total_train_loss, n, num_correct, num_total = 0, 0, 0, 0
                for batch_idx, (x, targetx) in enumerate(loader_eval):
                    x, targetx = x.to(device), targetx.to(device)
                    x = pre_model(x)
                    output = model(x)
                    train_loss = criterion(output, targetx)
                    total_train_loss += train_loss
                    n += 1
                    _, prediction = torch.max(output.data, 1)
                    num_correct += torch.sum(prediction == targetx.data)
                    num_total += len(prediction)
                epoch_train_loss = total_train_loss / n
                epoch_train_acc = num_correct * 1.0 / num_total

        # Outputting data to CSV at end of epoch
        with open(outfile_path, mode='a') as out_file:
            writer = csv.DictWriter(out_file,
                                    fieldnames=fieldnames,
                                    lineterminator='\n')
            writer.writerow({
                'dataset': args.data,
                'seed': args.seed,
                'epoch': epoch,
                'time': (time.time() - start_time),
                'actfun': model.actfun,
                'model': args.model,
                'batch_size': args.batch_size,
                'alpha_primes': alpha_primes,
                'alphas': alphas,
                'num_params': util.get_model_params(model),
                'k': args.k,
                'p': args.p,
                'g': args.g,
                'perm_method': 'shuffle',
                'gen_gap': float(epoch_val_loss - epoch_train_loss),
                'epoch_train_loss': float(epoch_train_loss),
                'epoch_train_acc': float(epoch_train_acc),
                'epoch_aug_train_loss': float(epoch_aug_train_loss),
                'epoch_aug_train_acc': float(epoch_aug_train_acc),
                'epoch_val_loss': float(epoch_val_loss),
                'epoch_val_acc': float(epoch_val_acc),
                'curr_lr': lr_curr,
                'found_lr': args.lr,
                'epochs': args.epochs
            })

        epoch += 1
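
Example 4 never backpropagates through pre_model: the pretrained backbone only extracts features under torch.no_grad(), and only the small MLP head is optimized. The same transfer-learning pattern in miniature (the backbone and head below are stand-ins):

import torch
import torch.nn as nn

backbone = nn.Sequential(nn.Flatten(), nn.Linear(3 * 8 * 8, 1280))  # stand-in feature extractor
head = nn.Linear(1280, 10)                                          # trainable classifier head
for p in backbone.parameters():
    p.requires_grad = False  # freeze the backbone explicitly
backbone.eval()

optimizer = torch.optim.Adam(head.parameters(), weight_decay=1e-5)
criterion = nn.CrossEntropyLoss()

x = torch.randn(4, 3, 8, 8)
target = torch.randint(0, 10, (4,))
with torch.no_grad():  # features only; no graph is built through the backbone
    feats = backbone(x)
loss = criterion(head(feats), target)
optimizer.zero_grad()
loss.backward()
optimizer.step()
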
Example 5
 def _load_model(self, path):
     self._model = keras.models.load_model(path, compile=False, custom_objects={'Interp': Interp, 'relu6': relu6})
     (self.height, self.width, self.channels), _, self.sine_steering = get_model_params(self._model)
     self._model.summary()
     x = self._model.layers[-7].get_output_at(0)
     self.cut_model = Model(inputs=self._model.layers[-7].get_input_at(0), outputs=x)
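
Example 5 rebuilds a sub-model that ends seven layers from the output so intermediate activations can be read off; get_input_at(0)/get_output_at(0) only matter when layers are shared. A generic sketch of the same cut on a stand-in network (for a freshly loaded, unshared model, .input/.output suffice):

import numpy as np
from tensorflow import keras

# Stand-in network; Example 5 would use the model returned by load_model instead.
full = keras.Sequential([keras.Input(shape=(16,))] +
                        [keras.layers.Dense(16, activation='relu') for _ in range(8)])
cut = keras.Model(inputs=full.input, outputs=full.layers[-7].output)
features = cut(np.zeros((1, 16), dtype='float32'))  # intermediate activations
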
Example 6
 def _load_model(self, path: str):
     # Uncomment for legacy models
     # model: tensorflow.python.keras.Model = get_hegemax_model(1, True)
     # model.load_weights(path)
     # self._model = model
     self._model = keras.models.load_model(path, compile=False, custom_objects={'Interp': Interp, 'relu6': relu6})
     (self.height, self.width, self.channels), self.sequence_length, self.sine_steering = get_model_params(
         self._model)
     print("Image shape: " + str((self.height, self.width, self.channels)) + ", sequence length: " + str(
         self.sequence_length) + ", sine steering? " + str(self.sine_steering))
Example 7
    with open(folder_name + '/params.json', 'w') as outfile:
        json.dump(test_params, outfile)

    save_predictions(folder_name, x, gt, probs, processor)


def save_predictions(path, x, gt, probs, processor):
    np.save(path + '/x', x)
    np.save(path + '/gt', gt)
    np.save(path + '/probs', probs)
    with open(path + '/processor', 'wb') as f:
        pickle.dump(processor, f)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("test_config_file", help="path to config file")
    parser.add_argument(
        'model_paths',
        nargs='+',
        help="path to models")
    parser.add_argument("--split", help="train/val", choices=['train', 'test'],
                        default='test')
    args = parser.parse_args()
    train_params = util.get_model_params(args.model_paths[0])  # FIXME: bug
    test_params = train_params.copy()
    test_new_params = json.load(open(args.test_config_file, 'r'))
    test_params.update(test_new_params)
    if "label_review" in test_new_params["data_path"]:
        assert(args.split == 'test')
    predict(args, train_params, test_params)
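
Example 7 builds its test configuration by copying the training parameters and overlaying the test config on top, so training defaults survive unless explicitly overridden. The pattern in isolation (values are illustrative):

train_params = {'batch_size': 32, 'depth': 5, 'split': 'train'}
test_params = train_params.copy()                      # keep training defaults intact
test_params.update({'split': 'test', 'batch_size': 1})
print(test_params)  # {'batch_size': 1, 'depth': 5, 'split': 'test'}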