Example #1
0
    def __init__(self,
                 p_type,
                 algorithms=None,
                 hyperparameters=None,
                 verbose=False,
                 n_cores=mp.cpu_count(),
                 runtime_limit=512,
                 selection_method='min_variance',
                 scalarization='D',
                 error_matrix=None,
                 runtime_matrix=None,
                 stacking_alg='greedy',
                 **stacking_hyperparams):

        # TODO: check if arguments to constructor are valid; set to defaults if not specified
        assert selection_method in {'qr', 'min_variance'}, "The method to select entries to sample must be " \
            "either qr (QR decomposition) or min_variance (minimize variance with time constraints)."

        with open(os.path.join(DEFAULTS, p_type + '.json')) as file:
            defaults = json.load(file)

        # attributes of ML problem
        self.p_type = p_type.lower()
        self.algorithms = algorithms or defaults['algorithms']
        self.hyperparameters = hyperparameters or defaults['hyperparameters']
        self.verbose = verbose

        # computational considerations
        self.n_cores = n_cores
        self.runtime_limit = runtime_limit

        # sample column selection
        self.selection_method = selection_method
        self.scalarization = scalarization

        # error matrix attributes
        # TODO: determine whether to generate new error matrix or use default/subset of default
        self.error_matrix = ERROR_MATRIX if error_matrix is None else error_matrix
        self.runtime_matrix = RUNTIME_MATRIX if runtime_matrix is None else runtime_matrix
        assert util.check_dataframes(self.error_matrix, self.runtime_matrix)
        self.column_headings = np.array(
            [eval(heading) for heading in list(self.error_matrix)])
        self.X, self.Y, _ = linalg.pca(self.error_matrix.values,
                                       rank=min(self.error_matrix.shape) - 1)

        # sampled & fitted models
        self.new_row = np.zeros((1, self.error_matrix.shape[1]))
        self.sampled_indices = set()
        self.sampled_models = [None] * self.error_matrix.shape[1]
        self.fitted_indices = set()
        self.fitted_models = [None] * self.error_matrix.shape[1]

        # ensemble attributes
        self.stacking_alg = stacking_alg
        self.stacking_hyperparams = stacking_hyperparams
        self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                 self.stacking_hyperparams)
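
A minimal construction sketch for the signature above, assuming the enclosing class is the AutoLearner shown in later examples and that the packaged defaults and ERROR_MATRIX/RUNTIME_MATRIX are importable; the module path and values are illustrative, not the project's confirmed API.

from auto_learner import AutoLearner  # hypothetical module path

learner = AutoLearner(
    p_type='classification',           # lower-cased internally
    runtime_limit=256,                 # seconds; the doubling schedule elsewhere favors powers of 2
    selection_method='min_variance',   # or 'qr'
    scalarization='D',                 # or 'A'
    stacking_alg='greedy',
)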
Example #2
0
        def doubling():
            k, t = ranks[0], times[0]
            counter, self.best = 0, 0
            while time.time() - start < self.runtime_limit - t:
                if verbose:
                    print('Fitting with k={}, t={}'.format(k, t))
                t0 = time.time()
                if self.build_ensemble:
                    self.ensemble = Ensemble(self.p_type, self.ensemble_method,
                                             self.stacking_hyperparams)
                else:
                    self.ensemble = Model_collection(self.p_type)
                self._fit(x_tr, y_tr, rank=k, runtime_limit=t)
                if self.build_ensemble:
                    loss = util.error(y_va, self.ensemble.predict(x_va),
                                      self.p_type)

                    # TEMPORARY: Record intermediate results

                    e_hat.append(np.copy(self.new_row))
                    actual_times.append(time.time() - start)
                    sampled.append(self.sampled_indices)
                    ensembles.append(self.ensemble)
                    losses.append(loss)

                    if loss == min(losses):
                        ranks.append(k + 1)
                        self.best = counter
                    else:
                        ranks.append(k)

                    times.append(2 * t)
                    k = ranks[-1]
                    t = times[-1]
                    counter += 1
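
The loop above doubles the runtime budget t each round and bumps the rank k only when that round produced the best validation loss so far. A small, self-contained sketch of just that schedule (no dependency on the class; illustrative only) may make the bookkeeping easier to follow:

# Standalone illustration of the doubling schedule; not part of the original code.
def doubling_schedule(k0, t0, losses):
    k, t, best = k0, t0, 0
    history = []
    for i, loss in enumerate(losses):
        history.append((k, t))
        if loss == min(losses[:i + 1]):   # new best loss -> allow a higher rank next round
            k, best = k + 1, i
        t *= 2                            # budget always doubles
    return history, best

# Rounds 0 and 2 improve the loss, so k grows twice while t doubles every round.
print(doubling_schedule(k0=4, t0=8, losses=[0.30, 0.35, 0.25]))
# ([(4, 8), (5, 16), (5, 32)], 2)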
Example #3
0
    def __init__(self,
                 p_type,
                 algorithms=None,
                 hyperparameters=None,
                 n_cores=None,
                 verbose=False,
                 stacking_alg='Logit',
                 **stacking_hyperparams):

        # check if arguments to constructor are valid; set to defaults if not specified
        default, new = util.check_arguments(p_type, algorithms,
                                            hyperparameters)
        self.p_type = p_type.lower()
        self.algorithms = algorithms
        self.hyperparameters = hyperparameters
        self.n_cores = n_cores
        self.verbose = verbose

        if len(new) > 0:
            # if selected hyperparameters contain model configurations not included in default
            proceed = input(
                "Your selected hyperparameters contain some not included in the default error matrix. \n"
                "Do you want to generate your own error matrix? [yes/no]")
            if proceed == 'yes':
                subprocess.call(['./generate_matrix.sh'])
                # TODO: load newly generated error matrix file
            else:
                return
        else:
            # use default error matrix (or subset of)
            path = pkg_resources.resource_filename(
                __name__, 'defaults/error_matrix.csv')
            default_error_matrix = pd.read_csv(path, index_col=0)
            column_headings = np.array(
                [eval(heading) for heading in list(default_error_matrix)])
            selected_indices = np.array(
                [heading in column_headings for heading in default])
            self.error_matrix = default_error_matrix.values[:,
                                                            selected_indices]
            self.column_headings = sorted(default,
                                          key=lambda d: d['algorithm'])

        self.ensemble = Ensemble(self.p_type, stacking_alg,
                                 **stacking_hyperparams)
        self.optimized_settings = []
        self.new_row = None
Example #4
0
def build_ensemble(model_dims, paths=["models/model-20.pkl", "models_2009/model-20.pkl", "models_2010/model-20.pkl", "models_2011/model-20.pkl"]):
	# build the model ensemble given the model architecture and trained model checkpoints
	models = []
	for path in paths:
		net = RetailModel(model_dims)
		net.load_state_dict(torch.load(path))
		models.append(net)
	ensemble = Ensemble(models)
	return ensemble
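
A hedged usage sketch for build_ensemble(); RetailModel, Ensemble and the checkpoint paths are defined elsewhere in this repo, and the dimensions, input shape, and Ensemble behaving like an nn.Module are all assumptions.

import torch

# Hypothetical call; model_dims and the input shape are illustrative guesses.
ens = build_ensemble(model_dims=[32, 64, 1])
ens.eval()                              # assumes the ensemble is a torch module
with torch.no_grad():
    preds = ens(torch.randn(8, 32))     # batch of 8 feature vectors (assumed shape)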
Example #5
0
def train(CONFIG):

    # create results folder and save config as a txt file
    t = time.strftime('%Y_%m_%d_%H_%M_%S')
    results_dir = os.path.join(CONFIG['SAVE_PATH'], t)
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)
    with open(os.path.join(results_dir, 'Settings.txt'), 'w') as file:
        file.write(json.dumps(CONFIG))

    # create train dataset
    train_dataset = generate_stem_dataset(CONFIG['DATA_PATH'],
                                          CONFIG['INPUT_SIZE'],
                                          CONFIG['DATA_AUGMENTATION'])

    # split train dataset for 5-fold cross-validation
    kf = model_selection.KFold(n_splits=5, shuffle=True)

    for fold_num, (train_index,
                   test_index) in enumerate(kf.split(train_dataset)):

        # create sub_train & sub_test datasets
        train_subset = torch.utils.data.Subset(train_dataset, train_index)
        test_subset = torch.utils.data.Subset(train_dataset, test_index)

        #define dynamic weighted resampler
        train_targets = [item[1] for item in train_subset]
        weighted_sampler = ScheduledWeightedSampler(len(train_subset),
                                                    train_targets, True)

        # create dataloaders
        train_loader = DataLoader(train_subset,
                                  batch_size=CONFIG['BATCH_SIZE'],
                                  sampler=weighted_sampler,
                                  num_workers=CONFIG['NUM_WORKERS'],
                                  drop_last=False)
        test_loader = DataLoader(test_subset,
                                 batch_size=CONFIG['BATCH_SIZE'],
                                 num_workers=CONFIG['NUM_WORKERS'],
                                 shuffle=False)

        # define model
        m1 = AlexNetDR()
        m2 = GoogleNetDR()

        m1 = load_pretrain_param_alexnet(m1)
        m2 = load_pretrain_param_googlenet(m2)

        model = Ensemble(m1, m2)
        model = model.cuda(CONFIG['NUM_GPU'])

        # load pretrained weights
        if CONFIG['PRETRAINED_PATH']:
            checkpoint = torch.load(CONFIG['PRETRAINED_PATH'])
            model.load_state_dict(checkpoint)

        # define loss and optimizer
        if CONFIG['LOSS_FUNC'] == 'CrossEntropyLoss':
            criterion = nn.CrossEntropyLoss()
        elif CONFIG['LOSS_FUNC'] == 'MSELoss':
            criterion = nn.MSELoss()
        else:
            raise NotImplementedError

        if CONFIG['OPTIMIZER'] == 'SGD':
            optimizer = torch.optim.SGD(model.parameters(),
                                        lr=CONFIG['LEARNING_RATE'],
                                        momentum=CONFIG['MOMENTUM'],
                                        nesterov=True,
                                        weight_decay=CONFIG['WEIGHT_DECAY'])
        elif CONFIG['OPTIMIZER'] == 'ADAM':
            optimizer = torch.optim.Adam(model.parameters(),
                                         lr=CONFIG['LEARNING_RATE'],
                                         betas=CONFIG['BETAS'],
                                         eps=CONFIG['EPS'],
                                         weight_decay=CONFIG['WEIGHT_DECAY'])
        else:
            raise NotImplementedError

        # learning rate decay
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, milestones=CONFIG['MILESTONES'], gamma=CONFIG['GAMMA'])
        # train
        max_kappa = 0
        record_epochs, accs, losses, kappa_per_fold = [], [], [], []
        for epoch in range(1, CONFIG['EPOCHS'] + 1):

            # resampling weight update
            if weighted_sampler:
                weighted_sampler.step()

            # learning rate update
            if lr_scheduler:
                lr_scheduler.step()
                if epoch in lr_scheduler.milestones:
                    print_msg('Learning rate decayed to {}'.format(
                        lr_scheduler.get_lr()[0]))

            epoch_loss = 0
            correct = 0
            total = 0
            progress = tqdm(enumerate(train_loader))
            for step, train_data in progress:
                X, y = train_data  # X.dtype is torch.float32, y.dtype is torch.int64
                X, y = X.cuda(CONFIG['NUM_GPU']), y.float().cuda(
                    CONFIG['NUM_GPU'])

                # forward
                y_pred = model(X)

                y_one_hot = torch.zeros(y.shape[0], 5).cuda(CONFIG['NUM_GPU'])
                y_one_hot[range(y_one_hot.shape[0]),
                          y.to(dtype=torch.int64)] = 1

                loss = criterion(y_pred, y_one_hot)

                # backward
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # metrics
                epoch_loss += loss.item()
                total += y.size(0)
                correct += accuracy(torch.argmax(y_pred, dim=1),
                                    y,
                                    method='5_class_vec_output') * y.size(0)
                avg_loss = epoch_loss / (step + 1)
                avg_acc = correct / total

                progress.set_description(
                    'Fold {} Epoch: {}/{}, loss: {:.6f}, acc: {:.4f}'.format(
                        fold_num + 1, epoch, CONFIG['EPOCHS'], avg_loss,
                        avg_acc))

            # save the model, kappa score and confusion matrix
            acc, c_matrix, kappa, all_pred = _eval(model, test_loader, CONFIG)
            print('validation accuracy: {}, kappa: {}'.format(acc, kappa))
            if kappa > max_kappa:
                torch.save(
                    model.state_dict(), results_dir + '/fold' +
                    str(fold_num + 1) + '_best_kappa.pth')
                max_kappa = kappa
                print_msg('Fold {} of 5. Best kappa model saved at {}'.format(
                    fold_num + 1, results_dir))
                print_msg(
                    'Fold ' + str(fold_num + 1) +
                    ' of 5. Confusion matrix with best kappa is:\n', c_matrix)
                #            ks_dataframe = pd.DataFrame({'file_name':[sampler[0] for sampler in test_dataset.samples],
                #                                     'truth':[sampler[1] for sampler in test_dataset.samples],
                #                                     'prediction':list(all_pred),
                #                                     'kappa_score':''})
                #            ks_dataframe.at[0,'kappa_score'] = kappa
                #            ks_dataframe.to_csv(os.path.join(results_dir,'test_kappa_score.csv'),index=False,sep=',')
                np.savetxt(os.path.join(
                    results_dir,
                    'fold' + str(fold_num + 1) + '_confusion_matrix.csv'),
                           np.array(c_matrix),
                           delimiter=',')
                with open(
                        os.path.join(
                            results_dir,
                            'fold' + str(fold_num + 1) + '_kappa_score.txt'),
                        'w') as f:
                    f.write('Best kappa: {}'.format(kappa))

            # record
            record_epochs.append(epoch)
            accs.append(acc)
            losses.append(avg_loss)
        kappa_per_fold.append(max_kappa)
    print('\nBest validation kappa score for fold 1 to 5:\n {}'.format(
        kappa_per_fold))
    return record_epochs, accs, losses
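
train() pulls everything from a single CONFIG dict; the sketch below lists the keys the function reads, with placeholder values (these are illustrative, not the project's defaults).

# Illustrative CONFIG covering the keys train() accesses; values are placeholders.
CONFIG = {
    'SAVE_PATH': './results',
    'DATA_PATH': './data',
    'INPUT_SIZE': 224,
    'DATA_AUGMENTATION': True,
    'BATCH_SIZE': 32,
    'NUM_WORKERS': 4,
    'NUM_GPU': 0,                      # device index passed to .cuda()
    'PRETRAINED_PATH': '',             # falsy value skips checkpoint loading
    'LOSS_FUNC': 'MSELoss',            # or 'CrossEntropyLoss'
    'OPTIMIZER': 'SGD',                # or 'ADAM' (then BETAS and EPS are used)
    'LEARNING_RATE': 1e-3,
    'MOMENTUM': 0.9,
    'BETAS': (0.9, 0.999),
    'EPS': 1e-8,
    'WEIGHT_DECAY': 5e-4,
    'MILESTONES': [30, 60],
    'GAMMA': 0.1,
    'EPOCHS': 90,
}
# record_epochs, accs, losses = train(CONFIG)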
Example #6
0
    def __init__(self,
                 p_type='classification',
                 algorithms=None,
                 hyperparameters=None,
                 verbose=False,
                 n_cores=mp.cpu_count(),
                 runtime_limit=512,
                 dataset_ratio_threshold=100,
                 selection_method='min_variance',
                 scalarization='D',
                 error_matrix=None,
                 runtime_matrix=None,
                 new_row=None,
                 build_ensemble=True,
                 ensemble_method='greedy',
                 runtime_predictor='KNeighborsRegressor',
                 solver='scipy',
                 **stacking_hyperparams):

        # TODO: check if arguments to constructor are valid; set to defaults if not specified
        assert selection_method in {'qr', 'min_variance', 'random'}, "The method to select entries to sample must be " \
            "either qr (QR decomposition), min_variance (minimize variance with time constraints), or random (time-constrained random selection, for testing purpose)."

        with open(os.path.join(DEFAULTS, p_type + '.json')) as file:
            defaults = json.load(file)

        # attributes of ML problem
        self.p_type = p_type.lower()
        self.algorithms = algorithms or defaults['algorithms']
        self.hyperparameters = hyperparameters or defaults['hyperparameters']
        self.verbose = verbose

        # computational considerations
        self.n_cores = n_cores
        self.runtime_limit = runtime_limit

        # sample column selection
        self.selection_method = selection_method
        self.scalarization = scalarization

        # error matrix attributes
        # TODO: determine whether to generate new error matrix or use default/subset of default
        self.error_matrix = util.extract_columns(
            ERROR_MATRIX, self.algorithms,
            self.hyperparameters) if error_matrix is None else error_matrix
        self.runtime_matrix = util.extract_columns(
            RUNTIME_MATRIX, self.algorithms,
            self.hyperparameters) if runtime_matrix is None else runtime_matrix
        assert util.check_dataframes(self.error_matrix, self.runtime_matrix)
        self.column_headings = np.array(
            [eval(heading) for heading in list(self.error_matrix)])
        self.X, self.Y, _ = linalg.pca(self.error_matrix.values,
                                       rank=min(self.error_matrix.shape) - 1)

        # sampled & fitted models
        self.new_row = new_row or np.zeros((1, self.error_matrix.shape[1]))
        self.sampled_indices = set()
        self.sampled_models = [None] * self.error_matrix.shape[1]
        self.fitted_indices = set()
        self.fitted_models = [None] * self.error_matrix.shape[1]

        # ensemble attributes
        self.build_ensemble = build_ensemble
        self.ensemble_method = ensemble_method
        self.stacking_hyperparams = stacking_hyperparams
        if self.build_ensemble:
            self.ensemble = Ensemble(self.p_type, self.ensemble_method,
                                     self.stacking_hyperparams)
        else:
            self.ensemble = Model_collection(self.p_type)

        # runtime predictor
        self.runtime_predictor = runtime_predictor

        self.dataset_ratio_threshold = dataset_ratio_threshold
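
A hedged construction sketch for the variant above with build_ensemble=False, in which sampled models are kept in a Model_collection rather than stacked; the module path and values are hypothetical.

from auto_learner import AutoLearner  # hypothetical module path

learner = AutoLearner(
    p_type='classification',
    runtime_limit=128,
    selection_method='random',          # time-constrained random sampling, for testing
    build_ensemble=False,               # use a Model_collection instead of an Ensemble
    runtime_predictor='KNeighborsRegressor',
)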
Example #7
0
class AutoLearner:
    """An object representing an automatically tuned machine learning model.

    Attributes:
        p_type (str): Problem type. One of {'classification', 'regression'}.
        algorithms (list): A list of algorithm types to be considered, in strings. (e.g. ['KNN', 'lSVM', 'kSVM']).
        hyperparameters (dict): A nested dict of hyperparameters to be considered; see above for example.
        n_cores (int): Maximum number of cores over which to parallelize (None means no limit).
        verbose (bool): Whether or not to generate print statements when a model finishes fitting.
        stacking_alg (str): Algorithm type to use for stacked learner.
        **stacking_hyperparams (dict): Hyperparameter settings of stacked learner.
    """
    def __init__(self,
                 p_type,
                 algorithms=None,
                 hyperparameters=None,
                 n_cores=None,
                 verbose=False,
                 stacking_alg='Logit',
                 **stacking_hyperparams):

        # check if arguments to constructor are valid; set to defaults if not specified
        default, new = util.check_arguments(p_type, algorithms,
                                            hyperparameters)
        self.p_type = p_type.lower()
        self.algorithms = algorithms
        self.hyperparameters = hyperparameters
        self.n_cores = n_cores
        self.verbose = verbose

        if len(new) > 0:
            # if selected hyperparameters contain model configurations not included in default
            proceed = input(
                "Your selected hyperparameters contain some not included in the default error matrix. \n"
                "Do you want to generate your own error matrix? [yes/no]")
            if proceed == 'yes':
                subprocess.call(['./generate_matrix.sh'])
                # TODO: load newly generated error matrix file
            else:
                return
        else:
            # use default error matrix (or subset of)
            path = pkg_resources.resource_filename(
                __name__, 'defaults/error_matrix.csv')
            default_error_matrix = pd.read_csv(path, index_col=0)
            column_headings = np.array(
                [eval(heading) for heading in list(default_error_matrix)])
            selected_indices = np.array(
                [heading in column_headings for heading in default])
            self.error_matrix = default_error_matrix.values[:,
                                                            selected_indices]
            self.column_headings = sorted(default,
                                          key=lambda d: d['algorithm'])

        self.ensemble = Ensemble(self.p_type, stacking_alg,
                                 **stacking_hyperparams)
        self.optimized_settings = []
        self.new_row = None

    def fit(self, x_train, y_train):
        """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the
        new dataset, predict performance on the rest, then perform Bayesian optimization and construct an optimal
        ensemble model.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        print('Data={}'.format(x_train.shape))
        self.new_row = np.zeros((1, self.error_matrix.shape[1]))
        known_indices = linalg.pivot_columns(self.error_matrix)

        print('Sampling {} entries of new row...'.format(len(known_indices)))
        pool1 = mp.Pool(self.n_cores)
        sample_models = [
            Model(self.p_type,
                  self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'],
                  verbose=self.verbose) for i in known_indices
        ]
        sample_model_errors = [
            pool1.apply_async(Model.kfold_fit_validate,
                              args=[m, x_train, y_train, 5])
            for m in sample_models
        ]
        pool1.close()
        pool1.join()
        for i, error in enumerate(sample_model_errors):
            self.new_row[:, known_indices[i]] = error.get()[0].mean()
            # TODO: add predictions to second layer matrix?
        self.new_row = linalg.impute(self.error_matrix, self.new_row,
                                     known_indices)

        # Add new row to error matrix at the end (might be incorrect?)
        # self.error_matrix = np.vstack((self.error_matrix, self.new_row))

        # TODO: Fit ensemble candidates (?)

        if self.verbose:
            print('\nConducting Bayesian optimization...')
        n_models = 3
        pool2 = Pool(self.n_cores)
        bayesian_opt_models = [
            Model(self.p_type,
                  self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'],
                  verbose=self.verbose)
            for i in np.argsort(self.new_row.flatten())[:n_models]
        ]
        optimized_hyperparams = pool2.map(Model.bayesian_optimize,
                                          bayesian_opt_models)
        pool2.close()
        pool2.join()
        for i, params in enumerate(optimized_hyperparams):
            bayesian_opt_models[i].hyperparameters = params
            self.ensemble.add_base_learner(bayesian_opt_models[i])
            self.optimized_settings.append({
                'algorithm':
                bayesian_opt_models[i].algorithm,
                'hyperparameters':
                bayesian_opt_models[i].hyperparameters
            })

        if self.verbose:
            print('\nFitting optimized ensemble...')
        self.ensemble.fit(x_train, y_train)
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')

    def refit(self, x_train, y_train):
        """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and
        stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        assert self.ensemble.fitted, "Cannot refit unless model has been fit."
        self.ensemble.fit(x_train, y_train)

    def predict(self, x_test):
        """Generate predictions on test data.

        Args:
            x_test (np.ndarray): Features of the test dataset.
        """
        return self.ensemble.predict(x_test)
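
A hedged end-to-end sketch of the class above: construct, fit, predict, then refit. The data is synthetic and only demonstrates the call order; the import path is hypothetical.

import numpy as np
from auto_learner import AutoLearner  # hypothetical module path

x_train = np.random.rand(200, 10)
y_train = np.random.randint(0, 2, size=200)
x_test = np.random.rand(50, 10)

learner = AutoLearner('classification', algorithms=['KNN', 'lSVM'], verbose=True)
learner.fit(x_train, y_train)       # sample pivot columns, impute, Bayesian-optimize, fit ensemble
y_pred = learner.predict(x_test)
learner.refit(x_train, y_train)     # retrain base learners without re-selecting them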
Example #8
0
def predict(args):
    import data_loader
    from model import DenseNet103, Ensemble
    import time

    os.makedirs(args.output_path, exist_ok=True)

    # Get image names
    img_files = sorted(
        glob.glob(os.path.join(args.input_image_dir, args.image_glob)))
    # basenames w/o extensions
    img_names = [osp.splitext(osp.basename(x))[0] for x in img_files]

    # Load models and build ensemble
    models = []
    for m in [args.model_weights]:
        mdl = DenseNet103(n_classes=args.n_classes)
        mdl.load_state_dict(torch.load(m, map_location='cpu'))
        if torch.cuda.is_available():
            mdl = mdl.cuda()
            print('Moved model to CUDA compute device.')
        else:
            print('No CUDA device available. Using CPU.')
        mdl.train(False)
        models.append(mdl)

    ens = Ensemble(models)
    print('Models loaded.')

    if args.transform.lower() == 'crop512raw':
        transform_train = data_loader.crop512raw
        transform_test_pre = None
        transform_test_post = data_loader.predcrop512
    elif args.transform.lower() == 'crop512':
        transform_train = data_loader.crop512
        transform_test_pre = data_loader.predcrop512_pre
        transform_test_post = data_loader.predcrop512
    elif args.transform.lower() == 'custom':
        transform_train = build_transform(resize=args.transform_resize,
                                          crop_sz=args.transform_crop_sz,
                                          crop_type='random',
                                          flips=True,
                                          method='scale_center',
                                          to_tensor=True)
        transform_test_pre = build_transform(
            resize=args.transform_resize,
            crop_sz=None,
            crop_type=None,
            flips=False,
            method='scale_center',
            to_tensor=False,
        )
        transform_test_post = build_transform(resize=None,
                                              crop_sz=None,
                                              crop_type=None,
                                              method='scale_center',
                                              flips=False,
                                              to_tensor=True)
    else:
        raise ValueError('`transform` argument `%s` is invalid.' %
                         args.transform)

    pl = data_loader.PredCropLoader(args.input_image_dir,
                                    transform_pre=transform_test_pre,
                                    transform_post=transform_test_post,
                                    dtype='uint16',
                                    img_regex=args.image_glob,
                                    n_windows=4)

    sm = torch.nn.Softmax2d()

    # Segment images
    print('Segmenting...')
    times = []
    for i in range(len(pl)):
        start = time.time()

        samples = pl[i]
        print('Number of samples %d' % len(samples))

        outs = []  # np.ndarrays of softmax probs
        with torch.no_grad():
            for d in samples:
                input_panel = d['image'].unsqueeze(0)  # add empty batch
                print('Input panel size ', input_panel.size())
                if torch.cuda.is_available():
                    input_panel = input_panel.cuda()
                input_panel.requires_grad = False
                output = ens(input_panel)
                probs = sm(output)
                probs = probs.cpu().data.numpy()  # unpack to numpy array
                outs.append(probs)

        # compute a mask for each set of probs
        mask_panels = []
        for sm_panel in outs:
            mask = post_process(sm_panel,
                                min_sm_prob=1. / args.n_classes,
                                sz_min=200,
                                erod=None,
                                dil=None)
            mask_panels.append(mask)

        # reconstruct total mask from masks
        mask_sz = mask_panels[0].shape
        n_per_side = int(np.sqrt(len(mask_panels)))
        total_mask = np.zeros(
            (mask_sz[0] * n_per_side, mask_sz[1] * n_per_side))
        for j in range(len(mask_panels)):
            total_mask[(j // n_per_side) * mask_sz[0]:((j // n_per_side) + 1) *
                       mask_sz[0],
                       (j % n_per_side) * mask_sz[1]:((j % n_per_side) + 1) *
                       mask_sz[1]] = mask_panels[j]

        print('Upsampling size: ', tuple(args.upsamp_sz))
        maskR = imresize(total_mask.astype('uint8'),
                         tuple(args.upsamp_sz),
                         interp='nearest')  # upsample
        maskR = maskR.astype('bool').astype('uint8')
        # save upsampled mask
        imsave(
            os.path.join(args.output_path,
                         img_names[i] + args.mask_suffix + '.png'),
            maskR * 255)
        print('Processed ', img_names[i])
        end = time.time()
        times.append(end - start)

    print('Average image processing time : ', np.mean(times))

    return
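
predict(args) reads its settings from an argparse-style namespace; the sketch below lists the attributes this excerpt accesses directly, with placeholder values (the real CLI flag names and defaults may differ).

import argparse

# Illustrative namespace; values and some field semantics are assumptions.
args = argparse.Namespace(
    input_image_dir='images/',
    image_glob='*.tif',
    output_path='masks/',
    model_weights='weights/densenet103.pth',
    n_classes=2,
    transform='crop512',          # one of 'crop512raw', 'crop512', 'custom'
    transform_resize=None,        # only read when transform == 'custom'
    transform_crop_sz=None,
    upsamp_sz=(2048, 2048),
    mask_suffix='_mask',
)
# predict(args)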
Example #9
0
class AutoLearner:
    """An object representing an automatically tuned machine learning model.

    Attributes:
        p_type (str):                  Problem type. One of {'classification', 'regression'}.
        algorithms (list):             A list of algorithm types to be considered, in strings. (e.g. ['KNN', 'lSVM']).
        hyperparameters (dict):        A nested dict of hyperparameters to be considered; see above for example.
        verbose (bool):                Whether or not to generate print statements when a model finishes fitting.

        n_cores (int):                 Maximum number of cores over which to parallelize (None means no limit).
        runtime_limit(int):            Maximum training time for AutoLearner (powers of 2 preferred).

        selection_method (str):        Method of selecting entries of new row to sample.
        scalarization (str):           Scalarization of objective for min-variance selection. Either 'D' or 'A'.

        error_matrix (DataFrame):      Error matrix to use for imputation; includes index and headers.
        runtime_matrix (DataFrame):    Runtime matrix to use for runtime prediction; includes index and headers.
        column_headings (list):        Column headings of error/runtime matrices; list of dicts.
        X, Y (np.ndarray):             PCA decomposition of error matrix.

        new_row (np.ndarray):          Predicted row of error matrix.
        sampled_indices (set):         Indices of new row that have been sampled.
        sampled_models (list):         List of models that have been sampled (i.e. k-fold fitted).
        fitted_indices (set):          Indices of new row that have been fitted (i.e. included in ensemble)
        fitted_models (list):          List of models that have been fitted.

        stacking_alg (str):            Algorithm type to use for stacked learner.
    """
    def __init__(self,
                 p_type,
                 algorithms=None,
                 hyperparameters=None,
                 verbose=False,
                 n_cores=mp.cpu_count(),
                 runtime_limit=512,
                 selection_method='min_variance',
                 scalarization='D',
                 error_matrix=None,
                 runtime_matrix=None,
                 stacking_alg='greedy',
                 **stacking_hyperparams):

        # TODO: check if arguments to constructor are valid; set to defaults if not specified
        assert selection_method in {'qr', 'min_variance'}, "The method to select entries to sample must be " \
            "either qr (QR decomposition) or min_variance (minimize variance with time constraints)."

        with open(os.path.join(DEFAULTS, p_type + '.json')) as file:
            defaults = json.load(file)

        # attributes of ML problem
        self.p_type = p_type.lower()
        self.algorithms = algorithms or defaults['algorithms']
        self.hyperparameters = hyperparameters or defaults['hyperparameters']
        self.verbose = verbose

        # computational considerations
        self.n_cores = n_cores
        self.runtime_limit = runtime_limit

        # sample column selection
        self.selection_method = selection_method
        self.scalarization = scalarization

        # error matrix attributes
        # TODO: determine whether to generate new error matrix or use default/subset of default
        self.error_matrix = ERROR_MATRIX if error_matrix is None else error_matrix
        self.runtime_matrix = RUNTIME_MATRIX if runtime_matrix is None else runtime_matrix
        assert util.check_dataframes(self.error_matrix, self.runtime_matrix)
        self.column_headings = np.array(
            [eval(heading) for heading in list(self.error_matrix)])
        self.X, self.Y, _ = linalg.pca(self.error_matrix.values,
                                       rank=min(self.error_matrix.shape) - 1)

        # sampled & fitted models
        self.new_row = np.zeros((1, self.error_matrix.shape[1]))
        self.sampled_indices = set()
        self.sampled_models = [None] * self.error_matrix.shape[1]
        self.fitted_indices = set()
        self.fitted_models = [None] * self.error_matrix.shape[1]

        # ensemble attributes
        self.stacking_alg = stacking_alg
        self.stacking_hyperparams = stacking_hyperparams
        self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                 self.stacking_hyperparams)

    def fit(self, x_train, y_train, rank=None, runtime_limit=None):
        """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the
        new dataset, predict performance on the rest, then construct an optimal ensemble model.

        Args:
            x_train (np.ndarray):  Features of the training dataset.
            y_train (np.ndarray):  Labels of the training dataset.
            rank (int):            Rank of error matrix factorization.
            runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
        """
        # set to defaults if not provided
        rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
        runtime_limit = runtime_limit or self.runtime_limit

        if self.verbose:
            print('Fitting AutoLearner with max. runtime {}s'.format(
                runtime_limit))
        t_predicted = convex_opt.predict_runtime(
            x_train.shape, runtime_matrix=self.runtime_matrix)

        t0 = time.time()
        while time.time() - t0 < runtime_limit / 2:
            # set of algorithms that are predicted to run in given budget
            options = np.where(t_predicted <= runtime_limit / 2 -
                               (time.time() - t0))[0]
            # remove algorithms that have been sampled already
            options = list(set(options) - self.sampled_indices)
            if len(options) == 0:
                if len(self.ensemble.candidate_learners) == 0:
                    to_sample = np.argmin(t_predicted)
                else:
                    break
            else:
                to_sample = np.random.choice(options)
            self.sampled_indices.add(to_sample)
            self.sampled_models[to_sample] = Model(
                self.p_type, self.column_headings[to_sample]['algorithm'],
                self.column_headings[to_sample]['hyperparameters'],
                self.verbose, to_sample)
            self.sampled_models[to_sample].kfold_fit_validate(
                x_train, y_train, 5)
            self.ensemble.candidate_learners.append(
                self.sampled_models[to_sample])

        if self.verbose:
            print('\nFitting ensemble of max. size {}...'.format(
                len(self.ensemble.candidate_learners)))
        remaining = runtime_limit - (time.time() - t0)
        self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
        for model in self.ensemble.base_learners:
            assert model.index is not None
            self.fitted_indices.add(model.index)
            self.fitted_models[model.index] = model
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')

    def fit_doubling(self, x_train, y_train, verbose=False):
        """Fit an AutoLearner object, iteratively doubling allowed runtime."""
        t_predicted = convex_opt.predict_runtime(x_train.shape)

        # split data into training and validation sets
        try:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      stratify=y_train,
                                                      random_state=0)
        except ValueError:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=0)

        ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
        t_init = 2**np.floor(
            np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
        t_init = max(1, t_init)
        times = [t_init]
        losses = [1.0]

        e_hat, actual_times, sampled, ensembles = [], [], [], []
        k, t = ranks[0], times[0]

        start = time.time()
        counter, best = 0, 0
        while time.time() - start < self.runtime_limit - t:
            if verbose:
                print('Fitting with k={}, t={}'.format(k, t))
            t0 = time.time()
            self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                     self.stacking_hyperparams)
            self.fit(x_tr, y_tr, rank=k, runtime_limit=t)
            loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)

            # TEMPORARY: Record intermediate results
            e_hat.append(np.copy(self.new_row))
            actual_times.append(time.time() - start)
            sampled.append(self.sampled_indices)
            ensembles.append(self.ensemble)
            losses.append(loss)

            if loss == min(losses):
                ranks.append(k + 1)
                best = counter
            else:
                ranks.append(k)

            times.append(2 * t)
            k = ranks[-1]
            t = times[-1]
            counter += 1

        # after all iterations, restore best model
        self.new_row = e_hat[best]
        self.ensemble = ensembles[best]
        return {
            'ranks': ranks[:-1],
            'runtime_limits': times[:-1],
            'validation_loss': losses,
            'predicted_new_row': e_hat,
            'actual_runtimes': actual_times,
            'sampled_indices': sampled,
            'models': ensembles
        }

    def refit(self, x_train, y_train):
        """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and
        stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        assert self.ensemble.fitted, "Cannot refit unless model has been fit."
        self.ensemble.fit(x_train, y_train)

    def predict(self, x_test):
        """Generate predictions on test data.

        Args:
            x_test (np.ndarray): Features of the test dataset.
        Returns:
            np.ndarray: Predicted labels.
        """
        return self.ensemble.predict(x_test)
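
fit_doubling() both restores the best ensemble onto the learner and returns its intermediate bookkeeping; a hedged sketch of inspecting that return value, assuming a constructed learner plus x_train/y_train/x_test arrays as in the earlier usage sketch:

results = learner.fit_doubling(x_train, y_train, verbose=True)
for k, t, loss in zip(results['ranks'], results['runtime_limits'],
                      results['validation_loss'][1:]):   # losses[0] is the initial 1.0 placeholder
    print('rank={}, budget={}s, validation loss={:.4f}'.format(k, t, loss))
# the best ensemble found is restored onto learner.ensemble before returning
y_pred = learner.ensemble.predict(x_test)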
Example #10
0
    def fit_doubling(self, x_train, y_train, verbose=False):
        """Fit an AutoLearner object, iteratively doubling allowed runtime."""
        t_predicted = convex_opt.predict_runtime(x_train.shape)

        # split data into training and validation sets
        try:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      stratify=y_train,
                                                      random_state=0)
        except ValueError:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=0)

        ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
        t_init = 2**np.floor(
            np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
        t_init = max(1, t_init)
        times = [t_init]
        losses = [1.0]

        e_hat, actual_times, sampled, ensembles = [], [], [], []
        k, t = ranks[0], times[0]

        start = time.time()
        counter, best = 0, 0
        while time.time() - start < self.runtime_limit - t:
            if verbose:
                print('Fitting with k={}, t={}'.format(k, t))
            t0 = time.time()
            self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                     self.stacking_hyperparams)
            self.fit(x_tr, y_tr, rank=k, runtime_limit=t)
            loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)

            # TEMPORARY: Record intermediate results
            e_hat.append(np.copy(self.new_row))
            actual_times.append(time.time() - start)
            sampled.append(self.sampled_indices)
            ensembles.append(self.ensemble)
            losses.append(loss)

            if loss == min(losses):
                ranks.append(k + 1)
                best = counter
            else:
                ranks.append(k)

            times.append(2 * t)
            k = ranks[-1]
            t = times[-1]
            counter += 1

        # after all iterations, restore best model
        self.new_row = e_hat[best]
        self.ensemble = ensembles[best]
        return {
            'ranks': ranks[:-1],
            'runtime_limits': times[:-1],
            'validation_loss': losses,
            'predicted_new_row': e_hat,
            'actual_runtimes': actual_times,
            'sampled_indices': sampled,
            'models': ensembles
        }
Example #11
0
def multiple_models(data_hhb, data_hbo2, data_na, data_ca, Ytrain, Xtest):
    print('Fit the models for the ensemble')
    model1 = Ensemble()
    model2 = Ensemble()
    model3 = Ensemble()
    model4 = Ensemble()
    model1.fit(data_hhb, Ytrain.hhb)
    model2.fit(data_hbo2, Ytrain.hbo2)
    model3.fit(data_na, Ytrain.na)
    model4.fit(data_ca, Ytrain.ca)

    print('Predict the target values')
    tdata = DataSet(Xtest)
    tdata_hhb, tdata_hbo2, tdata_na, tdata_ca, _ = tdata.data_proccessing(
        False)
    hhb = model1.predict(tdata_hhb)
    hbo2 = model2.predict(tdata_hbo2)
    na = model3.predict(tdata_na)
    ca = model4.predict(tdata_ca)
    na = np.exp(na - 1.5) - 2

    return np.concatenate([
        hhb.reshape((-1, 1)),
        hbo2.reshape((-1, 1)),
        ca.reshape((-1, 1)),
        na.reshape((-1, 1))
    ],
                          axis=1)
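
A hedged call sketch for multiple_models(); DataSet, Ensemble and the Ytrain columns (hhb, hbo2, na, ca) are repo-specific, and passing True to data_proccessing for the training split is an assumption.

# Hypothetical usage; Xtrain, Xtest and Ytrain come from this repo's data pipeline.
train_data = DataSet(Xtrain)
data_hhb, data_hbo2, data_na, data_ca, _ = train_data.data_proccessing(True)
predictions = multiple_models(data_hhb, data_hbo2, data_na, data_ca, Ytrain, Xtest)
# one column per target, concatenated in the order hhb, hbo2, ca, na
print(predictions.shape)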
    Classifier(processors2[0], head2),
    Classifier(processors2[1], frozenhead2),
    Classifier(processors2[2], frozenhead2),
    Classifier(processors2[3], frozenhead2)
]

# testing sets
loader = torch.utils.data.DataLoader(test_set, batch_size=2**31, shuffle=False)
for test_images1, test_labels1, _ in load(loader, digits1):
    pass
loader = torch.utils.data.DataLoader(test_set, batch_size=2**31, shuffle=False)
for test_images2, test_labels2, _ in load(loader, digits2):
    pass

# those two ensembles are utilized for predictions, not training
ensemble1 = Ensemble(processors1, frozenhead1)
ensemble2 = Ensemble(processors2, frozenhead2)

# training
loss_func = nn.BCEWithLogitsLoss()
rounds = []
for digits, classifiers in [(digits1, classifiers1), (digits2, classifiers2)]:
    optimizers = [
        torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
        for model in classifiers
    ]
    for epoch in tqdm(range(3), desc="digits %s" % digits, total=3):
        loader = torch.utils.data.DataLoader(train_set, batch_size=32, shuffle=True)
        for images, _, y_true in load(loader, digits):
            for optimizer, model in zip(optimizers, classifiers):
                y_pred = model(images)
def main(args: argparse.Namespace):
    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True

    cudnn.benchmark = True

    # Data loading code
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_transform = transforms.Compose([
        ResizeImage(256),
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(), normalize
    ])
    val_tranform = transforms.Compose([
        ResizeImage(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(), normalize
    ])

    a, b, c = args.n_share, args.n_source_private, args.n_total
    common_classes = [i for i in range(a)]
    source_private_classes = [i + a for i in range(b)]
    target_private_classes = [i + a + b for i in range(c - a - b)]
    source_classes = common_classes + source_private_classes
    target_classes = common_classes + target_private_classes

    dataset = datasets.Office31
    train_source_dataset = dataset(root=args.root,
                                   data_list_file=args.source,
                                   filter_class=source_classes,
                                   transform=train_transform)
    train_source_loader = DataLoader(train_source_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     drop_last=True)
    train_target_dataset = dataset(root=args.root,
                                   data_list_file=args.target,
                                   filter_class=target_classes,
                                   transform=train_transform)
    train_target_loader = DataLoader(train_target_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=True,
                                     num_workers=args.workers,
                                     drop_last=True)
    val_dataset = dataset(root=args.root,
                          data_list_file=args.target,
                          filter_class=target_classes,
                          transform=val_tranform)
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.workers)

    test_loader = val_loader

    train_source_iter = ForeverDataIterator(train_source_loader)
    train_target_iter = ForeverDataIterator(train_target_loader)
    esem_iter1, esem_iter2, esem_iter3, esem_iter4, esem_iter5 = esem_dataloader(
        args, source_classes)

    # create model
    backbone = resnet50(pretrained=True)
    classifier = ImageClassifier(backbone,
                                 train_source_dataset.num_classes).to(device)
    domain_discri = DomainDiscriminator(in_feature=classifier.features_dim,
                                        hidden_size=1024).to(device)
    esem = Ensemble(classifier.features_dim,
                    train_source_dataset.num_classes).to(device)

    # define optimizer and lr scheduler
    optimizer = SGD(classifier.get_parameters() +
                    domain_discri.get_parameters(),
                    args.lr,
                    momentum=args.momentum,
                    weight_decay=args.weight_decay,
                    nesterov=True)
    lr_scheduler = StepwiseLR(optimizer,
                              init_lr=args.lr,
                              gamma=0.001,
                              decay_rate=0.75)

    optimizer_esem = SGD(esem.parameters(),
                         args.lr,
                         momentum=args.momentum,
                         weight_decay=args.weight_decay,
                         nesterov=True)
    lr_scheduler1 = StepwiseLR(optimizer_esem,
                               init_lr=args.lr,
                               gamma=0.001,
                               decay_rate=0.75)
    lr_scheduler2 = StepwiseLR(optimizer_esem,
                               init_lr=args.lr,
                               gamma=0.001,
                               decay_rate=0.75)
    lr_scheduler3 = StepwiseLR(optimizer_esem,
                               init_lr=args.lr,
                               gamma=0.001,
                               decay_rate=0.75)
    lr_scheduler4 = StepwiseLR(optimizer_esem,
                               init_lr=args.lr,
                               gamma=0.001,
                               decay_rate=0.75)
    lr_scheduler5 = StepwiseLR(optimizer_esem,
                               init_lr=args.lr,
                               gamma=0.001,
                               decay_rate=0.75)

    optimizer_pre = SGD(esem.get_parameters() + classifier.get_parameters(),
                        args.lr,
                        momentum=args.momentum,
                        weight_decay=args.weight_decay,
                        nesterov=True)

    # define loss function
    domain_adv = DomainAdversarialLoss(domain_discri,
                                       reduction='none').to(device)

    pretrain(esem_iter1, esem_iter2, esem_iter3, esem_iter4, esem_iter5,
             classifier, esem, optimizer_pre, args)

    # start training
    best_acc1 = 0.
    for epoch in range(args.epochs):
        # train for one epoch

        train_esem(esem_iter1,
                   classifier,
                   esem,
                   optimizer_esem,
                   lr_scheduler1,
                   epoch,
                   args,
                   index=1)
        train_esem(esem_iter2,
                   classifier,
                   esem,
                   optimizer_esem,
                   lr_scheduler2,
                   epoch,
                   args,
                   index=2)
        train_esem(esem_iter3,
                   classifier,
                   esem,
                   optimizer_esem,
                   lr_scheduler3,
                   epoch,
                   args,
                   index=3)
        train_esem(esem_iter4,
                   classifier,
                   esem,
                   optimizer_esem,
                   lr_scheduler4,
                   epoch,
                   args,
                   index=4)
        train_esem(esem_iter5,
                   classifier,
                   esem,
                   optimizer_esem,
                   lr_scheduler5,
                   epoch,
                   args,
                   index=5)

        source_class_weight = evaluate_source_common(val_loader, classifier,
                                                     esem, source_classes,
                                                     args)

        train(train_source_iter, train_target_iter, classifier, domain_adv,
              esem, optimizer, lr_scheduler, epoch, source_class_weight, args)

        # evaluate on validation set
        acc1 = validate(val_loader, classifier, esem, source_classes, args)

        # remember best acc@1 and save checkpoint
        if acc1 > best_acc1:
            best_model = copy.deepcopy(classifier.state_dict())
        best_acc1 = max(acc1, best_acc1)

    print("best_acc1 = {:3.3f}".format(best_acc1))

    # evaluate on test set
    classifier.load_state_dict(best_model)
    acc1 = validate(test_loader, classifier, esem, source_classes, args)
    print("test_acc1 = {:3.3f}".format(acc1))
Example #14
0
class AutoLearner:
    """An object representing an automatically tuned machine learning model.

    Attributes:
        p_type (str):                  Problem type. One of {'classification', 'regression'}.
        algorithms (list):             A list of algorithm types to be considered, in strings. (e.g. ['KNN', 'lSVM']).
        hyperparameters (dict):        A nested dict of hyperparameters to be considered; see above for example.
        verbose (bool):                Whether or not to generate print statements when a model finishes fitting.

        n_cores (int):                 Maximum number of cores over which to parallelize (None means no limit).
        runtime_limit(int):            Maximum training time for AutoLearner (powers of 2 preferred).

        selection_method (str):        Method of selecting entries of new row to sample.
        scalarization (str):           Scalarization of objective for min-variance selection. Either 'D' or 'A'.

        error_matrix (DataFrame):      Error matrix to use for imputation; includes index and headers.
        runtime_matrix (DataFrame):    Runtime matrix to use for runtime prediction; includes index and headers.
        column_headings (list):        Column headings of error/runtime matrices; list of dicts.
        X, Y (np.ndarray):             PCA decomposition of error matrix.

        new_row (np.ndarray):          Predicted row of error matrix.
        sampled_indices (set):         Indices of new row that have been sampled.
        sampled_models (list):         List of models that have been sampled (i.e. k-fold fitted).
        fitted_indices (set):          Indices of new row that have been fitted (i.e. included in ensemble)
        fitted_models (list):          List of models that have been fitted.

        stacking_alg (str):            Algorithm type to use for stacked learner.
    """
    def __init__(self,
                 p_type,
                 algorithms=None,
                 hyperparameters=None,
                 verbose=False,
                 n_cores=mp.cpu_count(),
                 runtime_limit=512,
                 selection_method='min_variance',
                 scalarization='D',
                 error_matrix=None,
                 runtime_matrix=None,
                 stacking_alg='greedy',
                 **stacking_hyperparams):

        # TODO: check if arguments to constructor are valid; set to defaults if not specified
        assert selection_method in {'qr', 'min_variance'}, "The method to select entries to sample must be " \
            "either qr (QR decomposition) or min_variance (minimize variance with time constraints)."

        with open(os.path.join(DEFAULTS, p_type + '.json')) as file:
            defaults = json.load(file)

        # attributes of ML problem
        self.p_type = p_type.lower()
        self.algorithms = algorithms or defaults['algorithms']
        self.hyperparameters = hyperparameters or defaults['hyperparameters']
        self.verbose = verbose

        # computational considerations
        self.n_cores = n_cores
        self.runtime_limit = runtime_limit

        # sample column selection
        self.selection_method = selection_method
        self.scalarization = scalarization

        # error matrix attributes
        # TODO: determine whether to generate new error matrix or use default/subset of default
        self.error_matrix = ERROR_MATRIX if error_matrix is None else error_matrix
        self.runtime_matrix = RUNTIME_MATRIX if runtime_matrix is None else runtime_matrix
        assert util.check_dataframes(self.error_matrix, self.runtime_matrix)
        self.column_headings = np.array(
            [eval(heading) for heading in list(self.error_matrix)])
        self.X, self.Y, _ = linalg.pca(self.error_matrix.values,
                                       rank=min(self.error_matrix.shape) - 1)

        # sampled & fitted models
        self.new_row = np.zeros((1, self.error_matrix.shape[1]))
        self.sampled_indices = set()
        self.sampled_models = [None] * self.error_matrix.shape[1]
        self.fitted_indices = set()
        self.fitted_models = [None] * self.error_matrix.shape[1]

        # ensemble attributes
        self.stacking_alg = stacking_alg
        self.stacking_hyperparams = stacking_hyperparams
        self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                 self.stacking_hyperparams)

    def fit(self, x_train, y_train, rank=None, runtime_limit=None):
        """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the
        new dataset, predict performance on the rest, then construct an optimal ensemble model.

        Args:
            x_train (np.ndarray):  Features of the training dataset.
            y_train (np.ndarray):  Labels of the training dataset.
            rank (int):            Rank of error matrix factorization.
            runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
        """
        # set to defaults if not provided
        rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
        runtime_limit = runtime_limit or self.runtime_limit

        if self.verbose:
            print('Fitting AutoLearner with max. runtime {}s'.format(
                runtime_limit))
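        # predict each model's runtime on the new dataset from the dataset's dimensions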
        t_predicted = convex_opt.predict_runtime(
            x_train.shape, runtime_matrix=self.runtime_matrix)

        if self.selection_method == 'qr':
            to_sample = linalg.pivot_columns(self.error_matrix)
        elif self.selection_method == 'min_variance':
            # select algorithms to sample only from subset of algorithms that will run in allocated time
            valid = np.where(
                t_predicted <= self.n_cores * runtime_limit / 2)[0]
            Y = self.Y[:rank, valid]
            # TODO: check if Y is rank-deficient, i.e. will ED problem fail?
            v_opt = convex_opt.solve(t_predicted[valid], runtime_limit / 4,
                                     self.n_cores, Y, self.scalarization)
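            # v_opt is the relaxed solution of the experiment design (ED) problem;
            # columns whose entries exceed 0.9 are selected for sampling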
            to_sample = valid[np.where(v_opt > 0.9)[0]]
            if np.isnan(to_sample).any():
                to_sample = np.argsort(t_predicted)[:rank]
        else:
            to_sample = np.arange(0, self.new_row.shape[1])

        if len(to_sample) == 0 and len(self.sampled_indices) == 0:
            # if no columns are selected in first iteration (log det instability), sample n fastest columns
            n = len(
                np.where(np.cumsum(np.sort(t_predicted)) <= runtime_limit)[0])
            to_sample = np.argsort(t_predicted)[:n]

        # only need to compute column entry if it has not been computed already
        to_sample = list(set(to_sample) - self.sampled_indices)
        if self.verbose:
            print('Sampling {} entries of new row...'.format(len(to_sample)))
        start = time.time()
        p1 = mp.Pool(self.n_cores)
        sample_models = [
            Model(self.p_type, self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'], self.verbose, i)
            for i in to_sample
        ]
        sample_model_errors = [
            p1.apply_async(Model.kfold_fit_validate,
                           args=[m, x_train, y_train, 5])
            for m in sample_models
        ]
        p1.close()
        p1.join()

        # update sampled indices
        self.sampled_indices = self.sampled_indices.union(set(to_sample))
        for i, error in enumerate(sample_model_errors):
            cv_error, cv_predictions = error.get()
            sample_models[i].cv_error = cv_error.mean()
            sample_models[i].cv_predictions = cv_predictions
            sample_models[i].sampled = True
            self.new_row[:, to_sample[i]] = cv_error.mean()
            self.sampled_models[to_sample[i]] = sample_models[i]
        imputed = linalg.impute(self.error_matrix,
                                self.new_row,
                                list(self.sampled_indices),
                                rank=rank)

        # impute ALL entries
        # unknown = sorted(list(set(range(self.new_row.shape[1])) - self.sampled_indices))
        # self.new_row[:, unknown] = imputed[:, unknown]
        self.new_row = imputed

        # k-fold fit candidate learners of ensemble
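        # remaining budget is measured in aggregate core-seconds across all available cores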
        remaining = (runtime_limit - (time.time() - start)) * self.n_cores
        # add best sampled model to list of candidate learners to avoid empty lists
        sampled_list = list(self.sampled_indices)
        best_sampled_idx = sampled_list[
            int(np.argmin(self.new_row[:, sampled_list]))]
        assert self.sampled_models[best_sampled_idx] is not None
        candidate_indices = [best_sampled_idx]
        self.ensemble.candidate_learners.append(
            self.sampled_models[best_sampled_idx])
        for i in np.argsort(self.new_row[0]):
            if t_predicted[i] + t_predicted[candidate_indices].sum() <= remaining:
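                # keep the best sampled model at the end of the list so it is always
                # a candidate and always counted against the remaining runtime budget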
                last = candidate_indices.pop()
                assert last == best_sampled_idx
                candidate_indices.append(i)
                candidate_indices.append(last)
                # if model has already been k-fold fitted, immediately add to candidate learners
                if i in self.sampled_indices:
                    assert self.sampled_models[i] is not None
                    self.ensemble.candidate_learners.append(
                        self.sampled_models[i])
        # candidate learners that need to be k-fold fitted
        to_fit = list(set(candidate_indices) - self.sampled_indices)
        p2 = mp.Pool(self.n_cores)
        candidate_models = [
            Model(self.p_type, self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'], self.verbose, i)
            for i in to_fit
        ]
        candidate_model_errors = [
            p2.apply_async(Model.kfold_fit_validate,
                           args=[m, x_train, y_train, 5])
            for m in candidate_models
        ]
        p2.close()
        p2.join()

        # update sampled indices
        self.sampled_indices = self.sampled_indices.union(set(to_fit))
        for i, error in enumerate(candidate_model_errors):
            cv_error, cv_predictions = error.get()
            candidate_models[i].cv_error = cv_error.mean()
            candidate_models[i].cv_predictions = cv_predictions
            candidate_models[i].sampled = True
            self.new_row[:, to_fit[i]] = cv_error.mean()
            self.sampled_models[to_fit[i]] = candidate_models[i]
            self.ensemble.candidate_learners.append(candidate_models[i])
        # self.new_row = linalg.impute(self.error_matrix, self.new_row, list(self.sampled_indices), rank=rank)

        if self.verbose:
            print('\nFitting ensemble of max. size {}...'.format(
                len(self.ensemble.candidate_learners)))
        self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
        for model in self.ensemble.base_learners:
            assert model.index is not None
            self.fitted_indices.add(model.index)
            self.fitted_models[model.index] = model
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')

    def fit_doubling(self, x_train, y_train, verbose=False):
        """Fit an AutoLearner object, iteratively doubling allowed runtime."""
        t_predicted = convex_opt.predict_runtime(x_train.shape)

        # split data into training and validation sets
        try:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      stratify=y_train,
                                                      random_state=0)
        except ValueError:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=0)

        ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
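        # initial runtime budget: largest power of two no greater than the predicted
        # time to run the 1.1*rank fastest models (but at least 1 second)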
        t_init = 2**np.floor(
            np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
        t_init = max(1, t_init)
        times = [t_init]
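        # seed the loss history with a worst-case error of 1.0 so the first fitted
        # ensemble is treated as the best so far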
        losses = [1.0]

        e_hat, actual_times, sampled, ensembles = [], [], [], []
        k, t = ranks[0], times[0]

        start = time.time()
        counter, best = 0, 0
        while time.time() - start < self.runtime_limit - t:
            if verbose:
                print('Fitting with k={}, t={}'.format(k, t))
            t0 = time.time()
            self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                     self.stacking_hyperparams)
            self.fit(x_tr, y_tr, rank=k, runtime_limit=t)
            loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)

            # TEMPORARY: Record intermediate results
            e_hat.append(np.copy(self.new_row))
            actual_times.append(time.time() - start)
            sampled.append(self.sampled_indices)
            ensembles.append(self.ensemble)
            losses.append(loss)

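            # if this iteration achieved the best validation loss so far, allow a
            # higher rank next round and remember this iteration as the best model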
            if loss == min(losses):
                ranks.append(k + 1)
                best = counter
            else:
                ranks.append(k)

            times.append(2 * t)
            k = ranks[-1]
            t = times[-1]
            counter += 1

        # after all iterations, restore best model
        self.new_row = e_hat[best]
        self.ensemble = ensembles[best]
        return {
            'ranks': ranks[:-1],
            'runtime_limits': times[:-1],
            'validation_loss': losses,
            'predicted_new_row': e_hat,
            'actual_runtimes': actual_times,
            'sampled_indices': sampled,
            'models': ensembles
        }

    def refit(self, x_train, y_train):
        """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and
        stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        assert self.ensemble.fitted, "Cannot refit unless model has been fit."
        self.ensemble.fit(x_train, y_train)

    def predict(self, x_test):
        """Generate predictions on test data.

        Args:
            x_test (np.ndarray): Features of the test dataset.
        Returns:
            np.ndarray: Predicted labels.
        """
        return self.ensemble.predict(x_test)
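

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the class): a minimal example of the
# fit/predict workflow, assuming a 'classification' defaults file exists under
# DEFAULTS and scikit-learn is installed. The dataset, split size, and runtime
# limit below are placeholder choices, not recommendations.
if __name__ == '__main__':
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    data = load_breast_cancer()
    x_tr, x_te, y_tr, y_te = train_test_split(data.data, data.target,
                                               test_size=0.2, random_state=0)

    # fit an AutoLearner under a (hypothetical) 64-second budget and predict
    learner = AutoLearner('classification', runtime_limit=64, verbose=True)
    learner.fit(x_tr, y_tr)
    y_pred = learner.predict(x_te)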