Example #1
def multiple_models(data_hhb, data_hbo2, data_na, data_ca, Ytrain, Xtest):
    print('Fit the models for the ensemble')
    model1 = Ensemble()
    model2 = Ensemble()
    model3 = Ensemble()
    model4 = Ensemble()
    model1.fit(data_hhb, Ytrain.hhb)
    model2.fit(data_hbo2, Ytrain.hbo2)
    model3.fit(data_na, Ytrain.na)
    model4.fit(data_ca, Ytrain.ca)

    print('Predict the target values')
    tdata = DataSet(Xtest)
    tdata_hhb, tdata_hbo2, tdata_na, tdata_ca, _ = tdata.data_proccessing(
        False)
    hhb = model1.predict(tdata_hhb)
    hbo2 = model2.predict(tdata_hbo2)
    na = model3.predict(tdata_na)
    ca = model4.predict(tdata_ca)
    # Undo the shifted log transform presumably applied to the 'na' target
    # during training (an assumption; only this snippet is available).
    na = np.exp(na - 1.5) - 2

    return np.concatenate([
        hhb.reshape((-1, 1)),
        hbo2.reshape((-1, 1)),
        ca.reshape((-1, 1)),
        na.reshape((-1, 1)),
    ], axis=1)
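A minimal usage sketch, for illustration only: DataSet, Ensemble, Xtrain, and Xtest are assumed from the surrounding module, and the guess that data_proccessing(True) returns the targets in its final slot is inferred from the tdata.data_proccessing(False) call above.

# Hypothetical driver; every name except multiple_models is an assumption.
train = DataSet(Xtrain)
data_hhb, data_hbo2, data_na, data_ca, Ytrain = train.data_proccessing(True)
preds = multiple_models(data_hhb, data_hbo2, data_na, data_ca, Ytrain, Xtest)
print(preds.shape)  # (n_samples, 4): columns ordered hhb, hbo2, ca, na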
Example #2
class AutoLearner:
    """An object representing an automatically tuned machine learning model.

    Attributes:
        p_type (str): Problem type. One of {'classification', 'regression'}.
        algorithms (list): A list of algorithm types to be considered, as strings (e.g. ['KNN', 'lSVM', 'kSVM']).
        hyperparameters (dict): A nested dict of hyperparameters to be considered; see above for example.
        n_cores (int): Maximum number of cores over which to parallelize (None means no limit).
        verbose (bool): Whether or not to generate print statements when a model finishes fitting.
        stacking_alg (str): Algorithm type to use for stacked learner.
        **stacking_hyperparams (dict): Hyperparameter settings of stacked learner.
    """
    def __init__(self,
                 p_type,
                 algorithms=None,
                 hyperparameters=None,
                 n_cores=None,
                 verbose=False,
                 stacking_alg='Logit',
                 **stacking_hyperparams):

        # check if arguments to constructor are valid; set to defaults if not specified
        default, new = util.check_arguments(p_type, algorithms,
                                            hyperparameters)
        self.p_type = p_type.lower()
        self.algorithms = algorithms
        self.hyperparameters = hyperparameters
        self.n_cores = n_cores
        self.verbose = verbose

        if len(new) > 0:
            # if selected hyperparameters contain model configurations not included in default
            proceed = input(
                "Your selected hyperparameters contain some not included in the default error matrix. \n"
                "Do you want to generate your own error matrix? [yes/no]")
            if proceed == 'yes':
                subprocess.call(['./generate_matrix.sh'])
                # TODO: load newly generated error matrix file
            else:
                return
        else:
            # use default error matrix (or subset of)
            path = pkg_resources.resource_filename(
                __name__, 'defaults/error_matrix.csv')
            default_error_matrix = pd.read_csv(path, index_col=0)
            column_headings = np.array(
                [eval(heading) for heading in list(default_error_matrix)])
            # Keep the columns whose configuration appears among the selected
            # defaults; the boolean mask must run over the matrix columns.
            selected_indices = np.array(
                [heading in default for heading in column_headings])
            self.error_matrix = default_error_matrix.values[:, selected_indices]
            self.column_headings = sorted(default, key=lambda d: d['algorithm'])

        self.ensemble = Ensemble(self.p_type, stacking_alg,
                                 **stacking_hyperparams)
        self.optimized_settings = []
        self.new_row = None

    def fit(self, x_train, y_train):
        """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the
        new dataset, predict performance on the rest, then perform Bayesian optimization and construct an optimal
        ensemble model.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        print('Data={}'.format(x_train.shape))
        self.new_row = np.zeros((1, self.error_matrix.shape[1]))
        known_indices = linalg.pivot_columns(self.error_matrix)

        print('Sampling {} entries of new row...'.format(len(known_indices)))
        pool1 = mp.Pool(self.n_cores)
        sample_models = [
            Model(self.p_type,
                  self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'],
                  verbose=self.verbose) for i in known_indices
        ]
        sample_model_errors = [
            pool1.apply_async(Model.kfold_fit_validate,
                              args=[m, x_train, y_train, 5])
            for m in sample_models
        ]
        pool1.close()
        pool1.join()
        for i, error in enumerate(sample_model_errors):
            self.new_row[:, known_indices[i]] = error.get()[0].mean()
            # TODO: add predictions to second layer matrix?
        self.new_row = linalg.impute(self.error_matrix, self.new_row,
                                     known_indices)

        # Add new row to error matrix at the end (might be incorrect?)
        # self.error_matrix = np.vstack((self.error_matrix, self.new_row))

        # TODO: Fit ensemble candidates (?)

        if self.verbose:
            print('\nConducting Bayesian optimization...')
        n_models = 3
        pool2 = mp.Pool(self.n_cores)
        bayesian_opt_models = [
            Model(self.p_type,
                  self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'],
                  verbose=self.verbose)
            for i in np.argsort(self.new_row.flatten())[:n_models]
        ]
        optimized_hyperparams = pool2.map(Model.bayesian_optimize,
                                          bayesian_opt_models)
        pool2.close()
        pool2.join()
        for i, params in enumerate(optimized_hyperparams):
            bayesian_opt_models[i].hyperparameters = params
            self.ensemble.add_base_learner(bayesian_opt_models[i])
            self.optimized_settings.append({
                'algorithm': bayesian_opt_models[i].algorithm,
                'hyperparameters': bayesian_opt_models[i].hyperparameters,
            })

        if self.verbose:
            print('\nFitting optimized ensemble...')
        self.ensemble.fit(x_train, y_train)
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')

    def refit(self, x_train, y_train):
        """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and
        stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        assert self.ensemble.fitted, "Cannot refit unless model has been fit."
        self.ensemble.fit(x_train, y_train)

    def predict(self, x_test):
        """Generate predictions on test data.

        Args:
            x_test (np.ndarray): Features of the test dataset.
        """
        return self.ensemble.predict(x_test)
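A hedged usage sketch for this version of AutoLearner: the problem type and algorithm names are taken from the docstring above, the random arrays are purely illustrative, and the constructor may prompt interactively if the chosen hyperparameters are missing from the packaged error matrix.

import numpy as np

# Illustrative data only; any (n_samples, n_features) array and label vector work.
x_train = np.random.rand(200, 10)
y_train = np.random.randint(0, 2, size=200)
x_test = np.random.rand(50, 10)

learner = AutoLearner('classification', algorithms=['KNN', 'lSVM'], verbose=True)
learner.fit(x_train, y_train)
y_pred = learner.predict(x_test)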
Example #3
class AutoLearner:
    """An object representing an automatically tuned machine learning model.

    Attributes:
        p_type (str):                  Problem type. One of {'classification', 'regression'}.
        algorithms (list):             A list of algorithm types to be considered, as strings (e.g. ['KNN', 'lSVM']).
        hyperparameters (dict):        A nested dict of hyperparameters to be considered; see above for example.
        verbose (bool):                Whether or not to generate print statements when a model finishes fitting.

        n_cores (int):                 Maximum number of cores over which to parallelize (None means no limit).
        runtime_limit (int):           Maximum training time for AutoLearner, in seconds (powers of 2 preferred).

        selection_method (str):        Method of selecting entries of new row to sample.
        scalarization (str):           Scalarization of objective for min-variance selection. Either 'D' or 'A'.

        error_matrix (DataFrame):      Error matrix to use for imputation; includes index and headers.
        runtime_matrix (DataFrame):    Runtime matrix to use for runtime prediction; includes index and headers.
        column_headings (list):        Column headings of error/runtime matrices; list of dicts.
        X, Y (np.ndarray):             PCA decomposition of error matrix.

        new_row (np.ndarray):          Predicted row of error matrix.
        sampled_indices (set):         Indices of new row that have been sampled.
        sampled_models (list):         List of models that have been sampled (i.e. k-fold fitted).
        fitted_indices (set):          Indices of new row that have been fitted (i.e. included in the ensemble).
        fitted_models (list):          List of models that have been fitted.

        stacking_alg (str):            Algorithm type to use for stacked learner.
    """
    def __init__(self,
                 p_type,
                 algorithms=None,
                 hyperparameters=None,
                 verbose=False,
                 n_cores=mp.cpu_count(),
                 runtime_limit=512,
                 selection_method='min_variance',
                 scalarization='D',
                 error_matrix=None,
                 runtime_matrix=None,
                 stacking_alg='greedy',
                 **stacking_hyperparams):

        # TODO: check if arguments to constructor are valid; set to defaults if not specified
        assert selection_method in {'qr', 'min_variance'}, "The method to select entries to sample must be " \
            "either qr (QR decomposition) or min_variance (minimize variance with time constraints)."

        with open(os.path.join(DEFAULTS, p_type + '.json')) as file:
            defaults = json.load(file)

        # attributes of ML problem
        self.p_type = p_type.lower()
        self.algorithms = algorithms or defaults['algorithms']
        self.hyperparameters = hyperparameters or defaults['hyperparameters']
        self.verbose = verbose

        # computational considerations
        self.n_cores = n_cores
        self.runtime_limit = runtime_limit

        # sample column selection
        self.selection_method = selection_method
        self.scalarization = scalarization

        # error matrix attributes
        # TODO: determine whether to generate new error matrix or use default/subset of default
        self.error_matrix = ERROR_MATRIX if error_matrix is None else error_matrix
        self.runtime_matrix = RUNTIME_MATRIX if runtime_matrix is None else runtime_matrix
        assert util.check_dataframes(self.error_matrix, self.runtime_matrix)
        # column headings are string-serialized dicts of model configurations
        self.column_headings = np.array(
            [eval(heading) for heading in list(self.error_matrix)])
        self.X, self.Y, _ = linalg.pca(self.error_matrix.values,
                                       rank=min(self.error_matrix.shape) - 1)

        # sampled & fitted models
        self.new_row = np.zeros((1, self.error_matrix.shape[1]))
        self.sampled_indices = set()
        self.sampled_models = [None] * self.error_matrix.shape[1]
        self.fitted_indices = set()
        self.fitted_models = [None] * self.error_matrix.shape[1]

        # ensemble attributes
        self.stacking_alg = stacking_alg
        self.stacking_hyperparams = stacking_hyperparams
        self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                 self.stacking_hyperparams)

    def fit(self, x_train, y_train, rank=None, runtime_limit=None):
        """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the
        new dataset, predict performance on the rest, then construct an optimal ensemble model.

        Args:
            x_train (np.ndarray):  Features of the training dataset.
            y_train (np.ndarray):  Labels of the training dataset.
            rank (int):            Rank of error matrix factorization.
            runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
        """
        # set to defaults if not provided
        rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
        runtime_limit = runtime_limit or self.runtime_limit

        if self.verbose:
            print('Fitting AutoLearner with max. runtime {}s'.format(
                runtime_limit))
        t_predicted = convex_opt.predict_runtime(
            x_train.shape, runtime_matrix=self.runtime_matrix)

        t0 = time.time()
        while time.time() - t0 < runtime_limit / 2:
            # set of algorithms that are predicted to run in given budget
            options = np.where(
                t_predicted <= runtime_limit / 2 - (time.time() - t0))[0]
            # remove algorithms that have been sampled already
            options = list(set(options) - self.sampled_indices)
            if len(options) == 0:
                if len(self.ensemble.candidate_learners) == 0:
                    to_sample = np.argmin(t_predicted)
                else:
                    break
            else:
                to_sample = np.random.choice(options)
            self.sampled_indices.add(to_sample)
            self.sampled_models[to_sample] = Model(
                self.p_type, self.column_headings[to_sample]['algorithm'],
                self.column_headings[to_sample]['hyperparameters'],
                self.verbose, to_sample)
            self.sampled_models[to_sample].kfold_fit_validate(
                x_train, y_train, 5)
            self.ensemble.candidate_learners.append(
                self.sampled_models[to_sample])

        if self.verbose:
            print('\nFitting ensemble of max. size {}...'.format(
                len(self.ensemble.candidate_learners)))
        remaining = runtime_limit - (time.time() - t0)
        self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
        for model in self.ensemble.base_learners:
            assert model.index is not None
            self.fitted_indices.add(model.index)
            self.fitted_models[model.index] = model
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')

    def fit_doubling(self, x_train, y_train, verbose=False):
        """Fit an AutoLearner object, iteratively doubling allowed runtime."""
        t_predicted = convex_opt.predict_runtime(x_train.shape)

        # split data into training and validation sets
        try:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      stratify=y_train,
                                                      random_state=0)
        except ValueError:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=0)

        ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
        t_init = 2**np.floor(
            np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
        t_init = max(1, t_init)
        times = [t_init]
        losses = [1.0]

        e_hat, actual_times, sampled, ensembles = [], [], [], []
        k, t = ranks[0], times[0]

        start = time.time()
        counter, best = 0, 0
        while time.time() - start < self.runtime_limit - t:
            if verbose:
                print('Fitting with k={}, t={}'.format(k, t))
            t0 = time.time()
            self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                     self.stacking_hyperparams)
            self.fit(x_tr, y_tr, rank=k, runtime_limit=t)
            loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)

            # TEMPORARY: Record intermediate results
            e_hat.append(np.copy(self.new_row))
            actual_times.append(time.time() - start)
            sampled.append(set(self.sampled_indices))  # copy; the set mutates across rounds
            ensembles.append(self.ensemble)
            losses.append(loss)

            if loss == min(losses):
                ranks.append(k + 1)
                best = counter
            else:
                ranks.append(k)

            times.append(2 * t)
            k = ranks[-1]
            t = times[-1]
            counter += 1

        # after all iterations, restore best model
        self.new_row = e_hat[best]
        self.ensemble = ensembles[best]
        return {
            'ranks': ranks[:-1],
            'runtime_limits': times[:-1],
            'validation_loss': losses,
            'predicted_new_row': e_hat,
            'actual_runtimes': actual_times,
            'sampled_indices': sampled,
            'models': ensembles
        }

    def refit(self, x_train, y_train):
        """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and
        stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        assert self.ensemble.fitted, "Cannot refit unless model has been fit."
        self.ensemble.fit(x_train, y_train)

    def predict(self, x_test):
        """Generate predictions on test data.

        Args:
            x_test (np.ndarray): Features of the test dataset.
        Returns:
            np.ndarray: Predicted labels.
        """
        return self.ensemble.predict(x_test)
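A sketch of driving this variant, reusing x_train/y_train/x_test from the previous sketch; it additionally assumes the module-level ERROR_MATRIX and RUNTIME_MATRIX defaults and a bundled defaults/classification.json, which the constructor requires.

learner = AutoLearner('classification', runtime_limit=64,
                      selection_method='min_variance', scalarization='D')
learner.fit(x_train, y_train)    # samples models within the runtime budget
y_pred = learner.predict(x_test)
learner.refit(x_train, y_train)  # retrains base and stacked learners only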
Example #4
class AutoLearner:
    """An object representing an automatically tuned machine learning model.

    Attributes:
        p_type (str):                  Problem type. One of {'classification', 'regression'}.
        algorithms (list):             A list of algorithm types to be considered, as strings (e.g. ['KNN', 'lSVM']).
        hyperparameters (dict):        A nested dict of hyperparameters to be considered; see above for example.
        verbose (bool):                Whether or not to generate print statements when a model finishes fitting.

        n_cores (int):                 Maximum number of cores over which to parallelize (None means no limit).
        runtime_limit (int):           Maximum training time for AutoLearner, in seconds (powers of 2 preferred).

        selection_method (str):        Method of selecting entries of new row to sample.
        scalarization (str):           Scalarization of objective for min-variance selection. Either 'D' or 'A'.

        error_matrix (DataFrame):      Error matrix to use for imputation; includes index and headers.
        runtime_matrix (DataFrame):    Runtime matrix to use for runtime prediction; includes index and headers.
        column_headings (list):        Column headings of error/runtime matrices; list of dicts.
        X, Y (np.ndarray):             PCA decomposition of error matrix.

        new_row (np.ndarray):          Predicted row of error matrix.
        sampled_indices (set):         Indices of new row that have been sampled.
        sampled_models (list):         List of models that have been sampled (i.e. k-fold fitted).
        fitted_indices (set):          Indices of new row that have been fitted (i.e. included in the ensemble).
        fitted_models (list):          List of models that have been fitted.

        stacking_alg (str):            Algorithm type to use for stacked learner.
    """
    def __init__(self,
                 p_type,
                 algorithms=None,
                 hyperparameters=None,
                 verbose=False,
                 n_cores=mp.cpu_count(),
                 runtime_limit=512,
                 selection_method='min_variance',
                 scalarization='D',
                 error_matrix=None,
                 runtime_matrix=None,
                 stacking_alg='greedy',
                 **stacking_hyperparams):

        # TODO: check if arguments to constructor are valid; set to defaults if not specified
        assert selection_method in {'qr', 'min_variance'}, "The method to select entries to sample must be " \
            "either qr (QR decomposition) or min_variance (minimize variance with time constraints)."

        with open(os.path.join(DEFAULTS, p_type + '.json')) as file:
            defaults = json.load(file)

        # attributes of ML problem
        self.p_type = p_type.lower()
        self.algorithms = algorithms or defaults['algorithms']
        self.hyperparameters = hyperparameters or defaults['hyperparameters']
        self.verbose = verbose

        # computational considerations
        self.n_cores = n_cores
        self.runtime_limit = runtime_limit

        # sample column selection
        self.selection_method = selection_method
        self.scalarization = scalarization

        # error matrix attributes
        # TODO: determine whether to generate new error matrix or use default/subset of default
        self.error_matrix = ERROR_MATRIX if error_matrix is None else error_matrix
        self.runtime_matrix = RUNTIME_MATRIX if runtime_matrix is None else runtime_matrix
        assert util.check_dataframes(self.error_matrix, self.runtime_matrix)
        # column headings are string-serialized dicts of model configurations
        self.column_headings = np.array(
            [eval(heading) for heading in list(self.error_matrix)])
        self.X, self.Y, _ = linalg.pca(self.error_matrix.values,
                                       rank=min(self.error_matrix.shape) - 1)

        # sampled & fitted models
        self.new_row = np.zeros((1, self.error_matrix.shape[1]))
        self.sampled_indices = set()
        self.sampled_models = [None] * self.error_matrix.shape[1]
        self.fitted_indices = set()
        self.fitted_models = [None] * self.error_matrix.shape[1]

        # ensemble attributes
        self.stacking_alg = stacking_alg
        self.stacking_hyperparams = stacking_hyperparams
        self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                 self.stacking_hyperparams)

    def fit(self, x_train, y_train, rank=None, runtime_limit=None):
        """Fit an AutoLearner object on a new dataset. This will sample the performance of several algorithms on the
        new dataset, predict performance on the rest, then construct an optimal ensemble model.

        Args:
            x_train (np.ndarray):  Features of the training dataset.
            y_train (np.ndarray):  Labels of the training dataset.
            rank (int):            Rank of error matrix factorization.
            runtime_limit (float): Maximum time to allocate to AutoLearner fitting.
        """
        # set to defaults if not provided
        rank = rank or linalg.approx_rank(self.error_matrix, threshold=0.01)
        runtime_limit = runtime_limit or self.runtime_limit

        if self.verbose:
            print('Fitting AutoLearner with max. runtime {}s'.format(
                runtime_limit))
        t_predicted = convex_opt.predict_runtime(
            x_train.shape, runtime_matrix=self.runtime_matrix)

        if self.selection_method == 'qr':
            to_sample = linalg.pivot_columns(self.error_matrix)
        elif self.selection_method == 'min_variance':
            # select algorithms to sample only from subset of algorithms that will run in allocated time
            valid = np.where(
                t_predicted <= self.n_cores * runtime_limit / 2)[0]
            Y = self.Y[:rank, valid]
            # TODO: check if Y is rank-deficient, i.e. will ED problem fail?
            v_opt = convex_opt.solve(t_predicted[valid], runtime_limit / 4,
                                     self.n_cores, Y, self.scalarization)
            # if the solver failed and produced NaNs, fall back to sampling
            # the predicted-fastest columns
            if np.isnan(v_opt).any():
                to_sample = np.argsort(t_predicted)[:rank]
            else:
                to_sample = valid[np.where(v_opt > 0.9)[0]]
        else:
            to_sample = np.arange(0, self.new_row.shape[1])

        if len(to_sample) == 0 and len(self.sampled_indices) == 0:
            # if no columns are selected in first iteration (log det instability), sample n fastest columns
            n = len(
                np.where(np.cumsum(np.sort(t_predicted)) <= runtime_limit)[0])
            to_sample = np.argsort(t_predicted)[:n]

        # only need to compute column entry if it has not been computed already
        to_sample = list(set(to_sample) - self.sampled_indices)
        if self.verbose:
            print('Sampling {} entries of new row...'.format(len(to_sample)))
        start = time.time()
        p1 = mp.Pool(self.n_cores)
        sample_models = [
            Model(self.p_type, self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'], self.verbose, i)
            for i in to_sample
        ]
        sample_model_errors = [
            p1.apply_async(Model.kfold_fit_validate,
                           args=[m, x_train, y_train, 5])
            for m in sample_models
        ]
        p1.close()
        p1.join()

        # update sampled indices
        self.sampled_indices = self.sampled_indices.union(set(to_sample))
        for i, error in enumerate(sample_model_errors):
            cv_error, cv_predictions = error.get()
            sample_models[i].cv_error = cv_error.mean()
            sample_models[i].cv_predictions = cv_predictions
            sample_models[i].sampled = True
            self.new_row[:, to_sample[i]] = cv_error.mean()
            self.sampled_models[to_sample[i]] = sample_models[i]
        imputed = linalg.impute(self.error_matrix,
                                self.new_row,
                                list(self.sampled_indices),
                                rank=rank)

        # impute ALL entries
        # unknown = sorted(list(set(range(self.new_row.shape[1])) - self.sampled_indices))
        # self.new_row[:, unknown] = imputed[:, unknown]
        self.new_row = imputed

        # k-fold fit candidate learners of ensemble
        remaining = (runtime_limit - (time.time() - start)) * self.n_cores
        # add best sampled model to list of candidate learners to avoid empty lists
        best_sampled_idx = list(self.sampled_indices)[
            int(np.argmin(self.new_row[:, list(self.sampled_indices)]))]
        assert self.sampled_models[best_sampled_idx] is not None
        candidate_indices = [best_sampled_idx]
        self.ensemble.candidate_learners.append(
            self.sampled_models[best_sampled_idx])
        for i in np.argsort(self.new_row[0]):
            if (t_predicted[i] +
                    t_predicted[candidate_indices].sum()) <= remaining:
                last = candidate_indices.pop()
                assert last == best_sampled_idx
                candidate_indices.append(i)
                candidate_indices.append(last)
                # if model has already been k-fold fitted, immediately add to candidate learners
                if i in self.sampled_indices:
                    assert self.sampled_models[i] is not None
                    self.ensemble.candidate_learners.append(
                        self.sampled_models[i])
        # candidate learners that need to be k-fold fitted
        to_fit = list(set(candidate_indices) - self.sampled_indices)
        p2 = mp.Pool(self.n_cores)
        candidate_models = [
            Model(self.p_type, self.column_headings[i]['algorithm'],
                  self.column_headings[i]['hyperparameters'], self.verbose, i)
            for i in to_fit
        ]
        candidate_model_errors = [
            p2.apply_async(Model.kfold_fit_validate,
                           args=[m, x_train, y_train, 5])
            for m in candidate_models
        ]
        p2.close()
        p2.join()

        # update sampled indices
        self.sampled_indices = self.sampled_indices.union(set(to_fit))
        for i, error in enumerate(candidate_model_errors):
            cv_error, cv_predictions = error.get()
            candidate_models[i].cv_error = cv_error.mean()
            candidate_models[i].cv_predictions = cv_predictions
            candidate_models[i].sampled = True
            self.new_row[:, to_fit[i]] = cv_error.mean()
            self.sampled_models[to_fit[i]] = candidate_models[i]
            self.ensemble.candidate_learners.append(candidate_models[i])
        # self.new_row = linalg.impute(self.error_matrix, self.new_row, list(self.sampled_indices), rank=rank)

        if self.verbose:
            print('\nFitting ensemble of max. size {}...'.format(
                len(self.ensemble.candidate_learners)))
        self.ensemble.fit(x_train, y_train, remaining, self.fitted_models)
        for model in self.ensemble.base_learners:
            assert model.index is not None
            self.fitted_indices.add(model.index)
            self.fitted_models[model.index] = model
        self.ensemble.fitted = True

        if self.verbose:
            print('\nAutoLearner fitting complete.')

    def fit_doubling(self, x_train, y_train, verbose=False):
        """Fit an AutoLearner object, iteratively doubling allowed runtime."""
        t_predicted = convex_opt.predict_runtime(x_train.shape)

        # split data into training and validation sets
        try:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      stratify=y_train,
                                                      random_state=0)
        except ValueError:
            x_tr, x_va, y_tr, y_va = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.15,
                                                      random_state=0)

        ranks = [linalg.approx_rank(self.error_matrix, threshold=0.05)]
        t_init = 2**np.floor(
            np.log2(np.sort(t_predicted)[:int(1.1 * ranks[0])].sum()))
        t_init = max(1, t_init)
        times = [t_init]
        losses = [1.0]

        e_hat, actual_times, sampled, ensembles = [], [], [], []
        k, t = ranks[0], times[0]

        start = time.time()
        counter, best = 0, 0
        while time.time() - start < self.runtime_limit - t:
            if verbose:
                print('Fitting with k={}, t={}'.format(k, t))
            t0 = time.time()
            self.ensemble = Ensemble(self.p_type, self.stacking_alg,
                                     self.stacking_hyperparams)
            self.fit(x_tr, y_tr, rank=k, runtime_limit=t)
            loss = util.error(y_va, self.ensemble.predict(x_va), self.p_type)

            # TEMPORARY: Record intermediate results
            e_hat.append(np.copy(self.new_row))
            actual_times.append(time.time() - start)
            sampled.append(set(self.sampled_indices))  # copy; the set mutates across rounds
            ensembles.append(self.ensemble)
            losses.append(loss)

            if loss == min(losses):
                ranks.append(k + 1)
                best = counter
            else:
                ranks.append(k)

            times.append(2 * t)
            k = ranks[-1]
            t = times[-1]
            counter += 1

        # after all iterations, restore best model
        self.new_row = e_hat[best]
        self.ensemble = ensembles[best]
        return {
            'ranks': ranks[:-1],
            'runtime_limits': times[:-1],
            'validation_loss': losses,
            'predicted_new_row': e_hat,
            'actual_runtimes': actual_times,
            'sampled_indices': sampled,
            'models': ensembles
        }

    def refit(self, x_train, y_train):
        """Refit an existing AutoLearner object on a new dataset. This will simply retrain the base-learners and
        stacked learner of an existing model, and so algorithm and hyperparameter selection may not be optimal.

        Args:
            x_train (np.ndarray): Features of the training dataset.
            y_train (np.ndarray): Labels of the training dataset.
        """
        assert self.ensemble.fitted, "Cannot refit unless model has been fit."
        self.ensemble.fit(x_train, y_train)

    def predict(self, x_test):
        """Generate predictions on test data.

        Args:
            x_test (np.ndarray): Features of the test dataset.
        Returns:
            np.ndarray: Predicted labels.
        """
        return self.ensemble.predict(x_test)
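For this final variant, fit_doubling is the natural entry point: it repeatedly calls fit with a doubled budget, validates on a held-out split, and restores the best ensemble before returning a per-round record. A hedged sketch, again reusing the arrays from the earlier sketches:

learner = AutoLearner('classification', runtime_limit=256)
report = learner.fit_doubling(x_train, y_train, verbose=True)
print(report['runtime_limits'])   # budgets tried: t, 2t, 4t, ...
print(report['validation_loss'])  # leading 1.0 sentinel, then per-round losses
y_pred = learner.predict(x_test)  # predictions from the best recorded ensemble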