def ensemble_selection(predictions,
                       labels,
                       ensemble_size,
                       task_type,
                       metric,
                       do_pruning=False):
    '''
        Fast version of Rich Caruana's ensemble selection method
    '''

    ensemble = []
    trajectory = []
    order = []

    if do_pruning:
        n_best = 20
        indices = pruning(predictions, labels, n_best, task_type, metric)
        for idx in indices:
            ensemble.append(predictions[idx])
            order.append(idx)
            ensemble_ = np.array(ensemble).mean(axis=0)
            ensemble_performance = evaluator.calculate_score(
                labels, ensemble_, task_type, metric, ensemble_.shape[1])
            trajectory.append(ensemble_performance)
        ensemble_size = ensemble_size - n_best

    for i in range(ensemble_size):
        scores = np.zeros([predictions.shape[0]])
        s = len(ensemble)
        if s == 0:
            weighted_ensemble_prediction = np.zeros(predictions[0].shape)
        else:
            ensemble_prediction = np.mean(np.array(ensemble), axis=0)
            weighted_ensemble_prediction = (s /
                                            float(s + 1)) * ensemble_prediction
        for j, pred in enumerate(predictions):
            #ensemble.append(pred)
            #ensemble_prediction = np.mean(np.array(ensemble), axis=0)
            fant_ensemble_prediction = weighted_ensemble_prediction + (
                1. / float(s + 1)) * pred

            scores[j] = evaluator.calculate_score(
                labels, fant_ensemble_prediction, task_type, metric,
                fant_ensemble_prediction.shape[1])
            # ensemble.pop()
        best = np.nanargmax(scores)
        ensemble.append(predictions[best])
        trajectory.append(scores[best])
        order.append(best)

    return np.array(order), np.array(trajectory)
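For orientation, here is a minimal standalone sketch of the same greedy selection idea. It is not the auto-sklearn code: it scores candidates with plain argmax accuracy instead of evaluator.calculate_score, and the data is synthetic.

import numpy as np

def toy_ensemble_selection(predictions, labels, ensemble_size):
    # predictions: shape (n_models, n_samples, n_classes); labels: shape (n_samples,)
    ensemble_sum = np.zeros(predictions[0].shape)
    order, trajectory = [], []
    for step in range(1, ensemble_size + 1):
        # Evaluate every model as the next addition to the running sum
        scores = [np.mean(((ensemble_sum + pred) / step).argmax(axis=1) == labels)
                  for pred in predictions]
        best = int(np.argmax(scores))
        ensemble_sum += predictions[best]
        order.append(best)
        trajectory.append(scores[best])
    return np.array(order), np.array(trajectory)

rng = np.random.RandomState(0)
toy_preds = rng.rand(3, 5, 2)
toy_preds /= toy_preds.sum(axis=2, keepdims=True)
toy_labels = rng.randint(0, 2, size=5)
print(toy_ensemble_selection(toy_preds, toy_labels, ensemble_size=4))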
    def predict(self):
        Y_optimization_pred = self.predict_function(self.X_optimization,
                                                    self.model, self.task_type)
        if self.X_valid is not None:
            Y_valid_pred = self.predict_function(self.X_valid, self.model,
                                                 self.task_type)
        else:
            Y_valid_pred = None
        if self.X_test is not None:
            Y_test_pred = self.predict_function(self.X_test, self.model,
                                                self.task_type)
        else:
            Y_test_pred = None

        score = calculate_score(self.Y_optimization, Y_optimization_pred,
                                self.task_type, self.metric,
                                self.D.info['target_num'],
                                all_scoring_functions=self.all_scoring_functions)

        if hasattr(score, "__len__"):
            err = {key: 1 - score[key] for key in score}
        else:
            err = 1 - score

        if self.with_predictions:
            return err, Y_optimization_pred, Y_valid_pred, Y_test_pred
        return err
def pruning(predictions, labels, n_best, task_type, metric):
    # Score every model individually and keep the indices of the n_best best ones
    perf = np.zeros([predictions.shape[0]])
    for i, p in enumerate(predictions):
        perf[i] = evaluator.calculate_score(labels, p, task_type,
                                            metric, p.shape[1])

    indices = np.argsort(perf)[perf.shape[0] - n_best:]
    return indices
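The index selection above relies on np.argsort being ascending, so the last n_best positions hold the best-scoring models; a quick standalone check of that idiom with made-up scores:

import numpy as np

perf = np.array([0.61, 0.80, 0.42, 0.95, 0.73])
n_best = 3
best_indices = np.argsort(perf)[perf.shape[0] - n_best:]
print(best_indices)        # [4 1 3]
print(perf[best_indices])  # [0.73 0.8  0.95]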
def original_ensemble_selection(predictions,
                                labels,
                                ensemble_size,
                                task_type,
                                metric,
                                do_pruning=False):
    '''
        Rich Caruana's ensemble selection method
    '''

    ensemble = []
    trajectory = []
    order = []

    if do_pruning:
        n_best = 20
        indices = pruning(predictions, labels, n_best, task_type, metric)
        for idx in indices:
            ensemble.append(predictions[idx])
            order.append(idx)
            ensemble_ = np.array(ensemble).mean(axis=0)
            ensemble_performance = evaluator.calculate_score(
                labels, ensemble_, task_type, metric, ensemble_.shape[1])
            trajectory.append(ensemble_performance)
        ensemble_size = ensemble_size - n_best

    for i in range(ensemble_size):
        scores = np.zeros([predictions.shape[0]])
        for j, pred in enumerate(predictions):
            ensemble.append(pred)
            ensemble_prediction = np.mean(np.array(ensemble), axis=0)
            scores[j] = evaluator.calculate_score(labels, ensemble_prediction,
                                                  task_type, metric,
                                                  ensemble_prediction.shape[1])
            ensemble.pop()
        best = np.nanargmax(scores)
        ensemble.append(predictions[best])
        trajectory.append(scores[best])
        order.append(best)

    return np.array(order), np.array(trajectory)
def weighted_ensemble_error(weights, *args):
    predictions = args[0]
    true_labels = args[1]
    metric = args[2]
    task_type = args[3]

    weight_prime = weights / weights.sum()
    weighted_predictions = ensemble_prediction(predictions, weight_prime)

    score = evaluator.calculate_score(true_labels, weighted_predictions,
                                      task_type, metric,
                                      weighted_predictions.shape[1])
    return 1 - score
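weighted_ensemble_error is the kind of objective that a weight optimizer (the weighted_ensemble call used further below in main) would minimize. A standalone sketch of that pattern with scipy.optimize.minimize; the squared-error metric and the data are toy stand-ins, not the auto-sklearn metric:

import numpy as np
from scipy.optimize import minimize

def toy_weighted_error(weights, predictions, true_values):
    # Same shape of objective as weighted_ensemble_error, but with a toy
    # squared-error metric so the sketch has no auto-sklearn dependency.
    weights = weights / (weights.sum() + 1e-12)
    blended = np.tensordot(weights, predictions, axes=1)
    return np.mean((blended - true_values) ** 2)

rng = np.random.RandomState(0)
toy_predictions = rng.rand(3, 10)   # 3 models, 10 targets
toy_true = rng.rand(10)

init_weights = np.ones(3) / 3.0
result = minimize(toy_weighted_error, init_weights,
                  args=(toy_predictions, toy_true),
                  bounds=[(0.0, 1.0)] * 3)
print(result.x / result.x.sum())    # normalized ensemble weights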
def main(predictions_dir,
         basename,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None):
    watch = autosklearn.util.stopwatch.StopWatch()
    watch.start_task("ensemble_builder")

    used_time = 0
    time_iter = 0
    index_run = 0
    current_num_models = 0
    logging.basicConfig(filename=os.path.join(predictions_dir, "ensemble.log"),
                        level=logging.DEBUG)

    while used_time < limit:
        logging.debug("Time left: %f" % (limit - used_time))
        logging.debug("Time last iteration: %f" % time_iter)
        # Load the true labels of the validation data
        true_labels = np.load(
            os.path.join(predictions_dir, "true_labels_ensemble.npy"))

        # Load the predictions from the models
        all_predictions_train = []
        dir_ensemble = os.path.join(predictions_dir, "predictions_ensemble/")
        dir_valid = os.path.join(predictions_dir, "predictions_valid/")
        dir_test = os.path.join(predictions_dir, "predictions_test/")

        if not os.path.isdir(dir_ensemble) or not os.path.isdir(dir_valid) or \
                not os.path.isdir(dir_test):
            logging.debug("Prediction directory does not exist")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        dir_ensemble_list = sorted(os.listdir(dir_ensemble))
        dir_valid_list = sorted(os.listdir(dir_valid))
        dir_test_list = sorted(os.listdir(dir_test))

        if len(dir_ensemble_list) == 0:
            logging.debug("Directories are empty")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        if len(dir_ensemble_list) != len(dir_valid_list):
            logging.debug("Directories are inconsistent")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        if len(dir_ensemble_list) != len(dir_test_list):
            logging.debug("Directories are inconsistent")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        if len(dir_ensemble_list) <= current_num_models:
            logging.debug("Nothing has changed since the last time")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        watch.start_task("ensemble_iter_" + str(index_run))

        # Binary mask where True indicates that the corresponding model will be excluded from the ensemble
        exclude_mask = []
        if ensemble_size is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the model that are currently in our ensemble
            indices_nbest = []

        model_idx = 0
        for f in dir_ensemble_list:
            predictions = np.load(os.path.join(dir_ensemble, f))
            score = evaluator.calculate_score(true_labels, predictions,
                                              task_type, metric,
                                              predictions.shape[1])

            if ensemble_size is not None:
                if score <= 0.001:
                    exclude_mask.append(True)
                    logging.error("Model only predicts at random: " + f +
                                  " has score: " + str(score))
                # If we have fewer models in our ensemble than ensemble_size, add the current model if it is better than random
                elif len(scores_nbest) < ensemble_size:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    exclude_mask.append(False)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in our ensemble, replace it with the current model
                    if (scores_nbest[idx] < score):
                        logging.debug(
                            "Worst model in our ensemble: %d with score %f will be replaced by model %d with score %f"
                            % (idx, scores_nbest[idx], model_idx, score))
                        scores_nbest[idx] = score
                        # Exclude the old model
                        exclude_mask[int(indices_nbest[idx])] = True
                        indices_nbest[idx] = model_idx
                        exclude_mask.append(False)
                    # Otherwise exclude the current model from the ensemble
                    else:
                        exclude_mask.append(True)

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    exclude_mask.append(True)
                    logging.error("Model only predicts at random: " + f +
                                  " has score: " + str(score))
                else:
                    exclude_mask.append(False)
                    all_predictions_train.append(predictions)

            model_idx += 1
            print(exclude_mask)

        all_predictions_valid = []
        for i, f in enumerate(dir_valid_list):
            predictions = np.load(os.path.join(dir_valid, f))
            if not exclude_mask[i]:
                all_predictions_valid.append(predictions)

        all_predictions_test = []
        for i, f in enumerate(dir_test_list):
            predictions = np.load(os.path.join(dir_test, f))
            if not exclude_mask[i]:
                all_predictions_test.append(predictions)

        if len(all_predictions_train) == len(all_predictions_test) == len(
                all_predictions_valid) == 0:
            logging.error("All models do just random guessing")
            time.sleep(2)
            continue

        if len(all_predictions_train) == 1:
            logging.debug("Only one model so far we just copy its predictions")
            Y_valid = all_predictions_valid[0]
            Y_test = all_predictions_test[0]
        else:
            try:
                # Compute the weights for the ensemble
                # Use equally initialized weights
                n_models = len(all_predictions_train)
                init_weights = np.ones([n_models]) / n_models

                weights = weighted_ensemble(np.array(all_predictions_train),
                                            true_labels, task_type, metric,
                                            init_weights)
            except ValueError:
                logging.error("Caught ValueError!")
                used_time = watch.wall_elapsed("ensemble_builder")
                continue
            except:
                logging.error("Caught error!")
                used_time = watch.wall_elapsed("ensemble_builder")
                continue

            # Compute the ensemble predictions for the valid data
            Y_valid = ensemble_prediction(np.array(all_predictions_valid),
                                          weights)

            # Compute the ensemble predictions for the test data
            Y_test = ensemble_prediction(np.array(all_predictions_test),
                                         weights)

        # Save predictions for valid and test data set
        filename_valid = os.path.join(
            output_dir,
            basename + '_valid_' + str(index_run).zfill(3) + '.predict')
        data_util.save_predictions(
            os.path.join(predictions_dir, filename_valid), Y_valid)

        filename_test = os.path.join(
            output_dir,
            basename + '_test_' + str(index_run).zfill(3) + '.predict')
        data_util.save_predictions(
            os.path.join(predictions_dir, filename_test), Y_test)

        current_num_models = len(dir_ensemble_list)
        watch.stop_task("ensemble_iter_" + str(index_run))
        time_iter = watch.get_wall_dur("ensemble_iter_" + str(index_run))
        used_time = watch.wall_elapsed("ensemble_builder")
        index_run += 1
    return
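The n-best bookkeeping inside the loop above (keep at most ensemble_size models and replace the current worst when a better one arrives) can be isolated into a small standalone sketch; the scores are made up and there is no file I/O:

import numpy as np

ensemble_size = 3
scores_nbest, indices_nbest, exclude_mask = [], [], []

toy_scores = [0.40, 0.10, 0.55, 0.62, 0.30, 0.70]   # one score per incoming model
for model_idx, score in enumerate(toy_scores):
    if len(scores_nbest) < ensemble_size:
        scores_nbest.append(score)
        indices_nbest.append(model_idx)
        exclude_mask.append(False)
    else:
        worst = int(np.argmin(scores_nbest))
        if scores_nbest[worst] < score:
            exclude_mask[indices_nbest[worst]] = True   # drop the old member
            scores_nbest[worst] = score
            indices_nbest[worst] = model_idx
            exclude_mask.append(False)
        else:
            exclude_mask.append(True)

print(indices_nbest)   # [5, 3, 2] -> the models scoring 0.70, 0.62 and 0.55 survive
print(exclude_mask)    # [True, True, False, False, True, False]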
def main(predictions_dir,
         basename,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None,
         seed=1,
         indices_output_dir="."):
    watch = StopWatch()
    watch.start_task("ensemble_builder")

    task_type = STRING_TO_TASK_TYPES[task_type]

    used_time = 0
    time_iter = 0
    index_run = 0
    current_num_models = 0
    logging.basicConfig(filename=os.path.join(predictions_dir,
                                              "ensemble_%d.log" % seed),
                        level=logging.DEBUG)

    while used_time < limit:
        logging.debug("Time left: %f", limit - used_time)
        logging.debug("Time last iteration: %f", time_iter)
        # Load the true labels of the validation data
        true_labels = np.load(
            os.path.join(predictions_dir, "true_labels_ensemble.npy"))

        # Load the predictions from the models
        dir_ensemble = os.path.join(predictions_dir,
                                    "predictions_ensemble_%s/" % seed)
        dir_valid = os.path.join(predictions_dir,
                                 "predictions_valid_%s/" % seed)
        dir_test = os.path.join(predictions_dir, "predictions_test_%s/" % seed)

        paths_ = [dir_ensemble, dir_valid, dir_test]
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  #all(exists):
            logging.debug("Prediction directory %s does not exist!" %
                          dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        dir_ensemble_list = sorted(os.listdir(dir_ensemble))
        dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
        dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        if len(dir_ensemble_list) == 0:
            logging.debug("Directories are empty")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        if len(dir_ensemble_list) <= current_num_models:
            logging.debug("Nothing has changed since the last time")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        watch.start_task("ensemble_iter_" + str(index_run))

        # List of num_runs (which are in the filename) which will be included
        #  later
        include_num_runs = []
        re_num_run = re.compile(r'_([0-9]*)\.npy$')
        if ensemble_size is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the model that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []
            # The num run of the models
            num_runs = []

        model_names_to_scores = dict()

        model_idx = 0
        for model_name in dir_ensemble_list:
            predictions = np.load(os.path.join(dir_ensemble, model_name))
            score = evaluator.calculate_score(true_labels, predictions,
                                              task_type, metric,
                                              predictions.shape[1])
            model_names_to_scores[model_name] = score
            num_run = int(re_num_run.search(model_name).group(1))

            if ensemble_size is not None:
                if score <= 0.001:
                    # include_num_runs.append(True)
                    logging.error("Model only predicts at random: " +
                                  model_name + " has score: " + str(score))
                # If we have fewer models in our ensemble than ensemble_size, add the current model if it is better than random
                elif len(scores_nbest) < ensemble_size:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append(num_run)
                    model_names.append(model_name)
                    num_runs.append(num_run)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(np.array([scores_nbest]))

                    # If the current model is better than the worst model in our ensemble, replace it with the current model
                    if (scores_nbest[idx] < score):
                        logging.debug(
                            "Worst model in our ensemble: %s with "
                            "score %f will be replaced by model %s "
                            "with score %f", model_names[idx],
                            scores_nbest[idx], model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append(num_run)
                        del model_names[idx]
                        model_names.append(model_name)
                        del num_runs[idx]
                        num_runs.append(num_run)

                    # Otherwise exclude the current model from the ensemble
                    else:
                        #include_num_runs.append(True)
                        pass

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    #include_num_runs.append(True)
                    logging.error("Model only predicts at random: " +
                                  model_name + " has score: " + str(score))
                else:
                    include_num_runs.append(num_run)

            model_idx += 1

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = num_run

        #logging.info("Indices to model names:")
        #logging.info(indices_to_model_names)

        #for i, item in enumerate(sorted(model_names_to_scores.items(),
        #                                key=lambda t: t[1])):
        #    logging.info("%d: %s", i, item)

        include_num_runs = set(include_num_runs)

        all_predictions_train = []
        for i, model_name in enumerate(dir_ensemble_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                predictions = np.load(os.path.join(dir_ensemble, model_name))
                all_predictions_train.append(predictions)

        all_predictions_valid = []
        for i, model_name in enumerate(dir_valid_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                predictions = np.load(os.path.join(dir_valid, model_name))
                all_predictions_valid.append(predictions)

        all_predictions_test = []
        for i, model_name in enumerate(dir_test_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                predictions = np.load(os.path.join(dir_test, model_name))
                all_predictions_test.append(predictions)

        if len(all_predictions_train) == len(all_predictions_test) == len(
                all_predictions_valid) == 0:
            logging.error("All models do just random guessing")
            time.sleep(2)
            continue

        elif len(all_predictions_train) == 1:
            logging.debug("Only one model so far, we just copy its predictions")
            ensemble_members_run_numbers = {0: 1.0}
            # Use the only available model for the ensemble predictions below
            indices = np.array([0])

            # Output the score
            logging.info("Training performance: %f" %
                         max(model_names_to_scores.values()))
        else:
            try:
                indices, trajectory = ensemble_selection(
                    np.array(all_predictions_train), true_labels,
                    ensemble_size, task_type, metric)

                logging.info("Trajectory and indices!")
                logging.info(trajectory)
                logging.info(indices)

            except ValueError as e:
                logging.error("Caught ValueError: " + str(e))
                used_time = watch.wall_elapsed("ensemble_builder")
                continue
            except Exception as e:
                logging.error("Caught error! %s", e.message)
                used_time = watch.wall_elapsed("ensemble_builder")
                continue

            # Output the score
            logging.info("Training performance: %f" % trajectory[-1])

            # Print the ensemble members:
            ensemble_members_run_numbers = dict()
            ensemble_members = Counter(indices).most_common()
            ensemble_members_string = "Ensemble members:\n"
            logging.info(ensemble_members)
            for ensemble_member in ensemble_members:
                weight = float(ensemble_member[1]) / len(indices)
                ensemble_members_string += \
                    ("    %s; weight: %10f; performance: %10f\n" %
                     (indices_to_model_names[ensemble_member[0]],
                      weight,
                      model_names_to_scores[indices_to_model_names[ensemble_member[0]]]))

                ensemble_members_run_numbers[indices_to_run_num[
                    ensemble_member[0]]] = weight
            logging.info(ensemble_members_string)

        # Save the ensemble indices for later use!
        filename_indices = os.path.join(indices_output_dir,
                                        str(index_run).zfill(5) + ".indices")

        logging.info(ensemble_members_run_numbers)
        with open(filename_indices, "w") as fh:
            pickle.dump(ensemble_members_run_numbers, fh)

        # Save predictions for valid and test data set
        if len(dir_valid_list) == len(dir_ensemble_list):
            ensemble_predictions_valid = np.mean(
                np.array(all_predictions_valid)[indices.astype(int)], axis=0)
            filename_valid = os.path.join(
                output_dir,
                basename + '_valid_' + str(index_run).zfill(3) + '.predict')
            data_util.save_predictions(
                os.path.join(predictions_dir, filename_valid),
                ensemble_predictions_valid)
        else:
            logging.info("Could not find as many validation set predictions "
                         "as ensemble predictions!.")

        if len(dir_test_list) == len(dir_ensemble_list):
            ensemble_predictions_test = np.mean(
                np.array(all_predictions_test)[indices.astype(int)], axis=0)
            filename_test = os.path.join(
                output_dir,
                basename + '_test_' + str(index_run).zfill(3) + '.predict')
            data_util.save_predictions(
                os.path.join(predictions_dir, filename_test),
                ensemble_predictions_test)
        else:
            logging.info("Could not find as many test set predictions as "
                         "ensemble predictions!")

        current_num_models = len(dir_ensemble_list)
        watch.stop_task("ensemble_iter_" + str(index_run))
        time_iter = watch.get_wall_dur("ensemble_iter_" + str(index_run))
        used_time = watch.wall_elapsed("ensemble_builder")
        index_run += 1
    return
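Each .indices file written above is just a pickled dict mapping a model's num_run to its ensemble weight. A self-contained sketch of writing and reading such a file and blending predictions with it; the file names and the predictions here are invented for illustration:

import pickle
import numpy as np

# Toy weights in the same format main() pickles: {num_run: weight}
run_to_weight = {3: 0.6, 7: 0.4}
with open("00000.indices", "wb") as fh:
    pickle.dump(run_to_weight, fh)

# Toy per-model test predictions, keyed by num_run (paths are illustrative only)
rng = np.random.RandomState(0)
for num_run in run_to_weight:
    np.save("toy_predictions_test_%d.npy" % num_run, rng.rand(5, 2))

# Read the weights back and compute the weighted ensemble prediction
with open("00000.indices", "rb") as fh:
    weights = pickle.load(fh)
ensemble_pred = sum(w * np.load("toy_predictions_test_%d.npy" % run)
                    for run, w in weights.items())
print(ensemble_pred.shape)   # (5, 2)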
Example #14
    def predict(self):
        Y_optimization_pred = [None] * self.cv_folds
        Y_targets = [None] * self.cv_folds
        Y_valid_pred = [None] * self.cv_folds
        Y_test_pred = [None] * self.cv_folds

        for i in range(self.cv_folds):
            # To support prediction when only partial_fit was called
            if self.models[i] is None:
                continue

            train_indices, test_indices = self.indices[i]
            opt_pred = self.predict_function(self.X_train[test_indices],
                                             self.models[i], self.task_type,
                                             self.Y_train[train_indices])

            Y_optimization_pred[i] = opt_pred
            Y_targets[i] = self.Y_train[test_indices]

            if self.X_valid is not None:
                X_valid = self.X_valid.copy()
                valid_pred = self.predict_function(X_valid, self.models[i],
                                                   self.task_type,
                                                   self.Y_train[train_indices])
                Y_valid_pred[i] = valid_pred

            if self.X_test is not None:
                X_test = self.X_test.copy()
                test_pred = self.predict_function(X_test, self.models[i],
                                                  self.task_type,
                                                  self.Y_train[train_indices])
                Y_test_pred[i] = test_pred

        Y_optimization_pred = np.concatenate([
            Y_optimization_pred[i] for i in range(self.cv_folds)
            if Y_optimization_pred[i] is not None
        ])
        Y_targets = np.concatenate([
            Y_targets[i] for i in range(self.cv_folds)
            if Y_targets[i] is not None
        ])

        if self.X_valid is not None:
            Y_valid_pred = np.array([
                Y_valid_pred[i] for i in range(self.cv_folds)
                if Y_valid_pred[i] is not None
            ])
            # Average the predictions of several models
            if len(Y_valid_pred.shape) == 3:
                Y_valid_pred = np.nanmean(Y_valid_pred, axis=0)

        if self.X_test is not None:
            Y_test_pred = np.array([
                Y_test_pred[i] for i in range(self.cv_folds)
                if Y_test_pred[i] is not None
            ])
            # Average the predictions of several models
            if len(Y_test_pred.shape) == 3:
                Y_test_pred = np.nanmean(Y_test_pred, axis=0)

        self.Y_optimization = Y_targets
        score = calculate_score(
            Y_targets,
            Y_optimization_pred,
            self.task_type,
            self.metric,
            self.D.info['target_num'],
            all_scoring_functions=self.all_scoring_functions)

        if hasattr(score, "__len__"):
            err = {key: 1 - score[key] for key in score}
        else:
            err = 1 - score

        if self.with_predictions:
            return err, Y_optimization_pred, Y_valid_pred, Y_test_pred
        return err
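The np.nanmean over axis 0 used above collapses the per-fold prediction matrices into a single one; a tiny standalone illustration with fake fold predictions:

import numpy as np

# Fake probability predictions from 3 CV folds on 4 samples with 2 classes
fold_preds = np.random.rand(3, 4, 2)
fold_preds /= fold_preds.sum(axis=2, keepdims=True)

averaged = np.nanmean(fold_preds, axis=0)   # shape (4, 2), one row per sample
print(averaged.sum(axis=1))                 # each row still sums to 1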
    def predict(self):
        # First, obtain the predictions for the ensembles, the validation and
        #  the test set!
        outer_scores = defaultdict(list)
        inner_scores = defaultdict(list)
        Y_optimization_pred = [None] * self.outer_cv_folds
        Y_targets = [None] * self.outer_cv_folds
        Y_valid_pred = [None] * self.outer_cv_folds
        Y_test_pred = [None] * self.outer_cv_folds

        for i in range(self.outer_cv_folds):
            train_indices, test_indices = self.outer_indices[i]
            opt_pred = self.predict_function(
                self.X_train[test_indices],
                self.outer_models[i],
                self.task_type,
                Y_train=self.Y_train[train_indices])

            Y_optimization_pred[i] = opt_pred
            Y_targets[i] = self.Y_train[test_indices]

            if self.X_valid is not None:
                X_valid = self.X_valid.copy()
                valid_pred = self.predict_function(
                    X_valid,
                    self.outer_models[i],
                    self.task_type,
                    Y_train=self.Y_train[train_indices])
                Y_valid_pred[i] = valid_pred

            if self.X_test is not None:
                X_test = self.X_test.copy()
                test_pred = self.predict_function(
                    X_test,
                    self.outer_models[i],
                    self.task_type,
                    Y_train=self.Y_train[train_indices])
                Y_test_pred[i] = test_pred

        # Calculate the outer scores
        for i in range(self.outer_cv_folds):
            scores = calculate_score(
                Y_targets[i],
                Y_optimization_pred[i],
                self.task_type,
                self.metric,
                self.D.info['target_num'],
                all_scoring_functions=self.all_scoring_functions)
            if self.all_scoring_functions:
                for score_name in scores:
                    outer_scores[score_name].append(scores[score_name])
            else:
                outer_scores[self.metric].append(scores)

        Y_optimization_pred = np.concatenate([
            Y_optimization_pred[i] for i in range(self.outer_cv_folds)
            if Y_optimization_pred[i] is not None
        ])
        Y_targets = np.concatenate([
            Y_targets[i] for i in range(self.outer_cv_folds)
            if Y_targets[i] is not None
        ])

        if self.X_valid is not None:
            Y_valid_pred = np.array([
                Y_valid_pred[i] for i in range(self.outer_cv_folds)
                if Y_valid_pred[i] is not None
            ])
            # Average the predictions of several models
            if len(Y_valid_pred.shape) == 3:
                Y_valid_pred = np.nanmean(Y_valid_pred, axis=0)

        if self.X_test is not None:
            Y_test_pred = np.array([
                Y_test_pred[i] for i in range(self.outer_cv_folds)
                if Y_test_pred[i] is not None
            ])
            # Average the predictions of several models
            if len(Y_test_pred.shape) == 3:
                Y_test_pred = np.nanmean(Y_test_pred, axis=0)

        self.Y_optimization = Y_targets

        # Second, calculate the inner score
        for outer_fold in range(self.outer_cv_folds):
            for inner_fold in range(self.inner_cv_folds):
                inner_train_indices, inner_test_indices = self.inner_indices[
                    outer_fold][inner_fold]
                Y_test = self.Y_train[inner_test_indices]
                X_test = self.X_train[inner_test_indices]
                model = self.inner_models[outer_fold][inner_fold]
                Y_hat = self.predict_function(
                    X_test,
                    model,
                    self.task_type,
                    Y_train=self.Y_train[inner_train_indices])
                scores = calculate_score(
                    Y_test,
                    Y_hat,
                    self.task_type,
                    self.metric,
                    self.D.info['target_num'],
                    all_scoring_functions=self.all_scoring_functions)
                if self.all_scoring_functions:
                    for score_name in scores:
                        inner_scores[score_name].append(scores[score_name])
                else:
                    inner_scores[self.metric].append(scores)

        # Average the scores!
        if self.all_scoring_functions:
            inner_err = {
                key: 1 - np.mean(inner_scores[key])
                for key in inner_scores
            }
            outer_err = {
                "outer:%s" % key: 1 - np.mean(outer_scores[key])
                for key in outer_scores
            }
            inner_err.update(outer_err)
        else:
            inner_err = 1 - np.mean(inner_scores[self.metric])

        if self.with_predictions:
            return inner_err, Y_optimization_pred, Y_valid_pred, Y_test_pred
        return inner_err
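The error aggregation at the end merges inner- and outer-CV scores into one dict keyed per metric; a toy illustration of that merge with invented score values:

import numpy as np

inner_scores = {"acc": [0.80, 0.82], "f1": [0.75, 0.77]}
outer_scores = {"acc": [0.79], "f1": [0.74]}

inner_err = {key: 1 - np.mean(inner_scores[key]) for key in inner_scores}
outer_err = {"outer:%s" % key: 1 - np.mean(outer_scores[key])
             for key in outer_scores}
inner_err.update(outer_err)
print(inner_err)   # roughly {'acc': 0.19, 'f1': 0.24, 'outer:acc': 0.21, 'outer:f1': 0.26}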
Example #16
    def score(self, X, y):
        prediction = self.predict(X)
        return evaluator.calculate_score(y, prediction, self.task_,
                                         self.metric_, self.target_num_)