def main(predictions_dir,
         basename,
         task_type,
         metric,
         limit,
         output_dir,
         ensemble_size=None):
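    """Build an ensemble from the predictions stored in ``predictions_dir``.

    Repeatedly scans the ensemble/valid/test prediction directories, scores
    every model against the true ensemble labels, excludes models that only
    predict at random, combines the remaining predictions with optimized
    weights and saves the resulting validation and test predictions, until
    ``limit`` seconds of wall-clock time have been used.
    """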
    watch = autosklearn.util.stopwatch.StopWatch()
    watch.start_task("ensemble_builder")

    used_time = 0
    time_iter = 0
    index_run = 0
    current_num_models = 0
    logging.basicConfig(filename=os.path.join(predictions_dir, "ensemble.log"),
                        level=logging.DEBUG)

    while used_time < limit:
        logging.debug("Time left: %f" % (limit - used_time))
        logging.debug("Time last iteration: %f" % time_iter)
        # Load the true labels of the validation data
        true_labels = np.load(
            os.path.join(predictions_dir, "true_labels_ensemble.npy"))

        # Load the predictions from the models
        all_predictions_train = []
        dir_ensemble = os.path.join(predictions_dir, "predictions_ensemble/")
        dir_valid = os.path.join(predictions_dir, "predictions_valid/")
        dir_test = os.path.join(predictions_dir, "predictions_test/")

        if not os.path.isdir(dir_ensemble) or not os.path.isdir(dir_valid) or \
                not os.path.isdir(dir_test):
            logging.debug("Prediction directory does not exist")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        dir_ensemble_list = sorted(os.listdir(dir_ensemble))
        dir_valid_list = sorted(os.listdir(dir_valid))
        dir_test_list = sorted(os.listdir(dir_test))

        if len(dir_ensemble_list) == 0:
            logging.debug("Directories are empty")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        if len(dir_ensemble_list) != len(dir_valid_list):
            logging.debug("Directories are inconsistent")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        if len(dir_ensemble_list) != len(dir_test_list):
            logging.debug("Directories are inconsistent")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        if len(dir_ensemble_list) <= current_num_models:
            logging.debug("Nothing has changed since the last time")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        watch.start_task("ensemble_iter_" + str(index_run))

        # Binary mask where True indicates that the corresponding model will
        # be excluded from the ensemble
        exclude_mask = []
        if ensemble_size is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the model that are currently in our ensemble
            indices_nbest = []

        model_idx = 0
        for f in dir_ensemble_list:
            predictions = np.load(os.path.join(dir_ensemble, f))
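            # predictions.shape[1] is presumably the number of classes /
            # output columns that calculate_score expects for this metric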
            score = evaluator.calculate_score(true_labels, predictions,
                                              task_type, metric,
                                              predictions.shape[1])

            if ensemble_size is not None:
                if score <= 0.001:
                    exclude_mask.append(True)
                    logging.error("Model only predicts at random: " + f +
                                  " has score: " + str(score))
                # If we have fewer models in our ensemble than ensemble_size,
                # add the current model if it is better than random
                elif len(scores_nbest) < ensemble_size:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    exclude_mask.append(False)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(scores_nbest)

                    # If the current model is better than the worst model in
                    # our ensemble, replace the worst one with it
                    if scores_nbest[idx] < score:
                        logging.debug(
                            "Worst model in our ensemble: %d with score %f will be replaced by model %d with score %f"
                            % (idx, scores_nbest[idx], model_idx, score))
                        scores_nbest[idx] = score
                        # Exclude the old model
                        exclude_mask[int(indices_nbest[idx])] = True
                        indices_nbest[idx] = model_idx
                        exclude_mask.append(False)
                    # Otherwise exclude the current model from the ensemble
                    else:
                        exclude_mask.append(True)

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    exclude_mask.append(True)
                    logging.error("Model only predicts at random: " + f +
                                  " has score: " + str(score))
                else:
                    exclude_mask.append(False)
                    all_predictions_train.append(predictions)

            model_idx += 1
            print(exclude_mask)
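        # Note: exclude_mask is indexed by position below, which assumes the
        # valid and test prediction files are listed in the same sorted order
        # as dir_ensemble_list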

        all_predictions_valid = []
        for i, f in enumerate(dir_valid_list):
            predictions = np.load(os.path.join(dir_valid, f))
            if not exclude_mask[i]:
                all_predictions_valid.append(predictions)

        all_predictions_test = []
        for i, f in enumerate(dir_test_list):
            predictions = np.load(os.path.join(dir_test, f))
            if not exclude_mask[i]:
                all_predictions_test.append(predictions)

        if len(all_predictions_train) == len(all_predictions_test) == len(
                all_predictions_valid) == 0:
            logging.error("All models do just random guessing")
            time.sleep(2)
            continue

        if len(all_predictions_train) == 1:
            logging.debug("Only one model so far we just copy its predictions")
            Y_valid = all_predictions_valid[0]
            Y_test = all_predictions_test[0]
        else:
            try:
                # Compute the weights for the ensemble
                # Use equally initialized weights
                n_models = len(all_predictions_train)
                init_weights = np.ones([n_models]) / n_models
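                # weighted_ensemble is assumed to optimize these weights for
                # the given metric, starting from the uniform initialization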

                weights = weighted_ensemble(np.array(all_predictions_train),
                                            true_labels, task_type, metric,
                                            init_weights)
            except ValueError as e:
                logging.error("Caught ValueError: %s", e)
                used_time = watch.wall_elapsed("ensemble_builder")
                continue
            except Exception as e:
                logging.error("Caught error: %s", e)
                used_time = watch.wall_elapsed("ensemble_builder")
                continue

            # Compute the ensemble predictions for the valid data
            Y_valid = ensemble_prediction(np.array(all_predictions_valid),
                                          weights)

            # Compute the ensemble predictions for the test data
            Y_test = ensemble_prediction(np.array(all_predictions_test),
                                         weights)

        # Save predictions for valid and test data set
        filename_valid = os.path.join(
            output_dir,
            basename + '_valid_' + str(index_run).zfill(3) + '.predict')
        data_util.save_predictions(
            os.path.join(predictions_dir, filename_valid), Y_valid)

        filename_test = os.path.join(
            output_dir,
            basename + '_test_' + str(index_run).zfill(3) + '.predict')
        data_util.save_predictions(
            os.path.join(predictions_dir, filename_test), Y_test)

        current_num_models = len(dir_ensemble_list)
        watch.stop_task("ensemble_iter_" + str(index_run))
        time_iter = watch.get_wall_dur("ensemble_iter_" + str(index_run))
        used_time = watch.wall_elapsed("ensemble_builder")
        index_run += 1
    return
def main(predictions_dir, basename, task_type, metric, limit, output_dir,
         ensemble_size=None, seed=1, indices_output_dir="."):
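    """Build an ensemble via ensemble selection for a single ``seed``.

    Works like the builder above, but the prediction directories carry a
    ``seed`` suffix, models are combined with ``ensemble_selection``, and
    the selected run numbers with their weights are pickled to
    ``indices_output_dir`` for later use.
    """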
    watch = StopWatch()
    watch.start_task("ensemble_builder")

    task_type = STRING_TO_TASK_TYPES[task_type]

    used_time = 0
    time_iter = 0
    index_run = 0
    current_num_models = 0
    logging.basicConfig(filename=os.path.join(predictions_dir,
                                              "ensemble_%d.log" % seed),
                        level=logging.DEBUG)

    while used_time < limit:
        logging.debug("Time left: %f", limit - used_time)
        logging.debug("Time last iteration: %f", time_iter)
        # Load the true labels of the validation data
        true_labels = np.load(
            os.path.join(predictions_dir, "true_labels_ensemble.npy"))

        # Load the predictions from the models
        dir_ensemble = os.path.join(predictions_dir,
                                    "predictions_ensemble_%s/" % seed)
        dir_valid = os.path.join(predictions_dir,
                                 "predictions_valid_%s/" % seed)
        dir_test = os.path.join(predictions_dir,
                                "predictions_test_%s/" % seed)

        paths_ = [dir_ensemble, dir_valid, dir_test]
        exists = [os.path.isdir(dir_) for dir_ in paths_]
        if not exists[0]:  # only the ensemble dir is strictly required
            logging.debug("Prediction directory %s does not exist!",
                          dir_ensemble)
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        dir_ensemble_list = sorted(os.listdir(dir_ensemble))
        dir_valid_list = sorted(os.listdir(dir_valid)) if exists[1] else []
        dir_test_list = sorted(os.listdir(dir_test)) if exists[2] else []

        if len(dir_ensemble_list) == 0:
            logging.debug("Directories are empty")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        if len(dir_ensemble_list) <= current_num_models:
            logging.debug("Nothing has changed since the last time")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        watch.start_task("ensemble_iter_" + str(index_run))

        # List of num_runs (which are in the filename) which will be included
        #  later
        include_num_runs = []
        re_num_run = re.compile(r'_([0-9]*)\.npy$')
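        # The run number is parsed from file names ending in '_<num_run>.npy'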
        if ensemble_size is not None:
            # Keeps track of the single scores of each model in our ensemble
            scores_nbest = []
            # The indices of the model that are currently in our ensemble
            indices_nbest = []
            # The names of the models
            model_names = []
            # The num run of the models
            num_runs = []

        model_names_to_scores = dict()

        model_idx = 0
        for model_name in dir_ensemble_list:
            predictions = np.load(os.path.join(dir_ensemble, model_name))
            score = evaluator.calculate_score(true_labels, predictions,
                                              task_type, metric,
                                              predictions.shape[1])
            model_names_to_scores[model_name] = score
            num_run = int(re_num_run.search(model_name).group(1))

            if ensemble_size is not None:
                if score <= 0.001:
                    logging.error("Model only predicts at random: %s has score: %f",
                                  model_name, score)
                # If we have fewer models in our ensemble than ensemble_size,
                # add the current model if it is better than random
                elif len(scores_nbest) < ensemble_size:
                    scores_nbest.append(score)
                    indices_nbest.append(model_idx)
                    include_num_runs.append(num_run)
                    model_names.append(model_name)
                    num_runs.append(num_run)
                else:
                    # Take the worst performing model in our ensemble so far
                    idx = np.argmin(scores_nbest)

                    # If the current model is better than the worst model in
                    # our ensemble, replace the worst one with it
                    if scores_nbest[idx] < score:
                        logging.debug("Worst model in our ensemble: %s with "
                                      "score %f will be replaced by model %s "
                                      "with score %f",
                                      model_names[idx], scores_nbest[idx],
                                      model_name, score)
                        # Exclude the old model
                        del scores_nbest[idx]
                        scores_nbest.append(score)
                        del include_num_runs[idx]
                        del indices_nbest[idx]
                        indices_nbest.append(model_idx)
                        include_num_runs.append(num_run)
                        del model_names[idx]
                        model_names.append(model_name)
                        del num_runs[idx]
                        num_runs.append(num_run)

                    # Otherwise exclude the current model from the ensemble
                    else:
                        #include_num_runs.append(True)
                        pass

            else:
                # Load all predictions that are better than random
                if score <= 0.001:
                    logging.error("Model only predicts at random: %s has score: %f",
                                  model_name, score)
                else:
                    include_num_runs.append(num_run)

            model_idx += 1

        indices_to_model_names = dict()
        indices_to_run_num = dict()
        for i, model_name in enumerate(dir_ensemble_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                num_indices = len(indices_to_model_names)
                indices_to_model_names[num_indices] = model_name
                indices_to_run_num[num_indices] = num_run

        #logging.info("Indices to model names:")
        #logging.info(indices_to_model_names)

        #for i, item in enumerate(sorted(model_names_to_scores.items(),
        #                                key=lambda t: t[1])):
        #    logging.info("%d: %s", i, item)

        include_num_runs = set(include_num_runs)

        all_predictions_train = []
        for i, model_name in enumerate(dir_ensemble_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                predictions = np.load(os.path.join(dir_ensemble, model_name))
                all_predictions_train.append(predictions)

        all_predictions_valid = []
        for i, model_name in enumerate(dir_valid_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                predictions = np.load(os.path.join(dir_valid, model_name))
                all_predictions_valid.append(predictions)

        all_predictions_test = []
        for i, model_name in enumerate(dir_test_list):
            num_run = int(re_num_run.search(model_name).group(1))
            if num_run in include_num_runs:
                predictions = np.load(os.path.join(dir_test, model_name))
                all_predictions_test.append(predictions)

        if len(all_predictions_train) == len(all_predictions_test) == len(
                all_predictions_valid) == 0:
            logging.error("All models do just random guessing")
            time.sleep(2)
            used_time = watch.wall_elapsed("ensemble_builder")
            continue

        elif len(all_predictions_train) == 1:
            logging.debug("Only one model so far, so we just copy its predictions")
            ensemble_members_run_numbers = {0: 1.0}
            # With only one included model, the selected ensemble indices
            # are simply [0]; this keeps the saving code below working
            indices = np.array([0])

            # Output the score
            logging.info("Training performance: %f",
                         max(model_names_to_scores.values()))
        else:
            try:
                indices, trajectory = ensemble_selection(
                    np.array(all_predictions_train), true_labels,
                    ensemble_size, task_type, metric)

                logging.info("Trajectory and indices!")
                logging.info(trajectory)
                logging.info(indices)

            except ValueError as e:
                logging.error("Caught ValueError: " + str(e))
                used_time = watch.wall_elapsed("ensemble_builder")
                continue
            except Exception as e:
                logging.error("Caught error! %s", e.message)
                used_time = watch.wall_elapsed("ensemble_builder")
                continue

            # Output the score
            logging.info("Training performance: %f" % trajectory[-1])

            # Print the ensemble members:
            ensemble_members_run_numbers = dict()
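            # ensemble_selection presumably returns model indices with
            # repetition, so a model's weight is its relative frequency
            # among the selected indices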
            ensemble_members = Counter(indices).most_common()
            ensemble_members_string = "Ensemble members:\n"
            logging.info(ensemble_members)
            for ensemble_member in ensemble_members:
                weight = float(ensemble_member[1]) / len(indices)
                member_name = indices_to_model_names[ensemble_member[0]]
                ensemble_members_string += \
                    ("    %s; weight: %10f; performance: %10f\n" %
                     (member_name, weight, model_names_to_scores[member_name]))

                ensemble_members_run_numbers[indices_to_run_num[
                    ensemble_member[0]]] = weight
            logging.info(ensemble_members_string)

        # Save the ensemble indices for later use!
        filename_indices = os.path.join(indices_output_dir,
                                        str(index_run).zfill(5) + ".indices")

        logging.info(ensemble_members_run_numbers)
        with open(filename_indices, "wb") as fh:
            pickle.dump(ensemble_members_run_numbers, fh)

        # Save predictions for valid and test data set
        if len(dir_valid_list) == len(dir_ensemble_list):
            ensemble_predictions_valid = np.mean(
                np.array(all_predictions_valid)[indices.astype(int)], axis=0)
            filename_valid = os.path.join(
                output_dir,
                basename + '_valid_' + str(index_run).zfill(3) + '.predict')
            data_util.save_predictions(
                os.path.join(predictions_dir, filename_valid),
                ensemble_predictions_valid)
        else:
            logging.info("Could not find as many validation set predictions "
                         "as ensemble predictions!")

        if len(dir_test_list) == len(dir_ensemble_list):
            ensemble_predictions_test = np.mean(
                np.array(all_predictions_test)[indices.astype(int)], axis=0)
            filename_test = os.path.join(
                output_dir,
                basename + '_test_' + str(index_run).zfill(3) + '.predict')
            data_util.save_predictions(
                os.path.join(predictions_dir, filename_test),
                ensemble_predictions_test)
        else:
            logging.info("Could not find as many test set predictions as "
                         "ensemble predictions!")

        current_num_models = len(dir_ensemble_list)
        watch.stop_task("ensemble_iter_" + str(index_run))
        time_iter = watch.get_wall_dur("ensemble_iter_" + str(index_run))
        used_time = watch.wall_elapsed("ensemble_builder")
        index_run += 1
    return