Beispiel #1
0
def mean_aggregation(fold):
    """Build the mean-aggregation prediction table for one fold.

    The fold is loaded with ``common.read_fold(path, fold)``; only the third
    of the four returned values is used.  Each row's prediction is the mean
    over that frame's columns, and ``common.diversity_score`` is evaluated on
    the frame's raw values.

    Args:
        fold: fold identifier forwarded to ``common.read_fold`` and copied
            into the ``fold`` column of the result.

    Returns:
        DataFrame built from the keys ``id``, ``label``, ``fold``,
        ``prediction`` and ``diversity``.
    """
    # read_fold yields four values; the other three are not needed here.
    _, _, held_out, _ = common.read_fold(path, fold)
    row_index = held_out.index
    columns = {
        'id': row_index.get_level_values('id'),
        'label': row_index.get_level_values('label'),
        'fold': fold,
        'prediction': held_out.mean(axis=1),
        'diversity': common.diversity_score(held_out.values),
    }
    return DataFrame(columns)
Beispiel #2
0
def select_candidate_sdi(train_df, train_labels, best_classifiers, ensemble, i):
    """Choose the next ensemble member by a weighted accuracy/diversity score.

    Until ``ensemble`` holds ``initial_ensemble_size`` members, the ``i``-th
    label of ``best_classifiers.index`` is returned.  Afterwards up to
    ``max_candidates`` labels are drawn from that index with
    ``choice(..., replace=False)`` and the one whose combined score is picked
    by ``common.argbest`` is returned.

    Args:
        train_df: frame whose columns are selected by classifier label.
        train_labels: not used by this selector.
        best_classifiers: label-indexed scores; ``best_classifiers.loc[label]``
            supplies the accuracy term of the combined score.
        ensemble: list of labels already selected.
        i: position into ``best_classifiers.index`` used during the initial
            phase.

    Returns:
        The label of the chosen classifier.
    """
    if len(ensemble) < initial_ensemble_size:
        return best_classifiers.index.values[i]

    candidates = choice(
        best_classifiers.index.values,
        min(max_candidates, len(best_classifiers)),
        replace=False,
    )
    # 1 - kappa so larger = more diverse
    candidate_diversity_scores = [
        1 - abs(common.diversity_score(train_df[ensemble + [candidate]].values))
        for candidate in candidates
    ]
    # `.loc` replaces the `.ix` indexer, which pandas deprecated in 0.20 and
    # removed in 1.0; candidates are index labels, so label lookup is correct.
    candidate_scores = [
        accuracy_weight * best_classifiers.loc[candidate]
        + (1 - accuracy_weight) * diversity
        for candidate, diversity in zip(candidates, candidate_diversity_scores)
    ]
    return candidates[common.argbest(candidate_scores)]
Beispiel #3
0
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble, i):
    """Choose the next ensemble member: shortlist by diversity, pick by accuracy.

    Until ``ensemble`` holds ``initial_ensemble_size`` members, the ``i``-th
    label of ``best_classifiers.index`` is returned.  Afterwards up to
    ``max_candidates`` labels are drawn with ``choice(..., replace=False)``,
    the ``max_diversity_candidates`` with the smallest absolute
    ``common.diversity_score`` are shortlisted, and the shortlisted label
    whose mean-aggregated ensemble is picked by ``common.argbest`` over
    ``common.score`` is returned.

    Args:
        train_df: frame whose columns are selected by classifier label.
        train_labels: first argument handed to ``common.score``.
        best_classifiers: object whose ``index`` provides the candidate labels.
        ensemble: list of labels already selected.
        i: position into ``best_classifiers.index`` used during the initial
            phase.

    Returns:
        The label of the chosen classifier.
    """
    if len(ensemble) < initial_ensemble_size:
        return best_classifiers.index.values[i]

    candidates = choice(
        best_classifiers.index.values,
        min(max_candidates, len(best_classifiers)),
        replace=False,
    )
    candidate_diversity_scores = [
        abs(common.diversity_score(train_df[ensemble + [candidate]].values))
        for candidate in candidates
    ]
    # argsort orders ascending, so the slice keeps the smallest scores.
    candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
    diversity_candidates = candidates[candidate_diversity_ranks[:max_diversity_candidates]]
    candidate_accuracy_scores = [
        common.score(train_labels, train_df[ensemble + [candidate]].mean(axis=1))
        for candidate in diversity_candidates
    ]
    # The accuracy scores are aligned with `diversity_candidates`, so the
    # winning position has to be resolved against that shortlist; indexing the
    # unsorted `candidates` array here returned an unrelated classifier.
    return diversity_candidates[common.argbest(candidate_accuracy_scores)]
Beispiel #4
0
def mean_aggregation(fold):
    """Return per-row mean predictions for the test frame of ``fold``.

    Loads the fold via ``common.read_fold(path, fold)`` and works only on the
    third returned value.  The ``id`` and ``label`` columns come from the
    index levels of the same names, ``prediction`` is the row-wise mean of
    the frame and ``diversity`` is ``common.diversity_score`` of its values.

    Args:
        fold: fold identifier; also stored in the ``fold`` column.

    Returns:
        DataFrame built from the keys ``id``, ``label``, ``fold``,
        ``prediction`` and ``diversity``.
    """
    fold_data = common.read_fold(path, fold)
    # Four values are expected; only the test frame is consumed below.
    _train_df, _train_labels, test_frame, _test_labels = fold_data

    sample_ids = test_frame.index.get_level_values('id')
    sample_labels = test_frame.index.get_level_values('label')
    row_means = test_frame.mean(axis=1)
    diversity_value = common.diversity_score(test_frame.values)

    result = DataFrame({
        'id': sample_ids,
        'label': sample_labels,
        'fold': fold,
        'prediction': row_means,
        'diversity': diversity_value
    })
    return result
Beispiel #5
0
def stacked_generalization(stacker, fold, bag_count=10):
    """Fit ``stacker`` on one fold's training data and score its test data.

    Both frames returned by ``common.read_fold(path, fold)`` are passed
    through ``common.unbag`` before fitting and predicting, and the
    diversity score is computed on the unbagged test frame.

    Args:
        stacker: estimator whose ``fit(X, y)`` returns an object exposing
            ``predict_proba``; column 1 of those probabilities is kept.
        fold: fold identifier forwarded to ``common.read_fold`` and stored in
            the ``fold`` column.
        bag_count: second argument handed to ``common.unbag`` for both
            frames.  Defaults to 10, the value that was previously
            hard-coded, so existing two-argument callers are unaffected.

    Returns:
        DataFrame built from the keys ``fold``, ``id``, ``label``,
        ``prediction`` and ``diversity``.
    """
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, bag_count)
    test_df = common.unbag(test_df, bag_count)

    fitted = stacker.fit(train_df, train_labels)
    test_predictions = fitted.predict_proba(test_df)[:, 1]

    return DataFrame({
        'fold': fold,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_df.values),
    })
Beispiel #6
0
def get_predictions(df, ensemble, fold, seedval):
    """Assemble the prediction table for the given ensemble columns of ``df``.

    Args:
        df: frame whose index has ``id`` and ``label`` levels and whose
            columns include every entry of ``ensemble``.
        ensemble: column labels to aggregate; its length is reported as
            ``ensemble_size``.
        fold: value stored in the ``fold`` column.
        seedval: value stored in the ``seed`` column.

    Returns:
        DataFrame built from the keys ``fold``, ``seed``, ``id``, ``label``,
        ``prediction`` (row-wise mean of the selected columns),
        ``diversity`` (``common.diversity_score`` of their values) and
        ``ensemble_size``.
    """
    row_index = df.index
    members = df[ensemble]
    table = {
        'fold': fold,
        'seed': seedval,
        'id': row_index.get_level_values('id'),
        'label': row_index.get_level_values('label'),
        'prediction': members.mean(axis=1),
        'diversity': common.diversity_score(members.values),
        'ensemble_size': len(ensemble),
    }
    return DataFrame(table)
Beispiel #7
0
def select_candidate_sdi(train_df, train_labels, best_classifiers, ensemble,
                         i):
    """Select the next classifier using a blended accuracy/diversity score.

    While ``ensemble`` has fewer than ``initial_ensemble_size`` entries the
    ``i``-th label of ``best_classifiers.index`` is returned as-is.  Once the
    ensemble is large enough, up to ``max_candidates`` labels are sampled
    with ``choice(..., replace=False)`` and scored as::

        accuracy_weight * best_classifiers.loc[candidate]
        + (1 - accuracy_weight) * (1 - abs(diversity_score))

    where the diversity score is ``common.diversity_score`` of the current
    ensemble's columns plus the candidate's column.

    Args:
        train_df: frame whose columns are selected by classifier label.
        train_labels: not used by this selector.
        best_classifiers: label-indexed scores used for the accuracy term.
        ensemble: list of labels already selected.
        i: position into ``best_classifiers.index`` for the initial phase.

    Returns:
        The label chosen via ``common.argbest`` over the blended scores, or
        the ``i``-th index label during the initial phase.
    """
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_scores = []
        for candidate in candidates:
            trial_values = train_df[ensemble + [candidate]].values
            # 1 - kappa so larger = more diverse
            diversity_term = 1 - abs(common.diversity_score(trial_values))
            # `.loc` instead of `.ix`: the `.ix` indexer was deprecated in
            # pandas 0.20 and removed in 1.0, and candidates are index labels.
            accuracy_term = best_classifiers.loc[candidate]
            candidate_scores.append(accuracy_weight * accuracy_term +
                                    (1 - accuracy_weight) * diversity_term)
        best_candidate = candidates[common.argbest(candidate_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
Beispiel #8
0
def stacked_selection(fold):
    """Run ``stack_function`` for every cluster count and report the best one.

    After seeding with ``seedval``, the fold is loaded and a distance matrix
    ``1 - |corr|`` is derived from the training frame.  For each cluster
    count from 1 to ``max_clusters`` the stack is evaluated on both the
    training and the test frame and the results are summarised with
    ``get_cluster_performance``.  The cluster count selected by
    ``common.get_best_performer`` on the training summaries is then used for
    a final run on the test frame.

    Args:
        fold: fold identifier forwarded to ``common.read_fold`` and recorded
            in the outputs.

    Returns:
        A 2-tuple: the prediction DataFrame for the selected cluster count
        (keys ``fold``, ``seed``, ``id``, ``label``, ``prediction``,
        ``diversity``, ``metric``) and a DataFrame of the per-cluster-count
        test performance records.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_distances = 1 - train_df.corr().abs()

    def run_stack(cluster_count, target_df):
        # Every call shares the training inputs; only the cluster count and
        # the frame to predict on vary.
        return stack_function(cluster_count,
                              train_distances,
                              train_df,
                              train_labels,
                              predict_df=target_df)

    train_performance, test_performance = [], []
    for n_clusters in range(1, max_clusters + 1):
        _, train_predictions = run_stack(n_clusters, train_df)
        _, test_predictions = run_stack(n_clusters, test_df)
        train_record = get_cluster_performance(train_labels,
                                               train_predictions, n_clusters,
                                               fold, seedval)
        train_performance.append(train_record)
        test_record = get_cluster_performance(test_labels, test_predictions,
                                              n_clusters, fold, seedval)
        test_performance.append(test_record)

    train_summary = DataFrame.from_records(train_performance)
    # NOTE(review): `.values` yields an array rather than a scalar cluster
    # count; confirm stack_function accepts that.
    best_cluster_size = common.get_best_performer(
        train_summary).n_clusters.values
    test_values, test_predictions = run_stack(best_cluster_size, test_df)

    predictions_frame = DataFrame({
        'fold': fold,
        'seed': seedval,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_values),
        'metric': common.score.__name__
    })
    return predictions_frame, DataFrame.from_records(test_performance)
Beispiel #9
0
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble,
                          i):
    """Select the next classifier: diversity shortlist first, then accuracy.

    While ``ensemble`` has fewer than ``initial_ensemble_size`` entries the
    ``i``-th label of ``best_classifiers.index`` is returned as-is.  Once the
    ensemble is large enough:

    1. up to ``max_candidates`` labels are sampled with
       ``choice(..., replace=False)``;
    2. the ``max_diversity_candidates`` with the smallest absolute
       ``common.diversity_score`` (ensemble plus candidate) form a shortlist;
    3. each shortlisted candidate is scored with ``common.score`` on the
       row-wise mean of ensemble plus candidate, and ``common.argbest``
       picks the winner.

    Args:
        train_df: frame whose columns are selected by classifier label.
        train_labels: first argument handed to ``common.score``.
        best_classifiers: object whose ``index`` provides candidate labels.
        ensemble: list of labels already selected.
        i: position into ``best_classifiers.index`` for the initial phase.

    Returns:
        The label of the chosen classifier.
    """
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_diversity_scores = [
            abs(common.diversity_score(train_df[ensemble +
                                                [candidate]].values))
            for candidate in candidates
        ]
        # argsort orders ascending, so the slice keeps the smallest scores.
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[
            candidate_diversity_ranks[:max_diversity_candidates]]
        candidate_accuracy_scores = [
            common.score(train_labels,
                         train_df[ensemble + [candidate]].mean(axis=1))
            for candidate in diversity_candidates
        ]
        # Positions in candidate_accuracy_scores refer to the shortlist, not
        # to the original sample, so the winner is looked up in
        # diversity_candidates (indexing `candidates` picked the wrong one).
        best_candidate = diversity_candidates[common.argbest(
            candidate_accuracy_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
Beispiel #10
0
def stacked_generalization(fold):
    """Fit the module-level ``stacker`` on one fold and score its test frame.

    When the module-level ``method`` equals ``'aggregate'``, both frames are
    first passed through ``common.unbag`` with ``bag_count``.  Column 1 of
    the fitted stacker's ``predict_proba`` output becomes the prediction.

    Args:
        fold: fold identifier forwarded to ``common.read_fold`` and stored in
            the ``fold`` column.

    Returns:
        DataFrame built from the keys ``fold``, ``id``, ``label``,
        ``prediction`` and ``diversity``.
    """
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    if method == 'aggregate':
        train_df, test_df = common.unbag(train_df, bag_count), common.unbag(test_df, bag_count)

    fitted_stacker = stacker.fit(train_df, train_labels)
    class_probabilities = fitted_stacker.predict_proba(test_df)
    test_predictions = class_probabilities[:, 1]

    columns = {
        'fold': fold,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_df.values),
    }
    return DataFrame(columns)
Beispiel #11
0
def get_predictions(df, ensemble, fold, seedval):
    """Summarise the mean prediction of the ``ensemble`` columns of ``df``.

    Args:
        df: frame whose index has ``id`` and ``label`` levels and whose
            columns include every entry of ``ensemble``.
        ensemble: column labels to aggregate.
        fold: value stored in the ``fold`` column.
        seedval: value stored in the ``seed`` column.

    Returns:
        DataFrame built from the keys ``fold``, ``seed``, ``id``, ``label``,
        ``prediction``, ``diversity`` and ``ensemble_size``.
    """
    selected = df[ensemble]
    result = DataFrame({
        'fold': fold,
        'seed': seedval,
        'id': df.index.get_level_values('id'),
        'label': df.index.get_level_values('label'),
        # Row-wise mean over the selected classifier columns.
        'prediction': selected.mean(axis=1),
        'diversity': common.diversity_score(selected.values),
        'ensemble_size': len(ensemble),
    })
    return result
Beispiel #12
0
def stacked_selection(fold):
    """Sweep cluster counts with ``stack_function`` and keep the best one.

    Seeds with ``seedval``, loads the fold, and derives ``1 - |corr|`` of the
    training frame as the distance matrix.  Each cluster count from 1 to
    ``max_clusters`` is evaluated on the training and test frames; the count
    chosen by ``common.get_best_performer`` on the training records drives a
    final run on the test frame.

    Args:
        fold: fold identifier forwarded to ``common.read_fold`` and recorded
            in the outputs.

    Returns:
        A 2-tuple of DataFrames: the predictions for the selected cluster
        count, and the per-cluster-count test performance records.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_distances = 1 - train_df.corr().abs()
    # Positional arguments shared by every stack_function call below.
    shared_args = (train_distances, train_df, train_labels)

    train_performance = []
    test_performance = []
    for n_clusters in range(1, max_clusters + 1):
        _, train_predictions = stack_function(n_clusters, *shared_args, predict_df=train_df)
        _, test_predictions = stack_function(n_clusters, *shared_args, predict_df=test_df)
        train_performance.append(
            get_cluster_performance(train_labels, train_predictions, n_clusters, fold, seedval))
        test_performance.append(
            get_cluster_performance(test_labels, test_predictions, n_clusters, fold, seedval))

    best_performer = common.get_best_performer(DataFrame.from_records(train_performance))
    # NOTE(review): `.values` yields an array rather than a scalar cluster
    # count; confirm stack_function accepts that.
    best_cluster_size = best_performer.n_clusters.values
    test_values, test_predictions = stack_function(best_cluster_size, *shared_args, predict_df=test_df)

    predictions = DataFrame({
        'fold': fold,
        'seed': seedval,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_values),
        'metric': common.score.__name__,
    })
    performance = DataFrame.from_records(test_performance)
    return predictions, performance