コード例 #1
0
def selection(fold, seedval, path, agg):
    """Greedy forward ensemble selection for one cross-validation fold.

    Ranks the base classifiers by their training-set fmax score, then
    repeatedly adds the candidate chosen by ``select_candidate_enhanced``,
    recording train/test performance at every ensemble size. Finally returns
    the test predictions of the prefix that performed best on training data.

    Parameters
    ----------
    fold : fold identifier passed to ``common.read_fold``.
    seedval : RNG seed (fed to ``seed``) so candidate selection is reproducible.
    path : data directory consumed by ``common.read_fold``.
    agg : bag-aggregation count passed to ``common.unbag``.

    Returns
    -------
    tuple of (predictions DataFrame, per-size test-performance DataFrame).
    """
    seed(seedval)
    # FIX: removed five unused local constants (initial_ensemble_size,
    # max_candidates, max_diversity_candidates, accuracy_weight,
    # max_clusters) — none were referenced anywhere in this function.
    max_ensemble_size = 50
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)
    # Rank base classifiers by training fmax; best-performing first.
    best_classifiers = train_df.apply(lambda x: common.fmax_score(
        train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate_enhanced(train_df, train_labels,
                                                   best_classifiers, ensemble,
                                                   i)
        ensemble.append(best_candidate)
        train_performance.append(
            get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(
            get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = pd.DataFrame.from_records(train_performance)
    # Choose the ensemble size that did best on training data, then keep the
    # corresponding prefix of selected classifiers.
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) +
                                                  1]
    return get_predictions(
        test_df, best_ensemble, fold,
        seedval), pd.DataFrame.from_records(test_performance)
コード例 #2
0
def selection(fold):
    """Greedy forward ensemble selection for one cross-validation fold.

    Relies on module globals ``seedval``, ``path`` and ``max_ensemble_size``.
    Ranks base classifiers by training ROC AUC, grows the ensemble one
    candidate at a time via ``select_candidate``, and evaluates the best
    training-set prefix on the test set.

    Returns a tuple of (final test metrics, per-size test metrics).
    """
    seed(seedval)
    # MultiIndex levels (fold, seed) used to tag every metrics record.
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # FIX: Series.order() was removed in pandas 0.20; sort_values() is the
    # supported equivalent (descending AUC, best classifier first).
    best_classifiers = train_df.apply(
        lambda x: roc_auc_score(train_labels, x)).sort_values(ascending=False)
    train_metrics = []
    test_metrics = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels,
                                          best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_metrics.append(
            eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(
            eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    # Best ensemble size judged on training metrics only; slice that prefix.
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    return eval_metrics(test_df,
                        best_ensemble,
                        test_labels,
                        indices,
                        final=True), concat(test_metrics)
コード例 #3
0
def stacked_selection(fold):
    """Sweep cluster counts for stacked selection on one CV fold.

    For each cluster count from 1 to ``max_clusters`` (module global), runs
    ``stack_function`` on the training data, records train and test cluster
    metrics, then re-evaluates the test set at the cluster count that scored
    best on the training data.

    Returns a tuple of (best-count test metrics, per-count test metrics).
    """
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # Classifier distance matrix: 1 - |correlation| of training outputs.
    distances = 1 - train_df.corr().abs()
    train_metrics, test_metrics = [], []
    for k in range(1, max_clusters + 1):
        tr_values, tr_preds = stack_function(k, distances, train_df,
                                             train_labels, train_df)
        te_values, te_preds = stack_function(k, distances, train_df,
                                             train_labels, test_df)
        train_metrics.append(
            eval_cluster_metrics(tr_values, train_labels, tr_preds, k,
                                 indices))
        test_metrics.append(
            eval_cluster_metrics(te_values, test_labels, te_preds, k,
                                 indices))
    # Pick the count with the best training-set performance and redo the
    # test-set evaluation at exactly that count.
    best_k = get_best_performer(concat(train_metrics)).n_clusters
    final_values, final_preds = stack_function(best_k, distances, train_df,
                                               train_labels, test_df)
    return eval_cluster_metrics(final_values, test_labels, final_preds,
                                best_k, indices), concat(test_metrics)
コード例 #4
0
ファイル: selection.py プロジェクト: GauravPandeyLab/datasink
def stacked_selection(fold):
    """Stacked selection over cluster counts for one CV fold.

    Uses module globals ``seedval``, ``path`` and ``max_clusters`` plus the
    ``common`` helpers. Evaluates ``stack_function`` at every cluster count,
    picks the count with the best training performance, and returns the test
    predictions at that count together with the per-count test performance.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # Classifier distance matrix: 1 - |correlation| of training outputs.
    distances = 1 - train_df.corr().abs()
    train_performance = []
    test_performance = []
    for k in range(1, max_clusters + 1):
        _, preds_on_train = stack_function(
            k, distances, train_df, train_labels, predict_df=train_df)
        _, preds_on_test = stack_function(
            k, distances, train_df, train_labels, predict_df=test_df)
        train_performance.append(
            get_cluster_performance(train_labels, preds_on_train, k, fold,
                                    seedval))
        test_performance.append(
            get_cluster_performance(test_labels, preds_on_test, k, fold,
                                    seedval))
    # Best count judged on training data only.
    best_cluster_size = common.get_best_performer(
        DataFrame.from_records(train_performance)).n_clusters.values
    test_values, test_predictions = stack_function(
        best_cluster_size, distances, train_df, train_labels,
        predict_df=test_df)
    return DataFrame({
        'fold': fold,
        'seed': seedval,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_values),
        'metric': common.score.__name__
    }), DataFrame.from_records(test_performance)
コード例 #5
0
ファイル: stacking.py プロジェクト: Web5design/datasink
def stacked_selection(fold):
    """Cluster-count sweep for stacked selection on a single CV fold.

    Depends on module globals ``seedval``, ``path`` and ``max_clusters``.
    Records train/test cluster metrics for every cluster count, then repeats
    the test evaluation at the count that performed best on training data.

    Returns a tuple of (best-count test metrics, per-count test metrics).
    """
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # Distance between classifiers: 1 - |correlation| of their train outputs.
    distances = 1 - train_df.corr().abs()
    train_metrics = []
    test_metrics = []
    for k in range(1, max_clusters + 1):
        tr_vals, tr_preds = stack_function(k, distances, train_df,
                                           train_labels, train_df)
        te_vals, te_preds = stack_function(k, distances, train_df,
                                           train_labels, test_df)
        train_metrics.append(
            eval_cluster_metrics(tr_vals, train_labels, tr_preds, k, indices))
        test_metrics.append(
            eval_cluster_metrics(te_vals, test_labels, te_preds, k, indices))
    # Choose the count with the best training metrics, then re-score the
    # test set at exactly that count.
    best_k = get_best_performer(concat(train_metrics)).n_clusters
    final_vals, final_preds = stack_function(best_k, distances, train_df,
                                             train_labels, test_df)
    return eval_cluster_metrics(final_vals, test_labels, final_preds, best_k,
                                indices), concat(test_metrics)
コード例 #6
0
ファイル: selection.py プロジェクト: GauravPandeyLab/datasink
def selection(fold):
    """Greedy forward ensemble selection for one cross-validation fold.

    Reads module globals ``seedval``, ``path`` and ``max_ensemble_size`` and
    the ``common`` helpers. Ranks classifiers by ``common.score`` on the
    training fold, greedily grows the ensemble with ``select_candidate``, and
    returns the test predictions for the best training-set ensemble size.

    Returns a tuple of (predictions DataFrame, test-performance DataFrame).
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # FIX: Series.order() was removed in pandas 0.20; sort_values() is the
    # supported replacement with identical semantics here.
    best_classifiers = train_df.apply(
        lambda x: common.score(train_labels, x)).sort_values(
            ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels,
                                          best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(
            get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(
            get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    # FIX: .values yields a one-element ndarray; take the scalar so the slice
    # below is an integer slice rather than array-based indexing (matches the
    # .item(0) usage in the other selection() variant in this file).
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values.item(0)
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size + 1]
    return get_predictions(test_df, best_ensemble, fold,
                           seedval), DataFrame.from_records(test_performance)
コード例 #7
0
ファイル: selection.py プロジェクト: Web5design/datasink
def selection(fold):
    """Greedy forward ensemble selection for one cross-validation fold.

    Relies on module globals ``seedval``, ``path`` and ``max_ensemble_size``.
    Ranks base classifiers by training ROC AUC (descending), grows the
    ensemble via ``select_candidate``, and evaluates the best training-set
    prefix on the test set.

    Returns a tuple of (final test metrics, per-size test metrics).
    """
    seed(seedval)
    # MultiIndex levels (fold, seed) used to tag every metrics record.
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # FIX: Series.order() was removed in pandas 0.20; sort_values() is the
    # supported equivalent.
    best_classifiers = train_df.apply(
        lambda x: roc_auc_score(train_labels, x)).sort_values(ascending=False)
    train_metrics = []
    test_metrics = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels,
                                          best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_metrics.append(
            eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(
            eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    # Best size judged on training metrics only; slice that prefix.
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    return eval_metrics(test_df, best_ensemble, test_labels, indices,
                        final=True), concat(test_metrics)
コード例 #8
0
ファイル: selection.py プロジェクト: shwhalen/datasink
def selection(fold):
    """Greedy forward ensemble selection for one cross-validation fold.

    Reads module globals ``seedval``, ``path`` and ``max_ensemble_size`` plus
    the ``common`` helpers. Ranks classifiers by ``common.score`` on the
    training fold, greedily grows the ensemble with ``select_candidate``, and
    returns the test predictions for the best training-set ensemble size.

    Returns a tuple of (predictions DataFrame, test-performance DataFrame).
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # FIX: Series.order() was removed in pandas 0.20; sort_values() is the
    # supported replacement with identical semantics here.
    best_classifiers = train_df.apply(lambda x: common.score(
        train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels,
                                          best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(
            get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(
            get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    # FIX: .values yields a one-element ndarray; take the scalar so the slice
    # below is an integer slice rather than array-based indexing (matches the
    # .item(0) usage in the other selection() variant in this file).
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values.item(0)
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size + 1]
    return get_predictions(test_df, best_ensemble, fold,
                           seedval), DataFrame.from_records(test_performance)
コード例 #9
0
ファイル: selection.py プロジェクト: shwhalen/datasink
def stacked_selection(fold):
    """Stacked selection sweep over cluster counts for one CV fold.

    Depends on module globals ``seedval``, ``path`` and ``max_clusters`` and
    the ``common`` helpers. Scores every cluster count on train and test
    data, selects the count with the best training performance, and returns
    the resulting test predictions plus the per-count test performance.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # Classifier distance matrix: 1 - |correlation| of training outputs.
    corr_distances = 1 - train_df.corr().abs()
    train_records, test_records = [], []
    for k in range(1, max_clusters + 1):
        _, preds_on_train = stack_function(k, corr_distances, train_df,
                                           train_labels, predict_df=train_df)
        _, preds_on_test = stack_function(k, corr_distances, train_df,
                                          train_labels, predict_df=test_df)
        train_records.append(
            get_cluster_performance(train_labels, preds_on_train, k, fold,
                                    seedval))
        test_records.append(
            get_cluster_performance(test_labels, preds_on_test, k, fold,
                                    seedval))
    # Best count judged on training data only; redo the test prediction at
    # exactly that count.
    best_cluster_size = common.get_best_performer(
        DataFrame.from_records(train_records)).n_clusters.values
    test_values, test_predictions = stack_function(best_cluster_size,
                                                   corr_distances, train_df,
                                                   train_labels,
                                                   predict_df=test_df)
    return DataFrame({
        'fold': fold,
        'seed': seedval,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_values),
        'metric': common.score.__name__
    }), DataFrame.from_records(test_records)