Example #1
def mean_aggregation(fold):
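    # Mean-rule ensemble: average the base-classifier columns of the test fold into a
    # single prediction per instance ('path' and the 'common' helpers are presumably
    # defined at module level).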
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    ids         = test_df.index.get_level_values('id')
    labels      = test_df.index.get_level_values('label')
    predictions = test_df.mean(axis = 1)
    diversity   = common.diversity_score(test_df.values)
    return DataFrame({'id': ids, 'label': labels, 'fold': fold, 'prediction': predictions, 'diversity': diversity})
Example #2
def stacked_generalization(fold):
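    # Stacked generalization: fit the meta-classifier ('stacker') on the training-fold base
    # predictions and score the held-out fold; 'method' and 'bag_count' are presumably
    # module-level settings.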
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    if method == 'aggregate':
        train_df = common.unbag(train_df, bag_count)
        test_df = common.unbag(test_df, bag_count)
    test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return DataFrame({'fold': fold, 'id': test_df.index.get_level_values('id'), 'label': test_labels, 'prediction': test_predictions, 'diversity': common.diversity_score(test_df.values)})
Example #3
def selection(fold):
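    # Greedy ensemble selection: rank base classifiers by training AUC, then grow the
    # ensemble one candidate at a time while recording train/test metrics at each size.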
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    best_classifiers = train_df.apply(
        lambda x: roc_auc_score(train_labels, x)).sort_values(ascending=False)
    train_metrics = []
    test_metrics = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels,
                                          best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_metrics.append(
            eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(
            eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    return eval_metrics(test_df,
                        best_ensemble,
                        test_labels,
                        indices,
                        final=True), concat(test_metrics)
Example #4
def selection(fold, seedval, path, agg):
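    # Greedy forward selection again, with its inputs passed explicitly and ranking by
    # fmax; the constants below bound the search (ensemble size, candidate pools,
    # accuracy/diversity weighting).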
    seed(seedval)
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)
    best_classifiers = train_df.apply(lambda x: common.fmax_score(
        train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate_enhanced(train_df, train_labels,
                                                   best_classifiers, ensemble,
                                                   i)
        ensemble.append(best_candidate)
        train_performance.append(
            get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(
            get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = pd.DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) +
                                                  1]
    return get_predictions(
        test_df, best_ensemble, fold,
        seedval), pd.DataFrame.from_records(test_performance)
Example #5
def stacked_selection(fold):
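    # Cluster-based stacking: a correlation-distance matrix over the base classifiers is
    # handed to stack_function for each candidate cluster count, and the count that scores
    # best on the training fold is kept.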
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    train_distances = 1 - train_df.corr().abs()
    train_metrics = []
    test_metrics = []
    for n_clusters in range(1, max_clusters + 1):
        train_values, train_predictions = stack_function(
            n_clusters, train_distances, train_df, train_labels, train_df)
        test_values, test_predictions = stack_function(n_clusters,
                                                       train_distances,
                                                       train_df, train_labels,
                                                       test_df)
        train_metrics.append(
            eval_cluster_metrics(train_values, train_labels, train_predictions,
                                 n_clusters, indices))
        test_metrics.append(
            eval_cluster_metrics(test_values, test_labels, test_predictions,
                                 n_clusters, indices))
    best_cluster_size = get_best_performer(concat(train_metrics)).n_clusters
    test_values, test_predictions = stack_function(best_cluster_size,
                                                   train_distances, train_df,
                                                   train_labels, test_df)
    return eval_cluster_metrics(test_values, test_labels, test_predictions,
                                best_cluster_size,
                                indices), concat(test_metrics)
Example #6
def stacked_generalization(fold):
    seed(seedval)
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    if method == 'aggregate':
        train_df = unbag(train_df, bag_count)
        test_df = unbag(test_df, bag_count)
    predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return eval_metrics(test_df, test_labels, predictions, [[fold], [seedval]])
Example #7
def stacked_generalization(fold):
    seed(seedval)
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    if method == 'aggregate':
        train_df = unbag(train_df, bag_count)
        test_df = unbag(test_df, bag_count)
    predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:,
                                                                             1]
    return eval_metrics(test_df, test_labels, predictions, [[fold], [seedval]])
Example #8
def stacked_generalization(path, stacker_name, stacker, fold, agg):
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)
    try:
        test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    except AttributeError:
        # Fall back to predict() for stackers without predict_proba; it returns a 1-D array.
        test_predictions = stacker.fit(train_df, train_labels).predict(test_df)
    df = pd.DataFrame({'fold': fold, 'id': test_df.index.get_level_values('id'), 'label': test_labels, 'prediction': test_predictions, 'diversity': common.diversity_score(test_df.values)})
    return df
Example #9
def mean_aggregation(fold):
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    ids = test_df.index.get_level_values('id')
    labels = test_df.index.get_level_values('label')
    predictions = test_df.mean(axis=1)
    diversity = common.diversity_score(test_df.values)
    return DataFrame({
        'id': ids,
        'label': labels,
        'fold': fold,
        'prediction': predictions,
        'diversity': diversity
    })
Example #10
def bestbase_fmax(path, fold_count=5, agg=1):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
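    # Pool the unbagged test-fold predictions across folds and report the best
    # per-classifier (per-column) fmax score.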
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, agg)
        predictions.append(test_df)
        labels = append(labels, label)
    predictions = pd.concat(predictions)
    fmax_list = [common.fmax_score(labels,predictions.iloc[:,i]) for i in range(len(predictions.columns))]
    return max(fmax_list)
Example #11
def stacked_selection(fold):
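    # Cluster-sweep stacking that returns per-instance test predictions for the best
    # cluster count, together with the per-cluster performance records.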
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_distances = 1 - train_df.corr().abs()
    train_performance = []
    test_performance = []
    for n_clusters in range(1, max_clusters + 1):
        train_values, train_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, predict_df = train_df)
        test_values, test_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, predict_df = test_df)
        train_performance.append(get_cluster_performance(train_labels, train_predictions, n_clusters, fold, seedval))
        test_performance.append(get_cluster_performance(test_labels, test_predictions, n_clusters, fold, seedval))
    best_cluster_size = common.get_best_performer(DataFrame.from_records(train_performance)).n_clusters.values
    test_values, test_predictions = stack_function(best_cluster_size, train_distances, train_df, train_labels, predict_df = test_df)
    return DataFrame({'fold': fold, 'seed': seedval, 'id': test_df.index.get_level_values('id'), 'label': test_labels, 'prediction': test_predictions, 'diversity': common.diversity_score(test_values), 'metric': common.score.__name__}), DataFrame.from_records(test_performance)
Example #12
def mean_fmax(path, fold_count=5, agg=1):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
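    # Mean-rule predictions pooled across all folds, then scored once with fmax.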
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, agg)
        predict = test_df.mean(axis=1).values
        predictions = append(predictions, predict)
        labels = append(labels, label)
    fmax = '%.3f' % common.fmax_score(labels, predictions)
    return float(fmax)
Example #13
def stacked_selection(fold):
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    train_distances = 1 - train_df.corr().abs()
    train_metrics = []
    test_metrics = []
    for n_clusters in range(1, max_clusters + 1):
        train_values, train_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, train_df)
        test_values, test_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, test_df)
        train_metrics.append(eval_cluster_metrics(train_values, train_labels, train_predictions, n_clusters, indices))
        test_metrics.append(eval_cluster_metrics(test_values, test_labels, test_predictions, n_clusters, indices))
    best_cluster_size = get_best_performer(concat(train_metrics)).n_clusters
    test_values, test_predictions = stack_function(best_cluster_size, train_distances, train_df, train_labels, test_df)
    return eval_cluster_metrics(test_values, test_labels, test_predictions, best_cluster_size, indices), concat(test_metrics)
Example #14
def mean_fmax(path):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predict = test_df.mean(axis=1).values
        predictions += predict
        labels += label
    fmax = '%.3f' % (common.fmax_score(labels, predictions))
    return fmax
Example #15
def selection(fold):
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    best_classifiers = train_df.apply(lambda x: common.score(train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), DataFrame.from_records(test_performance)
Example #16
def selection(fold):
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    best_classifiers = train_df.apply(lambda x: roc_auc_score(train_labels, x)).sort_values(ascending=False)
    train_metrics = []
    test_metrics = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_metrics.append(eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    return eval_metrics(test_df, best_ensemble, test_labels, indices, final = True), concat(test_metrics)
Example #17
def bestbase_fmax(path):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predictions.append(test_df)
        labels += label
    predictions = concat(predictions)
    fmax_list = [
        common.fmax_score(labels, predictions[col].tolist())
        for col in list(predictions)
    ]
    return max(fmax_list)
Example #18
def selection(fold):
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    best_classifiers = train_df.apply(lambda x: common.score(
        train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels,
                                          best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(
            get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(
            get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size + 1]
    return get_predictions(test_df, best_ensemble, fold,
                           seedval), DataFrame.from_records(test_performance)
Example #19
def stacked_selection(fold):
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_distances = 1 - train_df.corr().abs()
    train_performance = []
    test_performance = []
    for n_clusters in range(1, max_clusters + 1):
        train_values, train_predictions = stack_function(n_clusters,
                                                         train_distances,
                                                         train_df,
                                                         train_labels,
                                                         predict_df=train_df)
        test_values, test_predictions = stack_function(n_clusters,
                                                       train_distances,
                                                       train_df,
                                                       train_labels,
                                                       predict_df=test_df)
        train_performance.append(
            get_cluster_performance(train_labels, train_predictions,
                                    n_clusters, fold, seedval))
        test_performance.append(
            get_cluster_performance(test_labels, test_predictions, n_clusters,
                                    fold, seedval))
    best_cluster_size = common.get_best_performer(
        DataFrame.from_records(train_performance)).n_clusters.values
    test_values, test_predictions = stack_function(best_cluster_size,
                                                   train_distances,
                                                   train_df,
                                                   train_labels,
                                                   predict_df=test_df)
    return DataFrame({
        'fold': fold,
        'seed': seedval,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_values),
        'metric': common.score.__name__
    }), DataFrame.from_records(test_performance)