Example No. 1
def eval_metrics(df, ensemble, labels, indices, final = False):
    predictions     = df[ensemble].mean(axis = 1)
    auc             = roc_auc_score(labels, predictions)
    brier           = mean_squared_error(labels, predictions)
    diversity       = average_diversity_score(df[ensemble].values)
    ensemble_size   = len(ensemble)
    ensemble        = ' '.join(ensemble) if final else ensemble[-1]
    return DataFrame({'auc': auc, 'brier': brier, 'diversity': diversity, 'ensemble': ensemble, 'ensemble_size': ensemble_size}, index = indices)
Example No. 2
def select_candidate_sdi(train_df, train_labels, best_classifiers, ensemble, i):
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values, min(max_candidates, len(best_classifiers)), replace = False)
        candidate_diversity_scores = [1 - abs(average_diversity_score(train_df[ensemble + [candidate]].values)) for candidate in candidates] # 1 - kappa so larger = more diverse
        candidate_scores = [accuracy_weight * best_classifiers.loc[candidate] + (1 - accuracy_weight) * candidate_diversity_scores[candidate_i] for candidate_i, candidate in enumerate(candidates)]
        best_candidate = candidates[array(candidate_scores).argmax()]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
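
The diversity module referenced above is not shown. Below is a minimal sketch of what average_diversity_score could look like, assuming it returns the mean pairwise Cohen's kappa of the binarized base-classifier predictions, consistent with the "1 - kappa" comment above; the 0.5 threshold and all names here are assumptions, not the original implementation.

from itertools import combinations
from numpy import mean
from sklearn.metrics import cohen_kappa_score

def average_diversity_score(predictions, threshold=0.5):
    # predictions: 2-D array with one column of probabilities per base classifier
    binarized = (predictions > threshold).astype(int)
    n_cols = binarized.shape[1]
    if n_cols < 2:
        return 0.0
    # mean pairwise Cohen's kappa; values near 0 indicate more diverse members
    kappas = [cohen_kappa_score(binarized[:, i], binarized[:, j])
              for i, j in combinations(range(n_cols), 2)]
    return mean(kappas)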
Example No. 3
def eval_metrics(df, labels, predictions, indices):
    auc = roc_auc_score(labels, predictions)
    brier = mean_squared_error(labels, predictions)
    diversity = average_diversity_score(df.values)
    return DataFrame({
        'auc': auc,
        'brier': brier,
        'diversity': diversity
    },
                     index=indices)
Example No. 4
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble, i):
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values, min(max_candidates, len(best_classifiers)), replace = False)
        candidate_diversity_scores = [abs(average_diversity_score(train_df[ensemble + [candidate]].values)) for candidate in candidates]
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[candidate_diversity_ranks[:max_diversity_candidates]]
        candidate_accuracy_scores = [roc_auc_score(train_labels, train_df[ensemble + [candidate]].mean(axis = 1)) for candidate in diversity_candidates]
        best_candidate = diversity_candidates[array(candidate_accuracy_scores).argmax()] # index the diversity-filtered pool the accuracy scores were computed over
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
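
For context, here is a minimal sketch of how a selection function such as select_candidate_drep could be driven by a greedy loop, together with the module-level settings the snippets rely on. The values and the build_ensemble/ensemble_count names are assumptions, not part of the original code; eval_metrics and the selection functions are assumed to be in scope.

from pandas import concat

# assumed module-level configuration used by the selection functions above
initial_ensemble_size = 2
max_candidates = 50
max_diversity_candidates = 5
accuracy_weight = 0.5
ensemble_count = 10

def build_ensemble(train_df, train_labels, best_classifiers, select_candidate):
    # greedily grow the ensemble one base classifier at a time,
    # recording validation metrics after every addition
    ensemble, metrics = [], []
    for i in range(ensemble_count):
        best_candidate = select_candidate(train_df, train_labels,
                                          best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        metrics.append(eval_metrics(train_df, ensemble, train_labels, [i]))
    return ensemble, concat(metrics)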
Example No. 5
def eval_cluster_metrics(x, labels, predictions, n_clusters, indices):
    auc = roc_auc_score(labels, predictions)
    brier = mean_squared_error(labels, predictions)
    diversity = average_diversity_score(x)
    return DataFrame(
        {
            'auc': auc,
            'brier': brier,
            'diversity': diversity,
            'n_clusters': n_clusters
        },
        index=indices)
Example No. 6
def eval_metrics(df, ensemble, labels, indices, final=False):
    predictions = df[ensemble].mean(axis=1)
    auc = roc_auc_score(labels, predictions)
    brier = mean_squared_error(labels, predictions)
    diversity = average_diversity_score(df[ensemble].values)
    ensemble_size = len(ensemble)
    ensemble = ' '.join(ensemble) if final else ensemble[-1]
    return DataFrame(
        {
            'auc': auc,
            'brier': brier,
            'diversity': diversity,
            'ensemble': ensemble,
            'ensemble_size': ensemble_size
        },
        index=indices)
Example No. 7
def select_candidate_sdi(train_df, train_labels, best_classifiers, ensemble,
                         i):
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_diversity_scores = [
            1 - abs(
                average_diversity_score(
                    train_df[ensemble + [candidate]].values))
            for candidate in candidates
        ]  # 1 - kappa so larger = more diverse
        candidate_scores = [
            accuracy_weight * best_classifiers.loc[candidate] +
            (1 - accuracy_weight) * candidate_diversity_scores[candidate_i]
            for candidate_i, candidate in enumerate(candidates)
        ]
        best_candidate = candidates[array(candidate_scores).argmax()]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
Example No. 8
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble,
                          i):
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        candidate_diversity_scores = [
            abs(
                average_diversity_score(train_df[ensemble +
                                                 [candidate]].values))
            for candidate in candidates
        ]
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[
            candidate_diversity_ranks[:max_diversity_candidates]]
        candidate_accuracy_scores = [
            roc_auc_score(train_labels,
                          train_df[ensemble + [candidate]].mean(axis=1))
            for candidate in diversity_candidates
        ]
        # index the diversity-filtered pool the accuracy scores were computed over
        best_candidate = diversity_candidates[
            array(candidate_accuracy_scores).argmax()]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
Example No. 9
from os import mkdir
from os.path import abspath, exists
from sys import argv

from common import load_properties
from diversity import average_diversity_score
from pandas import DataFrame, concat, read_csv
from sklearn.metrics import mean_squared_error, roc_auc_score

path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
p = load_properties(path)
fold_count = int(p['foldCount'])

dfs = []
for fold in range(fold_count):
    df = read_csv('%s/validation-%s.csv.gz' % (path, fold),
                  index_col=[0, 1],
                  compression='gzip')
    labels = df.index.get_level_values(1).values
    predictions = df.mean(axis=1)
    auc = roc_auc_score(labels, predictions)
    brier = mean_squared_error(labels, predictions)
    diversity = average_diversity_score(df.values)
    dfs.append(
        DataFrame({
            'auc': auc,
            'brier': brier,
            'diversity': diversity
        },
                  index=[fold]))
perf_df = concat(dfs)
perf_df.to_csv('%s/analysis/mean.csv' % path, index_label='fold')
print('%.3f' % perf_df.auc.mean())
Example No. 10
def eval_cluster_metrics(x, labels, predictions, n_clusters, indices):
    auc         = roc_auc_score(labels, predictions)
    brier       = mean_squared_error(labels, predictions)
    diversity   = average_diversity_score(x)
    return DataFrame({'auc': auc, 'brier': brier, 'diversity': diversity, 'n_clusters': n_clusters}, index = indices)
Example No. 11
def eval_metrics(df, labels, predictions, indices):
    auc         = roc_auc_score(labels, predictions)
    brier       = mean_squared_error(labels, predictions)
    diversity   = average_diversity_score(df.values)
    return DataFrame({'auc': auc, 'brier': brier, 'diversity': diversity}, index = indices)
Example No. 12
"""

from os import mkdir
from os.path import abspath, exists
from sys import argv

from common import load_properties
from diversity import average_diversity_score
from pandas import DataFrame, concat, read_csv
from sklearn.metrics import mean_squared_error, roc_auc_score

path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
p = load_properties(path)
fold_count = int(p['foldCount'])

dfs = []
for fold in range(fold_count):
    df          = read_csv('%s/validation-%s.csv.gz' % (path, fold), index_col = [0, 1], compression = 'gzip')
    labels      = df.index.get_level_values(1).values
    predictions = df.mean(axis = 1)
    auc         = roc_auc_score(labels, predictions)
    brier       = mean_squared_error(labels, predictions)
    diversity   = average_diversity_score(df.values)
    dfs.append(DataFrame({'auc': auc, 'brier': brier, 'diversity': diversity}, index = [fold]))
perf_df = concat(dfs)
perf_df.to_csv('%s/analysis/mean.csv' % path, index_label = 'fold')
print('%.3f' % perf_df.auc.mean())
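
load_properties is imported from the project's common module, which is not included here. A minimal sketch under the assumption that it parses a flat "key = value" properties file in the given directory; the default file name and the parsing rules are assumptions:

def load_properties(dirname, filename='weka.properties'):
    # read simple "key = value" lines, skipping blanks and comments
    properties = {}
    with open('%s/%s' % (dirname, filename)) as handle:
        for line in handle:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            key, _, value = line.partition('=')
            properties[key.strip()] = value.strip()
    return properties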