from numpy import array
from numpy.random import choice
from pandas import DataFrame
from sklearn.metrics import mean_squared_error, roc_auc_score
from diversity import average_diversity_score

# initial_ensemble_size, max_candidates, max_diversity_candidates and accuracy_weight are
# module-level selection parameters assumed to be defined elsewhere in this module.

def eval_metrics(df, ensemble, labels, indices, final = False):
    predictions = df[ensemble].mean(axis = 1)
    auc = roc_auc_score(labels, predictions)
    brier = mean_squared_error(labels, predictions)
    diversity = average_diversity_score(df[ensemble].values)
    ensemble_size = len(ensemble)
    # record the whole ensemble on the final call, otherwise just the classifier added this step
    ensemble = ' '.join(ensemble) if final else ensemble[-1]
    return DataFrame({'auc': auc, 'brier': brier, 'diversity': diversity, 'ensemble': ensemble, 'ensemble_size': ensemble_size}, index = indices)
def select_candidate_sdi(train_df, train_labels, best_classifiers, ensemble, i):
    if len(ensemble) >= initial_ensemble_size:
        # sample a pool of candidate classifiers without replacement
        candidates = choice(best_classifiers.index.values, min(max_candidates, len(best_classifiers)), replace = False)
        # 1 - |kappa| so larger = more diverse
        candidate_diversity_scores = [1 - abs(average_diversity_score(train_df[ensemble + [candidate]].values)) for candidate in candidates]
        # score each candidate by a weighted combination of its accuracy and its diversity contribution
        candidate_scores = [accuracy_weight * best_classifiers.loc[candidate] + (1 - accuracy_weight) * candidate_diversity_scores[candidate_i] for candidate_i, candidate in enumerate(candidates)]
        best_candidate = candidates[array(candidate_scores).argmax()]
    else:
        # seed the ensemble with the top-ranked classifiers
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
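# average_diversity_score is imported from the diversity module and its implementation is not
# shown in this section. The `1 - |kappa|` comment in select_candidate_sdi suggests it returns
# an average pairwise Cohen's kappa over the prediction columns (kappa near 0 = diverse,
# kappa near 1 = redundant). The sketch below is an illustration under that assumption, not
# the module's actual implementation; it thresholds probabilistic predictions at 0.5 before
# computing kappa.
def average_diversity_score_sketch(x, threshold = 0.5):
    from itertools import combinations
    from sklearn.metrics import cohen_kappa_score
    votes = (x > threshold).astype(int)
    pairs = list(combinations(range(votes.shape[1]), 2))
    if not pairs:
        return 0.0
    kappas = [cohen_kappa_score(votes[:, i], votes[:, j]) for i, j in pairs]
    return sum(kappas) / len(kappas)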
def eval_metrics(df, labels, predictions, indices):
    auc = roc_auc_score(labels, predictions)
    brier = mean_squared_error(labels, predictions)
    diversity = average_diversity_score(df.values)
    return DataFrame({'auc': auc, 'brier': brier, 'diversity': diversity}, index = indices)
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble, i):
    if len(ensemble) >= initial_ensemble_size:
        # sample a pool of candidate classifiers without replacement
        candidates = choice(best_classifiers.index.values, min(max_candidates, len(best_classifiers)), replace = False)
        # |kappa| of the ensemble with each candidate added; lower = more diverse
        candidate_diversity_scores = [abs(average_diversity_score(train_df[ensemble + [candidate]].values)) for candidate in candidates]
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[candidate_diversity_ranks[:max_diversity_candidates]]
        # among the most diverse candidates, keep the one whose addition maximizes ensemble AUC
        candidate_accuracy_scores = [roc_auc_score(train_labels, train_df[ensemble + [candidate]].mean(axis = 1)) for candidate in diversity_candidates]
        best_candidate = diversity_candidates[array(candidate_accuracy_scores).argmax()]
    else:
        # seed the ensemble with the top-ranked classifiers
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
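# The two selectors above are written for a greedy forward-selection loop. The sketch below
# shows one such loop under stated assumptions: best_classifiers is a pandas Series of
# validation AUCs indexed by classifier (column) name and sorted in descending order, the
# module-level selection parameters used by the selectors are already defined, and
# ensemble_max_size is a hypothetical stopping point. eval_metrics here refers to the
# eval_metrics(df, ensemble, labels, indices, final) variant defined at the top of this section.
def build_ensemble_sketch(train_df, train_labels, best_classifiers, select_candidate, ensemble_max_size = 50):
    from pandas import concat
    ensemble, metrics = [], []
    n_steps = min(ensemble_max_size, len(best_classifiers))
    for i in range(n_steps):
        # select_candidate is select_candidate_sdi or select_candidate_drep
        best_candidate = select_candidate(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        metrics.append(eval_metrics(train_df, ensemble, train_labels, [i], final = (i == n_steps - 1)))
    return ensemble, concat(metrics)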
def eval_cluster_metrics(x, labels, predictions, n_clusters, indices):
    auc = roc_auc_score(labels, predictions)
    brier = mean_squared_error(labels, predictions)
    diversity = average_diversity_score(x)
    return DataFrame({'auc': auc, 'brier': brier, 'diversity': diversity, 'n_clusters': n_clusters}, index = indices)
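# eval_cluster_metrics appears intended for a cluster-based selection pipeline. The sketch
# below is one hypothetical way to drive it and is not part of the original code: cluster the
# base classifier prediction columns with k-means, keep the most accurate classifier from each
# cluster, and evaluate the mean of the selected columns. The use of KMeans and the
# per-cluster selection rule are assumptions made only for illustration.
def eval_clustered_ensemble_sketch(df, labels, n_clusters, fold):
    from sklearn.cluster import KMeans
    # cluster classifiers (columns) by the similarity of their prediction vectors
    cluster_ids = KMeans(n_clusters = n_clusters).fit_predict(df.values.T)
    selected = []
    for cluster in range(n_clusters):
        members = df.columns[cluster_ids == cluster]
        # keep the most accurate member of each cluster
        selected.append(max(members, key = lambda m: roc_auc_score(labels, df[m])))
    predictions = df[selected].mean(axis = 1)
    return eval_cluster_metrics(df[selected].values, labels, predictions, n_clusters, [fold])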
""" from os import mkdir from os.path import abspath, exists from sys import argv from common import load_properties from diversity import average_diversity_score from pandas import DataFrame, concat, read_csv from sklearn.metrics import mean_squared_error, roc_auc_score path = abspath(argv[1]) assert exists(path) if not exists('%s/analysis' % path): mkdir('%s/analysis' % path) p = load_properties(path) fold_count = int(p['foldCount']) dfs = [] for fold in range(fold_count): df = read_csv('%s/validation-%s.csv.gz' % (path, fold), index_col = [0, 1], compression = 'gzip') labels = df.index.get_level_values(1).values predictions = df.mean(axis = 1) auc = roc_auc_score(labels, predictions) brier = mean_squared_error(labels, predictions) diversity = average_diversity_score(df.values) dfs.append(DataFrame({'auc': auc, 'brier': brier, 'diversity': diversity}, index = [fold])) perf_df = concat(dfs) perf_df.to_csv('%s/analysis/mean.csv' % path, index_label = 'fold') print '%.3f' % perf_df.auc.mean()