from os import mkdir
from os.path import exists
from random import seed

import numpy as np
import pandas as pd
from numpy import append
from pandas import DataFrame, concat
from sklearn.metrics import roc_auc_score

import common

# Note: several functions below reference module-level names (path, seedval,
# method, stacker, bag_count, max_ensemble_size, max_clusters, stack_function,
# eval_metrics, get_performance, ...) that are set by the calling scripts and
# are not defined in this file.


def mean_aggregation(fold):
    # Average the base classifier predictions for each test instance and
    # report the ensemble's diversity alongside them.
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    ids = test_df.index.get_level_values('id')
    labels = test_df.index.get_level_values('label')
    predictions = test_df.mean(axis=1)
    diversity = common.diversity_score(test_df.values)
    return DataFrame({'id': ids, 'label': labels, 'fold': fold,
                      'prediction': predictions, 'diversity': diversity})
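
# A minimal sketch of how mean_aggregation might be driven, assuming the usual
# cross-validation layout of this codebase; `fold_count` and the concatenation
# step are illustrative, not taken from this file.
def run_mean_aggregation(fold_count=5):
    # One DataFrame per fold, stacked into a single prediction table.
    return concat([mean_aggregation(fold) for fold in range(fold_count)])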

def stacked_generalization(fold):
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    if method == 'aggregate':
        train_df = common.unbag(train_df, bag_count)
        test_df = common.unbag(test_df, bag_count)
    test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return DataFrame({'fold': fold,
                      'id': test_df.index.get_level_values('id'),
                      'label': test_labels,
                      'prediction': test_predictions,
                      'diversity': common.diversity_score(test_df.values)})

def selection(fold):
    # Greedy forward ensemble selection, ranking base classifiers by AUC.
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # Rank base classifiers by training AUC, best first.
    # Series.order() was removed from pandas; sort_values() is the replacement.
    best_classifiers = train_df.apply(lambda x: roc_auc_score(train_labels, x)).sort_values(ascending=False)
    train_metrics = []
    test_metrics = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_metrics.append(eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    # Keep the ensemble size that performed best on the training folds.
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    return eval_metrics(test_df, best_ensemble, test_labels, indices, final=True), concat(test_metrics)

def selection(fold, seedval, path, agg):
    # Greedy forward ensemble selection with fixed tuning constants (some of
    # these are consumed by the selection helpers in the full codebase).
    seed(seedval)
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)
    # Rank base classifiers by training F-max, best first.
    best_classifiers = train_df.apply(lambda x: common.fmax_score(train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate_enhanced(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = pd.DataFrame.from_records(train_performance)
    # Keep the ensemble size that performed best on the training folds.
    best_ensemble_size = common.get_best_performer(train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), pd.DataFrame.from_records(test_performance)
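
# `select_candidate_enhanced` is defined elsewhere. A minimal sketch, assuming
# the usual enhanced-CES rule: keep the top-ranked models for the initial
# ensemble, then score a random subset of candidates by how well the
# mean-aggregated ensemble performs with each one added. Hypothetical; the
# project's real rule may differ (e.g., by weighting in diversity).
def select_candidate_enhanced_sketch(train_df, train_labels, best_classifiers,
                                     ensemble, i, initial_ensemble_size=2,
                                     max_candidates=50):
    if i < initial_ensemble_size:
        # Seed the ensemble with the i-th best classifier.
        return best_classifiers.index.values[i]
    candidates = np.random.choice(best_classifiers.index.values,
                                  min(max_candidates, len(best_classifiers)),
                                  replace=False)
    scores = [common.fmax_score(train_labels,
                                train_df[list(ensemble) + [c]].mean(axis=1))
              for c in candidates]
    return candidates[np.argmax(scores)]  # assumes greater is better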

def stacked_selection(fold):
    # Cluster base classifiers by prediction correlation, stack per cluster,
    # and pick the cluster count that performs best on the training folds.
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # Distance between classifiers: 1 - |correlation| of their predictions.
    train_distances = 1 - train_df.corr().abs()
    train_metrics = []
    test_metrics = []
    for n_clusters in range(1, max_clusters + 1):
        train_values, train_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, train_df)
        test_values, test_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, test_df)
        train_metrics.append(eval_cluster_metrics(train_values, train_labels, train_predictions, n_clusters, indices))
        test_metrics.append(eval_cluster_metrics(test_values, test_labels, test_predictions, n_clusters, indices))
    best_cluster_size = get_best_performer(concat(train_metrics)).n_clusters
    test_values, test_predictions = stack_function(best_cluster_size, train_distances, train_df, train_labels, test_df)
    return eval_cluster_metrics(test_values, test_labels, test_predictions, best_cluster_size, indices), concat(test_metrics)

def stacked_generalization(fold):
    seed(seedval)
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    if method == 'aggregate':
        train_df = unbag(train_df, bag_count)
        test_df = unbag(test_df, bag_count)
    predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return eval_metrics(test_df, test_labels, predictions, [[fold], [seedval]])

def stacked_generalization(path, stacker_name, stacker, fold, agg):
    # Fit the stacker on base classifier predictions; fall back to predict()
    # for models that do not implement predict_proba().
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)
    try:
        test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    except AttributeError:
        # predict() returns a 1-D array, so there is no column to index here.
        test_predictions = stacker.fit(train_df, train_labels).predict(test_df)
    df = pd.DataFrame({'fold': fold,
                       'id': test_df.index.get_level_values('id'),
                       'label': test_labels,
                       'prediction': test_predictions,
                       'diversity': common.diversity_score(test_df.values)})
    return df
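
# Example call, assuming a scikit-learn estimator as the stacker; the path and
# parameter values below are illustrative only.
from sklearn.linear_model import LogisticRegression

lr_df = stacked_generalization('path/to/project', 'LR',
                               LogisticRegression(max_iter=1000), fold=0, agg=1)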

def bestbase_fmax(path, fold_count=5, agg=1):
    # Return the F-max of the single best base classifier across all folds.
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, agg)
        predictions.append(test_df)
        labels = append(labels, label)
    predictions = pd.concat(predictions)
    fmax_list = [common.fmax_score(labels, predictions.iloc[:, i])
                 for i in range(len(predictions.columns))]
    return max(fmax_list)
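
# `common.fmax_score` is not defined in this file. A plausible sketch, assuming
# it is the standard F-max: the maximum F-measure over all decision thresholds
# on the precision-recall curve. The project's real helper may differ.
from sklearn.metrics import precision_recall_curve

def fmax_score_sketch(labels, predictions, beta=1.0):
    precision, recall, _ = precision_recall_curve(labels, predictions)
    with np.errstate(divide='ignore', invalid='ignore'):
        f = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)
    return np.nanmax(f)  # nan entries (0/0 points) are ignored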

def stacked_selection(fold):
    # Cluster base classifiers by prediction correlation and stack per cluster.
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # Distance between classifiers: 1 - |correlation| of their predictions.
    train_distances = 1 - train_df.corr().abs()
    train_performance = []
    test_performance = []
    for n_clusters in range(1, max_clusters + 1):
        train_values, train_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, predict_df=train_df)
        test_values, test_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, predict_df=test_df)
        train_performance.append(get_cluster_performance(train_labels, train_predictions, n_clusters, fold, seedval))
        test_performance.append(get_cluster_performance(test_labels, test_predictions, n_clusters, fold, seedval))
    # Re-fit with the cluster count that performed best on the training folds.
    best_cluster_size = common.get_best_performer(DataFrame.from_records(train_performance)).n_clusters.values
    test_values, test_predictions = stack_function(best_cluster_size, train_distances, train_df, train_labels, predict_df=test_df)
    return DataFrame({'fold': fold,
                      'seed': seedval,
                      'id': test_df.index.get_level_values('id'),
                      'label': test_labels,
                      'prediction': test_predictions,
                      'diversity': common.diversity_score(test_values),
                      'metric': common.score.__name__}), DataFrame.from_records(test_performance)
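
# `stack_function` is supplied by the calling script. A minimal sketch of one
# plausible variant, assuming it cuts the classifier distance matrix into
# n_clusters via agglomerative clustering, then averages predictions within
# each cluster and across cluster means; the real function may instead fit a
# stacker on the per-cluster means (which is where train_labels would be used).
from sklearn.cluster import AgglomerativeClustering

def stack_function_sketch(n_clusters, distances, train_df, train_labels, predict_df):
    # metric='precomputed' (affinity= on older scikit-learn) requires a
    # non-ward linkage.
    labels_ = AgglomerativeClustering(n_clusters=n_clusters, metric='precomputed',
                                      linkage='average').fit(distances.values).labels_
    cluster_means = DataFrame({k: predict_df.iloc[:, labels_ == k].mean(axis=1)
                               for k in range(n_clusters)})
    return cluster_means.values, cluster_means.mean(axis=1).values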

def mean_fmax(path, fold_count=5, agg=1):
    # F-max of the simple mean of base classifier predictions over all folds.
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, agg)
        predict = test_df.mean(axis=1).values
        predictions = append(predictions, predict)
        labels = append(labels, label)
    fmax = '%.3f' % common.fmax_score(labels, predictions)
    return float(fmax)

def mean_fmax(path):
    # F-max of the mean prediction, with the fold count read from the
    # project's properties file.
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predictions.extend(test_df.mean(axis=1).values)
        labels.extend(label)
    fmax = '%.3f' % common.fmax_score(labels, predictions)
    return fmax

def selection(fold):
    # Greedy forward ensemble selection using the project's scoring metric.
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # Rank base classifiers by training score, best first (sort_values
    # replaces the removed Series.order).
    best_classifiers = train_df.apply(lambda x: common.score(train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    # Keep the ensemble size that performed best on the training folds;
    # .item(0) unwraps the single-element array so it can be used in a slice.
    best_ensemble_size = common.get_best_performer(train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), DataFrame.from_records(test_performance)
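
# `select_candidate` is defined elsewhere. A minimal sketch, assuming plain
# greedy forward selection with replacement: try every base classifier and add
# the one whose inclusion scores best under mean aggregation. Hypothetical;
# the real helper may sample candidates or weight in diversity.
def select_candidate_sketch(train_df, train_labels, best_classifiers, ensemble, i):
    scores = [common.score(train_labels, train_df[list(ensemble) + [c]].mean(axis=1))
              for c in best_classifiers.index]
    best = np.argmax(scores) if common.greater_is_better else np.argmin(scores)
    return best_classifiers.index[best]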

def bestbase_fmax(path):
    # F-max of the single best base classifier, with the fold count read from
    # the project's properties file.
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predictions.append(test_df)
        labels.extend(label)
    predictions = concat(predictions)
    # fmax_score, not fmax_core: matches the helper used elsewhere in this file.
    fmax_list = [common.fmax_score(labels, predictions[col].tolist())
                 for col in list(predictions)]
    return max(fmax_list)