def selection(fold, seedval, path, agg):
    """Greedy forward ensemble selection for one cross-validation fold.

    Ranks the base classifiers on the training split, grows the ensemble one
    candidate at a time (up to ``max_ensemble_size``), then picks the ensemble
    size that performed best on the training split and returns the test-split
    predictions for that ensemble.

    Parameters
    ----------
    fold : fold identifier passed through to ``common.read_fold``.
    seedval : RNG seed, also recorded in the performance records.
    path : dataset directory passed to ``common.read_fold``.
    agg : aggregation spec passed to ``common.unbag``.

    Returns
    -------
    tuple of (predictions, test-performance DataFrame).
    """
    seed(seedval)
    # Only this bound is used below; the other tuning constants that used to be
    # declared here were dead locals and have been removed.
    max_ensemble_size = 50
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)
    # Rank base classifiers by fmax; best first regardless of score direction.
    best_classifiers = train_df.apply(
        lambda x: common.fmax_score(train_labels, x)
    ).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate_enhanced(
            train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = pd.DataFrame.from_records(train_performance)
    # ensemble_size.values is a length-1 ndarray; .item(0) extracts the scalar
    # so the slice bound is a plain int.
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), \
        pd.DataFrame.from_records(test_performance)
def selection(fold):
    """Greedy forward ensemble selection (AUC-ranked) for one fold.

    Ranks base classifiers by ROC AUC on the training split, grows the
    ensemble greedily, then re-evaluates the best-sized ensemble on the test
    split. Uses module-level ``seedval``, ``path`` and ``max_ensemble_size``.

    Returns a tuple of (final test metrics, concatenated per-size test metrics).
    """
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # Series.order() was removed in pandas 0.20; sort_values() is the
    # drop-in replacement. Higher AUC is better, so sort descending.
    best_classifiers = train_df.apply(
        lambda x: roc_auc_score(train_labels, x)).sort_values(ascending=False)
    train_metrics = []
    test_metrics = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(
            train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_metrics.append(eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    # NOTE(review): ensemble_size may be a length-1 Series/array here; confirm
    # the slice bound resolves to a scalar in the pandas version in use.
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    return eval_metrics(test_df, best_ensemble, test_labels, indices, final=True), \
        concat(test_metrics)
def stacked_selection(fold):
    """Cluster-based stacking for one fold.

    Derives a diversity distance matrix from base-classifier correlations,
    evaluates stacking at every cluster count from 1 to ``max_clusters``,
    then re-runs stacking on the test split at the cluster count that scored
    best on the training split.

    Returns a tuple of (final test cluster metrics, concatenated per-size
    test metrics).
    """
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # Highly correlated classifiers are "close": distance near zero.
    train_distances = 1 - train_df.corr().abs()
    metrics_by_size_train = []
    metrics_by_size_test = []
    for cluster_count in range(1, max_clusters + 1):
        fit_values, fit_predictions = stack_function(
            cluster_count, train_distances, train_df, train_labels, train_df)
        holdout_values, holdout_predictions = stack_function(
            cluster_count, train_distances, train_df, train_labels, test_df)
        metrics_by_size_train.append(eval_cluster_metrics(
            fit_values, train_labels, fit_predictions, cluster_count, indices))
        metrics_by_size_test.append(eval_cluster_metrics(
            holdout_values, test_labels, holdout_predictions, cluster_count, indices))
    best_cluster_size = get_best_performer(concat(metrics_by_size_train)).n_clusters
    final_values, final_predictions = stack_function(
        best_cluster_size, train_distances, train_df, train_labels, test_df)
    return eval_cluster_metrics(
        final_values, test_labels, final_predictions, best_cluster_size, indices), \
        concat(metrics_by_size_test)
def stacked_selection(fold):
    """Cluster-based stacking for one fold, returning a predictions frame.

    Sweeps cluster counts 1..``max_clusters``, selects the best-performing
    count on the training split, and builds a tidy per-sample prediction
    DataFrame for the test split alongside the per-size test performance
    records.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # Correlation-derived diversity distances between base classifiers.
    train_distances = 1 - train_df.corr().abs()
    perf_train = []
    perf_test = []
    for k in range(1, max_clusters + 1):
        fit_values, fit_predictions = stack_function(
            k, train_distances, train_df, train_labels, predict_df=train_df)
        holdout_values, holdout_predictions = stack_function(
            k, train_distances, train_df, train_labels, predict_df=test_df)
        perf_train.append(get_cluster_performance(
            train_labels, fit_predictions, k, fold, seedval))
        perf_test.append(get_cluster_performance(
            test_labels, holdout_predictions, k, fold, seedval))
    # NOTE(review): .values yields a length-1 ndarray rather than a scalar;
    # confirm stack_function accepts that as a cluster count (cf. the
    # .item(0) pattern used elsewhere in this file).
    best_cluster_size = common.get_best_performer(
        DataFrame.from_records(perf_train)).n_clusters.values
    final_values, final_predictions = stack_function(
        best_cluster_size, train_distances, train_df, train_labels,
        predict_df=test_df)
    predictions_df = DataFrame({
        'fold': fold,
        'seed': seedval,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': final_predictions,
        'diversity': common.diversity_score(final_values),
        'metric': common.score.__name__,
    })
    return predictions_df, DataFrame.from_records(perf_test)
def stacked_selection(fold):
    """Select the best stacking cluster count for one fold.

    For every candidate cluster count, fits/evaluates the stacker on the
    training split and on the test split; the count with the best training
    metrics is then re-applied to the test split for the final result.
    """
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    train_distances = 1 - train_df.corr().abs()  # diversity: 1 - |corr|
    train_metrics, test_metrics = [], []
    k = 1
    while k <= max_clusters:
        tr_values, tr_predictions = stack_function(
            k, train_distances, train_df, train_labels, train_df)
        te_values, te_predictions = stack_function(
            k, train_distances, train_df, train_labels, test_df)
        train_metrics.append(
            eval_cluster_metrics(tr_values, train_labels, tr_predictions, k, indices))
        test_metrics.append(
            eval_cluster_metrics(te_values, test_labels, te_predictions, k, indices))
        k += 1
    best_cluster_size = get_best_performer(concat(train_metrics)).n_clusters
    te_values, te_predictions = stack_function(
        best_cluster_size, train_distances, train_df, train_labels, test_df)
    final_metrics = eval_cluster_metrics(
        te_values, test_labels, te_predictions, best_cluster_size, indices)
    return final_metrics, concat(test_metrics)
def selection(fold):
    """Greedy forward ensemble selection for one fold (project-score ranked).

    Ranks base classifiers by ``common.score`` on the training split, grows
    the ensemble greedily up to ``max_ensemble_size``, and returns test
    predictions for the training-best ensemble size plus the per-size test
    performance records.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # Series.order() was removed in pandas 0.20; sort_values() is the
    # drop-in replacement. Sort direction follows common.greater_is_better.
    best_classifiers = train_df.apply(
        lambda x: common.score(train_labels, x)
    ).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(
            train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values
    # .values is a length-1 ndarray; extract the scalar so the slice bound is
    # an int (same pattern as the enhanced selection() in this file).
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), \
        DataFrame.from_records(test_performance)
def selection(fold):
    """AUC-ranked greedy ensemble selection for one fold (metrics variant).

    Identical structure to the other ``selection`` variants but evaluates via
    ``eval_metrics`` with hierarchical (fold, seed) indices and concatenated
    metric frames.
    """
    seed(seedval)
    indices = [[fold], [seedval]]
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)
    # Series.order() was removed in pandas 0.20; use sort_values().
    # Higher ROC AUC is better, hence descending order.
    best_classifiers = train_df.apply(
        lambda x: roc_auc_score(train_labels, x)).sort_values(ascending=False)
    train_metrics = []
    test_metrics = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(
            train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_metrics.append(eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    # NOTE(review): ensemble_size may be a length-1 Series/array; confirm the
    # slice bound resolves to a scalar in the pandas version in use.
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    return eval_metrics(test_df, best_ensemble, test_labels, indices, final=True), \
        concat(test_metrics)
def selection(fold):
    """Greedy forward ensemble selection for one fold.

    Base classifiers are ranked by ``common.score`` on the training split;
    the ensemble is grown one candidate at a time and the training-best size
    is used for the returned test predictions.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # Fix: Series.order() no longer exists (removed in pandas 0.20);
    # sort_values() preserves the same ordering semantics.
    best_classifiers = train_df.apply(
        lambda x: common.score(train_labels, x)
    ).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate(
            train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values
    # Fix: .values is a length-1 ndarray, not a scalar; use .item(0) so the
    # slice bound is an int (matches the enhanced selection() variant).
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), \
        DataFrame.from_records(test_performance)
def stacked_selection(fold):
    """Cluster-based stacking for one fold; emits a per-sample prediction frame.

    Evaluates stacking at each cluster count in 1..``max_clusters``, selects
    the count that performs best on the training split, and returns that
    configuration's test predictions as a tidy DataFrame along with the
    per-size test performance records.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    # Diversity distance between classifiers: 1 - |pairwise correlation|.
    train_distances = 1 - train_df.corr().abs()
    train_records, test_records = [], []
    for size in range(1, max_clusters + 1):
        tr_values, tr_preds = stack_function(
            size, train_distances, train_df, train_labels, predict_df=train_df)
        te_values, te_preds = stack_function(
            size, train_distances, train_df, train_labels, predict_df=test_df)
        train_records.append(
            get_cluster_performance(train_labels, tr_preds, size, fold, seedval))
        test_records.append(
            get_cluster_performance(test_labels, te_preds, size, fold, seedval))
    # NOTE(review): .values is a length-1 ndarray, not a scalar; confirm
    # stack_function tolerates that as its cluster-count argument.
    best_cluster_size = common.get_best_performer(
        DataFrame.from_records(train_records)).n_clusters.values
    te_values, te_preds = stack_function(
        best_cluster_size, train_distances, train_df, train_labels,
        predict_df=test_df)
    result = DataFrame({
        'fold': fold,
        'seed': seedval,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': te_preds,
        'diversity': common.diversity_score(te_values),
        'metric': common.score.__name__,
    })
    return result, DataFrame.from_records(test_records)