def mean_aggregation(fold):
    # Simple baseline: average all base classifier predictions for one cross-validation fold.
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    ids = test_df.index.get_level_values('id')
    labels = test_df.index.get_level_values('label')
    predictions = test_df.mean(axis=1)
    diversity = common.diversity_score(test_df.values)
    return DataFrame({'id': ids, 'label': labels, 'fold': fold, 'prediction': predictions, 'diversity': diversity})
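# A hedged usage sketch, not part of the original source: evaluate mean aggregation
# across all folds. `fold_count` is a hypothetical parameter, and common.score is
# assumed to map (labels, predictions) to a scalar performance value, as suggested
# by its use elsewhere in this file.
from pandas import concat

def evaluate_mean_aggregation(fold_count):
    results = concat([mean_aggregation(fold) for fold in range(fold_count)])
    return common.score(results.label, results.prediction)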
def select_candidate_sdi(train_df, train_labels, best_classifiers, ensemble, i):
    # Pick the next ensemble member by a weighted combination of individual
    # training accuracy and the diversity it adds to the current ensemble.
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values, min(max_candidates, len(best_classifiers)), replace=False)
        candidate_diversity_scores = [1 - abs(common.diversity_score(train_df[ensemble + [candidate]].values)) for candidate in candidates]  # 1 - kappa so larger = more diverse
        candidate_scores = [accuracy_weight * best_classifiers.loc[candidate] + (1 - accuracy_weight) * candidate_diversity_scores[candidate_i] for candidate_i, candidate in enumerate(candidates)]  # .loc replaces the removed pandas .ix accessor
        best_candidate = candidates[common.argbest(candidate_scores)]
    else:
        # Seed the ensemble with the i-th best classifier by training performance.
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
def select_candidate_drep(train_df, train_labels, best_classifiers, ensemble, i):
    # DREP-style selection: shortlist the most diverse candidates, then pick the
    # one whose addition gives the best ensemble score on the training fold.
    if len(ensemble) >= initial_ensemble_size:
        candidates = choice(best_classifiers.index.values, min(max_candidates, len(best_classifiers)), replace=False)
        candidate_diversity_scores = [abs(common.diversity_score(train_df[ensemble + [candidate]].values)) for candidate in candidates]
        candidate_diversity_ranks = array(candidate_diversity_scores).argsort()
        diversity_candidates = candidates[candidate_diversity_ranks[:max_diversity_candidates]]
        candidate_accuracy_scores = [common.score(train_labels, train_df[ensemble + [candidate]].mean(axis=1)) for candidate in diversity_candidates]
        # Index into the diversity shortlist, since the accuracy scores were computed over it.
        best_candidate = diversity_candidates[common.argbest(candidate_accuracy_scores)]
    else:
        best_candidate = best_classifiers.index.values[i]
    return best_candidate
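# Both selection routines above treat common.diversity_score as a kappa-style
# agreement statistic (see the "1 - kappa" comment in select_candidate_sdi). The
# sketch below is only an illustration of such a score, not the project's actual
# implementation: mean pairwise Cohen's kappa over predictions binarized at 0.5,
# where values near zero indicate a more diverse set of classifiers.
from itertools import combinations
from numpy import mean
from sklearn.metrics import cohen_kappa_score

def pairwise_kappa_diversity(prediction_matrix, threshold=0.5):
    binarized = (prediction_matrix > threshold).astype(int)
    pairs = list(combinations(range(binarized.shape[1]), 2))
    if not pairs:
        return 0.0
    return mean([cohen_kappa_score(binarized[:, i], binarized[:, j]) for i, j in pairs])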
def stacked_generalization(stacker, fold):
    # Variant that takes the stacker as an argument: fit it on the (unbagged)
    # training predictions and return positive-class probabilities for the test fold.
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, 10)
    test_df = common.unbag(test_df, 10)
    test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return DataFrame({'fold': fold, 'id': test_df.index.get_level_values('id'), 'label': test_labels, 'prediction': test_predictions, 'diversity': common.diversity_score(test_df.values)})
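# Both stacked_generalization variants call common.unbag before stacking. Below is
# a hedged sketch of what such a helper could look like, assuming each base
# classifier contributes `bag_count` consecutive columns (e.g. 'clf.bag0' ...
# 'clf.bag9') that should be averaged into a single column per classifier. That
# column-naming convention is an assumption, not taken from the original code.
from pandas import concat

def unbag_sketch(df, bag_count):
    starts = range(0, df.shape[1], bag_count)
    averaged = [df.iloc[:, start:start + bag_count].mean(axis=1) for start in starts]
    unbagged = concat(averaged, axis=1)
    unbagged.columns = [df.columns[start].split('.')[0] for start in starts]
    return unbagged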
def get_predictions(df, ensemble, fold, seedval):
    # Score the current ensemble by averaging the predictions of its members.
    ids = df.index.get_level_values('id')
    labels = df.index.get_level_values('label')
    predictions = df[ensemble].mean(axis=1)
    diversity = common.diversity_score(df[ensemble].values)
    return DataFrame({'fold': fold, 'seed': seedval, 'id': ids, 'label': labels, 'prediction': predictions, 'diversity': diversity, 'ensemble_size': len(ensemble)})
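# A hedged sketch, not from the original source, of the greedy forward-selection
# loop these helpers appear to support: repeatedly ask a select_candidate_*
# function for the next classifier, grow the ensemble, and record its test-fold
# performance with get_predictions. `best_classifiers` is assumed to be a Series
# of training scores indexed by column name and sorted best-first;
# `ensemble_max_size` is a hypothetical stopping parameter.
def greedy_selection_sketch(select_candidate, train_df, train_labels, test_df,
                            best_classifiers, fold, seedval, ensemble_max_size=50):
    ensemble = []
    records = []
    for i in range(min(ensemble_max_size, len(best_classifiers))):
        best_candidate = select_candidate(train_df, train_labels, best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        records.append(get_predictions(test_df, ensemble, fold, seedval))
    return records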
def stacked_selection(fold):
    # Cluster base classifiers by correlation distance, evaluate every cluster count
    # on the training fold, then apply the best-performing count to the test fold.
    seed(seedval)
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_distances = 1 - train_df.corr().abs()
    train_performance = []
    test_performance = []
    for n_clusters in range(1, max_clusters + 1):
        train_values, train_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, predict_df=train_df)
        test_values, test_predictions = stack_function(n_clusters, train_distances, train_df, train_labels, predict_df=test_df)
        train_performance.append(get_cluster_performance(train_labels, train_predictions, n_clusters, fold, seedval))
        test_performance.append(get_cluster_performance(test_labels, test_predictions, n_clusters, fold, seedval))
    best_cluster_size = common.get_best_performer(DataFrame.from_records(train_performance)).n_clusters.values
    test_values, test_predictions = stack_function(best_cluster_size, train_distances, train_df, train_labels, predict_df=test_df)
    return DataFrame({'fold': fold, 'seed': seedval, 'id': test_df.index.get_level_values('id'), 'label': test_labels, 'prediction': test_predictions, 'diversity': common.diversity_score(test_values), 'metric': common.score.__name__}), DataFrame.from_records(test_performance)
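# stacked_selection delegates to a module-level `stack_function`. The sketch below
# is one plausible form of it, not the original implementation: hierarchically
# cluster the base classifiers using the precomputed correlation-distance matrix,
# average the members of each cluster, then average the cluster-level predictions.
# A fuller version would presumably fit a stacker on train_df/train_labels instead
# of the final mean; the returned pair only mirrors the (values, predictions) shape
# stacked_selection expects, and assumes predict_df shares train_df's columns.
from pandas import concat
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

def cluster_mean_stack(n_clusters, distances, train_df, train_labels, predict_df):
    linkage_matrix = linkage(squareform(distances.values, checks=False), method='average')
    assignments = fcluster(linkage_matrix, t=n_clusters, criterion='maxclust')
    cluster_means = [predict_df.loc[:, assignments == cluster].mean(axis=1)
                     for cluster in range(1, assignments.max() + 1)]
    values = concat(cluster_means, axis=1)
    return values.values, values.mean(axis=1)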
def stacked_generalization(fold):
    # Variant driven by module-level configuration (path, method, bag_count, stacker):
    # optionally unbag the predictions, then fit the stacker and score the test fold.
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    if method == 'aggregate':
        train_df = common.unbag(train_df, bag_count)
        test_df = common.unbag(test_df, bag_count)
    test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return DataFrame({'fold': fold, 'id': test_df.index.get_level_values('id'), 'label': test_labels, 'prediction': test_predictions, 'diversity': common.diversity_score(test_df.values)})
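# A hedged usage sketch for the fold-level routine above, assuming the module-level
# `path`, `method`, and `bag_count` it reads have already been set. The estimators
# are examples only; any model exposing fit/predict_proba should work, and the
# global `stacker` is swapped in place to match the code's configuration style.
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

def compare_stackers_on_fold(fold):
    global stacker
    scores = {}
    for candidate in (LogisticRegression(), GaussianNB(), DecisionTreeClassifier()):
        stacker = candidate
        fold_result = stacked_generalization(fold)
        scores[candidate.__class__.__name__] = common.score(fold_result.label, fold_result.prediction)
    return scores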