def selection(fold, seedval, path, agg):
    """Grow an ensemble step by step on one fold and score it on the test split.

    Args:
        fold: Fold identifier passed to ``common.read_fold``.
        seedval: Value passed to ``seed`` before any other work is done.
        path: Data location passed to ``common.read_fold``.
        agg: Aggregation argument passed to ``common.unbag`` for both splits.

    Returns:
        A 2-tuple ``(predictions, test_performance_df)`` where ``predictions``
        is whatever ``get_predictions`` returns for the chosen ensemble on the
        test split, and ``test_performance_df`` is a DataFrame built from the
        per-step ``get_performance`` records on the test split.
    """
    seed(seedval)
    # Upper bound on the number of selection steps.
    max_ensemble_size = 50

    train_df, train_labels, test_df, _ = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)

    # Score every column of the training frame against the training labels
    # and order the scores according to common.greater_is_better.
    best_classifiers = train_df.apply(
        lambda x: common.fmax_score(train_labels, x)
    ).sort_values(ascending=not common.greater_is_better)

    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate_enhanced(
            train_df, train_labels, best_classifiers, ensemble, i
        )
        ensemble.append(best_candidate)
        # Record performance of the ensemble as it stands after this step.
        train_performance.append(
            get_performance(train_df, ensemble, fold, seedval)
        )
        test_performance.append(
            get_performance(test_df, ensemble, fold, seedval)
        )

    # The ensemble is chosen from training performance only; the test records
    # are returned for reporting.
    train_performance_df = pd.DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(
        train_performance_df
    ).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[
        :best_ensemble_size.item(0) + 1
    ]

    return (
        get_predictions(test_df, best_ensemble, fold, seedval),
        pd.DataFrame.from_records(test_performance),
    )
def stacked_generalization(fold):
    """Fit the module-level ``stacker`` on one fold and tabulate its test output.

    Reads the fold via ``common.read_fold(path, fold)``; when the module-level
    ``method`` equals ``'aggregate'`` both splits are passed through
    ``common.unbag`` with ``bag_count`` first.

    Returns:
        A DataFrame with columns ``fold``, ``id``, ``label``, ``prediction``
        (column 1 of ``predict_proba``) and ``diversity``.
    """
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)

    if method == 'aggregate':
        train_df = common.unbag(train_df, bag_count)
        test_df = common.unbag(test_df, bag_count)

    fitted = stacker.fit(train_df, train_labels)
    class_probabilities = fitted.predict_proba(test_df)
    test_predictions = class_probabilities[:, 1]

    result_columns = {
        'fold': fold,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_df.values),
    }
    return DataFrame(result_columns)
def stacked_generalization(fold):
    """Seed, fit the module-level ``stacker`` on one fold and evaluate it.

    The fold is read with ``read_fold(path, fold)``; when the module-level
    ``method`` equals ``'aggregate'`` both splits go through ``unbag`` with
    ``bag_count`` before fitting.

    Returns:
        Whatever ``eval_metrics`` returns for the test split, its labels, the
        column-1 ``predict_proba`` output and ``[[fold], [seedval]]``.
    """
    seed(seedval)
    train_df, train_labels, test_df, test_labels = read_fold(path, fold)

    if method == 'aggregate':
        train_df = unbag(train_df, bag_count)
        test_df = unbag(test_df, bag_count)

    fitted = stacker.fit(train_df, train_labels)
    predictions = fitted.predict_proba(test_df)[:, 1]

    run_keys = [[fold], [seedval]]
    return eval_metrics(test_df, test_labels, predictions, run_keys)
def stacked_generalization(path, stacker_name, stacker, fold, agg):
    """Fit ``stacker`` on one fold and return a table of its test predictions.

    Args:
        path: Data location passed to ``common.read_fold``.
        stacker_name: Unused here; kept so existing callers keep working.
        stacker: Estimator exposing ``fit`` and ``predict_proba`` or ``predict``.
        fold: Fold identifier passed to ``common.read_fold``.
        agg: Aggregation argument passed to ``common.unbag`` for both splits.

    Returns:
        A DataFrame with columns ``fold``, ``id``, ``label``, ``prediction``
        and ``diversity``.
    """
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)

    # Fit once; the fallback below reuses the fitted model instead of
    # refitting from scratch.
    fitted = stacker.fit(train_df, train_labels)
    try:
        test_predictions = fitted.predict_proba(test_df)[:, 1]
    except (AttributeError, NotImplementedError):
        # Estimator has no usable predict_proba: fall back to predict().
        # predict() output is normally 1-D, so only take column 1 when the
        # result really is 2-D (indexing a 1-D array with [:, 1] raises).
        test_predictions = fitted.predict(test_df)
        if getattr(test_predictions, 'ndim', 1) > 1:
            test_predictions = test_predictions[:, 1]

    df = pd.DataFrame({
        'fold': fold,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_df.values),
    })
    return df
def bestbase_fmax(path, fold_count=5, agg=1):
    """Return the highest per-column F-max over the pooled test folds.

    Test frames of folds ``0 .. fold_count - 1`` are unbagged with ``agg`` and
    concatenated; each column is then scored with ``common.fmax_score``
    against the pooled labels and the maximum score is returned.

    Side effect: creates ``<path>/analysis`` when it does not exist.
    """
    assert exists(path)
    analysis_dir = '%s/analysis' % path
    if not exists(analysis_dir):
        mkdir(analysis_dir)

    fold_frames = []
    pooled_labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        fold_frames.append(common.unbag(test_df, agg))
        pooled_labels = append(pooled_labels, label)

    pooled_frame = pd.concat(fold_frames)

    # Score columns by position so the lookup does not depend on labels.
    column_scores = []
    for position in range(pooled_frame.shape[1]):
        column = pooled_frame.iloc[:, position]
        column_scores.append(common.fmax_score(pooled_labels, column))
    return max(column_scores)
def mean_fmax(path, fold_count=5, agg=1):
    """Return the F-max of the row-wise mean prediction, to three decimals.

    For folds ``0 .. fold_count - 1`` the test frame is unbagged with ``agg``
    and averaged across columns; the pooled means are scored against the
    pooled labels with ``common.fmax_score``. The score is formatted with
    ``'%.3f'`` and converted back to ``float``.

    Side effect: creates ``<path>/analysis`` when it does not exist.
    """
    assert exists(path)
    analysis_dir = '%s/analysis' % path
    if not exists(analysis_dir):
        mkdir(analysis_dir)

    pooled_means = []
    pooled_labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        unbagged = common.unbag(test_df, agg)
        row_means = unbagged.mean(axis=1).values
        pooled_means = append(pooled_means, row_means)
        pooled_labels = append(pooled_labels, label)

    score = common.fmax_score(pooled_labels, pooled_means)
    formatted = '%.3f' % (score)
    return float(formatted)
def mean_fmax(path):
    """Return the F-max of the row-wise mean prediction as a ``'%.3f'`` string.

    The fold count is read from the ``foldCount`` entry of
    ``common.load_properties(path)``. Each fold's test frame is unbagged with
    a bag count of 10 and averaged across columns; the pooled means are scored
    against the pooled labels with ``common.fmax_score``.

    Side effect: creates ``<path>/analysis`` when it does not exist.
    """
    assert exists(path)
    analysis_dir = '%s/analysis' % path
    if not exists(analysis_dir):
        mkdir(analysis_dir)

    properties = common.load_properties(path)
    fold_count = int(properties['foldCount'])

    pooled_means = []
    pooled_labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        unbagged = common.unbag(test_df, 10)
        row_means = unbagged.mean(axis=1).values
        # Element-wise extension of the plain lists.
        pooled_means.extend(row_means)
        pooled_labels.extend(label)

    score = common.fmax_score(pooled_labels, pooled_means)
    return '%.3f' % (score)
def bestbase_fmax(path):
    """Return the highest per-column F-max over the pooled test folds.

    The fold count is read from the ``foldCount`` entry of
    ``common.load_properties(path)``. Each fold's test frame is unbagged with
    a bag count of 10, the frames are concatenated, and every column is scored
    against the pooled labels with ``common.fmax_score``.

    Side effect: creates ``<path>/analysis`` when it does not exist.

    Returns:
        The maximum of the per-column scores.
    """
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)

    p = common.load_properties(path)
    fold_count = int(p['foldCount'])

    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predictions.append(test_df)
        # Element-wise extension of the plain label list.
        labels += label
    predictions = concat(predictions)

    # Scorer name aligned with the other F-max helpers, which all call
    # common.fmax_score.
    fmax_list = [
        common.fmax_score(labels, predictions[col].tolist())
        for col in list(predictions)
    ]
    return max(fmax_list)