def STACK_run():
    start_time = time.time()
    y_true = DataFrame(columns=['label'])
    y_score = DataFrame(columns=['prediction'])
    y_size = 0.0
    string = ""
    for fold in range(fold_count):
        classifiers = full_ensemble(project_path, size, fold, seed, metric)
        names = [c for c in classifiers]  # classifier names, used by the debug prints below
        inner_y_true_train, inner_train_dataset = create_dataset_from_predictions_for_stacker(
            classifiers, seed, fold, "valid")
        inner_y_true, inner_test_dataset = create_dataset_from_predictions_for_stacker(
            classifiers, seed, fold, "test")
        # "L1Log": L1-regularized logistic regression as the stacker
        stacker = LogisticRegression(random_state=0, penalty='l1', solver='liblinear')
        stacker.fit(inner_train_dataset, inner_y_true_train.values.ravel())
        # print("\tstacker.coef_\n\t", stacker.coef_)
        # print("Lasso model: ", pretty_print_linear(stacker.coef_[0], names, sort=False))
        # Effective ensemble size = number of base predictors surviving the L1 penalty
        inner_y_size = count_nonzero(stacker.coef_)
        y_size += inner_y_size
        inner_y_score_ndarray = stacker.predict_proba(inner_test_dataset)[:, 1]
        inner_y_score = DataFrame(inner_y_score_ndarray)
        inner_y_score.rename(columns={0: 'prediction'}, inplace=True)
        y_true = concat((y_true, inner_y_true), axis=0)
        y_true['label'] = to_numeric(y_true['label'])
        y_score = concat([y_score, inner_y_score], axis=0)
        string += ("fold_%i,%f::%i\n" %
                   (fold, fmax_score(inner_y_true, inner_y_score), inner_y_size))
    y_size /= fold_count
    string += ("final,%f::%f\n" % (fmax_score(y_true, y_score), y_size))
    dst = "%s/STACK_RESULTS/ORDER%i/stack_bp%i_seed%i_%s.fmax" % (
        project_path, seed, size, seed, "L1Log")
    with open(dst, 'w') as f:
        f.write(string)
    seconds = time.time() - start_time
    print("\t%s (%s)" % (dst, time.strftime('%H:%M:%S', time.gmtime(seconds))))
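# NOTE: fmax_score is used throughout this file but defined elsewhere
# (common.fmax_score in the helper functions below). A minimal sketch of the
# usual definition -- the best F1 over all decision thresholds on the
# precision-recall curve -- is given here as an assumption; the project's
# actual implementation may differ in its details.
from numpy import asarray, nanmax
from sklearn.metrics import precision_recall_curve

def fmax_score_sketch(y_true, y_score):
    # flatten DataFrame/Series inputs to 1-d arrays
    y_true = asarray(y_true).ravel()
    y_score = asarray(y_score).ravel()
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    f1 = (2 * precision * recall) / (precision + recall)  # NaN where precision + recall == 0
    return nanmax(f1)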
def selection(fold, seedval, path, agg):
    seed(seedval)
    # ensemble-selection hyperparameters
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)
    # rank base classifiers by their training F-max score
    best_classifiers = train_df.apply(lambda x: common.fmax_score(
        train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate_enhanced(train_df, train_labels,
                                                   best_classifiers, ensemble, i)
        ensemble.append(best_candidate)
        train_performance.append(get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(get_performance(test_df, ensemble, fold, seedval))
    # keep the ensemble prefix that performed best on the training folds
    train_performance_df = pd.DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) + 1]
    return get_predictions(test_df, best_ensemble, fold, seedval), \
        pd.DataFrame.from_records(test_performance)
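# NOTE: common.read_fold and common.unbag come from the project's common module
# and are not shown here. For reference, a minimal sketch of what an
# unbag(df, bag_count) helper typically does -- average each classifier's
# bag_count bagged prediction columns into one column per classifier -- follows;
# this is an assumption, not the project's verified implementation.
import pandas as pd

def unbag_sketch(df, bag_count):
    # columns are assumed grouped by classifier, bag_count columns per
    # classifier, named like 'classifier.bag'
    cols, names = [], []
    for start in range(0, df.shape[1], bag_count):
        cols.append(df.iloc[:, start:start + bag_count].mean(axis=1))
        names.append(str(df.columns[start]).split('.')[0])  # strip the bag suffix
    out = pd.concat(cols, axis=1)
    out.columns = names
    return out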
def CES_fmax(path, fold_count=5, agg=1):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    method = 'enhanced'
    # resolve the candidate-selection strategy by name
    select_candidate = eval('select_candidate_' + method)
    method_function = selection
    # selection hyperparameters (shared with selection())
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20
    predictions_dfs = []
    performance_dfs = []
    seeds = range(agg)
    for seedval in seeds:
        for fold in range(fold_count):
            pred_df, perf_df = method_function(fold, seedval, path, agg)
            predictions_dfs.append(pred_df)
            performance_dfs.append(perf_df)
    performance_df = pd.concat(performance_dfs)
    performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' % (path, method, 'fmax'),
                          index=False)
    predictions_df = pd.concat(predictions_dfs)
    predictions_df['method'] = method
    predictions_df['metric'] = 'fmax'
    predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' % (path, method, 'fmax'),
                          index=False)
    fmax = '%.3f' % (common.fmax_score(predictions_df.label, predictions_df.prediction))
    return float(fmax)
def get_cluster_performance(labels, predictions, n_clusters, fold, seedval):
    return {
        'fold': fold,
        'seed': seedval,
        'score': common.fmax_score(labels, predictions),
        'n_clusters': n_clusters
    }
def get_performance(df, ensemble, fold, seedval):
    labels = df.index.get_level_values('label').values
    predictions = df[ensemble].mean(axis=1)
    return {
        'fold': fold,
        'seed': seedval,
        'score': common.fmax_score(labels, predictions),
        'ensemble': ensemble[-1],
        'ensemble_size': len(ensemble)
    }
def RL_ens():
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        filename_fold = '%s/RL_OUTPUT/ORDER%s/bp%s_fold%s_seed%s_epsilon%s_pre%s_conv%s_exit%s_%s_%s_%s_start-%s.fmax' % (
            project_path, seed, size, fold, seed, epsilon, age, conv, exit, strategy, RULE, algo, start)
        ensemble = read_ens(filename_fold)
        ensemble_bps = get_ens_bps(ensemble, filename_fold)
        inner_y_true, inner_y_score = aggregate_predictions(ensemble_bps, set, fold, seed, RULE)
        y_true = concat([y_true, inner_y_true], axis=0)
        y_true['label'] = to_numeric(y_true['label'])
        y_score = concat([y_score, inner_y_score], axis=0)
        string += ("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))
    string += ("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/RL_RESULTS/ORDER%i/RL_bp%i_seed%i_epsilon%s_pre%s_conv%s_exit%s_%s_%s_%s_start-%s.fmax' % (
        project_path, seed, size, seed, epsilon, age, conv, exit, strategy, RULE, algo, start)
    with open(filename, 'w+') as f:
        f.write(string)
    print(filename)
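# NOTE: read_ens, get_ens_bps and aggregate_predictions are defined elsewhere.
# A rough sketch of aggregate_predictions, under the assumption that each base
# predictor's fold predictions are loaded via get_set_preds and combined
# column-wise according to RULE (only the 'mean' rule is sketched here):
from numpy import asarray, mean as np_mean
from pandas import DataFrame

def aggregate_predictions_sketch(ensemble_bps, set_name, fold, seed, rule='mean'):
    y_true, scores = None, []
    for bp in ensemble_bps:
        path, bag, weight = get_predictor_path_bag_weight(bp)
        y_true, y_score = get_set_preds(path, set_name, bag, fold, seed)
        scores.append(asarray(y_score).ravel())
    if rule != 'mean':
        raise NotImplementedError("only the 'mean' rule is sketched")
    return y_true, DataFrame({'prediction': np_mean(scores, axis=0)})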
def bestbase_fmax(path, fold_count=5, agg=1):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, agg)
        predictions.append(test_df)
        labels = append(labels, label)
    predictions = pd.concat(predictions)
    # F-max of each individual base classifier; report the best one
    fmax_list = [common.fmax_score(labels, predictions.iloc[:, i])
                 for i in range(len(predictions.columns))]
    return max(fmax_list)
def mean_fmax(path, fold_count=5, agg=1):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, agg)
        predict = test_df.mean(axis=1).values
        predictions = append(predictions, predict)
        labels = append(labels, label)
    fmax = '%.3f' % (common.fmax_score(labels, predictions))
    return float(fmax)
def FULL_ens():
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        ensemble_bps = full_ensemble(project_path, size, fold, seed, metric)
        inner_y_true, inner_y_score = aggregate_predictions(
            ensemble_bps, "test", fold, seed, RULE)
        y_true = concat([y_true, inner_y_true], axis=0)
        y_score = concat([y_score, inner_y_score], axis=0)
        y_true['label'] = to_numeric(y_true['label'])
        # wrap the fold scores in a named column before scoring the fold
        inner_y_score = DataFrame(inner_y_score)
        inner_y_score.rename(columns={0: 'prediction'}, inplace=True)
        string += ("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))
    string += ("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/BASELINE/ORDER%i/FE_bp%i_seed%i_%s_rnd.fmax' % (
        project_path, seed, size, seed, RULE)
    with open(filename, 'w') as f:
        f.write(string)
    print(filename)
def BEST_bp():
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        ensemble_bps = full_ensemble(project_path, size, fold, seed, metric)
        inner_y_true, inner_y_score = get_max_predictions(ensemble_bps, seed, fold, "test")
        y_true = concat([y_true, inner_y_true], axis=0)
        y_true['label'] = to_numeric(y_true['label'])
        y_score = concat([y_score, inner_y_score], axis=0)
        string += ("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))
        print("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))
        print(len(y_true))
    string += ("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/BASELINE/ORDER%i/BP_bp%i_seed%i_rnd.%s' % (
        project_path, seed, size, seed, metric)
    with open(filename, 'w') as f:
        f.write(string)
    print(filename)
def mean_fmax(path):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predict = test_df.mean(axis=1).values
        predictions += list(predict)
        labels += list(label)
    fmax = '%.3f' % (common.fmax_score(labels, predictions))
    return fmax
def get_max_predictions(predictors, seed, fold, set):
    # Return the predictions of the single base predictor with the highest
    # weight, tracking the winning predictor's path and bag together.
    max_p = ''
    max_w = 0
    max_bag = None
    for bp in predictors:
        path, bag, weight = get_predictor_path_bag_weight(bp)
        if weight > max_w:
            max_w = weight
            max_p = path
            max_bag = bag
    y_true, y_score = get_set_preds(max_p, set, max_bag, fold, seed)
    return (y_true, y_score)
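# NOTE: get_set_preds is not shown in this file. A minimal sketch,
# reconstructed from the file layout used in the bag-scoring loop further
# below ('<set>-b<bag>-f<fold>-s<seed>.csv.gz', label in column 1, score in
# column 2); treat the exact helper as an assumption.
from pandas import read_csv

def get_set_preds_sketch(dirname, set_name, bag, fold, seed):
    filename = '%s/%s-b%i-f%s-s%i.csv.gz' % (dirname, set_name, bag, fold, seed)
    df = read_csv(filename, skiprows=1, compression='gzip')
    y_true = df.iloc[:, 1:2]   # true labels
    y_score = df.iloc[:, 2:3]  # predicted scores
    return y_true, y_score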
def main(path, fold_count=5, agg=1):
    dn = abspath(path).split('/')[-1]
    cols = ['data_name', 'fmax', 'method']
    dfs = []
    print('[CES] Start building model #################################')
    ces = CES_fmax(path, fold_count, agg)
    print('[CES] Finished evaluating model ############################')
    print('[CES] F-max score is %s.' % ces)
    print('[Mean] Start building model ################################')
    mean = mean_fmax(path, fold_count, agg)
    print('[Mean] Finished evaluating model ###########################')
    print('[Mean] F-max score is %s.' % mean)
    print('[Best Base] Start building model ###########################')
    bestbase = bestbase_fmax(path, fold_count, agg)
    print('[Best Base] Finished evaluating model ######################')
    print('[Best Base] F-max score is %s.' % bestbase)
    dfs.append(pd.DataFrame(data=[[dn, ces, 'CES']], columns=cols, index=[0]))
    dfs.append(pd.DataFrame(data=[[dn, mean, 'Mean']], columns=cols, index=[0]))
    dfs.append(pd.DataFrame(data=[[dn, bestbase, 'best base']], columns=cols, index=[0]))
    # Get Stacking Fmax scores
    stackers = [
        RandomForestClassifier(n_estimators=200, max_depth=2, bootstrap=False, random_state=0),
        SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
            decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
            probability=True, max_iter=-1, random_state=None, shrinking=True,
            tol=0.001, verbose=False),
        GaussianNB(),
        LogisticRegression(),
        AdaBoostClassifier(),
        DecisionTreeClassifier(),
        GradientBoostingClassifier(loss='deviance'),
        KNeighborsClassifier()
    ]
    stacker_names = ["RF.S", "SVM.S", "NB.S", "LR.S", "AB.S", "DT.S", "LB.S", "KNN.S"]
    for i, (stacker_name, stacker) in enumerate(zip(stacker_names, stackers)):
        print('[%s] Start building model ################################' % stacker_name)
        predictions_dfs = [stacked_generalization(path, stacker_name, stacker, fold, agg)
                           for fold in range(fold_count)]
        predictions_df = pd.concat(predictions_dfs)
        fmax = common.fmax_score(predictions_df.label, predictions_df.prediction)
        print('[%s] Finished evaluating model ###########################' % stacker_name)
        print('[%s] F-max score is %s.' % (stacker_name, fmax))
        df = pd.DataFrame(data=[[dn, fmax, stacker_name]], columns=cols, index=[0])
        dfs.append(df)
    dfs = pd.concat(dfs)
    # Save results
    print('Saving results #############################################')
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    dfs.to_csv('%s/analysis/performance.csv' % path, index=False)
def CES_fmax(path):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    method = 'enhanced'
    select_candidate = eval('select_candidate_' + method)
    method_function = selection
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20
    predictions_dfs = []
    performance_dfs = []
    seeds = [0] if method == 'greedy' else range(10)
    for seedval in seeds:
        for fold in range(fold_count):
            pred_df, perf_df = method_function(fold, seedval, path)
            predictions_dfs.append(pred_df)
            performance_dfs.append(perf_df)
    performance_df = concat(performance_dfs)
    performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' % (path, method, 'fmax'),
                          index=False)
    predictions_df = concat(predictions_dfs)
    predictions_df['method'] = method
    predictions_df['metric'] = 'fmax'
    predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' % (path, method, 'fmax'),
                          index=False)
    fmax = '%.3f' % (common.fmax_score(predictions_df.label, predictions_df.prediction))
    # Alternative: average per-(fold, seed) F-max scores instead of pooling predictions:
    # fmax = '%.3f' % predictions_df.groupby(['fold', 'seed']).apply(
    #     lambda x: common.fmax_score(x.label, x.prediction)).mean()
    return fmax, predictions_df.ensemble_size.mean()
best_base_fmax = besides_stack.bestbase_fmax(path)
method = 'aggregate'  # default method for bagged predictions is to aggregate
cols = [
    'GO', 'category', '#pos', '#neg', '#total', '#+/#-', 'fmax',
    'metaClassifier', 'best_base_fmax', 'best_base_learner', 'CES_size'
]
dfs = []
# Get Stacking Fmax scores
for i, (stacker_name, stacker) in enumerate(zip(stacker_names, stackers)):
    predictions_dfs = [stacked_generalization(stacker, fold) for fold in range(fold_count)]
    predictions_df = concat(predictions_dfs)
    predictions_df['method'] = method
    fmax = common.fmax_score(predictions_df.label, predictions_df.prediction)
    df = DataFrame(data=[[go_term, category, posNum, negNum, totalNum, pn_ratio,
                          fmax, stacker_name, best_base_fmax, best_base_clsf, ces_size]],
                   columns=cols, index=[0])
    dfs.append(df)
dfs.append(
    DataFrame(data=[[go_term, category, posNum, negNum, totalNum, pn_ratio,
                     ces_fmax, 'CES', best_base_fmax, best_base_clsf, ces_size]],
              columns=cols, index=[0]))
path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
p = common.load_properties(path)
input_fn = '%s/%s' % (path, p['inputFilename'])
assert exists(input_fn)

# generate cross-validation values for leave-one-value-out or k-fold
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    headers = common.load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))

stacker = LogisticRegression()
perf_df = []
for fold in fold_values:
    prediction_df = stacked_generalization(fold)
    if prediction_df is not None:
        prediction_df.to_csv('%s/analysis/%s-predictions.csv' % (path, fold))
        # get the F-max value for each fold, and dump it to local disk
        fmax = common.fmax_score(prediction_df.label.tolist(),
                                 prediction_df.prediction.tolist())
        perf_df.append(DataFrame(data=[[path.split('/')[-1], fold, fmax]],
                                 columns=['data', 'fold', 'fmax'], index=[0]))
perf_df = concat(perf_df)
perf_df.to_csv(path + '/analysis/%s_fmax.csv' % path.split('/')[-1])
for bag in bag_list:
    print("fold = %i seed = %s, bag = %s" % (fold, seed, bag))
    filename = '%s/valid-b%i-f%s-s%i.csv.gz' % (dirname, bag, fold, seed)
    print(filename)
    df = read_csv(filename, skiprows=1, compression='gzip')
    y_true = df.iloc[:, 1:2]   # true labels
    y_score = df.iloc[:, 2:3]  # predicted scores
    if metric == "fmax":
        dir_dict["%s_bag%i" % (dirname, bag)] = fmax_score(y_true, y_score)
    # other metrics, kept for reference:
    # if metric == "f1score":
    #     dir_dict["%s_bag%i" % (dirname, bag)] = f_score(y_true, y_score)
    # if metric == "auROC":
    #     dir_dict["%s_bag%i" % (dirname, bag)] = roc_auc_score(y_true, y_score)

# sort bags by score (descending), breaking ties alphabetically by key
d_sorted_by_value = OrderedDict(
    sorted(dir_dict.items(), key=lambda x: (-x[1], x[0])))
for key, v in d_sorted_by_value.items():