def mean_fmax(path, bag_count=10):
    """Return the F-max of the mean-aggregated predictions, pooled over folds.

    For every cross-validation fold the test predictions are passed through
    ``common.unbag`` and averaged row-wise (``mean(axis=1)``).  The per-fold
    averages and labels are pooled and scored once with
    ``common.fmax_score``.

    Args:
        path: Project directory.  It must exist, and the properties loaded
            by ``common.load_properties`` must contain ``foldCount``.
        bag_count: Value handed to ``common.unbag``.  Defaults to 10, the
            value that was previously hard-coded, so existing callers are
            unaffected.

    Returns:
        The pooled F-max formatted as a string with three decimals.

    Raises:
        AssertionError: If ``path`` does not exist.

    Side effects:
        Creates ``<path>/analysis`` when it is missing.
    """
    assert exists(path)
    analysis_dir = '%s/analysis' % path
    if not exists(analysis_dir):
        mkdir(analysis_dir)

    p = common.load_properties(path)
    fold_count = int(p['foldCount'])

    predictions = []
    labels = []
    for fold in range(fold_count):
        # Only the test split of each fold is used here.
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, bag_count)
        # Row-wise mean over all columns of the un-bagged test frame.
        predictions.extend(test_df.mean(axis=1).values)
        labels.extend(label)

    return '%.3f' % common.fmax_score(labels, predictions)
def bestbase_fmax(path, bag_count=10):
    """Return the highest F-max achieved by any single prediction column.

    The un-bagged test predictions of every fold are stacked with
    ``concat``.  Each column of the stacked frame is then scored against the
    pooled labels, and the maximum score is returned.

    Args:
        path: Project directory.  It must exist, and the properties loaded
            by ``common.load_properties`` must contain ``foldCount``.
        bag_count: Value handed to ``common.unbag``.  Defaults to 10, the
            value that was previously hard-coded.

    Returns:
        The maximum per-column score, unformatted (whatever the scorer
        returns).

    Raises:
        AssertionError: If ``path`` does not exist.
        ValueError: If the stacked frame has no columns (``max`` of an empty
            sequence).

    Side effects:
        Creates ``<path>/analysis`` when it is missing.
    """
    assert exists(path)
    analysis_dir = '%s/analysis' % path
    if not exists(analysis_dir):
        mkdir(analysis_dir)

    p = common.load_properties(path)
    fold_count = int(p['foldCount'])

    fold_frames = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        fold_frames.append(common.unbag(test_df, bag_count))
        labels.extend(label)
    predictions = concat(fold_frames)

    # NOTE(review): the original called ``common.fmax_core``.  Every other
    # scorer call in this file is ``fmax_score``, so that spelling is assumed
    # to be a typo -- confirm against the ``common`` module.
    return max(
        common.fmax_score(labels, predictions[col].tolist())
        for col in predictions.columns
    )
def CES_fmax(path):
    """Run the 'enhanced' selection over all seeds/folds and score it.

    ``selection(fold, seedval, path)`` is called for seeds 0-9 and every
    fold.  Its two results (a predictions frame and a per-iteration
    performance frame) are collected, concatenated and written to
    ``<path>/analysis``.  The F-max is computed once on the pooled
    predictions.

    Args:
        path: Project directory.  It must exist, and the properties loaded
            by ``common.load_properties`` must contain ``foldCount``.

    Returns:
        A tuple ``(fmax, mean_ensemble_size)``: the pooled F-max formatted
        with three decimals, and the mean of the ``ensemble_size`` column of
        the pooled predictions.

    Raises:
        AssertionError: If ``path`` does not exist.

    Side effects:
        Creates ``<path>/analysis`` when missing and writes
        ``selection-enhanced-fmax-iterations.csv`` and
        ``selection-enhanced-fmax.csv`` into it.
    """
    assert exists(path)
    analysis_dir = '%s/analysis' % path
    if not exists(analysis_dir):
        mkdir(analysis_dir)

    method = 'enhanced'
    metric = 'fmax'

    # NOTE(review): the original also bound ``select_candidate`` (via
    # ``eval``) and several tuning constants (ensemble/candidate limits,
    # accuracy weight, cluster count) as locals that nothing in this function
    # read.  They are removed here.  If ``selection`` expects them as module
    # globals they must be defined at module level -- confirm.
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])

    # ``method`` is fixed to 'enhanced', so the former single-seed 'greedy'
    # branch could never be taken; ten seeds are always used.
    seeds = range(10)

    predictions_dfs = []
    performance_dfs = []
    for seedval in seeds:
        for fold in range(fold_count):
            pred_df, perf_df = selection(fold, seedval, path)
            predictions_dfs.append(pred_df)
            performance_dfs.append(perf_df)

    performance_df = concat(performance_dfs)
    performance_df.to_csv(
        '%s/analysis/selection-%s-%s-iterations.csv' % (path, method, metric),
        index=False)

    predictions_df = concat(predictions_dfs)
    predictions_df['method'] = method
    predictions_df['metric'] = metric
    predictions_df.to_csv(
        '%s/analysis/selection-%s-%s.csv' % (path, method, metric),
        index=False)

    # The score is computed on the pooled predictions, not averaged per
    # (fold, seed) group.
    fmax = '%.3f' % common.fmax_score(predictions_df.label,
                                      predictions_df.prediction)
    return fmax, predictions_df.ensemble_size.mean()
# NOTE(review): this fragment opens inside a function whose ``def`` line is not
# visible here (the bare ``return`` below requires one).  The indentation of
# the leading part is reconstructed -- confirm against the full file.
    # Unpack one job description.
    working_dir, project_path, classifier, fold, bag = parameters
    # Files this job is expected to leave behind: one predictions file plus one
    # validation file per nested fold, all under
    # <project_path>/<first whitespace-separated token of classifier>/.
    expected_filenames = ['%s/%s/predictions-%s-%02i.csv.gz' % (project_path, classifier.split()[0], fold, bag)] + ['%s/%s/validation-%s-%02i-%02i.csv.gz' % (project_path, classifier.split()[0], fold, nested_fold, bag) for nested_fold in nested_fold_values]
    # Nothing to do when every expected file already exists.
    if sum(map(exists, expected_filenames)) == len(expected_filenames):
        return
    # Build the groovy command line for Pipeline.groovy.
    cmd = 'groovy -cp %s %s/Pipeline.groovy %s %s %s %s' % (classpath, working_dir, project_path, fold, bag, classifier)
    # With use_cluster set, prefix cluster_cmd and pass the command quoted.
    if use_cluster:
        cmd = '%s \"%s\"' % (cluster_cmd, cmd)
    system(cmd)

# ensure project directory exists (taken from the first command-line argument)
project_path = abspath(argv[1])
assert exists(project_path)

# load and parse project properties; both file names are relative to the
# project directory
p = load_properties(project_path)
classifiers_fn = '%s/%s' % (project_path, p['classifiersFilename'])
input_fn = '%s/%s' % (project_path, p['inputFilename'])
assert exists(input_fn)

# generate cross validation values for leave-one-value-out or k-fold:
# 'foldAttribute' takes precedence over 'foldCount' when both are present
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    # fold values are looked up under the fold attribute's name in the ARFF
    # headers of the input file
    headers = load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))
nested_fold_values = range(int(p['nestedFoldCount']))
bag_count = int(p['bagCount'])
# a bag count of 1 or less collapses to the single bag index 0
bag_values = range(bag_count) if bag_count > 1 else [0]
# NOTE(review): this fragment opens inside a routine whose header is not
# visible here.  It reads project_path/seed/size/metric, which are only
# assigned further down, so it is presumably a function body.  The indentation
# of the leading part is reconstructed -- confirm against the full file.
    print(len(y_true))
    # Append the final score line to the accumulated report text.
    string += ("final,%f\n" % fmax_score(y_true, y_score))
    # NOTE(review): `seed` fills both the ORDER%i directory and the seed%i
    # part of the file name; the ORDER directories created below cover
    # range(seeds), so this presumably requires seed < seeds -- confirm.
    filename = '%s/BASELINE/ORDER%i/BP_bp%i_seed%i_rnd.%s' % (
        project_path, seed, size, seed, metric)
    with open(filename, 'w') as f:
        f.write(string)
        # Redundant: the `with` block already closes the file on exit.
        f.close()
    print(filename)

# Project directory is "path/RL/<argv[1] with every '/' removed>".
project_path = "path/RL/%s" % argv[1].replace("/", "")
assert exists(project_path)
p = load_properties(project_path)
fold_count = int(p['foldCount'])
seeds = int(p['seeds'])
metric = p['metric']
# Second and third command-line arguments are integers.
size = int(argv[2])
seed = int(argv[3])
# Sorted list of the weka.classifiers.* directories inside the project.
dirnames = sorted(filter(isdir, glob('%s/weka.classifiers.*' % project_path)))
#print "Starting . . ."
# Create BASELINE/ and one ORDER<o> directory per value in range(seeds).
if not exists("%s/BASELINE/" % project_path):
    makedirs("%s/BASELINE/" % project_path)
for o in range(seeds):
    if not exists("%s/BASELINE/ORDER%i" % (project_path, o)):
        makedirs("%s/BASELINE/ORDER%i" % (project_path, o))
""" from os import mkdir from os.path import abspath, exists from sys import argv from common import load_properties from diversity import average_diversity_score from pandas import DataFrame, concat, read_csv from sklearn.metrics import mean_squared_error, roc_auc_score path = abspath(argv[1]) assert exists(path) if not exists('%s/analysis' % path): mkdir('%s/analysis' % path) p = load_properties(path) fold_count = int(p['foldCount']) dfs = [] for fold in range(fold_count): df = read_csv('%s/validation-%s.csv.gz' % (path, fold), index_col=[0, 1], compression='gzip') labels = df.index.get_level_values(1).values predictions = df.mean(axis=1) auc = roc_auc_score(labels, predictions) brier = mean_squared_error(labels, predictions) diversity = average_diversity_score(df.values) dfs.append( DataFrame({ 'auc': auc,
def stacked_generalization(fold):
    """Fit the module-level ``stacker`` on one fold and score its test split.

    Reads the fold with ``common.read_fold``.  When ``method`` is
    ``'aggregate'`` both splits are first passed through ``common.unbag``
    with ``bag_count``.  The stacker is fitted on the training split, and the
    second column of its ``predict_proba`` output on the test split is kept
    as the prediction.

    Returns a DataFrame with the columns ``fold``, ``id``, ``label``,
    ``prediction`` and ``diversity``.
    """
    fold_data = common.read_fold(path, fold)
    train_df, train_labels, test_df, test_labels = fold_data

    if method == 'aggregate':
        train_df, test_df = (common.unbag(train_df, bag_count),
                             common.unbag(test_df, bag_count))

    fitted = stacker.fit(train_df, train_labels)
    test_predictions = fitted.predict_proba(test_df)[:, 1]

    columns = {
        'fold': fold,
        'id': test_df.index.get_level_values('id'),
        'label': test_labels,
        'prediction': test_predictions,
        'diversity': common.diversity_score(test_df.values),
    }
    return DataFrame(columns)


# --- script section: configuration from the command line and properties ---
path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)

method = argv[2]
assert method in ['aggregate', 'standard']

p = common.load_properties(path)
fold_count, bag_count = int(p['foldCount']), int(p['bagCount'])

# Stacker choice:
#   * no 'predictClassValue' property -> non-negative least squares
#   * otherwise 'linear' as third argument -> linear stacker
#   * otherwise -> shallow non-linear stacker
if 'predictClassValue' not in p:
    stacker = NNLS()
elif len(argv) > 3 and argv[3] == 'linear':
    stacker = SGDClassifier(loss='log', n_iter=50, random_state=0)
else:
    stacker = RandomForestClassifier(
        n_estimators=200,
        max_depth=2,
        bootstrap=False,
        random_state=0,
    )

# One job per fold, then stack the per-fold frames.
predictions_dfs = Parallel(n_jobs=-1, verbose=1)(
    delayed(stacked_generalization)(fold) for fold in range(fold_count)
)
predictions_df = concat(predictions_dfs)
# NOTE(review): this fragment opens inside a function whose header is not
# visible here (the ``return`` below requires one).  The two append calls are
# presumably the tail of a loop body.  The indentation of the leading part is
# reconstructed -- confirm against the full file.
        # Record metrics of the current ensemble on both splits.
        train_metrics.append(eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    # Ensemble size of the row that get_best_performer picks from the
    # training metrics.
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    # Leading slice of the `ensemble` column, up to and including position
    # best_ensemble_size.
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    # Returns (final test-split evaluation of the chosen ensemble,
    #          concatenated per-step test metrics).
    return eval_metrics(test_df, best_ensemble, test_labels, indices, final = True), concat(test_metrics)

# Project directory comes from the first command-line argument.
path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
# Selection method comes from the second command-line argument.
method = argv[2]
assert method in set(['greedy', 'enhanced', 'drep', 'sdi'])
# Resolve the select_candidate_<method> function by name.  The eval input is
# restricted to the four names above by the assert -- but note that asserts
# are stripped under `python -O`.
select_candidate = eval('select_candidate_' + method)
p = load_properties(path)
fold_count = int(p['foldCount'])
# Module-level tuning constants (presumably read by the selection functions
# defined outside this view -- confirm).
initial_ensemble_size = 2
max_ensemble_size = 50
max_candidates = 50
max_diversity_candidates = 5
accuracy_weight = 0.5
index_labels = ['fold', 'seed']
best_dfs = []
iteration_dfs = []
# Ten seeds for 'enhanced'/'drep'/'sdi'; a single seed 0 for 'greedy'.
seeds = range(10) if method in set(['enhanced', 'drep', 'sdi']) else [0]
for seedval in seeds:
    # One selection job per fold; `seedval` is not passed as an argument, so
    # `selection` presumably reads it as a global -- confirm.
    results = Parallel(n_jobs = -1, verbose = 0)(delayed(selection)(fold) for fold in range(fold_count))
    for best_df, iteration_df in results:
        best_dfs.append(best_df)
# NOTE(review): this fragment opens inside the body of a ``try`` block whose
# header, enclosing loops and enclosing function are not visible here.  The
# nesting of the leading part is reconstructed from how bag_dfs and
# dirname_dfs are used -- confirm against the full file.
                    # Read one gzip-compressed predictions file, skipping its
                    # first row; the first two columns form the index.
                    df = read_csv(filename, skiprows=1, index_col=[0, 1], compression='gzip', engine='python')
                    # Keep only the 'prediction' column and rename it to
                    # "<classifier>.<bag>".
                    df = df[['prediction']]
                    df.rename(
                        columns={'prediction': '%s.%s' % (classifier, bag)},
                        inplace=True)
                    bag_dfs.append(df)
                # NOTE(review): a bare except also catches KeyboardInterrupt
                # and SystemExit; any failure above is reported with the same
                # message and then skipped.
                except:
                    print 'file not existed or crashed %s' % filename
            # Column-wise concatenation of the frames collected in bag_dfs.
            dirname_dfs.append(concat(bag_dfs, axis=1))
        # Column-wise concatenation of everything in dirname_dfs, sorted by
        # index and written to <path>/predictions-<fold>.csv.gz.
        concat(dirname_dfs, axis=1).sort_index().to_csv(
            '%s/predictions-%s.csv.gz' % (path, fold), compression='gzip')

# Data directory comes from the first command-line argument.
data_folder = abspath(argv[1])
# Last path component, split on '/'.
data_name = data_folder.split('/')[-1]
# Every sub-directory of the data folder except 'analysis' is treated as a
# feature folder.
fns = listdir(data_folder)
fns = [fn for fn in fns if fn != 'analysis']
fns = [data_folder + '/' + fn for fn in fns]
feature_folders = [fn for fn in fns if isdir(fn)]
p = load_properties(data_folder)
fold_count = int(p['foldCount'])
nested_fold_count = int(p['nestedFoldCount'])
# Bag count is clamped to at least 1.
bag_count = max(1, int(p['bagCount']))
# combine_individual is presumably the function the leading fragment belongs
# to -- confirm.
for path in feature_folders:
    combine_individual(path)
# NOTE(review): the first line below is the tail of a call that starts before
# this fragment (presumably a parser.add_argument(...) call -- confirm).
                    default='true', help='use HPC cluster or not')
# NOTE(review): the default is the string '5', not an int, and args.fold is
# not read anywhere in the visible part -- fold values below come from the
# properties instead.
parser.add_argument('--fold', '-F', default='5', help='number of cross-validation fold')
args = parser.parse_args()
### record starting time
start = time()
### get the data path
data_path = abspath(args.path)
# last path component, split on '/'
data_name = data_path.split('/')[-1]
# directory that contains the running script
working_dir = dirname(abspath(argv[0]))
### get weka properties from weka.properties
p = load_properties(data_path)
fold_values = range(int(p['foldCount']))
bag_values = range(int(p['bagCount']))
### get the list of base classifiers
classifiers_fn = data_path + '/classifiers.txt'
assert exists(classifiers_fn)
# Drop lines starting with '#', then strip surrounding whitespace.
# NOTE(review): the file object returned by open() is never closed explicitly.
classifiers = filter(lambda x: not x.startswith('#'), open(classifiers_fn).readlines())
classifiers = [_.strip() for _ in classifiers]
### get paths of the list of features
# every sub-directory of the data path except 'analysis'
fns = listdir(data_path)
fns = [fn for fn in fns if fn != 'analysis']
fns = [data_path + '/' + fn for fn in fns]
feature_folders = [fn for fn in fns if isdir(fn)]