def STACK_run():
    start_time = time.time()
    y_true = DataFrame(columns=['label'])
    y_score = DataFrame(columns=['prediction'])
    y_size = 0.0
    string = ""

    for fold in range(fold_count):
        classifiers = full_ensemble(project_path, size, fold, seed, metric)
        names = list(classifiers)  # column names, used by the pretty_print debug line below

        inner_y_true_train, inner_train_dataset = create_dataset_from_predictions_for_stacker(
            classifiers, seed, fold, "valid")
        inner_y_true, inner_test_dataset = create_dataset_from_predictions_for_stacker(
            classifiers, seed, fold, "test")

        "L1Log"
        stacker = LogisticRegression(random_state=0,
                                     penalty='l1',
                                     solver='liblinear')
        stacker.fit(inner_train_dataset, inner_y_true_train.values.ravel())

        #print("\tstacker.coef_\n\t", stacker.coef_)
        #print("Lasso model: ", pretty_print_linear(stacker.coef_[0], names, sort = False))
        inner_y_size = count_nonzero(stacker.coef_)
        y_size += inner_y_size

        inner_y_score_ndarray = stacker.predict_proba(inner_test_dataset)[:, 1]
        inner_y_score = DataFrame(inner_y_score_ndarray)
        inner_y_score.rename(columns={0: 'prediction'}, inplace=True)

        y_true = concat((y_true, inner_y_true), axis=0)
        y_true['label'] = to_numeric(y_true['label'])
        y_score = concat([y_score, inner_y_score], axis=0)
        string += (
            "fold_%i,%f::%i\n" %
            (fold, fmax_score(inner_y_true, inner_y_score), inner_y_size))

    y_size /= fold_count
    string += ("final,%f::%f\n" % (fmax_score(y_true, y_score), y_size))

    dst = "%s/STACK_RESULTS/ORDER%i/stack_bp%i_seed%i_%s.fmax" % (
        project_path, seed, size, seed, "L1Log")
    with open(dst, 'w') as f:
        f.write(string)

    seconds = time.time() - start_time
    print("\t%s (%s)" % (dst,
                         (time.strftime('%H:%M:%S', time.gmtime(seconds)))))
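Every example on this page scores with fmax_score, whose implementation is not reproduced here. The sketch below shows one standard way to compute the metric (the maximum F1 over all thresholds of the precision-recall curve) so the calls above have a concrete reference; the body is an assumption, not the project's actual helper.

# Hedged sketch of an F-max metric: maximum F-beta over all decision
# thresholds of the precision-recall curve. Mirrors the fmax_score call
# signature used above, but is NOT the project's verbatim implementation.
import numpy as np
from sklearn.metrics import precision_recall_curve

def fmax_score(y_true, y_score, beta=1.0):
    y_true = np.ravel(y_true)    # accept 1-column DataFrames as used above
    y_score = np.ravel(y_score)
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    with np.errstate(divide='ignore', invalid='ignore'):
        f = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)
    return np.nanmax(f)  # ignore the undefined 0/0 points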
Example #2
def selection(fold, seedval, path, agg):
    seed(seedval)
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    train_df = common.unbag(train_df, agg)
    test_df = common.unbag(test_df, agg)
    best_classifiers = train_df.apply(lambda x: common.fmax_score(
        train_labels, x)).sort_values(ascending=not common.greater_is_better)
    train_performance = []
    test_performance = []
    ensemble = []
    for i in range(min(max_ensemble_size, len(best_classifiers))):
        best_candidate = select_candidate_enhanced(train_df, train_labels,
                                                   best_classifiers, ensemble,
                                                   i)
        ensemble.append(best_candidate)
        train_performance.append(
            get_performance(train_df, ensemble, fold, seedval))
        test_performance.append(
            get_performance(test_df, ensemble, fold, seedval))
    train_performance_df = pd.DataFrame.from_records(train_performance)
    best_ensemble_size = common.get_best_performer(
        train_performance_df).ensemble_size.values
    best_ensemble = train_performance_df.ensemble[:best_ensemble_size.item(0) +
                                                  1]
    return get_predictions(
        test_df, best_ensemble, fold,
        seedval), pd.DataFrame.from_records(test_performance)
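selection relies on select_candidate_enhanced, which is not shown on this page. The sketch below is a plausible reconstruction, assuming Caruana-style greedy forward selection: seed the ensemble with the top individual classifiers, then add whichever randomly sampled candidate maximizes the training F-max of the would-be ensemble mean. The constants and the reliance on common.fmax_score are assumptions.

# Hedged sketch of select_candidate_enhanced (assumed, not verbatim):
# greedy forward selection with a random candidate pool.
from numpy.random import choice

def select_candidate_enhanced(train_df, train_labels, best_classifiers, ensemble, i,
                              initial_ensemble_size=2, max_candidates=50):
    if len(ensemble) >= initial_ensemble_size:
        # Sample a pool of candidates without replacement...
        candidates = choice(best_classifiers.index.values,
                            min(max_candidates, len(best_classifiers)),
                            replace=False)
        # ...and keep the one whose addition maximizes the ensemble-mean F-max.
        scores = [common.fmax_score(train_labels,
                                    train_df[list(ensemble) + [c]].mean(axis=1))
                  for c in candidates]
        return candidates[scores.index(max(scores))]
    # Seed the ensemble with the i-th best individual classifier.
    return best_classifiers.index.values[i]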
Example #3
def CES_fmax(path, fold_count=5, agg=1):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    method = 'enhanced'
    select_candidate = eval('select_candidate_' + method)  # resolve the strategy function by name
    method_function = selection
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20
    predictions_dfs = []
    performance_dfs = []
    seeds = range(agg)

    for seedval in seeds:
        for fold in range(fold_count):
            pred_df, perf_df = method_function(fold, seedval, path, agg)
            predictions_dfs.append(pred_df)
            performance_dfs.append(perf_df)
    performance_df = pd.concat(performance_dfs)
    performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' %
                          (path, method, 'fmax'), index=False)
    predictions_df = pd.concat(predictions_dfs)
    predictions_df['method'] = method
    predictions_df['metric'] = 'fmax'
    predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' %
                          (path, method, 'fmax'), index=False)
    fmax = '%.3f' % common.fmax_score(predictions_df.label, predictions_df.prediction)
    return float(fmax)
Example #4
def get_cluster_performance(labels, predictions, n_clusters, fold, seedval):
    return {
        'fold': fold,
        'seed': seedval,
        'score': common.fmax_score(labels, predictions),
        'n_clusters': n_clusters
    }
Example #5
def get_performance(df, ensemble, fold, seedval):
    labels = df.index.get_level_values('label').values
    predictions = df[ensemble].mean(axis=1)
    return {
        'fold': fold,
        'seed': seedval,
        'score': common.fmax_score(labels, predictions),
        'ensemble': ensemble[-1],
        'ensemble_size': len(ensemble)
    }
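get_performance (and the get_predictions helper it mirrors) expects df to carry the true labels in a MultiIndex level named 'label', with one score column per base classifier. A minimal frame with that shape, for illustration only; the 'id' level name and column names are hypothetical:

# Minimal illustration of the frame shape get_performance expects.
import pandas as pd

idx = pd.MultiIndex.from_tuples([(0, 1), (1, 0), (2, 1)], names=['id', 'label'])
df = pd.DataFrame({'clf_a': [0.9, 0.2, 0.8],
                   'clf_b': [0.7, 0.4, 0.6]}, index=idx)
# df.index.get_level_values('label').values  -> array([1, 0, 1])
# df[['clf_a', 'clf_b']].mean(axis=1) is the ensemble's mean prediction.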
Example #6
def RL_ens():
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        filename_fold = '%s/RL_OUTPUT/ORDER%s/bp%s_fold%s_seed%s_epsilon%s_pre%s_conv%s_exit%s_%s_%s_%s_start-%s.fmax' % (project_path, seed, size, fold, seed, epsilon, age, conv, exit, strategy, RULE, algo, start)
        ensemble = read_ens(filename_fold)
        ensemble_bps = get_ens_bps(ensemble, filename_fold)
        inner_y_true, inner_y_score = aggregate_predictions(ensemble_bps, set, fold, seed, RULE)
        y_true = concat([y_true, inner_y_true], axis=0)
        y_true['label'] = to_numeric(y_true['label'])
        y_score = concat([y_score, inner_y_score], axis=0)

        string += ("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))

    string += ("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/RL_RESULTS/ORDER%i/RL_bp%i_seed%i_epsilon%s_pre%s_conv%s_exit%s_%s_%s_%s_start-%s.fmax' % (project_path, seed, size, seed, epsilon, age, conv, exit, strategy, RULE, algo, start)

    with open(filename, 'w+') as f:
        f.write(string)
    print(filename)
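aggregate_predictions is used here and again in FULL_ens (Example #9) but is not reproduced on this page. A plausible sketch, assuming RULE names a simple combiner over the selected base predictors' saved scores; the body and the reuse of the helpers from Example #12 are assumptions:

# Hedged sketch of aggregate_predictions (assumed, not verbatim): pool the
# saved scores of the selected base predictors for one fold and combine
# them with a simple rule such as 'mean' or 'max'.
import numpy as np
from pandas import DataFrame

def aggregate_predictions(ensemble_bps, set_name, fold, seed, rule):
    scores, y_true = [], None
    for bp in ensemble_bps:
        path, bag, _ = get_predictor_path_bag_weight(bp)  # same helper Example #12 uses
        y_true, y_score = get_set_preds(path, set_name, bag, fold, seed)
        scores.append(np.ravel(y_score))
    combined = {'mean': np.mean, 'max': np.max}[rule](scores, axis=0)
    return y_true, DataFrame({'prediction': combined})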
Example #7
def bestbase_fmax(path, fold_count=5, agg=1):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, agg)
        predictions.append(test_df)
        labels = append(labels, label)  # numpy.append
    predictions = pd.concat(predictions)
    fmax_list = [common.fmax_score(labels, predictions.iloc[:, i])
                 for i in range(len(predictions.columns))]
    return max(fmax_list)
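bestbase_fmax, mean_fmax, and the earlier selection all call common.unbag before scoring. Its definition is not included here; the sketch below shows the behavior these call sites appear to assume, namely averaging the agg bagged prediction columns of each base classifier into one column per classifier. The column-naming convention is an assumption.

# Hedged sketch of common.unbag (assumed behavior, not verbatim): collapse
# each run of `bag_count` bagged columns into their mean, keeping one
# column per base classifier.
import pandas as pd

def unbag(df, bag_count):
    starts = range(0, df.shape[1], bag_count)
    cols = [df.iloc[:, i:i + bag_count].mean(axis=1) for i in starts]
    out = pd.concat(cols, axis=1)
    out.columns = [df.columns[i].split('.')[0] for i in starts]
    return out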
Example #8
def mean_fmax(path, fold_count=5, agg=1):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, agg)
        predict = test_df.mean(axis=1).values
        predictions = append(predictions, predict)  # numpy.append
        labels = append(labels, label)
    fmax = '%.3f' % common.fmax_score(labels, predictions)
    return float(fmax)
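Hypothetical usage of the three baseline scorers defined above (CES_fmax from Example #3, bestbase_fmax from Example #7, mean_fmax from Example #8); '/path/to/project' is a placeholder for a directory laid out the way common.read_fold expects:

# Hypothetical usage; the path is a placeholder, not a real project.
path = '/path/to/project'
print('Mean F-max:      %s' % mean_fmax(path, fold_count=5, agg=1))
print('Best base F-max: %s' % bestbase_fmax(path, fold_count=5, agg=1))
print('CES F-max:       %s' % CES_fmax(path, fold_count=5, agg=1))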
Example #9
def FULL_ens():
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        ensemble_bps = full_ensemble(project_path, size, fold, seed, metric)
        inner_y_true, inner_y_score = aggregate_predictions(
            ensemble_bps, "test", fold, seed, RULE)
        # Name the score column before concatenating so it lines up with
        # the accumulator's 'prediction' column.
        inner_y_score = DataFrame(inner_y_score)
        inner_y_score.rename(columns={0: 'prediction'}, inplace=True)
        y_true = concat([y_true, inner_y_true], axis=0)
        y_true['label'] = to_numeric(y_true['label'])
        y_score = concat([y_score, inner_y_score], axis=0)
        string += ("fold_%i,%f\n" %
                   (fold, fmax_score(inner_y_true, inner_y_score)))
    string += ("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/BASELINE/ORDER%i/FE_bp%i_seed%i_%s_rnd.fmax' % (
        project_path, seed, size, seed, RULE)

    with open(filename, 'w') as f:
        f.write(string)
    print(filename)
Example #10
def BEST_bp():
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        ensemble_bps = full_ensemble(project_path, size, fold, seed, metric)
        inner_y_true, inner_y_score = get_max_predictions(
            ensemble_bps, seed, fold, "test")
        y_true = concat([y_true, inner_y_true], axis=0)
        y_true['label'] = to_numeric(y_true['label'])
        y_score = concat([y_score, inner_y_score], axis=0)
        string += ("fold_%i,%f\n" %
                   (fold, fmax_score(inner_y_true, inner_y_score)))
        print("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))
        print(len(y_true))
    string += ("final,%f\n" % fmax_score(y_true, y_score))

    filename = '%s/BASELINE/ORDER%i/BP_bp%i_seed%i_rnd.%s' % (
        project_path, seed, size, seed, metric)

    with open(filename, 'w') as f:
        f.write(string)
    print(filename)
Example #11
def mean_fmax(path):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predict = test_df.mean(axis=1).values
        predictions.extend(predict)
        labels.extend(label)
    fmax = '%.3f' % (common.fmax_score(labels, predictions))
    return fmax
Example #12
def get_max_predictions(predictors, seed, fold, set_name):
    # Track both the path and the bag of the highest-weight base predictor,
    # so the prediction lookup below uses a matching (path, bag) pair.
    max_p = ''
    max_bag = None
    max_w = 0

    for bp in predictors:
        path, bag, weight = get_predictor_path_bag_weight(bp)
        if weight > max_w:
            max_w = weight
            max_p = path
            max_bag = bag

    y_true, y_score = get_set_preds(max_p, set_name, max_bag, fold, seed)
    return (y_true, y_score)
Example #13
def main(path, fold_count=5, agg=1):
    dn = abspath(path).split('/')[-1]
    cols = ['data_name', 'fmax', 'method']
    dfs = []
    print('[CES] Start building model #################################')
    ces = CES_fmax(path, fold_count, agg)
    print('[CES] Finished evaluating model ############################')
    print('[CES] F-max score is %s.' % ces)
    print('[Mean] Start building model ################################')
    mean = mean_fmax(path, fold_count, agg)
    print('[Mean] Finished evaluating model ###########################')
    print('[Mean] F-max score is %s.' % mean)
    print('[Best Base] Start building model ###########################')
    bestbase = bestbase_fmax(path, fold_count, agg)
    print('[Best Base] Finished evaluating model ######################')
    print('[Best Base] F-max score is %s.' % bestbase)
    dfs.append(pd.DataFrame(data=[[dn, ces, 'CES']], columns=cols, index=[0]))
    dfs.append(pd.DataFrame(data=[[dn, mean, 'Mean']], columns=cols, index=[0]))
    dfs.append(pd.DataFrame(data=[[dn, bestbase, 'best base']], columns=cols, index=[0]))
    # Get Stacking Fmax scores (the SVC keeps only its non-default arguments)
    stackers = [
        RandomForestClassifier(n_estimators=200, max_depth=2,
                               bootstrap=False, random_state=0),
        SVC(kernel='linear', gamma='auto', probability=True),
        GaussianNB(),
        LogisticRegression(),
        AdaBoostClassifier(),
        DecisionTreeClassifier(),
        GradientBoostingClassifier(loss='deviance'),
        KNeighborsClassifier(),
    ]
    stacker_names = ["RF.S", "SVM.S", "NB.S", "LR.S", "AB.S", "DT.S", "LB.S", "KNN.S"]
    for stacker_name, stacker in zip(stacker_names, stackers):
        print('[%s] Start building model ################################' % stacker_name)
        predictions_dfs = [stacked_generalization(path, stacker_name, stacker, fold, agg)
                           for fold in range(fold_count)]
        predictions_df = pd.concat(predictions_dfs)
        fmax = common.fmax_score(predictions_df.label, predictions_df.prediction)
        print('[%s] Finished evaluating model ###########################' % stacker_name)
        print('[%s] F-max score is %s.' % (stacker_name, fmax))
        df = pd.DataFrame(data=[[dn, fmax, stacker_name]], columns=cols, index=[0])
        dfs.append(df)
    dfs = pd.concat(dfs)
    # Save results
    print('Saving results #############################################')
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    dfs.to_csv('%s/analysis/performance.csv' % path, index=False)
Example #14
def CES_fmax(path):
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    method = 'enhanced'
    select_candidate = eval('select_candidate_' + method)  # resolve the strategy function by name
    method_function = selection
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20

    predictions_dfs = []
    performance_dfs = []
    seeds = [0] if method == 'greedy' else range(10)

    for seedval in seeds:
        for fold in range(fold_count):
            pred_df, perf_df = method_function(fold, seedval, path)
            predictions_dfs.append(pred_df)
            performance_dfs.append(perf_df)
    performance_df = concat(performance_dfs)
    performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' %
                          (path, method, 'fmax'),
                          index=False)
    predictions_df = concat(predictions_dfs)
    predictions_df['method'] = method
    predictions_df['metric'] = 'fmax'
    predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' %
                          (path, method, 'fmax'),
                          index=False)
    fmax = '%.3f' % (common.fmax_score(predictions_df.label,
                                       predictions_df.prediction))
    #    fmax =  '%.3f' %predictions_df.groupby(['fold', 'seed']).apply(lambda x: common.fmax_score(x.label, x.prediction)).mean()
    #    print predictions_df.groupby(['fold', 'seed']).apply(lambda x: common.fmax_score(x.label, x.prediction))
    return fmax, predictions_df.ensemble_size.mean()
Example #15
best_base_fmax = besides_stack.bestbase_fmax(path)
method = 'aggregate'  # Set default method for baggings to be aggregate
cols = [
    'GO', 'category', '#pos', '#neg', '#total', '#+/#-', 'fmax',
    'metaClassifier', 'best_base_fmax', 'best_base_learner', 'CES_size'
]
dfs = []

# Get Stacking Fmax scores
for i, (stacker_name, stacker) in enumerate(zip(stacker_names, stackers)):
    predictions_dfs = [
        stacked_generalization(stacker, fold) for fold in range(fold_count)
    ]
    predictions_df = concat(predictions_dfs)
    predictions_df['method'] = method
    fmax = common.fmax_score(predictions_df.label, predictions_df.prediction)
    df = DataFrame(data=[[
        go_term, category, posNum, negNum, totalNum, pn_ratio, fmax,
        stacker_name, best_base_fmax, best_base_clsf, ces_size
    ]],
                   columns=cols,
                   index=[0])
    dfs.append(df)

dfs.append(
    DataFrame(data=[[
        go_term, category, posNum, negNum, totalNum, pn_ratio, ces_fmax, 'CES',
        best_base_fmax, best_base_clsf, ces_size
    ]],
              columns=cols,
              index=[0]))
Example #16
path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
p = common.load_properties(path)
input_fn = '%s/%s' % (path, p['inputFilename'])
assert exists(input_fn)

# generate cross validation values for leave-one-value-out or k-fold
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    headers = common.load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))

stacker = LogisticRegression()

perf_df = []
for fold in fold_values:
    prediction_df = stacked_generalization(fold)
    if prediction_df is not None:
        prediction_df.to_csv('%s/analysis/%s-predictions.csv' % (path, fold))
        fmax = common.fmax_score(prediction_df.label.tolist(),
                                 prediction_df.prediction.tolist())
        perf_df.append(DataFrame(data=[[path.split('/')[-1], fold, fmax]],
                                 columns=['data', 'fold', 'fmax'], index=[0]))

# Get the F-max value for each fold, and dump it to local disk
perf_df = concat(perf_df)
perf_df.to_csv(path + '/analysis/%s_fmax.csv' % path.split('/')[-1])
Example #17
                for bag in bag_list:
                    print("fold = %i seed = %s, bag = %s" % (fold, seed, bag))
                    x1 = DataFrame(columns=["label"])
                    x2 = DataFrame(columns=["prediction"])
                    #for fold in range(fold_count):
                    filename = '%s/valid-b%i-f%s-s%i.csv.gz' % (dirname, bag,
                                                                fold, seed)
                    print(filename)
                    df = read_csv(filename, skiprows=1, compression='gzip')
                    y_true = df.iloc[:, 1:2]   # label column
                    y_score = df.iloc[:, 2:3]  # prediction column
                    x1 = df.iloc[:, 1:2]
                    x2 = df.iloc[:, 2:3]
                    #x1 = concat([x1, y_true], axis = 0)
                    #x2 = concat([x2, y_score], axis = 0)
                    f_max_score = fmax_score(y_true, y_score)
                    #print f_max_score
                    #f1_score = f_score(x1,x2)
                    #print f1_score

                    if metric == "fmax":
                        dir_dict["%s_bag%i" % (dirname, bag)] = fmax_score(
                            x1, x2)
                    #if metric == "f1score":
                    #	dir_dict["%s_bag%i" % (dirname, bag)] = f_score(x1,x2)
                    #if metric == "auROC":
                    #	dir_dict ["%s_bag%i" % (dirname, bag)] = roc_auc_score(x1,x2)
                    # Sort predictors by score (descending), breaking ties by name
                    d_sorted_by_value = OrderedDict(
                        sorted(dir_dict.items(), key=lambda x: (-x[1], x[0])))
            for key, v in d_sorted_by_value.items():
                #for key, v in dir_dict.items():