def mean_fmax(path):
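    # fmax of the unweighted mean of all base-classifier predictions, pooled over folds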
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predict = test_df.mean(axis=1).values
        predictions += predict
        labels += label
    fmax = '%.3f' % (common.fmax_score(labels, predictions))
    return fmax
def bestbase_fmax(path):
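    # fmax of the single best base classifier, i.e. the column-wise maximum over pooled folds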
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    predictions = []
    labels = []
    for fold in range(fold_count):
        _, _, test_df, label = common.read_fold(path, fold)
        test_df = common.unbag(test_df, 10)
        predictions.append(test_df)
        labels += label
    predictions = concat(predictions)
    fmax_list = [
        common.fmax_score(labels, predictions[col].tolist())
        for col in list(predictions)
    ]
    return max(fmax_list)
def CES_fmax(path):
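    # fmax of ensemble selection (CES); returns (fmax, mean selected-ensemble size)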
    assert exists(path)
    if not exists('%s/analysis' % path):
        mkdir('%s/analysis' % path)
    method = 'enhanced'
    select_candidate = eval('select_candidate_' + method)
    method_function = selection
    p = common.load_properties(path)
    fold_count = int(p['foldCount'])
    initial_ensemble_size = 2
    max_ensemble_size = 50
    max_candidates = 50
    max_diversity_candidates = 5
    accuracy_weight = 0.5
    max_clusters = 20

    predictions_dfs = []
    performance_dfs = []
    seeds = [0] if method == 'greedy' else range(10)

    for seedval in seeds:
        for fold in range(fold_count):
            pred_df, perf_df = method_function(fold, seedval, path)
            predictions_dfs.append(pred_df)
            performance_dfs.append(perf_df)
    performance_df = concat(performance_dfs)
    performance_df.to_csv('%s/analysis/selection-%s-%s-iterations.csv' %
                          (path, method, 'fmax'),
                          index=False)
    predictions_df = concat(predictions_dfs)
    predictions_df['method'] = method
    predictions_df['metric'] = 'fmax'
    predictions_df.to_csv('%s/analysis/selection-%s-%s.csv' %
                          (path, method, 'fmax'),
                          index=False)
    fmax = '%.3f' % (common.fmax_score(predictions_df.label,
                                       predictions_df.prediction))
    #    fmax =  '%.3f' %predictions_df.groupby(['fold', 'seed']).apply(lambda x: common.fmax_score(x.label, x.prediction)).mean()
    #    print predictions_df.groupby(['fold', 'seed']).apply(lambda x: common.fmax_score(x.label, x.prediction))
    return fmax, predictions_df.ensemble_size.mean()
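
# A minimal usage sketch (an assumption, not part of the original file): compare the
# three fmax summaries for one project directory laid out as 'common' expects.
#
#   path = abspath(argv[1])
#   print('mean ensemble fmax: %s' % mean_fmax(path))
#   print('best base fmax:     %s' % bestbase_fmax(path))
#   ces, mean_size = CES_fmax(path)
#   print('CES fmax: %s (mean ensemble size: %.1f)' % (ces, mean_size))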
Example #4
    working_dir, project_path, classifier, fold, bag = parameters
    expected_filenames = ['%s/%s/predictions-%s-%02i.csv.gz' %
                          (project_path, classifier.split()[0], fold, bag)]
    expected_filenames += ['%s/%s/validation-%s-%02i-%02i.csv.gz' %
                           (project_path, classifier.split()[0], fold, nested_fold, bag)
                           for nested_fold in nested_fold_values]
    # skip this job if every expected output file already exists
    if all(map(exists, expected_filenames)):
        return
    cmd = 'groovy -cp %s %s/Pipeline.groovy %s %s %s %s' % (classpath, working_dir, project_path, fold, bag, classifier)
    if use_cluster:
        cmd = '%s \"%s\"' % (cluster_cmd, cmd)
    system(cmd)


# ensure project directory exists
project_path = abspath(argv[1])
assert exists(project_path)

# load and parse project properties
p = load_properties(project_path)
classifiers_fn = '%s/%s' % (project_path, p['classifiersFilename'])
input_fn = '%s/%s' % (project_path, p['inputFilename'])
assert exists(input_fn)

# generate cross validation values for leave-one-value-out or k-fold
assert ('foldAttribute' in p) or ('foldCount' in p)
if 'foldAttribute' in p:
    headers = load_arff_headers(input_fn)
    fold_values = headers[p['foldAttribute']]
else:
    fold_values = range(int(p['foldCount']))
nested_fold_values = range(int(p['nestedFoldCount']))
bag_count = int(p['bagCount'])
bag_values = range(bag_count) if bag_count > 1 else [0]
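
# Hypothetical continuation (not in this excerpt): load the classifier list and run one
# job per (classifier, fold, bag) combination; 'run_job' stands in for the truncated
# function definition at the top of this example.
#
#   working_dir = dirname(abspath(argv[0]))  # as in the later example
#   classifiers = [l.strip() for l in open(classifiers_fn) if l.strip() and not l.startswith('#')]
#   from itertools import product
#   for parameters in product([working_dir], [project_path], classifiers, fold_values, bag_values):
#       run_job(parameters)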
Example #5
        print(len(y_true))
    string += ("final,%f\n" % fmax_score(y_true, y_score))

    filename = '%s/BASELINE/ORDER%i/BP_bp%i_seed%i_rnd.%s' % (
        project_path, seed, size, seed, metric)

    # the with-statement closes the file; no explicit close() is needed
    with open(filename, 'w') as f:
        f.write(string)
    print(filename)


project_path = "path/RL/%s" % argv[1].replace("/", "")
assert exists(project_path)

p = load_properties(project_path)
fold_count = int(p['foldCount'])
seeds = int(p['seeds'])
metric = p['metric']
size = int(argv[2])
seed = int(argv[3])
dirnames = sorted(filter(isdir, glob('%s/weka.classifiers.*' % project_path)))

#print "Starting . . ."
if not exists("%s/BASELINE/" % project_path):
    makedirs("%s/BASELINE/" % project_path)

for o in range(seeds):
    if not exists("%s/BASELINE/ORDER%i" % (project_path, o)):
        makedirs("%s/BASELINE/ORDER%i" % (project_path, o))
Example #6
"""

from os import mkdir
from os.path import abspath, exists
from sys import argv

from common import load_properties
from diversity import average_diversity_score
from pandas import DataFrame, concat, read_csv
from sklearn.metrics import mean_squared_error, roc_auc_score

path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
p = load_properties(path)
fold_count = int(p['foldCount'])

dfs = []
for fold in range(fold_count):
    df = read_csv('%s/validation-%s.csv.gz' % (path, fold),
                  index_col=[0, 1],
                  compression='gzip')
    labels = df.index.get_level_values(1).values
    predictions = df.mean(axis=1)
    auc = roc_auc_score(labels, predictions)
    brier = mean_squared_error(labels, predictions)
    diversity = average_diversity_score(df.values)
    dfs.append(
        DataFrame({
            'auc': auc,
Example #7
def stacked_generalization(fold):
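    # fit the stacker on this fold's training predictions and score the held-out test set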
    train_df, train_labels, test_df, test_labels = common.read_fold(path, fold)
    if method == 'aggregate':
        train_df = common.unbag(train_df, bag_count)
        test_df = common.unbag(test_df, bag_count)
    test_predictions = stacker.fit(train_df, train_labels).predict_proba(test_df)[:, 1]
    return DataFrame({'fold': fold,
                      'id': test_df.index.get_level_values('id'),
                      'label': test_labels,
                      'prediction': test_predictions,
                      'diversity': common.diversity_score(test_df.values)})


path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
method = argv[2]
assert method in ['aggregate', 'standard']
p = common.load_properties(path)
fold_count = int(p['foldCount'])
bag_count = int(p['bagCount'])

# use non-negative least squares for regression
if 'predictClassValue' not in p:
    stacker = NNLS()
else:
    # use linear stacker if requested, else use shallow non-linear stacker
    if len(argv) > 3 and argv[3] == 'linear':
        stacker = SGDClassifier(loss = 'log', n_iter = 50, random_state = 0)
    else:
        stacker = RandomForestClassifier(n_estimators = 200, max_depth = 2, bootstrap = False, random_state = 0)

predictions_dfs = Parallel(n_jobs = -1, verbose = 1)(delayed(stacked_generalization)(fold) for fold in range(fold_count))
predictions_df = concat(predictions_dfs)
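
# Hypothetical follow-up (not shown in this excerpt): score and persist the stacked
# predictions, mirroring the analysis output written by the selection code above.
#
#   print('%.3f' % common.fmax_score(predictions_df.label, predictions_df.prediction))
#   predictions_df.to_csv('%s/analysis/stacking-%s.csv' % (path, method), index=False)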
Example #8
        train_metrics.append(eval_metrics(train_df, ensemble, train_labels, indices))
        test_metrics.append(eval_metrics(test_df, ensemble, test_labels, indices))
    train_metrics_df = concat(train_metrics)
    best_ensemble_size = get_best_performer(train_metrics_df).ensemble_size
    best_ensemble = train_metrics_df.ensemble[:best_ensemble_size + 1]
    return eval_metrics(test_df, best_ensemble, test_labels, indices, final = True), concat(test_metrics)


path = abspath(argv[1])
assert exists(path)
if not exists('%s/analysis' % path):
    mkdir('%s/analysis' % path)
method = argv[2]
assert method in set(['greedy', 'enhanced', 'drep', 'sdi'])
select_candidate = eval('select_candidate_' + method)
p = load_properties(path)
fold_count = int(p['foldCount'])
initial_ensemble_size = 2
max_ensemble_size = 50
max_candidates = 50
max_diversity_candidates = 5
accuracy_weight = 0.5

index_labels = ['fold', 'seed']
best_dfs = []
iteration_dfs = []
seeds = range(10) if method in set(['enhanced', 'drep', 'sdi']) else [0]
for seedval in seeds:
    results = Parallel(n_jobs = -1, verbose = 0)(delayed(selection)(fold) for fold in range(fold_count))
    for best_df, iteration_df in results:
        best_dfs.append(best_df)
Example #9
                    df = read_csv(filename,
                                  skiprows=1,
                                  index_col=[0, 1],
                                  compression='gzip',
                                  engine='python')
                    df = df[['prediction']]
                    df.rename(
                        columns={'prediction': '%s.%s' % (classifier, bag)},
                        inplace=True)
                    bag_dfs.append(df)
                except Exception:
                    print('missing or unreadable prediction file: %s' % filename)
            dirname_dfs.append(concat(bag_dfs, axis=1))
        concat(dirname_dfs, axis=1).sort_index().to_csv(
            '%s/predictions-%s.csv.gz' % (path, fold), compression='gzip')


data_folder = abspath(argv[1])
data_name = data_folder.split('/')[-1]
fns = listdir(data_folder)
fns = [fn for fn in fns if fn != 'analysis']
fns = [data_folder + '/' + fn for fn in fns]
feature_folders = [fn for fn in fns if isdir(fn)]

p = load_properties(data_folder)
fold_count = int(p['foldCount'])
nested_fold_count = int(p['nestedFoldCount'])
bag_count = max(1, int(p['bagCount']))
for path in feature_folders:
    combine_individual(path)
Example #10
                    default='true',
                    help='use HPC cluster or not')
parser.add_argument('--fold',
                    '-F',
                    default='5',
                    help='number of cross-validation fold')
args = parser.parse_args()
### record starting time
start = time()
### get the data path
data_path = abspath(args.path)
data_name = data_path.split('/')[-1]
working_dir = dirname(abspath(argv[0]))

### get weka properties from weka.properties
p = load_properties(data_path)
fold_values = range(int(p['foldCount']))
bag_values = range(int(p['bagCount']))

### get the list of base classifiers
classifiers_fn = data_path + '/classifiers.txt'
assert exists(classifiers_fn)
with open(classifiers_fn) as f:
    classifiers = [line.strip() for line in f if not line.startswith('#')]

### get paths of the list of features
fns = listdir(data_path)
fns = [fn for fn in fns if fn != 'analysis']
fns = [data_path + '/' + fn for fn in fns]
feature_folders = [fn for fn in fns if isdir(fn)]