Ejemplo n.º 1
0
def CES_run():
    """Select an ensemble for the current fold and write its scores to disk.

    Works entirely from module-level settings (``project_path``, ``seed``,
    ``metric``, ``size``, ``fold``, ``RULE``, ``init_ens_size``):

    1. load the base predictors with ``get_bps``;
    2. seed the ensemble with ``init_ens_size`` picks from
       ``select_top_classifier``;
    3. let ``find_ensemble`` extend it, recording every candidate ensemble
       and its score in ``set_ensembles``;
    4. keep the best-scoring recorded ensemble (or the seed ensemble when
       nothing was recorded);
    5. score it on the "valid" and "test" sets and write a one-fold report
       to ``<project_path>/CES_OUTPUT/ORDER<seed>/...fmax``.

    Returns nothing; the report file and a printed "<path> (<elapsed>)" line
    are the only outputs.
    """
    start_time = time.time()
    classifiers, bps_weight = get_bps(project_path, seed, metric, size)
    # Snapshot of the full predictor list: `classifiers` itself is handed to
    # the selection helpers below, so positions are reported against this copy.
    original = deepcopy(classifiers)
    set_ensembles = {}
    ensemble = [
        select_top_classifier(classifiers, seed, fold, RULE)
        for _ in range(init_ens_size)
    ]
    find_ensemble(ensemble, classifiers, seed, fold, RULE, set_ensembles)

    if set_ensembles:
        # Keys are ensembles, values their scores: take the best-scoring key.
        sel_ens = max(set_ensembles, key=set_ensembles.get)
    else:
        sel_ens = ensemble

    # 1-based positions (in the original ordering) of the selected predictors.
    actual = [pos for pos, bp in enumerate(original, 1) if bp in sel_ens]

    val_score = fmax_score(
        *aggregate_predictions(sel_ens, seed, fold, "valid", RULE))
    test_score = fmax_score(
        *aggregate_predictions(sel_ens, seed, fold, "test", RULE))

    seconds = time.time() - start_time
    elapsed = time.strftime('%H:%M:%S', time.gmtime(seconds))
    report = "Fold_%i (val = %f) (test = %f) :: (%s) [%s]\n%s" % (
        fold, val_score, test_score, ", ".join(str(a) for a in actual),
        elapsed, bps2string(original))
    dst = '%s/CES_OUTPUT/ORDER%i/bp%i_fold%i_seed%i_%s_start-%s.fmax' % (
        project_path, seed, size, fold, seed, RULE, init_ens_size)
    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(dst, 'wb') as f:
        f.write(report)
    # Single-argument print(...) parses as both a Python 2 statement and a
    # Python 3 call, and prints the same text.
    print("\t%s (%s)" % (dst, elapsed))
Ejemplo n.º 2
0
def CES_ens():
    """Evaluate the per-fold selected ensembles on the test set and save a CSV.

    For every fold in ``range(fold_count)`` the fold's result file under
    ``CES_OUTPUT`` is read back (``get_fold_ens`` / ``get_ens_bps``), the
    ensemble's "test" predictions are aggregated, and a ``fold_<i>,<fmax>``
    line is produced.  Labels and predictions of all folds are concatenated
    and scored once more for the closing ``final,<fmax>`` line.

    All settings (``project_path``, ``seed``, ``size``, ``RULE``, ``start``,
    ``metric``, ``fold_count``) are module-level names.  The summary is
    written to ``<project_path>/CES_RESULTS/ORDER<seed>/...`` and the path is
    printed together with the elapsed wall-clock time.
    """
    start_time = time.time()

    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    lines = []
    for fold in range(fold_count):
        filename_fold = '%s/CES_OUTPUT/ORDER%s/bp%s_fold%s_seed%s_%s_start-%s.%s' % (
            project_path, seed, size, fold, seed, RULE, start, metric)
        ensemble = get_fold_ens(filename_fold)
        ensemble_bps = get_ens_bps(ensemble, filename_fold)
        inner_y_true, inner_y_score = aggregate_predictions(
            ensemble_bps, seed, fold, "test", RULE)
        # Pool this fold's test labels/predictions for the overall score.
        y_true = concat([y_true, inner_y_true], axis=0)
        y_score = concat([y_score, inner_y_score], axis=0)
        lines.append("fold_%i,%f\n" %
                     (fold, fmax_score(inner_y_true, inner_y_score)))
    lines.append("final,%f\n" % fmax_score(y_true, y_score))

    dst = '%s/CES_RESULTS/ORDER%i/CES_bp%i_seed%i_%s_start-%s.%s' % (
        project_path, seed, size, seed, RULE, start, metric)
    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(dst, 'wb') as f:
        f.write("".join(lines))
    elapsed = time.strftime('%H:%M:%S', time.gmtime(time.time() - start_time))
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print("\t%s (%s)" % (dst, elapsed))
Ejemplo n.º 3
0
 def getTestPerf(self, node):
     """Return the fmax score of ensemble `node` on the 'test' predictions.

     `node` is a sequence of base-predictor indices.  The test prediction of
     every member is multiplied by its entry in ``self.world.bps_weight``,
     the products are summed and divided by the sum of those weights, and the
     result is scored with ``fmax_score`` against the 'label' column fetched
     for the first member.

     The node ``(0,)`` is a fallback case: it is replaced by the explored
     single-predictor state with the highest ``self.world.perf`` value.
     """
     if node == (0, ):
         # Original author's note: RL_pessimistic was probably trained for
         # too little time, so all ensembles of size 2 perform worse than the
         # individual base predictors.  Fall back to the best explored
         # singleton instead.
         singletons = [tuple([y]) for y in range(1, self.world.np + 1)]
         force_picks = [
             val for val in singletons if val in self.getExploredStates()
         ]
         select_from = {x: self.world.perf[x] for x in force_picks}
         node = tuple([max(select_from, key=select_from.get)[0]])

     members = list(node)
     weights = self.world.bps_weight

     # Weighted sum of the members' test predictions, first member first.
     first = members[0]
     test_pred = self.get_fold_probs(first, 'test',
                                     'prediction') * weights[first]
     for index in members[1:]:
         test_pred = test_pred.add(
             self.get_fold_probs(index, 'test', 'prediction') *
             weights[index])

     # Normalise by the total weight of the members.
     denom = sum([weights[index] for index in members])
     test_pred = test_pred / denom

     test_labels = self.get_fold_probs(node[0], 'test', 'label')
     return fmax_score(test_labels, test_pred)
Ejemplo n.º 4
0
def FULL_ens():
    """Score the full ensemble on every fold's test set and save the results.

    For each fold in ``range(fold_count)`` the ensemble returned by
    ``full_ensemble()`` is aggregated on the "test" set, a
    ``fold_<i>,<fmax>`` line is produced, and the number of rows pooled so
    far is printed.  A closing ``final,<fmax>`` line scores the labels and
    predictions pooled over all folds.  The text is written to
    ``<results_path>/BASELINE/ORDER<seed>/FE_bp<size>_seed<seed>_<RULE>.fmax``
    and the file name is printed.

    ``fold_count``, ``seed``, ``size``, ``RULE`` and ``results_path`` are
    module-level names.
    """
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    lines = []
    for fold in range(fold_count):
        ensemble_bps = full_ensemble()
        inner_y_true, inner_y_score = aggregate_predictions(
            ensemble_bps, seed, fold, "test")
        y_true = concat([y_true, inner_y_true], axis=0)
        y_score = concat([y_score, inner_y_score], axis=0)
        # These two statements were tab-indented in the original; a tab counts
        # as 8 columns on Python 2, which places them inside the loop.
        lines.append("fold_%i,%f\n" %
                     (fold, fmax_score(inner_y_true, inner_y_score)))
        # `length` is not a Python builtin; `len` gives the pooled row count.
        print(len(y_true))
    lines.append("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/BASELINE/ORDER%i/FE_bp%i_seed%i_%s.fmax' % (
        results_path, seed, size, seed, RULE)

    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(filename, 'wb') as f:
        f.write("".join(lines))
    print(filename)
Ejemplo n.º 5
0
def select_top_classifier(classifiers, seed, fold, RULE):
    """Remove and return the classifier with the best validation fmax.

    Each classifier is scored on its own: it is wrapped in a one-element
    list, aggregated on the "valid" set and passed to ``fmax_score``.  The
    entry at the ``argmax`` of those scores is removed from `classifiers`
    (the list is modified in place) and returned.
    """
    scores = []
    for candidate in classifiers:
        solo_predictions = aggregate_predictions([candidate], seed, fold,
                                                 "valid", RULE)
        scores.append(fmax_score(*solo_predictions))

    best_position = argmax(scores)
    winner = classifiers[best_position]
    classifiers.remove(winner)
    return winner
Ejemplo n.º 6
0
    def calc_perf(self, node):
        """Return the fmax score of ensemble `node`, caching it in self.perf.

        A node already present in ``self.perf`` is answered from the cache.
        Otherwise the weighted predictions of its members are summed into
        column ``self.np + 1`` of ``self.bps_weighted_pred_df``, divided by
        the sum of the members' ``self.bps_weight`` entries and scored
        against the 'label' column.

        Two ways of building the sum exist:

        * incremental: when `node` is longer than ``self.cwan`` and differs
          from it by exactly one element, that element's column is added to
          the running sum already stored in column ``self.np + 1``;
        * from scratch: otherwise the sum is rebuilt from the members'
          columns.

        Either way ``self.cwan`` is set to `node` afterwards.
        """
        if node in self.perf:
            return self.perf[node]

        frame = self.bps_weighted_pred_df
        running_col = self.np + 1

        # NOTE(review): the last clause compares against a list literal; if
        # nodes are tuples it is always true -- confirm the intended check.
        extends_cwan_by_one = (len(node) > len(self.cwan)
                               and len(setdiff1d(node, self.cwan)) == 1
                               and node != ['exit'])

        if extends_cwan_by_one:
            start_time = time.time()  # taken but never read in this method
            added_bp = setdiff1d(node, self.cwan)[0]
            frame[running_col] = frame[running_col].add(frame[added_bp])
            self.cwan = node

            denom = sum([self.bps_weight[index] for index in list(node)])
            # Separate variable: the running column must stay the un-normalised
            # numerator for later incremental updates.
            y_score = frame[running_col] / denom
            performance = fmax_score(frame['label'], y_score)
            self.perf[node] = performance

            if len(node) == self.np:
                self.reset_cwan_col()
            return performance

        # De novo: rebuild the weighted sum from the members' own columns.
        start_time = time.time()  # taken but never read in this method
        members = list(node)
        y_score = frame[members[0]]
        for index in members[1:]:
            y_score = y_score.add(frame[index])
        frame[running_col] = y_score
        self.cwan = node

        denom = sum([self.bps_weight[index] for index in list(node)])
        y_score = y_score / denom
        performance = fmax_score(frame['label'], y_score)
        self.perf[node] = performance
        return performance
Ejemplo n.º 7
0
def FULL_ens(parameters):
    """Score the ensemble of all base predictors and save per-fold results.

    Args:
        parameters: a ``(size, seed)`` pair.

    For each fold in ``range(fold_count)`` the first element returned by
    ``get_bps(project_path, seed, metric, size)`` is aggregated on the "test"
    set with ``RULE`` and a ``fold_<i>,<fmax>`` line is produced.  A closing
    ``final,<fmax>`` line scores the labels and predictions pooled over all
    folds.  The text is written to
    ``<project_path>/<directory>/<subdirectory><seed>/FE_bp<size>_seed<seed>_<RULE>.fmax``
    and the file name is printed.

    ``fold_count``, ``project_path``, ``metric``, ``RULE``, ``directory`` and
    ``subdirectory`` are module-level names.
    """
    size, seed = parameters

    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    lines = []
    for fold in range(fold_count):
        ensemble_bps = get_bps(project_path, seed, metric, size)[0]
        inner_y_true, inner_y_score = aggregate_predictions(
            ensemble_bps, seed, fold, "test", RULE)
        # Pool this fold's test labels/predictions for the overall score.
        y_true = concat([y_true, inner_y_true], axis=0)
        y_score = concat([y_score, inner_y_score], axis=0)
        lines.append("fold_%i,%f\n" %
                     (fold, fmax_score(inner_y_true, inner_y_score)))
    lines.append("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/%s/%s%i/FE_bp%i_seed%i_%s.fmax' % (
        project_path, directory, subdirectory, seed, size, seed, RULE)

    # The with-block closes the file; no explicit close() is needed afterwards.
    with open(filename, 'wb') as f:
        f.write("".join(lines))
    # Single-argument print(...) behaves identically on Python 2 and 3.
    print(filename)
Ejemplo n.º 8
0
def find_ensemble(ensemble, classifiers, seed, fold, RULE, set_ensembles):
    """Greedily extend `ensemble` from `classifiers`, recording each step.

    While candidates remain and the ensemble has not reached the module-level
    ``max_ens_size``, one step is performed:

    * the remaining classifiers are shuffled (``random.choice`` with
      ``replace=False``) and handed, with the current ensemble, to
      ``get_potential_ensembles``;
    * every potential ensemble is scored with ``fmax_score`` on its
      aggregated "valid" predictions;
    * the last member of the best-scoring potential ensemble is appended to
      `ensemble` and removed from `classifiers`;
    * ``set_ensembles[tuple(copy of ensemble)]`` is set to that best score.

    Args:
        ensemble: list of selected classifiers; extended in place.
        classifiers: list of remaining candidates; shrunk in place.
        seed, fold, RULE: forwarded to ``aggregate_predictions``.
        set_ensembles: dict updated in place with one entry per step.

    Returns:
        The same `ensemble` list.  The previous recursive version returned it
        only when no step was needed and ``None`` otherwise, because the
        recursive call's result was dropped; callers that ignore the return
        value are unaffected.
    """
    # Iterating instead of recursing keeps the depth independent of the
    # number of candidates.
    while len(classifiers) != 0 and len(ensemble) != max_ens_size:
        potential_ensembles = get_potential_ensembles(
            ensemble,
            random.choice(classifiers, len(classifiers), replace=False))
        scores = [
            fmax_score(*aggregate_predictions(pe, seed, fold, "valid", RULE))
            for pe in potential_ensembles
        ]
        # The newly added member is the last element of the winning candidate.
        chosen = potential_ensembles[argmax(scores)][-1]
        ensemble.append(chosen)
        set_ensembles[tuple(deepcopy(ensemble))] = max(scores)
        classifiers.remove(chosen)
    return ensemble
Ejemplo n.º 9
0
def get_max_predictions(predictors, seed, fold, set):
    """Return the test-set labels and scores of the highest-weight predictor.

    Every predictor is parsed with ``get_path_bag_weight``; the one with the
    largest weight (strictly greater than 0, first one wins ties) is loaded
    through ``get_set_preds``.

    Args:
        predictors: non-empty sequence of predictor identifiers.
        seed, fold: forwarded to ``get_set_preds``.
        set: accepted for interface compatibility but ignored -- the 'test'
            set is always loaded, as in the original code.

    Returns:
        ``(y_true, y_score, label)`` where ``label`` is
        ``'<path>_bag<weight>'``.

    Raises:
        IndexError: if `predictors` is empty.
    """
    if len(predictors) == 0:
        raise IndexError("get_max_predictions needs at least one predictor")

    max_p = ''
    max_w = 0
    max_bag = None
    for bp in predictors:
        path, bag, weight = get_path_bag_weight(bp)
        if weight > max_w:
            # Remember the bag together with the path: previously the bag of
            # whichever predictor was parsed last was used for loading.
            max_w = weight
            max_p = path
            max_bag = bag

    if max_bag is None:
        # No predictor had a weight above 0; keep the earlier behaviour of
        # falling through with an empty path and the last parsed bag.
        max_bag = bag

    # The requested `set` is deliberately not used; see the docstring.
    split = 'test'
    y_true, y_score = get_set_preds(max_p, split, max_bag, fold, seed)
    # NOTE(review): the label puts the weight, not the bag, after "_bag";
    # left unchanged because callers may rely on the current text.
    return (y_true, y_score, ('%s_bag%s' % (max_p, max_w)))
Ejemplo n.º 10
0
def aggregate_predictions(predictors, seed, fold, set):
    """Combine the predictions of several base predictors into one score.

    With the module-level ``RULE == 'WA'`` the result is the weighted
    average ``sum(w_i * score_i) / sum(w_i)``, using the weight returned by
    ``get_path_bag_weight``.  With any other rule it is the plain mean
    ``sum(score_i) / n``.  (Previously the non-'WA' path still multiplied the
    first predictor by its weight and divided by ``1 + sum of the remaining
    weights``, which is neither of the two.)

    Args:
        predictors: non-empty sequence of predictor identifiers.
        seed, fold: forwarded to ``get_set_preds``.
        set: accepted for interface compatibility but ignored -- the 'test'
            set is always loaded, as in the original code.

    Returns:
        ``(y_true, y_score)``; ``y_true`` is the label frame returned for the
        last predictor.

    Raises:
        IndexError: if `predictors` is empty.
    """
    if len(predictors) == 0:
        raise IndexError("aggregate_predictions needs at least one predictor")

    # NOTE(review): the caller's `set` is overridden here exactly as before;
    # confirm whether scoring on other sets through this function is intended.
    split = 'test'
    weighted = RULE == 'WA'

    denom = 0
    y_true = None
    y_score = None
    for bp in predictors:
        path, bag, weight = get_path_bag_weight(bp)
        y_true, y_score_current = get_set_preds(path, split, bag, fold, seed)
        if weighted:
            contribution = weight * y_score_current
            denom += weight
        else:
            contribution = y_score_current
            denom += 1
        y_score = contribution if y_score is None else y_score.add(contribution)

    y_score = y_score / denom
    return (y_true, y_score)
Ejemplo n.º 11
0
    # Score every (dirname, bag) base predictor on its validation predictions
    # and write the ranking to the per-seed order file.  This is the tail of a
    # function whose header is not shown here; project_path, seed, metric,
    # dirnames, bag_list, fold_count and dir_dict come from the enclosing scope.
    order_fn = '%s/ENSEMBLES/order_of_seed%i_%s.txt' % (project_path, seed,
                                                        metric)
    with open(order_fn, 'wb') as order_file:
        for dirname in dirnames:
            for bag in bag_list:
                # Fresh accumulators for this (dirname, bag) pair.
                x1 = DataFrame(columns=["label"])
                x2 = DataFrame(columns=["prediction"])
                for fold in range(fold_count):
                    filename = '%s/valid-b%i-f%s-s%i.csv.gz' % (dirname, bag,
                                                                fold, seed)
                    df = read_csv(filename, skiprows=1, compression='gzip')
                    # NOTE(review): `.ix` has been removed from recent pandas
                    # releases -- confirm the pandas version this targets.
                    y_true = df.ix[:, 1:2]
                    y_score = df.ix[:, 2:3]
                # NOTE(review): these two concatenations sit outside the fold
                # loop, so only the y_true / y_score of the last fold read are
                # appended -- confirm whether they were meant to run per fold.
                x1 = concat([x1, y_true], axis=0)
                x2 = concat([x2, y_score], axis=0)

                # Only "fmax" and "auROC" store a score; any other metric
                # leaves dir_dict without an entry for this pair.
                if metric == "fmax":
                    dir_dict["%s_bag%i" % (dirname, bag)] = fmax_score(x1, x2)
                if metric == "auROC":
                    dir_dict["%s_bag%i" % (dirname, bag)] = roc_auc_score(
                        x1, x2)
                # Rebuilt after every pair: highest score first, ties broken
                # by key in ascending order.  Only the last rebuild is used.
                d_sorted_by_value = OrderedDict(
                    sorted(dir_dict.items(), key=lambda x: (-x[1], x[0])))

        # d_sorted_by_value is bound only if the loops above ran at least once.
        for key, v in d_sorted_by_value.items():
            order_file.write("%s, %s \n" % (key, v))
        # Redundant: the enclosing with-block closes the file on exit.
        order_file.close()
        # Python 2 print statement.
        print order_fn

# Python 2 print statement, executed at module level.
print "Done!\n"