def CES_run():
    start_time = time.time()
    classifiers, bps_weight = get_bps(project_path, seed, metric, size)
    original = deepcopy(classifiers)
    ensemble = []
    set_ensembles = {}
    # seed the ensemble with the top-scoring classifiers on the validation set
    for i in range(init_ens_size):
        ensemble.append(select_top_classifier(classifiers, seed, fold, RULE))
    # greedy forward selection over the remaining classifiers
    find_ensemble(ensemble, classifiers, seed, fold, RULE, set_ensembles)
    if len(set_ensembles) > 0:
        sel_ens = max(set_ensembles, key=set_ensembles.get)
    else:
        sel_ens = ensemble
    # 1-based indices of the selected classifiers in the original ordering
    actual = []
    for o in range(len(original)):
        if original[o] in sel_ens:
            actual.append(o + 1)
    val_score = fmax_score(
        *aggregate_predictions(sel_ens, seed, fold, "valid", RULE))
    test_score = fmax_score(
        *aggregate_predictions(sel_ens, seed, fold, "test", RULE))
    seconds = time.time() - start_time
    string = "Fold_%i (val = %f) (test = %f) :: (%s) [%s]\n%s" % (
        fold, val_score, test_score, ", ".join(str(a) for a in actual),
        time.strftime('%H:%M:%S', time.gmtime(seconds)), bps2string(original))
    dst = '%s/CES_OUTPUT/ORDER%i/bp%i_fold%i_seed%i_%s_start-%s.fmax' % (
        project_path, seed, size, fold, seed, RULE, init_ens_size)
    with open(dst, 'wb') as f:
        f.write(string)
    print "\t%s (%s)" % (dst, time.strftime('%H:%M:%S', time.gmtime(seconds)))
def CES_ens():
    start_time = time.time()
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        filename_fold = '%s/CES_OUTPUT/ORDER%s/bp%s_fold%s_seed%s_%s_start-%s.%s' % (
            project_path, seed, size, fold, seed, RULE, start, metric)
        ensemble = get_fold_ens(filename_fold)
        ensemble_bps = get_ens_bps(ensemble, filename_fold)
        # aggregate the selected base predictors on this fold's test split
        inner_y_true, inner_y_score = aggregate_predictions(
            ensemble_bps, seed, fold, "test", RULE)
        y_true = concat([y_true, inner_y_true], axis=0)
        y_score = concat([y_score, inner_y_score], axis=0)
        string += ("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))
    string += ("final,%f\n" % fmax_score(y_true, y_score))
    dst = '%s/CES_RESULTS/ORDER%i/CES_bp%i_seed%i_%s_start-%s.%s' % (
        project_path, seed, size, seed, RULE, start, metric)
    with open(dst, 'wb') as f:
        f.write(string)
    print "\t%s (%s)" % (dst, time.strftime(
        '%H:%M:%S', time.gmtime(time.time() - start_time)))
def getTestPerf(self, node):
    if node == (0, ):
        # likely RL_pessimistic was trained for too little time and every
        # size-2 ensemble performs worse than the individual base predictors;
        # fall back to the best-performing explored single-predictor state
        force_picks = [
            val for val in [tuple([y]) for y in range(1, self.world.np + 1)]
            if val in self.getExploredStates()
        ]
        select_from = {x: self.world.perf[x] for x in force_picks}
        index = max(select_from, key=select_from.get)[0]
        node = tuple([index])
    # weighted sum of the test predictions of every member of the node
    index = node[0]
    test_pred = self.get_fold_probs(
        index, 'test', 'prediction') * self.world.bps_weight[index]
    for index in list(node)[1:]:
        test_pred = test_pred.add(
            self.get_fold_probs(index, 'test', 'prediction') *
            self.world.bps_weight[index])
    if len(node) > 1:
        # normalize multi-member sums by the total weight; a lone weighted
        # prediction is left unscaled, which leaves the threshold-swept fmax
        # unchanged
        denom = sum([self.world.bps_weight[index] for index in list(node)])
        test_pred = test_pred / denom
    test_labels = self.get_fold_probs(node[0], 'test', 'label')
    test_perf = fmax_score(test_labels, test_pred)
    return test_perf
def FULL_ens():
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        ensemble_bps = full_ensemble()
        inner_y_true, inner_y_score = aggregate_predictions(
            ensemble_bps, seed, fold, "test")
        y_true = concat([y_true, inner_y_true], axis=0)
        y_score = concat([y_score, inner_y_score], axis=0)
        string += ("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))
    print len(y_true)
    string += ("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/BASELINE/ORDER%i/FE_bp%i_seed%i_%s.fmax' % (
        results_path, seed, size, seed, RULE)
    with open(filename, 'wb') as f:
        f.write(string)
    print filename
def select_top_classifier(classifiers, seed, fold, RULE):
    # score each remaining classifier alone on the validation set, then pop the best
    scores = [
        fmax_score(*aggregate_predictions([classifiers[i]], seed, fold, "valid", RULE))
        for i in range(len(classifiers))
    ]
    top_classifier = classifiers[argmax(scores)]
    classifiers.remove(top_classifier)
    return top_classifier
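# `fmax_score` is used throughout this repo but not defined in this section.
# A minimal sketch of what it presumably computes -- the maximum F-measure
# over a sweep of decision thresholds (the "fmax" metric). The helper name
# `fmax_score_sketch` and the sklearn-based implementation are assumptions,
# not this project's code:
import numpy as np
from sklearn.metrics import precision_recall_curve

def fmax_score_sketch(y_true, y_score, beta=1.0):
    # precision/recall at every threshold induced by the scores
    precision, recall, _ = precision_recall_curve(np.ravel(y_true), np.ravel(y_score))
    with np.errstate(divide='ignore', invalid='ignore'):
        f = (1 + beta ** 2) * precision * recall / (beta ** 2 * precision + recall)
    return np.nanmax(f)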
def calc_perf(self, node):
    if node not in self.perf:
        if (len(node) > len(self.cwan)
                and len(setdiff1d(node, self.cwan)) == 1
                and node != ['exit']):
            # node extends the cached node (cwan) by exactly one member:
            # add that member's weighted predictions into the running-sum
            # column (np + 1) instead of recomputing the whole sum
            bp2add = setdiff1d(node, self.cwan)
            bp = bp2add[0]
            self.bps_weighted_pred_df[(self.np + 1)] = self.bps_weighted_pred_df[
                (self.np + 1)].add(self.bps_weighted_pred_df[bp])
            self.cwan = node
            denom = sum([self.bps_weight[index] for index in list(node)])
            # new y_score variable because column np + 1 must stay the numerator
            y_score = self.bps_weighted_pred_df[(self.np + 1)] / denom
            performance = fmax_score(self.bps_weighted_pred_df['label'], y_score)
            self.perf[node] = performance
            if len(node) == self.np:
                self.reset_cwan_col()
            return performance
        else:
            # de novo: rebuild the running sum from scratch
            index = list(node)[0]
            y_score = self.bps_weighted_pred_df[index]
            for index in list(node)[1:]:
                y_score = y_score.add(self.bps_weighted_pred_df[index])
            self.bps_weighted_pred_df[(self.np + 1)] = y_score
            self.cwan = node
            denom = sum([self.bps_weight[index] for index in list(node)])
            y_score = y_score / denom
            performance = fmax_score(self.bps_weighted_pred_df['label'], y_score)
            self.perf[node] = performance
            return performance
    else:
        return self.perf[node]
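# calc_perf caches a running weighted sum in scratch column np + 1 so that
# extending the cached node (`cwan`) by one member costs a single column add
# instead of a full re-sum. A toy illustration of the same trick; the frame
# and names below are illustrative assumptions, not project data:
import pandas as pd

toy = pd.DataFrame({1: [0.2, 0.9], 2: [0.4, 0.7], 3: [0.1, 0.8]})
running = toy[1].add(toy[2])     # cached sum for node (1, 2)
running = running.add(toy[3])    # extend to (1, 2, 3): one add, no re-sum
assert running.equals(toy[1] + toy[2] + toy[3])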
def FULL_ens(parameters):
    size, seed = parameters
    y_true = DataFrame(columns=["label"])
    y_score = DataFrame(columns=["prediction"])
    string = ""
    for fold in range(fold_count):
        ensemble_bps = get_bps(project_path, seed, metric, size)[0]
        inner_y_true, inner_y_score = aggregate_predictions(
            ensemble_bps, seed, fold, "test", RULE)
        y_true = concat([y_true, inner_y_true], axis=0)
        y_score = concat([y_score, inner_y_score], axis=0)
        string += ("fold_%i,%f\n" % (fold, fmax_score(inner_y_true, inner_y_score)))
    string += ("final,%f\n" % fmax_score(y_true, y_score))
    filename = '%s/%s/%s%i/FE_bp%i_seed%i_%s.fmax' % (
        project_path, directory, subdirectory, seed, size, seed, RULE)
    with open(filename, 'wb') as f:
        f.write(string)
    print filename
def find_ensemble(ensemble, classifiers, seed, fold, RULE, set_ensembles):
    if len(classifiers) == 0 or len(ensemble) == max_ens_size:
        return ensemble
    else:
        # try appending each remaining classifier (in shuffled order) and keep
        # the candidate ensemble with the best validation fmax
        potential_ensembles = get_potential_ensembles(
            ensemble, random.choice(classifiers, len(classifiers), replace=False))
        scores = [
            fmax_score(*aggregate_predictions(pe, seed, fold, "valid", RULE))
            for pe in potential_ensembles
        ]
        best = argmax(scores)
        ensemble.append(potential_ensembles[best][-1])
        #print "\t adding CURRENT ENSEMBLE:"
        #for c in ensemble:
        #    print "\t - %s" % c
        #print "\t ==> $%s" % max(scores)
        set_ensembles[tuple(deepcopy(ensemble))] = max(scores)
        classifiers.remove(potential_ensembles[best][-1])
        find_ensemble(ensemble, classifiers, seed, fold, RULE, set_ensembles)
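# find_ensemble is recursive greedy forward selection (Caruana-style ensemble
# selection): at each step it tries appending every remaining classifier,
# keeps the one that maximizes validation fmax, and records the score of each
# intermediate ensemble in set_ensembles. A minimal iterative sketch of the
# same idea; the toy scorer and names are assumptions, not this project's API:
def greedy_selection_sketch(candidates, score_fn, max_size):
    ensemble, best_by_size = [], {}
    while candidates and len(ensemble) < max_size:
        # evaluate every one-step extension and keep the best
        scored = [(score_fn(ensemble + [c]), c) for c in candidates]
        best_score, best_c = max(scored)
        ensemble.append(best_c)
        candidates.remove(best_c)
        best_by_size[tuple(ensemble)] = best_score
    return best_by_size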
def get_max_predictions(predictors, seed, fold, set):
    # find the base predictor with the largest weight (per the order file
    # obtained from the validation set) and return its test predictions
    max_p = ''
    max_w = 0
    max_bag = None
    for bp in predictors:
        path, bag, weight = get_path_bag_weight(bp)
        if weight > max_w:
            max_w = weight
            max_p = path
            max_bag = bag
    set = 'test'  # force the test split regardless of the `set` argument
    y_true, y_score = get_set_preds(max_p, set, max_bag, fold, seed)
    perf = fmax_score(y_true, y_score)
    return (y_true, y_score, ('%s_bag%s' % (max_p, max_bag)))
def aggregate_predictions(predictors, seed, fold, set):
    set = 'test'  # force the test split regardless of the `set` argument
    path, bag, weight = get_path_bag_weight(predictors[0])
    # weighted average ('WA') divides by the summed weights; any other rule
    # is a plain mean, so each predictor contributes 1 to the denominator
    denom = weight if RULE == 'WA' else 1
    y_true, y_score = get_set_preds(path, set, bag, fold, seed)
    if RULE == 'WA':
        y_score = weight * y_score
    for bp in predictors[1:]:
        path, bag, weight = get_path_bag_weight(bp)
        denom += (weight if RULE == 'WA' else 1)
        y_true, y_score_current = get_set_preds(path, set, bag, fold, seed)
        if RULE == 'WA':
            y_score = y_score.add(weight * y_score_current)
        else:
            y_score = y_score.add(y_score_current)
    y_score = y_score / denom
    perf = fmax_score(y_true, y_score)
    #print perf
    return (y_true, y_score)
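# Under RULE == 'WA', aggregate_predictions is a weighted average of the base
# predictors' score vectors (divide by the summed weights); any other rule is
# a plain mean. A toy check of the weighted-average arithmetic (the numbers
# are illustrative, not project data):
import pandas as pd

s1, s2 = pd.Series([0.9, 0.1]), pd.Series([0.5, 0.5])
w1, w2 = 3.0, 1.0
wa = (w1 * s1).add(w2 * s2) / (w1 + w2)
assert list(wa.round(2)) == [0.8, 0.2]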
order_fn = '%s/ENSEMBLES/order_of_seed%i_%s.txt' % (project_path, seed, metric)
with open(order_fn, 'wb') as order_file:
    for dirname in dirnames:
        for bag in bag_list:
            x1 = DataFrame(columns=["label"])
            x2 = DataFrame(columns=["prediction"])
            # pool the validation labels and predictions across folds
            for fold in range(fold_count):
                filename = '%s/valid-b%i-f%s-s%i.csv.gz' % (dirname, bag, fold, seed)
                df = read_csv(filename, skiprows=1, compression='gzip')
                y_true = df.ix[:, 1:2]
                y_score = df.ix[:, 2:3]
                x1 = concat([x1, y_true], axis=0)
                x2 = concat([x2, y_score], axis=0)
            if metric == "fmax":
                dir_dict["%s_bag%i" % (dirname, bag)] = fmax_score(x1, x2)
            if metric == "auROC":
                dir_dict["%s_bag%i" % (dirname, bag)] = roc_auc_score(x1, x2)
    # sort base predictors by score (descending), breaking ties by name
    d_sorted_by_value = OrderedDict(
        sorted(dir_dict.items(), key=lambda x: (-x[1], x[0])))
    for key, v in d_sorted_by_value.items():
        order_file.write("%s, %s \n" % (key, v))
print order_fn
print "Done!\n"
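# The sort key (-score, name) orders base predictors by descending score and
# breaks ties alphabetically. A toy check (names and scores are illustrative):
from collections import OrderedDict

toy_scores = {'b_bag1': 0.7, 'a_bag1': 0.7, 'c_bag2': 0.9}
ordered = OrderedDict(sorted(toy_scores.items(), key=lambda x: (-x[1], x[0])))
assert list(ordered) == ['c_bag2', 'a_bag1', 'b_bag1']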