from collections import defaultdict
from copy import copy

import numpy as np
import pandas as pd


def greedyWeight(p1, weight1, p2, weight2_range, true_label):
    """Grid-search the weight of a new prediction p2 against the current
    ensemble p1 (held at weight1) and return the weight with the lowest log loss."""
    best_weight2 = 1.0
    best_logloss = 10
    for weight2 in weight2_range:
        p = (weight1 * p1 + weight2 * p2) / (weight1 + weight2)
        logloss = computeLogloss(p, true_label)
        #print("w1: %s, w2: %s, logloss: %s" % (weight1, weight2, logloss))
        if logloss < best_logloss:
            best_logloss, best_weight2 = logloss, weight2
    return best_weight2
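# Minimal usage sketch for greedyWeight (illustrative only; the arrays below are
# hypothetical stand-ins for real out-of-fold probability matrices):
#   p_ens  = np.array([[0.7, 0.3], [0.2, 0.8]])     # current ensemble prediction
#   p_new  = np.array([[0.6, 0.4], [0.1, 0.9]])     # candidate model prediction
#   labels = np.array([0, 1])
#   w_new  = greedyWeight(p_ens, 1.0, p_new, np.arange(0.1, 2, 0.01), labels)
#   blended = (1.0 * p_ens + w_new * p_new) / (1.0 + w_new)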
def greedyEnsemble(list_file, fout, mode, task, fins):
    """Combine the model predictions listed in `fins` either by a naive average
    or by greedy forward selection, repeatedly adding the model (with the weight)
    that most reduces the validation log loss."""
    list_in = np.loadtxt(list_file, dtype=str)
    true_label = np.asarray(list_in[:, 1], dtype="int")
    numValid = true_label.shape[0]
    numTest = 130400
    p0_valid = pd.read_csv(fins[-1] + "_valid.csv", index_col=0)
    numLabel = p0_valid.shape[1]
    p_ens_valid = np.zeros((numValid, numLabel), dtype="float")
    if task == "test":
        p0_test = pd.read_csv(fins[-1] + "_test.csv", index_col=0)
        p_ens_test = np.zeros((numTest, numLabel), dtype="float")
    best_dict = defaultdict(lambda: 0)
    fins_tmp = copy(fins)

    if mode == "average":
        # naive average of all predictions
        for f in fins:
            this_p_valid = pd.read_csv(f + "_valid.csv", index_col=0).values[-numValid:]
            p_ens_valid += this_p_valid
            if task == "test":
                this_p_test = pd.read_csv(f + "_test.csv", index_col=0).values[-numTest:]
                p_ens_test += this_p_test
        p_ens_valid /= len(fins)
        if task == "test":
            p_ens_test /= len(fins)
        best_logloss = computeLogloss(p_ens_valid, true_label)
        print("\nAverage result:")
        print("logloss: %s" % best_logloss)

    elif mode == "greedy":
        # greedy ensemble: each round picks the model whose weighted addition
        # lowers the validation log loss the most; stop when no model improves it
        best_logloss = 10
        best_accuracy = 0
        best_fin = None
        first = True
        while True:
            for f in fins_tmp:
                this_p_valid = pd.read_csv(f + "_valid.csv", index_col=0).values[-numValid:]
                if first:
                    # first round: the ensemble is empty, so take the candidate as-is
                    w_ens, this_w = 0.0, 1.0
                else:
                    w_ens = 1.0
                    this_w = greedyWeight(p_ens_valid, w_ens, this_p_valid,
                                          np.arange(0.1, 2, 0.01), true_label)
                # add the current prediction to the ensemble
                tmp = (w_ens * p_ens_valid + this_w * this_p_valid) / (w_ens + this_w)
                logloss = computeLogloss(tmp, true_label)
                accuracy = computeAccuracy(tmp, true_label)
                if logloss < best_logloss:
                    best_logloss, best_accuracy, best_fin, best_fin_w = logloss, accuracy, f, this_w
            if best_fin is None:
                break
            print(best_fin)
            print(best_fin_w)
            print(best_logloss)
            print(best_accuracy)
            best_dict[best_fin] += best_fin_w
            # valid
            this_p_valid = pd.read_csv(best_fin + "_valid.csv", index_col=0).values[-numValid:]
            p_ens_valid = (w_ens * p_ens_valid + best_fin_w * this_p_valid) / (w_ens + best_fin_w)
            # test
            if task == "test":
                this_p_test = pd.read_csv(best_fin + "_test.csv", index_col=0).values[-numTest:]
                p_ens_test = (w_ens * p_ens_test + best_fin_w * this_p_test) / (w_ens + best_fin_w)
            #fins_tmp.remove(best_fin)  # kept in the pool, so a model can be selected more than once
            best_fin = None
            first = False
        # report the best weights and the corresponding logloss found
        print("\nGreedy ensemble result:")
        print("logloss: %s" % best_logloss)
        print("accuracy: %s" % best_accuracy)
        for i, f in enumerate(fins, start=1):
            if f not in best_dict:
                print(" w%s=%s" % (i, 0))
            else:
                print(" w%s=%s" % (i, best_dict[f]))

    # write out the final ensemble
    p_out_valid = pd.DataFrame(p_ens_valid, columns=p0_valid.columns, index=p0_valid.index)
    p_out_valid.index.name = p0_valid.index.name
    p_out_valid.to_csv(fout + "_[merge_nll" + str(np.round(best_logloss, 8)) + "]_valid.csv")
    # test
    if task == "test":
        p_out_test = pd.DataFrame(p_ens_test, columns=p0_test.columns, index=p0_test.index)
        p_out_test.index.name = p0_test.index.name
        p_out_test.to_csv(fout + "_[merge_nll" + str(np.round(best_logloss, 8)) + "]_test.csv")
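# Example invocation (a sketch, not part of the original pipeline; the prefixes
# and file names below are hypothetical). Each entry in `fins` must be a prefix
# such that "<prefix>_valid.csv" and "<prefix>_test.csv" exist with the sample id
# as the first column and one probability column per label; `list_file` must be a
# whitespace-separated file whose second column holds the integer validation labels.
#   model_prefixes = ["pred/xgb", "pred/nn", "pred/knn"]
#   greedyEnsemble("valid_list.txt", "pred/ensemble", "greedy", "test", model_prefixes)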