import numpy as np
from sklearn.model_selection import KFold
# Project-local dependencies used below (LogR, CADrank, DecisionTree, KNN, ...)
# are assumed to be importable from the surrounding package.


def crossValid(x, y, cv=5, Nu=10, Nv=20):
    results = {"perf": [], "Nu": [], "Nv": []}
    np.random.seed(2017)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        ## hyperparameter tuning ##
        if isinstance(Nu, list):
            # both parameters should be of the same type #
            Nu_sel, Nv_sel = hyperparameters(x_train, y_train, cv=5, Nu=Nu, Nv=Nv)
        else:
            Nu_sel, Nv_sel = Nu, Nv

        y_pred = CADrank(Nu=Nu_sel, Nv=Nv_sel).fit(x_train, y_train).predict(x_test)

        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
        results["Nu"].append(Nu_sel)
        results["Nv"].append(Nv_sel)

    # aggregate each recorded quantity over folds into [mean, std]
    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
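# Usage sketch (illustrative, not part of the original source): crossValid()
# can be called with fixed Nu/Nv or with candidate lists, in which case the
# hyperparameters() grid search defined later is run inside each fold.  The
# random x/y arrays are placeholders; in the real pipeline y holds ranking
# vectors (e.g. produced by label2Rank).
def _demo_crossValid():
    x = np.random.rand(100, 20)                                  # 100 samples, 20 features
    y = np.random.rand(100, 8)                                   # 8 label scores per sample
    res_fixed = crossValid(x, y, cv=5, Nu=10, Nv=20)             # fixed hyperparameters
    res_tuned = crossValid(x, y, cv=5, Nu=[5, 10], Nv=[10, 20])  # nested grid search
    return res_fixed, res_tuned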
def crossValidate(x, y, cv=5, Abstention=True, Inverse_laplace=4):
    results = {"perf": []}
    np.random.seed(2016)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # score input for y to pairwise input #
        x_tr, y_tr = score2pair(x_train, y_train, k=Inverse_laplace, Abstention=Abstention)
        # train and predict ranks for test data #
        ranks = rankPairPref(x_tr, y_tr, x_test)
        # transform test score data to rank
        y_te = map(LogR.rankOrder, y_test.tolist())

        results["perf"].append(LogR.perfMeasure(y_pred=ranks, y_test=y_te, rankopt=True))

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
def crossData(data_list, alpha=0.0, rank_weight=False, stop_criterion_mis_rate=None,
              stop_criterion_min_node=1, stop_criterion_gain=0.0, prune_criteria=0):
    results = {}
    for data_train in data_list:
        results[data_train] = {}
        for data_test in data_list:
            if data_test == data_train:
                continue
            x_train, y_tr = LogR.dataClean(data_train)
            y_train = label2Rank(y_tr.tolist())
            x_test, y_te = LogR.dataClean(data_test)
            y_test = label2Rank(y_te.tolist())
            tree = DecisionTree().buildtree(
                x_train, y_train, weights=None,
                stop_criterion_mis_rate=stop_criterion_mis_rate,
                stop_criterion_min_node=stop_criterion_min_node,
                stop_criterion_gain=stop_criterion_gain)
            y_pred = tree.predict(x_test, alpha)
            results[data_train][data_test] = LogR.perfMeasure(y_pred, y_test, rankopt=True)
    return results
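# Usage sketch (illustrative, not part of the original source): crossData()
# trains on each dataset and evaluates on every other one, returning a nested
# dict keyed by (training dataset, test dataset).  The dataset identifiers
# below are placeholders; LogR.dataClean() is the project's loader.
def _demo_crossData():
    datasets = ["dataA", "dataB", "dataC"]   # hypothetical dataset identifiers
    results = crossData(datasets, alpha=0.0, stop_criterion_min_node=5)
    # e.g. results["dataA"]["dataB"] holds the performance of the tree
    # trained on dataA and evaluated on dataB
    return results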
def crossValidateSimple(x, y, method="logReg", cv=5, alpha=None):
    # error measure
    results = {"perf": []}

    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)  ## for testing fixing random_state
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # performance measure
        if method == "logReg":
            # NOTE: `weight` is not defined in this function; it is expected to be a
            # module-level coefficient vector with one entry per column of x.
            y_pred = np.zeros(x_test.shape)
            for i in range(y_pred.shape[0]):
                for j in range(y_pred.shape[1]):
                    y_pred[i, j] = x_test[i, j] * weight[j]
            results["perf"].append(LogR.perfMeasure(y_pred, y_test))

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
def crossValidate(x, y, cv=5, K=None):
    """
    :param y: N*L ranking vectors
    :return:
    """
    results = {"perf": []}

    ## cross validation ##
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # y_pred = KNN(K=K).fit(x_train, y_train).predict(x_test)
        y_pred = multithreadPredict(x_test, KNN(K=K).fit(x_train, y_train))
        print y_pred  # print y_pred ### test

        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
        # print results["perf"][-1]

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
def hyperParameter(x, y, x_valid=None, y_valid=None, cv=5, criteria=0):
    if x_valid is None:
        # no validation set, using cross validation #
        alpha_perform = []
        kf = KFold(n_splits=cv, shuffle=True, random_state=0)
        for train, valid in kf.split(x):
            x_train = x[train, :]
            y_train = y[train, :]
            x_valid = x[valid, :]
            y_valid = y[valid, :]
            tree = DecisionTree().buildtree(x_train, y_train)
            alpha_list = tree.alphalist()
            print "alpha_list in hyperparameter tuning: ", alpha_list
            alpha_best = [-1, None]  # [alpha, performance on the selection criterion]
            for alpha in alpha_list:
                y_pred = tree.predict(x_valid, alpha=alpha)
                perf = LogR.perfMeasure(y_pred, y_valid, rankopt=True)
                perf_criteria = perf[criteria]
                if alpha_best[1] is None or perf_criteria >= alpha_best[1]:
                    alpha_best[0] = alpha
                    alpha_best[1] = perf_criteria
            alpha_perform.append(alpha_best)
        alpha_perform = np.array(alpha_perform, dtype=np.float32)
        print "inside hyperparameter:", alpha_perform  ### test
        return np.average(alpha_perform, axis=0)[0]
    else:
        tree = DecisionTree().buildtree(x, y)
        alpha_list = tree.alphalist()
        alpha_best = [-1, None]  # [alpha, performance on the selection criterion]
        for alpha in alpha_list:
            y_pred = tree.predict(x_valid, alpha=alpha)
            perf = LogR.perfMeasure(y_pred, y_valid, rankopt=True)
            perf_criteria = perf[criteria]
            if alpha_best[1] is None or perf_criteria >= alpha_best[1]:
                alpha_best[0] = alpha
                alpha_best[1] = perf_criteria
        return alpha_best[0]
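# Usage sketch (illustrative, not part of the original source): select a
# pruning alpha by internal cross validation and reuse it when building the
# final tree.  criteria=0 picks the first entry of LogR.perfMeasure() as the
# selection score, as in the function above; x_test is a placeholder.
def _demo_hyperParameter(x, y, x_test):
    alpha = hyperParameter(x, y, cv=5, criteria=0)   # CV-averaged best pruning alpha
    tree = DecisionTree().buildtree(x, y)
    return tree.predict(x_test, alpha=alpha)         # predictions with the tuned alpha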
def crossValidate(x, y, cv=5, alpha=0, rank_weight=False, stop_criterion_mis_rate=None,
                  stop_criterion_min_node=1, stop_criterion_gain=0.0):
    results = {"alpha": [], "perf": [], "size": []}

    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)  ## for testing fixing random_state
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # training and predict #
        # if alpha == None:
        #     ## nested select validate and test ##
        #     # print "start searching alpha:", datetime.now() ### test
        #     alpha_sel, perf = DTme.hyperParometer(x_train, y_train)
        #     # print "finish searching alpha:", datetime.now(), alpha ### test
        # else:
        #     alpha_sel = alpha
        if rank_weight:
            weights = rank2Weight(y_train)
        else:
            weights = None
        tree = DecisionTree().buildtree(
            x_train, y_train, weights,
            stop_criterion_mis_rate=stop_criterion_mis_rate,
            stop_criterion_min_node=stop_criterion_min_node,
            stop_criterion_gain=stop_criterion_gain)

        # performance measure
        alpha_sel, y_pred = alpha, tree.predict(x_test, alpha)
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
        results["alpha"].append(alpha_sel)
        results["size"].append(tree.size)
        print alpha_sel, "alpha"

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
def crossValidate(x, y, method="logReg", cv=5, alpha=None):
    # error measure
    results = []
    if method == "logReg":
        results = {"perf": [], "coef": [], "interc": []}
    elif method == "dT":
        results = {"alpha": [], "perf": []}

    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)  ## for testing fixing random_state
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # from multilabel to multiclass based on independence assumption
        if method == "logReg":
            x_train, y_train = LogR.multiClass(x_train, y_train)
        elif method == "dT":
            pass  # already in rank representation

        # training and predict
        if method == "dT":
            if alpha is None:
                ## nested select validate and test ##
                # print "start searching alpha:", datetime.now() ### test
                alpha_sel, perf = hyperParometer(x_train, y_train)
                # print "finish searching alpha:", datetime.now(), alpha ### test
            else:
                alpha_sel = alpha
            result = decisionTree(x_train, y_train, x_test, alpha=alpha_sel)

            # performance measure
            alpha_sel, y_pred = result
            results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))
            results["alpha"].append(alpha_sel)
            print alpha_sel, "alpha"

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
def crossValidate(x, y, cv=5):
    results = {"perf": []}
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        y_pred = labelWiseRanking(x_train, y_train, x_test)
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
def hyperparameters(x, y, Nu, Nv, cv=5, criterion=-1):
    best_result = None
    best_para = [None, None]
    for Nu_sel in Nu:
        for Nv_sel in Nv:
            perfs = []
            kf = KFold(n_splits=cv, shuffle=True, random_state=0)
            for train, test in kf.split(x):
                x_train = x[train, :]
                y_train = y[train, :]
                x_test = x[test, :]
                y_test = y[test, :]
                y_pred = CADrank(Nu=Nu_sel, Nv=Nv_sel).fit(x_train, y_train).predict(x_test)
                perf = LogR.perfMeasure(y_pred, y_test, rankopt=True)
                perfs.append(perf[criterion])
            result = sum(perfs) / cv
            if best_result is None or best_result < result:
                best_result = result
                best_para = [Nu_sel, Nv_sel]
    return best_para[0], best_para[1]
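# Usage sketch (illustrative, not part of the original source): run the Nu/Nv
# grid search directly and fit a final CADrank model with the selected values.
# The candidate grids below are placeholders.
def _demo_hyperparameters(x, y):
    Nu_best, Nv_best = hyperparameters(x, y, Nu=[5, 10, 20], Nv=[10, 20, 40], cv=5)
    return CADrank(Nu=Nu_best, Nv=Nv_best).fit(x, y)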
def crossValidate(x, y, cv=5, nocross=False, cost=None, iter_max=ITER_MAX):
    results = {"perf": []}

    # cross validation #
    np.random.seed(1100)
    kf = KFold(n_splits=cv, shuffle=True, random_state=0)  ## for testing fixing random_state
    for train, test in kf.split(x):
        x_train = x[train, :]
        y_train = y[train, :]
        x_test = x[test, :]
        y_test = y[test, :]

        # training and predict #
        # if alpha == None:
        #     ## nested select validate and test ##
        #     # print "start searching alpha:", datetime.now() ### test
        #     alpha_sel, perf = DTme.hyperParometer(x_train, y_train)
        #     # print "finish searching alpha:", datetime.now(), alpha ### test
        # else:
        #     alpha_sel = alpha
        classifiers = adaboost(x_train, y_train, x_test, y_test,
                               iter_max=iter_max, cost=cost)

        # performance measure
        y_pred = predict(x_test, classifiers)
        results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True))

        if nocross:
            break

    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]

    return results
def adaboost(x_train, y_train, x_test=None, y_test=None, cost_train=None,
             output=output, iter_max=ITER_MAX, cost=None):
    """
    :param cost_train: cost for each training sample, used when cost == "C2"
    :param output: how often to report performance on the test data with the current ensemble
    :param iter_max: maximum number of adaboost iterations
    :param cost: cost-sensitive version indicator: None, "init_weight", "C2"
    """
    # NOTE: the default `output`, ITER_MAX, COST_LEVEL, LOGFILE and
    # stop_criterion_mis_rate are module-level settings defined elsewhere.
    Nsamp = y_train.shape[0]
    classifiers = []

    # initialize weights #
    if cost is None or cost == "C2":
        weight = 1.0 / Nsamp
        weights_init = np.array([weight for i in range(Nsamp)], dtype=np.float32)
        weights = weights_init
        if cost == "C2" and cost_train is None:
            cost_train = rank2Weight_cost(y_train, cost_level=COST_LEVEL)
    elif cost == "init_weight":
        weights_init = rank2Weight(y_train)
        weights = weights_init
    else:
        raise ValueError("unsupported cost type")

    start = datetime.now()  # timer
    for iter in range(iter_max):
        # base classifier, decision tree for now #
        tree = DecisionTree().buildtree(x_train, y_train, weights,
                                        stop_criterion_mis_rate=stop_criterion_mis_rate)
        # tree.printtree()

        # training result #
        compare_results = [False for i in range(Nsamp)]  # whether correctly predicted
        for i in range(Nsamp):
            y_pred = tree.predict(x_train[i])
            # print y_pred, y_train[i]
            compare_results[i] = not tree.diffLabel(y_pred, y_train[i])
        compare_results = np.array(compare_results, dtype=bool)

        # updating weight for classifier, wc #
        if cost is None or cost == "init_weight":
            weight_sum_cor = np.sum(weights[compare_results == True])
            weight_sum_dis = np.sum(weights[compare_results == False])
        elif cost == "C2":
            weight_sum_cor = costWeightSum(weights, compare_results, cost_train, cordis=True)
            weight_sum_dis = costWeightSum(weights, compare_results, cost_train, cordis=False)
        if weight_sum_cor < weight_sum_dis:
            # the classifier is too weak for boosting
            raise ValueError("too weak classifier")
        if weight_sum_dis == 0:
            # already perfect classifier (Warning object is constructed but not emitted)
            Warning("perfect classifier")
            break
        wc = 0.5 * (math.log(weight_sum_cor) - math.log(weight_sum_dis))

        # updating weights #
        weights = weightsUpdate(weights, compare_results, wc, cost_train)

        # add classifier to classifier list #
        classifiers.append([wc, tree])

        # realtime output #
        if output is not None and (iter + 1) % output == 0:
            # status of current classifiers #
            print "wc", wc  ### test
            print "weights stats, mean, std, min, max "
            print np.mean(weights), np.std(weights), np.min(weights), np.max(weights)
            # performance on test set #
            y_pred = predict(x_test, classifiers)
            performance = LogR.perfMeasure(y_pred, y_test, rankopt=True)
            print "iter: ", iter + 1
            print performance
            with open(LOGFILE, "a") as log:
                log.write(" ".join(map(str, performance)) + "\n")
            duration = datetime.now() - start
            start = datetime.now()
            print "time for %d iters: %f" % (output, duration.total_seconds())

    return classifiers
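# Usage sketch (illustrative, not part of the original source): train the
# boosted ensemble in the plain (cost-insensitive) setting and score it with
# the module-level predict() used above; output=None silences the periodic
# report and iter_max=20 is a placeholder value.
def _demo_adaboost(x_train, y_train, x_test, y_test):
    classifiers = adaboost(x_train, y_train, x_test, y_test,
                           output=None, iter_max=20, cost=None)
    # predict() (defined elsewhere in this module) combines the [wc, tree] pairs
    y_pred = predict(x_test, classifiers)
    return LogR.perfMeasure(y_pred, y_test, rankopt=True)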