def train_rls(): #Select regparam with k-fold cross-validation, #where instances related to a single sentence form #together a fold X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") X_test = read_sparse("test_2000_x.txt", X_train.shape[1]) Y_test = np.loadtxt("test_2000_y.txt") #list of sentence ids qids_train = np.loadtxt("train_2000_qids.txt") qids_test = np.loadtxt("test_2000_qids.txt") learner = QueryRankRLS(X_train, Y_train, qids_train) P_test = learner.predict(X_test) folds = map_ids(qids_train) perfs = [] for fold in folds: if np.var(Y_train[fold]) != 0: P = learner.holdout(fold) c = cindex(Y_train[fold], P) perfs.append(c) perf = np.mean(perfs) print("leave-query-out cross-validation cindex %f" % perf) partition = map_ids(qids_test) test_perfs = [] #compute the ranking accuracy separately for each test query for query in partition: #skip such queries, where all instances have the same #score, since in this case cindex is undefined if np.var(Y_test[query]) != 0: perf = cindex(Y_test[query], P_test[query]) test_perfs.append(perf) test_perf = np.mean(test_perfs) print("test cindex %f" % test_perf)
def train_rls(): #Select regparam with k-fold cross-validation, #where instances related to a single sentence form #together a fold X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") X_test = read_sparse("test_2000_x.txt", X_train.shape[1]) Y_test = np.loadtxt("test_2000_y.txt") #list of sentence ids qids_train = np.loadtxt("train_2000_qids.txt") qids_test = np.loadtxt("test_2000_qids.txt") learner = QueryRankRLS(X_train, Y_train, qids_train) P_test = learner.predict(X_test) folds = map_ids(qids_train) perfs = [] for fold in folds: if np.var(Y_train[fold]) != 0: P = learner.holdout(fold) c = cindex(Y_train[fold], P) perfs.append(c) perf = np.mean(perfs) print("leave-query-out cross-validation cindex %f" %perf) partition = map_ids(qids_test) test_perfs = [] #compute the ranking accuracy separately for each test query for query in partition: #skip such queries, where all instances have the same #score, since in this case cindex is undefined if np.var(Y_test[query]) != 0: perf = cindex(Y_test[query], P_test[query]) test_perfs.append(perf) test_perf = np.mean(test_perfs) print("test cindex %f" %test_perf)
def train_rls(): #Select regparam with k-fold cross-validation, #where instances related to a single sentence form #together a fold X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") X_test = read_sparse("test_2000_x.txt", X_train.shape[1]) Y_test = np.loadtxt("test_2000_y.txt") #list of sentence ids qids_train = np.loadtxt("train_2000_qids.txt") qids_test = np.loadtxt("test_2000_qids.txt") regparams = [2.**i for i in range(-10, 10)] learner = LeaveQueryOutRankRLS(X_train, Y_train, qids_train, regparams = regparams, measure = cindex) lqo_perfs = learner.cv_performances P_test = learner.predict(X_test) print("leave-query-out performances " +str(lqo_perfs)) print("chosen regparam %f" %learner.regparam) partition = map_ids(qids_test) #compute the ranking accuracy separately for each test query test_perfs = [] for query in partition: #skip such queries, where all instances have the same #score, since in this case cindex is undefined if np.var(Y_test[query]) != 0: perf = cindex(Y_test[query], P_test[query]) test_perfs.append(perf) test_perf = np.mean(test_perfs) print("test cindex %f" %test_perf)
def print_stats(): X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") ids = np.loadtxt("train_2000_qids.txt", dtype=int) folds = map_ids(ids) print("Parse data set characteristics") print("Training set: %d instances, %d features" % X_train.shape) print("Instances grouped into %d sentences" % len(folds))
def print_stats(): X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") ids = np.loadtxt("train_2000_qids.txt", dtype=int) folds = map_ids(ids) print("Parse data set characteristics") print("Training set: %d instances, %d features" %X_train.shape) print("Instances grouped into %d sentences" %len(folds))
def plot_rls(): #Select regparam with k-fold cross-validation, #where instances related to a single sentence form #together a fold X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") X_test = read_sparse("test_2000_x.txt", X_train.shape[1]) Y_test = np.loadtxt("test_2000_y.txt") #list of sentence ids ids = np.loadtxt("train_2000_qids.txt") #mapped to a list of lists, where each list #contains indices for one fold folds = map_ids(ids) learner = RLS(X_train, Y_train) best_regparam = None best_error = float("inf") #exponential grid of possible regparam values log_regparams = range(-15, 16) kfold_errors = [] loo_errors = [] test_errors = [] for log_regparam in log_regparams: regparam = 2.**log_regparam #RLS is re-trained with the new regparam, this #is very fast due to computational short-cut learner.solve(regparam) #K-fold cross-validation perfs = [] for fold in folds: #computes holdout predictions, where instances #in fold are left out of training set P = learner.holdout(fold) perfs.append(sqerror(Y_train[fold], P)) e_kfold = np.mean(perfs) kfold_errors.append(e_kfold) P_loo = learner.leave_one_out() e_loo = sqerror(Y_train, P_loo) loo_errors.append(e_loo) P_test = learner.predict(X_test) e_test = sqerror(Y_test, P_test) test_errors.append(e_test) plt.semilogy(log_regparams, loo_errors, label="leave-one-out") plt.semilogy(log_regparams, kfold_errors, label="leave-sentence-out") plt.semilogy(log_regparams, test_errors, label="test error") plt.xlabel("$log_2(\lambda)$") plt.ylabel("mean squared error") plt.legend(loc=3) plt.show()
def plot_rls(): #Select regparam with k-fold cross-validation, #where instances related to a single sentence form #together a fold X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") X_test = read_sparse("test_2000_x.txt", X_train.shape[1]) Y_test = np.loadtxt("test_2000_y.txt") #list of sentence ids ids = np.loadtxt("train_2000_qids.txt") #mapped to a list of lists, where each list #contains indices for one fold folds = map_ids(ids) learner = RLS(X_train, Y_train) best_regparam = None best_error = float("inf") #exponential grid of possible regparam values log_regparams = range(-15, 16) kfold_errors = [] loo_errors = [] test_errors = [] for log_regparam in log_regparams: regparam = 2.**log_regparam #RLS is re-trained with the new regparam, this #is very fast due to computational short-cut learner.solve(regparam) #K-fold cross-validation perfs = [] for fold in folds: #computes holdout predictions, where instances #in fold are left out of training set P = learner.holdout(fold) perfs.append(sqerror(Y_train[fold], P)) e_kfold = np.mean(perfs) kfold_errors.append(e_kfold) P_loo = learner.leave_one_out() e_loo = sqerror(Y_train, P_loo) loo_errors.append(e_loo) P_test = learner.predict(X_test) e_test = sqerror(Y_test, P_test) test_errors.append(e_test) plt.semilogy(log_regparams, loo_errors, label = "leave-one-out") plt.semilogy(log_regparams, kfold_errors, label = "leave-sentence-out") plt.semilogy(log_regparams, test_errors, label = "test error") plt.xlabel("$log_2(\lambda)$") plt.ylabel("mean squared error") plt.legend(loc=3) plt.show()
def train_rls(): #Select regparam with k-fold cross-validation, #where instances related to a single sentence form #together a fold X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") X_test = read_sparse("test_2000_x.txt", X_train.shape[1]) Y_test = np.loadtxt("test_2000_y.txt") #list of sentence ids ids = np.loadtxt("train_2000_qids.txt") #mapped to a list of lists, where each list #contains indices for one fold folds = map_ids(ids) learner = RLS(X_train, Y_train) best_regparam = None best_error = float("inf") #exponential grid of possible regparam values log_regparams = range(-15, 16) for log_regparam in log_regparams: regparam = 2.**log_regparam #RLS is re-trained with the new regparam, this #is very fast due to computational short-cut learner.solve(regparam) #K-fold cross-validation P = np.zeros(Y_train.shape) for fold in folds: #computes holdout predictions, where instances #in fold are left out of training set P[fold] = learner.holdout(fold) e = sqerror(Y_train, P) print("regparam 2**%d, k-fold error %f" % (log_regparam, e)) if e < best_error: best_error = e best_regparam = regparam learner.solve(best_regparam) P_test = learner.predict(X_test) print("best regparam %f k-fold error %f" % (best_regparam, best_error)) print("test error %f" % sqerror(Y_test, P_test))
def train_rls(): #Select regparam with k-fold cross-validation, #where instances related to a single sentence form #together a fold X_train = read_sparse("train_2000_x.txt") Y_train = np.loadtxt("train_2000_y.txt") X_test = read_sparse("test_2000_x.txt", X_train.shape[1]) Y_test = np.loadtxt("test_2000_y.txt") #list of sentence ids ids = np.loadtxt("train_2000_qids.txt") #mapped to a list of lists, where each list #contains indices for one fold folds = map_ids(ids) learner = RLS(X_train, Y_train) best_regparam = None best_error = float("inf") #exponential grid of possible regparam values log_regparams = range(-15, 16) for log_regparam in log_regparams: regparam = 2.**log_regparam #RLS is re-trained with the new regparam, this #is very fast due to computational short-cut learner.solve(regparam) #K-fold cross-validation P = np.zeros(Y_train.shape) for fold in folds: #computes holdout predictions, where instances #in fold are left out of training set P[fold] = learner.holdout(fold) e = sqerror(Y_train, P) print("regparam 2**%d, k-fold error %f" %(log_regparam, e)) if e < best_error: best_error = e best_regparam = regparam learner.solve(best_regparam) P_test = learner.predict(X_test) print("best regparam %f k-fold error %f" %(best_regparam, best_error)) print("test error %f" %sqerror(Y_test, P_test))