Ejemplo n.º 1
0
def train_rls():
    """Train QueryRankRLS and print leave-query-out and test cindex.

    Data files are read from the working directory.  The query ids are
    grouped with ``map_ids``; each group is held out once on the training
    side and scored separately on the test side.  Groups whose labels have
    zero variance are skipped in both evaluations.
    """
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    # The training feature count is handed to read_sparse for the test file.
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    # Query (sentence) ids for both data sets.
    qids_train = np.loadtxt("train_2000_qids.txt")
    qids_test = np.loadtxt("test_2000_qids.txt")
    learner = QueryRankRLS(X_train, Y_train, qids_train)
    P_test = learner.predict(X_test)

    # Leave-query-out estimate: one holdout prediction per training query.
    cv_scores = []
    for fold in map_ids(qids_train):
        if np.var(Y_train[fold]) == 0:
            continue
        holdout_predictions = learner.holdout(fold)
        cv_scores.append(cindex(Y_train[fold], holdout_predictions))
    print("leave-query-out cross-validation cindex %f" % np.mean(cv_scores))

    # Ranking accuracy is computed separately for each test query.
    test_scores = []
    for query in map_ids(qids_test):
        # cindex is undefined when every instance of a query has the
        # same score, so such queries are left out.
        if np.var(Y_test[query]) == 0:
            continue
        test_scores.append(cindex(Y_test[query], P_test[query]))
    print("test cindex %f" % np.mean(test_scores))
Ejemplo n.º 2
0
def train_rls():
    """Train LeaveQueryOutRankRLS over a regparam grid and print results.

    The learner receives the grid 2**-10 .. 2**9 together with ``cindex`` as
    the measure; its ``cv_performances`` and ``regparam`` attributes are
    printed, followed by the mean per-query cindex on the test set.
    """
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    # The training feature count is handed to read_sparse for the test file.
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    # Query (sentence) ids for both data sets.
    qids_train = np.loadtxt("train_2000_qids.txt")
    qids_test = np.loadtxt("test_2000_qids.txt")

    candidate_regparams = [2.**exponent for exponent in range(-10, 10)]
    learner = LeaveQueryOutRankRLS(
        X_train,
        Y_train,
        qids_train,
        regparams=candidate_regparams,
        measure=cindex,
    )
    lqo_perfs = learner.cv_performances
    P_test = learner.predict(X_test)
    print("leave-query-out performances " + str(lqo_perfs))
    print("chosen regparam %f" % learner.regparam)

    # Ranking accuracy is computed separately for each test query.
    test_scores = []
    for query in map_ids(qids_test):
        # cindex is undefined when every instance of a query has the
        # same score, so such queries are left out.
        if np.var(Y_test[query]) == 0:
            continue
        test_scores.append(cindex(Y_test[query], P_test[query]))
    print("test cindex %f" % np.mean(test_scores))
Ejemplo n.º 3
0
def train_rls():
    """Pick the RLS regparam by leave-one-out error and print the test error.

    Every value 2**k for k in -15..15 is tried.  For each one the learner is
    re-solved, leave-one-out predictions are scored with ``sqerror`` and the
    value with the lowest score is kept.  The learner is then solved with
    that value and scored on the test set.
    """
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    # The training feature count is handed to read_sparse for the test file.
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    learner = RLS(X_train, Y_train)
    best_regparam = None
    best_error = float("inf")
    #exponential grid of possible regparam values
    log_regparams = range(-15, 16)
    for log_regparam in log_regparams:
        regparam = 2.**log_regparam
        #RLS is re-trained with the new regparam, this
        #is very fast due to computational short-cut
        learner.solve(regparam)
        #Leave-one-out cross-validation predictions, this is fast due to
        #computational short-cut
        P_loo = learner.leave_one_out()
        e = sqerror(Y_train, P_loo)
        print("regparam 2**%d, loo-error %f" % (log_regparam, e))
        if e < best_error:
            best_error = e
            best_regparam = regparam
    learner.solve(best_regparam)
    P_test = learner.predict(X_test)
    # best_regparam is a float (2.**k); %d would truncate every value
    # below 1 to 0, so it is printed with %f.
    print("best regparam %f loo-error %f" % (best_regparam, best_error))
    print("test error %f" % sqerror(Y_test, P_test))
Ejemplo n.º 4
0
def plot_rls():
    """Plot leave-one-out, leave-sentence-out and test error per regparam.

    For every value 2**k with k in -15..15 the RLS learner is re-solved and
    three errors are computed with ``sqerror``: the mean over held-out
    sentence folds, the leave-one-out error and the test error.  The three
    curves are drawn against k on a logarithmic y axis.
    """
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    # The training feature count is handed to read_sparse for the test file.
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    #list of sentence ids
    ids = np.loadtxt("train_2000_qids.txt")
    #mapped to a list of lists, where each list
    #contains indices for one fold
    folds = map_ids(ids)
    learner = RLS(X_train, Y_train)
    #exponential grid of possible regparam values
    log_regparams = range(-15, 16)
    kfold_errors = []
    loo_errors = []
    test_errors = []
    for log_regparam in log_regparams:
        regparam = 2.**log_regparam
        #RLS is re-trained with the new regparam, this
        #is very fast due to computational short-cut
        learner.solve(regparam)
        #K-fold cross-validation
        perfs = []
        for fold in folds:
            #computes holdout predictions, where instances
            #in fold are left out of training set
            P = learner.holdout(fold)
            perfs.append(sqerror(Y_train[fold], P))
        e_kfold = np.mean(perfs)
        kfold_errors.append(e_kfold)
        P_loo = learner.leave_one_out()
        e_loo = sqerror(Y_train, P_loo)
        loo_errors.append(e_loo)
        P_test = learner.predict(X_test)
        e_test = sqerror(Y_test, P_test)
        test_errors.append(e_test)
    plt.semilogy(log_regparams, loo_errors, label="leave-one-out")
    plt.semilogy(log_regparams, kfold_errors, label="leave-sentence-out")
    plt.semilogy(log_regparams, test_errors, label="test error")
    # Raw string: a backslash followed by "l" is not a valid escape sequence
    # in a normal string literal and triggers a warning on recent Pythons.
    plt.xlabel(r"$log_2(\lambda)$")
    plt.ylabel("mean squared error")
    plt.legend(loc=3)
    plt.show()
Ejemplo n.º 5
0
def print_stats():
    """Print the training set size and the number of sentence groups.

    Reads the training feature matrix and the integer sentence ids from the
    working directory, groups the ids with ``map_ids`` and prints the counts.
    """
    X_train = read_sparse("train_2000_x.txt")
    # The label file is not needed for these statistics, so it is not read.
    ids = np.loadtxt("train_2000_qids.txt", dtype=int)
    folds = map_ids(ids)
    print("Parse data set characteristics")
    # X_train.shape supplies both the instance count and the feature count.
    print("Training set: %d instances, %d features" % X_train.shape)
    print("Instances grouped into %d sentences" % len(folds))
Ejemplo n.º 6
0
def train_rls():
    """Pick the RLS regparam by sentence-wise k-fold error and print results.

    Sentence ids are grouped into folds with ``map_ids``.  For every value
    2**k with k in -15..15 the learner is re-solved, holdout predictions for
    all folds are collected into one vector and scored with ``sqerror``.  The
    value with the lowest score is used for the final test evaluation.
    """
    X_train = read_sparse("train_2000_x.txt")
    Y_train = np.loadtxt("train_2000_y.txt")
    # The training feature count is handed to read_sparse for the test file.
    X_test = read_sparse("test_2000_x.txt", X_train.shape[1])
    Y_test = np.loadtxt("test_2000_y.txt")
    # Sentence ids, mapped to one index collection per fold.
    sentence_ids = np.loadtxt("train_2000_qids.txt")
    folds = map_ids(sentence_ids)
    learner = RLS(X_train, Y_train)
    best_regparam, best_error = None, float("inf")
    # Exponential grid of candidate regparam values.
    for exponent in range(-15, 16):
        candidate = 2.**exponent
        # Re-solve the learner for the new regparam.
        learner.solve(candidate)
        # Holdout predictions of every fold are written into one vector.
        holdout_predictions = np.zeros(Y_train.shape)
        for fold in folds:
            holdout_predictions[fold] = learner.holdout(fold)
        error = sqerror(Y_train, holdout_predictions)
        print("regparam 2**%d, k-fold error %f" % (exponent, error))
        if error < best_error:
            best_regparam, best_error = candidate, error
    learner.solve(best_regparam)
    P_test = learner.predict(X_test)
    print("best regparam %f k-fold error %f" % (best_regparam, best_error))
    print("test error %f" % sqerror(Y_test, P_test))
Ejemplo n.º 7
0
import numpy as np
from rlscore.learner.query_rankrls import LeaveQueryOutRankRLS
from rlscore.reader import read_qids
from rlscore.reader import read_sparse
from rlscore.reader import read_sparse
from rlscore.reader import read_qids
from rlscore.measure import cindex
# Load labels, query ids and features for the ranking example.
train_labels = np.loadtxt("./examples/data/rank_train.labels")
test_labels = np.loadtxt("./examples/data/rank_test.labels")
train_qids = read_qids("./examples/data/rank_train.qids")
test_features = read_sparse("./examples/data/rank_test.features")
train_features = read_sparse("./examples/data/rank_train.features")
test_qids = read_qids("./examples/data/rank_test.qids")
# Learner arguments: cindex as the measure and the grid 2**-10 .. 2**10.
kwargs = {}
kwargs['measure'] = cindex
kwargs['regparams'] = [2**i for i in range(-10, 11)]
kwargs["Y"] = train_labels
kwargs["X"] = train_features
kwargs["qids"] = train_qids
learner = LeaveQueryOutRankRLS(**kwargs)
grid = kwargs['regparams']
perfs = learner.cv_performances
# Parenthesised single-argument print runs on both Python 2 and Python 3.
for regparam, cv_perf in zip(grid, perfs):
    print("parameter %f cv_performance %f" % (regparam, cv_perf))
P = learner.predict(test_features)
from rlscore.measure.measure_utilities import UndefinedPerformance
from rlscore.measure.measure_utilities import qids_to_splits
# Replace the raw test query ids with the result of qids_to_splits.
test_qids = qids_to_splits(test_qids)
perfs = []
for query in test_qids:
    try:
Ejemplo n.º 8
0
import numpy as np
from rlscore.learner.rls import RLS
from rlscore.reader import read_sparse
from rlscore.reader import read_sparse
from rlscore.measure import auc
from rlscore.learner.rls import LOOCV
from rlscore.utilities.grid_search import grid_search

# Load labels and features for the classification example.
train_labels = np.loadtxt("./examples/data/class_train.labels")
test_labels = np.loadtxt("./examples/data/class_test.labels")
train_features = read_sparse("./examples/data/class_train.features")
test_features = read_sparse("./examples/data/class_test.features")
# Learner arguments: polynomial kernel settings and an initial regparam.
kwargs = {}
kwargs["train_labels"] = train_labels
kwargs["train_features"] = train_features
kwargs["regparam"] = 1
kwargs["coef0"] = 1
kwargs["degree"] = 3
kwargs["gamma"] = 2
kwargs["kernel"] = "PolynomialKernel"
learner = RLS.createLearner(**kwargs)
learner.train()
# Cross-validator arguments: the trained learner and auc as the measure.
kwargs = {}
kwargs["learner"] = learner
kwargs["measure"] = auc
crossvalidator = LOOCV(**kwargs)
grid = [2**i for i in range(-10, 11)]
learner, perfs = grid_search(crossvalidator, grid)
# Parenthesised single-argument print runs on both Python 2 and Python 3.
for regparam, cv_perf in zip(grid, perfs):
    print("parameter %f cv_performance %f" % (regparam, cv_perf))
model = learner.getModel()
Ejemplo n.º 9
0
import numpy as np
from rlscore.learner.rls import RLS
from rlscore.reader import read_folds
from rlscore.reader import read_sparse
from rlscore.reader import read_sparse
from rlscore.measure import auc
from rlscore.learner.rls import NfoldCV
from rlscore.utilities.grid_search import grid_search

# Load labels, the fold definition and features for the classification example.
train_labels = np.loadtxt("./examples/data/class_train.labels")
test_labels = np.loadtxt("./examples/data/class_test.labels")
folds = read_folds("./examples/data/folds.txt")
train_features = read_sparse("./examples/data/class_train.features")
test_features = read_sparse("./examples/data/class_test.features")
# Learner arguments: training data and an initial regparam.
kwargs = {}
kwargs["train_labels"] = train_labels
kwargs["train_features"] = train_features
kwargs["regparam"] = 1
learner = RLS.createLearner(**kwargs)
learner.train()
# Cross-validator arguments: the trained learner, the folds and auc.
kwargs = {}
kwargs["learner"] = learner
kwargs["folds"] = folds
kwargs["measure"] = auc
crossvalidator = NfoldCV(**kwargs)
grid = [2 ** i for i in range(-10, 11)]
learner, perfs = grid_search(crossvalidator, grid)
# Parenthesised single-argument print runs on both Python 2 and Python 3.
for regparam, cv_perf in zip(grid, perfs):
    print("parameter %f cv_performance %f" % (regparam, cv_perf))
model = learner.getModel()
P = model.predict(test_features)
from rlscore.learner.mmc import MMC
from rlscore.reader import read_sparse

## Read the data set
gene_data_na = read_sparse("./gene_data_na.txt")

## Assemble the learner arguments and build the model
kwargs = {
    "X": gene_data_na,
    "regparam": 1,
    "kernel": "GaussianKernel",
    ## Number of clusters found with the eigengap method for this kernel
    "number_of_clusters": 4,
}
learner = MMC(**kwargs)
labels = learner.results

# Write the results in output file
# out = open("python_clustering.out","w")
# for label in labels["predicted_clusters_for_training_data"]:
#    out.write(str(label) + "\n")
# out.close()
Ejemplo n.º 11
0
import numpy as np
from rlscore.learner.label_rankrls import LabelRankRLS
from rlscore.reader import read_qids
from rlscore.reader import read_sparse
from rlscore.reader import read_sparse
from rlscore.reader import read_qids
from rlscore.measure import cindex
# Load labels, query ids and features for the ranking example.
train_labels = np.loadtxt("./examples/data/rank_train.labels")
test_labels = np.loadtxt("./examples/data/rank_test.labels")
train_qids = read_qids("./examples/data/rank_train.qids")
test_features = read_sparse("./examples/data/rank_test.features")
train_features = read_sparse("./examples/data/rank_train.features")
test_qids = read_qids("./examples/data/rank_test.qids")
# Learner arguments collected in one literal.
kwargs = {
    "train_labels": train_labels,
    "train_qids": train_qids,
    "train_features": train_features,
    "regparam": 1,
}
learner = LabelRankRLS.createLearner(**kwargs)
learner.train()
model = learner.getModel()
P = model.predict(test_features)
from rlscore.measure.measure_utilities import UndefinedPerformance
# Score each test query separately; a query for which cindex raises
# UndefinedPerformance contributes nothing to the mean.
perfs = []
for query in test_qids:
    try:
        perf = cindex(test_labels[query], P[query])
    except UndefinedPerformance:
        continue
    perfs.append(perf)
test_perf = np.mean(perfs)