def check_C_testset(mss_id):

    import pylab
    import expenv
    import numpy

    from helper import Options
    from method_hierarchy_svm_new import Method
    #from method_augmented_svm_new import Method

    #costs = 10000 #[float(c) for c in numpy.exp(numpy.linspace(numpy.log(10), numpy.log(20000), 6))]
    costs = [float(c) for c in numpy.exp(numpy.linspace(numpy.log(0.4), numpy.log(10), 6))]
    print costs

    mss = expenv.MultiSplitSet.get(mss_id)
    train = mss.get_train_data(-1)
    test = mss.get_eval_data(-1)

    au_roc = []
    au_prc = []

    for cost in costs:

        # create mock param object by freezable struct
        param = Options()
        param.kernel = "WeightedDegreeStringKernel"
        param.wdk_degree = 10
        param.transform = cost
        param.base_similarity = 1.0
        param.taxonomy = mss.taxonomy
        param.id = 666
        #param.cost = cost
        param.cost = 10000
        param.freeze()

        # train
        mymethod = Method(param)
        mymethod.train(train)

        assessment = mymethod.evaluate(test)

        au_roc.append(assessment.auROC)
        au_prc.append(assessment.auPRC)

        print assessment
        assessment.destroySelf()

    pylab.title("auROC")
    pylab.semilogx(costs, au_roc, "-o")
    pylab.show()

    pylab.figure()
    pylab.title("auPRC")
    pylab.semilogx(costs, au_prc, "-o")
    pylab.show()

    return (costs, au_roc, au_prc)
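# Usage sketch for check_C_testset (assumption: 399 is a valid MultiSplitSet
# id, as in the debugging mains below). Note that the loop above sweeps
# param.transform while param.cost stays fixed at 10000.
if __name__ == "__main__":
    sweep_vals, sweep_roc, sweep_prc = check_C_testset(399)
    best_roc, best_val = max(zip(sweep_roc, sweep_vals))
    print "best auROC %.4f at transform %.3f" % (best_roc, best_val)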
def main(): print "starting debugging:" SPLIT_POINTER = -1 from expenv import MultiSplitSet from helper import Options # select dataset #multi_split_set = MultiSplitSet.get(387) #multi_split_set = MultiSplitSet.get(407) multi_split_set = MultiSplitSet.get(399) #dataset_name = multi_split_set.description # create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeRBFKernel" #"WeightedDegreeStringKernel"#"PolyKernel" param.wdk_degree = 2 param.cost = 1.0 param.transform = 0.2 param.base_similarity = 1.0 param.taxonomy = multi_split_set.taxonomy param.id = 666 flags= {} #flags["boosting"] = "ones" #flags["boosting"] = "L1" flags["boosting"] = "L2" #flags["boosting"] = "L2_reg" flags["signum"] = False flags["normalize_cost"] = True flags["all_positions"] = False flags["wdk_rbf_on"] = False param.flags = flags param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) # train mymethod = Method(param) mymethod.train(data_train) assessment = mymethod.evaluate(data_eval) print assessment assessment.destroySelf()
def main(): print "starting debugging:" SPLIT_POINTER = -1 from expenv import MultiSplitSet from helper import Options # select dataset #multi_split_set = MultiSplitSet.get(387) #multi_split_set = MultiSplitSet.get(407) multi_split_set = MultiSplitSet.get(399) #dataset_name = multi_split_set.description # create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeStringKernel"#"PolyKernel" param.wdk_degree = 2 param.cost = 1.0 param.transform = 0.2 param.base_similarity = 1 param.taxonomy = multi_split_set.taxonomy param.id = 666 flags= {} #flags["boosting"] = "ones" flags["boosting"] = "L1" #flags["boosting"] = "L2" #flags["boosting"] = "L2_reg" flags["signum"] = False flags["normalize_cost"] = True flags["all_positions"] = False flags["wdk_rbf_on"] = False param.flags = flags param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) # train mymethod = Method(param) mymethod.train(data_train) assessment = mymethod.evaluate(data_eval) print assessment assessment.destroySelf()
def main(): print "starting debugging:" SPLIT_POINTER = -1 from expenv import MultiSplitSet from helper import Options # select dataset multi_split_set = MultiSplitSet.get(399) #dataset_name = multi_split_set.description flags = {} flags["normalize_cost"] = False flags["epsilon"] = 0.05 flags["cache_size"] = 7 #flags["solver_type"] = "ST_DIRECT" #ST_CPLEX #ST_GLPK) #ST_DIRECT) #ST_NEWTON) flags["normalize_trace"] = True flags["interleaved"] = True #create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeStringKernel" param.wdk_degree = 1 param.cost = 1 param.transform = 1 #2.0 param.taxonomy = multi_split_set.taxonomy param.id = 666 param.flags = flags param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) # train mymethod = Method(param) mymethod.train(data_train) assessment = mymethod.evaluate(data_eval) print assessment assessment.destroySelf()
def main(): print "starting debugging:" SPLIT_POINTER = 1 from expenv import MultiSplitSet from helper import Options from task_similarities import fetch_gammas # select dataset multi_split_set = MultiSplitSet.get(317) #multi_split_set = MultiSplitSet.get(374) #multi_split_set = MultiSplitSet.get(2) dataset_name = multi_split_set.description transform = 1.0 base = 1.0 similarity_matrix = fetch_gammas(transform, base, dataset_name) #create mock taxonomy object by freezable struct taxonomy = Options() taxonomy.data = similarity_matrix taxonomy.description = dataset_name taxonomy.freeze() #create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeStringKernel" param.wdk_degree = 1 param.cost = 1.0 param.transform = 1.0 param.taxonomy = taxonomy param.id = 666 param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) create_plot_inner(param, data_train, data_eval)
def main(): print "starting debugging:" SPLIT_POINTER = 1 from expenv import MultiSplitSet from helper import Options # select dataset multi_split_set = MultiSplitSet.get(379) dataset_name = multi_split_set.description print "dataset_name", dataset_name #create mock taxonomy object by freezable struct #taxonomy = Options() #taxonomy.data = taxonomy_graph.data #taxonomy.description = dataset_name #taxonomy.freeze() #create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeStringKernel" param.wdk_degree = 1 param.cost = 1.0 param.transform = 2.0 param.taxonomy = multi_split_set.taxonomy param.id = 666 param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) # train hierarchical xval mymethod = Method(param) mymethod.train(data_train) assessment = mymethod.evaluate(data_eval) print assessment assessment.destroySelf();
def training_for_sigma(sigma):

    print "starting debugging:"

    from expenv import MultiSplitSet
    from helper import Options
    # Method is assumed to be imported at module level in the original script

    # select dataset
    multi_split_set = MultiSplitSet.get(393)

    SPLIT_POINTER = 1

    # create mock param object by freezable struct
    param = Options()
    param.kernel = "WeightedDegreeStringKernel" #"WeightedDegreeRBFKernel"
    param.wdk_degree = 2
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.base_similarity = sigma
    param.degree = 2
    param.flags = {}
    param.flags["wdk_rbf_on"] = False
    param.freeze()

    data_train = multi_split_set.get_train_data(SPLIT_POINTER)
    data_eval = multi_split_set.get_eval_data(SPLIT_POINTER)

    # train
    mymethod = Method(param)
    mymethod.train(data_train)
    print "training done"

    assessment = mymethod.evaluate(data_eval)
    print assessment

    # grab the score before deleting the database record
    au_roc = assessment.auROC
    assessment.destroySelf()

    return au_roc
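# Sweep sketch: evaluate training_for_sigma over a log-spaced grid of base
# similarities and plot auROC against sigma (the grid bounds are assumptions,
# chosen in the spirit of the cost grid in check_C_testset above).
def sweep_sigma():
    import numpy
    import pylab

    sigmas = [float(s) for s in numpy.exp(numpy.linspace(numpy.log(0.1), numpy.log(10), 6))]
    rocs = [training_for_sigma(s) for s in sigmas]

    pylab.semilogx(sigmas, rocs, "-o")
    pylab.title("auROC vs base similarity sigma")
    pylab.show()

    return (sigmas, rocs)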
def main(): print "starting debugging:" SPLIT_POINTER = 1 from expenv import MultiSplitSet from helper import Options # select dataset multi_split_set = MultiSplitSet.get(379) dataset_name = multi_split_set.description print "dataset_name", dataset_name #create mock taxonomy object by freezable struct #taxonomy = Options() #taxonomy.data = taxonomy_graph.data #taxonomy.description = dataset_name #taxonomy.freeze() #create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeStringKernel" param.wdk_degree = 1 param.cost = 1.0 param.transform = 2.0 param.taxonomy = multi_split_set.taxonomy param.id = 666 param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) # train hierarchical xval mymethod = Method(param) mymethod.train(data_train) assessment = mymethod.evaluate(data_eval) print assessment assessment.destroySelf()
def main(): print "starting debugging:" SPLIT_POINTER = 1 from expenv import MultiSplitSet from helper import Options # select dataset multi_split_set = MultiSplitSet.get(399) #create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeRBFKernel" #"WeightedDegreeStringKernel"# # param.wdk_degree = 1 param.cost = 1.0 param.transform = 1.0 param.sigma = 1.0 param.id = 666 param.base_similarity = 1 param.degree = 2 param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) # train mymethod = Method(param) mymethod.train(data_train) print "training done" assessment = mymethod.evaluate(data_eval) print assessment assessment.destroySelf()
def main(): print "starting debugging:" SPLIT_POINTER = -1 from expenv import MultiSplitSet from helper import Options # select dataset #multi_split_set = MultiSplitSet.get(387) multi_split_set = MultiSplitSet.get(386) #dataset_name = multi_split_set.description # create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeStringKernel"#"PolyKernel" param.wdk_degree = 1 param.cost = 100 param.transform = 2 #2.0 param.taxonomy = multi_split_set.taxonomy param.id = 666 param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) # train mymethod = Method(param) mymethod.train(data_train) assessment = mymethod.evaluate(data_eval) print assessment assessment.destroySelf()
def main(): print "starting debugging:" SPLIT_POINTER = -1 from expenv import MultiSplitSet from helper import Options # select dataset #multi_split_set = MultiSplitSet.get(387) multi_split_set = MultiSplitSet.get(386) #dataset_name = multi_split_set.description # create mock param object by freezable struct param = Options() param.kernel = "WeightedDegreeStringKernel"#"PolyKernel" param.wdk_degree = 1 param.cost = 1 param.transform = 2 #2.0 param.taxonomy = multi_split_set.taxonomy param.id = 666 param.freeze() data_train = multi_split_set.get_train_data(SPLIT_POINTER) data_eval = multi_split_set.get_eval_data(SPLIT_POINTER) # train mymethod = Method(param) mymethod.train(data_train) assessment = mymethod.evaluate(data_eval) print assessment assessment.destroySelf()
def test_data():

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    # fetch data
    instance_set = mss.get_train_data(-1)

    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    # example distance matrix (currently unused below)
    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for t1_name in task_names:
        for t2_name in task_names:
            similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name),
                                                          taxonomy.get_id(t2_name))
            gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR
    cost = param.cost * numpy.sqrt(FACTOR)

    print gammas

    ##########
    # regular normalizer

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for t1_name in task_names:
        for t2_name in task_names:
            similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name),
                                           data.name_to_id(t2_name),
                                           similarity)

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print "--->", wdk_tree.get_normalizer().get_name()

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print "finished training tree-norm SVM:", svm_tree.get_objective()

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print "--->", wdk.get_normalizer().get_name()

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print "finished training manually set SVM:", svm.get_objective()

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert len(alphas_tree) == len(alphas)

    for i in xrange(len(alphas)):
        assert abs(alphas_tree[i] - alphas[i]) < 0.0001

    print "success: all alphas are the same"
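# Small sanity-check sketch for the gamma matrix written to /tmp/gammas above
# (pure numpy; the path matches the helper.save call in test_data).
def check_gammas(gammas):
    import numpy

    gammas = numpy.asarray(gammas)
    # task similarities derived from a taxonomy should be symmetric
    assert numpy.allclose(gammas, gammas.T), "gamma matrix is not symmetric"
    print "self-similarities (diagonal):", numpy.diag(gammas)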
# Variant of the setup in test_data: same data and parameters, but using
# MultitaskKernelTreeNormalizer instead of the PLIF normalizer.

mss = expenv.MultiSplitSet.get(379)

##################################################################
# data
##################################################################

# fetch data
instance_set = mss.get_train_data(-1)

# prepare data
data = PreparedMultitaskData(instance_set, shuffle=True)

# set parameters
param = Options()
param.kernel = "WeightedDegreeStringKernel"
param.wdk_degree = 4
param.cost = 1.0
param.transform = 1.0
param.id = 666
param.freeze()

##################################################################
# taxonomy
##################################################################

taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

# create tree normalizer
tree_normalizer = MultitaskKernelTreeNormalizer(data.task_vector_names,
                                                data.task_vector_names,
                                                taxonomy)
def __init__(self, degree, sigma, active_set, wdk_rbf_on):
    '''
    loads data into handler
    '''

    self.active_set = active_set

    fn = "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHCsequenzen/pseudo.txt"

    tmp_key = ""
    tmp_idx = 0

    self.seqs = []
    self.keys = []
    self.name_to_id = {}

    # parse FASTA-style file: header lines start with ">", the following
    # line holds the pseudo-sequence
    for line in file(fn):
        if line.startswith(">"):
            tmp_key = line.strip()[1:]
        else:
            if active_set.count(tmp_key) > 0:
                assert self.keys.count(tmp_key) == 0, "key %s is already contained in self.keys" % (tmp_key)
                self.seqs.append(line.strip())
                self.keys.append(tmp_key)
                self.name_to_id[tmp_key] = tmp_idx
                tmp_idx += 1

    assert len(self.seqs) == tmp_idx, "incorrect number of sequences %i != %i" % (len(self.seqs), tmp_idx)
    assert len(self.keys) == tmp_idx, "incorrect number of keys %i != %i" % (len(self.keys), tmp_idx)

    # setup kernel
    param = Options()
    if wdk_rbf_on:
        param.kernel = "WeightedDegreeRBFKernel"
    else:
        param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = degree
    param.transform = sigma

    self.kernel = shogun_factory.create_kernel(self.seqs, param)

    #######################
    # compute kernel
    #######################

    num_tasks = len(self.seqs)
    self.similarity = numpy.zeros((num_tasks, num_tasks))

    for i in xrange(num_tasks):
        for j in xrange(num_tasks):
            self.similarity[i, j] = self.kernel.kernel(i, j)

    # normalize kernel by its largest entry
    my_min = numpy.min(self.similarity)
    my_max = numpy.max(self.similarity)
    my_diff = my_max - my_min  # only needed for the min-max variant below

    # scale to interval [0,1]
    #self.similarity = (self.similarity - my_min) / my_diff
    self.similarity = self.similarity / my_max

    print self.similarity
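# Minimal standalone sketch of the scaling used above: dividing by the largest
# kernel entry maps similarities into (-inf, 1] while preserving zeros, whereas
# the min-max variant (commented out above) would force the full range [0, 1].
def scale_similarity(K):
    import numpy

    K = numpy.asarray(K, dtype=float)
    return K / numpy.max(K)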