taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data) # create tree normalizer tree_normalizer = MultitaskKernelTreeNormalizer(data.task_vector_names, data.task_vector_names, taxonomy) task_names = data.get_task_names() FACTOR = 1.0 # init gamma matrix gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) for t1_name in task_names: for t2_name in task_names: similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name), taxonomy.get_id(t2_name)) gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity helper.save("/tmp/gammas", gammas) gammas = gammas * FACTOR cost = param.cost * numpy.sqrt(FACTOR) print gammas
def test_data(): ################################################################## # select MSS ################################################################## mss = expenv.MultiSplitSet.get(379) ################################################################## # data ################################################################## # fetch data instance_set = mss.get_train_data(-1) # prepare data data = PreparedMultitaskData(instance_set, shuffle=True) # set parameters param = Options() param.kernel = "WeightedDegreeStringKernel" param.wdk_degree = 4 param.cost = 1.0 param.transform = 1.0 param.id = 666 param.freeze() ################################################################## # taxonomy ################################################################## taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data) support = numpy.linspace(0, 100, 4) distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]] # create tree normalizer tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names) task_names = data.get_task_names() FACTOR = 1.0 # init gamma matrix gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) for t1_name in task_names: for t2_name in task_names: similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name), taxonomy.get_id(t2_name)) gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity helper.save("/tmp/gammas", gammas) gammas = gammas * FACTOR cost = param.cost * numpy.sqrt(FACTOR) print gammas ########## # regular normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) for t1_name in task_names: for t2_name in task_names: similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity) ################################################################## # Train SVMs ################################################################## # create shogun objects wdk_tree = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) wdk_tree.set_normalizer(tree_normalizer) wdk_tree.init_normalizer() print "--->",wdk_tree.get_normalizer().get_name() svm_tree = SVMLight(cost, wdk_tree, lab) svm_tree.set_linadd_enabled(False) svm_tree.set_batch_computation_enabled(False) svm_tree.train() del wdk_tree del tree_normalizer print "finished training tree-norm SVM:", svm_tree.get_objective() wdk = shogun_factory.create_kernel(data.examples, param) wdk.set_normalizer(normalizer) wdk.init_normalizer() print "--->",wdk.get_normalizer().get_name() svm = SVMLight(cost, wdk, lab) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) svm.train() print "finished training manually set SVM:", svm.get_objective() alphas_tree = svm_tree.get_alphas() alphas = svm.get_alphas() assert(len(alphas_tree)==len(alphas)) for i in xrange(len(alphas)): assert(abs(alphas_tree[i] - alphas[i]) < 0.0001) print "success: all alphas are the same"
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ assert(param.base_similarity >= 1) # merge data sets data = PreparedMultitaskData(train_data, shuffle=False) # create shogun data objects base_wdk = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) # set normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) # load data #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt") f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt") #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt") #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt") num_lines = int(f.readline().strip()) task_distances = numpy.zeros((num_lines, num_lines)) name_to_id = {} for (i, line) in enumerate(f): tokens = line.strip().split("\t") name = str(tokens[0]) name_to_id[name] = i entry = numpy.array([v for (j,v) in enumerate(tokens) if j!=0]) assert len(entry)==num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines) task_distances[i,:] = entry # cut relevant submatrix active_ids = [name_to_id[name] for name in data.get_task_names()] tmp_distances = task_distances[active_ids, :] tmp_distances = tmp_distances[:, active_ids] print "distances ", tmp_distances.shape # normalize distances task_distances = task_distances / numpy.max(tmp_distances) similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) # convert distance to similarity for task_name_lhs in data.get_task_names(): for task_name_rhs in data.get_task_names(): # convert similarity with simple transformation similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]] normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity) # save for later similarities[data.name_to_id(task_name_lhs),data.name_to_id(task_name_rhs)] = similarity # set normalizer base_wdk.set_normalizer(normalizer) base_wdk.init_normalizer() # set up svm svm = SVMLight(param.cost, base_wdk, lab) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) # normalize cost norm_c_pos = param.cost / float(len([l for l in data.labels if l==1])) norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1])) svm.set_C(norm_c_neg, norm_c_pos) # start training svm.train() # save additional information self.additional_information["svm objective"] = svm.get_objective() self.additional_information["num sv"] = svm.get_num_support_vectors() #self.additional_information["distances"] = distances self.additional_information["similarities"] = similarities # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in data.get_task_names(): task_num = data.name_to_id(task_name) # save svm and task_num svms[task_name] = (task_num, param, svm) return svms
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ # merge data sets data = PreparedMultitaskData(train_data, shuffle=False) # create shogun data objects base_wdk = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) # support support = numpy.linspace(0, 1, 5) # set normalizer normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_nums) # fetch taxonomy from parameter object taxonomy = param.taxonomy.data taxonomy.plot() import os os.system("evince demo.png &") # compute distances distances = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) for (i, task_name_lhs) in enumerate(data.get_task_names()): for (j, task_name_rhs) in enumerate(data.get_task_names()): distances[i, j] = task_similarities.compute_hop_distance( taxonomy, task_name_lhs, task_name_rhs) # normalize distances distances = distances / numpy.max(distances) # set distances for (i, task_name_lhs) in enumerate(data.get_task_names()): for (j, task_name_rhs) in enumerate(data.get_task_names()): normalizer.set_task_distance(i, j, distances[i, j]) # assign normalizer base_wdk.set_normalizer(normalizer) base_wdk.init_normalizer() svm = None debug_weights = {} num_subk = base_wdk.get_num_subkernels() print "num subkernels:", num_subk #print "subkernel weights:", base_wdk.get_subkernel_weights() debug_weights["before"] = [ normalizer.get_beta(i) for i in range(num_subk) ] print "using MKL:", (param.transform >= 1.0) if param.transform >= 1.0: num_threads = 4 svm = MKLClassification() svm.set_mkl_norm(param.transform) #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX) svm.set_C(param.cost, param.cost) svm.set_kernel(base_wdk) svm.set_labels(lab) svm.parallel.set_num_threads(num_threads) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) svm.train() #print "subkernel weights (after):", base_wdk.get_subkernel_weights() else: # create SVM (disable unsupported optimizations) svm = SVMLight(param.cost, base_wdk, lab) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) svm.train() print "svm objective:", svm.get_objective() debug_weights["after"] = [ normalizer.get_beta(i) for i in range(num_subk) ] # debugging output print "debug weights (before/after):" print debug_weights["before"] print debug_weights["after"] print "" # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in train_data.keys(): svms[task_name] = (svm, data.name_to_id(task_name)) return svms
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ # merge data sets data = PreparedMultitaskData(train_data, shuffle=False) # create shogun data objects base_wdk = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) # support support = numpy.linspace(0, 1, 5) # set normalizer normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_nums) # fetch taxonomy from parameter object taxonomy = param.taxonomy.data taxonomy.plot() import os os.system("evince demo.png &") # compute distances distances = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) for (i,task_name_lhs) in enumerate(data.get_task_names()): for (j, task_name_rhs) in enumerate(data.get_task_names()): distances[i,j] = task_similarities.compute_hop_distance(taxonomy, task_name_lhs, task_name_rhs) # normalize distances distances = distances / numpy.max(distances) # set distances for (i,task_name_lhs) in enumerate(data.get_task_names()): for (j, task_name_rhs) in enumerate(data.get_task_names()): normalizer.set_task_distance(i, j, distances[i,j]) # assign normalizer base_wdk.set_normalizer(normalizer) base_wdk.init_normalizer() svm = None debug_weights = {} num_subk = base_wdk.get_num_subkernels() print "num subkernels:", num_subk #print "subkernel weights:", base_wdk.get_subkernel_weights() debug_weights["before"] = [normalizer.get_beta(i) for i in range(num_subk)] print "using MKL:", (param.transform >= 1.0) if param.transform >= 1.0: num_threads = 4 svm = MKLClassification() svm.set_mkl_norm(param.transform) #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX) svm.set_C(param.cost, param.cost) svm.set_kernel(base_wdk) svm.set_labels(lab) svm.parallel.set_num_threads(num_threads) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) svm.train() #print "subkernel weights (after):", base_wdk.get_subkernel_weights() else: # create SVM (disable unsupported optimizations) svm = SVMLight(param.cost, base_wdk, lab) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) svm.train() print "svm objective:", svm.get_objective() debug_weights["after"] = [normalizer.get_beta(i) for i in range(num_subk)] # debugging output print "debug weights (before/after):" print debug_weights["before"] print debug_weights["after"] print "" # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in train_data.keys(): svms[task_name] = (svm, data.name_to_id(task_name)) return svms
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ assert (param.base_similarity >= 1) # merge data sets data = PreparedMultitaskData(train_data, shuffle=False) # create shogun data objects base_wdk = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) # set normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) # load data #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt") f = file( "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt" ) #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt") #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt") num_lines = int(f.readline().strip()) task_distances = numpy.zeros((num_lines, num_lines)) name_to_id = {} for (i, line) in enumerate(f): tokens = line.strip().split("\t") name = str(tokens[0]) name_to_id[name] = i entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0]) assert len(entry) == num_lines, "len_entry %i, num_lines %i" % ( len(entry), num_lines) task_distances[i, :] = entry # cut relevant submatrix active_ids = [name_to_id[name] for name in data.get_task_names()] tmp_distances = task_distances[active_ids, :] tmp_distances = tmp_distances[:, active_ids] print "distances ", tmp_distances.shape # normalize distances task_distances = task_distances / numpy.max(tmp_distances) similarities = numpy.zeros( (data.get_num_tasks(), data.get_num_tasks())) # convert distance to similarity for task_name_lhs in data.get_task_names(): for task_name_rhs in data.get_task_names(): # convert similarity with simple transformation similarity = param.base_similarity - task_distances[ name_to_id[task_name_lhs], name_to_id[task_name_rhs]] normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity) # save for later similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity # set normalizer base_wdk.set_normalizer(normalizer) base_wdk.init_normalizer() # set up svm svm = SVMLight(param.cost, base_wdk, lab) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) # normalize cost norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1])) norm_c_neg = param.cost / float( len([l for l in data.labels if l == -1])) svm.set_C(norm_c_neg, norm_c_pos) # start training svm.train() # save additional information self.additional_information["svm objective"] = svm.get_objective() self.additional_information["num sv"] = svm.get_num_support_vectors() #self.additional_information["distances"] = distances self.additional_information["similarities"] = similarities # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in data.get_task_names(): task_num = data.name_to_id(task_name) # save svm and task_num svms[task_name] = (task_num, param, svm) return svms
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ # merge data sets data = PreparedMultitaskData(train_data, shuffle=False) # create shogun data objects base_wdk = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) # set normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) ######################################################## print "creating a kernel for each node:" ######################################################## # init seq handler task_kernel = SequencesHandlerRbf(1, param.base_similarity, data.get_task_names(), param.flags["wdk_rbf_on"]) similarities = numpy.zeros( (data.get_num_tasks(), data.get_num_tasks())) # convert distance to similarity for task_name_lhs in data.get_task_names(): for task_name_rhs in data.get_task_names(): # convert similarity with simple transformation similarity = task_kernel.get_similarity( task_name_lhs, task_name_rhs) print similarity print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity) normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity) # save for later similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity # set normalizer base_wdk.set_normalizer(normalizer) base_wdk.init_normalizer() # set up svm svm = SVMLight(param.cost, base_wdk, lab) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) # normalize cost norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1])) norm_c_neg = param.cost / float( len([l for l in data.labels if l == -1])) svm.set_C(norm_c_neg, norm_c_pos) # start training svm.train() # save additional information self.additional_information["svm objective"] = svm.get_objective() self.additional_information["num sv"] = svm.get_num_support_vectors() #self.additional_information["distances"] = distances self.additional_information["similarities"] = similarities # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in data.get_task_names(): task_num = data.name_to_id(task_name) # save svm and task_num svms[task_name] = (task_num, param, svm) return svms
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ # merge data sets data = PreparedMultitaskData(train_data, shuffle=False) # create shogun data objects base_wdk = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) # set normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) ######################################################## print "creating a kernel for each node:" ######################################################## # init seq handler task_kernel = SequencesHandlerRbf(1, param.base_similarity, data.get_task_names(), param.flags["wdk_rbf_on"]) similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) # convert distance to similarity for task_name_lhs in data.get_task_names(): for task_name_rhs in data.get_task_names(): # convert similarity with simple transformation similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs) print similarity print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity) normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity) # save for later similarities[data.name_to_id(task_name_lhs),data.name_to_id(task_name_rhs)] = similarity # set normalizer base_wdk.set_normalizer(normalizer) base_wdk.init_normalizer() # set up svm svm = SVMLight(param.cost, base_wdk, lab) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) # normalize cost norm_c_pos = param.cost / float(len([l for l in data.labels if l==1])) norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1])) svm.set_C(norm_c_neg, norm_c_pos) # start training svm.train() # save additional information self.additional_information["svm objective"] = svm.get_objective() self.additional_information["num sv"] = svm.get_num_support_vectors() #self.additional_information["distances"] = distances self.additional_information["similarities"] = similarities # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in data.get_task_names(): task_num = data.name_to_id(task_name) # save svm and task_num svms[task_name] = (task_num, param, svm) return svms
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ assert(param.base_similarity >= 1) # merge data sets data = PreparedMultitaskData(train_data, shuffle=False) # create shogun data objects base_wdk = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) # create normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) # load hard-coded task-similarity task_similarity = helper.load("/fml/ag-raetsch/home/cwidmer/svn/projects/alt_splice_code/src/task_sim_tis.bz2") # set similarity similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) for (i, task_name_lhs) in enumerate(data.get_task_names()): #max_value_row = max(task_similarity.get_row(task_name_lhs)) max_value_row = 1.0 for (j, task_name_rhs) in enumerate(data.get_task_names()): similarity = task_similarity.get_value(task_name_lhs, task_name_rhs) / max_value_row normalizer.set_task_similarity(i, j, similarity) similarities[i,j] = similarity pprint.pprint similarities # set normalizer #print "WARNING MTK disabled!!!!!!!!!!!!!!!!!!!!!" base_wdk.set_normalizer(normalizer) base_wdk.init_normalizer() # set up svm param.flags["svm_type"] = "svmlight" #fix svm type svm = shogun_factory.create_svm(param, base_wdk, lab) # make sure these parameters are set correctly #print "WARNING MTK WONT WORK WITH THESE SETTINGS!!!!!!!!!!!!!!!!!!!!!" svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) assert svm.get_linadd_enabled() == False, "linadd should be disabled" assert svm.get_batch_computation_enabled == False, "batch compute should be disabled" # start training svm.train() # save additional information self.additional_information["svm objective"] = svm.get_objective() self.additional_information["num sv"] = svm.get_num_support_vectors() #self.additional_information["distances"] = distances self.additional_information["similarities"] = similarities # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in data.get_task_names(): task_num = data.name_to_id(task_name) # save svm and task_num svms[task_name] = (task_num, svm) return svms
def test_data(): ################################################################## # select MSS ################################################################## mss = expenv.MultiSplitSet.get(379) ################################################################## # data ################################################################## # fetch data instance_set = mss.get_train_data(-1) # prepare data data = PreparedMultitaskData(instance_set, shuffle=True) # set parameters param = Options() param.kernel = "WeightedDegreeStringKernel" param.wdk_degree = 4 param.cost = 1.0 param.transform = 1.0 param.id = 666 param.freeze() ################################################################## # taxonomy ################################################################## taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data) support = numpy.linspace(0, 100, 4) distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]] # create tree normalizer tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names) task_names = data.get_task_names() FACTOR = 1.0 # init gamma matrix gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) for t1_name in task_names: for t2_name in task_names: similarity = taxonomy.compute_node_similarity( taxonomy.get_id(t1_name), taxonomy.get_id(t2_name)) gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity helper.save("/tmp/gammas", gammas) gammas = gammas * FACTOR cost = param.cost * numpy.sqrt(FACTOR) print gammas ########## # regular normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) for t1_name in task_names: for t2_name in task_names: similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity) ################################################################## # Train SVMs ################################################################## # create shogun objects wdk_tree = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels) wdk_tree.set_normalizer(tree_normalizer) wdk_tree.init_normalizer() print "--->", wdk_tree.get_normalizer().get_name() svm_tree = SVMLight(cost, wdk_tree, lab) svm_tree.set_linadd_enabled(False) svm_tree.set_batch_computation_enabled(False) svm_tree.train() del wdk_tree del tree_normalizer print "finished training tree-norm SVM:", svm_tree.get_objective() wdk = shogun_factory.create_kernel(data.examples, param) wdk.set_normalizer(normalizer) wdk.init_normalizer() print "--->", wdk.get_normalizer().get_name() svm = SVMLight(cost, wdk, lab) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) svm.train() print "finished training manually set SVM:", svm.get_objective() alphas_tree = svm_tree.get_alphas() alphas = svm.get_alphas() assert (len(alphas_tree) == len(alphas)) for i in xrange(len(alphas)): assert (abs(alphas_tree[i] - alphas[i]) < 0.0001) print "success: all alphas are the same"
# taxonomy ################################################################## taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data) # create tree normalizer tree_normalizer = MultitaskKernelTreeNormalizer(data.task_vector_names, data.task_vector_names, taxonomy) task_names = data.get_task_names() FACTOR = 1.0 # init gamma matrix gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks())) for t1_name in task_names: for t2_name in task_names: similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name), taxonomy.get_id(t2_name)) gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity helper.save("/tmp/gammas", gammas) gammas = gammas * FACTOR cost = param.cost * numpy.sqrt(FACTOR) print gammas
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ # merge data sets data = PreparedMultitaskData(train_data, shuffle=False) # create shogun data objects base_wdk = shogun_factory.create_empty_kernel(param) lab = shogun_factory.create_labels(data.labels) combined_kernel = CombinedKernel() combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO) base_features = shogun_factory.create_features(data.examples) combined_features = CombinedFeatures() # set normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) # load data #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt") f = file( "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt" ) #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt") #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt") num_lines = int(f.readline().strip()) task_distances = numpy.zeros((num_lines, num_lines)) name_to_id = {} for (i, line) in enumerate(f): tokens = line.strip().split("\t") name = str(tokens[0]) name_to_id[name] = i entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0]) assert len(entry) == num_lines, "len_entry %i, num_lines %i" % ( len(entry), num_lines) task_distances[i, :] = entry # cut relevant submatrix active_ids = [name_to_id[name] for name in data.get_task_names()] tmp_distances = task_distances[active_ids, :] tmp_distances = tmp_distances[:, active_ids] print "distances ", tmp_distances.shape # normalize distances task_distances = task_distances / numpy.max(tmp_distances) similarities = numpy.zeros( (data.get_num_tasks(), data.get_num_tasks())) # convert distance to similarity for task_name_lhs in data.get_task_names(): for task_name_rhs in data.get_task_names(): # convert similarity with simple transformation similarity = param.base_similarity - task_distances[ name_to_id[task_name_lhs], name_to_id[task_name_rhs]] normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity) # save for later similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity # set normalizer base_wdk.set_normalizer(normalizer) #base_wdk.init_normalizer() combined_features.append_feature_obj(base_features) combined_kernel.append_kernel(base_wdk) ################################################## # intra-domain blocks intra_block_vec = PairiiVec() for task_id in data.get_task_ids(): intra_block_vec.push_back(Pairii(task_id, task_id)) # create mask-based normalizer normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec) kernel = shogun_factory.create_empty_kernel(param) kernel.set_normalizer(normalizer) # append current kernel to CombinedKernel combined_kernel.append_kernel(kernel) # append features combined_features.append_feature_obj(base_features) # set mixing factor (used if MKL is OFF) assert (param.base_similarity <= 1) assert (param.base_similarity >= 0) combined_kernel.set_subkernel_weights( [param.base_similarity, 1 - param.base_similarity]) combined_kernel.init(combined_features, combined_features) svm = None print "using MKL:", (param.transform >= 1.0) if param.transform >= 1.0: svm = MKLClassification() svm.set_mkl_norm(param.transform) #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto svm.set_C(param.cost, param.cost) svm.set_kernel(combined_kernel) svm.set_labels(lab) else: # create SVM (disable unsupported optimizations) combined_kernel.set_cache_size(500) svm = SVMLight(param.cost, combined_kernel, lab) # set up SVM num_threads = 8 svm.io.enable_progress() svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG) svm.parallel.set_num_threads(num_threads) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) print "WARNING: custom epsilon set" svm.set_epsilon(0.05) # normalize cost norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1])) norm_c_neg = param.cost / float( len([l for l in data.labels if l == -1])) svm.set_C(norm_c_neg, norm_c_pos) # start training svm.train() # save additional information self.additional_information["svm objective"] = svm.get_objective() self.additional_information["num sv"] = svm.get_num_support_vectors() self.additional_information["similarities"] = similarities self.additional_information[ "post_weights"] = combined_kernel.get_subkernel_weights() # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in data.get_task_names(): task_num = data.name_to_id(task_name) # save svm and task_num svms[task_name] = (task_num, combined_kernel, svm) return svms