def _union_train(self, prepared_data, param):
    """
    Train a single SVM in which every pair of tasks gets similarity 1.0.

    A MultitaskKernelNormalizer is created from
    prepared_data.task_vector_nums, filled with the constant similarity
    for all (lhs, rhs) task pairs and attached to a kernel created by
    shogun_factory before the SVM is trained.

    @param prepared_data: object providing task_vector_nums,
                          get_task_names(), name_to_id(), labels, examples
    @param param: parameter object forwarded to shogun_factory
    @return: the trained svm created by shogun_factory.create_svm
    """

    union_similarity = 1.0
    union_normalizer = MultitaskKernelNormalizer(
        prepared_data.task_vector_nums)

    # every (lhs, rhs) combination receives the same similarity
    for lhs_name in prepared_data.get_task_names():
        for rhs_name in prepared_data.get_task_names():
            lhs_id = prepared_data.name_to_id(lhs_name)
            rhs_id = prepared_data.name_to_id(rhs_name)
            union_normalizer.set_task_similarity(
                lhs_id, rhs_id, union_similarity)

    labels = shogun_factory.create_labels(prepared_data.labels)

    print("creating empty kernel")
    union_kernel = shogun_factory.create_kernel(
        prepared_data.examples, param)

    print("setting normalizer")
    union_kernel.set_normalizer(union_normalizer)
    union_kernel.init_normalizer()

    union_svm = shogun_factory.create_svm(param, union_kernel, labels)

    # linadd and batch computation are switched off before training
    union_svm.set_linadd_enabled(False)
    union_svm.set_batch_computation_enabled(False)

    union_svm.train()

    return union_svm
def _union_train(self, prepared_data, param):
    """
    Train one SVM with a multitask normalizer whose entries are all 1.0.

    @param prepared_data: object providing task_vector_nums,
                          get_task_names(), name_to_id(), labels, examples
    @param param: parameter object forwarded to shogun_factory
    @return: the trained svm created by shogun_factory.create_svm
    """

    constant_similarity = 1.0
    all_ones_normalizer = MultitaskKernelNormalizer(
        prepared_data.task_vector_nums)

    # fill the complete task-by-task similarity table with the constant
    for name_a in prepared_data.get_task_names():
        for name_b in prepared_data.get_task_names():
            id_a = prepared_data.name_to_id(name_a)
            id_b = prepared_data.name_to_id(name_b)
            all_ones_normalizer.set_task_similarity(
                id_a, id_b, constant_similarity)

    shogun_labels = shogun_factory.create_labels(prepared_data.labels)

    print("creating empty kernel")
    shogun_kernel = shogun_factory.create_kernel(
        prepared_data.examples, param)

    print("setting normalizer")
    shogun_kernel.set_normalizer(all_ones_normalizer)
    shogun_kernel.init_normalizer()

    trained_svm = shogun_factory.create_svm(
        param, shogun_kernel, shogun_labels)

    # both optimizations are explicitly disabled before training
    trained_svm.set_linadd_enabled(False)
    trained_svm.set_batch_computation_enabled(False)

    trained_svm.train()

    return trained_svm
def create_normalizer_from_taxonomy(taxonomy, prepared_data=None,
                                    base_similarity=None):
    """
    creates kernel normalizer with similarities set from hop-distance
    according to taxonomy

    The hop distance between every pair of task names is computed with
    task_similarities.compute_hop_distance, scaled by the largest distance
    and turned into a similarity via ``base_similarity - distance``.

    @param taxonomy: taxonomy handed to compute_hop_distance
    @param prepared_data: object providing task_vector_nums,
                          get_task_names() and get_num_tasks(); when
                          omitted, the module-level name ``data`` is used
                          (this is what the previous implementation read)
    @param base_similarity: value the scaled distance is subtracted from;
                            when omitted, the module-level
                            ``param.base_similarity`` is used
    @return: MultitaskKernelNormalizer with all pairwise similarities set
    """

    # Backward compatibility: the function used to read the free names
    # `data` and `param`; fall back to them if the caller passes nothing.
    if prepared_data is None:
        prepared_data = data
    if base_similarity is None:
        base_similarity = param.base_similarity

    #TODO fix --> num tasks can be computed from leaves etc...

    task_names = prepared_data.get_task_names()
    num_tasks = prepared_data.get_num_tasks()

    normalizer = MultitaskKernelNormalizer(prepared_data.task_vector_nums)

    # compute distances
    distances = numpy.zeros((num_tasks, num_tasks))

    for (i, task_name_lhs) in enumerate(task_names):
        for (j, task_name_rhs) in enumerate(task_names):
            distances[i, j] = task_similarities.compute_hop_distance(
                taxonomy, task_name_lhs, task_name_rhs)

    # normalize distances; skip the division when every distance is zero
    # (e.g. a single task), which would otherwise produce NaN entries
    max_distance = numpy.max(distances) if distances.size else 0.0
    if max_distance > 0:
        distances = distances / max_distance

    # set similarity
    # NOTE(review): similarities are registered under the enumeration index
    # of get_task_names(); this presumably matches the task ids used in
    # task_vector_nums -- confirm against PreparedMultitaskData.
    for i in range(len(task_names)):
        for j in range(len(task_names)):
            normalizer.set_task_similarity(
                i, j, base_similarity - distances[i, j])

    return normalizer
def _inner_train(self, prepared_data, param):
    """
    Train two classifiers on the prepared data and return them as a list.

    The first SVM uses a kernel normalized by a MultitaskKernelNormalizer
    whose pairwise task similarities are taken from a SequencesHandlerRbf;
    the second one is the result of self._dirac_train.

    @param prepared_data: object providing task_vector_nums,
                          get_task_names(), name_to_id(), labels, examples
    @param param: parameter object; base_similarity and
                  flags["wdk_rbf_on"] are read here
    @return: list [mtk svm, dirac svm]
    """

    #################
    # mtk
    mtk_normalizer = MultitaskKernelNormalizer(
        prepared_data.task_vector_nums)

    from method_mhc_rbf import SequencesHandlerRbf
    rbf_handler = SequencesHandlerRbf(
        1,
        param.base_similarity,
        prepared_data.get_task_names(),
        param.flags["wdk_rbf_on"])

    # copy the handler's pairwise similarities into the normalizer
    for lhs_name in prepared_data.get_task_names():
        for rhs_name in prepared_data.get_task_names():
            pair_similarity = rbf_handler.get_similarity(lhs_name, rhs_name)
            lhs_id = prepared_data.name_to_id(lhs_name)
            rhs_id = prepared_data.name_to_id(rhs_name)
            mtk_normalizer.set_task_similarity(
                lhs_id, rhs_id, pair_similarity)

    labels = shogun_factory.create_labels(prepared_data.labels)

    print("creating empty kernel")
    mtk_kernel = shogun_factory.create_kernel(prepared_data.examples, param)

    print("setting normalizer")
    mtk_kernel.set_normalizer(mtk_normalizer)
    mtk_kernel.init_normalizer()

    mtk_svm = shogun_factory.create_svm(param, mtk_kernel, labels)
    mtk_svm.set_linadd_enabled(False)
    mtk_svm.set_batch_computation_enabled(False)

    mtk_svm.train()

    #################
    # dirac
    dirac_svm = self._dirac_train(prepared_data, param)

    # (a union classifier via self._union_train is currently not added)
    return [mtk_svm, dirac_svm]
def _train(self, train_data, param):
    """
    Train one multitask SVM whose task similarities come from a
    SequencesHandlerRbf and return it once per task.

    Stores "svm objective", "num sv" and "similarities" in
    self.additional_information.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: dict mapping task name to (task_num, param, svm); every
             entry references the same svm object
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # shogun kernel and labels over the merged data
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    ########################################################
    print("creating a kernel for each node:")
    ########################################################

    rbf_handler = SequencesHandlerRbf(
        1,
        param.base_similarity,
        data.get_task_names(),
        param.flags["wdk_rbf_on"])

    num_tasks_shape = (data.get_num_tasks(), data.get_num_tasks())
    similarities = numpy.zeros(num_tasks_shape)

    for lhs_name in data.get_task_names():
        for rhs_name in data.get_task_names():

            pair_similarity = rbf_handler.get_similarity(lhs_name, rhs_name)

            print(pair_similarity)
            print("similarity (%s,%s)=%f" % (lhs_name, rhs_name,
                                             pair_similarity))

            lhs_id = data.name_to_id(lhs_name)
            rhs_id = data.name_to_id(rhs_name)

            normalizer.set_task_similarity(lhs_id, rhs_id, pair_similarity)

            # kept for self.additional_information
            similarities[lhs_id, rhs_id] = pair_similarity

    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # per-class cost: param.cost divided by the number of examples
    # carrying that label
    num_positive = sum(1 for lbl in data.labels if lbl == 1)
    num_negative = sum(1 for lbl in data.labels if lbl == -1)
    norm_c_pos = param.cost / float(num_positive)
    norm_c_neg = param.cost / float(num_negative)
    svm.set_C(norm_c_neg, norm_c_pos)

    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities

    # wrap up predictors: the same svm is referenced for every task
    svms = dict(
        (task_name, (data.name_to_id(task_name), param, svm))
        for task_name in data.get_task_names())

    return svms
def _inner_train(self, prepared_data, param):
    """
    Train one SVM per pocket and return the list of trained SVMs.

    For every pocket returned by
    self.get_pockets(param.flags["all_positions"]) the similarity of two
    tasks is the mean of SequencesHandler.get_similarity over the pocket's
    positions (each shifted by -1). That value is written into a
    MultitaskKernelNormalizer which normalizes the kernel used for
    training.

    @param prepared_data: object providing task_vector_nums,
                          get_task_names(), get_num_tasks(), name_to_id(),
                          labels and examples
    @param param: parameter object; flags and base_similarity are read
    @return: list with one trained svm per pocket
    """

    # init seq handler
    pseudoseqs = SequencesHandler()

    classifiers = []

    for pocket in self.get_pockets(param.flags["all_positions"]):

        print("creating normalizer")

        pocket_normalizer = MultitaskKernelNormalizer(
            prepared_data.task_vector_nums)

        from method_mhc_rbf import SequencesHandlerRbf
        rbf_handler = SequencesHandlerRbf(
            1,
            param.base_similarity,
            prepared_data.get_task_names(),
            param.flags["wdk_rbf_on"])

        print("%s %s" % ("processing pocket", pocket))

        M = prepared_data.get_num_tasks()
        # per-pocket and task-kernel similarity tables; they are filled
        # below but not read again inside this method
        save_sim_p = numpy.zeros((M, M))
        save_sim_t = numpy.zeros((M, M))

        for lhs_name in prepared_data.get_task_names():
            for rhs_name in prepared_data.get_task_names():

                # mean similarity over all positions of this pocket
                pocket_sum = 0.0
                for pseudo_seq_pos in pocket:
                    pocket_sum += float(pseudoseqs.get_similarity(
                        lhs_name, rhs_name, pseudo_seq_pos - 1))
                pocket_similarity = pocket_sum / float(len(pocket))

                task_similarity = rbf_handler.get_similarity(
                    lhs_name, rhs_name)

                print("pocket %s (%s, %s) = %f" % (
                    str(pocket), lhs_name, rhs_name, pocket_similarity))

                lhs_id = prepared_data.name_to_id(lhs_name)
                rhs_id = prepared_data.name_to_id(rhs_name)

                pocket_normalizer.set_task_similarity(
                    lhs_id, rhs_id, pocket_similarity)
                save_sim_p[lhs_id, rhs_id] = pocket_similarity
                save_sim_t[lhs_id, rhs_id] = task_similarity

        labels = shogun_factory.create_labels(prepared_data.labels)

        print("creating empty kernel")
        pocket_kernel = shogun_factory.create_kernel(
            prepared_data.examples, param)

        print("setting normalizer")
        pocket_kernel.set_normalizer(pocket_normalizer)
        pocket_kernel.init_normalizer()

        print("%s %s" % ("training SVM for pocket", pocket))
        pocket_svm = shogun_factory.create_svm(param, pocket_kernel, labels)
        pocket_svm.set_linadd_enabled(False)
        pocket_svm.set_batch_computation_enabled(False)

        pocket_svm.train()

        classifiers.append(pocket_svm)

    return classifiers
def _train(self, train_data, param):
    """
    Train an SVM on a CombinedKernel holding one sub-kernel per pocket.

    Each sub-kernel is normalized by a MultitaskKernelNormalizer whose
    task similarity is the mean SequencesHandler similarity over the
    pocket's positions. MKLClassification is used when
    param.transform >= 1.0, SVMLight otherwise. Stores "svm_objective",
    "svm num sv" and "post_weights" in self.additional_information.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: dict mapping each key of train_data to
             (task id, combined_kernel, svm)
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ##################################################
    # define pockets
    ##################################################

    pockets = [
        [1, 5, 6, 7, 8, 31, 32, 33, 34],
        [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31],
        [11, 20, 21, 22, 29, 31],
        [8, 30, 31, 32],
        [10, 11, 30],
        [10, 11, 12, 13, 20, 29],
        [10, 12, 20, 22, 26, 27, 28, 29],
        [12, 14, 15, 26],
        [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26],
    ]

    ########################################################
    print("creating a kernel:")
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples)

    combined_features = CombinedFeatures()

    # NOTE: earlier variants (merging neighboring pockets, an intra-domain
    # mask kernel and an all-pairs mask kernel) are disabled; only the
    # per-pocket kernels below are added.

    ##################################################
    # add one kernel per pocket

    pseudoseqs = SequencesHandler()

    for pocket in pockets:

        print("creating normalizer")
        pocket_normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print("%s %s" % ("processing pocket", pocket))

        for lhs_name in data.get_task_names():
            for rhs_name in data.get_task_names():

                # mean similarity over the positions of this pocket
                # (positions are shifted by one before the lookup)
                pocket_sum = 0.0
                for pseudo_seq_pos in pocket:
                    pocket_sum += float(pseudoseqs.get_similarity(
                        lhs_name, rhs_name, pseudo_seq_pos - 1))
                pocket_similarity = pocket_sum / float(len(pocket))

                print("pocket %s (%s, %s) = %f" % (
                    str(pocket), lhs_name, rhs_name, pocket_similarity))

                lhs_id = data.name_to_id(lhs_name)
                rhs_id = data.name_to_id(rhs_name)
                pocket_normalizer.set_task_similarity(
                    lhs_id, rhs_id, pocket_similarity)

        print("creating empty kernel")
        kernel_pos = shogun_factory.create_empty_kernel(param)

        print("setting normalizer")
        kernel_pos.set_normalizer(pocket_normalizer)

        print("appending kernel")
        combined_kernel.append_kernel(kernel_pos)

        print("appending features")
        # the same feature object is appended once per sub-kernel
        combined_features.append_feature_obj(base_features)

    print("done constructing combined kernel")

    ##################################################
    # init combined kernel

    combined_kernel.init(combined_features, combined_features)

    print("%s %s" % ("subkernel weights:",
                     combined_kernel.get_subkernel_weights()))

    use_mkl = (param.transform >= 1.0)
    print("%s %s" % ("using MKL:", use_mkl))

    if use_mkl:
        svm = MKLClassification()

        svm.set_mkl_norm(param.transform)

        svm.set_C(param.cost, param.cost)

        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)

        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # per-class cost: param.cost divided by the number of examples
    # carrying that label (this replaces the C set in the MKL branch)
    num_positive = sum(1 for lbl in data.labels if lbl == 1)
    num_negative = sum(1 for lbl in data.labels if lbl == -1)
    norm_c_pos = param.cost / float(num_positive)
    norm_c_neg = param.cost / float(num_negative)
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional info
    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = \
        svm.get_num_support_vectors()
    self.additional_information["post_weights"] = \
        combined_kernel.get_subkernel_weights()

    print(self.additional_information)

    # wrap up predictors: the same svm is referenced for every task
    svms = {}
    for task_name in train_data.keys():
        task_id = data.name_to_id(task_name)
        svms[task_name] = (task_id, combined_kernel, svm)

    return svms
N = subset_size

##################################################################
# internal modification
##################################################################

# first N/2 entries are assigned to task 0, the following N/2 to task 1
task_vector = [0] * (N / 2) + [1] * (N / 2)

base_wdk = WeightedDegreeStringKernel(feat, feat, 1)

normalizer = MultitaskKernelNormalizer(task_vector)

# similarity 4.0 within a task, 1.0 between the two tasks
for i in (0, 1):
    for j in (0, 1):
        normalizer.set_task_similarity(i, j, 4.0 if i == j else 1.0)

base_wdk.set_normalizer(normalizer)

print(base_wdk.get_kernel_matrix())
def test_data():
    """
    Train one SVM with a MultitaskKernelPlifNormalizer and one with a
    MultitaskKernelNormalizer on the same data and assert that both
    produce the same alphas (tolerance 0.0001).

    The data comes from expenv.MultiSplitSet 379; the similarity matrix
    is written to /tmp/gammas via helper.save.
    """

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    instance_set = mss.get_train_data(-1)

    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    # NOTE(review): this table is not read anywhere below
    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(
        support, data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # gamma matrix: node similarity of every task pair in the taxonomy
    num_tasks_shape = (data.get_num_tasks(), data.get_num_tasks())
    gammas = numpy.zeros(num_tasks_shape)

    for name_a in task_names:
        for name_b in task_names:
            node_similarity = taxonomy.compute_node_similarity(
                taxonomy.get_id(name_a), taxonomy.get_id(name_b))
            gammas[data.name_to_id(name_a),
                   data.name_to_id(name_b)] = node_similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR

    cost = param.cost * numpy.sqrt(FACTOR)

    print(gammas)

    ##########
    # regular normalizer, filled from the gamma matrix

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for name_a in task_names:
        for name_b in task_names:
            id_a = data.name_to_id(name_a)
            id_b = data.name_to_id(name_b)
            normalizer.set_task_similarity(id_a, id_b, gammas[id_a, id_b])

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print("%s %s" % ("--->", wdk_tree.get_normalizer().get_name()))

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print("%s %s" % ("finished training tree-norm SVM:",
                     svm_tree.get_objective()))

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print("%s %s" % ("--->", wdk.get_normalizer().get_name()))

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print("%s %s" % ("finished training manually set SVM:",
                     svm.get_objective()))

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert len(alphas_tree) == len(alphas)

    # element-wise comparison of both solutions
    for alpha_tree, alpha_plain in zip(alphas_tree, alphas):
        assert abs(alpha_tree - alpha_plain) < 0.0001

    print("success: all alphas are the same")
def solver_mtk_shogun(C, all_xt, all_lt, task_indicator, M, L, eps,
                      target_obj):
    """
    implementation using multitask kernel

    Trains an SVMLight (bias disabled, C used for both classes) on a base
    kernel normalized by a MultitaskKernelNormalizer whose task
    similarities are the entries of M.

    @param C: cost passed to svm.set_C for both classes
    @param all_xt: examples; a WeightedDegreeStringKernel of degree 8 is
                   used if the first element is a numpy.string_,
                   otherwise a LinearKernel on RealFeatures
    @param all_lt: labels
    @param task_indicator: task index for each example
    @param M: task similarity matrix (indexed as M[i, j]; must have the
              same shape as L)
    @param L: matrix whose first dimension gives the number of tasks
    @param eps: epsilon passed to svm.set_epsilon
    @param target_obj: value passed to svm.set_target_objective
    @return: (objectives, train_times) -- the negated dual objectives and
             the training times reported by the svm
    """

    xt = numpy.array(all_xt)
    lt = numpy.array(all_lt)
    tt = numpy.array(task_indicator, dtype=numpy.int32)
    tsm = numpy.array(M)

    print("%s %s" % ("task_sim:", tsm))

    num_tasks = L.shape[0]

    # sanity checks
    assert len(xt) == len(lt) == len(tt)
    assert M.shape == L.shape
    assert num_tasks == len(set(tt))

    # set up shogun objects; the feature type decides the base kernel
    if type(xt[0]) is numpy.string_:
        feat = StringCharFeatures(DNA)
        xt = [str(sequence) for sequence in xt]
        feat.set_features(xt)
        base_kernel = WeightedDegreeStringKernel(feat, feat, 8)
    else:
        feat = RealFeatures(xt.T)
        base_kernel = LinearKernel(feat, feat)

    lab = Labels(lt)

    # set up normalizer
    normalizer = MultitaskKernelNormalizer(tt.tolist())

    for row in range(num_tasks):
        for col in range(num_tasks):
            normalizer.set_task_similarity(row, col, M[row, col])

    print("%s %s" % ("num of unique tasks: ",
                     normalizer.get_num_unique_tasks(task_indicator)))

    # set up kernel
    base_kernel.set_cache_size(2000)
    base_kernel.set_normalizer(normalizer)
    base_kernel.init_normalizer()

    # set up svm
    svm = SVMLight()
    svm.set_epsilon(eps)

    # how often do we like to compute objective etc
    svm.set_record_interval(0)
    svm.set_target_objective(target_obj)

    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    svm.io.set_loglevel(MSG_DEBUG)

    svm.set_C(C, C)
    svm.set_bias_enabled(False)

    # prepare for training
    svm.set_labels(lab)
    svm.set_kernel(base_kernel)

    # train svm
    svm.train()

    train_times = svm.get_training_times()
    objectives = [-dual for dual in svm.get_dual_objectives()]

    # Disabled: appending the primal objective of the final solution.
    if False:

        # get model parameters
        sv_idx = svm.get_support_vectors()
        sparse_alphas = svm.get_alphas()

        assert len(sv_idx) == len(sparse_alphas)

        # compute dense alpha (remove label)
        alphas = numpy.zeros(len(xt))
        for id_dense, sparse_alpha in zip(sv_idx, sparse_alphas):
            alphas[id_dense] = sparse_alpha * lt[id_dense]

        W = alphas_to_w(alphas, xt, lt, task_indicator, M)
        flat_W = W.reshape(W.shape[0] * W.shape[1])
        primal_obj = compute_primal_objective(
            flat_W, C, all_xt, all_lt, task_indicator, L)
        objectives.append(primal_obj)

        train_times.append(train_times[-1] + 100)

    return objectives, train_times
def test_data():
    """
    Compare an SVM trained with a MultitaskKernelPlifNormalizer against
    one trained with a MultitaskKernelNormalizer.

    Both SVMs are trained on the training data of expenv.MultiSplitSet
    379; the function asserts that their alphas agree within 0.0001. The
    similarity matrix is written to /tmp/gammas via helper.save.
    """

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    # fetch and prepare data
    instance_set = mss.get_train_data(-1)
    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    # NOTE(review): this table is not read anywhere below
    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(
        support, data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # init gamma matrix with the taxonomy's node similarities
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for first_name in task_names:
        for second_name in task_names:
            first_node = taxonomy.get_id(first_name)
            second_node = taxonomy.get_id(second_name)
            pair_similarity = taxonomy.compute_node_similarity(
                first_node, second_node)
            gammas[data.name_to_id(first_name),
                   data.name_to_id(second_name)] = pair_similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR

    cost = param.cost * numpy.sqrt(FACTOR)

    print(gammas)

    ##########
    # regular normalizer

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for first_name in task_names:
        for second_name in task_names:
            first_id = data.name_to_id(first_name)
            second_id = data.name_to_id(second_name)
            normalizer.set_task_similarity(
                first_id, second_id, gammas[first_id, second_id])

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print("%s %s" % ("--->", wdk_tree.get_normalizer().get_name()))

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print("%s %s" % ("finished training tree-norm SVM:",
                     svm_tree.get_objective()))

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print("%s %s" % ("--->", wdk.get_normalizer().get_name()))

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print("%s %s" % ("finished training manually set SVM:",
                     svm.get_objective()))

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert len(alphas_tree) == len(alphas)

    # both solutions must agree entry by entry
    for from_tree, from_plain in zip(alphas_tree, alphas):
        assert abs(from_tree - from_plain) < 0.0001

    print("success: all alphas are the same")
similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name), taxonomy.get_id(t2_name)) gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity helper.save("/tmp/gammas", gammas) gammas = gammas * FACTOR cost = param.cost * numpy.sqrt(FACTOR) print gammas ########## # regular normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) for t1_name in task_names: for t2_name in task_names: similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity) ################################################################## # Train SVMs ################################################################## # create shogun objects wdk_tree = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels)
def _train(self, train_data, param):
    """
    Train an SVM on a CombinedKernel made of a task-distance based
    multitask kernel and an intra-task (mask pair) kernel.

    Task distances are read from a hard-coded text file: the first line
    holds the number of tasks, every following line is tab-separated with
    the task name first and one distance per task after it. Distances are
    scaled by the largest distance among the active tasks and converted
    to similarities via ``param.base_similarity - distance``.

    MKLClassification is used when param.transform >= 1.0, SVMLight
    otherwise. Stores "svm objective", "num sv", "similarities" and
    "post_weights" in self.additional_information.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: dict mapping task name to (task_num, combined_kernel, svm);
             every entry references the same svm object
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_empty_kernel(param)
    lab = shogun_factory.create_labels(data.labels)

    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data (Hamming distances between pseudo sequences; files with
    # other distance measures live in the same directory)
    distance_file = "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"

    name_to_id = {}

    # the file is closed as soon as parsing finishes or fails
    with open(distance_file) as f:

        num_lines = int(f.readline().strip())
        task_distances = numpy.zeros((num_lines, num_lines))

        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")
            name = str(tokens[0])
            name_to_id[name] = i

            # everything after the name is one distance per task
            entry = tokens[1:]
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)
            task_distances[i, :] = [float(value) for value in entry]

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print("%s %s" % ("distances ", tmp_distances.shape))

    # normalize distances: the full matrix is scaled by the maximum over
    # the active tasks only
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros(
        (data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = param.base_similarity - task_distances[
                name_to_id[task_name_lhs], name_to_id[task_name_rhs]]

            lhs_id = data.name_to_id(task_name_lhs)
            rhs_id = data.name_to_id(task_name_rhs)

            normalizer.set_task_similarity(lhs_id, rhs_id, similarity)

            # save for later
            similarities[lhs_id, rhs_id] = similarity

    # set normalizer (the normalizer is not initialised here; the
    # combined kernel is initialised further down)
    base_wdk.set_normalizer(normalizer)

    combined_features.append_feature_obj(base_features)
    combined_kernel.append_kernel(base_wdk)

    ##################################################
    # intra-domain blocks

    intra_block_vec = PairiiVec()

    for task_id in data.get_task_ids():
        intra_block_vec.push_back(Pairii(task_id, task_id))

    # create mask-based normalizer
    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                   intra_block_vec)
    kernel = shogun_factory.create_empty_kernel(param)
    kernel.set_normalizer(normalizer)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel)

    # append features
    combined_features.append_feature_obj(base_features)

    # set mixing factor (used if MKL is OFF)
    assert (param.base_similarity <= 1)
    assert (param.base_similarity >= 0)
    combined_kernel.set_subkernel_weights(
        [param.base_similarity, 1 - param.base_similarity])

    combined_kernel.init(combined_features, combined_features)

    use_mkl = (param.transform >= 1.0)
    print("%s %s" % ("using MKL:", use_mkl))

    if use_mkl:
        svm = MKLClassification()

        svm.set_mkl_norm(param.transform)

        svm.set_C(param.cost, param.cost)

        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)

        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    print("WARNING: custom epsilon set")
    svm.set_epsilon(0.05)

    # normalize cost: param.cost divided by the number of examples per
    # label (this replaces the C set in the MKL branch)
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(
        len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities
    self.additional_information[
        "post_weights"] = combined_kernel.get_subkernel_weights()

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():

        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, combined_kernel, svm)

    return svms
def _inner_train(self, prepared_data, param):
    """
    Train the multitask-kernel SVM and the dirac SVM; return both.

    Task similarities for the multitask kernel are obtained from a
    SequencesHandlerRbf and written into a MultitaskKernelNormalizer. The
    second classifier is produced by self._dirac_train.

    @param prepared_data: object providing task_vector_nums,
                          get_task_names(), name_to_id(), labels, examples
    @param param: parameter object; base_similarity and
                  flags["wdk_rbf_on"] are read here
    @return: list [mtk svm, dirac svm]
    """

    #################
    # mtk
    task_normalizer = MultitaskKernelNormalizer(
        prepared_data.task_vector_nums)

    from method_mhc_rbf import SequencesHandlerRbf
    similarity_source = SequencesHandlerRbf(
        1,
        param.base_similarity,
        prepared_data.get_task_names(),
        param.flags["wdk_rbf_on"])

    # transfer every pairwise similarity into the normalizer
    for name_a in prepared_data.get_task_names():
        for name_b in prepared_data.get_task_names():
            value = similarity_source.get_similarity(name_a, name_b)
            id_a = prepared_data.name_to_id(name_a)
            id_b = prepared_data.name_to_id(name_b)
            task_normalizer.set_task_similarity(id_a, id_b, value)

    shogun_labels = shogun_factory.create_labels(prepared_data.labels)

    print("creating empty kernel")
    shogun_kernel = shogun_factory.create_kernel(
        prepared_data.examples, param)

    print("setting normalizer")
    shogun_kernel.set_normalizer(task_normalizer)
    shogun_kernel.init_normalizer()

    svm_mtk = shogun_factory.create_svm(param, shogun_kernel, shogun_labels)
    svm_mtk.set_linadd_enabled(False)
    svm_mtk.set_batch_computation_enabled(False)

    svm_mtk.train()

    #################
    # dirac
    svm_dirac = self._dirac_train(prepared_data, param)

    # (a union classifier via self._union_train is currently not added)
    return [svm_mtk, svm_dirac]
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    Trains one multitask SVM on the merged data of all tasks. The
    pairwise task similarities are read from a precomputed table that
    is loaded from a hard-coded path and are handed to a
    MultitaskKernelNormalizer.

    NOTE: param.flags["svm_type"] is overwritten with "svmlight", i.e.
    the param object passed in by the caller is modified.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: task name -> (task id, svm); all tasks share the same svm
    @rtype: dict<str, tuple>
    """

    assert(param.base_similarity >= 1)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # create normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load hard-coded task-similarity
    task_similarity = helper.load("/fml/ag-raetsch/home/cwidmer/svn/projects/alt_splice_code/src/task_sim_tis.bz2")

    # set similarity
    task_names = data.get_task_names()
    num_tasks = data.get_num_tasks()
    similarities = numpy.zeros((num_tasks, num_tasks))

    # row-wise normalization is switched off: the divisor is fixed to 1.0
    max_value_row = 1.0

    for (i, task_name_lhs) in enumerate(task_names):
        for (j, task_name_rhs) in enumerate(task_names):
            similarity = task_similarity.get_value(task_name_lhs, task_name_rhs) / max_value_row
            # the enumeration index is used as task id here
            normalizer.set_task_similarity(i, j, similarity)
            similarities[i, j] = similarity

    # was "pprint.pprint similarities", which is not valid syntax
    pprint.pprint(similarities)

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    param.flags["svm_type"] = "svmlight"  # fix svm type
    svm = shogun_factory.create_svm(param, base_wdk, lab)

    # make sure these parameters are set correctly
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    assert svm.get_linadd_enabled() == False, "linadd should be disabled"
    # the getter has to be called; comparing the bound method itself to
    # False can never be true, so the old assertion always failed
    assert svm.get_batch_computation_enabled() == False, "batch compute should be disabled"

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities

    # wrap up predictors: every task refers to the same svm
    svms = {}
    for task_name in task_names:
        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    The data is split with split_data(train_data, 4) into one part for
    training the weak learners and one part for boosting. One multitask
    SVM is trained per pocket (here: one single-position pocket per
    index 0..34); the task similarity of a pocket is the mean of the
    per-position similarities from SequencesHandler. For every task of
    the boosting split, solve_boosting is then called on the signs of
    the weak learners' outputs.

    NOTE(review): the boosting weights are computed but not returned;
    every task is mapped to the last weak learner only. This looks
    unfinished -- confirm before relying on the result.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: task name -> (task id, svm)
    @rtype: dict<str, tuple>
    """

    # split for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    # merge data sets
    data = PreparedMultitaskData(train_weak, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ##################################################
    # define pockets
    ##################################################

    # One pocket per position. The hand-written table of nine pockets
    # that used to precede this was overwritten right away and never
    # used, so it has been dropped.
    # NOTE(review): positions are passed on as pseudo_seq_pos-1 below,
    # so position 0 is looked up as index -1 -- confirm this is intended.
    pockets = [[pos] for pos in range(35)]

    ########################################################
    print("creating a kernel:")
    ########################################################

    # init seq handler
    pseudoseqs = SequencesHandler()

    classifiers = []

    for pocket in pockets:

        print("creating normalizer")
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print("processing pocket %s" % (pocket,))

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                similarity = 0.0

                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))

                # normalize: mean over the positions of the pocket
                similarity = similarity / float(len(pocket))

                print("pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity))

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)

        print("creating empty kernel")
        kernel = shogun_factory.create_kernel(data.examples, param)

        print("setting normalizer")
        kernel.set_normalizer(normalizer)

        print("training SVM for pocket %s" % (pocket,))
        svm = self._train_single_svm(param, kernel, lab)

        classifiers.append(svm)

    print("done obtaining weak learners")

    ##################################################
    # combine weak learners for each task
    ##################################################

    # set constants
    some = 0.9
    import cvxmod

    # wrap up predictors
    svms = {}

    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        N = len(instances)
        F = len(pockets)

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # one row per example, one column per weak learner
        out = cvxmod.zeros((N, F))

        for i in range(F):

            svm = classifiers[i]

            tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))

            out[:, i] = numpy.sign(tmp_out)

        #TODO: fix
        helper.save("/tmp/out_sparse", (out, labels))

        # a leftover pdb.set_trace() was removed here; it stopped every
        # training run in the debugger

        # NOTE(review): weights is currently unused, see docstring
        weights = solve_boosting(out, labels, some, solver="mosek")

        svms[task_name] = (data.name_to_id(task_name), svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    A single SVMLight is trained on the merged data of all tasks. The
    kernel is normalized by a MultitaskKernelNormalizer whose pairwise
    task similarities are obtained from SequencesHandlerRbf. The cost
    is divided by the number of positive / negative labels.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: task name -> (task id, param, svm); the svm is shared
    @rtype: dict<str, tuple>
    """

    # merge the per-task data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # shogun objects: kernel and labels
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    print("creating a kernel for each node:")

    # handler providing the task-task similarities
    task_kernel = SequencesHandlerRbf(
        1,
        param.base_similarity,
        data.get_task_names(),
        param.flags["wdk_rbf_on"])

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for lhs_name in data.get_task_names():
        for rhs_name in data.get_task_names():

            pair_similarity = task_kernel.get_similarity(lhs_name, rhs_name)

            print(pair_similarity)
            print("similarity (%s,%s)=%f" % (lhs_name, rhs_name, pair_similarity))

            lhs_id = data.name_to_id(lhs_name)
            rhs_id = data.name_to_id(rhs_name)

            normalizer.set_task_similarity(lhs_id, rhs_id, pair_similarity)

            # keep a copy for the additional information below
            similarities[lhs_id, rhs_id] = pair_similarity

    # attach the normalizer to the kernel
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # per-class cost, scaled by the class sizes
    num_positive = sum(1 for label in data.labels if label == 1)
    num_negative = sum(1 for label in data.labels if label == -1)
    norm_c_pos = param.cost / float(num_positive)
    norm_c_neg = param.cost / float(num_negative)
    svm.set_C(norm_c_neg, norm_c_pos)

    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities

    # every task gets a reference to the same svm
    return dict(
        (task_name, (data.name_to_id(task_name), param, svm))
        for task_name in data.get_task_names())
# string features and labels for the examples
feat = StringCharFeatures(DNA)
feat.set_features(examples)
lab = Labels(numpy.array(labels))

N = subset_size

##################################################################
# internal modification
##################################################################

# first half of the examples is assigned to task 0, second half to task 1
half = N // 2
task_vector = [0] * half + [1] * half

base_wdk = WeightedDegreeStringKernel(feat, feat, 1)
normalizer = MultitaskKernelNormalizer(task_vector)

# similarity 4.0 within a task, 1.0 between the two tasks
for i in range(2):
    for j in range(2):
        normalizer.set_task_similarity(i, j, 4.0 if i == j else 1.0)

base_wdk.set_normalizer(normalizer)

print(base_wdk.get_kernel_matrix())

print("---> %s" % (base_wdk.get_normalizer().get_name(),))
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    Builds a CombinedKernel with one sub-kernel per pocket. Each
    sub-kernel carries a MultitaskKernelNormalizer whose task
    similarity is the mean, over the pocket's positions, of the
    similarities reported by SequencesHandler. If param.transform is
    >= 1.0 an MKLClassification with that norm is trained, otherwise a
    plain SVMLight.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: task name -> (task id, combined kernel, svm)
    @rtype: dict<str, tuple>
    """

    # merge the per-task data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ##################################################
    # define pockets
    ##################################################

    pockets = [
        [1, 5, 6, 7, 8, 31, 32, 33, 34],
        [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31],
        [11, 20, 21, 22, 29, 31],
        [8, 30, 31, 32],
        [10, 11, 30],
        [10, 11, 12, 13, 20, 29],
        [10, 12, 20, 22, 26, 27, 28, 29],
        [12, 14, 15, 26],
        [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26],
    ]

    ########################################################
    print("creating a kernel:")
    ########################################################

    # containers for the sub-kernels and their features
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    ##################################################
    # add one kernel per pocket
    ##################################################

    seq_handler = SequencesHandler()

    for pocket in pockets:

        print("creating normalizer")
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print("processing pocket %s" % (pocket,))

        for lhs_name in data.get_task_names():
            for rhs_name in data.get_task_names():

                # positions in the pocket are one-based, hence the -1
                total = sum(
                    (float(seq_handler.get_similarity(lhs_name, rhs_name, position - 1))
                     for position in pocket),
                    0.0)

                # mean similarity over the positions of the pocket
                mean_similarity = total / float(len(pocket))

                print("pocket %s (%s, %s) = %f" % (str(pocket), lhs_name, rhs_name, mean_similarity))

                lhs_id = data.name_to_id(lhs_name)
                rhs_id = data.name_to_id(rhs_name)
                normalizer.set_task_similarity(lhs_id, rhs_id, mean_similarity)

        print("creating empty kernel")
        pocket_kernel = shogun_factory.create_empty_kernel(param)

        print("setting normalizer")
        pocket_kernel.set_normalizer(normalizer)

        print("appending kernel")
        combined_kernel.append_kernel(pocket_kernel)

        print("appending features")
        combined_features.append_feature_obj(base_features)

    print("done constructing combined kernel")

    ##################################################
    # init combined kernel
    ##################################################

    combined_kernel.init(combined_features, combined_features)

    print("subkernel weights: %s" % (combined_kernel.get_subkernel_weights(),))

    use_mkl = param.transform >= 1.0
    print("using MKL: %s" % (use_mkl,))

    if use_mkl:
        svm = MKLClassification()

        svm.set_mkl_norm(param.transform)

        svm.set_C(param.cost, param.cost)

        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)

        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    svm.parallel.set_num_threads(8)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # per-class cost, scaled by the class sizes
    num_positive = sum(1 for label in data.labels if label == 1)
    num_negative = sum(1 for label in data.labels if label == -1)
    norm_c_pos = param.cost / float(num_positive)
    norm_c_neg = param.cost / float(num_negative)

    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional info
    info = self.additional_information
    info["svm_objective"] = svm.get_objective()
    info["svm num sv"] = svm.get_num_support_vectors()
    info["post_weights"] = combined_kernel.get_subkernel_weights()

    print(self.additional_information)

    # every task gets a reference to the same kernel and svm
    svms = {}
    for task_name in train_data.keys():
        task_id = data.name_to_id(task_name)
        svms[task_name] = (task_id, combined_kernel, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    Task distances are read from a tab-separated text file at a
    hard-coded path: the first line holds the number of rows, every
    following line holds a task name followed by one distance per
    task. The distances are divided by the largest distance among the
    tasks present in train_data and turned into similarities via
    param.base_similarity - distance. A single SVMLight is trained on
    the merged data with the resulting multitask kernel.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: task name -> (task id, param, svm); the svm is shared
    @rtype: dict<str, tuple>
    """

    assert(param.base_similarity >= 1)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data
    # alternatives that were tried, located in the same directory:
    # ALL_PseudoSeq_BlosumEnc_pearson.txt, ALL_PseudoSeq_BlosumEnc_euklid.txt,
    # ALL_RAxML.txt
    distance_file = "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"

    name_to_id = {}

    # the with-block closes the file; it used to be left open
    with open(distance_file) as f:

        num_lines = int(f.readline().strip())

        task_distances = numpy.zeros((num_lines, num_lines))

        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")

            # first column is the task name, the rest are distances
            name = str(tokens[0])
            name_to_id[name] = i

            entry = tokens[1:]
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)

            # convert explicitly instead of relying on implicit casting
            task_distances[i, :] = numpy.array(entry, dtype=float)

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print("distances  %s" % (tmp_distances.shape,))

    # normalize distances by the largest distance among the active tasks
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # name_to_id (local dict) indexes the distance file,
            # data.name_to_id yields the task id of the merged data
            distance = task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]]

            # convert similarity with simple transformation
            similarity = param.base_similarity - distance

            lhs_id = data.name_to_id(task_name_lhs)
            rhs_id = data.name_to_id(task_name_rhs)

            normalizer.set_task_similarity(lhs_id, rhs_id, similarity)

            # save for later
            similarities[lhs_id, rhs_id] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))

    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities

    # wrap up predictors: every task refers to the same svm
    svms = {}
    for task_name in data.get_task_names():

        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    Task distances are read from a tab-separated text file at a
    hard-coded path: the first line holds the number of rows, every
    following line holds a task name followed by one distance per
    task. The distances are divided by the largest distance among the
    tasks present in train_data and turned into similarities via
    param.base_similarity - distance. A single SVMLight is trained on
    the merged data with the resulting multitask kernel.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: task name -> (task id, param, svm); the svm is shared
    @rtype: dict<str, tuple>
    """

    assert (param.base_similarity >= 1)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data
    # alternatives that were tried, located in the same directory:
    # ALL_PseudoSeq_BlosumEnc_pearson.txt, ALL_PseudoSeq_BlosumEnc_euklid.txt,
    # ALL_RAxML.txt
    distance_file = "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"

    name_to_id = {}

    # the with-block closes the file; it used to be left open
    with open(distance_file) as f:

        num_lines = int(f.readline().strip())

        task_distances = numpy.zeros((num_lines, num_lines))

        for (i, line) in enumerate(f):
            tokens = line.strip().split("\t")

            # first column is the task name, the rest are distances
            name = str(tokens[0])
            name_to_id[name] = i

            entry = tokens[1:]
            assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
                len(entry), num_lines)

            # convert explicitly instead of relying on implicit casting
            task_distances[i, :] = numpy.array(entry, dtype=float)

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print("distances  %s" % (tmp_distances.shape,))

    # normalize distances by the largest distance among the active tasks
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros(
        (data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # name_to_id (local dict) indexes the distance file,
            # data.name_to_id yields the task id of the merged data
            distance = task_distances[name_to_id[task_name_lhs],
                                      name_to_id[task_name_rhs]]

            # convert similarity with simple transformation
            similarity = param.base_similarity - distance

            lhs_id = data.name_to_id(task_name_lhs)
            rhs_id = data.name_to_id(task_name_rhs)

            normalizer.set_task_similarity(lhs_id, rhs_id, similarity)

            # save for later
            similarities[lhs_id, rhs_id] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(
        len([l for l in data.labels if l == -1]))

    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities

    # wrap up predictors: every task refers to the same svm
    svms = {}
    for task_name in data.get_task_names():

        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity helper.save("/tmp/gammas", gammas) gammas = gammas * FACTOR cost = param.cost * numpy.sqrt(FACTOR) print gammas ########## # regular normalizer normalizer = MultitaskKernelNormalizer(data.task_vector_nums) for t1_name in task_names: for t2_name in task_names: similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity) ################################################################## # Train SVMs ################################################################## # create shogun objects wdk_tree = shogun_factory.create_kernel(data.examples, param) lab = shogun_factory.create_labels(data.labels)
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    The data is split with split_data(train_data, 4) into one part for
    training the weak learners and one part for boosting. One multitask
    SVM is trained per pocket; the task similarity of a pocket is the
    mean of the per-position similarities from SequencesHandler. For
    every task of the boosting split, solve_boosting is then called on
    the signs of the weak learners' outputs.

    NOTE(review): pockets is not defined inside this function;
    presumably it is a module-level list of position lists -- verify.

    NOTE(review): the boosting weights are computed but not returned;
    every task is mapped to the last weak learner only. This looks
    unfinished -- confirm before relying on the result.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: task name -> (task id, svm)
    @rtype: dict<str, tuple>
    """

    # split for training weak_learners and boosting
    (train_weak, train_boosting) = split_data(train_data, 4)

    # merge data sets
    data = PreparedMultitaskData(train_weak, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print("creating a kernel:")
    ########################################################

    # init seq handler
    pseudoseqs = SequencesHandler()

    classifiers = []

    for pocket in pockets:

        print("creating normalizer")
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print("processing pocket %s" % (pocket,))

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():

                similarity = 0.0

                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1))

                # normalize: mean over the positions of the pocket
                similarity = similarity / float(len(pocket))

                print("pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity))

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)

        print("creating empty kernel")
        kernel = shogun_factory.create_kernel(data.examples, param)

        print("setting normalizer")
        kernel.set_normalizer(normalizer)

        print("training SVM for pocket %s" % (pocket,))
        svm = self._train_single_svm(param, kernel, lab)

        classifiers.append(svm)

    print("done obtaining weak learners")

    ##################################################
    # combine weak learners for each task
    ##################################################

    # set constants
    some = 0.9
    import cvxmod

    # wrap up predictors
    svms = {}

    for task_name in train_boosting.keys():

        instances = train_boosting[task_name]

        N = len(instances)
        F = len(pockets)

        examples = [inst.example for inst in instances]
        labels = [inst.label for inst in instances]

        # one row per example, one column per weak learner
        out = cvxmod.zeros((N, F))

        for i in range(F):

            svm = classifiers[i]

            tmp_out = self._predict_weak(svm, examples, data.name_to_id(task_name))

            out[:, i] = numpy.sign(tmp_out)

        #TODO: fix
        helper.save("/tmp/out_sparse", (out, labels))

        # a leftover pdb.set_trace() was removed here; it stopped every
        # training run in the debugger

        # NOTE(review): weights is currently unused, see docstring
        weights = solve_boosting(out, labels, some, solver="mosek")

        svms[task_name] = (data.name_to_id(task_name), svm)

    return svms