def classifier_svmlight_linear_term_modular(fm_train_dna=traindna, fm_test_dna=testdna, \
                                            label_train_dna=label_traindna, degree=3, \
                                            C=10, epsilon=1e-5, num_threads=1):
    import numpy
    from shogun.Features import StringCharFeatures, BinaryLabels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    from shogun.Classifier import SVMLight

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_qpsize(3)
    svm.set_linear_term(-numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double))
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    out = svm.apply().get_labels()
    return out, kernel
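# A minimal usage sketch (not from the original source): calling the function
# above on ten toy DNA strings, matching the length-10 linear term it sets via
# set_linear_term. Sequences and labels below are made up for illustration;
# all strings share one length, as the weighted-degree kernel requires.
import numpy

toy_train_dna = ['CGCACGTACG', 'TACGTACGTA', 'GAGCACGTAC', 'TACGTAGGTA',
                 'ACGTACGTAC', 'GTACGTACGT', 'CGTACGTACG', 'ACGTACGTAA',
                 'GGTACGTACG', 'TTACGTACGT']
toy_test_dna = ['ACGTACGTAC', 'GTACGTACGT']
toy_labels = numpy.array([-1, -1, -1, -1, -1, 1, 1, 1, 1, 1], dtype=numpy.double)

out, kernel = classifier_svmlight_linear_term_modular(toy_train_dna, toy_test_dna,
                                                      toy_labels, degree=3)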
def classifier_domainadaptationsvm_modular(fm_train_dna=traindna, fm_test_dna=testdna, \
                                           label_train_dna=label_traindna, \
                                           label_test_dna=label_testdna, fm_train_dna2=traindna2, fm_test_dna2=testdna2, \
                                           label_train_dna2=label_traindna2, label_test_dna2=label_testdna2, C=1, degree=3):
    # imports needed by this snippet (DomainAdaptationSVM lives in
    # shogun.Classifier in the modular interface)
    from shogun.Features import StringCharFeatures, BinaryLabels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    from shogun.Classifier import SVMLight, DomainAdaptationSVM

    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    #####################################
    #print("obtaining DA SVM from previously trained SVM")

    feats_train2 = StringCharFeatures(fm_train_dna, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels2 = BinaryLabels(label_train_dna)

    # we regularize against the previously obtained solution
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)

    return out  #,dasvm TODO
def svm_learn(kernel, labels, options):
    """train SVM using SVMLight or LibSVM

    Arguments:
    kernel -- kernel object from Shogun toolbox
    labels -- list of labels
    options -- object containing option data

    Return:
    trained svm object
    """
    try:
        svm = SVMLight(options.svmC, kernel,
                       Labels(numpy.array(labels, dtype=numpy.double)))
    except NameError:
        svm = LibSVM(options.svmC, kernel,
                     Labels(numpy.array(labels, dtype=numpy.double)))

    if options.quiet == False:
        svm.io.set_loglevel(MSG_INFO)
        svm.io.set_target_to_stderr()

    svm.set_epsilon(options.epsilon)
    svm.parallel.set_num_threads(1)

    if options.weight != 1.0:
        svm.set_C(options.svmC, options.svmC * options.weight)

    svm.train()

    if options.quiet == False:
        svm.io.set_loglevel(MSG_ERROR)

    return svm
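# Hedged usage sketch (assumed, not part of the source): svm_learn only reads
# the attributes svmC, epsilon, quiet and weight from its options argument,
# so a bare container class is enough to drive it. The kernel and labels here
# are illustrative placeholders that must be built elsewhere.
class _Options(object):
    svmC = 1.0
    epsilon = 1e-5
    quiet = True
    weight = 1.0

# kernel: any Shogun kernel initialized on the training features, e.g. a
# GaussianKernel; labels: a plain list of +1/-1 values.
# svm = svm_learn(kernel, labels, _Options())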
def train(self, data, labels):
    """
    model training
    """

    # centered WDK/WDK-shift
    if self.param["shifts"] == 0:
        kernel_center = WeightedDegreeStringKernel(self.param["degree"])
    else:
        kernel_center = WeightedDegreePositionStringKernel(10, self.param["degree"])
        shifts_vector = numpy.ones(self.param["center_offset"] * 2,
                                   dtype=numpy.int32) * self.param["shifts"]
        kernel_center.set_shifts(shifts_vector)

    kernel_center.set_cache_size(self.param["kernel_cache"] / 3)

    # border spectrum kernels
    size = self.param["kernel_cache"] / 3
    use_sign = False
    kernel_left = WeightedCommWordStringKernel(size, use_sign)
    kernel_right = WeightedCommWordStringKernel(size, use_sign)

    # assemble combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_center)
    kernel.append_kernel(kernel_left)
    kernel.append_kernel(kernel_right)

    ## building features
    feat = create_features(data, self.param["center_offset"], self.param["center_pos"])

    # init combined kernel
    kernel.init(feat, feat)

    print "len(labels) = %i" % (len(labels))
    lab = BinaryLabels(numpy.double(labels))
    self.svm = SVMLight(self.param["cost"], kernel, lab)

    # show debugging output
    self.svm.io.enable_progress()
    self.svm.io.set_loglevel(MSG_DEBUG)

    # optimization settings
    num_threads = 2
    self.svm.parallel.set_num_threads(num_threads)
    self.svm.set_epsilon(10e-8)

    self.svm.train()

    return self
def classifier_svmlight_modular(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1.2, epsilon=1e-5, num_threads=1):
    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print 'No support for SVMLight available.'
        return

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree = 20

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = Labels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.apply().get_labels()
    return kernel
def _train_single_svm(self, param, kernel, lab):
    kernel.set_cache_size(500)
    #lab = shogun_factory.create_labels(data.labels)
    svm = SVMLight(param.cost, kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    #norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
    #norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
    #svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    return svm
def create_svm(param, data, lab):
    """
    create SVM object with standard settings

    @param param: parameter object
    @param data: kernel or feature object (for kernelized/linear svm)
    @param lab: label object

    @return: svm object
    """

    # create SVM
    if param.flags.has_key("svm_type") and param.flags["svm_type"] == "liblineardual":
        print "creating LibLinear object"
        svm = LibLinear(param.cost, data, lab)
        svm.set_liblinear_solver_type(L2R_L2LOSS_SVC_DUAL)

        # set solver type
        if param.flags.has_key("solver_type") and param.flags["solver_type"] == "L2R_LR":
            print "setting linear solver type to: L2R_LR"
            svm.set_liblinear_solver_type(L2R_LR)
    else:
        print "creating SVMLight object"
        svm = SVMLight(param.cost, data, lab)

    return set_svm_parameters(svm, param)
class ShogunPredictor(object):
    """
    basic single-task promoter model using string kernels
    """

    def __init__(self, degree=4, shifts=32, kernel_cache=10000, cost=1.0):
        #TODO: clean up degree
        self.degree = degree
        self.degree_wdk = degree
        self.degree_spectrum = degree
        self.shifts = shifts
        self.kernel_cache = kernel_cache
        self.cost = cost
        self.center_offset = 50
        self.center_pos = 1200
        self.epsilon = 10e-2
        self.num_threads = 4

    def train(self, data, labels):
        kernel = create_promoter_kernel(data, self.center_offset, self.center_pos,
                                        self.degree_wdk, self.degree_spectrum,
                                        self.shifts, kernel_cache=self.kernel_cache)

        print "len(labels) = %i" % (len(labels))
        lab = create_labels(labels)

        self.svm = SVMLight(self.cost, kernel, lab)

        # show debugging output
        self.svm.io.enable_progress()
        self.svm.io.set_loglevel(MSG_DEBUG)

        # optimization settings
        num_threads = self.num_threads
        self.svm.parallel.set_num_threads(num_threads)
        self.svm.set_epsilon(self.epsilon)

        self.svm.train()

        return self

    def predict(self, data):
        feat = create_promoter_features(data, self.center_offset, self.center_pos)
        out = self.svm.apply(feat).get_values()
        return out
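# Hedged usage sketch (assumed): ShogunPredictor delegates feature and kernel
# construction to the helpers create_promoter_kernel, create_promoter_features
# and create_labels, which must be importable in this context. data is a list
# of promoter sequences, labels a list of +1/-1 values.
# model = ShogunPredictor(degree=4, shifts=32, cost=1.0)
# model.train(train_sequences, train_labels)
# scores = model.predict(test_sequences)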
def svm_light():
    print 'SVMLight'
    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel
    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print 'No support for SVMLight available.'
        return

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree = 20

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    C = 1.2
    epsilon = 1e-5
    num_threads = 1
    labels = Labels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)
    svm.classify().get_labels()
def svm_learn(kernel, labels, svmC, epsilon, weight):
    """train SVM using SVMLight"""
    try:
        svm = SVMLight(svmC, kernel,
                       Labels(numpy.array(labels, dtype=numpy.double)))
    except NameError:
        print 'No support for SVMLight available.'
        return

    svm.io.set_loglevel(MSG_INFO)
    svm.io.set_target_to_stderr()
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(1)
    if weight != 1.0:
        svm.set_C(svmC, svmC * weight)
    svm.train()
    svm.io.set_loglevel(MSG_ERROR)
    return svm
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """
        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """
        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################

    seed(17)
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    #print("labels:")
    #print(pickle.dumps(labels))
    #
    #print("features")
    #print(pickle.dumps(feats_train))
    #
    #print("kernel")
    #print(pickle.dumps(kernel))
    #
    #print("svm")
    #print(pickle.dumps(svm))
    #
    #print("#################################")

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    #print("#################################")
    #print("unserializing SVM")
    svm2 = load(fn)

    #print("#################################")
    #print("comparing training")
    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())
    return svm, svm.get_objective(), svm2, svm2.get_objective()
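# A minimal invocation sketch (assumed values, not from the source): two
# Gaussian blobs of 50 points each at distance 1, RBF width 2.1, C=1.
# svm, obj, svm2, obj2 = serialization_svmlight_modular(num=50, dist=1.0, width=2.1, C=1.0)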
def _inner_train(self, train_data, param):
    """
    perform inner training by processing the tree
    """

    data_keys = []
    # top-down processing of taxonomy

    classifiers = []
    classifier_at_node = {}

    root = param.taxonomy.data

    grey_nodes = [root]

    while len(grey_nodes) > 0:
        node = grey_nodes.pop(0)  # pop first item

        # enqueue children
        if node.children != None:
            grey_nodes.extend(node.children)

        #####################################################
        # init data structures
        #####################################################

        # get data below current node
        data = [train_data[key] for key in node.get_data_keys()]

        data_keys.append(node.get_data_keys())

        print "data at current level"
        for instance_set in data:
            print instance_set[0].dataset

        # initialize containers
        examples = []
        labels = []

        # concatenate data
        for instance_set in data:
            print "train split_set:", instance_set[0].dataset.organism

            for inst in instance_set:
                examples.append(inst.example)
                labels.append(inst.label)

        # create shogun data objects
        k = shogun_factory.create_kernel(examples, param)
        lab = shogun_factory.create_labels(labels)

        #####################################################
        # train weak learners
        #####################################################

        cost = param.cost

        # set up svm
        svm = SVMLight(cost, k, lab)

        if param.flags["normalize_cost"]:
            # set class-specific Cs
            norm_c_pos = param.cost / float(len([l for l in labels if l == 1]))
            norm_c_neg = param.cost / float(len([l for l in labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

            print "using cost: negative class=%f, positive class=%f" % (norm_c_neg, norm_c_pos)

        # enable output
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

        # train
        svm.train()

        # append svm object
        classifiers.append(svm)
        classifier_at_node[node.name] = svm

        # save some information
        self.additional_information[node.name + " svm obj"] = svm.get_objective()
        self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[node.name + " runtime"] = svm.get_runtime()

    return (classifiers, classifier_at_node)
def solver_mtk_shogun(C, all_xt, all_lt, task_indicator, M, L, eps, target_obj):
    """
    implementation using multitask kernel
    """

    xt = numpy.array(all_xt)
    lt = numpy.array(all_lt)
    tt = numpy.array(task_indicator, dtype=numpy.int32)
    tsm = numpy.array(M)

    print "task_sim:", tsm

    num_tasks = L.shape[0]

    # sanity checks
    assert len(xt) == len(lt) == len(tt)
    assert M.shape == L.shape
    assert num_tasks == len(set(tt))

    # set up shogun objects
    if type(xt[0]) == numpy.string_:
        feat = StringCharFeatures(DNA)
        xt = [str(a) for a in xt]
        feat.set_features(xt)
        base_kernel = WeightedDegreeStringKernel(feat, feat, 8)
    else:
        feat = RealFeatures(xt.T)
        base_kernel = LinearKernel(feat, feat)

    lab = Labels(lt)

    # set up normalizer
    normalizer = MultitaskKernelNormalizer(tt.tolist())

    for i in xrange(num_tasks):
        for j in xrange(num_tasks):
            normalizer.set_task_similarity(i, j, M[i, j])

    print "num of unique tasks: ", normalizer.get_num_unique_tasks(task_indicator)

    # set up kernel
    base_kernel.set_cache_size(2000)
    base_kernel.set_normalizer(normalizer)
    base_kernel.init_normalizer()

    # set up svm
    svm = SVMLight()  #LibSVM()

    svm.set_epsilon(eps)
    #print "reducing num threads to one"
    #svm.parallel.set_num_threads(1)
    #print "using one thread"

    # how often do we like to compute objective etc
    svm.set_record_interval(0)
    svm.set_target_objective(target_obj)

    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    svm.io.set_loglevel(MSG_DEBUG)
    #SET THREADS TO 1

    svm.set_C(C, C)
    svm.set_bias_enabled(False)

    # prepare for training
    svm.set_labels(lab)
    svm.set_kernel(base_kernel)

    # train svm
    svm.train()

    train_times = svm.get_training_times()
    objectives = [-obj for obj in svm.get_dual_objectives()]

    if False:
        # get model parameters
        sv_idx = svm.get_support_vectors()
        sparse_alphas = svm.get_alphas()

        assert len(sv_idx) == len(sparse_alphas)

        # compute dense alpha (remove label)
        alphas = numpy.zeros(len(xt))
        for id_sparse, id_dense in enumerate(sv_idx):
            alphas[id_dense] = sparse_alphas[id_sparse] * lt[id_dense]

        # print alphas
        W = alphas_to_w(alphas, xt, lt, task_indicator, M)
        primal_obj = compute_primal_objective(W.reshape(W.shape[0] * W.shape[1]),
                                              C, all_xt, all_lt, task_indicator, L)
        objectives.append(primal_obj)
        train_times.append(train_times[-1] + 100)

    return objectives, train_times
for i in xrange(2):
    for j in xrange(2):
        if i == j:
            normalizer.set_task_similarity(i, j, 4.0)
        else:
            normalizer.set_task_similarity(i, j, 1.0)

base_wdk.set_normalizer(normalizer)
print base_wdk.get_kernel_matrix()
print "--->", base_wdk.get_normalizer().get_name()

svm = SVMLight(1, base_wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)
svm.train(feat)

print "internally modified kernel. objective:", svm.get_objective()

##################################################################
# regular SVM
##################################################################

wdk = WeightedDegreeStringKernel(feat, feat, 1)
def serialization_string_kernels_modular(n_data, num_shifts, size):
    """
    serialize svm with string kernels
    """

    ##################################################
    # set up toy data and svm

    train_xt, train_lt = generate_random_data(n_data)
    test_xt, test_lt = generate_random_data(n_data)

    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
    shifts_vector = numpy.ones(max_len, dtype=numpy.int32) * num_shifts
    kernel_wdk.set_shifts(shifts_vector)

    ########
    # set up spectrum
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
    kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)

    ########
    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)
    kernel.append_kernel(kernel_spec_2)

    # init kernel
    labels = BinaryLabels(train_lt)

    svm = SVMLight(1.0, kernel, labels)
    #svm.io.set_loglevel(MSG_DEBUG)
    svm.train(feats_train)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in range(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    #print("all checks passed.")

    return out, out2
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_empty_kernel(param)
    lab = shogun_factory.create_labels(data.labels)

    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
    f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

    num_lines = int(f.readline().strip())
    task_distances = numpy.zeros((num_lines, num_lines))
    name_to_id = {}
    for (i, line) in enumerate(f):
        tokens = line.strip().split("\t")
        name = str(tokens[0])
        name_to_id[name] = i
        entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
        assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)
        task_distances[i, :] = entry

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print "distances ", tmp_distances.shape

    # normalize distances
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
            normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                           data.name_to_id(task_name_rhs),
                                           similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    #base_wdk.init_normalizer()

    combined_features.append_feature_obj(base_features)
    combined_kernel.append_kernel(base_wdk)

    ##################################################
    # intra-domain blocks

    intra_block_vec = PairiiVec()

    for task_id in data.get_task_ids():
        intra_block_vec.push_back(Pairii(task_id, task_id))

    # create mask-based normalizer
    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    kernel = shogun_factory.create_empty_kernel(param)
    kernel.set_normalizer(normalizer)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel)

    # append features
    combined_features.append_feature_obj(base_features)

    # set mixing factor (used if MKL is OFF)
    assert (param.base_similarity <= 1)
    assert (param.base_similarity >= 0)
    combined_kernel.set_subkernel_weights([param.base_similarity, 1 - param.base_similarity])

    combined_kernel.init(combined_features, combined_features)

    svm = None

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:
        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto
        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    print "WARNING: custom epsilon set"
    svm.set_epsilon(0.05)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities
    self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():
        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, combined_kernel, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # init seq handler
    task_kernel = SequencesHandlerRbf(1, param.base_similarity,
                                      data.get_task_names(),
                                      param.flags["wdk_rbf_on"])
    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs)

            print similarity
            print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity)

            normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                           data.name_to_id(task_name_rhs),
                                           similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    #self.additional_information["distances"] = distances
    self.additional_information["similarities"] = similarities

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():
        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    assert (param.base_similarity >= 1)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
    f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

    num_lines = int(f.readline().strip())
    task_distances = numpy.zeros((num_lines, num_lines))
    name_to_id = {}
    for (i, line) in enumerate(f):
        tokens = line.strip().split("\t")
        name = str(tokens[0])
        name_to_id[name] = i
        entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
        assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)
        task_distances[i, :] = entry

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print "distances ", tmp_distances.shape

    # normalize distances
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
            normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                           data.name_to_id(task_name_rhs),
                                           similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    #self.additional_information["distances"] = distances
    self.additional_information["similarities"] = similarities

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():
        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
labels_presvm = [i.label for i in d[0:subset_size]]
labels_presvm[2] = 1
labels_presvm[12] = 1
labels_presvm[15] = 1
labels_presvm[8] = 1
labels_presvm[19] = 1

feat_presvm = StringCharFeatures(DNA)
feat_presvm.set_features(examples_presvm)
wdk_presvm = WeightedDegreeStringKernel(feat_presvm, feat_presvm, 1)
lab_presvm = Labels(numpy.array(labels_presvm))

presvm = SVMLight(1, wdk_presvm, lab_presvm)
presvm.train()

presvm2 = LibSVM(1, wdk_presvm, lab_presvm)
presvm2.train()

print "svmlight", presvm.get_objective()
print "libsvm", presvm2.get_objective()

assert(abs(presvm.get_objective() - presvm2.get_objective()) <= 0.001)

print "simple svm", presvm.get_objective()

print "len(examples_presvm)", len(examples_presvm)

print "##############"
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    ##################################################
    # intra-domain blocks

    # intra_block_vec = PairiiVec()
    #
    # for task_id in data.get_task_ids():
    #     intra_block_vec.push_back(Pairii(task_id, task_id))
    #
    # # create mask-based normalizer
    # normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    # kernel = shogun_factory.create_empty_kernel(param)
    # kernel.set_normalizer(normalizer)
    #
    # # append current kernel to CombinedKernel
    # combined_kernel.append_kernel(kernel)
    #
    # # append features
    # combined_features.append_feature_obj(base_features)
    #
    # print "------"

    ##################################################
    # all blocks

    # all_block_vec = PairiiVec()
    #
    # for task_id_1 in data.get_task_ids():
    #     for task_id_2 in data.get_task_ids():
    #         all_block_vec.push_back(Pairii(task_id_1, task_id_2))
    #
    # # create mask-based normalizer
    # normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
    # kernel_all = shogun_factory.create_empty_kernel(param)
    # kernel_all.set_normalizer(normalizer_all)
    #
    # # append current kernel to CombinedKernel
    # combined_kernel.append_kernel(kernel_all)
    #
    # # append features
    # combined_features.append_feature_obj(base_features)

    ##################################################
    # add one kernel per similarity position

    # init seq handler
    pseudoseqs = SequencesHandler()
    pseudoseq_length = pseudoseqs.seq_length

    for pos in range(pseudoseq_length):
        print "appending kernel for pos %i" % (pos)
        print "nums", data.task_vector_nums

        pos_block_vec = PairiiVec()

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                similarity = pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pos)
                #print "computing similarity for tasks (%s, %s) = %i" % (task_name_lhs, task_name_rhs, similarity)

                if similarity == 1:
                    tmp_pair = Pairii(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs))
                    pos_block_vec.push_back(tmp_pair)

        print "creating normalizer"
        normalizer_pos = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, pos_block_vec)

        print "creating empty kernel"
        kernel_pos = shogun_factory.create_empty_kernel(param)

        print "setting normalizer"
        kernel_pos.set_normalizer(normalizer_pos)

        print "appending kernel"
        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_pos)

        print "appending features"
        # append features
        combined_features.append_feature_obj(base_features)

    print "done constructing combined kernel"

    ##################################################
    # init combined kernel
    combined_kernel.init(combined_features, combined_features)

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:
        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto
        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    print "WARNING: custom epsilon set"
    svm.set_epsilon(0.05)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional info
    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()

    print self.additional_information

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # fetch taxonomy from parameter object
    taxonomy = shogun_factory.create_taxonomy(param.taxonomy.data)

    # set normalizer
    normalizer = MultitaskKernelTreeNormalizer(data.task_vector_names,
                                               data.task_vector_names,
                                               taxonomy)

    ########################################################
    gammas = self.taxonomy_to_gammas(data, taxonomy)
    print "gammas before MKL:"
    print gammas
    ########################################################

    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    svm = None

    num_subk = base_wdk.get_num_subkernels()

    print "num subkernels:", num_subk
    #print "subkernel weights:", base_wdk.get_subkernel_weights()
    self.additional_information["weights_before"] = [normalizer.get_beta(i) for i in range(num_subk)]

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:

        num_threads = 4

        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)#ST_CPLEX)

        svm.set_kernel(base_wdk)
        svm.set_labels(lab)

        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        if param.flags["normalize_cost"]:
            # normalize cost
            norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
            norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)
        else:
            svm.set_C(param.cost, param.cost)

        svm.train()

        #print "subkernel weights (after):", base_wdk.get_subkernel_weights()

    else:

        # create SVM (disable unsupported optimizations)
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        svm.train()

    print "svm objective:", svm.get_objective()

    self.additional_information["weights"] = [normalizer.get_beta(i) for i in range(num_subk)]
    self.additional_information["gammas"] = self.taxonomy_to_gammas(data, taxonomy)

    print "debug weights:"
    print self.additional_information
    print ""

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_id in train_data.keys():
        svms[task_id] = svm

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    import numpy
    numpy.random.seed(666)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)

    # set kernel cache
    if param.flags.has_key("cache_size"):
        combined_kernel.set_cache_size(param.flags["cache_size"])

    # create features
    base_features = shogun_factory.create_features(data.examples, param)

    combined_features = CombinedFeatures()

    ########################################################
    print "creating a masked kernel for possible subset:"
    ########################################################

    power_set_tasks = power_set(data.get_task_ids())

    for active_task_ids in power_set_tasks:

        print "masking all entries other than:", active_task_ids

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums,
                                                   data.task_vector_nums,
                                                   active_task_ids)

        # normalize trace
        if param.flags.has_key("normalize_trace") and param.flags["normalize_trace"]:
            norm_factor = len(data.get_task_ids()) / len(active_task_ids)
            normalizer.set_normalization_constant(norm_factor)

        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        print "------"

    combined_kernel.init(combined_features, combined_features)
    #combined_kernel.precompute_subkernels()

    self.additional_information["weights before trainng"] = combined_kernel.get_subkernel_weights()
    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.flags["mkl_q"] >= 1.0)

    if param.flags["mkl_q"] >= 1.0:

        svm = MKLClassification()
        svm.set_mkl_norm(param.flags["mkl_q"])

        # set interleaved optimization
        if param.flags.has_key("interleaved"):
            svm.set_interleaved_optimization_enabled(param.flags["interleaved"])

        # set solver type
        if param.flags.has_key("solver_type") and param.flags["solver_type"]:
            if param.flags["solver_type"] == "ST_CPLEX":
                svm.set_solver_type(ST_CPLEX)
            if param.flags["solver_type"] == "ST_DIRECT":
                svm.set_solver_type(ST_DIRECT)
            if param.flags["solver_type"] == "ST_NEWTON":
                svm.set_solver_type(ST_NEWTON)
            if param.flags["solver_type"] == "ST_GLPK":
                svm.set_solver_type(ST_GLPK)

        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)

    else:
        svm = SVMLight(param.cost, combined_kernel, lab)

    # optimization settings
    num_threads = 4
    svm.parallel.set_num_threads(num_threads)

    if param.flags.has_key("epsilon"):
        svm.set_epsilon(param.flags["epsilon"])

    # enable output
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    # disable unsupported optimizations (due to special normalizer)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # set cost
    if param.flags["normalize_cost"]:
        norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
        svm.set_C(norm_c_neg, norm_c_pos)
    else:
        svm.set_C(param.cost, param.cost)

    svm.train()

    # prepare mapping
    weight_map = {}
    weights = combined_kernel.get_subkernel_weights()
    for (i, pset) in enumerate(power_set_tasks):
        print pset
        subset_str = str([data.id_to_name(task_idx) for task_idx in pset])
        weight_map[subset_str] = weights[i]

    # store additional info
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["weight_map"] = weight_map

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), len(power_set_tasks),
                           combined_kernel, svm, param)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    for task_id in train_data.keys():
        print "task_id:", task_id

    root = param.taxonomy.data

    grey_nodes = [root]

    # top-down processing of taxonomy
    for node in root.get_leaves():

        #####################################################
        # train predictor
        #####################################################

        parent_node = node.get_nearest_neighbor()

        cost = param.cost

        (examples, labels) = self.get_data(parent_node, train_data)

        # create shogun data objects
        k_parent = shogun_factory_new.create_kernel(examples, param)
        lab_parent = shogun_factory_new.create_labels(labels)

        parent_svm = SVMLight(cost, k_parent, lab_parent)
        parent_svm.train()

        #####################################################
        # train predictors
        #####################################################

        (examples, labels) = self.get_data(node, train_data)

        # create shogun data objects
        k = shogun_factory_new.create_kernel(examples, param)
        lab = shogun_factory_new.create_labels(labels)

        # regularize vs parent predictor
        weight = param.transform
        print "current edge_weight:", weight, " ,name:", node.name

        svm = DomainAdaptationSVM(cost, k, lab, parent_svm, weight)
        svm.train()

        # attach svm to node
        node.predictor = svm

    #####################################################
    # Wrap things up
    #####################################################

    # wrap up predictors for later use
    predictors = {}

    for leaf in root.get_leaves():
        predictors[leaf.name] = leaf.predictor
        assert (leaf.predictor != None)

    sym_diff_keys = set(train_data.keys()).symmetric_difference(set(predictors.keys()))
    assert len(sym_diff_keys) == 0, "symmetric difference between keys non-empty: " + str(sym_diff_keys)

    return predictors
from numpy import array, float64
import sys

# create dense matrices A,B,C
A = array([[1, 2, 3], [4, 0, 0], [0, 0, 0], [0, 5, 0], [0, 0, 6], [9, 9, 9]], dtype=float64)
B = array([1, 1, 1, -1, -1, -1], dtype=float64)

# ... of type Real, LongInt and Byte
feats_train = RealFeatures(A.transpose())

kernel = GaussianKernel(feats_train, feats_train, 1.0)
kernel.io.set_loglevel(MSG_DEBUG)

lab = Labels(B)

svm = SVMLight(1, kernel, lab)
svm.train()

helper.save("/tmp/awesome_svm", svm)
svm = helper.load("/tmp/awesome_svm")
svm.train()

#sys.exit(0)

run = expenv.Run.get(1010)
#run = expenv.Run.get(974)
dat = run.get_train_data()

print dat.keys()
d = dat["thaliana"]
def serialization_string_kernels_modular(n_data, num_shifts, size):
    """
    serialize svm with string kernels
    """

    ##################################################
    # set up toy data and svm

    train_xt, train_lt = generate_random_data(n_data)
    test_xt, test_lt = generate_random_data(n_data)

    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
    shifts_vector = numpy.ones(max_len, dtype=numpy.int32) * num_shifts
    kernel_wdk.set_shifts(shifts_vector)

    ########
    # set up spectrum
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
    kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)

    ########
    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)
    kernel.append_kernel(kernel_spec_2)

    # init kernel
    labels = BinaryLabels(train_lt)

    svm = SVMLight(1.0, kernel, labels)
    #svm.io.set_loglevel(MSG_DEBUG)
    svm.train(feats_train)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    #print("all checks passed.")

    return out, out2
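# generate_random_data() and construct_features() are assumed helpers. A rough
# sketch of the data generator (hypothetical; sequence length and label scheme
# are guesses). construct_features() would additionally have to build
# CombinedFeatures matching the three subkernels appended above.
import numpy

def generate_random_data(n_data, seq_len=60):
    """Random DNA strings with random +/-1 labels."""
    alphabet = numpy.array(list('ACGT'))
    xt = [''.join(alphabet[numpy.random.randint(0, 4, seq_len)])
          for _ in range(n_data)]
    lt = numpy.array([1.0 if numpy.random.rand() > 0.5 else -1.0
                      for _ in range(n_data)])
    return xt, lt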
labels = [i.label for i in d[0:subset_size]]
labels[2] = 1
labels[12] = 1
labels[15] = 1
labels[8] = 1
labels[19] = 1

feat = StringCharFeatures(DNA)
feat.set_features(examples)
wdk = WeightedDegreeStringKernel(feat, feat, 1)
lab = Labels(numpy.array(labels))

svm = SVMLight(1, wdk, lab)
svm.train()
svm.set_shrinking_enabled(False)

print "simple svm", svm.get_objective()

print "len(examples)", len(examples)

print "##############"
#print "##############"
#print "svm light"
#svm_light = SVMLight(1.0,wdk,lab)
def test_data():

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    # fetch data
    instance_set = mss.get_train_data(-1)

    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for t1_name in task_names:
        for t2_name in task_names:
            similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name),
                                                          taxonomy.get_id(t2_name))
            gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR

    cost = param.cost * numpy.sqrt(FACTOR)

    print gammas

    ##########
    # regular normalizer

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for t1_name in task_names:
        for t2_name in task_names:
            similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name),
                                           data.name_to_id(t2_name),
                                           similarity)

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print "--->", wdk_tree.get_normalizer().get_name()

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print "finished training tree-norm SVM:", svm_tree.get_objective()

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print "--->", wdk.get_normalizer().get_name()

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print "finished training manually set SVM:", svm.get_objective()

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert(len(alphas_tree) == len(alphas))

    for i in xrange(len(alphas)):
        assert(abs(alphas_tree[i] - alphas[i]) < 0.0001)

    print "success: all alphas are the same"
def classifier_svmlight_batch_linadd_modular(fm_train_dna, fm_test_dna,
                                             label_train_dna, degree, C,
                                             epsilon, num_threads):

    from shogun.Features import StringCharFeatures, BinaryLabels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel, MSG_DEBUG

    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    feats_train = StringCharFeatures(DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree = 20

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    #print('SVMLight Objective: %f num_sv: %d' % \
    #   (svm.get_objective(), svm.get_num_support_vectors()))

    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()

    return labels, svm
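# A rough usage sketch for the function above with toy DNA data (all values
# are illustrative only; note that degree is overridden to 20 internally, so
# sequences should be at least 20 characters long):
import numpy

fm_train_dna = ['ACGTACGTACGTACGTACGT'] * 10 + ['TTTTAAAATTTTAAAATTTT'] * 10
fm_test_dna = ['ACGTACGTACGTACGTACGT'] * 5 + ['TTTTAAAATTTTAAAATTTT'] * 5
label_train_dna = numpy.array([1.0] * 10 + [-1.0] * 10)

labels, svm = classifier_svmlight_batch_linadd_modular(
    fm_train_dna, fm_test_dna, label_train_dna,
    degree=3, C=1.0, epsilon=1e-5, num_threads=1)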
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################

    seed(17)
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################

    #print("labels:")
    #print(pickle.dumps(labels))
    #
    #print("features")
    #print(pickle.dumps(feats_train))
    #
    #print("kernel")
    #print(pickle.dumps(kernel))
    #
    #print("svm")
    #print(pickle.dumps(svm))
    #
    #print("#################################")

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    #print("#################################")
    #print("unserializing SVM")
    svm2 = load(fn)

    #print("#################################")
    #print("comparing training")
    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())

    return svm, svm.get_objective(), svm2, svm2.get_objective()
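# Minimal usage sketch for the function above (parameter values illustrative):
svm, obj, svm2, obj2 = serialization_svmlight_modular(num=10, dist=1.0,
                                                      width=2.1, C=1.0)
# the retrained deserialized svm should reach (nearly) the same objective
assert abs(obj - obj2) < 1e-3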
def do_batch_linadd():
    print 'SVMlight batch'

    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel

    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print 'No support for SVMLight available.'
        return

    feats_train = StringCharFeatures(DNA)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree = 20

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    C = 1
    epsilon = 1e-5
    num_threads = 2
    labels = Labels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    #print 'SVMLight Objective: %f num_sv: %d' % \
    #   (svm.get_objective(), svm.get_num_support_vectors())

    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.classify().get_labels()

    svm.set_batch_computation_enabled(True)
    svm.classify().get_labels()
print 'SVMLight'

from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight

feats_train = StringCharFeatures(DNA)
feats_train.set_features(fm_train_dna)
feats_test = StringCharFeatures(DNA)
feats_test.set_features(fm_test_dna)

kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

C = 10
epsilon = 1e-5
num_threads = 1
labels = Labels(label_train_dna)

svm = SVMLight(C, kernel, labels)
svm.set_qpsize(3)
svm.set_linear_term(-numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double))
svm.set_epsilon(epsilon)
svm.parallel.set_num_threads(num_threads)
svm.train()

kernel.init(feats_train, feats_test)
out = svm.classify().get_labels()
width = 2.1

traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

trainlab = concatenate((-ones(num), ones(num)))
testlab = concatenate((-ones(num), ones(num)))

feats_train = RealFeatures(traindata_real)
feats_test = RealFeatures(testdata_real)
kernel = GaussianKernel(feats_train, feats_train, width)
kernel.io.set_loglevel(MSG_DEBUG)

labels = Labels(trainlab)

svm = SVMLight(2, kernel, labels)
svm.train()
svm.io.set_loglevel(MSG_DEBUG)

##################################################

print "labels:"
print labels.to_string()

print "features"
print feats_train.to_string()

print "kernel"
print kernel.to_string()

print "svm"
examples_presvm = [i.example for i in d[0:subset_size]]
labels_presvm = [i.label for i in d[0:subset_size]]

labels_presvm[2] = 1
labels_presvm[12] = 1
labels_presvm[15] = 1
labels_presvm[8] = 1
labels_presvm[19] = 1

feat_presvm = StringCharFeatures(DNA)
feat_presvm.set_features(examples_presvm)
wdk_presvm = WeightedDegreeStringKernel(feat_presvm, feat_presvm, 1)
lab_presvm = Labels(numpy.array(labels_presvm))

presvm = SVMLight(1, wdk_presvm, lab_presvm)
presvm.train()

presvm2 = LibSVM(1, wdk_presvm, lab_presvm)
presvm2.train()

print "svmlight", presvm.get_objective()
print "libsvm", presvm2.get_objective()

assert(abs(presvm.get_objective() - presvm2.get_objective()) <= 0.001)

print "simple svm", presvm.get_objective()

print "len(examples_presvm)", len(examples_presvm)

print "##############"
def classifier_svmlight_batch_linadd_modular(fm_train_dna, fm_test_dna,
                                             label_train_dna, degree, C,
                                             epsilon, num_threads):

    from shogun.Features import StringCharFeatures, Labels, DNA
    from shogun.Kernel import WeightedDegreeStringKernel, MSG_DEBUG

    try:
        from shogun.Classifier import SVMLight
    except ImportError:
        print 'No support for SVMLight available.'
        return

    feats_train = StringCharFeatures(DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)
    degree = 20

    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = Labels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    #print 'SVMLight Objective: %f num_sv: %d' % \
    #   (svm.get_objective(), svm.get_num_support_vectors())

    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()

    return labels, svm
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    print("unserializing SVM")
    svm2 = load(fn)

    print("comparing objectives")
    svm2.train()

    print("objective before serialization:", svm.get_objective())
    print("objective after serialization:", svm2.get_objective())

    print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    print("all checks passed.")

    return True
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # support
    support = numpy.linspace(0, 1, 5)

    # set normalizer
    normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_nums)

    # fetch taxonomy from parameter object
    taxonomy = param.taxonomy.data

    taxonomy.plot()
    import os
    os.system("evince demo.png &")

    # compute distances
    distances = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for (i, task_name_lhs) in enumerate(data.get_task_names()):
        for (j, task_name_rhs) in enumerate(data.get_task_names()):
            distances[i, j] = task_similarities.compute_hop_distance(taxonomy,
                                                                     task_name_lhs,
                                                                     task_name_rhs)

    # normalize distances
    distances = distances / numpy.max(distances)

    # set distances
    for (i, task_name_lhs) in enumerate(data.get_task_names()):
        for (j, task_name_rhs) in enumerate(data.get_task_names()):
            normalizer.set_task_distance(i, j, distances[i, j])

    # assign normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    svm = None
    debug_weights = {}

    num_subk = base_wdk.get_num_subkernels()

    print "num subkernels:", num_subk
    #print "subkernel weights:", base_wdk.get_subkernel_weights()
    debug_weights["before"] = [normalizer.get_beta(i) for i in range(num_subk)]

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:

        num_threads = 4

        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #GLPK) #DIRECT) #NEWTON)

        svm.set_C(param.cost, param.cost)
        svm.set_kernel(base_wdk)
        svm.set_labels(lab)
        svm.parallel.set_num_threads(num_threads)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.train()

        #print "subkernel weights (after):", base_wdk.get_subkernel_weights()

    else:

        # create SVM (disable unsupported optimizations)
        svm = SVMLight(param.cost, base_wdk, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.train()

    print "svm objective:", svm.get_objective()

    debug_weights["after"] = [normalizer.get_beta(i) for i in range(num_subk)]

    # debugging output
    print "debug weights (before/after):"
    print debug_weights["before"]
    print debug_weights["after"]
    print ""

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (svm, data.name_to_id(task_name))

    return svms
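# task_similarities.compute_hop_distance() is not defined in this snippet. A
# plausible sketch: the number of edges on the path between two named leaves of
# the taxonomy tree. get_node() and get_parent() are assumed accessors, not
# confirmed taxonomy API.
def compute_hop_distance(taxonomy, name_lhs, name_rhs):
    def path_to_root(name):
        node = taxonomy.get_node(name)   # assumed lookup by task name
        path = [node]
        while node.get_parent() is not None:
            node = node.get_parent()
            path.append(node)
        return path

    lhs_names = [n.name for n in path_to_root(name_lhs)]

    # hops from rhs up to the lowest common ancestor, plus hops down to lhs
    for (up, node) in enumerate(path_to_root(name_rhs)):
        if node.name in lhs_names:
            return lhs_names.index(node.name) + up

    raise ValueError("nodes not connected -- not a tree?")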
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed

    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """

        try:
            f = bz2.BZ2File(filename, "wb")
        except IOError as details:
            sys.stderr.write("File " + filename + " cannot be written\n")
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """

        try:
            f = bz2.BZ2File(filename, "rb")
        except IOError as details:
            sys.stderr.write("File " + filename + " cannot be read\n")
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    # kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    # svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    # print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    # print("unserializing SVM")
    svm2 = load(fn)

    # print("comparing objectives")
    svm2.train()

    # print("objective before serialization:", svm.get_objective())
    # print("objective after serialization:", svm2.get_objective())

    # print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    # print("all checks passed.")

    return True
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # dict to save additional information for later analysis
    self.additional_information = {}

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples, param)
    combined_features = CombinedFeatures()

    ##################################################
    # intra-domain blocks (dirac kernel)

    intra_block_vec = PairiiVec()

    for task_id in data.get_task_ids():
        intra_block_vec.push_back(Pairii(task_id, task_id))

    # create mask-based normalizer
    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                   intra_block_vec)
    kernel = shogun_factory.create_empty_kernel(param)
    kernel.set_normalizer(normalizer)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel)

    # append features
    combined_features.append_feature_obj(base_features)

    print "------"

    ##################################################
    # all blocks (full kernel matrix)

    all_block_vec = PairiiVec()

    for task_id_1 in data.get_task_ids():
        for task_id_2 in data.get_task_ids():
            all_block_vec.push_back(Pairii(task_id_1, task_id_2))

    # create mask-based normalizer
    normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                       all_block_vec)
    kernel_all = shogun_factory.create_empty_kernel(param)
    kernel_all.set_normalizer(normalizer_all)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel_all)

    # append features
    combined_features.append_feature_obj(base_features)

    ##################################################
    # hack

    # hack_block_vec = PairiiVec()
    #
    # for task_id_1 in data.get_task_ids():
    #     for task_id_2 in data.get_task_ids():
    #         hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
    #
    # hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
    # other_group = ["B_0702", "B_1501", "B_5801"]
    # for task_id_1 in other_group:
    #     for task_id_2 in other_group:
    #         hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
    #
    # # create mask-based normalizer
    # normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)
    # kernel_hack = shogun_factory.create_empty_kernel(param)
    # kernel_hack.set_normalizer(normalizer_hack)
    #
    # # append current kernel to CombinedKernel
    # combined_kernel.append_kernel(kernel_hack)
    #
    # # append features
    # combined_features.append_feature_obj(base_features)

    ##################################################
    # init combined kernel

    combined_kernel.init(combined_features, combined_features)
    #combined_kernel.precompute_subkernels()

    self.additional_information["mkl weights before"] = combined_kernel.get_subkernel_weights()

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.flags["mkl_q"] >= 1.0)

    if param.flags["mkl_q"] >= 1.0:

        svm = MKLClassification()
        svm.set_mkl_norm(param.flags["mkl_q"])
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)

    else:

        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    # optimization settings
    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    svm.set_epsilon(0.03)

    # set cost
    if param.flags["normalize_cost"]:
        norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
        svm.set_C(norm_c_neg, norm_c_pos)
    else:
        svm.set_C(param.cost, param.cost)

    svm.train()

    print "subkernel weights (after):", combined_kernel.get_subkernel_weights()

    ########################################################
    print "svm objective:"
    print svm.get_objective()

    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()
    ########################################################

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm, param)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    kernel_matrix = base_wdk.get_kernel_matrix()
    lab = shogun_factory.create_labels(data.labels)

    # fetch taxonomy from parameter object
    taxonomy = param.taxonomy.data

    # create name to leaf map
    nodes = taxonomy.get_all_nodes()

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    from shogun.Kernel import CombinedKernel, CustomKernel

    combined_kernel = CombinedKernel()

    # indicator to which task each example belongs
    task_vector = data.task_vector_names

    for node in nodes:
        print "creating kernel for", node.name

        # fetch sub-tree
        leaf_names = [leaf.name for leaf in node.get_leaves()]

        print "masking all entries other than:", leaf_names

        # init matrix
        kernel_matrix_node = numpy.zeros(kernel_matrix.shape)

        # fill matrix for node
        for (i, task_lhs) in enumerate(task_vector):
            for (j, task_rhs) in enumerate(task_vector):

                # only copy values, if both tasks are present in subtree
                if task_lhs in leaf_names and task_rhs in leaf_names:
                    kernel_matrix_node[i, j] = kernel_matrix[i, j]

        # create custom kernel
        kernel_node = CustomKernel()
        kernel_node.set_full_kernel_matrix_from_full(kernel_matrix_node)

        # append custom kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_node)

        print "------"

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:

        num_threads = 4

        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        svm.set_solver_type(ST_GLPK) #DIRECT) #NEWTON) #ST_CPLEX)

        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
        svm.parallel.set_num_threads(num_threads)
        #svm.set_linadd_enabled(False)
        #svm.set_batch_computation_enabled(False)

        svm.train()

        print "subkernel weights (after):", combined_kernel.get_subkernel_weights()

    else:

        # create SVM (disable unsupported optimizations)
        svm = SVMLight(param.cost, combined_kernel, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)

        svm.train()

    ########################################################
    print "svm objective:"
    print svm.get_objective()
    ########################################################

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_id in train_data.keys():
        svms[task_id] = svm

    return svms
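# Design note: the per-node double loop above can be written as a vectorized
# numpy masking step with the same result, which is considerably faster for
# large kernel matrices (a sketch, reusing the names from the loop):
import numpy

in_subtree = numpy.array([name in leaf_names for name in task_vector])
mask = numpy.outer(in_subtree, in_subtree)   # True where both tasks are in subtree
kernel_matrix_node = kernel_matrix * mask    # zero out all other entries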