labels[8] = 1
labels[19] = 1

feat = StringCharFeatures(DNA)
feat.set_features(examples)
wdk = WeightedDegreeStringKernel(feat, feat, 1)
lab = Labels(numpy.array(labels))

svm = SVMLight(1, wdk, lab)
svm.train()
svm.set_shrinking_enabled(False)

print "simple svm", svm.get_objective()
print "len(examples)", len(examples)
print "##############"

#print "##############"
#print "svm light"
#svm_light = SVMLight(1.0,wdk,lab)
#svm_light.train()
#print "svmlight objective", svm_light.get_objective()
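# --- illustrative sketch (not part of the original snippets) ---
# A minimal, self-contained version of the pipeline above, assuming the same
# legacy shogun modular Python interface used throughout these examples.
# The toy DNA strings, labels, and C=1 are made up for illustration.
import numpy
from shogun.Features import StringCharFeatures, Labels, DNA
from shogun.Kernel import WeightedDegreeStringKernel
from shogun.Classifier import SVMLight

toy_examples = ["ACGTACGTACGT", "ACGTACGAACGT", "TTTTACGTACGT", "TTTTACGAACGT"]
toy_labels = numpy.array([1.0, 1.0, -1.0, -1.0])

toy_feat = StringCharFeatures(DNA)
toy_feat.set_features(toy_examples)
toy_wdk = WeightedDegreeStringKernel(toy_feat, toy_feat, 1)   # degree-1 WD kernel
toy_lab = Labels(toy_labels)

toy_svm = SVMLight(1, toy_wdk, toy_lab)                       # C=1
toy_svm.train()
print "toy objective", toy_svm.get_objective()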
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed
    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """
        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """
        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    print("unserializing SVM")
    svm2 = load(fn)

    print("comparing objectives")
    svm2.train()

    print("objective before serialization:", svm.get_objective())
    print("objective after serialization:", svm2.get_objective())

    print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in xrange(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    print("all checks passed.")

    return True
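# --- illustrative usage (not part of the original snippets) ---
# How the function above is typically driven; the concrete numbers in
# parameter_list are assumptions chosen for illustration, not values taken
# from the original example file.
parameter_list = [[20, 0.8, 1.5, 1.0]]   # [num, dist, width, C]

if __name__ == '__main__':
    print('Serialization SVMLight')
    serialization_svmlight_modular(*parameter_list[0])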
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # init seq handler
    task_kernel = SequencesHandlerRbf(1, param.base_similarity, data.get_task_names(), param.flags["wdk_rbf_on"])
    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = task_kernel.get_similarity(task_name_lhs, task_name_rhs)

            print similarity
            print "similarity (%s,%s)=%f" % (task_name_lhs, task_name_rhs, similarity)

            normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))

    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    #self.additional_information["distances"] = distances
    self.additional_information["similarities"] = similarities

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():
        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
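# --- illustrative sketch (not part of the original snippets) ---
# Conceptually, the MultitaskKernelNormalizer set up above rescales every
# base-kernel entry by the similarity of the two tasks the examples belong
# to.  A small numpy-only sketch of that idea (function and variable names
# are made up; this is not the shogun implementation):
import numpy

def multitask_reweight(base_kernel, task_ids, task_similarity):
    """Return K'[i, j] = task_similarity[task_ids[i], task_ids[j]] * base_kernel[i, j]."""
    gamma = task_similarity[numpy.ix_(task_ids, task_ids)]
    return gamma * base_kernel

# toy setup: 4 examples, the first two from task 0, the last two from task 1
base_kernel = numpy.ones((4, 4))
task_ids = numpy.array([0, 0, 1, 1])
task_similarity = numpy.array([[1.0, 0.5],
                               [0.5, 1.0]])
print multitask_reweight(base_kernel, task_ids, task_similarity)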
    #print pickle.dumps(kernel)
    #
    #print "svm"
    #print pickle.dumps(svm)
    #
    #print "#################################"

    fn = "serialized_svm.bz2"
    #print "serializing SVM to file", fn
    save(fn, svm)

    #print "#################################"
    #print "unserializing SVM"
    svm2 = load(fn)

    #print "#################################"
    #print "comparing training"
    svm2.train()

    #print "objective before serialization:", svm.get_objective()
    #print "objective after serialization:", svm2.get_objective()

    return svm, svm.get_objective(), svm2, svm2.get_objective()


if __name__ == '__main__':
    print 'Serialization SVMLight'
    serialization_svmlight_modular(*parameter_list[0])
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    assert(param.base_similarity >= 1)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
    f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

    num_lines = int(f.readline().strip())
    task_distances = numpy.zeros((num_lines, num_lines))
    name_to_id = {}

    for (i, line) in enumerate(f):
        tokens = line.strip().split("\t")
        name = str(tokens[0])
        name_to_id[name] = i
        entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
        assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)
        task_distances[i, :] = entry

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print "distances ", tmp_distances.shape

    # normalize distances
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
            normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    base_wdk.init_normalizer()

    # set up svm
    svm = SVMLight(param.cost, base_wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))

    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    #self.additional_information["distances"] = distances
    self.additional_information["similarities"] = similarities

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():
        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, param, svm)

    return svms
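# --- illustrative sketch (not part of the original snippets) ---
# The distance-to-similarity step above amounts to rescaling the distances
# and flipping them around param.base_similarity.  A standalone sketch of
# that transformation (names are illustrative):
import numpy

def distances_to_similarities(distances, base_similarity):
    # rescale so the largest distance becomes 1.0, then flip:
    # identical tasks (distance 0) keep the full base_similarity
    normalized = distances / numpy.max(distances)
    return base_similarity - normalized

toy_distances = numpy.array([[0.0, 2.0],
                             [2.0, 0.0]])
print distances_to_similarities(toy_distances, base_similarity=1.0)
# -> [[ 1.  0.]
#     [ 0.  1.]]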
wdk_tree.set_normalizer(tree_normalizer)
wdk_tree.init_normalizer()

print "--->", wdk_tree.get_normalizer().get_name()

svm_tree = SVMLight(cost, wdk_tree, lab)
svm_tree.set_linadd_enabled(False)
svm_tree.set_batch_computation_enabled(False)

svm_tree.train()

del wdk_tree
del tree_normalizer

print "finished training tree-norm SVM:", svm_tree.get_objective()

wdk = shogun_factory.create_kernel(data.examples, param)
wdk.set_normalizer(normalizer)
wdk.init_normalizer()

print "--->", wdk.get_normalizer().get_name()

svm = SVMLight(cost, wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)

svm.train()

print "finished training manually set SVM:", svm.get_objective()
print "features" print feats_train.to_string() print "kernel" print kernel.to_string() print "svm" print svm.to_string() print "#################################" fn = "serialized_svm.bz2" print "serializing SVM to file", fn save(fn, svm) print "#################################" print "unserializing SVM" svm2 = load(fn) print "#################################" print "comparing training" svm2.train() print "objective before serialization:", svm.get_objective() print "objective after serialization:", svm2.get_objective()
def serialization_svmlight_modular(num, dist, width, C):
    from shogun.IO import MSG_DEBUG
    from shogun.Features import RealFeatures, BinaryLabels, DNA, Alphabet
    from shogun.Kernel import WeightedDegreeStringKernel, GaussianKernel
    from shogun.Classifier import SVMLight
    from numpy import concatenate, ones
    from numpy.random import randn, seed
    import sys
    import types
    import random
    import bz2
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import inspect

    def save(filename, myobj):
        """
        save object to file using pickle

        @param filename: name of destination file
        @type filename: str
        @param myobj: object to save (has to be pickleable)
        @type myobj: obj
        """
        try:
            f = bz2.BZ2File(filename, 'wb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be written\n')
            sys.stderr.write(str(details))
            return

        pickle.dump(myobj, f, protocol=2)
        f.close()

    def load(filename):
        """
        Load from filename using pickle

        @param filename: name of file to load from
        @type filename: str
        """
        try:
            f = bz2.BZ2File(filename, 'rb')
        except IOError as details:
            sys.stderr.write('File ' + filename + ' cannot be read\n')
            sys.stderr.write(str(details))
            return

        myobj = pickle.load(f)
        f.close()
        return myobj

    ##################################################
    # set up toy data and svm

    seed(17)
    traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
    testdata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)

    trainlab = concatenate((-ones(num), ones(num)))
    testlab = concatenate((-ones(num), ones(num)))

    feats_train = RealFeatures(traindata_real)
    feats_test = RealFeatures(testdata_real)
    kernel = GaussianKernel(feats_train, feats_train, width)
    #kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(trainlab)

    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    ##################################################
    # serialize, unserialize, and compare

    #print("labels:")
    #print(pickle.dumps(labels))
    #
    #print("features")
    #print(pickle.dumps(feats_train))
    #
    #print("kernel")
    #print(pickle.dumps(kernel))
    #
    #print("svm")
    #print(pickle.dumps(svm))
    #
    #print("#################################")

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    #print("#################################")
    #print("unserializing SVM")
    svm2 = load(fn)

    #print("#################################")
    #print("comparing training")
    svm2.train()

    #print("objective before serialization:", svm.get_objective())
    #print("objective after serialization:", svm2.get_objective())

    return svm, svm.get_objective(), svm2, svm2.get_objective()
labels_presvm[15] = 1
labels_presvm[8] = 1
labels_presvm[19] = 1

feat_presvm = StringCharFeatures(DNA)
feat_presvm.set_features(examples_presvm)
wdk_presvm = WeightedDegreeStringKernel(feat_presvm, feat_presvm, 1)
lab_presvm = Labels(numpy.array(labels_presvm))

presvm = SVMLight(1, wdk_presvm, lab_presvm)
presvm.train()

presvm2 = LibSVM(1, wdk_presvm, lab_presvm)
presvm2.train()

print "svmlight", presvm.get_objective()
print "libsvm", presvm2.get_objective()

assert(abs(presvm.get_objective() - presvm2.get_objective()) <= 0.001)

print "simple svm", presvm.get_objective()
print "len(examples_presvm)", len(examples_presvm)
print "##############"

#############################################
#    compute linear term manually
#############################################

examples = [i.example for i in d[subset_size:subset_size * 2]]
else:
    normalizer.set_task_similarity(i, j, 1.0)

base_wdk.set_normalizer(normalizer)

print base_wdk.get_kernel_matrix()
print "--->", base_wdk.get_normalizer().get_name()

svm = SVMLight(1, base_wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)

svm.train(feat)

print "internally modified kernel. objective:", svm.get_objective()

##################################################################
# regular SVM
##################################################################

wdk = WeightedDegreeStringKernel(feat, feat, 1)
normalizer = IdentityKernelNormalizer()
wdk.set_normalizer(normalizer)

svm = SVMLight(1, wdk, lab)
svm.set_linadd_enabled(False)
def test_data():

    ##################################################################
    # select MSS
    ##################################################################

    mss = expenv.MultiSplitSet.get(379)

    ##################################################################
    # data
    ##################################################################

    # fetch data
    instance_set = mss.get_train_data(-1)

    # prepare data
    data = PreparedMultitaskData(instance_set, shuffle=True)

    # set parameters
    param = Options()
    param.kernel = "WeightedDegreeStringKernel"
    param.wdk_degree = 4
    param.cost = 1.0
    param.transform = 1.0
    param.id = 666
    param.freeze()

    ##################################################################
    # taxonomy
    ##################################################################

    taxonomy = shogun_factory.create_taxonomy(mss.taxonomy.data)

    support = numpy.linspace(0, 100, 4)

    distances = [[0, 1, 2, 2], [1, 0, 2, 2], [2, 2, 0, 1], [2, 2, 1, 0]]

    # create tree normalizer
    tree_normalizer = MultitaskKernelPlifNormalizer(support, data.task_vector_names)

    task_names = data.get_task_names()

    FACTOR = 1.0

    # init gamma matrix
    gammas = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    for t1_name in task_names:
        for t2_name in task_names:
            similarity = taxonomy.compute_node_similarity(taxonomy.get_id(t1_name), taxonomy.get_id(t2_name))
            gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)] = similarity

    helper.save("/tmp/gammas", gammas)

    gammas = gammas * FACTOR
    cost = param.cost * numpy.sqrt(FACTOR)

    print gammas

    ##########
    # regular normalizer

    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    for t1_name in task_names:
        for t2_name in task_names:
            similarity = gammas[data.name_to_id(t1_name), data.name_to_id(t2_name)]
            normalizer.set_task_similarity(data.name_to_id(t1_name), data.name_to_id(t2_name), similarity)

    ##################################################################
    # Train SVMs
    ##################################################################

    # create shogun objects
    wdk_tree = shogun_factory.create_kernel(data.examples, param)
    lab = shogun_factory.create_labels(data.labels)

    wdk_tree.set_normalizer(tree_normalizer)
    wdk_tree.init_normalizer()

    print "--->", wdk_tree.get_normalizer().get_name()

    svm_tree = SVMLight(cost, wdk_tree, lab)
    svm_tree.set_linadd_enabled(False)
    svm_tree.set_batch_computation_enabled(False)

    svm_tree.train()

    del wdk_tree
    del tree_normalizer

    print "finished training tree-norm SVM:", svm_tree.get_objective()

    wdk = shogun_factory.create_kernel(data.examples, param)
    wdk.set_normalizer(normalizer)
    wdk.init_normalizer()

    print "--->", wdk.get_normalizer().get_name()

    svm = SVMLight(cost, wdk, lab)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.train()

    print "finished training manually set SVM:", svm.get_objective()

    alphas_tree = svm_tree.get_alphas()
    alphas = svm.get_alphas()

    assert(len(alphas_tree) == len(alphas))

    for i in xrange(len(alphas)):
        assert(abs(alphas_tree[i] - alphas[i]) < 0.0001)

    print "success: all alphas are the same"
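# --- illustrative sketch (not part of the original snippets) ---
# The element-wise alpha comparison at the end of test_data() can also be
# written with numpy.allclose; a small standalone equivalent (the helper
# name is made up):
import numpy

def alphas_match(alphas_a, alphas_b, atol=0.0001):
    a = numpy.asarray(alphas_a)
    b = numpy.asarray(alphas_b)
    return a.shape == b.shape and numpy.allclose(a, b, atol=atol)

print alphas_match([0.5, -0.25, 0.1], [0.5, -0.25, 0.1])   # True
print alphas_match([0.5, -0.25, 0.1], [0.5, -0.25, 0.2])   # False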
def _inner_train(self, train_data, param):
    """
    perform inner training by processing the tree
    """

    data_keys = []
    # top-down processing of taxonomy

    classifiers = []
    classifier_at_node = {}

    root = param.taxonomy.data

    grey_nodes = [root]

    while len(grey_nodes) > 0:

        node = grey_nodes.pop(0)  # pop first item

        # enqueue children
        if node.children != None:
            grey_nodes.extend(node.children)

        #####################################################
        #    init data structures
        #####################################################

        # get data below current node
        data = [train_data[key] for key in node.get_data_keys()]

        data_keys.append(node.get_data_keys())

        print "data at current level"
        for instance_set in data:
            print instance_set[0].dataset

        # initialize containers
        examples = []
        labels = []

        # concatenate data
        for instance_set in data:

            print "train split_set:", instance_set[0].dataset.organism

            for inst in instance_set:
                examples.append(inst.example)
                labels.append(inst.label)

        # create shogun data objects
        k = shogun_factory.create_kernel(examples, param)
        lab = shogun_factory.create_labels(labels)

        #####################################################
        #    train weak learners
        #####################################################

        cost = param.cost

        # set up svm
        svm = SVMLight(cost, k, lab)

        if param.flags["normalize_cost"]:
            # set class-specific Cs
            norm_c_pos = param.cost / float(len([l for l in labels if l == 1]))
            norm_c_neg = param.cost / float(len([l for l in labels if l == -1]))
            svm.set_C(norm_c_neg, norm_c_pos)

            print "using cost: negative class=%f, positive class=%f" % (norm_c_neg, norm_c_pos)

        # enable output
        svm.io.enable_progress()
        svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

        # train
        svm.train()

        # append svm object
        classifiers.append(svm)
        classifier_at_node[node.name] = svm

        # save some information
        self.additional_information[node.name + " svm obj"] = svm.get_objective()
        self.additional_information[node.name + " svm num sv"] = svm.get_num_support_vectors()
        self.additional_information[node.name + " runtime"] = svm.get_runtime()

    return (classifiers, classifier_at_node)
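# --- illustrative sketch (not part of the original snippets) ---
# The "normalize_cost" branch above divides C by the number of examples in
# each class, so that an imbalanced class does not dominate the objective.
# A standalone sketch of that computation (the helper name is made up):
def class_balanced_costs(labels, cost):
    """Return (C_neg, C_pos), each scaled by the inverse class count."""
    num_pos = float(len([l for l in labels if l == 1]))
    num_neg = float(len([l for l in labels if l == -1]))
    return cost / num_neg, cost / num_pos

toy_labels = [1, 1, -1, -1, -1, -1]
print class_balanced_costs(toy_labels, cost=1.0)   # -> (0.25, 0.5)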
labels_presvm[19] = 1

feat_presvm = StringCharFeatures(DNA)
feat_presvm.set_features(examples_presvm)
wdk_presvm = WeightedDegreeStringKernel(feat_presvm, feat_presvm, 1)
lab_presvm = Labels(numpy.array(labels_presvm))

presvm = SVMLight(1, wdk_presvm, lab_presvm)
presvm.train()

presvm2 = LibSVM(1, wdk_presvm, lab_presvm)
presvm2.train()

print "svmlight", presvm.get_objective()
print "libsvm", presvm2.get_objective()

assert(abs(presvm.get_objective() - presvm2.get_objective()) <= 0.001)

print "simple svm", presvm.get_objective()
print "len(examples_presvm)", len(examples_presvm)
print "##############"

#############################################
#    compute linear term manually
#############################################
    normalizer.set_task_similarity(i, j, 4.0)
else:
    normalizer.set_task_similarity(i, j, 1.0)

base_wdk.set_normalizer(normalizer)

print base_wdk.get_kernel_matrix()
print "--->", base_wdk.get_normalizer().get_name()

svm = SVMLight(1, base_wdk, lab)
svm.set_linadd_enabled(False)
svm.set_batch_computation_enabled(False)

svm.train(feat)

print "internally modified kernel. objective:", svm.get_objective()

##################################################################
# regular SVM
##################################################################

wdk = WeightedDegreeStringKernel(feat, feat, 1)
normalizer = IdentityKernelNormalizer()
wdk.set_normalizer(normalizer)

svm = SVMLight(1, wdk, lab)
svm.set_linadd_enabled(False)

svm.train()

#print "unmodified svm. objective:", svm.get_objective()