def evaluation_cross_validation_multiclass_storage(traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from shogun.Evaluation import MulticlassAccuracy, F1Measure
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import MulticlassLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import MKLMulticlass
    from shogun.Mathematics import Statistics, MSG_DEBUG

    # training data, combined features all on same data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels = MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel = CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create multiclass MKL
    svm = MKLMulticlass(1.0, kernel, labels)

    # splitting strategy for 5-fold cross-validation (stratified splitting is
    # preferable for classification; the plain "CrossValidationSplitting" is
    # also available)
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MulticlassAccuracy()

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
        splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross-validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage=CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage = CrossValidationMulticlassStorage()
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    roc_0_0_0 = multiclass_storage.get_fold_ROC(0, 0, 0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0, 0, 0, 0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
def evaluation_cross_validation_mkl_weight_storage(traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import BinaryLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import LibSVM, MKLClassification
    from shogun.Mathematics import Statistics

    # training data, combined features all on same data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels = BinaryLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel = CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create MKL using LibSVM; due to a memory bug, interleaved optimization
    # is not possible here
    svm = MKLClassification(LibSVM())
    svm.set_interleaved_optimization_enabled(False)
    svm.set_kernel(kernel)

    # splitting strategy for 5-fold cross-validation (stratified splitting is
    # preferable for classification; the plain "CrossValidationSplitting" is
    # also available)
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
        splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross-validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    mkl_storage = CrossValidationMKLStorage()
    cross_validation.add_cross_validation_output(mkl_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    # mkl weights recorded per fold/run
    weights = mkl_storage.get_mkl_weights()
    return weights
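# A minimal usage sketch for the function above, assuming the module-level toy
# data `traindat`/`label_traindat` that these examples use. The returned weight
# matrix has one column per fold/run, so per-kernel statistics can be computed
# with shogun's Statistics helpers, as in the cross-validation classification
# example further below.
#
#   from shogun.Mathematics import Statistics
#   weights = evaluation_cross_validation_mkl_weight_storage(traindat, label_traindat)
#   print "mean weight per kernel", Statistics.matrix_mean(weights, False)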
def statistics_linear_time_mmd_kernel_choice():
    from numpy import array, concatenate, mean
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

    # note that the linear time statistic is designed for much larger datasets
    n = 50000
    dim = 5
    difference = 2

    # data is standard normal distributed. only one dimension of Y has a mean
    # shift of difference (gen_data is an external helper module)
    (X, Y) = gen_data.create_mean_data(n, dim, difference)

    # concatenate since the MMD class takes data as one feature object
    # (it is possible to give two, but then data is copied)
    Z = concatenate((X, Y), axis=1)
    print "dimension means of X", [mean(x) for x in X]
    print "dimension means of Y", [mean(x) for x in Y]

    # create kernels/features to choose from
    # here: just a bunch of Gaussian kernels with different widths
    # real sigmas are 2^-5, ..., 2^9
    sigmas = array([pow(2, x) for x in range(-5, 10)])

    # shogun has a different parametrization of the Gaussian kernel
    shogun_sigmas = array([x * x * 2 for x in sigmas])

    # we will use multiple kernels
    kernel = CombinedKernel()

    # one feature object per kernel here, could also be one with appended data;
    # all kernels work on the same features
    features = CombinedFeatures()
    for i in range(len(sigmas)):
        kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
        features.append_feature_obj(RealFeatures(Z))

    mmd = LinearTimeMMD(kernel, features, n)

    print "start learning kernel weights"
    mmd.set_opt_regularization_eps(10E-5)
    mmd.set_opt_low_cut(10E-5)
    mmd.set_opt_max_iterations(1000)
    mmd.set_opt_epsilon(10E-7)
    mmd.optimize_kernel_weights()
    weights = kernel.get_subkernel_weights()
    print "learned weights:", weights
    #pyplot.plot(array(range(len(sigmas))), weights)
    #pyplot.show()
    print "index of max weight", weights.argmax()
def statistics_linear_time_mmd_kernel_choice():
    from numpy import array, mean
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

    # note that the linear time statistic is designed for much larger datasets
    n = 50000
    dim = 5
    difference = 2

    # use data generator class to produce example data
    # in practice, this generate data function could be replaced by a method
    # that obtains data from a stream
    data = DataGenerator.generate_mean_data(n, dim, difference)

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2 * n].T)

    # create kernels/features to choose from
    # here: just a bunch of Gaussian kernels with different widths
    # real sigmas are 2^-5, ..., 2^9
    sigmas = array([pow(2, x) for x in range(-5, 10)])

    # shogun has a different parametrization of the Gaussian kernel
    shogun_sigmas = array([x * x * 2 for x in sigmas])

    # we will use multiple kernels
    kernel = CombinedKernel()

    # one feature object per kernel here, could also be one with appended data;
    # all kernels work on the same features
    features = CombinedFeatures()
    for i in range(len(sigmas)):
        kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
        features.append_feature_obj(RealFeatures(data))

    mmd = LinearTimeMMD(kernel, features, n)

    print "start learning kernel weights"
    mmd.set_opt_regularization_eps(10E-5)
    mmd.set_opt_low_cut(10E-5)
    mmd.set_opt_max_iterations(1000)
    mmd.set_opt_epsilon(10E-7)
    mmd.optimize_kernel_weights()
    weights = kernel.get_subkernel_weights()
    print "learned weights:", weights
    #pyplot.plot(array(range(len(sigmas))), weights)
    #pyplot.show()
    print "index of max weight", weights.argmax()
def training_run(options):
    """Conduct a training run and return a trained SVM kernel"""
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH,
                                   options.window_width, options.replace)
    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()
    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()

    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])

    # Creating Kernel Objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    motifs = pmotifs.keys()
    motifs.sort()

    # Adding Kmer Kernels
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features,
                                                        wds_kparams["degree"])
        wds_kernel.set_shifts(wds_kparams["shift"] *
                              ones(wds_kparams["seqlength"], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)

    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)
    rbf_kernel = GaussianKernel(position_features, position_features,
                                kirmes_ini.RBF_KERNEL_PARAMETERS["width"])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)

    # Kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)
    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)

    # Training
    svm.train()
    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)
    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array, motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs, options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
def create_empty_promoter_kernel(param):
    """
    creates an uninitialized promoter kernel

    @param param: dictionary of kernel parameters ("shifts", "degree",
                  "center_offset", "kernel_cache")
    """
    # centered WDK/WDK-shift
    if param["shifts"] == 0:
        kernel_center = WeightedDegreeStringKernel(param["degree"])
    else:
        kernel_center = WeightedDegreePositionStringKernel(10, param["degree"])
        shifts_vector = numpy.ones(param["center_offset"] * 2,
                                   dtype=numpy.int32) * param["shifts"]
        kernel_center.set_shifts(shifts_vector)

    kernel_center.set_cache_size(param["kernel_cache"] / 3)

    # border spectrum kernels
    size = param["kernel_cache"] / 3
    use_sign = False
    kernel_left = WeightedCommWordStringKernel(size, use_sign)
    kernel_right = WeightedCommWordStringKernel(size, use_sign)

    # assemble combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_center)
    kernel.append_kernel(kernel_left)
    kernel.append_kernel(kernel_right)

    return kernel
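# A minimal sketch of the parameter dictionary this factory expects, inferred
# from the keys accessed above; the concrete values are illustrative
# assumptions, not recommended settings.
param = {
    "shifts": 2,           # 0 would select the plain WeightedDegreeStringKernel
    "degree": 20,          # weighted-degree kernel degree
    "center_offset": 50,   # half the length of the centered region
    "kernel_cache": 300,   # cache budget, split across the three subkernels
}
promoter_kernel = create_empty_promoter_kernel(param)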
def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list
    (removing reverse complement)

    Arguments:
    subfeats_list -- list of sub-feature objects
    options -- object containing option data

    Return:
    CombinedKernel of CommWord(Ulong)StringKernel, initialized on the
    CombinedFeatures built from subfeats_list
    """
    kmerlen = options.kmerlen
    kmerlen2 = options.kmerlen2

    subkernels = 0
    kernel = CombinedKernel()
    feats = CombinedFeatures()

    for subfeats in subfeats_list:
        feats.append_feature_obj(subfeats)

    for k in xrange(kmerlen, kmerlen2 + 1):
        # word features suffice for short k-mers, ulong features for longer ones
        if k <= 8:
            subkernel = CommWordStringKernel(10, False)
        else:
            subkernel = CommUlongStringKernel(10, False)
        kernel.append_kernel(subkernel)
        subkernels += 1

    kernel.init(feats, feats)

    # uniform subkernel weights
    kernel.set_subkernel_weights(numpy.array([1 / float(subkernels)] * subkernels,
                                             numpy.dtype('float64')))
    return kernel
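# Hypothetical usage sketch; `construct_word_features` stands in for whatever
# helper produced the StringWord/StringUlong sub-features and is not part of
# this snippet.
#
#   subfeats_list = [construct_word_features(seqs, k)
#                    for k in xrange(options.kmerlen, options.kmerlen2 + 1)]
#   kernel = get_weighted_spectrum_kernel(subfeats_list, options)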
def create_combined_kernel(kname, kparam, examples, train_mode, preproc):
    """A wrapper for creating combined kernels.

    kname, kparam and examples are lists.
    """
    num_kernels = len(kname)
    feats = {}
    feats['combined'] = CombinedFeatures()
    kernel = CombinedKernel()

    for kix in xrange(num_kernels):
        cur_kname = '%s%d' % (kname[kix], kix)
        (cur_feats, cur_preproc) = create_features(kname[kix], examples[kix],
                                                   kparam[kix], train_mode, preproc)
        feats[cur_kname] = cur_feats
        cur_kernel = create_kernel(kname[kix], kparam[kix], cur_feats)
        kernel.append_kernel(cur_kernel)

    return (feats, kernel)
def train(self, data, labels):
    """
    model training
    """
    # centered WDK/WDK-shift
    if self.param["shifts"] == 0:
        kernel_center = WeightedDegreeStringKernel(self.param["degree"])
    else:
        kernel_center = WeightedDegreePositionStringKernel(10, self.param["degree"])
        shifts_vector = numpy.ones(self.param["center_offset"] * 2,
                                   dtype=numpy.int32) * self.param["shifts"]
        kernel_center.set_shifts(shifts_vector)

    kernel_center.set_cache_size(self.param["kernel_cache"] / 3)

    # border spectrum kernels
    size = self.param["kernel_cache"] / 3
    use_sign = False
    kernel_left = WeightedCommWordStringKernel(size, use_sign)
    kernel_right = WeightedCommWordStringKernel(size, use_sign)

    # assemble combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_center)
    kernel.append_kernel(kernel_left)
    kernel.append_kernel(kernel_right)

    ## building features
    feat = create_features(data, self.param["center_offset"], self.param["center_pos"])

    # init combined kernel
    kernel.init(feat, feat)

    print "len(labels) = %i" % (len(labels))
    lab = BinaryLabels(numpy.double(labels))
    self.svm = SVMLight(self.param["cost"], kernel, lab)

    # show debugging output
    self.svm.io.enable_progress()
    self.svm.io.set_loglevel(MSG_DEBUG)

    # optimization settings
    num_threads = 2
    self.svm.parallel.set_num_threads(num_threads)
    self.svm.set_epsilon(10e-8)

    self.svm.train()

    return self
def loadSVM(pickled_svm_filename, C, labels):
    """Loads a Shogun SVM object which was pickled by saveSVM"""
    from cPickle import Unpickler, PickleError
    from shogun.Kernel import CombinedKernel

    pickle_file = open(pickled_svm_filename, 'rb')
    unpck = Unpickler(pickle_file)
    (version, num_sv, name, bias, alphas, svs) = unpck.load()
    if (version == __version__):
        svm = LibSVM(num_sv)  # same as .create_new_model(num_sv)
        svm.set_bias(bias)
        svm.set_alphas(alphas)
        svm.set_support_vectors(svs)
        kernel = CombinedKernel()  # not sure if this is even required
        kernel.set_name(name)  # maybe not required
        svm.set_kernel(kernel)
    else:
        print "File was pickled by another version of EasySVM.py or is not an SVM:"
        print "Received from ", pickled_svm_filename, ": ", version, " expected: ", __version__
        raise PickleError
    return svm
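# Hedged round-trip sketch, assuming a file previously written by the matching
# saveSVM; the filename, cost and labels are placeholders.
#
#   svm = loadSVM("model.pck", C=1.0, labels=labels)
#   print "bias:", svm.get_bias()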
def create_combined_wd_kernel(instances, param):
    """
    creates a combined wd kernel object
    """
    num_features = len(instances[0])

    # construct combined kernel with one WD kernel per feature position
    kernel = CombinedKernel()

    for idx in range(num_features):
        param.kernel = "WeightedDegreeStringKernel"
        tmp_kernel = create_empty_kernel(param)
        kernel.append_kernel(tmp_kernel)

    combined_features = create_combined_wd_features(instances, feat_type="dna")

    kernel.init(combined_features, combined_features)

    return kernel
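# Hypothetical usage sketch; `instances` is assumed to be a list of
# fixed-length tuples of DNA strings and `param` a parameter object with a
# mutable `kernel` field, as used above.
#
#   kernel = create_combined_wd_kernel(instances, param)
#   print kernel.get_num_subkernels()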
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    sc1 = StringCharFeatures(strings1, RAWBYTE)
    sc2 = StringCharFeatures(strings2, RAWBYTE)

    sfeats1 = StringWordFeatures(RAWBYTE)
    sfeats1.obtain_from_char(sc1, 0, 2, 0, False)
    skernel1 = CommWordStringKernel(10, False)
    skernel1.init(sfeats1, sfeats1)
    sfeats2 = StringWordFeatures(RAWBYTE)
    sfeats2.obtain_from_char(sc2, 0, 2, 0, False)
    skernel2 = CommWordStringKernel(10, False)
    skernel2.init(sfeats2, sfeats2)

    ffeats = RealFeatures(traindat)
    fkernel = LinearKernel(ffeats, ffeats)

    fffeats = RealFeatures(traindat)
    ffkernel = GaussianKernel(fffeats, fffeats, 1.0)

    # NOTE: combining linadd features/kernels leads to a failure, hence the
    # Gaussian kernel/features are commented out below
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(ffeats)
    #feats_train.append_feature_obj(fffeats)
    feats_train.append_feature_obj(sfeats2)

    print feats_train.get_num_vectors()

    kernel = CombinedKernel()
    kernel.append_kernel(fkernel)
    #kernel.append_kernel(ffkernel)
    kernel.append_kernel(skernel2)
    kernel.init(feats_train, feats_train)

    labels = RegressionLabels(fm_label_twoclass)
    mkl = MKLRegression()
    mkl.set_mkl_norm(1)  # 2, 3
    mkl.set_C(1, 1)
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)
    mkl.io.enable_file_and_line()
    mkl.io.set_loglevel(MSG_DEBUG)
def kernel_combined_modular(fm_train_real=traindat, fm_test_real=testdat, fm_train_dna=traindna, fm_test_dna=testdna):
    from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # Gaussian kernel on the real-valued features
    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = GaussianKernel(10, 1.1)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    # fixed-degree string kernel on the DNA features
    subkfeats_train = StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test = StringCharFeatures(fm_test_dna, DNA)
    degree = 3
    subkernel = FixedDegreeStringKernel(10, degree)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    # local alignment string kernel on the DNA features
    subkfeats_train = StringCharFeatures(fm_train_dna, DNA)
    subkfeats_test = StringCharFeatures(fm_test_dna, DNA)
    subkernel = LocalAlignmentStringKernel(10)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
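# Usage sketch with the module-level toy datasets these examples assume:
#
#   km_train, km_test, kernel = kernel_combined_modular(traindat, testdat, traindna, testdna)
#   print km_train.shape, km_test.shape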
def serialization_string_kernels_modular(n_data, num_shifts, size):
    """
    serialize svm with string kernels
    """
    ##################################################
    # set up toy data and svm

    train_xt, train_lt = generate_random_data(n_data)
    test_xt, test_lt = generate_random_data(n_data)

    feats_train = construct_features(train_xt)
    feats_test = construct_features(test_xt)

    max_len = len(train_xt[0])
    kernel_wdk = WeightedDegreePositionStringKernel(size, 5)
    shifts_vector = numpy.ones(max_len, dtype=numpy.int32) * num_shifts
    kernel_wdk.set_shifts(shifts_vector)

    ########
    # set up spectrum
    use_sign = False
    kernel_spec_1 = WeightedCommWordStringKernel(size, use_sign)
    kernel_spec_2 = WeightedCommWordStringKernel(size, use_sign)

    ########
    # combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(kernel_wdk)
    kernel.append_kernel(kernel_spec_1)
    kernel.append_kernel(kernel_spec_2)

    # init kernel
    labels = BinaryLabels(train_lt)

    svm = SVMLight(1.0, kernel, labels)
    #svm.io.set_loglevel(MSG_DEBUG)
    svm.train(feats_train)

    ##################################################
    # serialize to file

    fn = "serialized_svm.bz2"
    #print("serializing SVM to file", fn)
    save(fn, svm)

    ##################################################
    # unserialize and sanity check

    #print("unserializing SVM")
    svm2 = load(fn)

    #print("comparing predictions")
    out = svm.apply(feats_test).get_labels()
    out2 = svm2.apply(feats_test).get_labels()

    # assert outputs are close
    for i in range(len(out)):
        assert abs(out[i] - out2[i]) < 0.000001

    #print("all checks passed.")
    return out, out2
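# Usage sketch (the data helpers generate_random_data/construct_features are
# part of the surrounding example file, not shown here; the argument values
# are illustrative assumptions):
#
#   out, out2 = serialization_string_kernels_modular(50, 2, 10)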
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
                           width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun.Classifier import MKLMultiClass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = GaussianKernel(10, width)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = LinearKernel()
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    subkfeats_train = RealFeatures(fm_train_real)
    subkfeats_test = RealFeatures(fm_test_real)
    subkernel = PolyKernel(10, 2)
    feats_train.append_feature_obj(subkfeats_train)
    feats_test.append_feature_obj(subkfeats_test)
    kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(label_train_multiclass)

    mkl = MKLMultiClass(C, kernel, labels)
    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)

    mkl.train()

    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
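# Usage sketch; the hyper-parameter values below are illustrative assumptions:
#
#   out = mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
#                                width=1.2, C=1.0, epsilon=1e-5, num_threads=1,
#                                mkl_epsilon=0.001, mkl_norm=2)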
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """
    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    ##################################################
    # intra-domain blocks (commented-out variant)

    #    intra_block_vec = PairiiVec()
    #
    #    for task_id in data.get_task_ids():
    #        intra_block_vec.push_back(Pairii(task_id, task_id))
    #
    #    # create mask-based normalizer
    #    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    #    kernel = shogun_factory.create_empty_kernel(param)
    #    kernel.set_normalizer(normalizer)
    #
    #    # append current kernel to CombinedKernel
    #    combined_kernel.append_kernel(kernel)
    #
    #    # append features
    #    combined_features.append_feature_obj(base_features)
    #
    #    print "------"

    ##################################################
    # all blocks (commented-out variant)

    #    all_block_vec = PairiiVec()
    #
    #    for task_id_1 in data.get_task_ids():
    #        for task_id_2 in data.get_task_ids():
    #            all_block_vec.push_back(Pairii(task_id_1, task_id_2))
    #
    #    # create mask-based normalizer
    #    normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
    #    kernel_all = shogun_factory.create_empty_kernel(param)
    #    kernel_all.set_normalizer(normalizer_all)
    #
    #    # append current kernel to CombinedKernel
    #    combined_kernel.append_kernel(kernel_all)
    #
    #    # append features
    #    combined_features.append_feature_obj(base_features)

    ##################################################
    # add one kernel per similarity position

    # init seq handler
    pseudoseqs = SequencesHandler()
    pseudoseq_length = pseudoseqs.seq_length

    for pos in range(pseudoseq_length):
        print "appending kernel for pos %i" % (pos)
        print "nums", data.task_vector_nums

        pos_block_vec = PairiiVec()

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                similarity = pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pos)
                #print "computing similarity for tasks (%s, %s) = %i" % (task_name_lhs, task_name_rhs, similarity)
                if similarity == 1:
                    tmp_pair = Pairii(data.name_to_id(task_name_lhs),
                                      data.name_to_id(task_name_rhs))
                    pos_block_vec.push_back(tmp_pair)

        print "creating normalizer"
        normalizer_pos = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, pos_block_vec)

        print "creating empty kernel"
        kernel_pos = shogun_factory.create_empty_kernel(param)

        print "setting normalizer"
        kernel_pos.set_normalizer(normalizer_pos)

        print "appending kernel"
        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_pos)

        print "appending features"
        # append features
        combined_features.append_feature_obj(base_features)

    print "done constructing combined kernel"

    ##################################################
    # init combined kernel
    combined_kernel.init(combined_features, combined_features)

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:
        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON) #auto
        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    print "WARNING: custom epsilon set"
    svm.set_epsilon(0.05)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional info
    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()

    print self.additional_information

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

    return svms
def quadratic_time_mmd_graphical():
    from numpy import zeros, array, linspace, floor
    from numpy.random import gamma
    from pylab import figure, title, subplot, grid, gca, plot, xlabel, ylabel, \
        hist, axvline, subplots_adjust
    from matplotlib.ticker import MaxNLocator
    from matplotlib.mlab import normpdf
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import QuadraticTimeMMD, MMDKernelSelectionMax
    from shogun.Statistics import BOOTSTRAP, MMD2_SPECTRUM, MMD2_GAMMA, BIASED

    # parameters, change to get different results
    m = 100
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 0.5

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # stream examples and merge them in order to compute MMD on joint sample;
    # alternative is to call a different constructor of QuadraticTimeMMD
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))

    # create combined kernel with Gaussian kernels inside, for median kernel
    # selection. Shogun's Gaussian kernel is parametrized differently from the
    # standard form: the usual 0.5*median_distance (factor two in the Gaussian
    # kernel) becomes 0.5*2*median_distance^2 here
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print "kernel widths:", widths
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # create MMD instance, use biased statistic
    mmd = QuadraticTimeMMD(combined, features, m)
    mmd.set_statistic_type(BIASED)

    # kernel selection instance (this can easily be replaced by the other
    # methods for selecting single kernels)
    selection = MMDKernelSelectionMax(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution (new data each trial)
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        # stream examples and merge them in order to replace data in MMD
        features = gen_p.get_streamed_features(m)
        features = features.create_merged_copy(gen_q.get_streamed_features(m))
        mmd.set_p_and_q(features)
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_statistic_type(BIASED)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # sample from null distribution
    # spectrum, biased statistic
    if "sample_null_spectrum" in dir(QuadraticTimeMMD):
        mmd.set_null_approximation_method(MMD2_SPECTRUM)
        mmd.set_statistic_type(BIASED)
        null_samples_spectrum = mmd.sample_null_spectrum(num_null_samples, m - 10)

    # fit gamma distribution, biased statistic
    mmd.set_null_approximation_method(MMD2_GAMMA)
    mmd.set_statistic_type(BIASED)
    gamma_params = mmd.fit_null_gamma()

    # sample gamma with parameters
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()
    title('Quadratic Time MMD')

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m:2 * m], data[1][m:2 * m], 'bo', label='$y$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_spectrum.sort()
    null_samples_gamma.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_spectrum = null_samples_spectrum[int(floor(len(null_samples_spectrum) * (1 - alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma) * (1 - alpha)))]

    # type I error: fraction of null samples exceeding the (1-alpha) threshold;
    # each null approximation is compared against its own threshold
    type_one_error_boot = sum(null_samples_boot > thresh_boot) / float(num_null_samples)
    type_one_error_spectrum = sum(null_samples_spectrum > thresh_spectrum) / float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma > thresh_gamma) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_spectrum), min(null_samples_gamma)]),
                  max([max(null_samples_boot), max(null_samples_spectrum), max(null_samples_gamma)])]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution spectrum
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_spectrum, 20, range=hist_range, normed=True)
    axvline(thresh_spectrum, 0, 1, linewidth=2, color='red')
    title('Null Dist. Spectrum\nType I error is ' + str(type_one_error_spectrum))

    # plot null distribution gamma
    subplot(2, 3, 6)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """
    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_kernel(data.examples, param)
    kernel_matrix = base_wdk.get_kernel_matrix()
    lab = shogun_factory.create_labels(data.labels)

    # fetch taxonomy from parameter object
    taxonomy = param.taxonomy.data

    # create name to leaf map
    nodes = taxonomy.get_all_nodes()

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    from shogun.Kernel import CombinedKernel, CustomKernel
    combined_kernel = CombinedKernel()

    # indicator to which task each example belongs
    task_vector = data.task_vector_names

    for node in nodes:
        print "creating kernel for ", node.name

        # fetch sub-tree
        leaf_names = [leaf.name for leaf in node.get_leaves()]

        print "masking all entries other than:", leaf_names

        # init matrix
        kernel_matrix_node = numpy.zeros(kernel_matrix.shape)

        # fill matrix for node
        for (i, task_lhs) in enumerate(task_vector):
            for (j, task_rhs) in enumerate(task_vector):
                # only copy values if both tasks are present in subtree
                if task_lhs in leaf_names and task_rhs in leaf_names:
                    kernel_matrix_node[i, j] = kernel_matrix[i, j]

        # create custom kernel
        kernel_node = CustomKernel()
        kernel_node.set_full_kernel_matrix_from_full(kernel_matrix_node)

        # append custom kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_node)

        print "------"

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:
        num_threads = 4
        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        svm.set_solver_type(ST_GLPK)  #DIRECT) #NEWTON) #ST_CPLEX
        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
        svm.parallel.set_num_threads(num_threads)
        #svm.set_linadd_enabled(False)
        #svm.set_batch_computation_enabled(False)
        svm.train()
        print "subkernel weights (after):", combined_kernel.get_subkernel_weights()
    else:
        # create SVM (disable unsupported optimizations)
        svm = SVMLight(param.cost, combined_kernel, lab)
        svm.set_linadd_enabled(False)
        svm.set_batch_computation_enabled(False)
        svm.train()

    ########################################################
    print "svm objective:"
    print svm.get_objective()
    ########################################################

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_id in train_data.keys():
        svms[task_id] = svm

    return svms
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))

feats_test = CombinedFeatures()
feats_test.append_feature_obj(RealFeatures(testdata_real))
feats_test.append_feature_obj(RealFeatures(testdata_real))
feats_test.append_feature_obj(RealFeatures(testdata_real))
feats_test.append_feature_obj(RealFeatures(testdata_real))
feats_test.append_feature_obj(RealFeatures(testdata_real))

labels = BinaryLabels(trainlab)

# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(GaussianKernel(10, s))
kernel.append_kernel(GaussianKernel(10, s))
kernel.append_kernel(GaussianKernel(10, s))
kernel.append_kernel(GaussianKernel(10, s))
kernel.append_kernel(GaussianKernel(10, s))
kernel.init(feats_train, feats_train)

# create a classifier
classifier = MKLClassification(LibSVM())
classifier.set_interleaved_optimization_enabled(False)
classifier.set_kernel(kernel)
classifier.set_labels(labels)
classifier.set_C(C, C)

param_tree_root = ModelSelectionParameters()
trainlab = concatenate((-ones(num), ones(num)))
testlab = concatenate((-ones(num), ones(num)))

pos = traindata_real[:, trainlab == 1]
neg = traindata_real[:, trainlab == -1]

# create combined train features
feats_train = CombinedFeatures()
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))
feats_train.append_feature_obj(RealFeatures(traindata_real))

# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(GaussianKernel(10, s))
kernel.append_kernel(GaussianKernel(10, s))
kernel.append_kernel(GaussianKernel(10, s))
kernel.append_kernel(GaussianKernel(10, s))
kernel.append_kernel(GaussianKernel(10, s))
kernel.init(feats_train, feats_train)
kernel.print_modsel_params()

# train mkl
labels = BinaryLabels(trainlab)
mkl = MKLClassification()

# which norm to use for MKL
mkl.set_mkl_norm(1)  # 2, 3

# set cost (neg, pos)
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """
    import numpy
    numpy.random.seed(666)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)

    # set kernel cache
    if param.flags.has_key("cache_size"):
        combined_kernel.set_cache_size(param.flags["cache_size"])

    # create features
    base_features = shogun_factory.create_features(data.examples, param)
    combined_features = CombinedFeatures()

    ########################################################
    print "creating a masked kernel for each possible subset:"
    ########################################################

    power_set_tasks = power_set(data.get_task_ids())

    for active_task_ids in power_set_tasks:
        print "masking all entries other than:", active_task_ids

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums,
                                                   data.task_vector_nums,
                                                   active_task_ids)

        # normalize trace
        if param.flags.has_key("normalize_trace") and param.flags["normalize_trace"]:
            norm_factor = len(data.get_task_ids()) / len(active_task_ids)
            normalizer.set_normalization_constant(norm_factor)

        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features
        combined_features.append_feature_obj(base_features)

        print "------"

    combined_kernel.init(combined_features, combined_features)
    #combined_kernel.precompute_subkernels()

    self.additional_information["weights before training"] = combined_kernel.get_subkernel_weights()
    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.flags["mkl_q"] >= 1.0)

    if param.flags["mkl_q"] >= 1.0:
        svm = MKLClassification()
        svm.set_mkl_norm(param.flags["mkl_q"])

        # set interleaved optimization
        if param.flags.has_key("interleaved"):
            svm.set_interleaved_optimization_enabled(param.flags["interleaved"])

        # set solver type
        if param.flags.has_key("solver_type") and param.flags["solver_type"]:
            if param.flags["solver_type"] == "ST_CPLEX":
                svm.set_solver_type(ST_CPLEX)
            if param.flags["solver_type"] == "ST_DIRECT":
                svm.set_solver_type(ST_DIRECT)
            if param.flags["solver_type"] == "ST_NEWTON":
                svm.set_solver_type(ST_NEWTON)
            if param.flags["solver_type"] == "ST_GLPK":
                svm.set_solver_type(ST_GLPK)

        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        svm = SVMLight(param.cost, combined_kernel, lab)

    # optimization settings
    num_threads = 4
    svm.parallel.set_num_threads(num_threads)

    if param.flags.has_key("epsilon"):
        svm.set_epsilon(param.flags["epsilon"])

    # enable output
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    # disable unsupported optimizations (due to special normalizer)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # set cost
    if param.flags["normalize_cost"]:
        norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
        svm.set_C(norm_c_neg, norm_c_pos)
    else:
        svm.set_C(param.cost, param.cost)

    svm.train()

    # prepare mapping from task subsets to learned kernel weights
    weight_map = {}
    weights = combined_kernel.get_subkernel_weights()
    for (i, pset) in enumerate(power_set_tasks):
        print pset
        subset_str = str([data.id_to_name(task_idx) for task_idx in pset])
        weight_map[subset_str] = weights[i]

    # store additional info
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["weight_map"] = weight_map

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), len(power_set_tasks),
                           combined_kernel, svm, param)

    return svms
feats_train = CombinedFeatures()
feats_train.append_feature_obj(RealFeatures(data_1))
feats_train.append_feature_obj(RealFeatures(data_2))
feats_train.append_feature_obj(RealFeatures(data_3))

#feats_test = CombinedFeatures()
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))

labels = BinaryLabels(trainlab)

# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(LinearKernel())
kernel.append_kernel(LinearKernel())
kernel.append_kernel(LinearKernel())
kernel.init(feats_train, feats_train)
kernel.print_modsel_params()

# create a classifier
classifier = MKLClassification(LibSVM())
classifier.set_interleaved_optimization_enabled(False)
classifier.set_kernel(kernel)
classifier.set_labels(labels)
classifier.set_C(2, 1)

param_tree_root = ModelSelectionParameters()
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """
    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ##################################################
    # define pockets
    ##################################################

    pockets = [0] * 9
    pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
    pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
    pockets[2] = [11, 20, 21, 22, 29, 31]
    pockets[3] = [8, 30, 31, 32]
    pockets[4] = [10, 11, 30]
    pockets[5] = [10, 11, 12, 13, 20, 29]
    pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
    pockets[7] = [12, 14, 15, 26]
    pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]

    #new_pockets = []
    #
    ## merge neighboring pockets
    #for i in range(8):
    #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))
    #
    #pockets = new_pockets

    ########################################################
    print "creating a kernel:"
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    ##################################################
    # intra-domain blocks (commented-out variant)

    #    intra_block_vec = PairiiVec()
    #
    #    for task_id in data.get_task_ids():
    #        intra_block_vec.push_back(Pairii(task_id, task_id))
    #
    #    # create mask-based normalizer
    #    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    #    kernel = shogun_factory.create_empty_kernel(param)
    #    kernel.set_normalizer(normalizer)
    #
    #    # append current kernel to CombinedKernel
    #    combined_kernel.append_kernel(kernel)
    #
    #    # append features
    #    combined_features.append_feature_obj(base_features)
    #
    #    print "------"

    ##################################################
    # all blocks (commented-out variant)

    #    all_block_vec = PairiiVec()
    #
    #    for task_id_1 in data.get_task_ids():
    #        for task_id_2 in data.get_task_ids():
    #            all_block_vec.push_back(Pairii(task_id_1, task_id_2))
    #
    #    # create mask-based normalizer
    #    normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
    #    kernel_all = shogun_factory.create_empty_kernel(param)
    #    kernel_all.set_normalizer(normalizer_all)
    #
    #    # append current kernel to CombinedKernel
    #    combined_kernel.append_kernel(kernel_all)
    #
    #    # append features
    #    combined_features.append_feature_obj(base_features)

    ##################################################
    # add one kernel per pocket

    # init seq handler
    pseudoseqs = SequencesHandler()

    for pocket in pockets:
        print "creating normalizer"
        #import pdb
        #pdb.set_trace()
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print "processing pocket", pocket

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                similarity = 0.0

                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs,
                                        task_name_rhs, pseudo_seq_pos - 1))

                # normalize
                similarity = similarity / float(len(pocket))

                print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)

                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

        print "creating empty kernel"
        kernel_pos = shogun_factory.create_empty_kernel(param)

        print "setting normalizer"
        kernel_pos.set_normalizer(normalizer)

        print "appending kernel"
        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_pos)

        print "appending features"
        # append features
        combined_features.append_feature_obj(base_features)

    print "done constructing combined kernel"

    ##################################################
    # init combined kernel

    # init weights
    #combined_kernel.set_subkernel_weights([1.0/2.85]*combined_kernel.get_num_subkernels())

    combined_kernel.init(combined_features, combined_features)

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:
        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON) #auto
        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    #print "WARNING: custom epsilon set"
    #svm.set_epsilon(0.05)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional info
    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()

    print self.additional_information

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

    return svms
def statistics_mmd_kernel_selection_single(m, distance, stretch, num_blobs, angle, selection_method):
    from math import pi
    from shogun.Features import RealFeatures
    from shogun.Features import GaussianBlobsDataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import MMDKernelSelectionMedian
    from shogun.Statistics import MMDKernelSelectionMax
    from shogun.Statistics import MMDKernelSelectionOpt
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets;
    # results for this low number will be bad (unstable, type I error wrong).
    # the following values override the function arguments
    m = 1000
    distance = 10
    stretch = 5
    num_blobs = 3
    angle = pi / 4

    # streaming data generator
    gen_p = GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
    gen_q = GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

    # stream some data and plot
    num_plot = 1000
    features = gen_p.get_streamed_features(num_plot)
    features = features.create_merged_copy(gen_q.get_streamed_features(num_plot))
    data = features.get_feature_matrix()

    #figure()
    #subplot(2,2,1)
    #grid(True)
    #plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
    #title('$X\sim p$')
    #subplot(2,2,2)
    #grid(True)
    #plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
    #title('$Y\sim q$')

    # create combined kernel with Gaussian kernels inside (shogun's Gaussian
    # kernel is parametrized differently from the standard form, see documentation)
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features, block size of 1000
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other
    # methods for selecting single kernels)
    if selection_method == "opt":
        selection = MMDKernelSelectionOpt(mmd)
    elif selection_method == "max":
        selection = MMDKernelSelectionMax(mmd)
    elif selection_method == "median":
        selection = MMDKernelSelectionMedian(mmd)

    # print measures (just for information)
    # in case Opt: ratios of MMD and standard deviation
    # in case Max: MMDs for each kernel
    # does not work for the median method
    if selection_method != "median":
        ratios = selection.compute_measures()
        #print "Measures:", ratios
        #subplot(2,2,3)
        #plot(ratios)
        #title('Measures')

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    #print "selected kernel width:", kernel.get_width()

    # compute type I and II error (use many more trials in practice). Type I
    # error is only estimated to check the MMD1_GAUSSIAN method for estimating
    # the null distribution. Note that testing has to happen on different data
    # than kernel selection, but the linear time mmd does this implicitly
    mmd.set_kernel(kernel)
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)

    # number of trials should be larger to compute tight confidence bounds
    num_trials = 5
    alpha = 0.05  # test level

    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is a type I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)

        typeIIerrors[i] = mmd.perform_test() > alpha

    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return kernel, typeIerrors, typeIIerrors
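# Usage sketch (the argument values are assumptions; note that the function
# body currently overrides them with the hard-coded values above):
#
#   from math import pi
#   kernel, typeI, typeII = statistics_mmd_kernel_selection_single(1000, 10, 5, 3, pi / 4, "opt")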
def evaluation_cross_validation_classification(traindat=traindat, label_traindat=label_traindat):
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import BinaryLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import LibSVM, MKLClassification
    from shogun.Mathematics import Statistics

    # training data, combined features all on same data
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels = BinaryLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel = CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create MKL using LibSVM; due to a memory bug, interleaved optimization
    # is not possible here
    svm = MKLClassification(LibSVM())
    svm.set_interleaved_optimization_enabled(False)
    svm.set_kernel(kernel)

    # splitting strategy for 5-fold cross-validation (stratified splitting is
    # preferable for classification; the plain "CrossValidationSplitting" is
    # also available)
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
        splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross-validation output classes
    cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    mkl_storage = CrossValidationMKLStorage()
    cross_validation.add_cross_validation_output(mkl_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    # print mkl weights
    weights = mkl_storage.get_mkl_weights()
    print "mkl weights during cross-validation"
    print weights
    print "mean per kernel"
    print Statistics.matrix_mean(weights, False)
    print "variance per kernel"
    print Statistics.matrix_variance(weights, False)
    print "std-dev per kernel"
    print Statistics.matrix_std_deviation(weights, False)
def kernel_combined_custom_poly_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    from shogun.Features import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM

    # training: a precomputed poly kernel plus a regular poly subkernel
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_train = RealFeatures(fm_train_real)
    feats_train.append_feature_obj(subkfeats_train)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_train)

    labels = BinaryLabels(fm_label_twoclass)
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    # testing: rebuild the combined kernel between train and test features
    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, pfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_test = RealFeatures(fm_test_real)
    feats_pred.append_feature_obj(subkfeats_test)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.apply()
    # note: at this point the combined kernel holds the train x test matrix
    km_train = kernel.get_kernel_matrix()
    return km_train, kernel
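# Hedged driver sketch (assumes the module-level traindat/testdat/label_traindat
# arrays that the default arguments refer to):
km, comb_kernel = kernel_combined_custom_poly_modular(traindat, testdat, label_traindat)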
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    # imports needed to make this snippet self-contained
    from shogun.Features import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun.Kernel import CombinedKernel, CustomKernel, PolyKernel
    from shogun.Classifier import MKLClassification

    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL
    mkl.set_mkl_norm(1)  # 2, 3

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    #w = kernel.get_subkernel_weights()
    #kernel.set_subkernel_weights(w)

    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    return mkl.apply(), kernel
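# Hedged driver sketch for the poly-MKL example above (relies on the same
# module-level defaults):
out_labels, trained_kernel = mkl_binclass_modular()
#print "MKL subkernel weights:", trained_kernel.get_subkernel_weights()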
def statistics_mmd_kernel_selection_combined(m, distance, stretch, num_blobs, angle, selection_method):
    from shogun.Features import RealFeatures
    from shogun.Features import GaussianBlobsDataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import MMDKernelSelectionCombMaxL2
    from shogun.Statistics import MMDKernelSelectionCombOpt
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN
    from shogun.Distance import EuclideanDistance
    from shogun.Mathematics import Statistics, Math

    # init seed for reproducibility
    Math.init_random(1)

    # note that the linear time statistic is designed for much larger datasets;
    # results for this low number of samples will be poor (unstable, wrong type I error)

    # streaming data generators
    gen_p = GaussianBlobsDataGenerator(num_blobs, distance, 1, 0)
    gen_q = GaussianBlobsDataGenerator(num_blobs, distance, stretch, angle)

    # stream some data (plotting code kept for reference)
    num_plot = 1000
    features = gen_p.get_streamed_features(num_plot)
    features = features.create_merged_copy(gen_q.get_streamed_features(num_plot))
    data = features.get_feature_matrix()
    #figure()
    #subplot(2,2,1)
    #grid(True)
    #plot(data[0][0:num_plot], data[1][0:num_plot], 'r.', label='$x$')
    #title('$X\sim p$')
    #subplot(2,2,2)
    #grid(True)
    #plot(data[0][num_plot+1:2*num_plot], data[1][num_plot+1:2*num_plot], 'b.', label='$x$', alpha=0.5)
    #title('$Y\sim q$')

    # create combined kernel with Gaussian kernels inside (shogun's Gaussian kernel
    # is parametrized differently to the standard form, see documentation)
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features, block size of 10000
    block_size = 10000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other methods
    # for selecting combined kernels)
    if selection_method == "opt":
        selection = MMDKernelSelectionCombOpt(mmd)
    elif selection_method == "l2":
        selection = MMDKernelSelectionCombMaxL2(mmd)

    # perform kernel selection (kernel is automatically set)
    kernel = selection.select_kernel()
    kernel = CombinedKernel.obtain_from_generic(kernel)
    #print "selected kernel weights:", kernel.get_subkernel_weights()
    #subplot(2,2,3)
    #plot(kernel.get_subkernel_weights())
    #title("Kernel weights")

    # compute type I and II errors (use many more trials in practice). Type I error
    # is only estimated to check the MMD1_GAUSSIAN method for estimating the null
    # distribution. Note that testing has to happen on different data than kernel
    # selection, but the linear time mmd does this implicitly
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)

    # number of trials should be larger to compute tight confidence bounds
    num_trials = 5
    alpha = 0.05  # test level
    typeIerrors = [0 for x in range(num_trials)]
    typeIIerrors = [0 for x in range(num_trials)]
    for i in range(num_trials):
        # this effectively means that p=q - rejecting is a type I error
        mmd.set_simulate_h0(True)
        typeIerrors[i] = mmd.perform_test() > alpha
        mmd.set_simulate_h0(False)
        typeIIerrors[i] = mmd.perform_test() > alpha
    #print "type I error:", mean(typeIerrors), ", type II error:", mean(typeIIerrors)

    return kernel, typeIerrors, typeIIerrors
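# Hedged driver sketch for the combined-kernel variant (values are illustrative
# assumptions):
from math import pi
sel_comb, errs_I, errs_II = statistics_mmd_kernel_selection_combined(1000, 10, 5, 3, pi / 4, "opt")
#print "selected subkernel weights:", sel_comb.get_subkernel_weights()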
def linear_time_mmd_graphical():
    # imports assumed for a self-contained run (plotting via pylab/matplotlib)
    from numpy import zeros, sqrt, linspace, floor
    from numpy.random import normal
    from pylab import figure, subplot, grid, gca, plot, hist, title, \
        xlabel, ylabel, axvline, subplots_adjust
    from matplotlib.mlab import normpdf
    from matplotlib.ticker import MaxNLocator
    from shogun.Features import MeanShiftDataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD, MMDKernelSelectionOpt
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

    # parameters, change to get different results
    m = 1000  # set to 10000 for a good test result
    dim = 2

    # setting the difference of the first dimension smaller makes a harder test
    difference = 1

    # number of samples taken from null and alternative distribution
    num_null_samples = 150

    # streaming data generator for mean shift distributions
    gen_p = MeanShiftDataGenerator(0, dim)
    gen_q = MeanShiftDataGenerator(difference, dim)

    # create combined kernel with Gaussian kernels inside. The usual median
    # heuristic would take 0.5*median_distance for the kernel width (factor two in
    # the Gaussian kernel); since shogun's kernel width is parametrized differently,
    # the corresponding value is 0.5*2*median_distance^2, computed on a subset of
    # roughly 200 elements (the median is stable). Here a range of widths is used
    # instead and the best one is selected below.
    sigmas = [2**x for x in range(-3, 10)]
    widths = [x * x * 2 for x in sigmas]
    print "kernel widths:", widths
    combined = CombinedKernel()
    for i in range(len(sigmas)):
        combined.append_kernel(GaussianKernel(10, widths[i]))

    # mmd instance using streaming features, block size of 1000
    block_size = 1000
    mmd = LinearTimeMMD(combined, gen_p, gen_q, m, block_size)

    # kernel selection instance (this can easily be replaced by the other methods
    # for selecting single kernels)
    selection = MMDKernelSelectionOpt(mmd)

    # perform kernel selection
    kernel = selection.select_kernel()
    kernel = GaussianKernel.obtain_from_generic(kernel)
    mmd.set_kernel(kernel)
    print "selected kernel width:", kernel.get_width()

    # sample alternative distribution, stream ensures different samples each run
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        alt_samples[i] = mmd.compute_statistic()

    # sample from null distribution
    # bootstrapping, biased statistic
    mmd.set_null_approximation_method(BOOTSTRAP)
    mmd.set_bootstrap_iterations(num_null_samples)
    null_samples_boot = mmd.bootstrap_null()

    # fit normal distribution to null and sample a normal distribution
    mmd.set_null_approximation_method(MMD1_GAUSSIAN)
    variance = mmd.compute_variance_estimate()
    null_samples_gaussian = normal(0, sqrt(variance), num_null_samples)

    # to plot data, sample a few examples from stream first
    features = gen_p.get_streamed_features(m)
    features = features.create_merged_copy(gen_q.get_streamed_features(m))
    data = features.get_feature_matrix()

    # plot
    figure()

    # plot data of p and q
    subplot(2, 3, 1)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    plot(data[0][0:m], data[1][0:m], 'ro', label='$x$')
    plot(data[0][m + 1:2 * m], data[1][m + 1:2 * m], 'bo', label='$x$', alpha=0.5)
    title('Data, shift in $x_1$=' + str(difference) + '\nm=' + str(m))
    xlabel('$x_1, y_1$')
    ylabel('$x_2, y_2$')

    # histogram of first data dimension and pdf
    subplot(2, 3, 2)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(data[0], bins=50, alpha=0.5, facecolor='r', normed=True)
    hist(data[1], bins=50, alpha=0.5, facecolor='b', normed=True)
    xs = linspace(min(data[0]) - 1, max(data[0]) + 1, 50)
    plot(xs, normpdf(xs, 0, 1), 'r', linewidth=3)
    plot(xs, normpdf(xs, difference, 1), 'b', linewidth=3)
    xlabel('$x_1, y_1$')
    ylabel('$p(x_1), p(y_1)$')
    title('Data PDF in $x_1, y_1$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gaussian.sort()
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot) * (1 - alpha)))]
    thresh_gaussian = null_samples_gaussian[int(floor(len(null_samples_gaussian) * (1 - alpha)))]

    type_one_error_boot = sum(null_samples_boot < thresh_boot) / float(num_null_samples)
    type_one_error_gaussian = sum(null_samples_gaussian < thresh_boot) / float(num_null_samples)

    # plot alternative distribution with threshold
    subplot(2, 3, 4)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples < thresh_boot) / float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_gaussian)]),
                  max([max(null_samples_boot), max(null_samples_gaussian)])]

    # plot null distribution with threshold
    subplot(2, 3, 3)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Bootstrapped Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution gaussian
    subplot(2, 3, 5)
    grid(True)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    hist(null_samples_gaussian, 20, range=hist_range, normed=True)
    axvline(thresh_gaussian, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gaussian\nType I error is ' + str(type_one_error_gaussian))

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
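# Hedged invocation sketch: the function draws on the current pylab canvas, so a
# driver would typically end with show():
#from pylab import show
#linear_time_mmd_graphical()
#show()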
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # dict to save additional information for later analysis
    self.additional_information = {}

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples, param)
    combined_features = CombinedFeatures()

    ##################################################
    # intra-domain blocks (dirac kernel)

    intra_block_vec = PairiiVec()
    for task_id in data.get_task_ids():
        intra_block_vec.push_back(Pairii(task_id, task_id))

    # create mask-based normalizer
    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    kernel = shogun_factory.create_empty_kernel(param)
    kernel.set_normalizer(normalizer)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel)

    # append features
    combined_features.append_feature_obj(base_features)

    print "------"

    ##################################################
    # all blocks (full kernel matrix)

    all_block_vec = PairiiVec()
    for task_id_1 in data.get_task_ids():
        for task_id_2 in data.get_task_ids():
            all_block_vec.push_back(Pairii(task_id_1, task_id_2))

    # create mask-based normalizer
    normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
    kernel_all = shogun_factory.create_empty_kernel(param)
    kernel_all.set_normalizer(normalizer_all)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel_all)

    # append features
    combined_features.append_feature_obj(base_features)

    ##################################################
    # hack
    #
    # hack_block_vec = PairiiVec()
    #
    # for task_id_1 in data.get_task_ids():
    #     for task_id_2 in data.get_task_ids():
    #         hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
    #
    # hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
    # other_group = ["B_0702", "B_1501", "B_5801"]
    # for task_id_1 in other_group:
    #     for task_id_2 in other_group:
    #         hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
    #
    # # create mask-based normalizer
    # normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)
    # kernel_hack = shogun_factory.create_empty_kernel(param)
    # kernel_hack.set_normalizer(normalizer_hack)
    #
    # # append current kernel to CombinedKernel
    # combined_kernel.append_kernel(kernel_hack)
    #
    # # append features
    # combined_features.append_feature_obj(base_features)

    ##################################################
    # init combined kernel

    combined_kernel.init(combined_features, combined_features)
    #combined_kernel.precompute_subkernels()
    self.additional_information["mkl weights before"] = combined_kernel.get_subkernel_weights()

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None
    print "using MKL:", (param.flags["mkl_q"] >= 1.0)

    if param.flags["mkl_q"] >= 1.0:
        svm = MKLClassification()
        svm.set_mkl_norm(param.flags["mkl_q"])
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    svm.set_epsilon(0.03)

    # set cost
    if param.flags["normalize_cost"]:
        norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
        svm.set_C(norm_c_neg, norm_c_pos)
    else:
        svm.set_C(param.cost, param.cost)

    svm.train()

    print "subkernel weights (after):", combined_kernel.get_subkernel_weights()

    ########################################################
    print "svm objective:"
    print svm.get_objective()
    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()
    ########################################################

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm, param)

    return svms
def mkl_binclass_modular(train_data, test_data, train_labels, test_labels, d1, d2):
    # imports needed to make this snippet self-contained
    import numpy
    from shogun.Features import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun.Kernel import CombinedKernel, CustomKernel, GaussianKernel
    from shogun.Classifier import MKLClassification

    # create some Gaussian train/test matrix
    tfeats = RealFeatures(train_data)
    tkernel = GaussianKernel(128, d1)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(test_data)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(train_data))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(GaussianKernel(128, d2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(train_labels)
    mkl = MKLClassification()

    # disable interleaved optimization (it requires SVMLight)
    mkl.set_interleaved_optimization_enabled(False)

    # which norm to use for MKL
    mkl.set_mkl_norm(2)

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()

    # test: create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(test_data))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(GaussianKernel(128, d2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    output = mkl.apply().get_labels()
    output = numpy.array([1.0 if i > 0 else -1.0 for i in output])
    accu = len(numpy.where(output == test_labels)[0]) / float(len(output))
    return accu
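# Hedged driver sketch with random stand-in data (shapes and kernel widths are
# illustrative assumptions):
import numpy
X_tr = numpy.random.randn(2, 100)
X_te = numpy.random.randn(2, 50)
y_tr = numpy.where(numpy.random.randn(100) > 0, 1.0, -1.0)
y_te = numpy.where(numpy.random.randn(50) > 0, 1.0, -1.0)
print "accuracy:", mkl_binclass_modular(X_tr, X_te, y_tr, y_te, 2.0, 0.5)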
import numpy as np
# shogun imports are assumed at module level, e.g.:
#from modshogun import RealFeatures, BinaryLabels, GaussianKernel, CombinedKernel, LibSVM

train_data = np.load('/home/matt/Documents/TechnicalProject/DeepLearningGenomicMed/Python/MultitaskLearn/train.npy')
test_data = np.load('/home/matt/Documents/TechnicalProject/DeepLearningGenomicMed/Python/MultitaskLearn/test.npy')

train_feats = train_data[1, :, :-1]
test_feats = test_data[1, :, :-1]
train_label = train_data[1, :, -1]
test_label = test_data[1, :, -1]

features_train = RealFeatures(train_feats.T)
features_test = RealFeatures(test_feats.T)
labels_train = BinaryLabels(train_label.T)
labels_test = BinaryLabels(test_label.T)

epsilon = 0.001
C = 1.0

combined_kernel = CombinedKernel()
#gauss_kernel_1 = GaussianKernel(features_train, features_train, 15)
gauss_kernel_1 = GaussianKernel(features_train, features_train, 1.0)
gauss_kernel_2 = GaussianKernel(features_train, features_train, 2.0)
combined_kernel.append_kernel(gauss_kernel_1)
combined_kernel.append_kernel(gauss_kernel_2)
combined_kernel.init(features_train, features_train)

#svm = LibSVM(C, gauss_kernel, labels_train)
#svm = LibSVM(C, combined_kernel, labels_train)
#svm.set_epsilon(epsilon)
class SignalSensor(object):
    """
    A collection of sensors
    """
    def __init__(self):
        self.sensors = list()
        self.kernel = CombinedKernel()
        self.svs = CombinedFeatures()
        self.svm = None
        self.window = (+100000, -1000000)

    def from_file(self, file):
        sys.stderr.write('loading model file')
        l = file.readline()
        if l != '%arts version: 1.0\n':
            sys.stderr.write("\nfile not an arts definition file\n")
            return None

        bias = None
        alphas = None
        num_kernels = None

        while l:
            # skip comment or empty line
            if not (l.startswith('%') or l.startswith('\n')):
                if bias is None:
                    bias = parse_float(l, 'b')
                if alphas is None:
                    alphas = parse_vector(l, file, 'alphas')
                if num_kernels is None:
                    num_kernels = parse_int(l, 'num_kernels')

                if num_kernels is not None and bias is not None and alphas is not None:
                    for i in xrange(num_kernels):
                        s = Sensor()
                        (k, f) = s.from_file(file, i + 1)
                        k.io.enable_progress()
                        self.window = (min(self.window[0], s.window[0]),
                                       max(self.window[1], s.window[2]))
                        self.sensors.append(s)
                        self.kernel.append_kernel(k)
                        self.svs.append_feature_obj(f)

                    self.kernel.init(self.svs, self.svs)
                    self.svm = SVM(self.kernel, alphas,
                                   numpy.arange(len(alphas), dtype=numpy.int32), bias)
                    self.svm.io.set_target_to_stderr()
                    self.svm.io.enable_progress()
                    self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
                    sys.stderr.write('done\n')
                    return

            l = file.readline()

        sys.stderr.write('error loading model file\n')

    def predict(self, seq, chunk_size=int(10e6)):
        """
        predicts on whole contig, splits up sequence in chunks of size chunk_size
        """
        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert(num_chunks > 0)
        sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)
        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):
            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))
            assert(start < stop)
            chunk = seq[start:stop]

            assert(len(self.sensors) > 0)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            lab_out = self.svm.apply()

            # work around problem with get_labels()
            tmp_out = [lab_out.get_label(idx) for idx in range(0, lab_out.get_num_labels())]
            assert(len(tmp_out) > 0)
            out.extend(tmp_out)
            print "len out", len(out)

            # increment chunk
            start = stop
            stop = min(stop + chunk_size, seq_len)

        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r
        assert(len(ret) == len(seq))
        return ret
# train_data = np.load('/home/matt/Documents/TechnicalProject/DeepLearningGenomicMed/Python/MultitaskLearn/train.npy')
# test_data = np.load('/home/matt/Documents/TechnicalProject/DeepLearningGenomicMed/Python/MultitaskLearn/test.npy')
#
# train_feats = train_data[1, :, :-1]
# test_feats = test_data[1, :, :-1]
# train_label = train_data[1, :, -1]
# test_label = test_data[1, :, -1]
#
# features_train = RealFeatures(train_feats.T)
# features_test = RealFeatures(test_feats.T)
# labels_train = BinaryLabels(train_label.T)
# labels_test = BinaryLabels(test_label.T)

epsilon = 0.001
C = 100000

combined_kernel = CombinedKernel()
#gauss_kernel_1 = GaussianKernel(features_train, features_train, 15)
gauss_kernel_1 = GaussianKernel(features_train, features_train, 10)
gauss_kernel_2 = GaussianKernel(features_train, features_train, 1.0)
combined_kernel.append_kernel(gauss_kernel_1)
combined_kernel.append_kernel(gauss_kernel_2)
combined_kernel.init(features_train, features_train)

libsvm = LibSVM()
svm = MKLClassification(libsvm)
svm.set_interleaved_optimization_enabled(False)
svm.set_kernel(combined_kernel)
svm.set_labels(labels_train)
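# Hedged completion sketch in the style of the other snippets here (assumes the
# commented-out data loading above has been run so features_train/features_test
# and labels_train exist):
svm.set_C(C, C)
svm.set_epsilon(epsilon)
svm.train()
# re-init the combined kernel between train and test features before applying
combined_kernel.init(features_train, features_test)
svm.set_kernel(combined_kernel)
predicted = svm.apply().get_labels()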
feats_train.append_feature_obj(RealFeatures(data_2))
feats_train.append_feature_obj(RealFeatures(data_3))
feats_train.append_feature_obj(RealFeatures(data_4))
feats_train.append_feature_obj(RealFeatures(data_5))

#feats_test = CombinedFeatures()
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))
#feats_test.append_feature_obj(RealFeatures(testdata_real))

labels = Labels(trainlab)

# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(GaussianKernel(10, 2.0))
kernel.append_kernel(GaussianKernel(10, 0.25))
kernel.append_kernel(GaussianKernel(10, 0.062))
kernel.append_kernel(GaussianKernel(10, 8.0))
kernel.append_kernel(GaussianKernel(10, 10.0))
kernel.init(feats_train, feats_train)

# create a classifier
classifier = MKLClassification(LibSVM())
classifier.set_interleaved_optimization_enabled(False)
classifier.set_kernel(kernel)
classifier.set_labels(labels)
classifier.set_C(1, 1)

param_tree_root = ModelSelectionParameters()
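# Hedged continuation sketch: a typical shogun model-selection tree over the two
# SVM costs (value ranges are illustrative assumptions; R_EXP comes from
# shogun's ModelSelection module):
from shogun.ModelSelection import R_EXP
c1 = ModelSelectionParameters("C1")
c1.build_values(-1.0, 1.0, R_EXP)
param_tree_root.append_child(c1)
c2 = ModelSelectionParameters("C2")
c2.build_values(-1.0, 1.0, R_EXP)
param_tree_root.append_child(c2)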
def training_run(options):
    """Conduct a training run and return a trained SVM kernel"""
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH, options.window_width, options.replace)

    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()

    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()

    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])

    # creating kernel objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    motifs = pmotifs.keys()
    motifs.sort()

    # adding kmer kernels
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features,
                                                        wds_kparams['degree'])
        wds_kernel.set_shifts(wds_kparams['shift'] * ones(wds_kparams['seqlength'], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)

    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)
    rbf_kernel = GaussianKernel(position_features, position_features,
                                kirmes_ini.RBF_KERNEL_PARAMETERS['width'])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)

    # kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)
    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)

    # training
    svm.train()

    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)
    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array, motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs, options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
class SignalSensor(object):
    """
    A collection of sensors
    """
    def __init__(self):
        self.sensors = list()
        self.kernel = CombinedKernel()
        self.svs = CombinedFeatures()
        self.svm = None
        self.window = (+100000, -1000000)

    def from_file(self, file):
        sys.stderr.write('loading model file')
        l = file.readline()
        if l != '%arts version: 1.0\n':
            sys.stderr.write("\nfile not an arts definition file\n")
            return None

        bias = None
        alphas = None
        num_kernels = None

        while l:
            # skip comment or empty line
            if not (l.startswith('%') or l.startswith('\n')):
                if bias is None:
                    bias = parse_float(l, 'b')
                if alphas is None:
                    alphas = parse_vector(l, file, 'alphas')
                if num_kernels is None:
                    num_kernels = parse_int(l, 'num_kernels')

                if num_kernels is not None and bias is not None and alphas is not None:
                    for i in xrange(num_kernels):
                        s = Sensor()
                        (k, f) = s.from_file(file, i + 1)
                        k.io.enable_progress()
                        self.window = (min(self.window[0], s.window[0]),
                                       max(self.window[1], s.window[2]))
                        self.sensors.append(s)
                        self.kernel.append_kernel(k)
                        self.svs.append_feature_obj(f)

                    self.kernel.init(self.svs, self.svs)
                    self.svm = KernelMachine(self.kernel, alphas,
                                             numpy.arange(len(alphas), dtype=numpy.int32),
                                             bias)
                    self.svm.io.set_target_to_stderr()
                    self.svm.io.enable_progress()
                    self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
                    sys.stderr.write('done\n')
                    return

            l = file.readline()

        sys.stderr.write('error loading model file\n')

    def predict(self, seq, chunk_size=int(10e6)):
        """
        predicts on whole contig, splits up sequence in chunks of size chunk_size
        """
        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert(num_chunks > 0)
        sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)
        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):
            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))
            assert(start < stop)
            chunk = seq[start:stop]

            assert(len(self.sensors) > 0)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            self.svm.set_kernel(self.kernel)
            lab_out = self.svm.apply().get_values()
            assert(len(lab_out) > 0)
            out.extend(lab_out)

            # increment chunk
            start = stop
            stop = min(stop + chunk_size, seq_len)

        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r
        assert(len(ret) == len(seq))
        return ret
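# Hedged usage sketch for SignalSensor (model file name and sequence are
# hypothetical; Sensor and the parse_* helpers come from the surrounding project):
#sensor = SignalSensor()
#sensor.from_file(open('arts_model.dat'))
#scores = sensor.predict('ACGT' * 2500)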
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # dict to save additional information for later analysis
    self.additional_information = {}

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_empty_kernel(param)
    lab = shogun_factory.create_labels(data.labels)

    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
    f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

    num_lines = int(f.readline().strip())
    task_distances = numpy.zeros((num_lines, num_lines))
    name_to_id = {}
    for (i, line) in enumerate(f):
        tokens = line.strip().split("\t")
        name = str(tokens[0])
        name_to_id[name] = i
        entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
        assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (len(entry), num_lines)
        task_distances[i, :] = entry

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print "distances ", tmp_distances.shape

    # normalize distances
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros((data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():
            # convert similarity with simple transformation
            similarity = param.base_similarity - task_distances[name_to_id[task_name_lhs],
                                                                name_to_id[task_name_rhs]]
            normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                           data.name_to_id(task_name_rhs),
                                           similarity)
            # save for later
            similarities[data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    #base_wdk.init_normalizer()

    combined_features.append_feature_obj(base_features)
    combined_kernel.append_kernel(base_wdk)

    ##################################################
    # intra-domain blocks

    intra_block_vec = PairiiVec()
    for task_id in data.get_task_ids():
        intra_block_vec.push_back(Pairii(task_id, task_id))

    # create mask-based normalizer
    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    kernel = shogun_factory.create_empty_kernel(param)
    kernel.set_normalizer(normalizer)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel)

    # append features
    combined_features.append_feature_obj(base_features)

    # set mixing factor (used if MKL is OFF)
    assert(param.base_similarity <= 1)
    assert(param.base_similarity >= 0)
    combined_kernel.set_subkernel_weights([param.base_similarity, 1 - param.base_similarity])

    combined_kernel.init(combined_features, combined_features)

    svm = None
    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:
        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX)  #ST_GLPK) #DIRECT) #NEWTON) #auto
        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    print "WARNING: custom epsilon set"
    svm.set_epsilon(0.05)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities
    self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():
        task_num = data.name_to_id(task_name)
        # save svm and task_num
        svms[task_name] = (task_num, combined_kernel, svm)

    return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    """

    # dict to save additional information for later analysis
    self.additional_information = {}

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ##################################################
    # define pockets
    ##################################################

    pockets = [0] * 9
    pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34]
    pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31]
    pockets[2] = [11, 20, 21, 22, 29, 31]
    pockets[3] = [8, 30, 31, 32]
    pockets[4] = [10, 11, 30]
    pockets[5] = [10, 11, 12, 13, 20, 29]
    pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29]
    pockets[7] = [12, 14, 15, 26]
    pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26]

    #new_pockets = []
    # merge neighboring pockets
    #for i in range(8):
    #    new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1]))))
    #pockets = new_pockets

    ########################################################
    print "creating a kernel:"
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    ##################################################
    # intra-domain blocks (kept for reference, disabled)
    #
    # intra_block_vec = PairiiVec()
    # for task_id in data.get_task_ids():
    #     intra_block_vec.push_back(Pairii(task_id, task_id))
    #
    # # create mask-based normalizer
    # normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    # kernel = shogun_factory.create_empty_kernel(param)
    # kernel.set_normalizer(normalizer)
    #
    # # append current kernel to CombinedKernel
    # combined_kernel.append_kernel(kernel)
    #
    # # append features
    # combined_features.append_feature_obj(base_features)
    #
    # print "------"
    #
    ##################################################
    # all blocks (kept for reference, disabled)
    #
    # all_block_vec = PairiiVec()
    # for task_id_1 in data.get_task_ids():
    #     for task_id_2 in data.get_task_ids():
    #         all_block_vec.push_back(Pairii(task_id_1, task_id_2))
    #
    # # create mask-based normalizer
    # normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
    # kernel_all = shogun_factory.create_empty_kernel(param)
    # kernel_all.set_normalizer(normalizer_all)
    #
    # # append current kernel to CombinedKernel
    # combined_kernel.append_kernel(kernel_all)
    #
    # # append features
    # combined_features.append_feature_obj(base_features)

    ##################################################
    # add one kernel per similarity position

    # init seq handler
    pseudoseqs = SequencesHandler()

    for pocket in pockets:
        print "creating normalizer"
        normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

        print "processing pocket", pocket

        # set similarity
        for task_name_lhs in data.get_task_names():
            for task_name_rhs in data.get_task_names():
                similarity = 0.0
                for pseudo_seq_pos in pocket:
                    similarity += float(pseudoseqs.get_similarity(task_name_lhs,
                                                                  task_name_rhs,
                                                                  pseudo_seq_pos - 1))
                # normalize
                similarity = similarity / float(len(pocket))
                print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity)
                normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                               data.name_to_id(task_name_rhs),
                                               similarity)

        print "creating empty kernel"
        kernel_pos = shogun_factory.create_empty_kernel(param)

        print "setting normalizer"
        kernel_pos.set_normalizer(normalizer)

        print "appending kernel"
        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel_pos)

        print "appending features"
        # append features
        combined_features.append_feature_obj(base_features)

    print "done constructing combined kernel"

    ##################################################
    # init combined kernel

    # init weights
    #combined_kernel.set_subkernel_weights([1.0/2.85]*combined_kernel.get_num_subkernels())

    combined_kernel.init(combined_features, combined_features)

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None
    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:
        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX)  #ST_GLPK) #DIRECT) #NEWTON) #auto
        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    #svm.io.set_loglevel(shogun.Classifier.MSG_INFO)
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
    svm.parallel.set_num_threads(num_threads)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)
    #print "WARNING: custom epsilon set"
    #svm.set_epsilon(0.05)

    # normalize cost
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional info
    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights()

    print self.additional_information

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm)

    return svms