def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list
    (removing reverse complement)

    Arguments:
    subfeats_list -- list of sub-feature objects
    options -- object containing option data (kmerlen, kmerlen2)

    Return:
    CombinedKernel of CommWord(Ulong)StringKernel, one subkernel per
    k-mer length in [kmerlen, kmerlen2], uniformly weighted
    """
    kmin = options.kmerlen
    kmax = options.kmerlen2

    # wrap all sub-feature objects into one CombinedFeatures
    combined_feats = CombinedFeatures()
    for sub in subfeats_list:
        combined_feats.append_feature_obj(sub)

    # one subkernel per k-mer length; the word kernel covers k <= 8,
    # longer k-mers need the ulong kernel
    combined_kernel = CombinedKernel()
    for k in xrange(kmin, kmax + 1):
        if k <= 8:
            combined_kernel.append_kernel(CommWordStringKernel(10, False))
        else:
            combined_kernel.append_kernel(CommUlongStringKernel(10, False))

    combined_kernel.init(combined_feats, combined_feats)

    # uniform weight over all appended subkernels
    num_subkernels = kmax - kmin + 1
    uniform = numpy.array([1.0 / num_subkernels] * num_subkernels,
                          numpy.dtype('float64'))
    combined_kernel.set_subkernel_weights(uniform)
    return combined_kernel
def evaluation_cross_validation_multiclass_storage(traindat=traindat, label_traindat=label_traindat):
    """Run 3x5-fold stratified cross-validation of a multiclass MKL
    classifier and collect per-fold multiclass results (ROC curves and a
    binary F1 evaluation) via CrossValidationMulticlassStorage.

    Returns (roc, auc) for run 0, fold 0, class 0.
    """
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from shogun.Evaluation import MulticlassAccuracy, F1Measure
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import MulticlassLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import MKLMulticlass
    from shogun.Mathematics import Statistics, MSG_DEBUG

    # training data, combined features all on same data
    features=RealFeatures(traindat)
    comb_features=CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels=MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel=CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create mkl using libsvm, due to a mem-bug, interleaved is not possible
    svm=MKLMulticlass(1.0,kernel,labels);
    svm.set_kernel(kernel);

    # splitting strategy for 5 fold cross-validation (for classification it's
    # better to use "StratifiedCrossValidation", but the standard
    # "StratifiedCrossValidationSplitting" is also available)
    splitting_strategy=StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium=MulticlassAccuracy()

    # cross-validation instance
    cross_validation=CrossValidation(svm, comb_features, labels, splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage=CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage=CrossValidationMulticlassStorage()
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result=cross_validation.evaluate()

    # fetch stored results of run 0, fold 0, class 0
    roc_0_0_0 = multiclass_storage.get_fold_ROC(0,0,0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0,0,0,0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
def create_combined_wd_features(instances, feat_type):
    """ creates a combined wd feature object

    One weighted-degree feature object is built per column of the
    instances and appended to a CombinedFeatures container.
    """
    # assumes every instance has the same number of columns -- TODO confirm
    num_features = len(instances[0])
    # contruct combined features
    feat = CombinedFeatures()
    for idx in range(num_features):
        # cut column idx
        data = [instance[idx] for instance in instances]
        # warn (but proceed) if sequences in this column differ in length
        seq_len = len(data[0])
        for seq in data:
            if len(seq) != seq_len:
                print "warning, seq lengths differ", len(seq), seq_len, "in", idx, "num_feat", num_features
        tmp_feat = get_wd_features(data, feat_type)
        feat.append_feature_obj(tmp_feat)
    return feat
def create_combined_wd_features(instances, feat_type):
    """ creates a combined wd feature object

    Duplicate of the variant above: one weighted-degree feature object per
    column of the instances, collected into a CombinedFeatures container.
    """
    # assumes every instance has the same number of columns -- TODO confirm
    num_features = len(instances[0])
    # contruct combined features
    feat = CombinedFeatures()
    for idx in range(num_features):
        # cut column idx
        data = [instance[idx] for instance in instances]
        # warn (but proceed) if sequences in this column differ in length
        seq_len = len(data[0])
        for seq in data:
            if len(seq) != seq_len:
                print "warning, seq lengths differ", len(seq), seq_len, "in", idx, "num_feat", num_features
        tmp_feat = get_wd_features(data, feat_type)
        feat.append_feature_obj(tmp_feat)
    return feat
def predict(self, seq, chunk_size=int(10e6)):
    """ predicts on whole contig, splits up sequence in chunks of size chunk_size

    Returns a list the same length as seq: SVM outputs for the interior,
    padded at both ends with the sentinel value -42 for positions the
    window does not cover.
    """
    seq_len = len(seq)
    num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
    assert (num_chunks > 0)
    sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))
    start = 0
    stop = min(chunk_size, seq_len)
    out = []
    # iterate over chunks
    for chunk_idx in range(num_chunks):
        sys.stderr.write("processing chunk #%i\n" % (chunk_idx))
        assert (start < stop)
        chunk = seq[start:stop]
        assert (len(self.sensors) > 0)
        # one test-feature object per sensor, combined for the kernel
        tf = CombinedFeatures()
        for i in xrange(len(self.sensors)):
            f = self.sensors[i].get_test_features(chunk, self.window)
            tf.append_feature_obj(f)
        sys.stderr.write("initialising kernel...")
        self.kernel.init(self.svs, tf)
        sys.stderr.write("..done\n")
        self.svm.set_kernel(self.kernel)
        lab_out = self.svm.apply()
        # work around problem with get_labels()
        tmp_out = [
            lab_out.get_label(idx)
            for idx in range(0, lab_out.get_num_labels())
        ]
        assert (len(tmp_out) > 0)
        out.extend(tmp_out)
        print "len out", len(out)
        # increment chunk
        start = stop
        stop = min(stop + chunk_size, seq_len)
    # pad with sentinel -42 on both flanks (window half-widths)
    l = (-self.window[0]) * [-42]
    r = self.window[1] * [-42]
    # concatenate
    ret = l + out + r
    assert (len(ret) == len(seq))
    return ret
def evaluate(options, svm, kernel, features, motifs):
    """Evaluate examples using a trained kernel

    Builds query features (one WD feature object per motif plus RBF
    features on motif positions), initialises the kernel against the
    training features, classifies, and returns a tab-separated report
    string of gene name / class / SVM output.
    """
    query = MotifFinder(finder_settings=MotifFinderSettings(kirmes_ini.MOTIF_LENGTH, options.window_width))
    query.setFastaFile(options.query)
    query.setMotifs(options.qgff)
    qmotifs, qpositions = query.getResults()
    feats_query = CombinedFeatures()
    wds_svm = EasySVM.EasySVM(kirmes_ini.WDS_KERNEL_PARAMETERS)
    # every trained motif must be present in the query results
    try:
        assert set(qmotifs.keys()).issuperset(set(motifs))
    except AssertionError:
        print "The motif positions in the query sequence are incomplete, there are no positions for:"
        print set(motifs).difference(qmotifs.keys())
        raise
    for motif in motifs:
        feats_query.append_feature_obj(wds_svm.createFeatures(qmotifs[motif]))
    # positions as examples-in-columns matrix for the RBF features
    query_positions = array(qpositions, dtype=float64)
    query_positions = query_positions.T
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    feats_query.append_feature_obj(rbf_svm.createFeatures(query_positions))
    kernel.init(features, feats_query)
    out = svm.classify().get_labels()
    qgenes = query.getGenes()
    ret_str = ""
    print "#example\toutput\tsplit"
    for i in xrange(len(out)):
        # sign of the SVM output decides the class
        if out[i] >= 0:
            classif = "\tpositive\t"
        else:
            classif = "\tnegative\t"
        ret_str += qgenes[i] + classif + str(out[i]) + "\n"
        print str(i) + "\t" + str(out[i]) + "\t0"
    return ret_str
def _predict(self, predictor, examples, task_id):
    """
    make prediction on examples using trained predictor

    @param predictor: trained predictor
    @type predictor: dict<str, tuple<SVM, int> >
    @param examples: list of examples
    @type examples: list<str>
    @param task_id: task identifier
    @type task_id: str
    """
    (task_num, combined_kernel, svm) = predictor

    # shogun data: the same base features serve both subkernels
    base_feat = shogun_factory.create_features(examples)
    feat = CombinedFeatures()
    feat.append_feature_obj(base_feat)
    feat.append_feature_obj(base_feat)

    # update normalizers: point the rhs task vector of both subkernel
    # normalizers at this task (SWIG helper casts to the concrete type;
    # set_task_vector_rhs mutates the underlying normalizer object)
    normalizer = combined_kernel.get_kernel(0).get_normalizer()
    normalizer = KernelNormalizerToMultitaskKernelNormalizer(normalizer)
    normalizer.set_task_vector_rhs([task_num] * len(examples))

    normalizer_dirac = combined_kernel.get_kernel(1).get_normalizer()
    normalizer_dirac = KernelNormalizerToMultitaskKernelMaskPairNormalizer(
        normalizer_dirac)
    normalizer_dirac.set_task_vector_rhs([task_num] * len(examples))

    # predict
    out = svm.classify(feat).get_labels()
    return out
def evaluate(options, svm, kernel, features, motifs):
    """Evaluate examples using a trained kernel

    Duplicate of the variant above: builds query features (WD per motif +
    RBF on positions), classifies, and returns a tab-separated report.
    """
    query = MotifFinder(finder_settings=MotifFinderSettings(
        kirmes_ini.MOTIF_LENGTH, options.window_width))
    query.setFastaFile(options.query)
    query.setMotifs(options.qgff)
    qmotifs, qpositions = query.getResults()
    feats_query = CombinedFeatures()
    wds_svm = EasySVM.EasySVM(kirmes_ini.WDS_KERNEL_PARAMETERS)
    # every trained motif must be present in the query results
    try:
        assert set(qmotifs.keys()).issuperset(set(motifs))
    except AssertionError:
        print "The motif positions in the query sequence are incomplete, there are no positions for:"
        print set(motifs).difference(qmotifs.keys())
        raise
    for motif in motifs:
        feats_query.append_feature_obj(wds_svm.createFeatures(qmotifs[motif]))
    # positions as examples-in-columns matrix for the RBF features
    query_positions = array(qpositions, dtype=float64)
    query_positions = query_positions.T
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    feats_query.append_feature_obj(rbf_svm.createFeatures(query_positions))
    kernel.init(features, feats_query)
    out = svm.classify().get_labels()
    qgenes = query.getGenes()
    ret_str = ""
    print "#example\toutput\tsplit"
    for i in xrange(len(out)):
        # sign of the SVM output decides the class
        if out[i] >= 0:
            classif = "\tpositive\t"
        else:
            classif = "\tnegative\t"
        ret_str += qgenes[i] + classif + str(out[i]) + "\n"
        print str(i) + "\t" + str(out[i]) + "\t0"
    return ret_str
def mkl_binclass_modular (train_data, testdata, train_labels, test_labels, d1, d2):
    """Train a 2-norm MKL binary classifier on a combined kernel (custom
    precomputed Gaussian matrix + Gaussian kernel) and return test accuracy.

    Arguments:
    train_data, testdata -- feature matrices (dim x num_examples)
    train_labels, test_labels -- +1/-1 label vectors
    d1 -- width of the Gaussian used for the precomputed custom matrices
    d2 -- width of the Gaussian appended to the combined kernel

    Return: fraction of correctly classified test examples
    """
    # create some Gaussian train/test matrix
    tfeats = RealFeatures(train_data)
    tkernel = GaussianKernel(128, d1)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()
    # BUGFIX: was `test_data`, an undefined name (parameter is `testdata`)
    pfeats = RealFeatures(testdata)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(train_data))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(GaussianKernel(128, d2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = Labels(train_labels)
    mkl = MKLClassification()
    # not to use svmlight
    mkl.set_interleaved_optimization_enabled(0)
    # which norm to use for MKL
    mkl.set_mkl_norm(2)
    # set cost (neg, pos)
    mkl.set_C(1, 1)
    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)
    # train
    mkl.train()

    # test
    # create combined test features
    feats_pred = CombinedFeatures()
    # BUGFIX: was `test_data` here as well
    feats_pred.append_feature_obj(RealFeatures(testdata))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(GaussianKernel(128, d2))
    kernel.init(feats_train, feats_pred)

    # and classify
    mkl.set_kernel(kernel)
    output = mkl.apply().get_labels()
    # threshold to +/-1; convert to ndarray so the elementwise comparison
    # with test_labels is explicit rather than relying on numpy's
    # right-operand coercion of a plain list
    output = array([1.0 if i > 0 else -1.0 for i in output])
    accu = len(where(output == test_labels)[0]) / float(len(output))
    return accu
def predict(self, seq, chunk_size = int(10e6)):
    """ predicts on whole contig, splits up sequence in chunks of size chunk_size

    Duplicate of the variant above. Returns a list the same length as seq:
    SVM outputs for the interior, padded at both ends with the sentinel
    value -42 for positions the window does not cover.
    """
    seq_len = len(seq)
    num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
    assert(num_chunks > 0)
    sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))
    start = 0
    stop = min(chunk_size, seq_len)
    out = []
    # iterate over chunks
    for chunk_idx in range(num_chunks):
        sys.stderr.write("processing chunk #%i\n" % (chunk_idx))
        assert (start < stop)
        chunk = seq[start:stop]
        assert(len(self.sensors) > 0)
        # one test-feature object per sensor, combined for the kernel
        tf = CombinedFeatures()
        for i in xrange(len(self.sensors)):
            f = self.sensors[i].get_test_features(chunk, self.window)
            tf.append_feature_obj(f)
        sys.stderr.write("initialising kernel...")
        self.kernel.init(self.svs, tf)
        sys.stderr.write("..done\n")
        self.svm.set_kernel(self.kernel)
        lab_out = self.svm.apply()
        # work around problem with get_labels()
        tmp_out = [lab_out.get_label(idx) for idx in range(0, lab_out.get_num_labels())]
        assert(len(tmp_out) > 0)
        out.extend(tmp_out)
        print "len out", len(out)
        # increment chunk
        start = stop
        stop = min(stop+chunk_size, seq_len)
    # pad with sentinel -42 on both flanks (window half-widths)
    l = (-self.window[0]) * [-42]
    r = self.window[1] * [-42]
    # concatenate
    ret = l + out + r
    assert(len(ret) == len(seq))
    return ret
def training_run(options):
    """Conduct a training run and return a trained SVM kernel

    Builds one weighted-degree-position kernel per motif plus one RBF
    kernel on motif positions, combines them, trains a LibSVM, and emits
    the requested HTML reports (contrib / poims / query evaluation).
    """
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH, options.window_width, options.replace)
    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()
    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()
    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    # example counts taken from the first motif's lists
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])
    # Creating Kernel Objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    # sort motif names for a deterministic kernel order
    motifs = pmotifs.keys()
    motifs.sort()
    # Adding Kmer Kernels: one WD-position kernel per motif
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features, wds_kparams["degree"])
        wds_kernel.set_shifts(wds_kparams["shift"] * ones(wds_kparams["seqlength"], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)
    # RBF kernel on the motif positions (examples in columns)
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)
    rbf_kernel = GaussianKernel(position_features, position_features, kirmes_ini.RBF_KERNEL_PARAMETERS["width"])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)
    # Kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)
    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)
    # Training
    svm.train()
    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)
    # assemble the requested HTML report sections
    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array, motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs, options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
def evaluation_cross_validation_mkl_weight_storage(traindat=traindat, label_traindat=label_traindat):
    """Run 3x5-fold stratified cross-validation of a binary MKL classifier
    and collect the learned MKL kernel weights of each fold via
    CrossValidationMKLStorage.

    NOTE(review): the collected weights are fetched into a local but never
    returned -- presumably a demo whose result is inspected interactively;
    confirm before relying on a return value.
    """
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage
    from shogun.Evaluation import ContingencyTableEvaluation, ACCURACY
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import BinaryLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import LibSVM, MKLClassification
    from shogun.Mathematics import Statistics

    # training data, combined features all on same data
    features=RealFeatures(traindat)
    comb_features=CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels=BinaryLabels(label_traindat)

    # kernel, different Gaussians combined
    kernel=CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create mkl using libsvm, due to a mem-bug, interleaved is not possible
    svm=MKLClassification(LibSVM());
    svm.set_interleaved_optimization_enabled(False);
    svm.set_kernel(kernel);

    # splitting strategy for 5 fold cross-validation (for classification it's
    # better to use "StratifiedCrossValidation", but the standard
    # "StratifiedCrossValidationSplitting" is also available)
    splitting_strategy=StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium=ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation=CrossValidation(svm, comb_features, labels, splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    mkl_storage=CrossValidationMKLStorage()
    cross_validation.add_cross_validation_output(mkl_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result=cross_validation.evaluate()

    # print mkl weights
    weights=mkl_storage.get_mkl_weights()
def statistics_linear_time_mmd_kernel_choice():
    """Demo: select the best Gaussian kernel width for the linear-time MMD
    two-sample test by optimizing MKL-style kernel weights over a range of
    candidate widths (this variant draws data from gen_data).
    """
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

    # note that the linear time statistic is designed for much larger datasets
    n=50000
    dim=5
    difference=2

    # data is standard normal distributed. only one dimension of Y has a mean
    # shift of difference
    (X,Y)=gen_data.create_mean_data(n,dim,difference)

    # concatenate since MMD class takes data as one feature object
    # (it is possible to give two, but then data is copied)
    Z=concatenate((X,Y), axis=1)
    print "dimension means of X", [mean(x) for x in X]
    print "dimension means of Y", [mean(x) for x in Y]

    # create kernels/features to choose from
    # here: just a bunch of Gaussian Kernels with different widths
    # real sigmas are 2^-5, ..., 2^10
    sigmas=array([pow(2,x) for x in range(-5,10)])
    # shogun has a different parametrization of the Gaussian kernel
    shogun_sigmas=array([x*x*2 for x in sigmas])

    # We will use multiple kernels
    kernel=CombinedKernel()
    # two separate feature objects here, could also be one with appended data
    features=CombinedFeatures()

    # all kernels work on same features
    for i in range(len(sigmas)):
        kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
        features.append_feature_obj(RealFeatures(Z))

    mmd=LinearTimeMMD(kernel,features, n)

    print "start learning kernel weights"
    mmd.set_opt_regularization_eps(10E-5)
    mmd.set_opt_low_cut(10E-5)
    mmd.set_opt_max_iterations(1000)
    mmd.set_opt_epsilon(10E-7)
    mmd.optimize_kernel_weights()
    weights=kernel.get_subkernel_weights()
    print "learned weights:", weights
    #pyplot.plot(array(range(len(sigmas))), weights)
    #pyplot.show()
    print "index of max weight", weights.argmax()
def statistics_linear_time_mmd_kernel_choice(): from shogun.Features import RealFeatures, CombinedFeatures from shogun.Features import MeanShiftRealDataGenerator from shogun.Kernel import GaussianKernel, CombinedKernel from shogun.Statistics import LinearTimeMMD from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN # note that the linear time statistic is designed for much larger datasets n = 50000 dim = 5 difference = 2 # use data generator class to produce example data # in pratice, this generate data function could be replaced by a method # that obtains data from a stream data = DataGenerator.generate_mean_data(n, dim, difference) print "dimension means of X", mean(data.T[0:n].T) print "dimension means of Y", mean(data.T[n:2 * n + 1].T) # create kernels/features to choose from # here: just a bunch of Gaussian Kernels with different widths # real sigmas are 2^-5, ..., 2^10 sigmas = array([pow(2, x) for x in range(-5, 10)]) # shogun has a different parametrization of the Gaussian kernel shogun_sigmas = array([x * x * 2 for x in sigmas]) # We will use multiple kernels kernel = CombinedKernel() # two separate feature objects here, could also be one with appended data features = CombinedFeatures() # all kernels work on same features for i in range(len(sigmas)): kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i])) features.append_feature_obj(RealFeatures(data)) mmd = LinearTimeMMD(kernel, features, n) print "start learning kernel weights" mmd.set_opt_regularization_eps(10E-5) mmd.set_opt_low_cut(10E-5) mmd.set_opt_max_iterations(1000) mmd.set_opt_epsilon(10E-7) mmd.optimize_kernel_weights() weights = kernel.get_subkernel_weights() print "learned weights:", weights #pyplot.plot(array(range(len(sigmas))), weights) #pyplot.show() print "index of max weight", weights.argmax()
def statistics_linear_time_mmd_kernel_choice():
    """Demo: select the best Gaussian kernel width for the linear-time MMD
    two-sample test by optimizing MKL-style kernel weights over a range of
    candidate widths (this variant draws data from DataGenerator).
    """
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Features import DataGenerator
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Statistics import LinearTimeMMD
    from shogun.Statistics import BOOTSTRAP, MMD1_GAUSSIAN

    # note that the linear time statistic is designed for much larger datasets
    n=50000
    dim=5
    difference=2

    # use data generator class to produce example data
    # in pratice, this generate data function could be replaced by a method
    # that obtains data from a stream
    data=DataGenerator.generate_mean_data(n,dim,difference)

    print "dimension means of X", mean(data.T[0:n].T)
    print "dimension means of Y", mean(data.T[n:2*n+1].T)

    # create kernels/features to choose from
    # here: just a bunch of Gaussian Kernels with different widths
    # real sigmas are 2^-5, ..., 2^10
    sigmas=array([pow(2,x) for x in range(-5,10)])
    # shogun has a different parametrization of the Gaussian kernel
    shogun_sigmas=array([x*x*2 for x in sigmas])

    # We will use multiple kernels
    kernel=CombinedKernel()
    # two separate feature objects here, could also be one with appended data
    features=CombinedFeatures()

    # all kernels work on same features
    for i in range(len(sigmas)):
        kernel.append_kernel(GaussianKernel(10, shogun_sigmas[i]))
        features.append_feature_obj(RealFeatures(data))

    mmd=LinearTimeMMD(kernel,features, n)

    print "start learning kernel weights"
    mmd.set_opt_regularization_eps(10E-5)
    mmd.set_opt_low_cut(10E-5)
    mmd.set_opt_max_iterations(1000)
    mmd.set_opt_epsilon(10E-7)
    mmd.optimize_kernel_weights()
    weights=kernel.get_subkernel_weights()
    print "learned weights:", weights
    #pyplot.plot(array(range(len(sigmas))), weights)
    #pyplot.show()
    print "index of max weight", weights.argmax()
def init_weighted_spectrum_kernel(kern, subfeats_list_lhs, subfeats_list_rhs):
    """initialize weighted spectrum kernel (wrapper function)

    Wraps the lhs/rhs sub-feature lists into CombinedFeatures containers
    and initializes the given kernel on them.
    """
    def _combine(subfeat_objs):
        # collect a list of sub-feature objects into one CombinedFeatures
        combined = CombinedFeatures()
        for sf in subfeat_objs:
            combined.append_feature_obj(sf)
        return combined

    kern.init(_combine(subfeats_list_lhs), _combine(subfeats_list_rhs))
def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list
    (removing reverse complement)

    Arguments:
    subfeats_list -- list of sub-feature objects
    options -- object containing option data (kmerlen, kmerlen2)

    Return:
    CombinedKernel of CommWord(Ulong)StringKernel, one subkernel per
    k-mer length in [kmerlen, kmerlen2], uniformly weighted

    CLEANUP: removed dead code -- a Counter pass over every feature vector
    of every sub-feature object whose results (combine_kcount/number/klen)
    were never used and whose loop variable shadowed an outer ``i``; it
    cost O(num_vectors * vector_len) work per sub-feature for nothing.
    """
    kmerlen = options.kmerlen
    kmerlen2 = options.kmerlen2
    subkernels = 0
    kernel = CombinedKernel()
    feats = CombinedFeatures()

    for subfeats in subfeats_list:
        feats.append_feature_obj(subfeats)

    # one subkernel per k-mer length; word kernel up to k=8, ulong beyond
    for k in xrange(kmerlen, kmerlen2 + 1):
        if k <= 8:
            subkernel = CommWordStringKernel(10, False)
        else:
            subkernel = CommUlongStringKernel(10, False)
        kernel.append_kernel(subkernel)
        subkernels += 1

    kernel.init(feats, feats)

    # here the weight for each k-mer length is uniform, e.g. for 8
    # subkernels each weight is 0.125
    kernel.set_subkernel_weights(
        numpy.array([1 / float(subkernels)] * subkernels,
                    numpy.dtype('float64')))
    return kernel
def kernel_combined_custom_poly_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    """Train a LibSVM on a combined kernel (precomputed poly matrix as a
    CustomKernel + a PolyKernel), then rebuild the combined kernel for the
    test data and classify.

    Return: (train/test kernel matrix of the last-initialized kernel, kernel)
    """
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    # subkernel 1: precomputed degree-3 poly matrix wrapped as CustomKernel
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    # subkernel 2: degree-2 poly computed on the fly
    subkfeats_train = RealFeatures(fm_train_real)
    feats_train.append_feature_obj(subkfeats_train)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_train)

    labels = Labels(fm_label_twoclass)
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    # rebuild combined kernel against the test features
    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, pfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_test = RealFeatures(fm_test_real)
    feats_pred.append_feature_obj(subkfeats_test)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.classify()
    # NOTE(review): km_train is taken after the kernel was re-initialized
    # on (train, test) -- it is the train/test cross matrix, not the pure
    # training matrix; the name appears historical
    km_train = kernel.get_kernel_matrix()
    return km_train, kernel
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    """MKL regression demo combining a linear kernel on real features with
    a spectrum (CommWordString) kernel on string features.

    NOTE(review): sfeats1/skernel1 and ffkernel are built but never used
    (left over from the disabled combinations below); the function only
    configures the MKL object -- it neither trains nor returns anything.
    """
    sc1 = StringCharFeatures(strings1, RAWBYTE)
    sc2 = StringCharFeatures(strings2, RAWBYTE)

    # 2-gram word features from the raw char strings
    sfeats1 = StringWordFeatures(RAWBYTE)
    sfeats1.obtain_from_char(sc1, 0, 2, 0, False)
    skernel1 = CommWordStringKernel(10, False)
    skernel1.init(sfeats1, sfeats1)
    sfeats2 = StringWordFeatures(RAWBYTE)
    sfeats2.obtain_from_char(sc2, 0, 2, 0, False)
    skernel2 = CommWordStringKernel(10, False)
    skernel2.init(sfeats2, sfeats2)

    ffeats = RealFeatures(traindat)
    fkernel = LinearKernel(ffeats, ffeats)
    fffeats = RealFeatures(traindat)
    ffkernel = GaussianKernel(fffeats, fffeats, 1.0)

    # COMBINING LINADD FEATURES/KERNELS LEAD TO FAIL
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(ffeats)
    #feats_train.append_feature_obj(fffeats)
    feats_train.append_feature_obj(sfeats2)
    print feats_train.get_num_vectors()

    kernel = CombinedKernel()
    kernel.append_kernel(fkernel)
    #kernel.append_kernel(ffkernel)
    kernel.append_kernel(skernel2)
    kernel.init(feats_train, feats_train)

    labels = RegressionLabels(fm_label_twoclass)
    mkl = MKLRegression()
    mkl.set_mkl_norm(1) #2,3
    mkl.set_C(1, 1)
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)
    # verbose debug output with file/line information
    mkl.io.enable_file_and_line()
    mkl.io.set_loglevel(MSG_DEBUG)
def kernel_combined_custom_poly_modular(fm_train_real = traindat,fm_test_real = testdat,fm_label_twoclass=label_traindat):
    """Train a LibSVM on a combined kernel (precomputed poly matrix as a
    CustomKernel + a PolyKernel), then rebuild the combined kernel for the
    test data and classify (BinaryLabels/apply variant).

    Return: (train/test kernel matrix of the last-initialized kernel, kernel)
    """
    from shogun.Features import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun.Kernel import CombinedKernel, PolyKernel, CustomKernel
    from shogun.Classifier import LibSVM

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    # subkernel 1: precomputed degree-3 poly matrix wrapped as CustomKernel
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, tfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    # subkernel 2: degree-2 poly computed on the fly
    subkfeats_train = RealFeatures(fm_train_real)
    feats_train.append_feature_obj(subkfeats_train)
    subkernel = PolyKernel(10,2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_train)

    labels = BinaryLabels(fm_label_twoclass)
    svm = LibSVM(1.0, kernel, labels)
    svm.train()

    # rebuild combined kernel against the test features
    kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    pfeats = RealFeatures(fm_test_real)
    tkernel = PolyKernel(10,3)
    tkernel.init(tfeats, pfeats)
    K = tkernel.get_kernel_matrix()
    kernel.append_kernel(CustomKernel(K))

    subkfeats_test = RealFeatures(fm_test_real)
    feats_pred.append_feature_obj(subkfeats_test)
    subkernel = PolyKernel(10, 2)
    kernel.append_kernel(subkernel)
    kernel.init(feats_train, feats_pred)

    svm.set_kernel(kernel)
    svm.apply()
    # NOTE(review): taken after re-initializing on (train, test) -- this is
    # the train/test cross matrix; the name appears historical
    km_train=kernel.get_kernel_matrix()
    return km_train,kernel
def _predict(self, predictor, examples, task_name):
    """
    make prediction on examples using trained predictor

    @param predictor: trained predictor (task_id, num_nodes, combined_kernel, predictor)
    @type predictor: tuple<int, int, CombinedKernel, SVM>
    @param examples: list of examples
    @type examples: list<object>
    @param task_name: task name
    @type task_name: str
    """
    (task_id, combined_kernel, svm, param) = predictor

    # shogun data
    base_feat = shogun_factory.create_features(examples, param)

    # construct combined kernel: same base features for every subkernel,
    # retargeting each subkernel's normalizer at this task
    feat = CombinedFeatures()
    for i in xrange(combined_kernel.get_num_subkernels()):
        feat.append_feature_obj(base_feat)
        # fetch kernel normalizer
        normalizer = combined_kernel.get_kernel(i).get_normalizer()
        # cast using dedicated SWIG-helper function
        normalizer = KernelNormalizerToMultitaskKernelMaskPairNormalizer(normalizer)
        # set task vector (mutates the underlying normalizer object)
        normalizer.set_task_vector_rhs([task_id]*len(examples))

    # re-init the SVM's kernel with the training lhs and the new rhs
    combined_kernel = svm.get_kernel()
    combined_kernel.init(combined_kernel.get_lhs(), feat)

    # predict
    out = svm.classify().get_labels()

    # predict
    #out = svm.classify(feat).get_labels()

    return out
def _predict(self, predictor, examples, task_name):
    """
    make prediction on examples using trained predictor

    @param predictor: trained predictor (task_id, num_nodes, combined_kernel, predictor)
    @type predictor: tuple<int, int, CombinedKernel, SVM>
    @param examples: list of examples
    @type examples: list<object>
    @param task_name: task name
    @type task_name: str
    """
    (task_id, combined_kernel, svm, param) = predictor

    # shogun data
    base_feat = shogun_factory.create_features(examples, param)

    # construct combined kernel: same base features for every subkernel,
    # retargeting each subkernel's normalizer at this task
    feat = CombinedFeatures()
    for i in xrange(combined_kernel.get_num_subkernels()):
        feat.append_feature_obj(base_feat)
        # fetch kernel normalizer
        normalizer = combined_kernel.get_kernel(i).get_normalizer()
        # cast using dedicated SWIG-helper function
        normalizer = KernelNormalizerToMultitaskKernelMaskPairNormalizer(
            normalizer)
        # set task vector (mutates the underlying normalizer object)
        normalizer.set_task_vector_rhs([task_id] * len(examples))

    # re-init the SVM's kernel with the training lhs and the new rhs
    combined_kernel = svm.get_kernel()
    combined_kernel.init(combined_kernel.get_lhs(), feat)

    # predict
    out = svm.classify().get_labels()

    # predict
    #out = svm.classify(feat).get_labels()

    return out
def construct_features(features, split_pos=15):
    """Construct combined features: a weighted-degree feature on the full
    strings plus spectrum features on the left and right parts.

    Arguments:
    features -- list of strings
    split_pos -- column at which the strings are split into lhs/rhs
                 (default 15 preserves the previously hard-coded split,
                 so existing callers are unaffected)

    Return: CombinedFeatures containing [wd(all), spectrum(lhs), spectrum(rhs)]
    """
    feat_all = [inst for inst in features]
    feat_lhs = [inst[0:split_pos] for inst in features]
    feat_rhs = [inst[split_pos:] for inst in features]

    feat_wd = get_wd_features(feat_all)
    feat_spec_1 = get_spectrum_features(feat_lhs, order=3)
    feat_spec_2 = get_spectrum_features(feat_rhs, order=3)

    feat_comb = CombinedFeatures()
    feat_comb.append_feature_obj(feat_wd)
    feat_comb.append_feature_obj(feat_spec_1)
    feat_comb.append_feature_obj(feat_spec_2)
    return feat_comb
def create_promoter_features(data, param):
    """ creates promoter combined features

    Splits sequences around the promoter center (per param offsets) and
    combines a raw char feature for the center with spectrum features for
    the left and right flanks.

    @param data: list of sequences to split
    @param param: dict with "center_offset" and "center_pos"
    """
    print "creating promoter features"

    (center, left, right) = split_data_promoter(data, param["center_offset"], param["center_pos"])

    # set up base features
    feat_center = StringCharFeatures(DNA)
    feat_center.set_features(center)
    feat_left = get_spectrum_features(left)
    feat_right = get_spectrum_features(right)

    # construct combined features
    feat = CombinedFeatures()
    feat.append_feature_obj(feat_center)
    feat.append_feature_obj(feat_left)
    feat.append_feature_obj(feat_right)

    return feat
def get_weighted_spectrum_kernel(subfeats_list, options):
    """build weighted spectrum kernel with non-redundant k-mer list (removing reverse complement)

    Arguments:
    subfeats_list -- list of sub-feature objects
    options -- object containing option data

    Return:
    CombinedFeatures of StringWord(Ulong)Features,
    CombinedKernel of CommWord(Ulong)StringKernel
    """
    kmerlen = options.kmerlen
    kmerlen2 = options.kmerlen2

    subkernels = 0
    kernel = CombinedKernel()
    feats = CombinedFeatures()

    for subfeats in subfeats_list:
        feats.append_feature_obj(subfeats)

    # one subkernel per k-mer length; word kernel up to k=8, ulong beyond
    # (presumably subfeats_list holds one feature object per k -- TODO confirm)
    for k in xrange(kmerlen, kmerlen2 + 1):
        if k <= 8:
            subkernel = CommWordStringKernel(10, False)
        else:
            subkernel = CommUlongStringKernel(10, False)
        kernel.append_kernel(subkernel)
        subkernels += 1

    kernel.init(feats, feats)

    # uniform weight over all subkernels
    kernel.set_subkernel_weights(
        numpy.array([1 / float(subkernels)] * subkernels,
                    numpy.dtype('float64')))
    return kernel
def create_features(data, center_offset, center_pos):
    """combined features from different kernel perspective

    Splits the promoter sequences into center/left/right parts and
    combines a raw char feature (center) with spectrum features (flanks).
    """
    #print("creating promoter features - center_offset %d, center_pos %d - %s" % (center_offset, center_pos, data))
    center_seqs, left_seqs, right_seqs = split_data_promoter(data, center_offset, center_pos)

    # sanity check sequences: all three parts must cover the same examples
    assert len(center_seqs) == len(left_seqs) == len(right_seqs)

    center_feats = StringCharFeatures(DNA)
    center_feats.set_features(center_seqs)
    left_feats = get_spectrum_features(left_seqs)
    right_feats = get_spectrum_features(right_seqs)

    # assemble the combined feature object
    combined = CombinedFeatures()
    for part in (center_feats, left_feats, right_feats):
        combined.append_feature_obj(part)
    return combined
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    Builds a two-kernel multitask SVM: one kernel whose normalizer encodes
    pairwise task similarities derived from an MHC distance matrix read
    from disk, and one mask-based kernel restricted to intra-task blocks.
    The two are mixed via subkernel weights (or learned, if MKL is on).

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: dict mapping task name -> (task_num, combined_kernel, svm)
    """

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=False)

    # create shogun data objects
    base_wdk = shogun_factory.create_empty_kernel(param)
    lab = shogun_factory.create_labels(data.labels)

    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)
    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    # set normalizer
    normalizer = MultitaskKernelNormalizer(data.task_vector_nums)

    # load data
    # NOTE(review): hard-coded absolute path; alternatives below were other
    # distance-matrix flavors tried during experiments.
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_pearson.txt")
    f = file(
        "/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/All_PseudoSeq_Hamming.txt"
    )
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_PseudoSeq_BlosumEnc_euklid.txt")
    #f = file("/fml/ag-raetsch/home/cwidmer/Documents/phd/projects/multitask/data/mhc/MHC_Distanzen/MHC_Distanzen/ALL_RAxML.txt")

    # first line holds the matrix dimension; the rest is one
    # tab-separated row per task (name followed by distances)
    num_lines = int(f.readline().strip())
    task_distances = numpy.zeros((num_lines, num_lines))
    name_to_id = {}

    for (i, line) in enumerate(f):
        tokens = line.strip().split("\t")
        name = str(tokens[0])
        name_to_id[name] = i
        entry = numpy.array([v for (j, v) in enumerate(tokens) if j != 0])
        assert len(entry) == num_lines, "len_entry %i, num_lines %i" % (
            len(entry), num_lines)
        task_distances[i, :] = entry

    # cut relevant submatrix
    active_ids = [name_to_id[name] for name in data.get_task_names()]
    tmp_distances = task_distances[active_ids, :]
    tmp_distances = tmp_distances[:, active_ids]
    print "distances ", tmp_distances.shape

    # normalize distances
    # NOTE(review): scaled by the max over the ACTIVE submatrix, so
    # similarities below can go negative for far-apart inactive tasks.
    task_distances = task_distances / numpy.max(tmp_distances)

    similarities = numpy.zeros(
        (data.get_num_tasks(), data.get_num_tasks()))

    # convert distance to similarity
    for task_name_lhs in data.get_task_names():
        for task_name_rhs in data.get_task_names():

            # convert similarity with simple transformation
            similarity = param.base_similarity - task_distances[
                name_to_id[task_name_lhs], name_to_id[task_name_rhs]]
            normalizer.set_task_similarity(data.name_to_id(task_name_lhs),
                                           data.name_to_id(task_name_rhs),
                                           similarity)

            # save for later
            similarities[data.name_to_id(task_name_lhs),
                         data.name_to_id(task_name_rhs)] = similarity

    # set normalizer
    base_wdk.set_normalizer(normalizer)
    #base_wdk.init_normalizer()

    combined_features.append_feature_obj(base_features)
    combined_kernel.append_kernel(base_wdk)

    ##################################################
    # intra-domain blocks: second kernel restricted to same-task pairs

    intra_block_vec = PairiiVec()
    for task_id in data.get_task_ids():
        intra_block_vec.push_back(Pairii(task_id, task_id))

    # create mask-based normalizer
    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums,
                                                   intra_block_vec)
    kernel = shogun_factory.create_empty_kernel(param)
    kernel.set_normalizer(normalizer)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel)

    # append features
    combined_features.append_feature_obj(base_features)

    # set mixing factor (used if MKL is OFF)
    assert (param.base_similarity <= 1)
    assert (param.base_similarity >= 0)
    combined_kernel.set_subkernel_weights(
        [param.base_similarity, 1 - param.base_similarity])

    combined_kernel.init(combined_features, combined_features)

    svm = None
    print "using MKL:", (param.transform >= 1.0)

    if param.transform >= 1.0:
        # param.transform doubles as the MKL q-norm here
        svm = MKLClassification()
        svm.set_mkl_norm(param.transform)
        #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto
        svm.set_C(param.cost, param.cost)
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    # set up SVM
    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)
    svm.parallel.set_num_threads(num_threads)
    # linadd/batch computation are incompatible with the custom normalizers
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    print "WARNING: custom epsilon set"
    svm.set_epsilon(0.05)

    # normalize cost by class size (note: pos count scales C_pos via
    # the second argument of set_C below)
    norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
    norm_c_neg = param.cost / float(
        len([l for l in data.labels if l == -1]))
    svm.set_C(norm_c_neg, norm_c_pos)

    # start training
    svm.train()

    # save additional information
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["num sv"] = svm.get_num_support_vectors()
    self.additional_information["similarities"] = similarities
    self.additional_information[
        "post_weights"] = combined_kernel.get_subkernel_weights()

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in data.get_task_names():
        task_num = data.name_to_id(task_name)

        # save svm and task_num
        svms[task_name] = (task_num, combined_kernel, svm)

    return svms
# two 2-D Gaussian blobs shifted by -dist / +dist, stacked column-wise
traindata_real = concatenate((randn(2, num) - dist, randn(2, num) + dist), axis=1)
testdata_real = concatenate((randn(2, numt) - dist, randn(2, numt) + dist), axis=1)

# BinaryLabels: -1 for the first blob, +1 for the second
trainlab = concatenate((-ones(num), ones(num)))
testlab = concatenate((-ones(numt), ones(numt)))

# Split into pos/neg train/test for plotting
trainpos = traindata_real[:, trainlab == 1]
trainneg = traindata_real[:, trainlab == -1]
testpos = testdata_real[:, testlab == 1]
testneg = testdata_real[:, testlab == -1]

# create combined train/test features: five identical views of the same
# data (presumably matched by five sub-kernels below — verify against
# the kernel construction)
feats_train = CombinedFeatures()
feats_test = CombinedFeatures()
for _ in range(5):
    feats_train.append_feature_obj(RealFeatures(traindata_real))
    feats_test.append_feature_obj(RealFeatures(testdata_real))

labels = BinaryLabels(trainlab)

# and corresponding combined kernel
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
                           width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):
    """Train multiclass MKL on three views (Gaussian, linear, polynomial)
    of the same real-valued data and return predictions on the test set.

    @param fm_train_real / fm_test_real: train/test feature matrices
    @param label_train_multiclass: multiclass training labels
    @param width: Gaussian kernel width
    @param C: SVM regularization constant
    @param epsilon: SVM stopping tolerance
    @param num_threads: worker threads for training
    @param mkl_epsilon: MKL stopping tolerance
    @param mkl_norm: q of the q-norm MKL regularizer
    @return: predicted labels on the test features
    """
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun.Classifier import MKLMultiClass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # one sub-kernel per view; every view wraps the same raw matrices
    for subkernel in (GaussianKernel(10, width), LinearKernel(), PolyKernel(10, 2)):
        feats_train.append_feature_obj(RealFeatures(fm_train_real))
        feats_test.append_feature_obj(RealFeatures(fm_test_real))
        kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = Labels(label_train_multiclass)

    mkl = MKLMultiClass(C, kernel, labels)
    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)

    mkl.train()

    # re-initialize the kernel against test features before predicting
    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ # merge data sets data = PreparedMultitaskData(train_data, shuffle=True) # create shogun label lab = shogun_factory.create_labels(data.labels) ################################################## # define pockets ################################################## pockets = [0]*9 pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34] pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31] pockets[2] = [11, 20, 21, 22, 29, 31] pockets[3] = [8, 30, 31, 32] pockets[4] = [10, 11, 30] pockets[5] = [10, 11, 12, 13, 20, 29] pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29] pockets[7] = [12, 14, 15, 26] pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26] #new_pockets = [] # merge neighboring pockets #for i in range(8): # new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1])))) #pockets = new_pockets ######################################################## print "creating a kernel:" ######################################################## # assemble combined kernel combined_kernel = CombinedKernel() combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO) base_features = shogun_factory.create_features(data.examples) combined_features = CombinedFeatures() ################################################## # intra-domain blocks # intra_block_vec = PairiiVec() # # for task_id in data.get_task_ids(): # intra_block_vec.push_back(Pairii(task_id, task_id)) # # # # # create mask-based normalizer # normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec) # kernel = shogun_factory.create_empty_kernel(param) # kernel.set_normalizer(normalizer) # # # append current kernel to CombinedKernel # combined_kernel.append_kernel(kernel) # # # append features # 
combined_features.append_feature_obj(base_features) # # print "------" # # ################################################## # # all blocks # # # all_block_vec = PairiiVec() # # for task_id_1 in data.get_task_ids(): # for task_id_2 in data.get_task_ids(): # all_block_vec.push_back(Pairii(task_id_1, task_id_2)) # # # # create mask-based normalizer # normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec) # kernel_all = shogun_factory.create_empty_kernel(param) # kernel_all.set_normalizer(normalizer_all) # # # append current kernel to CombinedKernel # combined_kernel.append_kernel(kernel_all) # # # append features # combined_features.append_feature_obj(base_features) ################################################## # add one kernel per similarity position # init seq handler pseudoseqs = SequencesHandler() for pocket in pockets: print "creating normalizer" #import pdb #pdb.set_trace() normalizer = MultitaskKernelNormalizer(data.task_vector_nums) print "processing pocket", pocket # set similarity for task_name_lhs in data.get_task_names(): for task_name_rhs in data.get_task_names(): similarity = 0.0 for pseudo_seq_pos in pocket: similarity += float(pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos-1)) # normalize similarity = similarity / float(len(pocket)) print "pocket %s (%s, %s) = %f" % (str(pocket), task_name_lhs, task_name_rhs, similarity) normalizer.set_task_similarity(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity) print "creating empty kernel" kernel_pos = shogun_factory.create_empty_kernel(param) print "setting normalizer" kernel_pos.set_normalizer(normalizer) print "appending kernel" # append current kernel to CombinedKernel combined_kernel.append_kernel(kernel_pos) print "appending features" # append features combined_features.append_feature_obj(base_features) print "done constructing combined kernel" ################################################## # init combined kernel 
# init weights # combined_kernel.set_subkernel_weights([1.0/2.85]*combined_kernel.get_num_subkernels()) combined_kernel.init(combined_features, combined_features) print "subkernel weights:", combined_kernel.get_subkernel_weights() svm = None print "using MKL:", (param.transform >= 1.0) if param.transform >= 1.0: svm = MKLClassification() svm.set_mkl_norm(param.transform) #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto svm.set_C(param.cost, param.cost) svm.set_kernel(combined_kernel) svm.set_labels(lab) else: # create SVM (disable unsupported optimizations) combined_kernel.set_cache_size(500) svm = SVMLight(param.cost, combined_kernel, lab) # set up SVM num_threads = 8 svm.io.enable_progress() #svm.io.set_loglevel(shogun.Classifier.MSG_INFO) svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG) svm.parallel.set_num_threads(num_threads) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) #print "WARNING: custom epsilon set" #svm.set_epsilon(0.05) # normalize cost norm_c_pos = param.cost / float(len([l for l in data.labels if l==1])) norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1])) svm.set_C(norm_c_neg, norm_c_pos) # start training svm.train() # save additional info self.additional_information["svm_objective"] = svm.get_objective() self.additional_information["svm num sv"] = svm.get_num_support_vectors() self.additional_information["post_weights"] = combined_kernel.get_subkernel_weights() print self.additional_information # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in train_data.keys(): svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm) return svms
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    Builds one masked sub-kernel per element of the power set of task
    ids: each sub-kernel only "sees" the sample pairs belonging to that
    task subset. MKL (if enabled) then learns a weight per subset, which
    is reported as a subset-name -> weight map afterwards.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm
    @return: dict mapping task name ->
             (task_id, num_subsets, combined_kernel, svm, param)
    """

    import numpy
    # fixed seed for reproducible shuffling/training
    numpy.random.seed(666)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)

    # set kernel cache
    if param.flags.has_key("cache_size"):
        combined_kernel.set_cache_size(param.flags["cache_size"])

    # create features
    base_features = shogun_factory.create_features(data.examples, param)
    combined_features = CombinedFeatures()

    ########################################################
    print "creating a masked kernel for possible subset:"
    ########################################################

    # NOTE(review): number of sub-kernels grows as 2^#tasks — only
    # feasible for small task counts.
    power_set_tasks = power_set(data.get_task_ids())

    for active_task_ids in power_set_tasks:

        print "masking all entries other than:", active_task_ids

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums,
                                                   data.task_vector_nums,
                                                   active_task_ids)

        # normalize trace
        # NOTE(review): integer division under Python 2 — norm_factor is
        # truncated; confirm this is intended.
        if param.flags.has_key(
                "normalize_trace") and param.flags["normalize_trace"]:
            norm_factor = len(data.get_task_ids()) / len(active_task_ids)
            normalizer.set_normalization_constant(norm_factor)

        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features (same base features for every subset kernel)
        combined_features.append_feature_obj(base_features)

        print "------"

    combined_kernel.init(combined_features, combined_features)
    #combined_kernel.precompute_subkernels()

    self.additional_information[
        "weights before trainng"] = combined_kernel.get_subkernel_weights(
        )
    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.flags["mkl_q"] >= 1.0)

    if param.flags["mkl_q"] >= 1.0:
        # q-norm MKL path
        svm = MKLClassification()
        svm.set_mkl_norm(param.flags["mkl_q"])

        # set interleaved optimization
        if param.flags.has_key("interleaved"):
            svm.set_interleaved_optimization_enabled(
                param.flags["interleaved"])

        # set solver type
        if param.flags.has_key(
                "solver_type") and param.flags["solver_type"]:
            if param.flags["solver_type"] == "ST_CPLEX":
                svm.set_solver_type(ST_CPLEX)
            if param.flags["solver_type"] == "ST_DIRECT":
                svm.set_solver_type(ST_DIRECT)
            if param.flags["solver_type"] == "ST_NEWTON":
                svm.set_solver_type(ST_NEWTON)
            if param.flags["solver_type"] == "ST_GLPK":
                svm.set_solver_type(ST_GLPK)

        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)

    else:
        # plain SVM with fixed (uniform) subkernel weights
        svm = SVMLight(param.cost, combined_kernel, lab)

    # optimization settings
    num_threads = 4
    svm.parallel.set_num_threads(num_threads)

    if param.flags.has_key("epsilon"):
        svm.set_epsilon(param.flags["epsilon"])

    # enable output
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    # disable unsupported optimizations (due to special normalizer)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # set cost (optionally balanced by class size)
    if param.flags["normalize_cost"]:
        norm_c_pos = param.cost / float(
            len([l for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(
            len([l for l in data.labels if l == -1]))
        svm.set_C(norm_c_neg, norm_c_pos)
    else:
        svm.set_C(param.cost, param.cost)

    svm.train()

    # prepare mapping: learned weight per task subset
    weight_map = {}
    weights = combined_kernel.get_subkernel_weights()
    for (i, pset) in enumerate(power_set_tasks):
        print pset
        subset_str = str([data.id_to_name(task_idx) for task_idx in pset])
        weight_map[subset_str] = weights[i]

    # store additional info
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["weight_map"] = weight_map

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name),
                           len(power_set_tasks), combined_kernel, svm,
                           param)

    return svms
# Labels: reuse the labels of the first data block for training
trainlab = labs_1

# create combined train features: one real-valued view per data block
feats_train = CombinedFeatures()
for block in (data_1, data_2, data_3):
    feats_train.append_feature_obj(RealFeatures(block))

labels = BinaryLabels(trainlab)

# and corresponding combined kernel
kernel = CombinedKernel()
kernel.append_kernel(LinearKernel())
def kernel_combined_modular(fm_train_real=traindat, fm_test_real=testdat, fm_train_dna=traindna, fm_test_dna=testdna):
    """Combine a real-valued Gaussian kernel with two DNA string kernels.

    @param fm_train_real / fm_test_real: real-valued train/test matrices
    @param fm_train_dna / fm_test_dna: DNA string train/test lists
    @return: (train/train kernel matrix, train/test kernel matrix,
              the combined kernel, left initialized on train/test)
    """
    from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    degree = 3

    # (train view, test view, sub-kernel), appended in this order
    views = [
        (RealFeatures(fm_train_real), RealFeatures(fm_test_real),
         GaussianKernel(10, 1.1)),
        (StringCharFeatures(fm_train_dna, DNA), StringCharFeatures(fm_test_dna, DNA),
         FixedDegreeStringKernel(10, degree)),
        (StringCharFeatures(fm_train_dna, DNA), StringCharFeatures(fm_test_dna, DNA),
         LocalAlignmentStringKernel(10)),
    ]
    for train_view, test_view, subkernel in views:
        feats_train.append_feature_obj(train_view)
        feats_test.append_feature_obj(test_view)
        kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def training_run(options):
    """Conduct a training run and return a trained SVM kernel

    Extracts motif hits from positive and negative FASTA files, builds
    one weighted-degree-position string kernel per motif plus one RBF
    kernel over the motif positions, trains a LibSVM on the combined
    kernel, and writes the requested HTML reports.

    @param options: option object providing window_width, replace,
                    positives/negatives (FASTA paths), pgff/ngff (GFF
                    paths), output_path, output_html and the report
                    toggles contrib/logos/query
    """
    settings = MotifFinderSettings(kirmes_ini.MOTIF_LENGTH,
                                   options.window_width, options.replace)
    positives = MotifFinder(finder_settings=settings)
    positives.setFastaFile(options.positives)
    positives.setMotifs(options.pgff)
    pmotifs, ppositions = positives.getResults()

    negatives = MotifFinder(finder_settings=settings)
    negatives.setFastaFile(options.negatives)
    negatives.setMotifs(options.ngff)
    nmotifs, npositions = negatives.getResults()

    wds_kparams = kirmes_ini.WDS_KERNEL_PARAMETERS
    wds_svm = EasySVM.EasySVM(wds_kparams)
    # NOTE(review): assumes every motif has the same number of examples;
    # counts are taken from the first motif only.
    num_positives = len(pmotifs.values()[0])
    num_negatives = len(nmotifs.values()[0])

    #Creating Kernel Objects
    kernel = CombinedKernel()
    features = CombinedFeatures()
    kernel_array = []
    # sort motif names so the kernel order is deterministic
    motifs = pmotifs.keys()
    motifs.sort()

    #Adding Kmer Kernels: one WDS kernel per motif (positives + negatives)
    for motif in motifs:
        all_examples = pmotifs[motif] + nmotifs[motif]
        motif_features = wds_svm.createFeatures(all_examples)
        wds_kernel = WeightedDegreePositionStringKernel(motif_features, motif_features, \
                                                        wds_kparams['degree'])
        wds_kernel.set_shifts(wds_kparams['shift'] *
                              ones(wds_kparams['seqlength'], dtype=int32))
        features.append_feature_obj(motif_features)
        kernel_array.append(wds_kernel)
        kernel.append_kernel(wds_kernel)

    # RBF kernel over the motif positions of all examples
    rbf_svm = EasySVM.EasySVM(kirmes_ini.RBF_KERNEL_PARAMETERS)
    positions = array(ppositions + npositions, dtype=float64).T
    position_features = rbf_svm.createFeatures(positions)
    features.append_feature_obj(position_features)
    motif_labels = append(ones(num_positives), -ones(num_negatives))
    complete_labels = Labels(motif_labels)
    rbf_kernel = GaussianKernel(position_features, position_features, \
                                kirmes_ini.RBF_KERNEL_PARAMETERS['width'])
    kernel_array.append(rbf_kernel)
    kernel.append_kernel(rbf_kernel)

    #Kernel init
    kernel.init(features, features)
    kernel.set_cache_size(kirmes_ini.K_CACHE_SIZE)
    svm = LibSVM(kirmes_ini.K_COMBINED_C, kernel, complete_labels)
    svm.parallel.set_num_threads(kirmes_ini.K_NUM_THREADS)

    #Training
    svm.train()

    if not os.path.exists(options.output_path):
        os.mkdir(options.output_path)

    # assemble the requested HTML report sections
    html = {}
    if options.contrib:
        html["contrib"] = contrib(svm, kernel, motif_labels, kernel_array,
                                  motifs)
    if options.logos:
        html["poims"] = poims(svm, kernel, kernel_array, motifs,
                              options.output_path)
    if options.query:
        html["query"] = evaluate(options, svm, kernel, features, motifs)
    htmlize(html, options.output_html)
def _train(self, train_data, param):
    """
    training procedure using training examples and labels

    Builds one masked sub-kernel per node of a task taxonomy: each node's
    kernel is restricted to samples of the tasks at its leaves, so MKL
    (if enabled) learns a weight per taxonomy node.

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure (must carry
                  param.taxonomy.data and param.flags)
    @type param: ParameterSvm
    @return: dict mapping task name ->
             (task_id, num_nodes, combined_kernel, svm)
    """

    # fixed seed for reproducible shuffling/training
    #numpy.random.seed(1337)
    numpy.random.seed(666)

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_DEBUG)

    # set kernel cache
    if param.flags.has_key("cache_size"):
        combined_kernel.set_cache_size(param.flags["cache_size"])

    # create features
    base_features = shogun_factory.create_features(data.examples)
    combined_features = CombinedFeatures()

    ########################################################
    print "creating a masked kernel for each node:"
    ########################################################

    # fetch taxonomy from parameter object
    taxonomy = param.taxonomy.data

    # create name to leaf map
    nodes = taxonomy.get_all_nodes()

    for node in nodes:

        print "creating kernel for ", node.name

        # fetch sub-tree: the tasks below this node
        active_task_ids = [data.name_to_id(leaf.name) for leaf in node.get_leaves()]

        print "masking all entries other than:", active_task_ids

        # create mask-based normalizer
        normalizer = MultitaskKernelMaskNormalizer(data.task_vector_nums, data.task_vector_nums, active_task_ids)

        # normalize trace
        # NOTE(review): integer division under Python 2 — norm_factor is
        # truncated; confirm this is intended.
        if param.flags.has_key("normalize_trace") and param.flags["normalize_trace"]:
            norm_factor = len(node.get_leaves()) / len(active_task_ids)
            normalizer.set_normalization_constant(norm_factor)

        # create kernel
        kernel = shogun_factory.create_empty_kernel(param)
        kernel.set_normalizer(normalizer)

        # append current kernel to CombinedKernel
        combined_kernel.append_kernel(kernel)

        # append features (same base features for every node kernel)
        combined_features.append_feature_obj(base_features)

        print "------"

    combined_kernel.init(combined_features, combined_features)
    #combined_kernel.precompute_subkernels()

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.flags["mkl_q"] >= 1.0)

    if param.flags["mkl_q"] >= 1.0:

        # set up MKL
        svm = MKLClassification()

        # set the "q" in q-norm MKL
        svm.set_mkl_norm(param.flags["mkl_q"])

        # set interleaved optimization
        if param.flags.has_key("interleaved"):
            svm.set_interleaved_optimization_enabled(param.flags["interleaved"])

        # set solver type
        if param.flags.has_key("solver_type") and param.flags["solver_type"]:
            if param.flags["solver_type"] == "ST_CPLEX":
                svm.set_solver_type(ST_CPLEX)
            if param.flags["solver_type"] == "ST_DIRECT":
                svm.set_solver_type(ST_DIRECT)
            if param.flags["solver_type"] == "ST_NEWTON":
                svm.set_solver_type(ST_NEWTON)
            if param.flags["solver_type"] == "ST_GLPK":
                svm.set_solver_type(ST_GLPK)

        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)

    else:
        # create vanilla SVM
        svm = SVMLight(param.cost, combined_kernel, lab)

    # optimization settings
    num_threads = 4
    svm.parallel.set_num_threads(num_threads)

    if param.flags.has_key("epsilon"):
        svm.set_epsilon(param.flags["epsilon"])

    # enable output
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG)

    # disable unsupported optimizations (due to special normalizer)
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    # set cost (optionally balanced by class size)
    if param.flags["normalize_cost"]:
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        svm.set_C(norm_c_neg, norm_c_pos)
    else:
        svm.set_C(param.cost, param.cost)

    # start training
    svm.train()

    ########################################################
    print "svm objective:"
    print svm.get_objective()
    ########################################################

    # store additional info
    self.additional_information["svm objective"] = svm.get_objective()
    self.additional_information["weights"] = combined_kernel.get_subkernel_weights()

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), len(nodes), combined_kernel, svm)

    return svms
def evaluation_cross_validation_multiclass_storage(
        traindat=traindat, label_traindat=label_traindat):
    """Run stratified 5-fold cross-validation of a multiclass MKL machine
    and return the ROC and evaluation result of run 0, fold 0, class 0."""
    from shogun.Evaluation import CrossValidation, CrossValidationResult
    from shogun.Evaluation import CrossValidationPrintOutput
    from shogun.Evaluation import CrossValidationMKLStorage, CrossValidationMulticlassStorage
    from shogun.Evaluation import MulticlassAccuracy, F1Measure
    from shogun.Evaluation import StratifiedCrossValidationSplitting
    from shogun.Features import MulticlassLabels
    from shogun.Features import RealFeatures, CombinedFeatures
    from shogun.Kernel import GaussianKernel, CombinedKernel
    from shogun.Classifier import MKLMulticlass
    from shogun.Mathematics import Statistics, MSG_DEBUG

    # training data: the same real-valued features repeated as three views
    features = RealFeatures(traindat)
    comb_features = CombinedFeatures()
    for _ in range(3):
        comb_features.append_feature_obj(features)
    labels = MulticlassLabels(label_traindat)

    # combined kernel: Gaussians of different widths
    kernel = CombinedKernel()
    for width in (0.1, 1, 2):
        kernel.append_kernel(GaussianKernel(10, width))

    # create mkl using libsvm; due to a mem-bug, interleaved is not possible
    svm = MKLMulticlass(1.0, kernel, labels)
    svm.set_kernel(kernel)

    # stratified splitting keeps the class balance in each of the 5 folds
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = MulticlassAccuracy()

    # cross-validation instance
    cross_validation = CrossValidation(svm, comb_features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross-validation output classes
    #cross_validation.add_cross_validation_output(CrossValidationPrintOutput())
    #mkl_storage = CrossValidationMKLStorage()
    #cross_validation.add_cross_validation_output(mkl_storage)
    multiclass_storage = CrossValidationMulticlassStorage()
    # also record a per-class binary F1 next to the ROC data
    multiclass_storage.append_binary_evaluation(F1Measure())
    cross_validation.add_cross_validation_output(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result = cross_validation.evaluate()

    # fetch run 0 / fold 0 / class 0: ROC points and the first registered
    # binary evaluation result (F1Measure here)
    roc_0_0_0 = multiclass_storage.get_fold_ROC(0, 0, 0)
    #print roc_0_0_0
    auc_0_0_0 = multiclass_storage.get_fold_evaluation_result(0, 0, 0, 0)
    #print auc_0_0_0
    return roc_0_0_0, auc_0_0_0
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ # merge data sets data = PreparedMultitaskData(train_data, shuffle=True) # create shogun label lab = shogun_factory.create_labels(data.labels) ######################################################## print "creating a kernel for each node:" ######################################################## # assemble combined kernel combined_kernel = CombinedKernel() combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO) base_features = shogun_factory.create_features(data.examples) combined_features = CombinedFeatures() ################################################## # intra-domain blocks # intra_block_vec = PairiiVec() # # for task_id in data.get_task_ids(): # intra_block_vec.push_back(Pairii(task_id, task_id)) # # # # # create mask-based normalizer # normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec) # kernel = shogun_factory.create_empty_kernel(param) # kernel.set_normalizer(normalizer) # # # append current kernel to CombinedKernel # combined_kernel.append_kernel(kernel) # # # append features # combined_features.append_feature_obj(base_features) # # print "------" # # ################################################## # # all blocks # # # all_block_vec = PairiiVec() # # for task_id_1 in data.get_task_ids(): # for task_id_2 in data.get_task_ids(): # all_block_vec.push_back(Pairii(task_id_1, task_id_2)) # # # # create mask-based normalizer # normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec) # kernel_all = shogun_factory.create_empty_kernel(param) # kernel_all.set_normalizer(normalizer_all) # # # append current kernel to CombinedKernel # combined_kernel.append_kernel(kernel_all) # # # append features # 
combined_features.append_feature_obj(base_features) ################################################## # add one kernel per similarity position # init seq handler pseudoseqs = SequencesHandler() pseudoseq_length = pseudoseqs.seq_length for pos in range(pseudoseq_length): print "appending kernel for pos %i" % (pos) print "nums", data.task_vector_nums pos_block_vec = PairiiVec() # set similarity for task_name_lhs in data.get_task_names(): for task_name_rhs in data.get_task_names(): similarity = pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pos) #print "computing similarity for tasks (%s, %s) = %i" % (task_name_lhs, task_name_rhs, similarity) if similarity == 1: tmp_pair = Pairii(data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs)) pos_block_vec.push_back(tmp_pair) print "creating normalizer" normalizer_pos = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, pos_block_vec) print "creating empty kernel" kernel_pos = shogun_factory.create_empty_kernel(param) print "setting normalizer" kernel_pos.set_normalizer(normalizer_pos) print "appending kernel" # append current kernel to CombinedKernel combined_kernel.append_kernel(kernel_pos) print "appending features" # append features combined_features.append_feature_obj(base_features) print "done constructing combined kernel" ################################################## # init combined kernel combined_kernel.init(combined_features, combined_features) print "subkernel weights:", combined_kernel.get_subkernel_weights() svm = None print "using MKL:", (param.transform >= 1.0) if param.transform >= 1.0: svm = MKLClassification() svm.set_mkl_norm(param.transform) #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto svm.set_C(param.cost, param.cost) svm.set_kernel(combined_kernel) svm.set_labels(lab) else: # create SVM (disable unsupported optimizations) combined_kernel.set_cache_size(500) svm = SVMLight(param.cost, combined_kernel, lab) # set up SVM num_threads = 8 
svm.io.enable_progress() #svm.io.set_loglevel(shogun.Classifier.MSG_INFO) svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG) svm.parallel.set_num_threads(num_threads) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) print "WARNING: custom epsilon set" svm.set_epsilon(0.05) # normalize cost norm_c_pos = param.cost / float(len([l for l in data.labels if l==1])) norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1])) svm.set_C(norm_c_neg, norm_c_pos) # start training svm.train() # save additional info self.additional_information["svm_objective"] = svm.get_objective() self.additional_information["svm num sv"] = svm.get_num_support_vectors() self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights() print self.additional_information # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in train_data.keys(): svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm) return svms
def mkl_multiclass (): print 'mkl_multiclass' from shogun.Features import CombinedFeatures, RealFeatures, Labels from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel,PolyKernel from shogun.Classifier import MKLMultiClass width=1.2 C=1.2 epsilon=1e-5 num_threads=1 kernel=CombinedKernel() feats_train=CombinedFeatures() feats_test=CombinedFeatures() subkfeats_train=RealFeatures(fm_train_real) subkfeats_test=RealFeatures(fm_test_real) subkernel=GaussianKernel(10, width) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train=RealFeatures(fm_train_real) subkfeats_test=RealFeatures(fm_test_real) subkernel=LinearKernel() feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) subkfeats_train=RealFeatures(fm_train_real) subkfeats_test=RealFeatures(fm_test_real) subkernel=PolyKernel(10,2) feats_train.append_feature_obj(subkfeats_train) feats_test.append_feature_obj(subkfeats_test) kernel.append_kernel(subkernel) kernel.init(feats_train, feats_train) labels=Labels(label_train_multiclass) mkl=MKLMultiClass(C, kernel, labels) mkl.set_epsilon(epsilon); mkl.parallel.set_num_threads(num_threads) mkl.set_mkl_epsilon(0.001) mkl.train() kernel.init(feats_train, feats_test) out= mkl.classify().get_labels() print out
def _train(self, train_data, param):
    """
    Training procedure using training examples and labels.

    Assembles a CombinedKernel with two mask-normalized sub-kernels over
    the same base features -- an intra-task (block-diagonal) kernel and a
    full all-task-pairs kernel -- then trains either an MKL classifier or
    plain SVMLight depending on param.flags["mkl_q"].

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm

    @return: dict mapping each task name to the shared tuple
             (task id, combined kernel, trained svm, param)
    """

    # dict to save additional information for later analysis
    self.additional_information = {}

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples, param)
    combined_features = CombinedFeatures()

    ##################################################
    # intra-domain blocks (dirac kernel)

    # one (task_id, task_id) pair per task: the mask normalizer keeps
    # only within-task kernel entries
    intra_block_vec = PairiiVec()
    for task_id in data.get_task_ids():
        intra_block_vec.push_back(Pairii(task_id, task_id))

    # create mask-based normalizer
    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    kernel = shogun_factory.create_empty_kernel(param)
    kernel.set_normalizer(normalizer)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel)

    # append features (the same base features are appended once per
    # sub-kernel, as CombinedKernel pairs sub-kernel i with feature obj i)
    combined_features.append_feature_obj(base_features)

    print "------"

    ##################################################
    # all blocks (full kernel matrix)

    # every ordered (task_1, task_2) pair: mask keeps the full matrix
    all_block_vec = PairiiVec()
    for task_id_1 in data.get_task_ids():
        for task_id_2 in data.get_task_ids():
            all_block_vec.push_back(Pairii(task_id_1, task_id_2))

    # create mask-based normalizer
    normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
    kernel_all = shogun_factory.create_empty_kernel(param)
    kernel_all.set_normalizer(normalizer_all)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel_all)

    # append features
    combined_features.append_feature_obj(base_features)

    ##################################################
    # hack (disabled experiment: extra mask over hand-picked task groups)
    #        hack_block_vec = PairiiVec()
    #
    #        for task_id_1 in data.get_task_ids():
    #            for task_id_2 in data.get_task_ids():
    #                hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
    #
    #        hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
    #        other_group = ["B_0702", "B_1501", "B_5801"]
    #        for task_id_1 in other_group:
    #            for task_id_2 in other_group:
    #                hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
    #
    #
    #        # create mask-based normalizer
    #        normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)
    #        kernel_hack = shogun_factory.create_empty_kernel(param)
    #        kernel_hack.set_normalizer(normalizer_hack)
    #
    #        # append current kernel to CombinedKernel
    #        combined_kernel.append_kernel(kernel_hack)
    #
    #        # append features
    #        combined_features.append_feature_obj(base_features)

    ##################################################
    # init combined kernel

    combined_kernel.init(combined_features, combined_features)

    #combined_kernel.precompute_subkernels()
    self.additional_information["mkl weights before"] = combined_kernel.get_subkernel_weights()

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.flags["mkl_q"] >= 1.0)

    if param.flags["mkl_q"] >= 1.0:
        # mkl_q doubles as the MKL norm q
        svm = MKLClassification()
        svm.set_mkl_norm(param.flags["mkl_q"])
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

    svm.parallel.set_num_threads(num_threads)
    # NOTE(review): presumably linadd/batch computation are incompatible
    # with the custom normalizers above -- confirm against shogun docs
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.set_epsilon(0.03)

    # set cost: optionally balance C by inverse class frequency
    if param.flags["normalize_cost"]:
        norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l == -1]))
        svm.set_C(norm_c_neg, norm_c_pos)
    else:
        svm.set_C(param.cost, param.cost)

    svm.train()

    print "subkernel weights (after):", combined_kernel.get_subkernel_weights()

    ########################################################
    print "svm objective:"
    print svm.get_objective()

    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()

    ########################################################

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm, param)

    return svms
def kernel_combined_modular(fm_train_real=traindat, fm_test_real=testdat, fm_train_dna=traindna, fm_test_dna=testdna):
    """Combine a Gaussian kernel on real features with two string kernels
    on DNA features and return (train matrix, test matrix, kernel)."""
    from shogun.Kernel import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun.Features import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    combined = CombinedKernel()
    train_feats = CombinedFeatures()
    test_feats = CombinedFeatures()

    def add_subkernel(sub, train_obj, test_obj):
        # keep sub-kernels and feature objects in matching positions
        train_feats.append_feature_obj(train_obj)
        test_feats.append_feature_obj(test_obj)
        combined.append_kernel(sub)

    add_subkernel(GaussianKernel(10, 1.1),
                  RealFeatures(fm_train_real),
                  RealFeatures(fm_test_real))
    add_subkernel(FixedDegreeStringKernel(10, 3),
                  StringCharFeatures(fm_train_dna, DNA),
                  StringCharFeatures(fm_test_dna, DNA))
    add_subkernel(LocalAlignmentStringKernel(10),
                  StringCharFeatures(fm_train_dna, DNA),
                  StringCharFeatures(fm_test_dna, DNA))

    combined.init(train_feats, train_feats)
    km_train = combined.get_kernel_matrix()

    combined.init(train_feats, test_feats)
    km_test = combined.get_kernel_matrix()

    return km_train, km_test, combined
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    """
    Train a binary MKL classifier combining a precomputed custom kernel
    with a polynomial kernel, then classify the test data.

    @param fm_train_real: training feature matrix
    @param fm_test_real: test feature matrix
    @param fm_label_twoclass: binary (+1/-1) training labels
    @return: (predicted labels, combined test kernel)
    """
    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL
    mkl.set_mkl_norm(1)  # 2,3

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    # w=kernel.get_subkernel_weights()
    # kernel.set_subkernel_weights(w)

    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify -- apply() exactly once (the original called apply()
    # twice, running the full prediction twice and discarding the first)
    mkl.set_kernel(kernel)
    return mkl.apply(), kernel
class SignalSensor(object):
    """
    A collection of sensors.

    Aggregates per-sensor kernels into one CombinedKernel and the
    corresponding support-vector features into one CombinedFeatures
    object; predictions are made by a single SVM over the combination.
    """
    def __init__(self):
        # list of Sensor objects, one per sub-kernel
        self.sensors = list()
        # combined kernel / support-vector features, filled by from_file()
        self.kernel = CombinedKernel()
        self.svs = CombinedFeatures()
        self.svm = None
        # (left, right) extent; initialised to extreme values so the
        # min/max updates in from_file() always tighten it
        self.window = (+100000, -1000000)

    def from_file(self, file):
        """Parse an '%arts version: 1.0' model file: read bias, alphas and
        num_kernels, then load each sensor's kernel and features and build
        the SVM. Returns None on a bad header; falls through with an error
        message if parsing never completes."""
        sys.stderr.write('loading model file')
        l = file.readline();
        if l != '%arts version: 1.0\n':
            sys.stderr.write("\nfile not an arts definition file\n")
            return None

        bias = None
        alphas = None
        num_kernels = None

        while l:
            # skip comment or empty line
            if not (l.startswith('%') or l.startswith('\n')):
                if bias is None:
                    bias = parse_float(l, 'b')
                if alphas is None:
                    alphas = parse_vector(l, file, 'alphas')
                if num_kernels is None:
                    num_kernels = parse_int(l, 'num_kernels')

                # NOTE(review): truthiness check -- if the file stores
                # bias == 0.0 or num_kernels == 0 this never triggers and
                # the loader reports an error; confirm that is intended
                if num_kernels and bias and alphas is not None:
                    for i in xrange(num_kernels):
                        s = Sensor()
                        (k, f) = s.from_file(file, i + 1)
                        k.io.enable_progress()
                        # NOTE(review): right edge uses s.window[2], not
                        # s.window[1] -- presumably sensor windows have a
                        # third element; verify against Sensor.from_file
                        self.window = (min(self.window[0], s.window[0]),
                                       max(self.window[1], s.window[2]))
                        self.sensors.append(s)
                        self.kernel.append_kernel(k)
                        self.svs.append_feature_obj(f)

                    self.kernel.init(self.svs, self.svs)
                    self.svm = SVM(self.kernel, alphas, numpy.arange(len(alphas), dtype=numpy.int32), bias)
                    self.svm.io.set_target_to_stderr()
                    self.svm.io.enable_progress()
                    self.svm.parallel.set_num_threads(self.svm.parallel.get_num_cpus())
                    sys.stderr.write('done\n')
                    return
            l = file.readline()

        sys.stderr.write('error loading model file\n')

    def predict(self, seq, chunk_size = int(10e6)):
        """
        Predicts on a whole contig; splits the sequence into chunks of
        size chunk_size and pads the result with -42 sentinels on both
        window flanks so the output length matches len(seq).
        """
        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert(num_chunks > 0)

        sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)

        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):
            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))

            assert (start < stop)
            chunk = seq[start:stop]

            assert(len(self.sensors) > 0)

            # fresh test features per chunk, one per sensor (same order
            # as the sub-kernels appended in from_file)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            lab_out = self.svm.apply()

            # work around problem with get_labels()
            tmp_out = [lab_out.get_label(idx) for idx in range(0, lab_out.get_num_labels())]
            assert(len(tmp_out) > 0)
            out.extend(tmp_out)

            print "len out", len(out)

            # increment chunk
            start = stop
            stop = min(stop+chunk_size, seq_len)

        # window[0] is negative, so -window[0] sentinels on the left
        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r

        assert(len(ret) == len(seq))

        return ret
def mkl_multiclass_modular(fm_train_real, fm_test_real, label_train_multiclass,
        width, C, epsilon, num_threads, mkl_epsilon, mkl_norm):
    """Train an MKL multiclass classifier over Gaussian, linear and
    polynomial sub-kernels and return predicted labels for the test set.

    Each sub-kernel is paired with its own copy of the real features."""
    from shogun.Features import CombinedFeatures, RealFeatures, Labels
    from shogun.Kernel import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun.Classifier import MKLMultiClass

    combined_kernel = CombinedKernel()
    train_feats = CombinedFeatures()
    test_feats = CombinedFeatures()

    # append sub-kernels and their feature objects in matching order
    for sub_kernel in (GaussianKernel(10, width), LinearKernel(), PolyKernel(10, 2)):
        train_feats.append_feature_obj(RealFeatures(fm_train_real))
        test_feats.append_feature_obj(RealFeatures(fm_test_real))
        combined_kernel.append_kernel(sub_kernel)

    combined_kernel.init(train_feats, train_feats)

    labels = Labels(label_train_multiclass)

    mkl = MKLMultiClass(C, combined_kernel, labels)
    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)
    mkl.train()

    # switch to the test features and predict
    combined_kernel.init(train_feats, test_feats)
    return mkl.classify().get_labels()
class SignalSensor(object):
    """
    A collection of sensors.

    KernelMachine-based variant: per-sensor kernels are combined into one
    CombinedKernel and support-vector features into one CombinedFeatures
    object; prediction runs a single KernelMachine over the combination.
    """
    def __init__(self):
        # list of Sensor objects, one per sub-kernel
        self.sensors = list()
        # combined kernel / support-vector features, filled by from_file()
        self.kernel = CombinedKernel()
        self.svs = CombinedFeatures()
        self.svm = None
        # (left, right) extent; extreme initial values so the min/max
        # updates in from_file() always tighten it
        self.window = (+100000, -1000000)

    def from_file(self, file):
        """Parse an '%arts version: 1.0' model file: read bias, alphas and
        num_kernels, then load each sensor and assemble the KernelMachine.
        Returns None on a bad header; prints an error if parsing never
        completes."""
        sys.stderr.write('loading model file')
        l = file.readline()
        if l != '%arts version: 1.0\n':
            sys.stderr.write("\nfile not an arts definition file\n")
            return None

        bias = None
        alphas = None
        num_kernels = None

        while l:
            # skip comment or empty line
            if not (l.startswith('%') or l.startswith('\n')):
                if bias is None:
                    bias = parse_float(l, 'b')
                if alphas is None:
                    alphas = parse_vector(l, file, 'alphas')
                if num_kernels is None:
                    num_kernels = parse_int(l, 'num_kernels')

                # NOTE(review): truthiness check -- a stored bias of 0.0
                # or num_kernels of 0 would never satisfy this; confirm
                if num_kernels and bias and alphas is not None:
                    for i in xrange(num_kernels):
                        s = Sensor()
                        (k, f) = s.from_file(file, i + 1)
                        k.io.enable_progress()
                        # NOTE(review): right edge reads s.window[2], not
                        # s.window[1] -- presumably sensor windows carry a
                        # third element; verify against Sensor.from_file
                        self.window = (min(self.window[0], s.window[0]),
                                       max(self.window[1], s.window[2]))
                        self.sensors.append(s)
                        self.kernel.append_kernel(k)
                        self.svs.append_feature_obj(f)

                    self.kernel.init(self.svs, self.svs)
                    self.svm = KernelMachine(
                        self.kernel, alphas,
                        numpy.arange(len(alphas), dtype=numpy.int32), bias)
                    self.svm.io.set_target_to_stderr()
                    self.svm.io.enable_progress()
                    self.svm.parallel.set_num_threads(
                        self.svm.parallel.get_num_cpus())
                    sys.stderr.write('done\n')
                    return
            l = file.readline()

        sys.stderr.write('error loading model file\n')

    def predict(self, seq, chunk_size=int(10e6)):
        """
        Predicts on a whole contig; splits the sequence into chunks of
        size chunk_size and pads the result with -42 sentinels on both
        window flanks so the output length matches len(seq).
        """
        seq_len = len(seq)
        num_chunks = int(numpy.ceil(float(seq_len) / float(chunk_size)))
        assert (num_chunks > 0)

        sys.stderr.write("number of chunks for contig: %i\n" % (num_chunks))

        start = 0
        stop = min(chunk_size, seq_len)

        out = []

        # iterate over chunks
        for chunk_idx in range(num_chunks):
            sys.stderr.write("processing chunk #%i\n" % (chunk_idx))

            assert (start < stop)
            chunk = seq[start:stop]

            assert (len(self.sensors) > 0)

            # fresh test features per chunk, one per sensor (same order
            # as the sub-kernels appended in from_file)
            tf = CombinedFeatures()
            for i in xrange(len(self.sensors)):
                f = self.sensors[i].get_test_features(chunk, self.window)
                tf.append_feature_obj(f)

            sys.stderr.write("initialising kernel...")
            self.kernel.init(self.svs, tf)
            sys.stderr.write("..done\n")

            self.svm.set_kernel(self.kernel)
            lab_out = self.svm.apply().get_values()
            assert (len(lab_out) > 0)
            out.extend(lab_out)

            # increment chunk
            start = stop
            stop = min(stop + chunk_size, seq_len)

        # window[0] is negative, so -window[0] sentinels on the left
        l = (-self.window[0]) * [-42]
        r = self.window[1] * [-42]

        # concatenate
        ret = l + out + r

        assert (len(ret) == len(seq))

        return ret
def _train(self, train_data, param):
    """
    Training procedure using training examples and labels.

    Near-duplicate of the mask-pair-normalizer training routine: builds a
    CombinedKernel from an intra-task block-diagonal sub-kernel plus a
    full all-task-pairs sub-kernel over the same base features, then
    trains MKL or plain SVMLight depending on param.flags["mkl_q"].

    @param train_data: Data relevant to SVM training
    @type train_data: dict<str, list<instances> >
    @param param: Parameters for the training procedure
    @type param: ParameterSvm

    @return: dict mapping each task name to the shared tuple
             (task id, combined kernel, trained svm, param)
    """

    # dict to save additional information for later analysis
    self.additional_information = {}

    # merge data sets
    data = PreparedMultitaskData(train_data, shuffle=True)

    # create shogun label
    lab = shogun_factory.create_labels(data.labels)

    ########################################################
    print "creating a kernel for each node:"
    ########################################################

    # assemble combined kernel
    combined_kernel = CombinedKernel()
    combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO)

    base_features = shogun_factory.create_features(data.examples, param)
    combined_features = CombinedFeatures()

    ##################################################
    # intra-domain blocks (dirac kernel)

    # one (task_id, task_id) pair per task: the mask normalizer keeps
    # only within-task kernel entries
    intra_block_vec = PairiiVec()
    for task_id in data.get_task_ids():
        intra_block_vec.push_back(Pairii(task_id, task_id))

    # create mask-based normalizer
    normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec)
    kernel = shogun_factory.create_empty_kernel(param)
    kernel.set_normalizer(normalizer)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel)

    # append features (same base features appended once per sub-kernel)
    combined_features.append_feature_obj(base_features)

    print "------"

    ##################################################
    # all blocks (full kernel matrix)

    # every ordered (task_1, task_2) pair: mask keeps the full matrix
    all_block_vec = PairiiVec()
    for task_id_1 in data.get_task_ids():
        for task_id_2 in data.get_task_ids():
            all_block_vec.push_back(Pairii(task_id_1, task_id_2))

    # create mask-based normalizer
    normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec)
    kernel_all = shogun_factory.create_empty_kernel(param)
    kernel_all.set_normalizer(normalizer_all)

    # append current kernel to CombinedKernel
    combined_kernel.append_kernel(kernel_all)

    # append features
    combined_features.append_feature_obj(base_features)

    ##################################################
    # hack (disabled experiment: extra mask over hand-picked task groups)
    #        hack_block_vec = PairiiVec()
    #
    #        for task_id_1 in data.get_task_ids():
    #            for task_id_2 in data.get_task_ids():
    #                hack_block_vec.push_back(Pairii(task_id_1, task_id_2))
    #
    #        hack_block_vec.push_back(Pairii(data.name_to_id("B_2705"), data.name_to_id("B_4001")))
    #        other_group = ["B_0702", "B_1501", "B_5801"]
    #        for task_id_1 in other_group:
    #            for task_id_2 in other_group:
    #                hack_block_vec.push_back(Pairii(data.name_to_id(task_id_1), data.name_to_id(task_id_2)))
    #
    #
    #        # create mask-based normalizer
    #        normalizer_hack = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, hack_block_vec)
    #        kernel_hack = shogun_factory.create_empty_kernel(param)
    #        kernel_hack.set_normalizer(normalizer_hack)
    #
    #        # append current kernel to CombinedKernel
    #        combined_kernel.append_kernel(kernel_hack)
    #
    #        # append features
    #        combined_features.append_feature_obj(base_features)

    ##################################################
    # init combined kernel

    combined_kernel.init(combined_features, combined_features)

    #combined_kernel.precompute_subkernels()
    self.additional_information["mkl weights before"] = combined_kernel.get_subkernel_weights()

    print "subkernel weights:", combined_kernel.get_subkernel_weights()

    svm = None

    print "using MKL:", (param.flags["mkl_q"] >= 1.0)

    if param.flags["mkl_q"] >= 1.0:
        # mkl_q doubles as the MKL norm q
        svm = MKLClassification()
        svm.set_mkl_norm(param.flags["mkl_q"])
        svm.set_kernel(combined_kernel)
        svm.set_labels(lab)
    else:
        # create SVM (disable unsupported optimizations)
        combined_kernel.set_cache_size(500)
        svm = SVMLight(param.cost, combined_kernel, lab)

    num_threads = 8
    svm.io.enable_progress()
    svm.io.set_loglevel(shogun.Classifier.MSG_INFO)

    svm.parallel.set_num_threads(num_threads)
    # NOTE(review): presumably linadd/batch computation are incompatible
    # with the custom normalizers above -- confirm against shogun docs
    svm.set_linadd_enabled(False)
    svm.set_batch_computation_enabled(False)

    svm.set_epsilon(0.03)

    # set cost: optionally balance C by inverse class frequency
    if param.flags["normalize_cost"]:
        norm_c_pos = param.cost / float(len([l for l in data.labels if l==1]))
        norm_c_neg = param.cost / float(len([l for l in data.labels if l==-1]))
        svm.set_C(norm_c_neg, norm_c_pos)
    else:
        svm.set_C(param.cost, param.cost)

    svm.train()

    print "subkernel weights (after):", combined_kernel.get_subkernel_weights()

    ########################################################
    print "svm objective:"
    print svm.get_objective()

    self.additional_information["svm_objective"] = svm.get_objective()
    self.additional_information["svm num sv"] = svm.get_num_support_vectors()
    self.additional_information["mkl weights post-training"] = combined_kernel.get_subkernel_weights()

    ########################################################

    # wrap up predictors
    svms = {}

    # use a reference to the same svm several times
    for task_name in train_data.keys():
        svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm, param)

    return svms
def mkl_binclass_modular(fm_train_real=traindat, fm_test_real=testdat, fm_label_twoclass=label_traindat):
    """
    Train a binary MKL classifier combining a precomputed custom kernel
    with a polynomial kernel, then classify the test data.

    @param fm_train_real: training feature matrix
    @param fm_test_real: test feature matrix
    @param fm_label_twoclass: binary (+1/-1) training labels
    @return: (predicted labels, combined test kernel)
    """
    ##################################
    # set up and train

    # create some poly train/test matrix
    tfeats = RealFeatures(fm_train_real)
    tkernel = PolyKernel(10, 3)
    tkernel.init(tfeats, tfeats)
    K_train = tkernel.get_kernel_matrix()

    pfeats = RealFeatures(fm_test_real)
    tkernel.init(tfeats, pfeats)
    K_test = tkernel.get_kernel_matrix()

    # create combined train features
    feats_train = CombinedFeatures()
    feats_train.append_feature_obj(RealFeatures(fm_train_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_train))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_train)

    # train mkl
    labels = BinaryLabels(fm_label_twoclass)
    mkl = MKLClassification()

    # which norm to use for MKL
    mkl.set_mkl_norm(1)  #2,3

    # set cost (neg, pos)
    mkl.set_C(1, 1)

    # set kernel and labels
    mkl.set_kernel(kernel)
    mkl.set_labels(labels)

    # train
    mkl.train()
    #w=kernel.get_subkernel_weights()
    #kernel.set_subkernel_weights(w)

    ##################################
    # test

    # create combined test features
    feats_pred = CombinedFeatures()
    feats_pred.append_feature_obj(RealFeatures(fm_test_real))

    # and corresponding combined kernel
    kernel = CombinedKernel()
    kernel.append_kernel(CustomKernel(K_test))
    kernel.append_kernel(PolyKernel(10, 2))
    kernel.init(feats_train, feats_pred)

    # and classify -- apply() exactly once (the original called apply()
    # twice, running the full prediction twice and discarding the first)
    mkl.set_kernel(kernel)
    return mkl.apply(), kernel
def _train(self, train_data, param): """ training procedure using training examples and labels @param train_data: Data relevant to SVM training @type train_data: dict<str, list<instances> > @param param: Parameters for the training procedure @type param: ParameterSvm """ # merge data sets data = PreparedMultitaskData(train_data, shuffle=True) # create shogun label lab = shogun_factory.create_labels(data.labels) ################################################## # define pockets ################################################## pockets = [0] * 9 pockets[0] = [1, 5, 6, 7, 8, 31, 32, 33, 34] pockets[1] = [1, 2, 3, 4, 6, 7, 8, 9, 11, 21, 31] pockets[2] = [11, 20, 21, 22, 29, 31] pockets[3] = [8, 30, 31, 32] pockets[4] = [10, 11, 30] pockets[5] = [10, 11, 12, 13, 20, 29] pockets[6] = [10, 12, 20, 22, 26, 27, 28, 29] pockets[7] = [12, 14, 15, 26] pockets[8] = [13, 15, 16, 17, 18, 19, 20, 23, 24, 25, 26] #new_pockets = [] # merge neighboring pockets #for i in range(8): # new_pockets.append(list(set(pockets[i]).union(set(pockets[i+1])))) #pockets = new_pockets ######################################################## print "creating a kernel:" ######################################################## # assemble combined kernel combined_kernel = CombinedKernel() combined_kernel.io.set_loglevel(shogun.Kernel.MSG_INFO) base_features = shogun_factory.create_features(data.examples) combined_features = CombinedFeatures() ################################################## # intra-domain blocks # intra_block_vec = PairiiVec() # # for task_id in data.get_task_ids(): # intra_block_vec.push_back(Pairii(task_id, task_id)) # # # # # create mask-based normalizer # normalizer = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, intra_block_vec) # kernel = shogun_factory.create_empty_kernel(param) # kernel.set_normalizer(normalizer) # # # append current kernel to CombinedKernel # combined_kernel.append_kernel(kernel) # # # append features # 
combined_features.append_feature_obj(base_features) # # print "------" # # ################################################## # # all blocks # # # all_block_vec = PairiiVec() # # for task_id_1 in data.get_task_ids(): # for task_id_2 in data.get_task_ids(): # all_block_vec.push_back(Pairii(task_id_1, task_id_2)) # # # # create mask-based normalizer # normalizer_all = MultitaskKernelMaskPairNormalizer(data.task_vector_nums, all_block_vec) # kernel_all = shogun_factory.create_empty_kernel(param) # kernel_all.set_normalizer(normalizer_all) # # # append current kernel to CombinedKernel # combined_kernel.append_kernel(kernel_all) # # # append features # combined_features.append_feature_obj(base_features) ################################################## # add one kernel per similarity position # init seq handler pseudoseqs = SequencesHandler() for pocket in pockets: print "creating normalizer" #import pdb #pdb.set_trace() normalizer = MultitaskKernelNormalizer(data.task_vector_nums) print "processing pocket", pocket # set similarity for task_name_lhs in data.get_task_names(): for task_name_rhs in data.get_task_names(): similarity = 0.0 for pseudo_seq_pos in pocket: similarity += float( pseudoseqs.get_similarity(task_name_lhs, task_name_rhs, pseudo_seq_pos - 1)) # normalize similarity = similarity / float(len(pocket)) print "pocket %s (%s, %s) = %f" % ( str(pocket), task_name_lhs, task_name_rhs, similarity) normalizer.set_task_similarity( data.name_to_id(task_name_lhs), data.name_to_id(task_name_rhs), similarity) print "creating empty kernel" kernel_pos = shogun_factory.create_empty_kernel(param) print "setting normalizer" kernel_pos.set_normalizer(normalizer) print "appending kernel" # append current kernel to CombinedKernel combined_kernel.append_kernel(kernel_pos) print "appending features" # append features combined_features.append_feature_obj(base_features) print "done constructing combined kernel" ################################################## # init combined 
kernel # init weights # combined_kernel.set_subkernel_weights([1.0/2.85]*combined_kernel.get_num_subkernels()) combined_kernel.init(combined_features, combined_features) print "subkernel weights:", combined_kernel.get_subkernel_weights() svm = None print "using MKL:", (param.transform >= 1.0) if param.transform >= 1.0: svm = MKLClassification() svm.set_mkl_norm(param.transform) #svm.set_solver_type(ST_CPLEX) #ST_GLPK) #DIRECT) #NEWTON)#ST_CPLEX) #auto svm.set_C(param.cost, param.cost) svm.set_kernel(combined_kernel) svm.set_labels(lab) else: # create SVM (disable unsupported optimizations) combined_kernel.set_cache_size(500) svm = SVMLight(param.cost, combined_kernel, lab) # set up SVM num_threads = 8 svm.io.enable_progress() #svm.io.set_loglevel(shogun.Classifier.MSG_INFO) svm.io.set_loglevel(shogun.Classifier.MSG_DEBUG) svm.parallel.set_num_threads(num_threads) svm.set_linadd_enabled(False) svm.set_batch_computation_enabled(False) #print "WARNING: custom epsilon set" #svm.set_epsilon(0.05) # normalize cost norm_c_pos = param.cost / float(len([l for l in data.labels if l == 1])) norm_c_neg = param.cost / float( len([l for l in data.labels if l == -1])) svm.set_C(norm_c_neg, norm_c_pos) # start training svm.train() # save additional info self.additional_information["svm_objective"] = svm.get_objective() self.additional_information[ "svm num sv"] = svm.get_num_support_vectors() self.additional_information[ "post_weights"] = combined_kernel.get_subkernel_weights() print self.additional_information # wrap up predictors svms = {} # use a reference to the same svm several times for task_name in train_data.keys(): svms[task_name] = (data.name_to_id(task_name), combined_kernel, svm) return svms