def fit_and_predict(load_test_data, train_data, test_feature_matrics, train_label, test_label_OR_test_data):
    """Train an AveragedPerceptron and predict on the test feature matrix.

    If ``load_test_data`` is truthy, ``test_label_OR_test_data`` is treated as
    a pandas DataFrame of test rows: the 'question_text' column is dropped, a
    'prediction' column is inserted, the frame is written to 'submission.csv'
    and the prediction vector is returned.  Otherwise it is treated as an
    array of true labels and accuracy / F1 are printed (returns None).
    """
    features_train = RealFeatures(train_data)
    features_test = RealFeatures(test_feature_matrics)
    labels_train = BinaryLabels(train_label)

    learn_rate = 1.0
    max_iter = 1000
    perceptron = AveragedPerceptron(features_train, labels_train)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)
    perceptron.train()

    perceptron.set_features(features_test)
    labels_predict = perceptron.apply()
    # BUG FIX: the original referenced the undefined names `prediction80` and
    # `prediction` (NameError at runtime); use the predicted label vector.
    prediction = labels_predict.get_labels()

    if load_test_data:
        del test_label_OR_test_data['question_text']
        test_label_OR_test_data.insert(1, 'prediction', prediction)
        test_label_OR_test_data.to_csv('submission.csv', index=False)
        return prediction
    else:
        labels_test = BinaryLabels(test_label_OR_test_data)
        accEval = AccuracyMeasure()
        accuracy = accEval.evaluate(labels_predict, labels_test)
        f1Eval = F1Measure()
        f1_score = f1Eval.evaluate(labels_predict, labels_test)
        print('#accuracy is: ', accuracy)
        print('#F1 score is: ', f1_score)
def evaluation_thresholds(index):
    """Cross-check shogun's ROC threshold semantics on synthetic scores.

    Builds noisy scores for a balanced two-class problem, runs ROCEvaluation,
    and returns shogun's (tpr, fpr) at `index` together with the rates
    computed directly from the scores at the same threshold.
    """
    from shogun import BinaryLabels, ROCEvaluation
    import numpy

    numpy.random.seed(17)

    # weak linear signal plus uniform noise in roughly [-0.5, 0.5]
    scores = numpy.arange(-1, 1, 0.001)
    scores = 0.3 * scores + 0.7 * (numpy.random.rand(len(scores)) - 0.5)

    half = len(scores) // 2
    truth = numpy.array([-1.0] * half + [1.0] * half)

    evaluator = ROCEvaluation()
    evaluator.evaluate(BinaryLabels(scores), BinaryLabels(truth))
    fp, tp = evaluator.get_ROC()

    threshold = evaluator.get_thresholds()[index]
    manual_tpr = numpy.mean(scores[truth > 0] > threshold)
    manual_fpr = numpy.mean(scores[truth < 0] > threshold)
    return tp[index], fp[index], manual_tpr, manual_fpr
def classifier_domainadaptationsvm (fm_train_dna=traindna,fm_test_dna=testdna, \
        label_train_dna=label_traindna, \
        label_test_dna=label_testdna,fm_train_dna2=traindna2,fm_test_dna2=testdna2, \
        label_train_dna2=label_traindna2,label_test_dna2=label_testdna2,C=1,degree=3):
    """Train SVMLight on DNA strings, then train a DomainAdaptationSVM that
    regularizes against that solution, and apply it to the test strings.

    Returns the DA-SVM's binary predictions on feats_test2.

    NOTE(review): the fm_*_dna2 / label_*_dna2 / label_test_dna parameters are
    accepted but never read below; feats_train2/feats_test2 are rebuilt from
    fm_train_dna/fm_test_dna, and kernel2 is constructed on feats_train rather
    than feats_train2.  Behavior is unchanged here because both feature sets
    come from the same data, but this looks like copy-paste from the upstream
    example — confirm intent before relying on the *2 arguments.
    """
    # base SVM trained on the first-domain data
    feats_train = StringCharFeatures(fm_train_dna, DNA)
    feats_test = StringCharFeatures(fm_test_dna, DNA)
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)
    svm = SVMLight(C, kernel, labels)
    svm.train()
    #svm.io.set_loglevel(MSG_DEBUG)

    #####################################
    #print("obtaining DA SVM from previously trained SVM")

    # second feature/label/kernel set (built from the SAME data as above)
    feats_train2 = StringCharFeatures(fm_train_dna, DNA)
    feats_test2 = StringCharFeatures(fm_test_dna, DNA)
    kernel2 = WeightedDegreeStringKernel(feats_train, feats_train, degree)
    labels2 = BinaryLabels(label_train_dna)

    # we regularize against the previously obtained solution
    dasvm = DomainAdaptationSVM(C, kernel2, labels2, svm, 1.0)
    dasvm.train()

    out = dasvm.apply_binary(feats_test2)

    return out #,dasvm TODO
def modelselection_grid_search_kernel(num_subsets, num_vectors, dim_vectors):
    """Grid-search model selection over a kernel parameter tree.

    Generates random data with alternating +/-1 labels, selects the best
    parameter combination via stratified cross-validation, then re-evaluates
    the tuned LibSVM with more runs.  Returns (classifier, result, mean).
    """
    # fixed seeds so repeated runs produce identical data and folds
    Math.init_random(1)
    random.seed(1)

    # random (non-sense) feature matrix: one column per vector
    features = RealFeatures()
    features.set_feature_matrix(random.rand(dim_vectors, num_vectors))

    # two classes, labels alternating +1 / -1
    labels = BinaryLabels(num_vectors)
    for idx in range(num_vectors):
        labels.set_label(idx, 1 if idx % 2 == 0 else -1)

    classifier = LibSVM()

    # cross-validation: stratified splits, accuracy criterion, single run
    # while searching (kept cheap)
    cross = CrossValidation(classifier, features, labels,
                            StratifiedCrossValidationSplitting(labels, num_subsets),
                            ContingencyTableEvaluation(ACCURACY))
    cross.set_num_runs(1)

    # pick the best combination from the parameter tree and fix it on the svm
    grid_search = GridSearchModelSelection(cross, create_param_tree())
    best_combination = grid_search.select_model(False)
    best_combination.apply_to_machine(classifier)

    # larger number of runs for the final estimate to reduce variance
    cross.set_num_runs(10)
    result = cross.evaluate()
    casted = CrossValidationResult.obtain_from_generic(result)
    return classifier, result, casted.get_mean()
def modelselection_grid_search_kernel (num_subsets, num_vectors, dim_vectors):
    """Kernel-parameter grid search with stratified cross-validation.

    Random data, alternating binary labels; the selected parameters are
    applied to a LibSVM which is then re-scored with 10 x-val runs.
    Returns (machine, raw result, mean accuracy).
    """
    # deterministic data and splits
    Math.init_random(1)
    random.seed(1)

    data = random.rand(dim_vectors, num_vectors)
    feats = RealFeatures()
    feats.set_feature_matrix(data)

    # alternate the two classes across vectors
    labs = BinaryLabels(num_vectors)
    for j in range(num_vectors):
        labs.set_label(j, 1 if j % 2 == 0 else -1)

    machine = LibSVM()

    splitting = StratifiedCrossValidationSplitting(labs, num_subsets)
    criterion = ContingencyTableEvaluation(ACCURACY)

    xval = CrossValidation(machine, feats, labs, splitting, criterion)
    xval.set_num_runs(1)  # keep the search itself cheap

    tree = create_param_tree()
    selector = GridSearchModelSelection(xval, tree)
    best = selector.select_model(False)
    best.apply_to_machine(machine)

    # final estimate with more runs for lower variance
    xval.set_num_runs(10)
    outcome = xval.evaluate()
    stats = CrossValidationResult.obtain_from_generic(outcome)
    return machine, outcome, stats.get_mean()
def evaluation_rocevaluation(ground_truth, predicted):
    """Compute the ROC curve and area under it for binary predictions.

    Returns ((fpr, tpr) arrays from get_ROC(), auROC).
    """
    from shogun import BinaryLabels, ROCEvaluation

    truth = BinaryLabels(ground_truth)
    preds = BinaryLabels(predicted)

    roc = ROCEvaluation()
    roc.evaluate(preds, truth)
    return roc.get_ROC(), roc.get_auROC()
def evaluation_cross_validation_classification(traindat=traindat, label_traindat=label_traindat):
    """Run stratified 5-fold cross-validation (repeated 10 times) of a
    LibLinear L2-regularized L2-loss SVC.

    BUG FIX: the original computed the CrossValidationResult and silently
    discarded it; it is now returned so callers can inspect it.
    """
    from shogun import CrossValidation, CrossValidationResult
    from shogun import ContingencyTableEvaluation, ACCURACY
    from shogun import StratifiedCrossValidationSplitting
    from shogun import BinaryLabels
    from shogun import RealFeatures
    from shogun import LibLinear, L2R_L2LOSS_SVC

    # training data
    features = RealFeatures(traindat)
    labels = BinaryLabels(label_traindat)

    # classifier
    classifier = LibLinear(L2R_L2LOSS_SVC)

    # splitting strategy for 5-fold cross-validation (for classification it is
    # better to use stratified splitting; the plain CrossValidationSplitting
    # is also available)
    splitting_strategy = StratifiedCrossValidationSplitting(labels, 5)

    # evaluation method
    evaluation_criterium = ContingencyTableEvaluation(ACCURACY)

    # cross-validation instance
    cross_validation = CrossValidation(classifier, features, labels,
                                       splitting_strategy,
                                       evaluation_criterium)
    cross_validation.set_autolock(False)

    # (optional) repeat x-val 10 times
    cross_validation.set_num_runs(10)

    # perform cross-validation and return the result
    result = cross_validation.evaluate()
    return result
def classifier_svmlight(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1.2, epsilon=1e-5, num_threads=1):
    """Train SVMLight with a degree-20 weighted-degree string kernel on DNA.

    Returns the kernel (initialized on train vs. test), as in the original
    example; the predicted labels themselves are discarded.
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    train_strings = StringCharFeatures(DNA)
    train_strings.set_features(fm_train_dna)
    test_strings = StringCharFeatures(DNA)
    test_strings.set_features(fm_test_dna)

    wd_degree = 20
    kernel = WeightedDegreeStringKernel(train_strings, train_strings, wd_degree)
    targets = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, targets)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    # re-point the kernel at the test strings and apply (result unused)
    kernel.init(train_strings, test_strings)
    svm.apply().get_labels()
    return kernel
def transfer_multitask_clustered_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Three-task clustered multitask logistic regression demo.

    Stacks the training data with sin/cos transforms of itself, splits the
    vectors into three equal tasks, trains, and returns the regression output
    for task 0.
    """
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
    try:
        from shogun import MultitaskClusteredLogisticRegression
    except ImportError:
        print("MultitaskClusteredLogisticRegression not available")
        exit()

    # BUG FIX: the original read the module-level `traindat` here, silently
    # ignoring the fm_train argument; defaults preserve the old behavior.
    features = RealFeatures(hstack((fm_train, sin(fm_train), cos(fm_train))))
    labels = BinaryLabels(hstack((label_train, label_train, label_train)))

    # three contiguous, equally-sized tasks over the stacked vectors
    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 3)
    task_two = Task(n_vectors // 3, 2 * n_vectors // 3)
    task_three = Task(2 * n_vectors // 3, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)
    task_group.append_task(task_three)

    mtlr = MultitaskClusteredLogisticRegression(1.0, 100.0, features, labels,
                                                task_group, 2)
    #mtlr.io.set_loglevel(MSG_DEBUG)
    mtlr.set_tolerance(1e-3)  # use 1e-3 tolerance
    mtlr.set_max_iter(100)
    mtlr.train()
    mtlr.set_current_task(0)
    #print mtlr.get_w()
    out = mtlr.apply_regression().get_labels()
    return out
def kernel_salzberg_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, gap=0, reverse=False):
    """Salzberg word-string kernel backed by a trained PluginEstimate.

    Returns (train kernel matrix, test kernel matrix, kernel object).
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import SalzbergWordStringKernel
    from shogun import PluginEstimate

    def as_word_features(dna_strings):
        # raw char strings -> order-gram word features
        chars = StringCharFeatures(dna_strings, DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = as_word_features(fm_train_dna)
    feats_test = as_word_features(fm_test_dna)

    # plugin estimate trained on the labeled training words
    targets = BinaryLabels(label_train_dna)
    pie = PluginEstimate()
    pie.set_labels(targets)
    pie.set_features(feats_train)
    pie.train()

    kernel = SalzbergWordStringKernel(feats_train, feats_train, pie, targets)
    km_train = kernel.get_kernel_matrix()

    # switch to the test side: re-init kernel and re-apply the estimator
    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def classifier_svmocas(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, C=0.9, epsilon=1e-5, num_threads=1):
    """Linear SVM trained with the OCAS solver on CSV data (bias disabled).

    Returns (prediction object, trained svm, raw predicted labels).
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import CSVFile
    try:
        from shogun import SVMOcas
    except ImportError:
        print("SVMOcas not available")
        return

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    targets = BinaryLabels(CSVFile(label_fname))

    ocas = SVMOcas(C, train_feats, targets)
    ocas.set_epsilon(epsilon)
    ocas.parallel.set_num_threads(num_threads)
    ocas.set_bias_enabled(False)
    ocas.train()

    # queried for parity with the original example (values unused)
    ocas.get_bias()
    ocas.get_w()

    predictions = ocas.apply(test_feats)
    return predictions, ocas, predictions.get_labels()
def classifier_ssk(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, C=1, maxlen=1, decay=1):
    """LibSVM with the subsequence string kernel (SSK) on DNA strings.

    Returns the predicted labels on the test strings.
    """
    from shogun import StringCharFeatures, BinaryLabels
    from shogun import LibSVM, SubsequenceStringKernel, DNA
    from shogun import ErrorRateMeasure

    train_feats = StringCharFeatures(fm_train_dna, DNA)
    test_feats = StringCharFeatures(fm_test_dna, DNA)
    train_targets = BinaryLabels(label_train_dna)

    ssk = SubsequenceStringKernel(train_feats, train_feats, maxlen, decay)
    svm = LibSVM(C, ssk, train_targets)
    svm.train()

    # training-set error, computed for parity with the original (not returned)
    train_out = svm.apply(train_feats)
    ErrorRateMeasure().evaluate(train_out, train_targets)

    ssk.init(train_feats, test_feats)
    return svm.apply(test_feats).get_labels()
def transfer_multitask_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Two-task multitask logistic regression on duplicated training data.

    Splits the stacked vectors into two contiguous tasks, trains, and returns
    the predicted labels for task 0.
    """
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from shogun import MultitaskLogisticRegression
    except ImportError:
        print("MultitaskLogisticRegression not available")
        exit()

    # BUG FIX: the original read the module-level `traindat` here, silently
    # ignoring the fm_train argument; defaults preserve the old behavior.
    features = RealFeatures(hstack((fm_train, fm_train)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1, features, labels, task_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)    # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()
    return out
def kernel_histogram_word_string(fm_train_dna=traindat, fm_test_dna=testdat, label_train_dna=label_traindat, order=3, ppseudo_count=1, npseudo_count=1):
    """Histogram word-string kernel driven by a trained PluginEstimate.

    Returns (train kernel matrix, test kernel matrix, kernel object).
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA, BinaryLabels
    from shogun import HistogramWordStringKernel, AvgDiagKernelNormalizer
    from shogun import PluginEstimate  #, MSG_DEBUG

    def to_words(strings):
        # char strings -> order-gram word features (no gap, forward only)
        chars = StringCharFeatures(DNA)
        chars.set_features(strings)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, 0, False)
        return words

    feats_train = to_words(fm_train_dna)
    feats_test = to_words(fm_test_dna)

    pie = PluginEstimate(ppseudo_count, npseudo_count)
    targets = BinaryLabels(label_train_dna)
    pie.set_labels(targets)
    pie.set_features(feats_train)
    pie.train()

    kernel = HistogramWordStringKernel(feats_train, feats_train, pie)
    km_train = kernel.get_kernel_matrix()

    # switch to test data: re-init the kernel, re-apply the estimator
    kernel.init(feats_train, feats_test)
    pie.set_features(feats_test)
    pie.apply().get_labels()
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def classifier_featureblock_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Feature-block (group-sparse) logistic regression with two equal
    feature blocks; returns the predicted labels on the training features.
    """
    from shogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
    try:
        from shogun import FeatureBlockLogisticRegression
    except ImportError:
        print("FeatureBlockLogisticRegression not available")
        exit(0)

    # BUG FIX: the original read the module-level `traindat` here, silently
    # ignoring the fm_train argument; defaults preserve the old behavior.
    features = RealFeatures(hstack((fm_train, fm_train)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    # two contiguous blocks over the feature dimensions
    n_features = features.get_num_features()
    block_one = IndexBlock(0, n_features // 2)
    block_two = IndexBlock(n_features // 2, n_features)
    block_group = IndexBlockGroup()
    block_group.add_block(block_one)
    block_group.add_block(block_two)

    mtlr = FeatureBlockLogisticRegression(0.1, features, labels, block_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)    # use 1e-2 tolerance
    mtlr.train()
    out = mtlr.apply().get_labels()
    return out
def classifier_svmlight_linear_term(fm_train_dna=traindna, fm_test_dna=testdna,
                                    label_train_dna=label_traindna, degree=3,
                                    C=10, epsilon=1e-5, num_threads=1):
    """SVMLight with a weighted-degree string kernel and a custom linear term.

    Returns (predicted labels on the test strings, the kernel).
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel
    try:
        from shogun import SVMLight
    except ImportError:
        print("SVMLight is not available")
        exit(0)

    train_strings = StringCharFeatures(DNA)
    train_strings.set_features(fm_train_dna)
    test_strings = StringCharFeatures(DNA)
    test_strings.set_features(fm_test_dna)

    wd_kernel = WeightedDegreeStringKernel(train_strings, train_strings, degree)
    targets = BinaryLabels(label_train_dna)

    svm = SVMLight(C, wd_kernel, targets)
    svm.set_qpsize(3)
    # fixed, negated per-example linear term, as in the original example
    svm.set_linear_term(
        -numpy.array([1, 2, 3, 4, 5, 6, 7, 8, 7, 6], dtype=numpy.double))
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    wd_kernel.init(train_strings, test_strings)
    out = svm.apply().get_labels()
    return out, wd_kernel
def classifier_gpbtsvm(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, width=2.1, C=1, epsilon=1e-5):
    """GPBTSVM with a Gaussian kernel over CSV data.

    Returns (prediction object, trained svm, raw predicted labels).
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import CSVFile
    try:
        from shogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        exit(0)

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    targets = BinaryLabels(CSVFile(label_fname))

    rbf = GaussianKernel(train_feats, train_feats, width)
    svm = GPBTSVM(C, rbf, targets)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
def evaluation_contingencytableevaluation(ground_truth, predicted):
    """Evaluate binary predictions with the full set of contingency-table
    measures.

    Returns, in order: accuracy, error rate, BAL, WRACC, F1,
    cross-correlation, recall, precision, specificity.
    """
    from shogun import BinaryLabels
    from shogun import ContingencyTableEvaluation
    from shogun import AccuracyMeasure, ErrorRateMeasure, BALMeasure
    from shogun import WRACCMeasure, F1Measure, CrossCorrelationMeasure
    from shogun import RecallMeasure, PrecisionMeasure, SpecificityMeasure

    truth = BinaryLabels(ground_truth)
    preds = BinaryLabels(predicted)

    # base evaluator is run first, as in the original example
    ContingencyTableEvaluation().evaluate(preds, truth)

    # each measure class is instantiated and applied in the return order
    measure_classes = (AccuracyMeasure, ErrorRateMeasure, BALMeasure,
                       WRACCMeasure, F1Measure, CrossCorrelationMeasure,
                       RecallMeasure, PrecisionMeasure, SpecificityMeasure)
    return tuple(cls().evaluate(preds, truth) for cls in measure_classes)
def kernel_auc(train_fname=traindat, label_fname=label_traindat, width=1.7):
    """AUC kernel wrapped around a Gaussian subkernel, set up for AUC
    maximization over the labeled training data; returns the AUC kernel."""
    from shogun import GaussianKernel, AUCKernel, RealFeatures
    from shogun import BinaryLabels, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    gaussian = GaussianKernel(train_feats, train_feats, width)

    auc = AUCKernel(0, gaussian)
    auc.setup_auc_maximization(BinaryLabels(CSVFile(label_fname)))
    auc.get_kernel_matrix()  # computed for parity with the original (unused)
    return auc
def get_labels(raw=False, type='binary'):
    """Build the standard two-class target vector: NUM_EXAMPLES copies of -1
    followed by NUM_EXAMPLES copies of +1.

    With raw=True the bare numpy array is returned; otherwise it is wrapped in
    BinaryLabels ('binary') or RegressionLabels ('regression'); any other
    `type` yields None.  (`type` shadows the builtin but is kept — it is part
    of the public keyword interface.)
    """
    data = concatenate(array((-ones(NUM_EXAMPLES, dtype=double),
                              ones(NUM_EXAMPLES, dtype=double))))
    if raw:
        return data
    if type == 'binary':
        return BinaryLabels(data)
    if type == 'regression':
        return RegressionLabels(data)
    return None
def evaluation_director_contingencytableevaluation (ground_truth, predicted):
    """Demonstrate a director (Python-subclassed) contingency-table measure.

    Defines a custom score (WRACC + BAL, maximized), evaluates it on the given
    binary labels, prints both scores and returns (score, custom_score).
    Returns None if directors are not compiled in.
    """
    try:
        from shogun import DirectorContingencyTableEvaluation, ED_MAXIMIZE
    except ImportError:
        print("recompile shogun with --enable-swig-directors")
        return

    class SimpleWeightedBinaryEvaluator(DirectorContingencyTableEvaluation):
        """Custom measure: sum of WRACC and BAL, to be maximized."""

        def __init__(self):
            DirectorContingencyTableEvaluation.__init__(self)

        def get_custom_direction(self):
            return ED_MAXIMIZE

        def get_custom_score(self):
            return self.get_WRACC() + self.get_BAL()

    from shogun import BinaryLabels

    evaluator = SimpleWeightedBinaryEvaluator()
    score = evaluator.evaluate(BinaryLabels(ground_truth),
                               BinaryLabels(predicted))
    custom = evaluator.get_custom_score()
    print(score, custom)
    return score, custom
def features_from_file(fileName):
    """Parse a comma-separated data file into shogun features and labels.

    The first line is skipped (header); in each remaining row, column 1 is the
    label and columns 2+ are the feature values (column 0 is ignored —
    presumably an id; confirm against the data files).

    Returns (RealFeatures of the transposed feature matrix, the raw feature
    rows as a list of lists, BinaryLabels of the label column).
    """
    features = []
    labels = []
    # BUG FIX: close the file deterministically (it was left open) and replace
    # the removed `numpy.float` alias (AttributeError on NumPy >= 1.24) with
    # numpy.float64, which is what the alias resolved to.
    with open(fileName) as fileHandle:
        fileHandle.readline()  # skip header row
        for line in fileHandle:
            tokens = line.split(',')
            labels.append(float(tokens[1]))
            features.append([float(token) for token in tokens[2:]])
    return (RealFeatures(numpy.transpose(numpy.array(features))),
            features,
            BinaryLabels(numpy.array(labels, numpy.float64)))
def classifier_svmsgd(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, C=0.9, num_threads=1, num_iter=5):
    """SGD-trained linear SVM over CSV data.

    Returns (prediction object, trained machine, raw predicted labels).
    """
    from shogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from shogun import SVMSGD, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    targets = BinaryLabels(CSVFile(label_fname))

    sgd = SVMSGD(C, train_feats, targets)
    sgd.set_epochs(num_iter)
    #sgd.io.set_loglevel(0)
    sgd.train()

    # queried for parity with the original example (values unused)
    sgd.get_bias()
    sgd.get_w()

    predictions = sgd.apply(test_feats)
    return predictions, sgd, predictions.get_labels()
def kernel_combined_custom_poly(train_fname=traindat, test_fname=testdat, train_label_fname=label_traindat):
    """Combined kernel mixing a precomputed degree-3 poly kernel (wrapped as a
    CustomKernel) with a live degree-2 poly subkernel.

    Trains LibSVM on the training-side combined kernel, rebuilds the combined
    kernel for prediction (train rows vs. test columns), applies the svm, and
    returns (prediction-side kernel matrix, prediction-side kernel).
    """
    from shogun import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun import CombinedKernel, PolyKernel, CustomKernel
    from shogun import LibSVM, CSVFile

    # --- training-side combined kernel --------------------------------------
    train_kernel = CombinedKernel()
    train_features = CombinedFeatures()

    base_feats = RealFeatures(CSVFile(train_fname))
    poly3 = PolyKernel(10, 3)
    poly3.init(base_feats, base_feats)
    train_kernel.append_kernel(CustomKernel(poly3.get_kernel_matrix()))

    sub_train = RealFeatures(CSVFile(train_fname))
    train_features.append_feature_obj(sub_train)
    train_kernel.append_kernel(PolyKernel(10, 2))
    train_kernel.init(train_features, train_features)

    targets = BinaryLabels(CSVFile(train_label_fname))
    svm = LibSVM(1.0, train_kernel, targets)
    svm.train()

    # --- prediction-side combined kernel -------------------------------------
    pred_kernel = CombinedKernel()
    pred_features = CombinedFeatures()

    test_base = RealFeatures(CSVFile(test_fname))
    poly3 = PolyKernel(10, 3)
    poly3.init(base_feats, test_base)  # train rows vs. test columns
    pred_kernel.append_kernel(CustomKernel(poly3.get_kernel_matrix()))

    sub_test = RealFeatures(CSVFile(test_fname))
    pred_features.append_feature_obj(sub_test)
    pred_kernel.append_kernel(PolyKernel(10, 2))
    pred_kernel.init(train_features, pred_features)

    svm.set_kernel(pred_kernel)
    svm.apply()
    return pred_kernel.get_kernel_matrix(), pred_kernel
def classifier_lda(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, gamma=3, num_threads=1):
    """Regularized LDA classifier over CSV data.

    Returns (trained lda machine, predicted labels on the test features).
    """
    from shogun import RealFeatures, BinaryLabels, LDA, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    targets = BinaryLabels(CSVFile(label_fname))

    lda = LDA(gamma, train_feats, targets)
    lda.train()

    # queried for parity with the original example (values unused)
    lda.get_bias()
    lda.get_w()

    predictions = lda.apply(test_feats).get_labels()
    return lda, predictions
def classifier_svmlin(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, C=0.9, epsilon=1e-5, num_threads=1):
    """SVMLin linear SVM (bias enabled) over CSV data.

    Returns (prediction object, trained svm, raw predicted labels).
    """
    from shogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from shogun import SVMLin, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    targets = BinaryLabels(CSVFile(label_fname))

    svm = SVMLin(C, train_feats, targets)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(True)
    svm.train()

    # queried for parity with the original example (values unused)
    svm.get_bias()
    svm.get_w()

    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
def classifier_custom_kernel(C=1, dim=7):
    """Train LibSVM on a precomputed (custom) kernel built from random data.

    Returns (trained svm, predicted labels on the training kernel).
    """
    from shogun import RealFeatures, BinaryLabels, CustomKernel, LibSVM
    from numpy import diag, ones, sign
    from numpy.random import rand, seed

    seed((C, dim))  # deterministic data per (C, dim)
    lab = sign(2 * rand(dim) - 1)
    data = rand(dim, dim)
    # BUG FIX: the symmetrized matrix below was computed but never used — the
    # raw (non-symmetric) `data` was fed to the kernel.  A custom kernel
    # matrix must be symmetric, so use symdata (elementwise data * data.T is
    # symmetric, plus ones on the diagonal).
    symdata = data * data.T + diag(ones(dim))

    kernel = CustomKernel()
    kernel.set_full_kernel_matrix_from_full(symdata)
    labels = BinaryLabels(lab)

    svm = LibSVM(C, kernel, labels)
    svm.train()
    predictions = svm.apply()
    out = svm.apply().get_labels()
    return svm, out
def runShogunSVMDNASpectrumKernel(train_xt, train_lt, test_xt):
    """Run an SVM with a spectrum (comm-word string) kernel on DNA strings.

    Uses the module-level K (k-mer length), GAP and SVMC globals.  Returns
    (train labels, test labels, train decision values, test decision values).

    BUG FIX: the original used Python-2 `print` statements, which are syntax
    errors under Python 3; they are now print() calls.
    """
    # set up k-mer word features with sorted-word preprocessing
    charfeat_train = StringCharFeatures(train_xt, DNA)
    feats_train = StringWordFeatures(DNA)
    feats_train.obtain_from_char(charfeat_train, K-1, K, GAP, False)
    preproc = SortWordString()
    preproc.init(feats_train)
    feats_train.add_preprocessor(preproc)
    feats_train.apply_preprocessor()

    # same transformation for the test strings, reusing the fitted preproc
    charfeat_test = StringCharFeatures(test_xt, DNA)
    feats_test = StringWordFeatures(DNA)
    feats_test.obtain_from_char(charfeat_test, K-1, K, GAP, False)
    feats_test.add_preprocessor(preproc)
    feats_test.apply_preprocessor()

    kernel = CommWordStringKernel(feats_train, feats_train, False)
    kernel.io.set_loglevel(MSG_DEBUG)

    labels = BinaryLabels(train_lt)

    # run svm model
    print("Ready to train!")
    svm = LibSVM(SVMC, kernel, labels)
    svm.io.set_loglevel(MSG_DEBUG)
    svm.train()

    # predictions
    print("Making predictions!")
    out1DecisionValues = svm.apply(feats_train)
    out1 = out1DecisionValues.get_labels()
    kernel.init(feats_train, feats_test)
    out2DecisionValues = svm.apply(feats_test)
    out2 = out2DecisionValues.get_labels()

    return out1, out2, out1DecisionValues, out2DecisionValues
def classifier_svmlight_batch_linadd(fm_train_dna, fm_test_dna, label_train_dna, degree, C, epsilon, num_threads):
    """SVMLight with a weighted-degree string kernel; applies the trained svm
    first with batch computation / linadd disabled, then with batch
    computation re-enabled.

    Returns (labels from the batch-enabled apply, trained svm).
    """
    from shogun import StringCharFeatures, BinaryLabels, DNA
    from shogun import WeightedDegreeStringKernel, MSG_DEBUG
    try:
        from shogun import SVMLight
    except ImportError:
        print('No support for SVMLight available.')
        return

    feats_train = StringCharFeatures(DNA)
    #feats_train.io.set_loglevel(MSG_DEBUG)
    feats_train.set_features(fm_train_dna)
    feats_test = StringCharFeatures(DNA)
    feats_test.set_features(fm_test_dna)

    # BUG FIX: `degree` was unconditionally overwritten with 20, making the
    # parameter dead; the caller-supplied degree is now honored.
    kernel = WeightedDegreeStringKernel(feats_train, feats_train, degree)

    labels = BinaryLabels(label_train_dna)

    svm = SVMLight(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train()

    kernel.init(feats_train, feats_test)

    #print('SVMLight Objective: %f num_sv: %d' % \)
    #	(svm.get_objective(), svm.get_num_support_vectors())

    # first pass: plain evaluation (no batch computation, no linadd)
    svm.set_batch_computation_enabled(False)
    svm.set_linadd_enabled(False)
    svm.apply().get_labels()

    # second pass: batch computation re-enabled
    svm.set_batch_computation_enabled(True)
    labels = svm.apply().get_labels()
    return labels, svm
def classifier_mpdsvm(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, C=1, epsilon=1e-5):
    """MPDSVM with a Gaussian kernel (fixed width 2.1) over CSV data.

    Returns (prediction object, trained svm, raw predicted labels).
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import MPDSVM, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    targets = BinaryLabels(CSVFile(label_fname))

    kernel_width = 2.1
    rbf = GaussianKernel(train_feats, train_feats, kernel_width)

    svm = MPDSVM(C, rbf, targets)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
def classifier_perceptron(n=100, dim=2, distance=5, learn_rate=1., max_iter=1000, num_threads=1, seed=1):
    """Train a Perceptron on two well-separated Gaussian blobs.

    Generates n positive and n negative training vectors (plus a test set of
    the same shape), trains, and returns (perceptron, test output labels).
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import Perceptron

    random.seed(seed)

    # two Gaussian clouds pushed apart so the data is (almost surely)
    # linearly separable; draw order matches the original for reproducibility
    pos_train = array(random.randn(dim, n)) + distance
    neg_train = array(random.randn(dim, n)) - distance
    pos_test = array(random.randn(dim, n)) + distance
    neg_test = array(random.randn(dim, n)) - distance

    train_targets = hstack((ones(n), -ones(n)))

    #plot(pos_train[0,:], pos_train[1,:], 'x', neg_train[0,:], neg_train[1,:], 'o')
    feats_train = RealFeatures(hstack((pos_train, neg_train)))
    feats_test = RealFeatures(hstack((pos_test, neg_test)))
    labels = BinaryLabels(train_targets)

    perceptron = Perceptron(feats_train, labels)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)

    # only guaranteed to converge for separable data
    perceptron.train()

    perceptron.set_features(feats_test)
    out_labels = perceptron.apply().get_labels()
    return perceptron, out_labels