def classifier_svmocas(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, C=0.9, epsilon=1e-5, num_threads=1):
    """Train an SVMOcas classifier on CSV data and apply it to the test set.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        label_fname: CSV file holding the binary training labels.
        C: first argument of the ``SVMOcas`` constructor.
        epsilon: value handed to ``set_epsilon``.
        num_threads: value handed to ``parallel.set_num_threads``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``, or ``None`` when
        ``SVMOcas`` cannot be imported from shogun.
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import CSVFile

    try:
        from shogun import SVMOcas
    except ImportError:
        print("SVMOcas not available")
        return

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMOcas(C, train_data, train_labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    # The accessors are called as in the original example; their results are
    # not part of the return value.
    svm.get_bias()
    svm.get_w()

    predictions = svm.apply(test_data)
    return predictions, svm, predictions.get_labels()
def distance_manhattenword(train_fname=traindna, test_fname=testdna, order=3, gap=0, reverse=False):
    """Compute ManhattanWordDistance matrices on word features built from DNA strings.

    Args:
        train_fname: CSV file with the training strings.
        test_fname: CSV file with the test strings.
        order: word order; ``obtain_from_char`` receives ``order - 1`` and ``order``.
        gap: gap argument forwarded to ``obtain_from_char``.
        reverse: reverse flag forwarded to ``obtain_from_char``.

    Returns:
        ``(dm_train, dm_test)``: the distance matrix on (train, train) and the
        one obtained after re-initialising the distance on (train, test).
    """
    from shogun import StringCharFeatures, StringWordFeatures, DNA
    from shogun import SortWordString, ManhattanWordDistance, CSVFile

    def _word_features(fname):
        # Load the character strings and convert them into word features.
        chars = StringCharFeatures(CSVFile(fname), DNA)
        words = StringWordFeatures(chars.get_alphabet())
        words.obtain_from_char(chars, order - 1, order, gap, reverse)
        return words

    feats_train = _word_features(train_fname)

    # The preprocessor is initialised on the training features only and then
    # reused unchanged for the test features.
    sorter = SortWordString()
    sorter.init(feats_train)
    feats_train.add_preprocessor(sorter)
    feats_train.apply_preprocessor()

    feats_test = _word_features(test_fname)
    feats_test.add_preprocessor(sorter)
    feats_test.apply_preprocessor()

    distance = ManhattanWordDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()

    return dm_train, dm_test
def features_dense_io(fname="../data/fm_train_real.dat", delimiter=" "):
    """Load dense real-valued features from a delimited text file.

    The path and delimiter used to be hard-coded; they are now parameters
    whose defaults reproduce the previous behaviour exactly, so existing
    zero-argument calls are unaffected.

    Args:
        fname: path of the file to read (opened in ``"r"`` mode).
        delimiter: field delimiter handed to ``CSVFile.set_delimiter``.

    Returns:
        The ``RealFeatures`` object after ``load`` has been called on it.
    """
    from shogun import RealFeatures, CSVFile

    feats = RealFeatures()
    source = CSVFile(fname, "r")
    source.set_delimiter(delimiter)
    feats.load(source)
    return feats
def classifier_gpbtsvm(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, width=2.1, C=1, epsilon=1e-5):
    """Train a GPBTSVM with a Gaussian kernel and apply it to the test set.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        label_fname: CSV file holding the binary training labels.
        width: third argument of the ``GaussianKernel`` constructor.
        C: first argument of the ``GPBTSVM`` constructor.
        epsilon: value handed to ``set_epsilon``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``, or ``None`` when
        ``GPBTSVM`` cannot be imported from shogun.
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import CSVFile

    try:
        from shogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        # Return instead of calling exit(0): terminating the interpreter from
        # inside a helper takes the caller down with it, and the `exit`
        # builtin is only installed by the `site` module. This matches how
        # the sibling examples handle a missing optional class.
        return

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
def metric_lmnn(train_fname=traindat, test_fname=testdat, label_train_fname=label_traindat, k=3):
    """Learn an LMNN distance and classify the test set with KNN on top of it.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        label_train_fname: CSV file holding the multiclass training labels.
        k: passed both to the ``LMNN`` constructor and to the ``KNN`` constructor.

    Returns:
        ``(lmnn, output)`` where ``output`` is the label vector predicted for
        the test features, or ``None`` when the shogun imports fail.
    """
    try:
        from shogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return

    # Wrap the raw files into Shogun feature/label objects.
    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))
    train_labels = MulticlassLabels(CSVFile(label_train_fname))

    # Metric learning step.
    lmnn = LMNN(train_data, train_labels, k)
    lmnn.train()
    learned_distance = lmnn.get_distance()

    # Nearest-neighbour classification with the learned distance.
    knn = KNN(k, learned_distance, train_labels)
    knn.train()
    predicted = knn.apply(test_data)
    output = predicted.get_labels()

    return lmnn, output
def features_dense_io(fname="../data/fm_train_real.dat", delimiter=" "):
    """Load dense real-valued features from a delimited text file.

    The path and delimiter used to be hard-coded; they are now parameters
    whose defaults reproduce the previous behaviour exactly, so existing
    zero-argument calls are unaffected.

    Args:
        fname: path of the file to read (opened in ``"r"`` mode).
        delimiter: field delimiter handed to ``CSVFile.set_delimiter``.

    Returns:
        The ``RealFeatures`` object after ``load`` has been called on it.
    """
    from shogun import RealFeatures, CSVFile

    feats = RealFeatures()
    reader = CSVFile(fname, "r")
    reader.set_delimiter(delimiter)
    feats.load(reader)
    return feats
def labels_io(fname="../data/label_train_regression.dat", delimiter=" "):
    """Load regression labels from a delimited text file.

    The path and delimiter used to be hard-coded; they are now parameters
    whose defaults reproduce the previous behaviour exactly, so existing
    zero-argument calls are unaffected.

    Args:
        fname: path of the label file to read (opened in ``"r"`` mode).
        delimiter: field delimiter handed to ``CSVFile.set_delimiter``.

    Returns:
        The ``RegressionLabels`` object after ``load`` has been called on it.
    """
    from shogun import RegressionLabels, CSVFile

    lab = RegressionLabels()
    reader = CSVFile(fname, "r")
    reader.set_delimiter(delimiter)
    lab.load(reader)
    return lab
def kernel_auc(train_fname=traindat, label_fname=label_traindat, width=1.7):
    """Build an AUCKernel on top of a Gaussian sub-kernel.

    Args:
        train_fname: CSV file holding the training features.
        label_fname: CSV file holding the binary labels used by
            ``setup_auc_maximization``.
        width: third argument of the ``GaussianKernel`` constructor.

    Returns:
        The ``AUCKernel`` instance. Its kernel matrix is computed once before
        returning, but the matrix itself is not returned.
    """
    from shogun import GaussianKernel, AUCKernel, RealFeatures
    from shogun import BinaryLabels, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    gaussian = GaussianKernel(train_data, train_data, width)

    kernel = AUCKernel(0, gaussian)
    auc_labels = BinaryLabels(CSVFile(label_fname))
    kernel.setup_auc_maximization(auc_labels)

    # Matrix is evaluated as in the original example; the result is discarded.
    kernel.get_kernel_matrix()
    return kernel
def distance_chisquare(train_fname=traindat, test_fname=testdat):
    """Compute ChiSquareDistance matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import RealFeatures, ChiSquareDistance, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    chi_square = ChiSquareDistance(train_data, train_data)
    train_matrix = chi_square.get_distance_matrix()

    chi_square.init(train_data, test_data)
    test_matrix = chi_square.get_distance_matrix()

    return chi_square, train_matrix, test_matrix
def kernel_cauchy(train_fname=traindat, test_fname=testdat, sigma=1.0):
    """Compute CauchyKernel matrices backed by a Euclidean distance.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        sigma: third argument of the ``CauchyKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    euclidean = EuclideanDistance(train_data, train_data)
    cauchy = CauchyKernel(train_data, train_data, sigma, euclidean)
    train_matrix = cauchy.get_kernel_matrix()

    cauchy.init(train_data, test_data)
    test_matrix = cauchy.get_kernel_matrix()

    return train_matrix, test_matrix, cauchy
def distance_canberra(train_fname=traindat, test_fname=testdat):
    """Compute CanberraMetric matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import RealFeatures, CanberraMetric, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    canberra = CanberraMetric(train_data, train_data)
    train_matrix = canberra.get_distance_matrix()

    canberra.init(train_data, test_data)
    test_matrix = canberra.get_distance_matrix()

    return canberra, train_matrix, test_matrix
def kernel_gaussian(train_fname=traindat, test_fname=testdat, width=1.3):
    """Compute GaussianKernel matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        width: third argument of the ``GaussianKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, GaussianKernel, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    gaussian = GaussianKernel(train_data, train_data, width)
    train_matrix = gaussian.get_kernel_matrix()

    gaussian.init(train_data, test_data)
    test_matrix = gaussian.get_kernel_matrix()

    return train_matrix, test_matrix, gaussian
def kernel_linear_byte(train_fname=traindat, test_fname=testdat):
    """Evaluate a LinearKernel on byte features.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.

    Returns:
        The ``LinearKernel`` only. The (train, train) and (train, test)
        matrices are both computed but not returned; the kernel is left
        initialised on (train, test).
    """
    from shogun import LinearKernel, ByteFeatures, CSVFile

    train_data = ByteFeatures(CSVFile(train_fname))
    test_data = ByteFeatures(CSVFile(test_fname))

    linear = LinearKernel(train_data, train_data)
    linear.get_kernel_matrix()

    linear.init(train_data, test_data)
    linear.get_kernel_matrix()

    return linear
def kernel_sigmoid(train_fname=traindat, test_fname=testdat, size_cache=10, gamma=1.2, coef0=1.3):
    """Compute SigmoidKernel matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        size_cache: third argument of the ``SigmoidKernel`` constructor.
        gamma: fourth argument of the ``SigmoidKernel`` constructor.
        coef0: fifth argument of the ``SigmoidKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, SigmoidKernel, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    sigmoid = SigmoidKernel(train_data, train_data, size_cache, gamma, coef0)
    train_matrix = sigmoid.get_kernel_matrix()

    sigmoid.init(train_data, test_data)
    test_matrix = sigmoid.get_kernel_matrix()

    return train_matrix, test_matrix, sigmoid
def distance_minkowski(train_fname=traindat, test_fname=testdat, k=3):
    """Compute MinkowskiMetric matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        k: third argument of the ``MinkowskiMetric`` constructor.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import RealFeatures, MinkowskiMetric, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    minkowski = MinkowskiMetric(train_data, train_data, k)
    train_matrix = minkowski.get_distance_matrix()

    minkowski.init(train_data, test_data)
    test_matrix = minkowski.get_distance_matrix()

    return minkowski, train_matrix, test_matrix
def kernel_distance(train_fname=traindat, test_fname=testdat, width=1.7):
    """Compute DistanceKernel matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        width: third argument of the ``DistanceKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, DistanceKernel, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance()

    # Build the kernel on (train, train) first. It used to be constructed on
    # (train, test), which made km_train a second copy of the train-vs-test
    # matrix instead of the training matrix every sibling example returns.
    kernel = DistanceKernel(feats_train, feats_train, width, distance)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_gaussian_shift(train_fname=traindat, test_fname=testdat, width=1.8, max_shift=2, shift_step=1):
    """Compute GaussianShiftKernel matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        width: third argument of the ``GaussianShiftKernel`` constructor.
        max_shift: fourth argument of the ``GaussianShiftKernel`` constructor.
        shift_step: fifth argument of the ``GaussianShiftKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, GaussianShiftKernel, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    shifted = GaussianShiftKernel(train_data, train_data, width, max_shift, shift_step)
    train_matrix = shifted.get_kernel_matrix()

    shifted.init(train_data, test_data)
    test_matrix = shifted.get_kernel_matrix()

    return train_matrix, test_matrix, shifted
def kernel_exponential(train_fname=traindat, test_fname=testdat, tau_coef=1.0):
    """Compute ExponentialKernel matrices backed by a Euclidean distance.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        tau_coef: third argument of the ``ExponentialKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, ExponentialKernel, EuclideanDistance, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    euclidean = EuclideanDistance(train_data, train_data)

    # The trailing literal 10 is kept from the original call; presumably a
    # cache size -- TODO confirm against the ExponentialKernel signature.
    exponential = ExponentialKernel(train_data, train_data, tau_coef, euclidean, 10)
    train_matrix = exponential.get_kernel_matrix()

    exponential.init(train_data, test_data)
    test_matrix = exponential.get_kernel_matrix()

    return train_matrix, test_matrix, exponential
def classifier_libsvmoneclass(train_fname=traindat, test_fname=testdat, width=2.1, C=1, epsilon=1e-5):
    """Train a LibSVMOneClass model with a Gaussian kernel and apply it.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        width: third argument of the ``GaussianKernel`` constructor.
        C: first argument of the ``LibSVMOneClass`` constructor.
        epsilon: value handed to ``set_epsilon``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``.
    """
    from shogun import RealFeatures, GaussianKernel, LibSVMOneClass, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    gaussian = GaussianKernel(train_data, train_data, width)

    svm = LibSVMOneClass(C, gaussian)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(test_data)
    predicted_labels = predictions.get_labels()
    return predictions, svm, predicted_labels
def kernel_rationalquadratic(train_fname=traindat, test_fname=testdat, shift_coef=1.0):
    """Compute RationalQuadraticKernel matrices backed by a Euclidean distance.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        shift_coef: third argument of the ``RationalQuadraticKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, RationalQuadraticKernel, EuclideanDistance, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    euclidean = EuclideanDistance(train_data, train_data)
    rational_quadratic = RationalQuadraticKernel(train_data, train_data, shift_coef, euclidean)
    train_matrix = rational_quadratic.get_kernel_matrix()

    rational_quadratic.init(train_data, test_data)
    test_matrix = rational_quadratic.get_kernel_matrix()

    return train_matrix, test_matrix, rational_quadratic
def kernel_power(train_fname=traindat, test_fname=testdat, degree=2.0):
    """Compute PowerKernel matrices backed by a Euclidean distance.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        degree: third argument of the ``PowerKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, PowerKernel, EuclideanDistance, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    euclidean = EuclideanDistance(train_data, train_data)
    power = PowerKernel(train_data, train_data, degree, euclidean)
    train_matrix = power.get_kernel_matrix()

    power.init(train_data, test_data)
    test_matrix = power.get_kernel_matrix()

    return train_matrix, test_matrix, power
def distance_normsquared(train_fname=traindat, test_fname=testdat):
    """Compute EuclideanDistance matrices with the square root disabled.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import RealFeatures, EuclideanDistance, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    euclidean = EuclideanDistance(train_data, train_data)
    # Switch the distance to its "no sqrt" mode before any matrix is computed.
    euclidean.set_disable_sqrt(True)
    train_matrix = euclidean.get_distance_matrix()

    euclidean.init(train_data, test_data)
    test_matrix = euclidean.get_distance_matrix()

    return euclidean, train_matrix, test_matrix
def kernel_chi2(train_fname=traindat, test_fname=testdat, width=1.4, size_cache=10):
    """Compute Chi2Kernel matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        width: third argument of the ``Chi2Kernel`` constructor.
        size_cache: fourth argument of the ``Chi2Kernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    # NormOne is imported but never used in this function; the import is kept
    # so that import-time behaviour stays exactly as before.
    from shogun import RealFeatures, Chi2Kernel, CSVFile, NormOne

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    chi2 = Chi2Kernel(train_data, train_data, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    chi2.init(train_data, test_data)
    test_matrix = chi2.get_kernel_matrix()

    return train_matrix, test_matrix, chi2
def kernel_anova(train_fname=traindat, test_fname=testdat, cardinality=2, size_cache=10):
    """Compute ANOVAKernel matrices for (train, train) and (train, test).

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        cardinality: third argument of the ``ANOVAKernel`` constructor.
        size_cache: fourth argument of the ``ANOVAKernel`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import ANOVAKernel, RealFeatures, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    anova = ANOVAKernel(train_data, train_data, cardinality, size_cache)
    train_matrix = anova.get_kernel_matrix()

    anova.init(train_data, test_data)
    test_matrix = anova.get_kernel_matrix()

    return train_matrix, test_matrix, anova
def kernel_linear(train_fname=traindat, test_fname=testdat, scale=1.2):
    """Compute LinearKernel matrices using an AvgDiagKernelNormalizer.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        scale: argument of the ``AvgDiagKernelNormalizer`` constructor.

    Returns:
        ``(km_train, km_test, kernel)``; the kernel is left initialised on
        (train, test).
    """
    from shogun import RealFeatures, LinearKernel, AvgDiagKernelNormalizer, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))

    # The normalizer is attached before the kernel sees any features.
    linear = LinearKernel()
    normalizer = AvgDiagKernelNormalizer(scale)
    linear.set_normalizer(normalizer)

    linear.init(train_data, train_data)
    train_matrix = linear.get_kernel_matrix()

    linear.init(train_data, test_data)
    test_matrix = linear.get_kernel_matrix()

    return train_matrix, test_matrix, linear
def converter_multidimensionalscaling(data_fname):
    """Embed features with MultidimensionalScaling and compare distance matrices.

    The Euclidean distance matrix of the original features is compared with
    the one of the 2-dimensional embedding via the relative norm of their
    difference.

    Args:
        data_fname: CSV file holding the features to embed.

    Returns:
        The result of ``relative_error < 1e-6``, or ``None`` (after printing
        a message) when an ``ImportError`` is raised anywhere in the body.
    """
    try:
        import numpy
        from shogun import RealFeatures, MultidimensionalScaling, EuclideanDistance, CSVFile

        features = RealFeatures(CSVFile(data_fname))

        # Distance on the input features is initialised before the embedding.
        original_distance = EuclideanDistance()
        original_distance.init(features, features)

        mds = MultidimensionalScaling()
        mds.set_target_dim(2)
        mds.set_landmark(False)
        embedding = mds.apply(features)

        embedded_distance = EuclideanDistance()
        embedded_distance.init(embedding, embedding)

        matrix_after = embedded_distance.get_distance_matrix()
        matrix_before = original_distance.get_distance_matrix()

        deviation = numpy.linalg.norm(matrix_after - matrix_before)
        reference = numpy.linalg.norm(matrix_before)
        return deviation / reference < 1e-6
    except ImportError:
        print('No Eigen3 available')
def features_string_file(directory, fname):
    """Load RAWBYTE string features from a directory and then from a file.

    Args:
        directory: directory handed to ``load_from_directory``.
        fname: file wrapped in a ``CSVFile`` and handed to ``load``
            (one string per line, according to the original example notes).

    Returns:
        ``(f.get_features(), f)`` where ``f`` is the ``StringCharFeatures``
        object after both load calls.
    """
    from shogun import StringCharFeatures, RAWBYTE
    from shogun import CSVFile

    string_feats = StringCharFeatures(RAWBYTE)

    # First source: every file in the directory.
    string_feats.load_from_directory(directory)

    # Second source: a single file. The same object is reused, so presumably
    # this replaces what was loaded from the directory -- TODO confirm.
    string_feats.load(CSVFile(fname))

    # A FASTA file could be read with load_fasta() instead (not done here).
    return string_feats.get_features(), string_feats
def multiclass_c45classifiertree(train=traindat, test=testdat, labels=label_traindat, ft=feattypes):
    """Train, prune and apply a C4.5 classifier tree.

    The training data is randomly split into a validation part (first third
    of a random permutation) used for pruning and a training part (the rest)
    used to grow the tree.

    Args:
        train: CSV file holding the training features.
        test: CSV file holding the test features.
        labels: CSV file holding the multiclass training labels.
        ft: feature types handed to ``set_feature_types``.

    Returns:
        ``(c, output, output_certainty)``: the tree, the predicted test
        labels and the vector returned by ``get_certainty_vector``; or
        ``None`` when the shogun/numpy imports fail.
    """
    try:
        from shogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # Divide the training data into validation (1/3) and training (2/3)
    # subsets. Slices start at 0 and meet at n_validation so that every
    # sample lands in exactly one subset; the earlier `subset[1:k]` /
    # `subset[1 + k:]` slicing dropped the samples at positions 0 and k.
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    n_validation = subset.size // 3
    vsubset = subset[:n_validation]
    trsubset = subset[n_validation:]

    # C4.5 tree formation using the training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune the tree using the validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)

    c.prune_tree(feats_train, train_labels)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # classify the test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
def classifier_svmsgd(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, C=0.9, num_threads=1, num_iter=5):
    """Train an SVMSGD classifier on CSV data and apply it to the test set.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        label_fname: CSV file holding the binary training labels.
        C: first argument of the ``SVMSGD`` constructor.
        num_threads: accepted for interface compatibility; not used in the body.
        num_iter: value handed to ``set_epochs``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``.
    """
    # SparseRealFeatures is imported but not used; the import is kept so the
    # function fails in the same way as before if the class is unavailable.
    from shogun import RealFeatures, SparseRealFeatures, BinaryLabels
    from shogun import SVMSGD, CSVFile

    train_data = RealFeatures(CSVFile(train_fname))
    test_data = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMSGD(C, train_data, train_labels)
    svm.set_epochs(num_iter)
    svm.train()

    # The accessors are called as in the original example; their results are
    # not part of the return value.
    svm.get_bias()
    svm.get_w()

    predictions = svm.apply(test_data)
    return predictions, svm, predictions.get_labels()
def distance_sparseeuclidean(train_fname=traindat, test_fname=testdat):
    """Compute SparseEuclideanDistance matrices on sparse copies of dense data.

    Args:
        train_fname: CSV file holding the (dense) training features.
        test_fname: CSV file holding the (dense) test features.

    Returns:
        ``(distance, dm_train, dm_test)``; the distance object is left
        initialised on (train, test).
    """
    from shogun import RealFeatures, SparseRealFeatures, SparseEuclideanDistance, CSVFile

    def _sparse_from_csv(fname):
        # Read dense features and convert them via obtain_from_simple.
        dense = RealFeatures(CSVFile(fname))
        sparse = SparseRealFeatures()
        sparse.obtain_from_simple(dense)
        return sparse

    sparse_train = _sparse_from_csv(train_fname)
    sparse_test = _sparse_from_csv(test_fname)

    sparse_euclidean = SparseEuclideanDistance(sparse_train, sparse_train)
    train_matrix = sparse_euclidean.get_distance_matrix()

    sparse_euclidean.init(sparse_train, sparse_test)
    test_matrix = sparse_euclidean.get_distance_matrix()

    return sparse_euclidean, train_matrix, test_matrix
def kernel_combined_custom_poly(train_fname=traindat, test_fname=testdat, train_label_fname=label_traindat):
    """Train LibSVM on a CombinedKernel (custom + poly) and apply it to test data.

    Two combined kernels are built the same way, one for training and one for
    prediction: a ``CustomKernel`` wrapping a precomputed ``PolyKernel(10, 3)``
    matrix, followed by a ``PolyKernel(10, 2)`` sub-kernel.

    Args:
        train_fname: CSV file holding the training features.
        test_fname: CSV file holding the test features.
        train_label_fname: CSV file holding the binary training labels.

    Returns:
        ``(km_train, kernel)`` where ``kernel`` is the second (prediction)
        combined kernel, initialised on (train, test), and ``km_train`` is the
        matrix obtained from that kernel.
    """
    from shogun import CombinedFeatures, RealFeatures, BinaryLabels
    from shogun import CombinedKernel, PolyKernel, CustomKernel
    from shogun import LibSVM, CSVFile

    # ---- training stage -------------------------------------------------
    train_kernel = CombinedKernel()
    feats_train = CombinedFeatures()

    # Precompute a degree-3 poly kernel on the training data and wrap it.
    tfeats = RealFeatures(CSVFile(train_fname))
    precomputed_train = PolyKernel(10, 3)
    precomputed_train.init(tfeats, tfeats)
    train_kernel.append_kernel(CustomKernel(precomputed_train.get_kernel_matrix()))

    # Only the poly sub-kernel gets a feature object appended.
    feats_train.append_feature_obj(RealFeatures(CSVFile(train_fname)))
    train_kernel.append_kernel(PolyKernel(10, 2))
    train_kernel.init(feats_train, feats_train)

    labels = BinaryLabels(CSVFile(train_label_fname))
    svm = LibSVM(1.0, train_kernel, labels)
    svm.train()

    # ---- prediction stage -----------------------------------------------
    pred_kernel = CombinedKernel()
    feats_pred = CombinedFeatures()

    # Precompute the degree-3 poly kernel between training and test data.
    pfeats = RealFeatures(CSVFile(test_fname))
    precomputed_pred = PolyKernel(10, 3)
    precomputed_pred.init(tfeats, pfeats)
    pred_kernel.append_kernel(CustomKernel(precomputed_pred.get_kernel_matrix()))

    feats_pred.append_feature_obj(RealFeatures(CSVFile(test_fname)))
    pred_kernel.append_kernel(PolyKernel(10, 2))
    pred_kernel.init(feats_train, feats_pred)

    svm.set_kernel(pred_kernel)
    svm.apply()

    km_train = pred_kernel.get_kernel_matrix()
    return km_train, pred_kernel