def features_dense_io_modular():
    """Load dense real-valued features from a space-delimited file.

    Opens ``../data/fm_train_real.dat`` for reading through a ``CSVFile``,
    sets the delimiter to a single space and loads the content into an
    initially empty ``RealFeatures`` object.

    Returns:
        The ``RealFeatures`` object after ``load`` has been called on it.
    """
    from modshogun import RealFeatures, CSVFile

    dense = RealFeatures()

    source = CSVFile("../data/fm_train_real.dat", "r")
    source.set_delimiter(" ")

    dense.load(source)
    return dense
def classifier_gpbtsvm_modular(train_fname=traindat, test_fname=testdat,
                               label_fname=label_traindat, width=2.1, C=1,
                               epsilon=1e-5):
    """Train a GPBTSVM on a Gaussian kernel and classify the test features.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        label_fname: Path given to ``CSVFile`` for the binary training labels.
        width: Third argument of the ``GaussianKernel`` constructor.
        C: First argument of the ``GPBTSVM`` constructor.
        epsilon: Value passed to ``svm.set_epsilon``.

    Returns:
        A tuple ``(predictions, svm, labels)`` where ``predictions`` is the
        result of ``svm.apply`` on the test features and ``labels`` is
        ``predictions.get_labels()``.  Returns ``None`` when ``GPBTSVM``
        cannot be imported from ``modshogun``.
    """
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import GaussianKernel
    from modshogun import CSVFile

    try:
        from modshogun import GPBTSVM
    except ImportError:
        # Return instead of calling exit(0): terminating the interpreter from
        # inside a library function would also abort whatever is calling us,
        # and ``exit`` is only injected by the ``site`` module.
        print("GPBTSVM not available")
        return None

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
def distance_manhattenword_modular(train_fname=traindna, test_fname=testdna,
                                   order=3, gap=0, reverse=False):
    """Compute ManhattanWordDistance matrices on word features built from DNA.

    Both files are read as ``StringCharFeatures`` over the ``DNA`` alphabet,
    converted to ``StringWordFeatures`` via ``obtain_from_char`` and run
    through one shared ``SortWordString`` preprocessor that is initialised on
    the training features only.

    Args:
        train_fname: Path given to ``CSVFile`` for the training strings.
        test_fname: Path given to ``CSVFile`` for the test strings.
        order: Passed to ``obtain_from_char`` as ``order - 1`` and ``order``.
        gap: Passed through to ``obtain_from_char``.
        reverse: Passed through to ``obtain_from_char``.

    Returns:
        A tuple ``(dm_train, dm_test)``: the distance matrix obtained for
        (train, train) and the one obtained after re-initialising the
        distance with (train, test).
    """
    from modshogun import StringCharFeatures, StringWordFeatures, DNA
    from modshogun import SortWordString, ManhattanWordDistance, CSVFile

    # Training side: chars -> words, then sort with a freshly initialised
    # preprocessor.
    chars_train = StringCharFeatures(CSVFile(train_fname), DNA)
    words_train = StringWordFeatures(chars_train.get_alphabet())
    words_train.obtain_from_char(chars_train, order - 1, order, gap, reverse)

    sorter = SortWordString()
    sorter.init(words_train)
    words_train.add_preprocessor(sorter)
    words_train.apply_preprocessor()

    # Test side: same conversion, reusing the preprocessor initialised above.
    chars_test = StringCharFeatures(CSVFile(test_fname), DNA)
    words_test = StringWordFeatures(chars_test.get_alphabet())
    words_test.obtain_from_char(chars_test, order - 1, order, gap, reverse)
    words_test.add_preprocessor(sorter)
    words_test.apply_preprocessor()

    metric = ManhattanWordDistance(words_train, words_train)
    train_matrix = metric.get_distance_matrix()

    metric.init(words_train, words_test)
    test_matrix = metric.get_distance_matrix()

    return train_matrix, test_matrix
def features_dense_io_modular():
    """Read ``../data/fm_train_real.dat`` into a ``RealFeatures`` object.

    The file is opened in read mode via ``CSVFile`` with its delimiter set
    to one space character.

    Returns:
        The populated ``RealFeatures`` instance.
    """
    from modshogun import RealFeatures, CSVFile

    real_feats = RealFeatures()

    reader = CSVFile("../data/fm_train_real.dat", "r")
    reader.set_delimiter(" ")
    real_feats.load(reader)

    return real_feats
def metric_lmnn_modular(train_fname=traindat, test_fname=testdat,
                        label_train_fname=label_traindat, k=3):
    """Train LMNN and classify the test features with KNN on its distance.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        label_train_fname: Path given to ``CSVFile`` for the multiclass
            training labels.
        k: Used both as the third ``LMNN`` constructor argument and as the
            first ``KNN`` constructor argument.

    Returns:
        A tuple ``(lmnn, output)`` where ``output`` is the label vector of
        the KNN prediction on the test features, or ``None`` if any of the
        required ``modshogun`` names cannot be imported.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return None

    # Wrap the raw files into Shogun feature/label objects.
    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = MulticlassLabels(CSVFile(label_train_fname))

    # Fit LMNN and take the distance object it provides.
    lmnn = LMNN(train_feats, train_labels, k)
    lmnn.train()
    learned_distance = lmnn.get_distance()

    # Classify the test set with KNN using that distance.
    knn = KNN(k, learned_distance, train_labels)
    knn.train()
    predicted = knn.apply(test_feats)

    return lmnn, predicted.get_labels()
def multiclass_randomforest_modular(train=traindat, test=testdat,
                                    labels=label_traindat, ft=feattypes):
    """Train a RandomForest with majority voting and classify the test set.

    Args:
        train: Path given to ``CSVFile`` for the training features.
        test: Path given to ``CSVFile`` for the test features.
        labels: Path given to ``CSVFile`` for the multiclass training labels.
        ft: Value passed to ``set_feature_types``.

    Returns:
        A tuple ``(rand_forest, output)`` where ``output`` is the label
        vector from ``apply_multiclass`` on the test features, or ``None``
        if the required ``modshogun`` names cannot be imported.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, RandomForest, MajorityVote
    except ImportError:
        print("Could not import Shogun modules")
        return None

    # Wrap the raw files into Shogun feature/label objects.
    train_feats = RealFeatures(CSVFile(train))
    test_feats = RealFeatures(CSVFile(test))
    label_obj = MulticlassLabels(CSVFile(labels))

    # Build and train the forest. The trailing 20 and 1 are the third and
    # fourth constructor arguments (presumably bag count and number of random
    # features -- TODO confirm against the Shogun API).
    forest = RandomForest(train_feats, label_obj, 20, 1)
    forest.set_feature_types(ft)
    forest.set_combination_rule(MajorityVote())
    forest.train()

    # Classify the test data.
    predicted = forest.apply_multiclass(test_feats)
    return forest, predicted.get_labels()
def classifier_svmocas_modular(train_fname=traindat, test_fname=testdat,
                               label_fname=label_traindat, C=0.9,
                               epsilon=1e-5, num_threads=1):
    """Train an SVMOcas classifier without bias and classify the test set.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        label_fname: Path given to ``CSVFile`` for the binary training labels.
        C: First argument of the ``SVMOcas`` constructor.
        epsilon: Value passed to ``set_epsilon``.
        num_threads: Value passed to ``svm.parallel.set_num_threads``.

    Returns:
        A tuple ``(predictions, svm, labels)`` where ``labels`` is
        ``predictions.get_labels()``, or ``None`` when ``SVMOcas`` cannot be
        imported.
    """
    from modshogun import RealFeatures, BinaryLabels
    from modshogun import CSVFile

    try:
        from modshogun import SVMOcas
    except ImportError:
        print("SVMOcas not available")
        return None

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMOcas(C, train_feats, train_labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    # The accessors are still invoked after training; their return values
    # are not used by this function.
    svm.get_bias()
    svm.get_w()

    predictions = svm.apply(test_feats)
    return predictions, svm, predictions.get_labels()
def labels_io_modular():
    """Load regression labels from a space-delimited file.

    Opens ``../data/label_train_regression.dat`` for reading through a
    ``CSVFile`` with a single-space delimiter and loads it into an initially
    empty ``RegressionLabels`` object.

    Returns:
        The ``RegressionLabels`` object after loading.
    """
    from modshogun import RegressionLabels, CSVFile

    regression_labels = RegressionLabels()

    source = CSVFile("../data/label_train_regression.dat", "r")
    source.set_delimiter(" ")

    regression_labels.load(source)
    return regression_labels
def labels_io_modular():
    """Read ``../data/label_train_regression.dat`` into ``RegressionLabels``.

    The file is opened in read mode via ``CSVFile`` with its delimiter set
    to one space character.

    Returns:
        The populated ``RegressionLabels`` instance.
    """
    from modshogun import RegressionLabels, CSVFile

    loaded = RegressionLabels()

    reader = CSVFile("../data/label_train_regression.dat", "r")
    reader.set_delimiter(" ")
    loaded.load(reader)

    return loaded
def classifier_gaussiannaivebayes_modular(train_fname=traindat,
                                          test_fname=testdat,
                                          label_train_fname=label_traindat):
    """Train GaussianNaiveBayes and classify the test features.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        label_train_fname: Path given to ``CSVFile`` for the multiclass
            training labels.

    Returns:
        A tuple ``(gnb, gnb_train, output)``: the classifier, the value
        returned by ``gnb.train()``, and the label vector predicted for the
        test features.
    """
    from modshogun import RealFeatures, MulticlassLabels, GaussianNaiveBayes, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = MulticlassLabels(CSVFile(label_train_fname))

    classifier = GaussianNaiveBayes(train_feats, train_labels)
    train_result = classifier.train()

    predicted = classifier.apply(test_feats)
    return classifier, train_result, predicted.get_labels()
def distance_chisquare_modular(train_fname=traindat, test_fname=testdat):
    """Compute ChiSquareDistance matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.

    Returns:
        A tuple ``(distance, dm_train, dm_test)``: the distance object (left
        initialised on train/test) and the two distance matrices.
    """
    from modshogun import RealFeatures, ChiSquareDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = ChiSquareDistance(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
def kernel_chi2_modular(train_fname=traindat, test_fname=testdat, width=1.4,
                        size_cache=10):
    """Compute Chi2Kernel matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        width: Third argument of the ``Chi2Kernel`` constructor.
        size_cache: Fourth argument of the ``Chi2Kernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    # NormOne is imported here but not used anywhere below.
    from modshogun import RealFeatures, Chi2Kernel, CSVFile, NormOne

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    chi2 = Chi2Kernel(train_feats, train_feats, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    chi2.init(train_feats, test_feats)
    test_matrix = chi2.get_kernel_matrix()

    return train_matrix, test_matrix, chi2
def distance_braycurtis_modular(train_fname=traindat, test_fname=testdat):
    """Compute BrayCurtisDistance matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.

    Returns:
        A tuple ``(distance, dm_train, dm_test)``: the distance object (left
        initialised on train/test) and the two distance matrices.
    """
    from modshogun import RealFeatures, BrayCurtisDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = BrayCurtisDistance(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
def kernel_cauchy_modular(train_fname=traindat, test_fname=testdat, sigma=1.0):
    """Compute CauchyKernel matrices for train/train and train/test.

    The kernel is constructed with an ``EuclideanDistance`` built on the
    training features.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        sigma: Third argument of the ``CauchyKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    euclid = EuclideanDistance(train_feats, train_feats)
    cauchy = CauchyKernel(train_feats, train_feats, sigma, euclid)
    train_matrix = cauchy.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    cauchy.init(train_feats, test_feats)
    test_matrix = cauchy.get_kernel_matrix()

    return train_matrix, test_matrix, cauchy
def kernel_linear_byte_modular(train_fname=traindat, test_fname=testdat):
    """Evaluate a LinearKernel on byte features and return the kernel.

    Kernel matrices are requested for (train, train) and, after
    re-initialisation, for (train, test); neither matrix is returned.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.

    Returns:
        The ``LinearKernel`` object, left initialised on train/test.
    """
    from modshogun import LinearKernel, ByteFeatures, CSVFile

    train_feats = ByteFeatures(CSVFile(train_fname))
    test_feats = ByteFeatures(CSVFile(test_fname))

    linear = LinearKernel(train_feats, train_feats)
    linear.get_kernel_matrix()

    linear.init(train_feats, test_feats)
    linear.get_kernel_matrix()

    return linear
def kernel_auc_modular(train_fname=traindat, label_fname=label_traindat,
                       width=1.7):
    """Set up an AUCKernel over a Gaussian sub-kernel and return it.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        label_fname: Path given to ``CSVFile`` for the binary labels handed
            to ``setup_auc_maximization``.
        width: Third argument of the ``GaussianKernel`` constructor.

    Returns:
        The ``AUCKernel`` object. Its kernel matrix is requested once before
        returning but is not part of the return value.
    """
    from modshogun import GaussianKernel, AUCKernel, RealFeatures
    from modshogun import BinaryLabels, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    gaussian = GaussianKernel(train_feats, train_feats, width)

    # 0 is the first constructor argument (presumably the cache size --
    # TODO confirm against the Shogun API).
    auc = AUCKernel(0, gaussian)
    auc.setup_auc_maximization(BinaryLabels(CSVFile(label_fname)))
    auc.get_kernel_matrix()

    return auc
def distance_minkowski_modular(train_fname=traindat, test_fname=testdat, k=3):
    """Compute MinkowskiMetric matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        k: Third argument of the ``MinkowskiMetric`` constructor.

    Returns:
        A tuple ``(distance, dm_train, dm_test)``: the distance object (left
        initialised on train/test) and the two distance matrices.
    """
    from modshogun import RealFeatures, MinkowskiMetric, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = MinkowskiMetric(train_feats, train_feats, k)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
def distance_manhatten_modular(train_fname, test_fname=testdat):
    """Compute ManhattanMetric matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features
            (required; there is no default).
        test_fname: Path given to ``CSVFile`` for the test features.

    Returns:
        A tuple ``(distance, dm_train, dm_test)``: the distance object (left
        initialised on train/test) and the two distance matrices.
    """
    from modshogun import RealFeatures, ManhattanMetric, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = ManhattanMetric(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
def kernel_exponential_modular(train_fname=traindat, test_fname=testdat,
                               tau_coef=1.0):
    """Compute ExponentialKernel matrices for train/train and train/test.

    The kernel is constructed with an ``EuclideanDistance`` built on the
    training features.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        tau_coef: Third argument of the ``ExponentialKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import RealFeatures, ExponentialKernel, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    euclid = EuclideanDistance(train_feats, train_feats)

    # The trailing 10 is the fifth constructor argument (presumably the
    # cache size -- TODO confirm against the Shogun API).
    exponential = ExponentialKernel(train_feats, train_feats, tau_coef,
                                    euclid, 10)
    train_matrix = exponential.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    exponential.init(train_feats, test_feats)
    test_matrix = exponential.get_kernel_matrix()

    return train_matrix, test_matrix, exponential
def kernel_gaussian_modular(train_fname=traindat, test_fname=testdat,
                            width=1.3):
    """Compute GaussianKernel matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        width: Third argument of the ``GaussianKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import RealFeatures, GaussianKernel, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    gaussian = GaussianKernel(train_feats, train_feats, width)
    train_matrix = gaussian.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    gaussian.init(train_feats, test_feats)
    test_matrix = gaussian.get_kernel_matrix()

    return train_matrix, test_matrix, gaussian
def kernel_rationalquadratic_modular(train_fname=traindat, test_fname=testdat,
                                     shift_coef=1.0):
    """Compute RationalQuadraticKernel matrices for train/train and train/test.

    The kernel is constructed with an ``EuclideanDistance`` built on the
    training features.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        shift_coef: Third argument of the ``RationalQuadraticKernel``
            constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import RealFeatures, RationalQuadraticKernel, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    euclid = EuclideanDistance(train_feats, train_feats)
    rational_quadratic = RationalQuadraticKernel(train_feats, train_feats,
                                                 shift_coef, euclid)
    train_matrix = rational_quadratic.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    rational_quadratic.init(train_feats, test_feats)
    test_matrix = rational_quadratic.get_kernel_matrix()

    return train_matrix, test_matrix, rational_quadratic
def distance_geodesic_modular(train_fname=traindat, test_fname=testdat):
    """Compute GeodesicMetric matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.

    Returns:
        A tuple ``(distance, dm_train, dm_test)``: the distance object (left
        initialised on train/test) and the two distance matrices.
    """
    from modshogun import RealFeatures, GeodesicMetric, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    metric = GeodesicMetric(train_feats, train_feats)
    train_matrix = metric.get_distance_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    metric.init(train_feats, test_feats)
    test_matrix = metric.get_distance_matrix()

    return metric, train_matrix, test_matrix
def kernel_power_modular(train_fname=traindat, test_fname=testdat, degree=2.0):
    """Compute PowerKernel matrices for train/train and train/test.

    The kernel is constructed with an ``EuclideanDistance`` built on the
    training features.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        degree: Third argument of the ``PowerKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import RealFeatures, PowerKernel, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    euclid = EuclideanDistance(train_feats, train_feats)
    power = PowerKernel(train_feats, train_feats, degree, euclid)
    train_matrix = power.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    power.init(train_feats, test_feats)
    test_matrix = power.get_kernel_matrix()

    return train_matrix, test_matrix, power
def distance_normsquared_modular(train_fname=traindat, test_fname=testdat):
    """Compute EuclideanDistance matrices with the square root disabled.

    ``set_disable_sqrt(True)`` is called on the distance before any matrix
    is requested.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.

    Returns:
        A tuple ``(distance, dm_train, dm_test)``: the distance object (left
        initialised on train/test) and the two distance matrices.
    """
    from modshogun import RealFeatures, EuclideanDistance, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    euclid = EuclideanDistance(train_feats, train_feats)
    euclid.set_disable_sqrt(True)
    train_matrix = euclid.get_distance_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    euclid.init(train_feats, test_feats)
    test_matrix = euclid.get_distance_matrix()

    return euclid, train_matrix, test_matrix
def kernel_distance_modular(train_fname=traindat, test_fname=testdat,
                            width=1.7):
    """Compute DistanceKernel matrices for train/train and train/test.

    The kernel wraps a default-constructed ``EuclideanDistance``.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        width: Third argument of the ``DistanceKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the kernel matrix on
        (train, train), the kernel matrix on (train, test), and the kernel
        object (left initialised on train/test).
    """
    from modshogun import RealFeatures, DistanceKernel, EuclideanDistance, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = EuclideanDistance()

    # Build the kernel on (train, train) so km_train really is the training
    # matrix. Constructing it on (train, test) made km_train a duplicate of
    # km_test, unlike every other kernel example in this file.
    kernel = DistanceKernel(feats_train, feats_train, width, distance)
    km_train = kernel.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_anova_modular(train_fname=traindat, test_fname=testdat,
                         cardinality=2, size_cache=10):
    """Compute ANOVAKernel matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        cardinality: Third argument of the ``ANOVAKernel`` constructor.
        size_cache: Fourth argument of the ``ANOVAKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import ANOVAKernel, RealFeatures, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    anova = ANOVAKernel(train_feats, train_feats, cardinality, size_cache)
    train_matrix = anova.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    anova.init(train_feats, test_feats)
    test_matrix = anova.get_kernel_matrix()

    return train_matrix, test_matrix, anova
def kernel_linear_modular(train_fname=traindat, test_fname=testdat, scale=1.2):
    """Compute normalised LinearKernel matrices for train/train and train/test.

    The kernel is default-constructed, given an ``AvgDiagKernelNormalizer``
    built from ``scale``, and only then initialised on the features.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        scale: Argument of the ``AvgDiagKernelNormalizer`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import RealFeatures, LinearKernel, AvgDiagKernelNormalizer, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    linear = LinearKernel()
    linear.set_normalizer(AvgDiagKernelNormalizer(scale))

    linear.init(train_feats, train_feats)
    train_matrix = linear.get_kernel_matrix()

    linear.init(train_feats, test_feats)
    test_matrix = linear.get_kernel_matrix()

    return train_matrix, test_matrix, linear
def kernel_sigmoid_modular(train_fname=traindat, test_fname=testdat,
                           size_cache=10, gamma=1.2, coef0=1.3):
    """Compute SigmoidKernel matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        size_cache: Third argument of the ``SigmoidKernel`` constructor.
        gamma: Fourth argument of the ``SigmoidKernel`` constructor.
        coef0: Fifth argument of the ``SigmoidKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import RealFeatures, SigmoidKernel, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    sigmoid = SigmoidKernel(train_feats, train_feats, size_cache, gamma, coef0)
    train_matrix = sigmoid.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    sigmoid.init(train_feats, test_feats)
    test_matrix = sigmoid.get_kernel_matrix()

    return train_matrix, test_matrix, sigmoid
def features_string_file_modular(directory, fname):
    """Load string features from a directory and then from a single file.

    A ``StringCharFeatures`` object over the ``RAWBYTE`` alphabet is first
    filled via ``load_from_directory`` and afterwards via ``load`` from a
    ``CSVFile`` opened on ``fname``.

    Args:
        directory: Directory passed to ``load_from_directory``.
        fname: Path given to ``CSVFile`` for the second load.

    Returns:
        A tuple ``(features, f)``: the result of ``get_features()`` after
        both loads, and the ``StringCharFeatures`` object itself.
    """
    from modshogun import StringCharFeatures, RAWBYTE
    from modshogun import CSVFile

    # First source: every file found in the directory.
    string_feats = StringCharFeatures(RAWBYTE)
    string_feats.load_from_directory(directory)

    # Second source: a single file read through CSVFile.
    csv_source = CSVFile(fname)
    string_feats.load(csv_source)

    return string_feats.get_features(), string_feats
def converter_multidimensionalscaling_modular(data_fname):
    """Check that a 2-D MultidimensionalScaling embedding keeps distances.

    Euclidean distance matrices are taken on the original features and on
    the embedding, and the norm of their difference is compared with the
    norm of the original matrix.

    Args:
        data_fname: Path given to ``CSVFile`` for the input features.

    Returns:
        ``True`` if the relative difference between the two distance
        matrices is below ``1e-6``, ``False`` otherwise.  When an
        ``ImportError`` is raised anywhere in the body, a message is printed
        and ``None`` is returned implicitly.
    """
    try:
        import numpy
        from modshogun import RealFeatures, MultidimensionalScaling, EuclideanDistance, CSVFile

        original = RealFeatures(CSVFile(data_fname))

        dist_original = EuclideanDistance()
        dist_original.init(original, original)

        mds = MultidimensionalScaling()
        mds.set_target_dim(2)
        mds.set_landmark(False)
        embedded = mds.apply(original)

        dist_embedded = EuclideanDistance()
        dist_embedded.init(embedded, embedded)

        matrix_embedded = dist_embedded.get_distance_matrix()
        matrix_original = dist_original.get_distance_matrix()

        deviation = numpy.linalg.norm(matrix_embedded - matrix_original)
        reference = numpy.linalg.norm(matrix_original)
        return deviation / reference < 1e-6
    except ImportError:
        print('No Eigen3 available')
def multiclass_c45classifiertree_modular(train=traindat, test=testdat,
                                         labels=label_traindat, ft=feattypes):
    """Train a C4.5 tree, prune it on a validation split, classify the test set.

    The training vectors are randomly permuted and split into a validation
    part (roughly the first third) and a training part (the remainder).

    Args:
        train: Path given to ``CSVFile`` for the training features.
        test: Path given to ``CSVFile`` for the test features.
        labels: Path given to ``CSVFile`` for the multiclass training labels.
        ft: Value passed to ``set_feature_types``.

    Returns:
        A tuple ``(c, output, output_certainty)``: the tree, the label
        vector predicted for the test features, and the result of
        ``get_certainty_vector()``.  Returns ``None`` if the required
        modules cannot be imported.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return None

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the ratio
    # 2/3 to 1/3
    subset = int32(random.permutation(feats_train.get_num_vectors()))

    # Floor division keeps the slice bounds integral on Python 3, where "/"
    # yields a float and slicing would raise TypeError. On Python 2 the
    # result is the same as before.
    third = subset.size // 3

    # NOTE(review): positions 0 and `third` of the permutation end up in
    # neither subset; this matches the original slicing and is kept as-is --
    # confirm whether dropping those two samples is intended.
    vsubset = subset[1:third]
    trsubset = subset[1 + third:subset.size]

    # C4.5 Tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)

    c.prune_tree(feats_train, train_labels)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
def kernel_poly_modular(train_fname=traindat, test_fname=testdat, degree=4,
                        inhomogene=False, use_normalization=True):
    """Compute PolyKernel matrices for train/train and train/test.

    Args:
        train_fname: Path given to ``CSVFile`` for the training features.
        test_fname: Path given to ``CSVFile`` for the test features.
        degree: Third argument of the ``PolyKernel`` constructor.
        inhomogene: Fourth argument of the ``PolyKernel`` constructor.
        use_normalization: Fifth argument of the ``PolyKernel`` constructor.

    Returns:
        A tuple ``(km_train, km_test, kernel)``: the two kernel matrices and
        the kernel object (left initialised on train/test).
    """
    from modshogun import RealFeatures, PolyKernel, CSVFile

    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))

    poly = PolyKernel(train_feats, train_feats, degree, inhomogene,
                      use_normalization)
    train_matrix = poly.get_kernel_matrix()

    # Re-initialise on (train, test) before taking the second matrix.
    poly.init(train_feats, test_feats)
    test_matrix = poly.get_kernel_matrix()

    return train_matrix, test_matrix, poly