def transfer_multitask_trace_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Train a two-task MultitaskTraceLogisticRegression and return task-0 outputs.

    The training matrix and the labels are each stacked with themselves via
    ``hstack``; the resulting vectors are split into two tasks at the midpoint.

    Args:
        fm_train: training feature matrix.
        fm_test: accepted for interface compatibility; not used by this function.
        label_train: training labels.

    Returns:
        ``apply_regression().get_labels()`` of the trained machine with
        task 0 selected.
    """
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from shogun import MultitaskTraceLogisticRegression
    except ImportError:
        print("MultitaskTraceLogisticRegression not available")
        exit(0)

    # Use the fm_train argument; the module-level traindat was previously
    # hard-wired here, silently ignoring whatever the caller passed in.
    features = RealFeatures(hstack((fm_train, fm_train)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_group = TaskGroup()
    task_group.append_task(Task(0, n_vectors // 2))
    task_group.append_task(Task(n_vectors // 2, n_vectors))

    mtlr = MultitaskTraceLogisticRegression(0.1, features, labels, task_group)
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.set_max_iter(10)
    mtlr.train()
    mtlr.set_current_task(0)

    return mtlr.apply_regression().get_labels()
def regression_chaidtree(num_train=500, num_test=50, x_range=15, noise_var=0.2, ft=feattypes):
    """Fit a CHAIDTree on noisy ``y = x`` data and regress on an evenly spaced grid.

    Args:
        num_train: number of random training points.
        num_test: number of evenly spaced test points.
        x_range: upper bound of the sampled x interval (lower bound is 0).
        noise_var: multiplier applied to the standard-normal noise added to y.
        ft: feature-type vector handed to ``CHAIDTree``.

    Returns:
        ``(tree, output)`` where ``output`` is the regression result on the
        test grid, or ``None`` when Shogun/numpy cannot be imported.
    """
    try:
        from shogun import RealFeatures, RegressionLabels, CSVFile, CHAIDTree, PT_REGRESSION
        from numpy import random
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    random.seed(1)

    # form training dataset : y=x with noise
    X_train = random.rand(1, num_train) * x_range
    Y_train = X_train + random.randn(num_train) * noise_var

    # form test dataset
    X_test = array([[float(i) / num_test * x_range for i in range(num_test)]])

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(X_train)
    feats_test = RealFeatures(X_test)
    train_labels = RegressionLabels(Y_train[0])

    # CHAID Tree formation. The `ft` argument is used here; the module-level
    # `feattypes` was previously hard-wired, ignoring the caller's value.
    c = CHAIDTree(2, ft, 50)
    c.set_labels(train_labels)
    c.train(feats_train)

    # Regress on test data
    output = c.apply_regression(feats_test).get_labels()

    return c, output
def classifier_featureblock_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Train FeatureBlockLogisticRegression with two index blocks and classify.

    Features and labels are each stacked with themselves via ``hstack``; the
    feature indices are split into two blocks at the midpoint.

    Args:
        fm_train: training feature matrix.
        fm_test: accepted for interface compatibility; not used by this function.
        label_train: training labels.

    Returns:
        ``apply().get_labels()`` of the trained machine.
    """
    from shogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
    try:
        from shogun import FeatureBlockLogisticRegression
    except ImportError:
        print("FeatureBlockLogisticRegression not available")
        exit(0)

    # Use the fm_train argument; the module-level traindat was previously
    # hard-wired here, ignoring the caller's data.
    features = RealFeatures(hstack((fm_train, fm_train)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_features = features.get_num_features()
    block_group = IndexBlockGroup()
    block_group.add_block(IndexBlock(0, n_features // 2))
    block_group.add_block(IndexBlock(n_features // 2, n_features))

    mtlr = FeatureBlockLogisticRegression(0.1, features, labels, block_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()

    return mtlr.apply().get_labels()
def RunLinearRidgeRegressionShogun():
    """Time Shogun linear ridge regression on the enclosing benchmark's dataset.

    Reads ``self`` and ``options`` from the enclosing scope. The required
    ``alpha`` option is popped and passed to ``LRR`` as its first argument.

    Returns:
        ``[elapsed_time, model]`` on success, ``[-1]`` if training or
        prediction raises.

    Raises:
        Exception: when ``alpha`` is missing or unknown options remain.
    """
    totalTimer = Timer()

    # When a second dataset file is given it is loaded and later passed to
    # apply_regression.
    Log.Info("Loading dataset", self.verbose)
    hasSecondFile = len(self.dataset) >= 2
    if hasSecondFile:
        testSet = np.genfromtxt(self.dataset[1], delimiter=',')

    # Split the training data into inputs and responses.
    X, y = SplitTrainData(self.dataset)

    if "alpha" not in options:
        Log.Fatal("Required parameter 'alpha' not specified!")
        raise Exception("missing parameter")
    tau = float(options.pop("alpha"))

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            # Perform linear ridge regression.
            model = LRR(tau, RealFeatures(X.T), RegressionLabels(y))
            model.train()

            if len(self.dataset) >= 2:
                model.apply_regression(RealFeatures(testSet.T))
    except Exception:
        return [-1]

    return [totalTimer.ElapsedTime(), model]
def transfer_multitask_clustered_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Train a three-task MultitaskClusteredLogisticRegression; return task-0 outputs.

    The feature matrix is stacked with its element-wise ``sin`` and ``cos``,
    the labels are repeated three times, and the vectors are split into
    three tasks at the one-third and two-thirds marks.

    Args:
        fm_train: training feature matrix.
        fm_test: accepted for interface compatibility; not used by this function.
        label_train: training labels.

    Returns:
        ``apply_regression().get_labels()`` with task 0 selected.
    """
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
    try:
        from shogun import MultitaskClusteredLogisticRegression
    except ImportError:
        print("MultitaskClusteredLogisticRegression not available")
        exit()

    # Use the fm_train argument; the module-level traindat was previously
    # hard-wired here, ignoring the caller's data.
    features = RealFeatures(hstack((fm_train, sin(fm_train), cos(fm_train))))
    labels = BinaryLabels(hstack((label_train, label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_group = TaskGroup()
    task_group.append_task(Task(0, n_vectors // 3))
    task_group.append_task(Task(n_vectors // 3, 2 * n_vectors // 3))
    task_group.append_task(Task(2 * n_vectors // 3, n_vectors))

    mtlr = MultitaskClusteredLogisticRegression(1.0, 100.0, features, labels, task_group, 2)
    mtlr.set_tolerance(1e-3)  # use 1e-3 tolerance
    mtlr.set_max_iter(100)
    mtlr.train()
    mtlr.set_current_task(0)

    return mtlr.apply_regression().get_labels()
def __init__(self, method_param, run_param):
    """Load the datasets and read the decision-tree options from ``method_param``.

    Args:
        method_param: mapping with a ``"datasets"`` entry and the optional
            keys ``"pruning"`` and ``"k"``.
        run_param: not used by this initializer.
    """
    self.info = "SHOGUN_DTC (" + str(method_param) + ")"

    # Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_feat = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    # test_feat is only created when a second dataset is present.
    if len(self.data) >= 2:
        self.test_feat = RealFeatures(self.data[1].T)

    # Flag for Cross Validation Pruning
    self.cv_prune = bool(method_param["pruning"]) if "pruning" in method_param else False

    # Number of folds; the given value is coerced to int.
    self.num_folds = int(method_param["k"]) if "k" in method_param else 2
def classifier_multiclassocas(num_vec=10, num_class=3, distance=15, width=2.1, C=1, epsilon=1e-5, seed=1):
    """Train MulticlassOCAS on synthetic data and classify a synthetic test set.

    Args:
        num_vec: number of vectors generated for each of train and test.
        num_class: number of classes (also the feature dimension).
        distance: offset added to the feature row matching each vector's class.
        width: unused.
        C: first argument passed to ``MulticlassOCAS``.
        epsilon: unused.
        seed: seed for both numpy's and Shogun's random generators.

    Returns:
        ``(predicted_labels, classifier)``, or ``None`` when MulticlassOCAS
        is unavailable.
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import Math_init_random
    try:
        from shogun import MulticlassOCAS
    except ImportError:
        print("MulticlassOCAS not available")
        return

    # reproducible results
    random.seed(seed)
    Math_init_random(seed)

    def _cyclic_labels():
        # Labels cycle through 0..num_class-1 in vector order.
        return array([mod(x, num_class) for x in range(num_vec)], dtype="float64")

    # generate some training data where each class pair is linearly separable
    label_train = _cyclic_labels()
    label_test = _cyclic_labels()
    fm_train = array(random.randn(num_class, num_vec))
    fm_test = array(random.randn(num_class, num_vec))

    for col, (cls_train, cls_test) in enumerate(zip(label_train, label_test)):
        fm_train[int(cls_train), col] += distance
        fm_test[int(cls_test), col] += distance

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    classifier = MulticlassOCAS(C, feats_train, MulticlassLabels(label_train))
    classifier.train()

    out = classifier.apply(feats_test).get_labels()
    return out, classifier
def features_dense_io():
    """Load ``../data/fm_train_real.dat`` (space-delimited) into RealFeatures.

    Returns:
        The populated ``RealFeatures`` object.
    """
    from shogun import RealFeatures, CSVFile

    source = CSVFile("../data/fm_train_real.dat", "r")
    source.set_delimiter(" ")

    loaded = RealFeatures()
    loaded.load(source)
    return loaded
def RunNBCShogun():
    """Time training and prediction of the Shogun Naive Bayes benchmark.

    Reads ``self`` and ``options`` from the enclosing scope and stores the
    model and predictions on ``self``.

    Returns:
        ``[time, predictions]`` when more than one dataset file is
        configured, ``[time]`` otherwise, or ``[-1]`` on any exception.
    """
    totalTimer = Timer()
    self.predictions = None
    Log.Info("Loading dataset", self.verbose)

    try:
        # Load train and test dataset.
        trainData = np.genfromtxt(self.dataset[0], delimiter=',')
        testData = np.genfromtxt(self.dataset[1], delimiter=',')

        # Labels are taken from the last column of the training set.
        lastColumn = trainData.shape[1] - 1
        labels = MulticlassLabels(trainData[:, lastColumn])

        with totalTimer:
            # Transform into features.
            trainFeat = RealFeatures(trainData[:, :-1].T)
            testFeat = RealFeatures(testData.T)

            # Create and train the classifier.
            self.model = self.BuildModel(trainFeat, labels, options)

            # Run Naive Bayes Classifier on the test dataset.
            predicted = self.model.apply_multiclass(testFeat)
            self.predictions = predicted.get_labels()
    except Exception:
        return [-1]

    time = totalTimer.ElapsedTime()
    result = [time]
    if len(self.dataset) > 1:
        result.append(self.predictions)
    return result
def kernel_combined(fm_train_real=traindat, fm_test_real=testdat, fm_train_dna=traindna, fm_test_dna=testdna):
    """Build a CombinedKernel from one Gaussian and two string sub-kernels.

    Args:
        fm_train_real: real-valued training matrix.
        fm_test_real: real-valued test matrix.
        fm_train_dna: DNA training strings.
        fm_test_dna: DNA test strings.

    Returns:
        ``(km_train, km_test, kernel)``: the train/train and train/test
        kernel matrices and the combined kernel object.
    """
    from shogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
    from shogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    def _register(train_part, test_part, sub):
        # Attach one feature pair and its sub-kernel to the combined objects.
        feats_train.append_feature_obj(train_part)
        feats_test.append_feature_obj(test_part)
        kernel.append_kernel(sub)

    # Gaussian kernel on the real-valued features.
    _register(RealFeatures(fm_train_real),
              RealFeatures(fm_test_real),
              GaussianKernel(10, 1.1))

    # Fixed-degree string kernel on the DNA features.
    degree = 3
    _register(StringCharFeatures(fm_train_dna, DNA),
              StringCharFeatures(fm_test_dna, DNA),
              FixedDegreeStringKernel(10, degree))

    # Local-alignment string kernel on fresh wrappers of the DNA features.
    _register(StringCharFeatures(fm_train_dna, DNA),
              StringCharFeatures(fm_test_dna, DNA),
              LocalAlignmentStringKernel(10))

    kernel.init(feats_train, feats_train)
    km_train = kernel.get_kernel_matrix()
    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
def metric_lmnn(train_fname=traindat, test_fname=testdat, label_train_fname=label_traindat, k=3):
    """Learn an LMNN distance and classify the test data with KNN using it.

    Args:
        train_fname: CSV file with the training features.
        test_fname: CSV file with the test features.
        label_train_fname: CSV file with the training labels.
        k: passed to both ``LMNN`` and ``KNN``.

    Returns:
        ``(lmnn, output)`` where ``output`` holds the KNN labels for the test
        data, or ``None`` when the Shogun imports fail.
    """
    try:
        from shogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = MulticlassLabels(CSVFile(label_train_fname))

    # LMNN
    lmnn = LMNN(feats_train, labels, k)
    lmnn.train()

    # perform classification with KNN on the learned distance
    knn = KNN(k, lmnn.get_distance(), labels)
    knn.train()

    return lmnn, knn.apply(feats_test).get_labels()
def __init__(self, method_param, run_param):
    """Load the datasets and read the random-forest options from ``method_param``.

    Args:
        method_param: mapping with a ``"datasets"`` entry and the optional
            keys ``"num-trees"``, ``"dimensions"`` and ``"solver"``.
        run_param: not used by this initializer.
    """
    self.info = "SHOGUN_RANDOMFOREST (" + str(method_param) + ")"

    # Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_feat = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    # test_feat is only created when a second dataset is present.
    if len(self.data) >= 2:
        self.test_feat = RealFeatures(self.data[1].T)

    # Optional settings, each falling back to its default when absent.
    self.num_trees = int(method_param["num-trees"]) if "num-trees" in method_param else 50
    self.form = int(method_param["dimensions"]) if "dimensions" in method_param else 1
    self.solver = str(method_param["solver"]) if "solver" in method_param else "auto"
def fit_and_predict(load_test_data, train_data, test_feature_matrics, train_label, test_label_OR_test_data):
    """Train an AveragedPerceptron, then write a submission or print metrics.

    Args:
        load_test_data: when truthy, write predictions to ``submission.csv``;
            otherwise evaluate against the supplied test labels.
        train_data: training feature matrix.
        test_feature_matrics: test feature matrix.
        train_label: binary training labels.
        test_label_OR_test_data: in submission mode, an object supporting
            ``del obj['question_text']``, ``insert`` and ``to_csv``
            (presumably a pandas DataFrame - confirm with the caller);
            otherwise the test labels. In submission mode it is modified
            in place.

    Returns:
        The predicted label vector in submission mode, ``None`` otherwise.
    """
    features_train = RealFeatures(train_data)
    features_test = RealFeatures(test_feature_matrics)
    labels_train = BinaryLabels(train_label)

    learn_rate = 1.0
    max_iter = 1000
    perceptron = AveragedPerceptron(features_train, labels_train)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)
    perceptron.train()

    perceptron.set_features(features_test)
    labels_predict = perceptron.apply()

    if load_test_data:
        # The original referenced the undefined names `prediction80` and
        # `prediction`, so this branch always raised NameError.
        # NOTE(review): the raw label vector is written as-is; confirm the
        # value encoding the submission format expects.
        prediction = labels_predict.get_labels()
        del test_label_OR_test_data['question_text']
        test_label_OR_test_data.insert(1, 'prediction', prediction)
        test_label_OR_test_data.to_csv('submission.csv', index=False)
        return prediction

    labels_test = BinaryLabels(test_label_OR_test_data)
    accuracy = AccuracyMeasure().evaluate(labels_predict, labels_test)
    f1_score = F1Measure().evaluate(labels_predict, labels_test)
    print('#accuracy is: ', accuracy)
    print('#F1 score is: ', f1_score)
def mkl_multiclass_1(fm_train_real, fm_test_real, label_train_multiclass, C):
    """Train MKLMulticlass over 21 Gaussian sub-kernels and classify the test data.

    One Gaussian sub-kernel is added for every exponent ``i`` in
    ``-10..10``, constructed with ``pow(2, i + 1)``. ``num_threads`` and
    ``mkl_epsilon`` are read from the enclosing module scope.

    Args:
        fm_train_real: training feature matrix.
        fm_test_real: test feature matrix.
        label_train_multiclass: multiclass training labels.
        C: first argument passed to ``MKLMulticlass``.

    Returns:
        The predicted labels for the test data.
    """
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    for exponent in range(-10, 11):
        train_part = RealFeatures(fm_train_real)
        test_part = RealFeatures(fm_test_real)
        sub = GaussianKernel(pow(2, exponent + 1))
        feats_train.append_feature_obj(train_part)
        feats_test.append_feature_obj(test_part)
        kernel.append_kernel(sub)

    kernel.init(feats_train, feats_train)

    mkl = MKLMulticlass(C, kernel, MulticlassLabels(label_train_multiclass))
    mkl.set_epsilon(1e-2)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(1)
    mkl.train()

    # Re-initialise against the test features before applying.
    kernel.init(feats_train, feats_test)
    return mkl.apply().get_labels()
def classifier_svmocas(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, C=0.9, epsilon=1e-5, num_threads=1):
    """Train SVMOcas (bias disabled) on CSV data and classify the test set.

    Args:
        train_fname: CSV file with the training features.
        test_fname: CSV file with the test features.
        label_fname: CSV file with the binary training labels.
        C: first argument passed to ``SVMOcas``.
        epsilon: value passed to ``set_epsilon``.
        num_threads: value passed to ``parallel.set_num_threads``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``, or ``None`` when
        SVMOcas is unavailable.
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import CSVFile
    try:
        from shogun import SVMOcas
    except ImportError:
        print("SVMOcas not available")
        return

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMOcas(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    # The accessors are still called as in the original; results are unused.
    bias = svm.get_bias()
    w = svm.get_w()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
def predict(self, image):
    """Predict the face.

    The image is flattened to a column, passed through ``self.pca`` and
    compared (Euclidean distance) with every entry of ``self._projections``.

    Args:
        image: 2-D array; reshaped to ``(rows * cols, 1)`` and cast to float64.

    Returns:
        The entry of ``self._labels`` whose projection is closest, or ``-1``
        when no projection is closer than the initial ``1e100`` bound.
    """
    # image as row
    imageAsRow = np.asarray(
        image.reshape(image.shape[0] * image.shape[1], 1), np.float64)

    # project in the subspace
    p = self.pca.apply_to_feature_vector(
        RealFeatures(imageAsRow).get_feature_vector(0))

    # The probe features do not depend on the loop variable, so they are
    # built once instead of once per stored sample.
    test = RealFeatures(np.asmatrix(p, np.float64).T)

    # min value to find the face
    minDist = 1e100
    # class
    minClass = -1

    # search which face is the best match
    for sampleIdx, stored in enumerate(self._projections):
        projection = RealFeatures(np.asmatrix(stored, np.float64).T)
        dist = EuclideanDistance(test, projection).distance(0, 0)
        if dist < minDist:
            minDist = dist
            minClass = self._labels[sampleIdx]

    return minClass
def features_dense_io():
    """Read the space-delimited file ``../data/fm_train_real.dat`` into RealFeatures.

    Returns:
        The ``RealFeatures`` instance after loading.
    """
    from shogun import RealFeatures, CSVFile

    dense = RealFeatures()

    reader = CSVFile("../data/fm_train_real.dat", "r")
    reader.set_delimiter(" ")
    dense.load(reader)

    return dense
def metric_lmnn_statistics(k=3, fname_features='../../data/fm_train_multiclass_digits.dat.gz', fname_labels='../../data/label_train_multiclass_digits.dat'):
    """Train LMNN on the digits data and plot the objective per iteration.

    Args:
        k: third argument passed to ``LMNN``.
        fname_features: compressed feature file, read with
            ``load_compressed_features`` and transposed.
        fname_labels: CSV file with the multiclass labels.

    Returns:
        ``None``. Shows a matplotlib figure; returns early if the imports fail.
    """
    try:
        from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        # print() calls replace the Python 2 print statements, which are a
        # syntax error on Python 3; single-argument calls work on both.
        print('Error importing shogun or other required modules. Please, verify their installation.')
        return

    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    assert(features.get_num_vectors() == labels.get_num_labels())

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    print('Training LMNN, this will take about two minutes...')
    lmnn.train()
    print('Training done!')

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel('Iterations')
    pyplot.ylabel('LMNN objective')
    pyplot.title('LMNN objective during training for the multiclass digits data set')

    pyplot.show()
def classifier_featureblock_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Fit FeatureBlockLogisticRegression over two feature blocks and classify.

    The feature matrix and the labels are each concatenated with themselves
    via ``hstack``; feature indices are split into two halves, one
    ``IndexBlock`` each.

    Args:
        fm_train: training feature matrix.
        fm_test: accepted for interface compatibility; not used by this function.
        label_train: training labels.

    Returns:
        ``apply().get_labels()`` of the trained machine.
    """
    from shogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
    try:
        from shogun import FeatureBlockLogisticRegression
    except ImportError:
        print("FeatureBlockLogisticRegression not available")
        exit(0)

    # Honour the fm_train argument; the global traindat was previously used
    # regardless of what the caller supplied.
    features = RealFeatures(hstack((fm_train, fm_train)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_features = features.get_num_features()
    half = n_features // 2
    block_group = IndexBlockGroup()
    block_group.add_block(IndexBlock(0, half))
    block_group.add_block(IndexBlock(half, n_features))

    mtlr = FeatureBlockLogisticRegression(0.1, features, labels, block_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    out = mtlr.apply().get_labels()

    return out
def transfer_multitask_leastsquares_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Train a two-task MultitaskLeastSquaresRegression; return task-0 outputs.

    The vectors are split into two tasks at the midpoint.

    Args:
        fm_train: training feature matrix.
        fm_test: accepted for interface compatibility; not used by this function.
        label_train: regression targets.

    Returns:
        ``apply_regression().get_labels()`` with task 0 selected.
    """
    from shogun import RegressionLabels, RealFeatures, Task, TaskGroup
    try:
        from shogun import MultitaskLeastSquaresRegression
    except ImportError:
        print("MultitaskLeastSquaresRegression not available")
        exit(0)

    # Use the fm_train argument; the module-level traindat was previously
    # hard-wired here, ignoring the caller's data.
    features = RealFeatures(fm_train)
    labels = RegressionLabels(label_train)

    n_vectors = features.get_num_vectors()
    task_group = TaskGroup()
    task_group.append_task(Task(0, n_vectors // 2))
    task_group.append_task(Task(n_vectors // 2, n_vectors))

    mtlsr = MultitaskLeastSquaresRegression(0.1, features, labels, task_group)
    mtlsr.set_regularization(1)  # use regularization ratio
    mtlsr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlsr.train()
    mtlsr.set_current_task(0)

    return mtlsr.apply_regression().get_labels()
def __init__(self, method_param, run_param):
    """Load the datasets and read the LDA options from ``method_param``.

    Args:
        method_param: mapping with a ``"datasets"`` entry and the optional
            keys ``"tolerance"``, ``"store-covar"`` and ``"solver"``.
        run_param: not used by this initializer.
    """
    self.info = "SHOGUN_LDA (" + str(method_param) + ")"

    # Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_feat = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    # test_feat is only created when a second dataset is present.
    if len(self.data) >= 2:
        self.test_feat = RealFeatures(self.data[1].T)

    # Optional settings, each falling back to its default when absent.
    self.tolerance = float(method_param["tolerance"]) if "tolerance" in method_param else 1e-4
    self.store_cov = bool(method_param["store-covar"]) if "store-covar" in method_param else False
    self.solver = str(method_param["solver"]) if "solver" in method_param else "auto"
def regression_svrlight(fm_train=traindat, fm_test=testdat, label_train=label_traindat,
                        width=1.2, C=1, epsilon=1e-5, tube_epsilon=1e-2, num_threads=3):
    """Train SVRLight with a Gaussian kernel and regress on the test data.

    Args:
        fm_train: training feature matrix.
        fm_test: test feature matrix.
        label_train: regression targets.
        width: third argument passed to ``GaussianKernel``.
        C: first argument passed to ``SVRLight``.
        epsilon: second argument passed to ``SVRLight``.
        tube_epsilon: value passed to ``set_tube_epsilon``.
        num_threads: value passed to ``parallel.set_num_threads``.

    Returns:
        ``(out, kernel)`` with the predicted values and the kernel, or
        ``None`` when SVRLight is unavailable.
    """
    from shogun import RegressionLabels, RealFeatures
    from shogun import GaussianKernel
    try:
        from shogun import SVRLight
    except ImportError:
        print('No support for SVRLight available.')
        return

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    kernel = GaussianKernel(feats_train, feats_train, width)
    targets = RegressionLabels(label_train)

    svr = SVRLight(C, epsilon, kernel, targets)
    svr.set_tube_epsilon(tube_epsilon)
    svr.parallel.set_num_threads(num_threads)
    svr.train()

    # Re-initialise against the test features before applying.
    kernel.init(feats_train, feats_test)
    return svr.apply().get_labels(), kernel
def transfer_multitask_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Train a two-task MultitaskLogisticRegression and return task-0 labels.

    The training matrix and the labels are each stacked with themselves via
    ``hstack``; the resulting vectors are split into two tasks at the midpoint.

    Args:
        fm_train: training feature matrix.
        fm_test: accepted for interface compatibility; not used by this function.
        label_train: training labels.

    Returns:
        ``apply().get_labels()`` with task 0 selected.
    """
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from shogun import MultitaskLogisticRegression
    except ImportError:
        print("MultitaskLogisticRegression not available")
        exit()

    # Use the fm_train argument; the module-level traindat was previously
    # hard-wired here, ignoring the caller's data.
    features = RealFeatures(hstack((fm_train, fm_train)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_group = TaskGroup()
    task_group.append_task(Task(0, n_vectors // 2))
    task_group.append_task(Task(n_vectors // 2, n_vectors))

    mtlr = MultitaskLogisticRegression(0.1, features, labels, task_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)

    return mtlr.apply().get_labels()
def __init__(self, method_param, run_param):
    """Load the datasets and read the KNN options from ``method_param``.

    Args:
        method_param: mapping with a ``"datasets"`` entry and the optional
            keys ``"k"``, ``"distance"``, ``"solver"`` and ``"degree"``.
        run_param: not used by this initializer.
    """
    self.info = "SHOGUN_KNN (" + str(method_param) + ")"

    # Assemble run model parameter.
    self.data = load_dataset(method_param["datasets"], ["csv"])
    self.data_split = split_dataset(self.data[0])

    self.train_feat = RealFeatures(self.data_split[0].T)

    # Encode the labels into {0,1,2,3,......,num_classes-1}
    self.train_labels, self.label_map = label_encoder(self.data_split[1])
    self.train_labels = MulticlassLabels(self.train_labels)

    # test_feat is only created when a second dataset is present.
    if len(self.data) >= 2:
        self.test_feat = RealFeatures(self.data[1].T)

    # Optional settings, each falling back to its default when absent.
    self.k = int(method_param["k"]) if "k" in method_param else 3
    self.distance = str(method_param["distance"]) if "distance" in method_param else "Euclidean"
    self.solver = str(method_param["solver"]) if "solver" in method_param else "Brute"
    # The default is the int 3; a supplied value is coerced to float.
    self.degree = float(method_param["degree"]) if "degree" in method_param else 3
def features_dense(A=matrixA, B=matrixB, C=matrixC):
    """Wrap three matrices as Real, LongInt and Byte features and read them back.

    Feature vector 0 of the real features is overwritten with
    ``[1, 4, 0, 0, 0, 9]`` before the matrices are fetched and compared with
    the inputs.

    Args:
        A: matrix wrapped as ``RealFeatures``.
        B: matrix wrapped as ``LongIntFeatures``.
        C: matrix wrapped as ``ByteFeatures``.

    Returns:
        ``(a_out, b_out, c_out, a, b, c)``: the three fetched matrices
        followed by the three feature objects.
    """
    a = RealFeatures(A)
    b = LongIntFeatures(B)
    c = ByteFeatures(C)

    # set the first feature vector
    replacement = array([1, 4, 0, 0, 0, 9], dtype=float64)
    a.set_feature_vector(replacement, 0)

    # get matrices
    a_out = a.get_feature_matrix()
    b_out = b.get_feature_matrix()
    c_out = c.get_feature_matrix()

    # each fetched matrix must equal its input
    assert all(a_out == A)
    assert all(b_out == B)
    assert all(c_out == C)

    return a_out, b_out, c_out, a, b, c
def features_dense(A=matrixA, B=matrixB, C=matrixC):
    """Create Real/LongInt/Byte dense features and check their matrices.

    After wrapping, feature vector 0 of the real features is replaced with
    ``[1, 4, 0, 0, 0, 9]``; all three matrices are then fetched and asserted
    equal to the inputs.

    Args:
        A: input for ``RealFeatures``.
        B: input for ``LongIntFeatures``.
        C: input for ``ByteFeatures``.

    Returns:
        ``(a_out, b_out, c_out, a, b, c)``.
    """
    a = RealFeatures(A)
    b = LongIntFeatures(B)
    c = ByteFeatures(C)

    # overwrite the first feature vector of the real features
    a.set_feature_vector(array([1, 4, 0, 0, 0, 9], dtype=float64), 0)

    # fetch the matrices in the same order they were wrapped
    a_out = a.get_feature_matrix()
    b_out = b.get_feature_matrix()
    c_out = c.get_feature_matrix()

    for fetched, expected in ((a_out, A), (b_out, B), (c_out, C)):
        assert all(fetched == expected)

    return a_out, b_out, c_out, a, b, c
def RunRandomForestShogun():
    """Time training and prediction of the Shogun random-forest benchmark.

    Reads ``self`` and ``options`` from the enclosing scope; stores
    ``numTrees``, ``form``, the model and the predictions on ``self``.

    Returns:
        ``[time, predictions]`` on success, ``[-1]`` if building or applying
        the model raises.

    Raises:
        Exception: when ``num_trees`` is missing or unknown options remain.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    if "num_trees" not in options:
        Log.Fatal("Required parameter 'num_trees' not specified!")
        raise Exception("missing parameter")
    self.numTrees = int(options.pop("num_trees"))

    self.form = 1
    if "dimensions" in options:
        self.form = int(options.pop("dimensions"))

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run the Random Forest Classifier on the test dataset.
            predicted = self.model.apply_multiclass(testData)
            self.predictions = predicted.get_labels()
    except Exception:
        return [-1]

    time = totalTimer.ElapsedTime()
    return [time, self.predictions]
def transfer_multitask_clustered_logistic_regression(fm_train=traindat, fm_test=testdat, label_train=label_traindat):
    """Fit MultitaskClusteredLogisticRegression on three tasks; return task-0 outputs.

    The features are ``hstack``-ed with their element-wise ``sin`` and
    ``cos``, the labels are tripled, and three ``Task`` ranges split the
    vectors at the one-third and two-thirds marks.

    Args:
        fm_train: training feature matrix.
        fm_test: accepted for interface compatibility; not used by this function.
        label_train: training labels.

    Returns:
        ``apply_regression().get_labels()`` with task 0 selected.
    """
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
    try:
        from shogun import MultitaskClusteredLogisticRegression
    except ImportError:
        print("MultitaskClusteredLogisticRegression not available")
        exit()

    # Honour the fm_train argument; the global traindat was previously used
    # regardless of what the caller supplied.
    features = RealFeatures(hstack((fm_train, sin(fm_train), cos(fm_train))))
    labels = BinaryLabels(hstack((label_train, label_train, label_train)))

    n_vectors = features.get_num_vectors()
    first_cut = n_vectors // 3
    second_cut = 2 * n_vectors // 3
    task_group = TaskGroup()
    for start, stop in ((0, first_cut), (first_cut, second_cut), (second_cut, n_vectors)):
        task_group.append_task(Task(start, stop))

    mtlr = MultitaskClusteredLogisticRegression(1.0, 100.0, features, labels, task_group, 2)
    mtlr.set_tolerance(1e-3)  # use 1e-3 tolerance
    mtlr.set_max_iter(100)
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply_regression().get_labels()

    return out
def RunDTCShogun():
    """Time training and prediction of the Shogun decision-tree benchmark.

    Reads ``self`` and ``options`` from the enclosing scope; stores the
    model and the predictions on ``self``.

    Returns:
        ``[time, predictions]`` when more than one dataset file is
        configured, ``[time]`` otherwise, or ``[-1]`` if building or applying
        the model raises.

    Raises:
        Exception: when unknown options are present.
    """
    totalTimer = Timer()

    Log.Info("Loading dataset", self.verbose)
    trainData, labels = SplitTrainData(self.dataset)
    trainData = RealFeatures(trainData.T)
    labels = MulticlassLabels(labels)
    testData = RealFeatures(LoadDataset(self.dataset[1]).T)

    if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

    try:
        with totalTimer:
            self.model = self.BuildModel(trainData, labels, options)
            # Run the CARTree Classifier on the test dataset.
            predicted = self.model.apply_multiclass(testData)
            self.predictions = predicted.get_labels()
    except Exception:
        return [-1]

    time = totalTimer.ElapsedTime()
    result = [time]
    if len(self.dataset) > 1:
        result.append(self.predictions)
    return result
def classifier_gpbtsvm(train_fname=traindat, test_fname=testdat, label_fname=label_traindat, width=2.1, C=1, epsilon=1e-5):
    """Train GPBTSVM with a Gaussian kernel on CSV data and classify the test set.

    Args:
        train_fname: CSV file with the training features.
        test_fname: CSV file with the test features.
        label_fname: CSV file with the binary training labels.
        width: third argument passed to ``GaussianKernel``.
        C: first argument passed to ``GPBTSVM``.
        epsilon: value passed to ``set_epsilon``.

    Returns:
        ``(predictions, svm, predictions.get_labels())``. The process exits
        with status 0 when GPBTSVM is unavailable.
    """
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import CSVFile
    try:
        from shogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        exit(0)

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    predicted_labels = predictions.get_labels()
    return predictions, svm, predicted_labels
def RunLDAShogun():
    """Time training of Shogun's MCLDA and, when a test file exists, predict on it.

    Reads ``self`` and ``options`` from the enclosing scope. Training labels
    are remapped in place to ``0..num_classes-1`` before training, and
    predictions are mapped back to the original label values.

    Returns:
        The elapsed time recorded by the timer, or ``-1`` on any exception
        (including unknown options).
    """
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the test file.
    try:
        hasTestSet = len(self.dataset) > 1
        if hasTestSet:
            testSet = LoadDataset(self.dataset[1])

        # Split the training data into inputs and responses.
        trainSet, trainLabels = SplitTrainData(self.dataset)

        # If the labels are not in {0,1,2,...,num_classes-1}, map them to this
        # set and store the mapping. Shogun's MCLDA class requires the labels
        # to be in {0,1,2,...,num_classes-1}.
        distinctLabels = list(set(trainLabels))
        mapping = {}
        reverseMapping = {}
        for idx, label in enumerate(distinctLabels):
            mapping[label] = idx
            reverseMapping[idx] = label

        for i in range(len(trainLabels)):
            trainLabels[i] = mapping[trainLabels[i]]

        trainFeat = RealFeatures(trainSet.T)
        trainLabels = MulticlassLabels(trainLabels)

        # Gather optional parameters.
        if "tolerance" in options:
            self.tolerance = float(options.pop("tolerance"))

        if "store" in options:
            self.store = bool(options.pop("store"))

        if len(options) > 0:
            Log.Fatal("Unknown parameters: " + str(options))
            raise Exception("unknown parameters")

        with totalTimer:
            self.model = MCLDA(trainFeat, trainLabels, self.tolerance, self.store)
            self.model.train()

        # NOTE(review): whether prediction originally ran inside the timed
        # block cannot be recovered from the collapsed source - confirm.
        # Predict only when a test set was loaded. The original guard was
        # `len(self.dataset) > 0`, which referenced the undefined `testSet`
        # for single-file runs and turned them into a -1 result.
        if hasTestSet:
            self.predictions = self.model.apply_multiclass(
                RealFeatures(testSet.T))
            self.predictions = self.predictions.get_labels()
            # reverse map the predicted labels to actual labels
            for i in range(len(self.predictions)):
                self.predictions[i] = reverseMapping[self.predictions[i]]

    except Exception as e:
        Log.Info("Exception: " + str(e))
        return -1

    time = totalTimer.ElapsedTime()
    return time
def RunAllKnnShogun():
    """Time Shogun k-nearest-neighbours on the enclosing benchmark's dataset.

    Reads ``self`` and ``options`` from the enclosing scope. With exactly
    two dataset files the second is used as the query set; otherwise the
    reference set is queried against itself.

    Returns:
        The elapsed time recorded by the timer, or ``-1`` when ``k`` is
        missing or invalid, or when any exception is raised.
    """
    totalTimer = Timer()

    # Load input dataset.
    # If the dataset contains two files then the second file is the query
    # file.
    try:
        Log.Info("Loading dataset", self.verbose)
        hasQuerySet = len(self.dataset) == 2
        if hasQuerySet:
            referenceData = np.genfromtxt(self.dataset[0], delimiter=',')
            queryData = np.genfromtxt(self.dataset[1], delimiter=',')
            # The original read `queryFeat.T` before queryFeat was assigned,
            # which raised and made every two-file run return -1.
            queryFeat = RealFeatures(queryData.T)
        else:
            referenceData = np.genfromtxt(self.dataset, delimiter=',')

        # Labels are taken from the last column of the dataset.
        labels = MulticlassLabels(
            referenceData[:, (referenceData.shape[1] - 1)])
        referenceData = referenceData[:, :-1]

        with totalTimer:
            # Get all the parameters.
            if "k" in options:
                k = int(options.pop("k"))
                if k < 1 or k > referenceData.shape[0]:
                    # k is an int here; the original called k.group(1),
                    # which raised instead of logging the message.
                    Log.Fatal("Invalid k: " + str(k) + "; must be greater than 0"
                              + " and less or equal than "
                              + str(referenceData.shape[0]))
                    return -1
            else:
                Log.Fatal(
                    "Required option: Number of furthest neighbors to find.")
                return -1

            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            referenceFeat = RealFeatures(referenceData.T)
            distance = EuclideanDistance(referenceFeat, referenceFeat)

            # Perform All K-Nearest-Neighbors.
            model = SKNN(k, distance, labels)
            model.train()

            if hasQuerySet:
                out = model.apply(queryFeat).get_labels()
            else:
                out = model.apply(referenceFeat).get_labels()
    except Exception:
        return -1

    return totalTimer.ElapsedTime()
def modelselection_grid_search_kernel(num_subsets, num_vectors, dim_vectors):
    """Grid-search LibSVM parameters with cross-validation on random data.

    Args:
        num_subsets: number of subsets for the stratified splitting.
        num_vectors: number of random vectors to generate.
        dim_vectors: dimension of each vector.

    Returns:
        ``(classifier, result, mean)``: the LibSVM with the best parameter
        combination applied, the final cross-validation result, and its mean.
    """
    # init seed for reproducibility
    Math.init_random(1)
    random.seed(1)

    # create some (non-sense) data
    data_matrix = random.rand(dim_vectors, num_vectors)

    features = RealFeatures()
    features.set_feature_matrix(data_matrix)

    # create labels, two classes: +1 on even indices, -1 on odd ones
    labels = BinaryLabels(num_vectors)
    for idx in range(num_vectors):
        if idx % 2 == 0:
            labels.set_label(idx, 1)
        else:
            labels.set_label(idx, -1)

    # create svm
    classifier = LibSVM()

    # splitting strategy and accuracy evaluation
    splitting_strategy = StratifiedCrossValidationSplitting(labels, num_subsets)
    evaluation_criterion = ContingencyTableEvaluation(ACCURACY)

    # cross validation class for evaluation in model selection
    cross = CrossValidation(classifier, features, labels,
                            splitting_strategy, evaluation_criterion)
    cross.set_num_runs(1)

    # model parameter selection
    param_tree = create_param_tree()
    grid_search = GridSearchModelSelection(cross, param_tree)

    print_state = False
    best_combination = grid_search.select_model(print_state)
    best_combination.apply_to_machine(classifier)

    # larger number of runs to have less variance
    cross.set_num_runs(10)
    result = cross.evaluate()
    casted = CrossValidationResult.obtain_from_generic(result)

    return classifier, result, casted.get_mean()
def features_dense_zero_copy(in_data=data):
    """Exercise RealFeatures' buffer interface on a Fortran-ordered copy of the data.

    Args:
        in_data: array-like converted to a float64, Fortran-ordered array.

    Returns:
        The converted numpy array, or ``None`` when numpy is older than 1.5.
    """
    import re

    feats = None

    # Compare the version numerically. The original string comparison
    # (`numpy.__version__ >= '1.5'`) is lexicographic, so versions such as
    # '1.21.0' were wrongly rejected.
    parsed = re.match(r'(\d+)\.(\d+)', numpy.__version__)
    version_ok = (parsed is not None
                  and (int(parsed.group(1)), int(parsed.group(2))) >= (1, 5))

    if version_ok:
        feats = numpy.array(in_data, dtype=float64, order='F')

        a = RealFeatures()
        a.frombuffer(feats, False)

        b = numpy.array(a, copy=False)
        c = numpy.array(a, copy=True)

        d = RealFeatures()
        d.frombuffer(a, False)

        e = RealFeatures()
        e.frombuffer(a, True)

        # zero the first column of `a`
        a[:, 0] = 0
    else:
        print("numpy version >= 1.5 is needed")

    return feats
def distance_chisquare(train_fname=traindat, test_fname=testdat):
    """Compute chi-square distance matrices for train/train and train/test.

    Args:
        train_fname: CSV file with the training features.
        test_fname: CSV file with the test features.

    Returns:
        ``(distance, dm_train, dm_test)``.
    """
    from shogun import RealFeatures, ChiSquareDistance, CSVFile

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    distance = ChiSquareDistance(train_features, train_features)
    dm_train = distance.get_distance_matrix()

    # Re-initialise against the test features for the second matrix.
    distance.init(train_features, test_features)
    dm_test = distance.get_distance_matrix()

    return distance, dm_train, dm_test
def modelselection_grid_search_kernel(num_subsets, num_vectors, dim_vectors):
    """Run grid-search model selection for LibSVM using cross-validation.

    Random features are generated with fixed seeds; labels alternate between
    +1 (even index) and -1 (odd index).

    Args:
        num_subsets: number of subsets for the stratified splitting.
        num_vectors: number of random vectors to generate.
        dim_vectors: dimension of each vector.

    Returns:
        ``(classifier, result, mean)`` from the final 10-run cross-validation.
    """
    # fixed seeds for reproducibility
    Math.init_random(1)
    random.seed(1)

    # random (meaningless) data matrix
    matrix = random.rand(dim_vectors, num_vectors)

    features = RealFeatures()
    features.set_feature_matrix(matrix)

    # two-class labels
    labels = BinaryLabels(num_vectors)
    for position in range(num_vectors):
        sign = 1 if position % 2 == 0 else -1
        labels.set_label(position, sign)

    classifier = LibSVM()

    # cross validation used as the evaluation inside model selection
    cross = CrossValidation(
        classifier,
        features,
        labels,
        StratifiedCrossValidationSplitting(labels, num_subsets),
        ContingencyTableEvaluation(ACCURACY),
    )
    cross.set_num_runs(1)

    # model parameter selection
    grid_search = GridSearchModelSelection(cross, create_param_tree())
    best_combination = grid_search.select_model(False)
    best_combination.apply_to_machine(classifier)

    # larger number of runs to have less variance
    cross.set_num_runs(10)
    result = cross.evaluate()
    mean = CrossValidationResult.obtain_from_generic(result).get_mean()

    return classifier, result, mean
def kernel_sigmoid(train_fname=traindat, test_fname=testdat, size_cache=10, gamma=1.2, coef0=1.3):
    """Compute sigmoid kernel matrices for train/train and train/test.

    Args:
        train_fname: CSV file with the training features.
        test_fname: CSV file with the test features.
        size_cache: third argument passed to ``SigmoidKernel``.
        gamma: fourth argument passed to ``SigmoidKernel``.
        coef0: fifth argument passed to ``SigmoidKernel``.

    Returns:
        ``(km_train, km_test, kernel)``.
    """
    from shogun import RealFeatures, SigmoidKernel, CSVFile

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    kernel = SigmoidKernel(train_features, train_features, size_cache, gamma, coef0)
    km_train = kernel.get_kernel_matrix()

    # Re-initialise against the test features for the second matrix.
    kernel.init(train_features, test_features)
    km_test = kernel.get_kernel_matrix()

    return km_train, km_test, kernel
def kernel_cauchy (train_fname=traindat,test_fname=testdat, sigma=1.0):
    """Compute Cauchy-kernel matrices for features loaded from CSV files.

    The kernel is built on top of a Euclidean distance over the training
    features.  Returns (km_train, km_test, kernel): the kernel matrix on
    (train, train), the kernel matrix after re-initialising on (train, test),
    and the kernel object itself.
    """
    from shogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    # distance object handed to the kernel constructor
    euclidean = EuclideanDistance(train_features, train_features)

    # train-vs-train matrix
    cauchy = CauchyKernel(train_features, train_features, sigma, euclidean)
    train_matrix = cauchy.get_kernel_matrix()

    # re-initialise the same kernel object for train-vs-test
    cauchy.init(train_features, test_features)
    test_matrix = cauchy.get_kernel_matrix()

    return train_matrix, test_matrix, cauchy
def kernel_gaussian(train_fname=traindat, test_fname=testdat, width=1.3):
    """Compute Gaussian-kernel matrices for features loaded from CSV files.

    Returns (km_train, km_test, kernel): the kernel matrix on (train, train),
    the kernel matrix after re-initialising on (train, test), and the kernel
    object itself.
    """
    from shogun import RealFeatures, GaussianKernel, CSVFile

    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))

    # train-vs-train matrix
    gaussian = GaussianKernel(train_features, train_features, width)
    train_matrix = gaussian.get_kernel_matrix()

    # re-initialise the same kernel object for train-vs-test
    gaussian.init(train_features, test_features)
    test_matrix = gaussian.get_kernel_matrix()

    return train_matrix, test_matrix, gaussian
def features_dense_real (A=matrix):
    """Wrap A in RealFeatures, overwrite vector 0 and return the feature matrix.

    After setting feature vector 0 to a fixed six-element vector the stored
    matrix is read back and asserted to equal A element-wise, so the fixed
    vector is presumably identical to A's first vector for the default input
    (the definition of the default is outside this block).
    """
    dense_feats = RealFeatures(A)

    # statistics such as dense_feats.get_num_vectors(),
    # dense_feats.get_num_features() or dense_feats.get_feature_vector(0)
    # can be inspected here

    # replace the first feature vector
    replacement = array([1,4,0,0,0,9], dtype=float64)
    dense_feats.set_feature_vector(replacement, 0)

    # read the whole matrix back and check it against the input
    a_out = dense_feats.get_feature_matrix()
    assert all(a_out == A)

    return a_out
def preprocessor_prunevarsubmean (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
    """Apply a PruneVarSubMean preprocessor, then compute Chi2 kernel matrices.

    The preprocessor is initialised on the training features and then added
    to and applied on both the training and the test features.  Returns
    (km_train, km_test, kernel): the Chi2 kernel matrix on (train, train), the
    matrix after re-initialising on (train, test), and the kernel object.
    """
    from shogun import Chi2Kernel
    from shogun import RealFeatures
    from shogun import PruneVarSubMean

    train_features = RealFeatures(fm_train_real)
    test_features = RealFeatures(fm_test_real)

    # initialise on the training data only
    pruner = PruneVarSubMean()
    pruner.init(train_features)

    # the same preprocessor instance is applied to both feature sets
    train_features.add_preprocessor(pruner)
    train_features.apply_preprocessor()
    test_features.add_preprocessor(pruner)
    test_features.apply_preprocessor()

    chi2 = Chi2Kernel(train_features, train_features, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    chi2.init(train_features, test_features)
    test_matrix = chi2.get_kernel_matrix()

    return train_matrix, test_matrix, chi2
def preprocessor_randomfouriergausspreproc (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
    """Apply a RandomFourierGaussPreproc, then compute Chi2 kernel matrices.

    The preprocessor is initialised on the training features and then added
    to and applied on both the training and the test features.  Returns
    (km_train, km_test, kernel): the Chi2 kernel matrix on (train, train), the
    matrix after re-initialising on (train, test), and the kernel object.
    """
    from shogun import Chi2Kernel
    from shogun import RealFeatures
    from shogun import RandomFourierGaussPreproc

    train_features = RealFeatures(fm_train_real)
    test_features = RealFeatures(fm_test_real)

    # initialise on the training data only
    fourier = RandomFourierGaussPreproc()
    fourier.init(train_features)

    # the same preprocessor instance is applied to both feature sets
    train_features.add_preprocessor(fourier)
    train_features.apply_preprocessor()
    test_features.add_preprocessor(fourier)
    test_features.apply_preprocessor()

    chi2 = Chi2Kernel(train_features, train_features, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    chi2.init(train_features, test_features)
    test_matrix = chi2.get_kernel_matrix()

    return train_matrix, test_matrix, chi2
def preprocessor_normone (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
    """Apply a NormOne preprocessor, then compute Chi2 kernel matrices.

    The preprocessor is initialised on the training features and then added
    to and applied on both the training and the test features.  Returns
    (km_train, km_test, kernel): the Chi2 kernel matrix on (train, train), the
    matrix after re-initialising on (train, test), and the kernel object.
    """
    from shogun import Chi2Kernel
    from shogun import RealFeatures
    from shogun import NormOne

    train_features = RealFeatures(fm_train_real)
    test_features = RealFeatures(fm_test_real)

    # initialise on the training data only
    normalizer = NormOne()
    normalizer.init(train_features)

    # the same preprocessor instance is applied to both feature sets
    train_features.add_preprocessor(normalizer)
    train_features.apply_preprocessor()
    test_features.add_preprocessor(normalizer)
    test_features.apply_preprocessor()

    chi2 = Chi2Kernel(train_features, train_features, width, size_cache)
    train_matrix = chi2.get_kernel_matrix()

    chi2.init(train_features, test_features)
    test_matrix = chi2.get_kernel_matrix()

    return train_matrix, test_matrix, chi2
def stochasticgbmachine(train=traindat,train_labels=label_traindat,ft=feat_types):
    """Train a stochastic gradient-boosting machine of CART stumps and apply it.

    Features and regression labels are read from CSV files.  A random
    permutation of the label indices is cut at 90%: the first part is used
    for training, the remainder for applying the trained machine.

    Returns (machine, output) where output is the result of
    apply_regression on the held-out part, or None when the Shogun imports
    are unavailable.
    """
    try:
        from shogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss
    except ImportError:
        print("Could not import Shogun modules")
        return

    # wrap the CSV data into Shogun objects
    feats = RealFeatures(CSVFile(train))
    labels = RegressionLabels(CSVFile(train_labels))

    # shuffle the indices, then split 90% / 10%
    shuffled = np.random.permutation(labels.get_num_labels())
    split_at = int(labels.get_num_labels()*0.9)
    train_idx = np.int32(shuffled[0:split_at])
    test_idx = np.int32(shuffled[split_at:len(shuffled)])

    # weak learner: a CART tree limited to depth 1
    cart = CARTree()
    cart.set_feature_types(ft)
    cart.set_max_depth(1)

    squared_loss = SquaredLoss()
    machine = StochasticGBMachine(cart, squared_loss, 500, 0.01, 0.6)

    # train on the first part of the permutation
    feats.add_subset(train_idx)
    labels.add_subset(train_idx)
    machine.set_labels(labels)
    machine.train(feats)
    feats.remove_subset()
    labels.remove_subset()

    # apply on the remaining part
    feats.add_subset(test_idx)
    labels.add_subset(test_idx)
    output = machine.apply_regression(feats)
    feats.remove_subset()
    labels.remove_subset()

    return machine, output
def hsic_graphical():
    """Run an HSIC independence test on generated data and plot the outcome.

    Data is drawn with DataGenerator.generate_sym_mix_gauss, Gaussian kernel
    widths are derived from the median pairwise distance on a random subset
    of at most 200 points, and three sets of statistics are collected: the
    alternative distribution (fresh data per sample), the permutation null
    distribution and a gamma-approximated null distribution.  The results are
    drawn into a 2x2 grid of subplots: the data, the alternative distribution,
    the sampled null distribution and the gamma null distribution, each of
    the latter three with its test threshold.

    The function returns nothing; its effect is the printed kernel widths and
    the figure.  It relies on module-level names defined outside this block
    (pi, array, int32, zeros, floor, random, gamma, np, the plotting functions
    and the Shogun classes) - presumably numpy / pylab / shogun imports.
    """
    # parameters, change to get different results
    m = 250
    difference = 3

    # setting the angle lower makes a harder test
    angle = pi/30

    # number of samples taken from null and alternative distribution
    num_null_samples = 500

    # use data generator class to produce example data
    data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)

    # create shogun feature representation
    features_x = RealFeatures(array([data[0]]))
    features_y = RealFeatures(array([data[1]]))

    # compute median data distance in order to use for Gaussian kernel width
    # 0.5*median_distance normally (factor two in Gaussian kernel)
    # However, shogun's kernel width is different to the usual parametrization
    # Therefore 0.5*2*median_distance^2
    # Use a subset of data for that, only 200 elements. Median is stable
    subset = int32(array([x for x in range(features_x.get_num_vectors())]))
    subset = random.permutation(subset)
    subset = subset[0:200]

    features_x.add_subset(subset)
    dist = EuclideanDistance(features_x, features_x)
    distances = dist.get_distance_matrix()
    features_x.remove_subset()
    median_distance = np.median(distances)
    sigma_x = median_distance**2

    features_y.add_subset(subset)
    dist = EuclideanDistance(features_y, features_y)
    distances = dist.get_distance_matrix()
    features_y.remove_subset()
    median_distance = np.median(distances)
    sigma_y = median_distance**2

    # %-formatting inside print(...) yields the same text on Python 2 and 3
    print("median distance for Gaussian kernel on x: %s" % (sigma_x,))
    print("median distance for Gaussian kernel on y: %s" % (sigma_y,))
    kernel_x = GaussianKernel(10, sigma_x)
    kernel_y = GaussianKernel(10, sigma_y)

    # create hsic instance. Note that this is a convenience constructor which
    # copies feature data. features_x and features_y are not the ones used in
    # hsic. This is only for user-friendliness. Usually, it is ok to do this.
    # Below, the alternative distribution is sampled, which means
    # that new feature objects have to be created in each iteration (slow)
    # However, normally, the alternative distribution is not sampled
    hsic = HSIC(kernel_x, kernel_y, features_x, features_y)

    # sample alternative distribution
    alt_samples = zeros(num_null_samples)
    for i in range(len(alt_samples)):
        data = DataGenerator.generate_sym_mix_gauss(m, difference, angle)
        features_x.set_feature_matrix(array([data[0]]))
        features_y.set_feature_matrix(array([data[1]]))

        # re-create hsic instance every time since feature objects are copied
        # due to usage of the convenience constructor
        hsic = HSIC(kernel_x, kernel_y, features_x, features_y)
        alt_samples[i] = hsic.compute_statistic()

    # sample from null distribution
    # permutation, biased statistic
    hsic.set_null_approximation_method(PERMUTATION)
    hsic.set_num_null_samples(num_null_samples)
    null_samples_boot = hsic.sample_null()

    # fit gamma distribution, biased statistic
    hsic.set_null_approximation_method(HSIC_GAMMA)
    gamma_params = hsic.fit_null_gamma()
    # sample gamma with parameters
    null_samples_gamma = array([gamma(gamma_params[0], gamma_params[1])
                                for _ in range(num_null_samples)])

    # plot
    figure()

    # plot data x and y
    subplot(2, 2, 1)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=4))  # reduce number of y-ticks
    grid(True)
    plot(data[0], data[1], 'o')
    # raw string keeps the backslash of \pi without an invalid-escape warning
    title(r'Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m))
    xlabel('$x$')
    ylabel('$y$')

    # compute threshold for test level
    alpha = 0.05
    null_samples_boot.sort()
    null_samples_gamma.sort()
    # floor() yields a float; array indices must be integers
    thresh_boot = null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
    thresh_gamma = null_samples_gamma[int(floor(len(null_samples_gamma)*(1-alpha)))]

    # NOTE(review): these count null samples BELOW the threshold, and the
    # gamma line compares against thresh_boot rather than thresh_gamma; a
    # Type I error would normally be the fraction ABOVE the threshold.
    # Left unchanged pending confirmation of the intended quantity.
    type_one_error_boot = sum(null_samples_boot<thresh_boot)/float(num_null_samples)
    type_one_error_gamma = sum(null_samples_gamma<thresh_boot)/float(num_null_samples)

    # NOTE(review): hist(..., normed=True) below is kept as in the original;
    # newer matplotlib releases spell this keyword density - confirm the
    # targeted matplotlib version before changing it.

    # plot alternative distribution with threshold
    subplot(2, 2, 2)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(alt_samples, 20, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    type_two_error = sum(alt_samples<thresh_boot)/float(num_null_samples)
    title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

    # compute range for all null distribution histograms
    hist_range = [min([min(null_samples_boot), min(null_samples_gamma)]),
                  max([max(null_samples_boot), max(null_samples_gamma)])]

    # plot null distribution with threshold
    subplot(2, 2, 3)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_boot, 20, range=hist_range, normed=True)
    axvline(thresh_boot, 0, 1, linewidth=2, color='red')
    title('Sampled Null Dist.\n' + 'Type I error is ' + str(type_one_error_boot))

    # plot null distribution gamma
    subplot(2, 2, 4)
    gca().xaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of x-ticks
    gca().yaxis.set_major_locator(MaxNLocator(nbins=3))  # reduce number of y-ticks
    grid(True)
    hist(null_samples_gamma, 20, range=hist_range, normed=True)
    axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
    title('Null Dist. Gamma\nType I error is ' + str(type_one_error_gamma))
    grid(True)

    # pull plots a bit apart
    subplots_adjust(hspace=0.5)
    subplots_adjust(wspace=0.5)
def serialization_complex_example (num=5, dist=1, dim=10, C=2.0, width=10):
    """Train a GMNPSVM, serialize it in four formats and reload each copy.

    Four blocks of random data, shifted by 0, dist, 2*dist and 3*dist, form a
    four-class problem.  The trained machine is saved to temporary HDF5,
    ASCII, JSON and XML files (in that order); each file is then loaded into
    a fresh GMNPSVM, which is trained again and asked for the bias of its
    first sub-machine.  Every save/load status is passed to check_status.

    Returns (svm, new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml)
    where new_svm is the machine restored from the XML file (the last one
    loaded) and bias_ref is the bias of the originally trained machine.
    """
    import os
    from numpy import concatenate, zeros, ones
    from numpy.random import randn, seed
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GMNPSVM
    from shogun import GaussianKernel
    from shogun import SerializableHdf5File,SerializableAsciiFile, \
        SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
    from shogun import NormOne, LogPlusOne
    from tempfile import NamedTemporaryFile

    seed(17)

    # the four randn draws happen in this order so the seeded data is stable
    class_0 = randn(dim, num)
    class_1 = randn(dim, num) + dist
    class_2 = randn(dim, num) + 2*dist
    class_3 = randn(dim, num) + 3*dist
    data = concatenate((class_0, class_1, class_2, class_3), axis=1)
    lab = concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

    feats = RealFeatures(data)
    # feats.io.set_loglevel(MSG_DEBUG) and feats.io.enable_file_and_line()
    # can be enabled here for debugging
    kernel = GaussianKernel(feats, feats, width)
    labels = MulticlassLabels(lab)

    svm = GMNPSVM(C, kernel, labels)

    feats.add_preprocessor(NormOne())
    feats.add_preprocessor(LogPlusOne())
    feats.set_preprocessed(1)
    svm.train(feats)

    bias_ref = svm.get_svm(0).get_bias()

    # (tag, file class) per serialization format, in processing order
    formats = (
        ('h5', SerializableHdf5File),
        ('asc', SerializableAsciiFile),
        ('json', SerializableJsonFile),
        ('xml', SerializableXmlFile),
    )

    # save: the temporary files are kept referenced until the function returns
    tmp_files = {}
    for tag, file_class in formats:
        tmp_files[tag] = NamedTemporaryFile(suffix=tag)
        fstream = file_class(tmp_files[tag].name, "w")
        status = svm.save_serializable(fstream)
        check_status(status, tag)

    # load: restore each copy, retrain it and record its first bias
    biases = []
    for tag, file_class in formats:
        fstream = file_class(tmp_files[tag].name, "r")
        new_svm = GMNPSVM()
        status = new_svm.load_serializable(fstream)
        check_status(status, tag)
        new_svm.train()
        biases.append(new_svm.get_svm(0).get_bias())

    bias_h5, bias_asc, bias_json, bias_xml = biases

    return svm,new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features

    Depending on kname the examples are wrapped as:
      * 'gauss', 'linear', 'poly': RealFeatures over numpy.array(examples)
      * 'wd', 'localalign', 'localimprove': StringCharFeatures over the DNA or
        PROTEIN alphabet, selected by seq_source
      * 'spec', 'cumspec': StringUlongFeatures obtained from such char
        features using kparam['degree'], with the preprocessor applied
      * 'spec2', 'cumspec2': a dict with keys 'combined' (CombinedFeatures),
        'f0' and 'f1' (StringWordFeatures for the two sequences of each
        example pair), each with the preprocessor applied

    In train_mode a new sorting preprocessor is created and initialised on
    the features; otherwise the preproc passed in is used.
    NOTE(review): the original indentation of this block was lost; it is
    assumed that add_preprocessor/apply_preprocessor run in both modes (the
    preproc argument would otherwise be unused) - confirm.

    Returns a tuple (feats, preproc).

    Raises ValueError for an unknown kname.  An invalid seq_source writes a
    message to stderr and exits the process with status -1.
    """
    if kname in ('gauss', 'linear', 'poly'):
        feats = RealFeatures(numpy.array(examples))

    elif kname in ('wd', 'localalign', 'localimprove'):
        feats = _char_features_from_sequences(examples, seq_source, nuc_con)

    elif kname in ('spec', 'cumspec'):
        char_feats = _char_features_from_sequences(examples, seq_source, nuc_con)

        wf = StringUlongFeatures(char_feats.get_alphabet())
        wf.obtain_from_char(char_feats, kparam['degree']-1, kparam['degree'], 0, kname == 'cumspec')
        del char_feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        # the return value is deliberately not asserted here (the original
        # check was commented out for this kernel family)
        wf.apply_preprocessor()

        feats = wf

    elif kname in ('spec2', 'cumspec2'):
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        cumulative = kname == 'cumspec2'

        (ex0, ex1) = zip(*examples)

        # 'f0' is processed before 'f1'; the preprocessor created for 'f1'
        # is the one returned in train_mode
        for key, sequences in (('f0', ex0), ('f1', ex1)):
            wf, preproc = _sorted_word_features(list(sequences), kparam, cumulative,
                                                train_mode, preproc)
            feats['combined'].append_feature_obj(wf)
            feats[key] = wf

    else:
        # previously this branch only printed and then failed on the unbound
        # 'feats' at the return statement; fail with a clear error instead
        raise ValueError('Unknown kernel %s' % kname)

    return (feats, preproc)


def _char_features_from_sequences(examples, seq_source, nuc_con):
    # Build StringCharFeatures for 'dna' or 'protein' input; exit(-1) on any other source.
    if seq_source == 'dna':
        return StringCharFeatures(non_atcg_convert(examples, nuc_con), DNA)
    if seq_source == 'protein':
        return StringCharFeatures(non_aminoacid_converter(examples, nuc_con), PROTEIN)
    sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
    sys.exit(-1)


def _sorted_word_features(sequences, kparam, cumulative, train_mode, preproc):
    # Turn DNA sequences into preprocessed StringWordFeatures; returns (features, preproc).
    char_feats = StringCharFeatures(sequences, DNA)
    wf = StringWordFeatures(char_feats.get_alphabet())
    wf.obtain_from_char(char_feats, kparam['degree']-1, kparam['degree'], 0, cumulative)
    del char_feats

    if train_mode:
        preproc = SortWordString()
        preproc.init(wf)
    wf.add_preprocessor(preproc)
    ret = wf.apply_preprocessor()
    assert ret

    return wf, preproc