def transfer_multitask_trace_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
	from numpy import hstack
	from shogun import BinaryLabels, RealFeatures, Task, TaskGroup
	try:
		from shogun import MultitaskTraceLogisticRegression
	except ImportError:
		print("MultitaskTraceLogisticRegression not available")
		exit(0)

	features = RealFeatures(hstack((fm_train,fm_train)))
	labels = BinaryLabels(hstack((label_train,label_train)))

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//2)
	task_two = Task(n_vectors//2,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)

	mtlr = MultitaskTraceLogisticRegression(0.1,features,labels,task_group)
	mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlr.set_max_iter(10)
	mtlr.train()
	mtlr.set_current_task(0)
	out = mtlr.apply_regression().get_labels()

	return out
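# A minimal usage sketch for the example above. It assumes 'traindat',
# 'testdat' and 'label_traindat' are module-level arrays loaded elsewhere in
# the original script; synthetic stand-ins with Shogun's column-per-sample
# layout are used here.
from numpy import random
traindat = random.randn(3, 20)                       # 3 features x 20 samples
testdat = random.randn(3, 20)
label_traindat = 2.0 * (random.randn(20) > 0) - 1.0  # labels in {-1, +1}
out = transfer_multitask_trace_logistic_regression(traindat, testdat, label_traindat)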
def regression_chaidtree(num_train=500,num_test=50,x_range=15,noise_var=0.2,ft=feattypes):
	try:
		from shogun import RealFeatures, RegressionLabels, CSVFile, CHAIDTree, PT_REGRESSION
		from numpy import array, random
	except ImportError:
		print("Could not import Shogun and/or numpy modules")
		return

	random.seed(1)

	# form training dataset : y=x with noise
	X_train=random.rand(1,num_train)*x_range
	Y_train=X_train+random.randn(num_train)*noise_var

	# form test dataset
	X_test=array([[float(i)/num_test*x_range for i in range(num_test)]])

	# wrap features and labels into Shogun objects
	feats_train=RealFeatures(X_train)
	feats_test=RealFeatures(X_test)
	train_labels=RegressionLabels(Y_train[0])

	# CHAID Tree formation
	c=CHAIDTree(2,ft,50)
	c.set_labels(train_labels)
	c.train(feats_train)

	# Regress on test data
	output=c.apply_regression(feats_test).get_labels()

	return c,output
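# A hedged usage sketch: CHAIDTree takes a per-dimension feature-type vector;
# 2 marks a continuous attribute in Shogun's CHAID convention, matching the
# continuous x generated above (the int32 dtype is an assumption about what
# the wrapper expects).
from numpy import array, int32
feattypes = array([2], dtype=int32)
tree, predictions = regression_chaidtree(num_train=500, num_test=50, ft=feattypes)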
def classifier_featureblock_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):

	from numpy import hstack
	from shogun import BinaryLabels, RealFeatures, IndexBlock, IndexBlockGroup
	try:
		from shogun import FeatureBlockLogisticRegression
	except ImportError:
		print("FeatureBlockLogisticRegression not available")
		exit(0)

	features = RealFeatures(hstack((fm_train,fm_train)))
	labels = BinaryLabels(hstack((label_train,label_train)))

	n_features = features.get_num_features()
	block_one = IndexBlock(0,n_features//2)
	block_two = IndexBlock(n_features//2,n_features)
	block_group = IndexBlockGroup()
	block_group.add_block(block_one)
	block_group.add_block(block_two)

	mtlr = FeatureBlockLogisticRegression(0.1,features,labels,block_group)
	mtlr.set_regularization(1) # use regularization ratio
	mtlr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlr.train()
	out = mtlr.apply().get_labels()

	return out
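# A hedged usage sketch with synthetic stand-ins for the module-level data
# (features are column-per-sample, labels binary in {-1, +1}):
from numpy import random
traindat = random.randn(4, 30)
label_traindat = 2.0 * (random.randn(30) > 0) - 1.0
out = classifier_featureblock_logistic_regression(traindat, traindat, label_traindat)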
    def RunLinearRidgeRegressionShogun():
      totalTimer = Timer()

      # Load input dataset.
      # If the dataset contains two files then the second file is the responses
      # file.
      Log.Info("Loading dataset", self.verbose)
      if len(self.dataset) >= 2:
        testSet = np.genfromtxt(self.dataset[1], delimiter=',')

      # Use the last row of the training set as the responses.
      X, y = SplitTrainData(self.dataset)
      if "alpha" in options:
        tau = float(options.pop("alpha"))
      else:
        Log.Fatal("Required parameter 'alpha' not specified!")
        raise Exception("missing parameter")

      if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

      try:
        with totalTimer:
          # Perform linear ridge regression.
          model = LRR(tau, RealFeatures(X.T), RegressionLabels(y))
          model.train()

          if len(self.dataset) >= 2:
            model.apply_regression(RealFeatures(testSet.T))

      except Exception as e:
        return [-1]

      return [totalTimer.ElapsedTime(), model]
def transfer_multitask_clustered_logistic_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
	from numpy import hstack, sin, cos
	from shogun import BinaryLabels, RealFeatures, Task, TaskGroup, MSG_DEBUG
	try:
		from shogun import MultitaskClusteredLogisticRegression
	except ImportError:
		print("MultitaskClusteredLogisticRegression not available")
		exit()

	features = RealFeatures(hstack((fm_train,sin(fm_train),cos(fm_train))))
	labels = BinaryLabels(hstack((label_train,label_train,label_train)))

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//3)
	task_two = Task(n_vectors//3,2*n_vectors//3)
	task_three = Task(2*n_vectors//3,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)
	task_group.append_task(task_three)

	mtlr = MultitaskClusteredLogisticRegression(1.0,100.0,features,labels,task_group,2)
	#mtlr.io.set_loglevel(MSG_DEBUG)
	mtlr.set_tolerance(1e-3) # use 1e-3 tolerance
	mtlr.set_max_iter(100)
	mtlr.train()
	mtlr.set_current_task(0)
	#print mtlr.get_w()
	out = mtlr.apply_regression().get_labels()

	return out
    def __init__(self, method_param, run_param):
        self.info = "SHOGUN_DTC (" + str(method_param) + ")"

        # Assemble run model parameter.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        self.train_feat = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        self.train_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(self.train_labels)

        if len(self.data) >= 2:
            self.test_feat = RealFeatures(self.data[1].T)

        # Flag for Cross Validation Pruning
        self.cv_prune = False
        if "pruning" in method_param:
            self.cv_prune = bool(method_param["pruning"])

        self.num_folds = 2
        if "k" in method_param:
            # Making sure that the value is of the right type
            self.num_folds = int(method_param["k"])
def classifier_multiclassocas (num_vec=10,num_class=3,distance=15,width=2.1,C=1,epsilon=1e-5,seed=1):
	from numpy import array, mod, random
	from shogun import RealFeatures, MulticlassLabels
	from shogun import Math_init_random
	try:
		from shogun import MulticlassOCAS
	except ImportError:
		print("MulticlassOCAS not available")
		return

	# reproducible results
	random.seed(seed)
	Math_init_random(seed)

	# generate some training data where each class pair is linearly separable
	label_train=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
	label_test=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
	fm_train=array(random.randn(num_class,num_vec))
	fm_test=array(random.randn(num_class,num_vec))
	for i in range(len(label_train)):
		fm_train[int(label_train[i]),i]+=distance
		fm_test[int(label_test[i]),i]+=distance

	feats_train=RealFeatures(fm_train)
	feats_test=RealFeatures(fm_test)

	labels=MulticlassLabels(label_train)

	classifier = MulticlassOCAS(C,feats_train,labels)
	classifier.train()

	out = classifier.apply(feats_test).get_labels()
	#print label_test
	#print out
	return out,classifier
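# A brief usage sketch; the function generates its own separable synthetic
# data internally, so only the problem size needs to be chosen. It returns
# None when MulticlassOCAS is unavailable, hence the guard.
result = classifier_multiclassocas(num_vec=30, num_class=3)
if result is not None:
	out, classifier = result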
def features_dense_io():
    from shogun import RealFeatures, CSVFile
    feats = RealFeatures()
    f = CSVFile("../data/fm_train_real.dat", "r")
    f.set_delimiter(" ")
    feats.load(f)
    return feats
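# A brief usage sketch; it assumes the shipped data file
# ../data/fm_train_real.dat (one space-delimited row per feature dimension)
# exists relative to the working directory.
feats = features_dense_io()
print(feats.get_num_features(), feats.get_num_vectors())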
        def RunNBCShogun():
            totalTimer = Timer()
            self.predictions = None
            Log.Info("Loading dataset", self.verbose)
            try:
                # Load train and test dataset.
                trainData = np.genfromtxt(self.dataset[0], delimiter=',')
                testData = np.genfromtxt(self.dataset[1], delimiter=',')

                # Labels are the last row of the training set.
                labels = MulticlassLabels(trainData[:,
                                                    (trainData.shape[1] - 1)])

                with totalTimer:
                    # Transform into features.
                    trainFeat = RealFeatures(trainData[:, :-1].T)
                    testFeat = RealFeatures(testData.T)

                    # Create and train the classifier.
                    self.model = self.BuildModel(trainFeat, labels, options)

                    # Run Naive Bayes Classifier on the test dataset.
                    self.predictions = self.model.apply_multiclass(
                        testFeat).get_labels()

            except Exception as e:
                return [-1]

            time = totalTimer.ElapsedTime()
            if len(self.dataset) > 1:
                return [time, self.predictions]

            return [time]
def kernel_combined (fm_train_real=traindat,fm_test_real=testdat,fm_train_dna=traindna,fm_test_dna=testdna ):
	from shogun import CombinedKernel, GaussianKernel, FixedDegreeStringKernel, LocalAlignmentStringKernel
	from shogun import RealFeatures, StringCharFeatures, CombinedFeatures, DNA

	kernel=CombinedKernel()
	feats_train=CombinedFeatures()
	feats_test=CombinedFeatures()

	subkfeats_train=RealFeatures(fm_train_real)
	subkfeats_test=RealFeatures(fm_test_real)
	subkernel=GaussianKernel(10, 1.1)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
	subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
	degree=3
	subkernel=FixedDegreeStringKernel(10, degree)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	subkfeats_train=StringCharFeatures(fm_train_dna, DNA)
	subkfeats_test=StringCharFeatures(fm_test_dna, DNA)
	subkernel=LocalAlignmentStringKernel(10)
	feats_train.append_feature_obj(subkfeats_train)
	feats_test.append_feature_obj(subkfeats_test)
	kernel.append_kernel(subkernel)

	kernel.init(feats_train, feats_train)
	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
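# A hedged usage sketch: the real-valued matrices use Shogun's
# column-per-sample layout, and the DNA strings are kept at equal length
# (an assumption needed by FixedDegreeStringKernel); all inputs below are
# synthetic stand-ins.
from numpy import random
real_train = random.rand(3, 10)
real_test = random.rand(3, 10)
dna_train = ['ACGTACGTACGT' for _ in range(10)]
dna_test = ['TTGCAACGTAGC' for _ in range(10)]
km_train, km_test, kernel = kernel_combined(real_train, real_test, dna_train, dna_test)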
def metric_lmnn(train_fname=traindat,
                test_fname=testdat,
                label_train_fname=label_traindat,
                k=3):
    try:
        from shogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = MulticlassLabels(CSVFile(label_train_fname))

    # LMNN
    lmnn = LMNN(feats_train, labels, k)
    lmnn.train()
    lmnn_distance = lmnn.get_distance()

    # perform classification with KNN
    knn = KNN(k, lmnn_distance, labels)
    knn.train()
    output = knn.apply(feats_test).get_labels()

    return lmnn, output
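# A hedged usage sketch that first writes toy CSV inputs; the orientation
# (feature samples as columns, one label per line) is an assumption about how
# Shogun's CSVFile-backed loaders interpret these files.
import numpy as np
np.savetxt('lmnn_train.csv', np.random.randn(2, 12), delimiter=',')
np.savetxt('lmnn_test.csv', np.random.randn(2, 6), delimiter=',')
np.savetxt('lmnn_labels.csv', np.repeat([0.0, 1.0, 2.0], 4))
result = metric_lmnn('lmnn_train.csv', 'lmnn_test.csv', 'lmnn_labels.csv', k=3)
if result is not None:
    lmnn, output = result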
    def __init__(self, method_param, run_param):
        self.info = "SHOGUN_RANDOMFOREST (" + str(method_param) + ")"

        # Assemble run model parameter.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        self.train_feat = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        self.train_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(self.train_labels)

        if len(self.data) >= 2:
            self.test_feat = RealFeatures(self.data[1].T)

        self.num_trees = 50
        if "num-trees" in method_param:
            self.num_trees = int(method_param["num-trees"])

        self.form = 1
        if "dimensions" in method_param:
            self.form = int(method_param["dimensions"])

        self.solver = "auto"
        if "solver" in method_param:
            self.solver = str(method_param["solver"])
def fit_and_predict(load_test_data, train_data, test_feature_matrics,
                    train_label, test_label_OR_test_data):
    from shogun import RealFeatures, BinaryLabels, AveragedPerceptron
    from shogun import AccuracyMeasure, F1Measure

    features_train = RealFeatures(train_data)
    features_test = RealFeatures(test_feature_matrics)
    labels_train = BinaryLabels(train_label)

    learn_rate = 1.0
    max_iter = 1000
    perceptron = AveragedPerceptron(features_train, labels_train)
    perceptron.set_learn_rate(learn_rate)
    perceptron.set_max_iter(max_iter)
    perceptron.train()
    perceptron.set_features(features_test)
    labels_predict = perceptron.apply()
    if load_test_data:
        prediction = labels_predict.get_labels()
        del test_label_OR_test_data['question_text']
        test_label_OR_test_data.insert(1, 'prediction', prediction)
        test_label_OR_test_data.to_csv('submission.csv', index=False)
        return prediction
    else:
        labels_test = BinaryLabels(test_label_OR_test_data)
        accEval = AccuracyMeasure()
        accuracy = accEval.evaluate(labels_predict, labels_test)
        f1Eval = F1Measure()
        f1_score = f1Eval.evaluate(labels_predict, labels_test)
        print('#accuracy is: ', accuracy)
        print('#F1 score is: ', f1_score)
def mkl_multiclass_1(fm_train_real, fm_test_real, label_train_multiclass, C,
                     num_threads=1, mkl_epsilon=0.001):
    # the defaults for num_threads and mkl_epsilon are assumptions
    from shogun import CombinedKernel, CombinedFeatures, RealFeatures
    from shogun import MulticlassLabels, MKLMulticlass, GaussianKernel

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    for i in range(-10, 11):
        subkfeats_train = RealFeatures(fm_train_real)
        subkfeats_test = RealFeatures(fm_test_real)
        subkernel = GaussianKernel(pow(2, i + 1))
        feats_train.append_feature_obj(subkfeats_train)
        feats_test.append_feature_obj(subkfeats_test)
        kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)

    mkl.set_epsilon(1e-2)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(1)

    mkl.train()

    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
def classifier_svmocas(train_fname=traindat,
                       test_fname=testdat,
                       label_fname=label_traindat,
                       C=0.9,
                       epsilon=1e-5,
                       num_threads=1):
    from shogun import RealFeatures, BinaryLabels
    from shogun import CSVFile
    try:
        from shogun import SVMOcas
    except ImportError:
        print("SVMOcas not available")
        return

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))

    svm = SVMOcas(C, feats_train, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.set_bias_enabled(False)
    svm.train()

    bias = svm.get_bias()
    w = svm.get_w()
    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
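# A hedged usage sketch with toy CSV inputs (same assumed file layout as in
# the LMNN sketch above: feature samples as columns, one label per line):
import numpy as np
np.savetxt('ocas_train.csv', np.random.randn(2, 20), delimiter=',')
np.savetxt('ocas_test.csv', np.random.randn(2, 8), delimiter=',')
np.savetxt('ocas_labels.csv', 2.0 * (np.random.randn(20) > 0) - 1.0)
result = classifier_svmocas('ocas_train.csv', 'ocas_test.csv', 'ocas_labels.csv')
if result is not None:
    predictions, svm, labels_out = result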
    def predict(self, image):
        """
        Predict the face
        """
        # flatten the image into a single column vector
        imageAsRow = np.asarray(
            image.reshape(image.shape[0] * image.shape[1], 1), np.float64)
        # project into the subspace
        p = self.pca.apply_to_feature_vector(
            RealFeatures(imageAsRow).get_feature_vector(0))

        #min value to find the face
        minDist = 1e100
        #class
        minClass = -1
        #search which face is the best match
        for sampleIdx in range(len(self._projections)):
            test = RealFeatures(np.asmatrix(p, np.float64).T)
            projection = RealFeatures(
                np.asmatrix(self._projections[sampleIdx], np.float64).T)
            dist = EuclideanDistance(test, projection).distance(0, 0)

            if (dist < minDist):
                minDist = dist
                minClass = self._labels[sampleIdx]

        return minClass
def metric_lmnn_statistics(k=3, fname_features='../../data/fm_train_multiclass_digits.dat.gz', fname_labels='../../data/label_train_multiclass_digits.dat'):
	try:
		from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
		import matplotlib.pyplot as pyplot
	except ImportError:
		print('Error importing shogun or other required modules. Please verify their installation.')
		return

	features = RealFeatures(load_compressed_features(fname_features).T)
	labels = MulticlassLabels(CSVFile(fname_labels))

#	print 'number of examples = %d' % features.get_num_vectors()
#	print 'number of features = %d' % features.get_num_features()

	assert(features.get_num_vectors() == labels.get_num_labels())

	# train LMNN
	lmnn = LMNN(features, labels, k)
	lmnn.set_correction(100)
#	lmnn.io.set_loglevel(MSG_DEBUG)
	print('Training LMNN, this will take about two minutes...')
	lmnn.train()
	print('Training done!')

	# plot objective obtained during training
	statistics = lmnn.get_statistics()

	pyplot.plot(statistics.obj.get())
	pyplot.grid(True)
	pyplot.xlabel('Iterations')
	pyplot.ylabel('LMNN objective')
	pyplot.title('LMNN objective during training for the multiclass digits data set')

	pyplot.show()
def transfer_multitask_leastsquares_regression (fm_train=traindat,fm_test=testdat,label_train=label_traindat):
	from shogun import RegressionLabels, RealFeatures, Task, TaskGroup
	try:
		from shogun import MultitaskLeastSquaresRegression
	except ImportError:
		print("MultitaskLeastSquaresRegression not available")
		exit(0)

	features = RealFeatures(fm_train)
	labels = RegressionLabels(label_train)

	n_vectors = features.get_num_vectors()
	task_one = Task(0,n_vectors//2)
	task_two = Task(n_vectors//2,n_vectors)
	task_group = TaskGroup()
	task_group.append_task(task_one)
	task_group.append_task(task_two)

	mtlsr = MultitaskLeastSquaresRegression(0.1,features,labels,task_group)
	mtlsr.set_regularization(1) # use regularization ratio
	mtlsr.set_tolerance(1e-2) # use 1e-2 tolerance
	mtlsr.train()
	mtlsr.set_current_task(0)
	out = mtlsr.apply_regression().get_labels()
	return out
    def __init__(self, method_param, run_param):
        self.info = "SHOGUN_LDA (" + str(method_param) + ")"

        # Assemble run model parameter.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        self.train_feat = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        self.train_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(self.train_labels)

        if len(self.data) >= 2:
            self.test_feat = RealFeatures(self.data[1].T)

        self.tolerance = 1e-4
        if "tolerance" in method_param:
            self.tolerance = float(method_param["tolerance"])

        self.store_cov = False
        if "store-covar" in method_param:
            self.store_cov = bool(method_param["store-covar"])

        self.solver = "auto"
        if "solver" in method_param:
            self.solver = str(method_param["solver"])
def regression_svrlight (fm_train=traindat,fm_test=testdat,label_train=label_traindat, \
				    width=1.2,C=1,epsilon=1e-5,tube_epsilon=1e-2,num_threads=3):


	from shogun import RegressionLabels, RealFeatures
	from shogun import GaussianKernel
	try:
		from shogun import SVRLight
	except ImportError:
		print('No support for SVRLight available.')
		return

	feats_train=RealFeatures(fm_train)
	feats_test=RealFeatures(fm_test)

	kernel=GaussianKernel(feats_train, feats_train, width)

	labels=RegressionLabels(label_train)

	svr=SVRLight(C, epsilon, kernel, labels)
	svr.set_tube_epsilon(tube_epsilon)
	svr.parallel.set_num_threads(num_threads)
	svr.train()

	kernel.init(feats_train, feats_test)
	out = svr.apply().get_labels()

	return out, kernel
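# A hedged usage sketch; SVRLight is only present when Shogun is built with
# SVMlight support, hence the guard on the returned value.
from numpy import random, sin
X = random.rand(1, 40) * 6
y = sin(X[0])
result = regression_svrlight(X, X, y, width=1.2, C=1, num_threads=1)
if result is not None:
	out, kernel = result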
def transfer_multitask_logistic_regression(fm_train=traindat,
                                           fm_test=testdat,
                                           label_train=label_traindat):
    from numpy import hstack
    from shogun import BinaryLabels, RealFeatures, Task, TaskGroup
    try:
        from shogun import MultitaskLogisticRegression
    except ImportError:
        print("MultitaskLogisticRegression not available")
        exit()

    features = RealFeatures(hstack((fm_train, fm_train)))
    labels = BinaryLabels(hstack((label_train, label_train)))

    n_vectors = features.get_num_vectors()
    task_one = Task(0, n_vectors // 2)
    task_two = Task(n_vectors // 2, n_vectors)
    task_group = TaskGroup()
    task_group.append_task(task_one)
    task_group.append_task(task_two)

    mtlr = MultitaskLogisticRegression(0.1, features, labels, task_group)
    mtlr.set_regularization(1)  # use regularization ratio
    mtlr.set_tolerance(1e-2)  # use 1e-2 tolerance
    mtlr.train()
    mtlr.set_current_task(0)
    out = mtlr.apply().get_labels()

    return out
    def __init__(self, method_param, run_param):
        self.info = "SHOGUN_KNN (" + str(method_param) + ")"

        # Assemble run model parameter.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        self.train_feat = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        self.train_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(self.train_labels)

        if len(self.data) >= 2:
            self.test_feat = RealFeatures(self.data[1].T)

        self.k = 3
        if "k" in method_param:
            self.k = int(method_param["k"])

        self.distance = "Euclidean"
        if "distance" in method_param:
            self.distance = str(method_param["distance"])

        self.solver = "Brute"
        if "solver" in method_param:
            self.solver = str(method_param["solver"])

        self.degree = 3
        if "degree" in method_param:
            self.degree = float(method_param["degree"])
def features_dense(A=matrixA, B=matrixB, C=matrixC):

    a = RealFeatures(A)
    b = LongIntFeatures(B)
    c = ByteFeatures(C)

    # or 16bit wide ...
    #feat1 = f.ShortFeatures(N.zeros((10,5),N.short))
    #feat2 = f.WordFeatures(N.zeros((10,5),N.uint16))

    # print some statistics about a

    # get first feature vector and set it

    a.set_feature_vector(array([1, 4, 0, 0, 0, 9], dtype=float64), 0)

    # get matrices
    a_out = a.get_feature_matrix()
    b_out = b.get_feature_matrix()
    c_out = c.get_feature_matrix()

    assert (all(a_out == A))

    assert (all(b_out == B))

    assert (all(c_out == C))
    return a_out, b_out, c_out, a, b, c
    def RunRandomForestShogun():
      totalTimer = Timer()

      Log.Info("Loading dataset", self.verbose)
      trainData, labels = SplitTrainData(self.dataset)
      trainData = RealFeatures(trainData.T)
      labels = MulticlassLabels(labels)
      testData = RealFeatures(LoadDataset(self.dataset[1]).T)

      if "num_trees" in options:
        self.numTrees = int(options.pop("num_trees"))
      else:
        Log.Fatal("Required parameter 'num_trees' not specified!")
        raise Exception("missing parameter")

      self.form = 1
      if "dimensions" in options:
        self.form = int(options.pop("dimensions"))

      if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

      try:
        with totalTimer:
          self.model = self.BuildModel(trainData, labels, options)
          # Run the Random Forest Classifier on the test dataset.
          self.predictions = self.model.apply_multiclass(testData).get_labels()
      except Exception as e:
        return [-1]

      time = totalTimer.ElapsedTime()
      return [time, self.predictions]
        def RunDTCShogun():
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            trainData, labels = SplitTrainData(self.dataset)
            trainData = RealFeatures(trainData.T)
            labels = MulticlassLabels(labels)
            testData = RealFeatures(LoadDataset(self.dataset[1]).T)

            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            try:
                with totalTimer:
                    self.model = self.BuildModel(trainData, labels, options)
                    # Run the CARTree Classifier on the test dataset.
                    self.predictions = self.model.apply_multiclass(
                        testData).get_labels()
            except Exception as e:
                return [-1]

            time = totalTimer.ElapsedTime()
            if len(self.dataset) > 1:
                return [time, self.predictions]

            return [time]
def classifier_gpbtsvm(train_fname=traindat,
                       test_fname=testdat,
                       label_fname=label_traindat,
                       width=2.1,
                       C=1,
                       epsilon=1e-5):
    from shogun import RealFeatures, BinaryLabels
    from shogun import GaussianKernel
    from shogun import CSVFile
    try:
        from shogun import GPBTSVM
    except ImportError:
        print("GPBTSVM not available")
        exit(0)

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))
    labels = BinaryLabels(CSVFile(label_fname))
    kernel = GaussianKernel(feats_train, feats_train, width)

    svm = GPBTSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.train()

    predictions = svm.apply(feats_test)
    return predictions, svm, predictions.get_labels()
        def RunLDAShogun():
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the test file.
            try:
                if len(self.dataset) > 1:
                    testSet = LoadDataset(self.dataset[1])

                # Use the last row of the training set as the responses.
                trainSet, trainLabels = SplitTrainData(self.dataset)
                # if the labels are not in {0,1,2,...,num_classes-1}, map them to this set and store the mapping
                # shogun's MCLDA class requires the labels to be in {0,1,2,...,num_classes-1}
                distinctLabels = list(set(trainLabels))
                mapping = {}
                reverseMapping = {}
                idx = 0
                for label in distinctLabels:
                    mapping[label] = idx
                    reverseMapping[idx] = label
                    idx += 1
                for i in range(len(trainLabels)):
                    trainLabels[i] = mapping[trainLabels[i]]

                trainFeat = RealFeatures(trainSet.T)
                trainLabels = MulticlassLabels(trainLabels)
                # Gather optional parameters.
                if "tolerance" in options:
                    self.tolerance = float(options.pop("tolerance"))

                if "store" in options:
                    self.store = bool(options.pop("store"))

                if (len(options) > 0):
                    Log.Fatal("Unknown parameters: " + str(options))
                    raise Exception("unknown parameters")

                with totalTimer:
                    self.model = MCLDA(trainFeat, trainLabels, self.tolerance,
                                       self.store)
                    self.model.train()

                if (len(self.dataset) > 1):
                    self.predictions = self.model.apply_multiclass(
                        RealFeatures(testSet.T))
                    self.predictions = self.predictions.get_labels()
                    # reverse map the predicted labels to actual labels
                    for i in range(len(self.predictions)):
                        self.predictions[i] = reverseMapping[
                            self.predictions[i]]

            except Exception as e:
                Log.Info("Exception: " + str(e))
                return -1

            time = totalTimer.ElapsedTime()
            return time
        def RunAllKnnShogun():
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the query
            # file.
            try:
                Log.Info("Loading dataset", self.verbose)
                if len(self.dataset) == 2:
                    referenceData = np.genfromtxt(self.dataset[0],
                                                  delimiter=',')
                    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
                    queryFeat = RealFeatures(queryData.T)
                else:
                    referenceData = np.genfromtxt(self.dataset[0], delimiter=',')

                # Labels are the last row of the dataset.
                labels = MulticlassLabels(
                    referenceData[:, (referenceData.shape[1] - 1)])
                referenceData = referenceData[:, :-1]

                with totalTimer:
                    # Get all the parameters.
                    if "k" in options:
                        k = int(options.pop("k"))
                        if (k < 1 or k > referenceData.shape[0]):
                            Log.Fatal("Invalid k: " + str(k) +
                                      "; must be greater than 0" +
                                      " and less than or equal to " +
                                      str(referenceData.shape[0]))
                            return -1
                    else:
                        Log.Fatal(
                            "Required option: number of nearest neighbors to find."
                        )
                        return -1

                    if len(options) > 0:
                        Log.Fatal("Unknown parameters: " + str(options))
                        raise Exception("unknown parameters")

                    referenceFeat = RealFeatures(referenceData.T)
                    distance = EuclideanDistance(referenceFeat, referenceFeat)

                    # Perform All K-Nearest-Neighbors.
                    model = SKNN(k, distance, labels)
                    model.train()

                    if len(self.dataset) == 2:
                        out = model.apply(queryFeat).get_labels()
                    else:
                        out = model.apply(referenceFeat).get_labels()
            except Exception as e:
                return -1

            return totalTimer.ElapsedTime()
def modelselection_grid_search_kernel(num_subsets, num_vectors, dim_vectors):
    # init seed for reproducibility
    Math.init_random(1)
    random.seed(1)

    # create some (non-sense) data
    matrix = random.rand(dim_vectors, num_vectors)

    # wrap the random matrix into dense features
    features = RealFeatures()
    features.set_feature_matrix(matrix)

    # create labels, two classes
    labels = BinaryLabels(num_vectors)
    for i in range(num_vectors):
        labels.set_label(i, 1 if i % 2 == 0 else -1)

    # create svm
    classifier = LibSVM()

    # splitting strategy
    splitting_strategy = StratifiedCrossValidationSplitting(
        labels, num_subsets)

    # accuracy evaluation
    evaluation_criterion = ContingencyTableEvaluation(ACCURACY)

    # cross validation class for evaluation in model selection
    cross = CrossValidation(classifier, features, labels, splitting_strategy,
                            evaluation_criterion)
    cross.set_num_runs(1)

    # print all parameters available for model selection
    # Don't worry if yours is not included; simply write to the mailing list
    #classifier.print_modsel_params()

    # model parameter selection
    param_tree = create_param_tree()
    #param_tree.print_tree()

    grid_search = GridSearchModelSelection(cross, param_tree)

    print_state = False
    best_combination = grid_search.select_model(print_state)
    #print("best parameter(s):")
    #best_combination.print_tree()

    best_combination.apply_to_machine(classifier)

    # larger number of runs to have less variance
    cross.set_num_runs(10)
    result = cross.evaluate()
    casted = CrossValidationResult.obtain_from_generic(result)
    #print "result mean:", casted.mean

    return classifier, result, casted.get_mean()
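# A brief usage sketch; the function additionally relies on create_param_tree()
# and the Shogun model-selection classes being imported at module level:
classifier, result, mean_accuracy = modelselection_grid_search_kernel(
    num_subsets=3, num_vectors=20, dim_vectors=3)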
def features_dense_zero_copy (in_data=data):
	import numpy
	from numpy import float64
	feats = None
	if numpy.__version__ >= '1.5':
		feats=numpy.array(in_data, dtype=float64, order='F')

		a=RealFeatures()
		a.frombuffer(feats, False)

		b=numpy.array(a, copy=False)
		c=numpy.array(a, copy=True)

		d=RealFeatures()
		d.frombuffer(a, False)

		e=RealFeatures()
		e.frombuffer(a, True)

		a[:,0]=0
		#print a[0:4]
		#print b[0:4]
		#print c[0:4]
		#print d[0:4]
		#print e[0:4]
	else:
		print("numpy version >= 1.5 is needed")

	return feats
def distance_chisquare(train_fname=traindat, test_fname=testdat):
    from shogun import RealFeatures, ChiSquareDistance, CSVFile
    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    distance = ChiSquareDistance(feats_train, feats_train)
    dm_train = distance.get_distance_matrix()

    distance.init(feats_train, feats_test)
    dm_test = distance.get_distance_matrix()
    return distance, dm_train, dm_test
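# A hedged usage sketch; the chi-square distance assumes non-negative inputs,
# so the toy CSV matrices hold values in [0, 1):
import numpy as np
np.savetxt('chi_train.csv', np.random.rand(3, 10), delimiter=',')
np.savetxt('chi_test.csv', np.random.rand(3, 5), delimiter=',')
distance, dm_train, dm_test = distance_chisquare('chi_train.csv', 'chi_test.csv')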
def kernel_sigmoid (train_fname=traindat,test_fname=testdat,size_cache=10,gamma=1.2,coef0=1.3):
	from shogun import RealFeatures, SigmoidKernel, CSVFile

	feats_train=RealFeatures(CSVFile(train_fname))
	feats_test=RealFeatures(CSVFile(test_fname))

	kernel=SigmoidKernel(feats_train, feats_train, size_cache, gamma, coef0)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def kernel_cauchy (train_fname=traindat,test_fname=testdat, sigma=1.0):
	from shogun import RealFeatures, CauchyKernel, CSVFile, EuclideanDistance
	feats_train=RealFeatures(CSVFile(train_fname))
	feats_test=RealFeatures(CSVFile(test_fname))

	distance=EuclideanDistance(feats_train, feats_train)
	kernel=CauchyKernel(feats_train, feats_train, sigma, distance)
	km_train=kernel.get_kernel_matrix()

	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()
	return km_train,km_test,kernel
def kernel_gaussian(train_fname=traindat, test_fname=testdat, width=1.3):
    from shogun import RealFeatures, GaussianKernel, CSVFile

    feats_train = RealFeatures(CSVFile(train_fname))
    feats_test = RealFeatures(CSVFile(test_fname))

    kernel = GaussianKernel(feats_train, feats_train, width)
    km_train = kernel.get_kernel_matrix()

    kernel.init(feats_train, feats_test)
    km_test = kernel.get_kernel_matrix()
    return km_train, km_test, kernel
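# A hedged usage sketch, representative for the sigmoid, Cauchy and Gaussian
# kernel loaders above, which all share the same CSV-based calling convention:
import numpy as np
np.savetxt('kern_train.csv', np.random.randn(2, 10), delimiter=',')
np.savetxt('kern_test.csv', np.random.randn(2, 4), delimiter=',')
km_train, km_test, kernel = kernel_gaussian('kern_train.csv', 'kern_test.csv', width=1.3)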
def features_dense_real (A=matrix):

    # create dense features of type Real
    a=RealFeatures(A)

    # print some statistics about a
    #print(a.get_num_vectors())
    #print(a.get_num_features())

    # get first feature vector and set it
    #print(a.get_feature_vector(0))
    a.set_feature_vector(array([1,4,0,0,0,9], dtype=float64), 0)

    # get matrix
    a_out = a.get_feature_matrix()

    assert(all(a_out==A))
    return a_out
def preprocessor_prunevarsubmean (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
	from shogun import Chi2Kernel
	from shogun import RealFeatures
	from shogun import PruneVarSubMean

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preproc=PruneVarSubMean()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
def preprocessor_randomfouriergausspreproc (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):
	from shogun import Chi2Kernel
	from shogun import RealFeatures
	from shogun import RandomFourierGaussPreproc

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preproc=RandomFourierGaussPreproc()
	preproc.init(feats_train)
	feats_train.add_preprocessor(preproc)
	feats_train.apply_preprocessor()
	feats_test.add_preprocessor(preproc)
	feats_test.apply_preprocessor()

	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
def preprocessor_normone (fm_train_real=traindat,fm_test_real=testdat,width=1.4,size_cache=10):

	from shogun import Chi2Kernel
	from shogun import RealFeatures
	from shogun import NormOne

	feats_train=RealFeatures(fm_train_real)
	feats_test=RealFeatures(fm_test_real)

	preprocessor=NormOne()
	preprocessor.init(feats_train)
	feats_train.add_preprocessor(preprocessor)
	feats_train.apply_preprocessor()
	feats_test.add_preprocessor(preprocessor)
	feats_test.apply_preprocessor()

	kernel=Chi2Kernel(feats_train, feats_train, width, size_cache)

	km_train=kernel.get_kernel_matrix()
	kernel.init(feats_train, feats_test)
	km_test=kernel.get_kernel_matrix()

	return km_train,km_test,kernel
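# A hedged usage sketch, applicable to any of the three preprocessor examples
# above (they differ only in the preprocessor class); Chi2Kernel assumes
# non-negative inputs, hence values in [0, 1):
from numpy import random
train = random.rand(3, 12)
test = random.rand(3, 5)
km_train, km_test, kernel = preprocessor_normone(train, test, width=1.4, size_cache=10)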
def stochasticgbmachine(train=traindat,train_labels=label_traindat,ft=feat_types):
	try:
		import numpy as np
		from shogun import RealFeatures, RegressionLabels, CSVFile, CARTree, StochasticGBMachine, SquaredLoss
	except ImportError:
		print("Could not import Shogun modules")
		return

	# wrap features and labels into Shogun objects
	feats=RealFeatures(CSVFile(train))
	labels=RegressionLabels(CSVFile(train_labels))

	# divide into training (90%) and test dataset (10%)
	p=np.random.permutation(labels.get_num_labels())
	num=labels.get_num_labels()*0.9

	cart=CARTree()
	cart.set_feature_types(ft)
	cart.set_max_depth(1)
	loss=SquaredLoss()
	s=StochasticGBMachine(cart,loss,500,0.01,0.6)

	# train
	feats.add_subset(np.int32(p[0:int(num)]))
	labels.add_subset(np.int32(p[0:int(num)]))
	s.set_labels(labels)
	s.train(feats)
	feats.remove_subset()
	labels.remove_subset()

	# apply
	feats.add_subset(np.int32(p[int(num):len(p)]))
	labels.add_subset(np.int32(p[int(num):len(p)]))
	output=s.apply_regression(feats)

	feats.remove_subset()
	labels.remove_subset()

	return s,output
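# A hedged usage sketch; the toy CSV files and the feature-type vector
# (False marking a continuous attribute, per Shogun's CARTree convention)
# are assumptions:
import numpy as np
np.savetxt('gbm_X.csv', np.random.rand(1, 100), delimiter=',')
np.savetxt('gbm_y.csv', np.random.rand(100))
feat_types = np.array([False])
result = stochasticgbmachine('gbm_X.csv', 'gbm_y.csv', feat_types)
if result is not None:
	machine, predictions = result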
def hsic_graphical():
	# parameters, change to get different results
	m=250
	difference=3

	# setting the angle lower makes a harder test
	angle=pi/30

	# number of samples taken from null and alternative distribution
	num_null_samples=500

	# use data generator class to produce example data
	data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)

	# create shogun feature representation
	features_x=RealFeatures(array([data[0]]))
	features_y=RealFeatures(array([data[1]]))

	# compute median data distance in order to use it for the Gaussian kernel width
	# 0.5*median_distance normally (factor two in Gaussian kernel)
	# However, Shogun's kernel width uses a different parametrization,
	# therefore use 0.5*2*median_distance^2
	# Use a subset of the data for this, only 200 elements; the median is stable
	subset=int32(array([x for x in range(features_x.get_num_vectors())])) # numpy
	subset=random.permutation(subset) # numpy permutation
	subset=subset[0:200]
	features_x.add_subset(subset)
	dist=EuclideanDistance(features_x, features_x)
	distances=dist.get_distance_matrix()
	features_x.remove_subset()
	median_distance=np.median(distances)
	sigma_x=median_distance**2
	features_y.add_subset(subset)
	dist=EuclideanDistance(features_y, features_y)
	distances=dist.get_distance_matrix()
	features_y.remove_subset()
	median_distance=np.median(distances)
	sigma_y=median_distance**2
	print "median distance for Gaussian kernel on x:", sigma_x
	print "median distance for Gaussian kernel on y:", sigma_y
	kernel_x=GaussianKernel(10,sigma_x)
	kernel_y=GaussianKernel(10,sigma_y)

	# create hsic instance. Note that this is a convenience constructor which copies
	# feature data, so features_x and features_y are not the objects used inside hsic.
	# This is only for user-friendliness; usually it is fine to do this.
	# Below, the alternative distribution is sampled, which means
	# that new feature objects have to be created in each iteration (slow).
	# However, normally the alternative distribution is not sampled.
	hsic=HSIC(kernel_x,kernel_y,features_x,features_y)

	# sample alternative distribution
	alt_samples=zeros(num_null_samples)
	for i in range(len(alt_samples)):
		data=DataGenerator.generate_sym_mix_gauss(m,difference,angle)
		features_x.set_feature_matrix(array([data[0]]))
		features_y.set_feature_matrix(array([data[1]]))

		# re-create the hsic instance every time, since feature objects are copied
		# due to the use of the convenience constructor
		hsic=HSIC(kernel_x,kernel_y,features_x,features_y)
		alt_samples[i]=hsic.compute_statistic()

	# sample from null distribution
	# permutation, biased statistic
	hsic.set_null_approximation_method(PERMUTATION)
	hsic.set_num_null_samples(num_null_samples)
	null_samples_boot=hsic.sample_null()

	# fit gamma distribution, biased statistic
	hsic.set_null_approximation_method(HSIC_GAMMA)
	gamma_params=hsic.fit_null_gamma()
	# sample gamma with parameters
	null_samples_gamma=array([gamma(gamma_params[0], gamma_params[1]) for _ in range(num_null_samples)])

	# plot
	figure()

	# plot data x and y
	subplot(2,2,1)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 4) ) # reduce number of y-ticks
	grid(True)
	plot(data[0], data[1], 'o')
	title('Data, rotation=$\pi$/'+str(1/angle*pi)+'\nm='+str(m))
	xlabel('$x$')
	ylabel('$y$')

	# compute threshold for test level
	alpha=0.05
	null_samples_boot.sort()
	null_samples_gamma.sort()
	thresh_boot=null_samples_boot[int(floor(len(null_samples_boot)*(1-alpha)))]
	thresh_gamma=null_samples_gamma[int(floor(len(null_samples_gamma)*(1-alpha)))]

	type_one_error_boot=sum(null_samples_boot<thresh_boot)/float(num_null_samples)
	type_one_error_gamma=sum(null_samples_gamma<thresh_boot)/float(num_null_samples)

	# plot alternative distribution with threshold
	subplot(2,2,2)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(alt_samples, 20, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	type_two_error=sum(alt_samples<thresh_boot)/float(num_null_samples)
	title('Alternative Dist.\n' + 'Type II error is ' + str(type_two_error))

	# compute range for all null distribution histograms
	hist_range=[min([min(null_samples_boot), min(null_samples_gamma)]), max([max(null_samples_boot), max(null_samples_gamma)])]

	# plot null distribution with threshold
	subplot(2,2,3)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(null_samples_boot, 20, range=hist_range, normed=True);
	axvline(thresh_boot, 0, 1, linewidth=2, color='red')
	title('Sampled Null Dist.\n' + 'Type I error is '  + str(type_one_error_boot))

	# plot null distribution gamma
	subplot(2,2,4)
	gca().xaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of x-ticks
	gca().yaxis.set_major_locator( MaxNLocator(nbins = 3) ) # reduce number of y-ticks
	grid(True)
	hist(null_samples_gamma, 20, range=hist_range, normed=True);
	axvline(thresh_gamma, 0, 1, linewidth=2, color='red')
	title('Null Dist. Gamma\nType I error is '  + str(type_one_error_gamma))
	grid(True)

	# pull plots a bit apart
	subplots_adjust(hspace=0.5)
	subplots_adjust(wspace=0.5)
def serialization_complex_example (num=5, dist=1, dim=10, C=2.0, width=10):
	import os
	from numpy import concatenate, zeros, ones
	from numpy.random import randn, seed
	from shogun import RealFeatures, MulticlassLabels
	from shogun import GMNPSVM
	from shogun import GaussianKernel
	from shogun import SerializableHdf5File,SerializableAsciiFile, \
			SerializableJsonFile,SerializableXmlFile,MSG_DEBUG
	from shogun import NormOne, LogPlusOne
	from tempfile import NamedTemporaryFile

	seed(17)

	data=concatenate((randn(dim, num), randn(dim, num) + dist,
					  randn(dim, num) + 2*dist,
					  randn(dim, num) + 3*dist), axis=1)
	lab=concatenate((zeros(num), ones(num), 2*ones(num), 3*ones(num)))

	feats=RealFeatures(data)
	#feats.io.set_loglevel(MSG_DEBUG)
	#feats.io.enable_file_and_line()
	kernel=GaussianKernel(feats, feats, width)

	labels=MulticlassLabels(lab)

	svm = GMNPSVM(C, kernel, labels)

	feats.add_preprocessor(NormOne())
	feats.add_preprocessor(LogPlusOne())
	feats.set_preprocessed(1)
	svm.train(feats)
	bias_ref = svm.get_svm(0).get_bias()

	#svm.print_serializable()

	tmp_h5 = NamedTemporaryFile(suffix='h5')
	fstream = SerializableHdf5File(tmp_h5.name, "w")
	status = svm.save_serializable(fstream)
	check_status(status,'h5')

	tmp_asc = NamedTemporaryFile(suffix='asc')
	fstream = SerializableAsciiFile(tmp_asc.name, "w")
	status = svm.save_serializable(fstream)
	check_status(status,'asc')

	tmp_json = NamedTemporaryFile(suffix='json')
	fstream = SerializableJsonFile(tmp_json.name, "w")
	status = svm.save_serializable(fstream)
	check_status(status,'json')

	tmp_xml = NamedTemporaryFile(suffix='xml')
	fstream = SerializableXmlFile(tmp_xml.name, "w")
	status = svm.save_serializable(fstream)
	check_status(status,'xml')

	fstream = SerializableHdf5File(tmp_h5.name, "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status,'h5')
	new_svm.train()
	bias_h5 = new_svm.get_svm(0).get_bias()

	fstream = SerializableAsciiFile(tmp_asc.name, "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status,'asc')
	new_svm.train()
	bias_asc = new_svm.get_svm(0).get_bias()

	fstream = SerializableJsonFile(tmp_json.name, "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status,'json')
	new_svm.train()
	bias_json = new_svm.get_svm(0).get_bias()

	fstream = SerializableXmlFile(tmp_xml.name, "r")
	new_svm=GMNPSVM()
	status = new_svm.load_serializable(fstream)
	check_status(status,'xml')
	new_svm.train()
	bias_xml = new_svm.get_svm(0).get_bias()

	return svm,new_svm, bias_ref, bias_h5, bias_asc, bias_json, bias_xml
def create_features(kname, examples, kparam, train_mode, preproc, seq_source, nuc_con):
    """Converts numpy arrays or sequences into shogun features"""

    if kname == 'gauss' or kname == 'linear' or kname == 'poly':
        examples = numpy.array(examples)
        feats = RealFeatures(examples)

    elif kname == 'wd' or kname == 'localalign' or kname == 'localimprove':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

    elif kname == 'spec' or kname == 'cumspec':
        if seq_source == 'dna':
            examples = non_atcg_convert(examples, nuc_con)
            feats = StringCharFeatures(examples, DNA)
        elif seq_source == 'protein':
            examples = non_aminoacid_converter(examples, nuc_con)
            feats = StringCharFeatures(examples, PROTEIN)
        else:
            sys.stderr.write("Sequence source -"+seq_source+"- is invalid. select [dna|protein]\n")
            sys.exit(-1)

        wf = StringUlongFeatures( feats.get_alphabet() )
        wf.obtain_from_char(feats, kparam['degree']-1, kparam['degree'], 0, kname=='cumspec')
        del feats

        if train_mode:
            preproc = SortUlongString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        #assert(ret)

        feats = wf
    elif kname == 'spec2' or kname == 'cumspec2':
        # spectrum kernel on two sequences
        feats = {}
        feats['combined'] = CombinedFeatures()

        reversed = kname=='cumspec2'

        (ex0,ex1) = zip(*examples)

        f0 = StringCharFeatures(list(ex0), DNA)
        wf = StringWordFeatures(f0.get_alphabet())
        wf.obtain_from_char(f0, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f0

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f0'] = wf

        f1 = StringCharFeatures(list(ex1), DNA)
        wf = StringWordFeatures( f1.get_alphabet() )
        wf.obtain_from_char(f1, kparam['degree']-1, kparam['degree'], 0, reversed)
        del f1

        if train_mode:
            preproc = SortWordString()
            preproc.init(wf)
        wf.add_preprocessor(preproc)
        ret = wf.apply_preprocessor()
        assert(ret)
        feats['combined'].append_feature_obj(wf)
        feats['f1'] = wf

    else:
        print('Unknown kernel %s' % kname)

    return (feats,preproc)
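# A hedged usage sketch for the numeric branch; RealFeatures, the other Shogun
# classes, and the helper converters are assumed to be imported or defined at
# module level in the original script:
import numpy
examples = numpy.random.rand(4, 10)
feats, preproc = create_features('linear', examples, {}, True, None, None, None)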