def metric_lmnn_statistics(k=3, fname_features='../../data/fm_train_multiclass_digits.dat.gz', fname_labels='../../data/label_train_multiclass_digits.dat'):
    """Train LMNN on the multiclass digits data set and plot the objective.

    k: number of target neighbours used by LMNN.
    fname_features: gzipped feature matrix, read via load_compressed_features
        (defined elsewhere in this module).
    fname_labels: CSV file with one label per example.
    Shows a matplotlib figure of the objective per iteration; returns None.
    """
    try:
        from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        # print() function instead of the Python 2 print statement the
        # original used; the statement form is a SyntaxError on Python 3.
        print('Error importing shogun or other required modules. Please, verify their installation.')
        return

    # shogun RealFeatures expects one column per example, hence the transpose
    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    assert features.get_num_vectors() == labels.get_num_labels()

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    print('Training LMNN, this will take about two minutes...')
    lmnn.train()
    print('Training done!')

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel('Iterations')
    pyplot.ylabel('LMNN objective')
    pyplot.title('LMNN objective during training for the multiclass digits data set')

    pyplot.show()
def evaluation_multiclassaccuracy (ground_truth, predicted):
	"""Return the multiclass classification accuracy of *predicted* vs *ground_truth*.

	Both arguments are raw label arrays; they are wrapped in shogun
	MulticlassLabels before evaluation.
	"""
	from shogun import MulticlassAccuracy, MulticlassLabels

	truth = MulticlassLabels(ground_truth)
	prediction = MulticlassLabels(predicted)

	return MulticlassAccuracy().evaluate(prediction, truth)
Beispiel #3
0
 def BuildModel(self, data, responses):
     """Create and train a shogun Perceptron multiclass model.

     data: sample matrix, one row per sample -- transposed because shogun's
         RealFeatures is column-major (presumably a numpy 2-D array; confirm).
     responses: per-sample labels, wrapped in MulticlassLabels.
     Returns the trained Perceptron.
     """
     # Create and train the classifier.
     model = Perceptron(RealFeatures(data.T), MulticlassLabels(responses))
     # self.iterations, when truthy, caps the number of training iterations.
     if self.iterations:
         model.set_max_iter(self.iterations)
     model.train()
     return model
Beispiel #4
0
    def __init__(self, method_param, run_param):
        """Configure a shogun decision-tree classifier (DTC) benchmark run.

        method_param: settings dict; must contain "datasets", may contain
            "pruning" (enable CV pruning) and "k" (number of CV folds).
        run_param: runner-level settings (unused here; kept for a uniform
            constructor interface across benchmark wrappers).
        """
        self.info = "SHOGUN_DTC (" + str(method_param) + ")"

        # Assemble run model parameter.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        # shogun RealFeatures is column-major: one column per sample.
        self.train_feat = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        self.train_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(self.train_labels)

        # A second dataset entry, when present, is the test set.
        if len(self.data) >= 2:
            self.test_feat = RealFeatures(self.data[1].T)

        # Flag for Cross Validation Pruning
        self.cv_prune = False
        if "pruning" in method_param:
            self.cv_prune = bool(method_param["pruning"])

        self.num_folds = 2
        if "k" in method_param:
            # Making sure that the value is of the right type
            self.num_folds = int(method_param["k"])
Beispiel #5
0
        def RunNBCShogun():
            """Time shogun naive-Bayes training and prediction on self.dataset.

            Returns [time, predictions] when a test file exists, [time]
            otherwise, and [-1] on any failure.  Relies on closure variables:
            self, options, Timer, Log, np, RealFeatures, MulticlassLabels.
            """
            totalTimer = Timer()
            self.predictions = None
            Log.Info("Loading dataset", self.verbose)
            try:
                # Load train and test dataset.
                trainData = np.genfromtxt(self.dataset[0], delimiter=',')
                testData = np.genfromtxt(self.dataset[1], delimiter=',')

                # Labels are the last column of the training set.
                labels = MulticlassLabels(trainData[:,
                                                    (trainData.shape[1] - 1)])

                with totalTimer:
                    # Transform into features (column-major for shogun).
                    trainFeat = RealFeatures(trainData[:, :-1].T)
                    testFeat = RealFeatures(testData.T)

                    # Create and train the classifier.
                    self.model = self.BuildModel(trainFeat, labels, options)

                    # Run Naive Bayes Classifier on the test dataset.
                    self.predictions = self.model.apply_multiclass(
                        testFeat).get_labels()

            except Exception as e:
                # Best-effort benchmark protocol: any failure reports [-1].
                return [-1]

            time = totalTimer.ElapsedTime()
            if len(self.dataset) > 1:
                return [time, self.predictions]

            return [time]
Beispiel #6
0
    def __init__(self, method_param, run_param):
        """Configure a shogun random-forest benchmark run.

        method_param: settings dict; must contain "datasets", may contain
            "num-trees", "dimensions" and "solver".
        run_param: runner-level settings (unused here; kept for a uniform
            constructor interface across benchmark wrappers).
        """
        self.info = "SHOGUN_RANDOMFOREST (" + str(method_param) + ")"

        # Assemble run model parameter.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        # shogun RealFeatures is column-major: one column per sample.
        self.train_feat = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        self.train_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(self.train_labels)

        # A second dataset entry, when present, is the test set.
        if len(self.data) >= 2:
            self.test_feat = RealFeatures(self.data[1].T)

        self.num_trees = 50
        if "num-trees" in method_param:
            self.num_trees = int(method_param["num-trees"])

        # "dimensions" controls the attribute-subset size per split
        # (presumably; confirm against the model construction code).
        self.form = 1
        if "dimensions" in method_param:
            self.form = int(method_param["dimensions"])

        self.solver = "auto"
        if "solver" in method_param:
            self.solver = str(method_param["solver"])
def classifier_multiclassocas (num_vec=10,num_class=3,distance=15,width=2.1,C=1,epsilon=1e-5,seed=1):
    """Train MulticlassOCAS on synthetic, linearly separable data.

    Builds num_vec samples in num_class classes where each class mean is
    shifted by *distance*, trains MulticlassOCAS with regularizer C, and
    returns (test predictions, trained classifier).
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import Math_init_random
    try:
        from shogun import MulticlassOCAS
    except ImportError:
        print("MulticlassOCAS not available")
        return

    # reproducible results
    random.seed(seed)
    Math_init_random(seed)

    # each sample's class is its index modulo num_class; shifting the
    # matching feature coordinate makes every class pair linearly separable
    label_train = array([x % num_class for x in range(num_vec)], dtype="float64")
    label_test = array([x % num_class for x in range(num_vec)], dtype="float64")
    fm_train = array(random.randn(num_class, num_vec))
    fm_test = array(random.randn(num_class, num_vec))
    for col in range(num_vec):
        fm_train[int(label_train[col]), col] += distance
        fm_test[int(label_test[col]), col] += distance

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)
    labels = MulticlassLabels(label_train)

    classifier = MulticlassOCAS(C, feats_train, labels)
    classifier.train()

    out = classifier.apply(feats_test).get_labels()
    return out, classifier
Beispiel #8
0
    def __init__(self, method_param, run_param):
        """Configure a shogun LDA benchmark run.

        method_param: settings dict; must contain "datasets", may contain
            "tolerance", "store-covar" and "solver".
        run_param: runner-level settings (unused here; kept for a uniform
            constructor interface across benchmark wrappers).
        """
        self.info = "SHOGUN_LDA (" + str(method_param) + ")"

        # Assemble run model parameter.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        # shogun RealFeatures is column-major: one column per sample.
        self.train_feat = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        self.train_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(self.train_labels)

        # A second dataset entry, when present, is the test set.
        if len(self.data) >= 2:
            self.test_feat = RealFeatures(self.data[1].T)

        self.tolerance = 1e-4
        if "tolerance" in method_param:
            self.tolerance = float(method_param["tolerance"])

        # Whether the model should keep the covariance matrix after training.
        self.store_cov = False
        if "store-covar" in method_param:
            self.store_cov = bool(method_param["store-covar"])

        self.solver = "auto"
        if "solver" in method_param:
            self.solver = str(method_param["solver"])
Beispiel #9
0
    def __init__(self, method_param, run_param):
        """Configure a shogun KNN benchmark run.

        method_param: settings dict; must contain "datasets", may contain
            "k", "distance", "solver" and "degree".
        run_param: runner-level settings (unused here; kept for a uniform
            constructor interface across benchmark wrappers).
        """
        self.info = "SHOGUN_KNN (" + str(method_param) + ")"

        # Load the configured datasets and split the first into features/labels.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        # shogun RealFeatures is column-major: one column per sample.
        self.train_feat = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        encoded_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(encoded_labels)

        # A second dataset entry, when present, is the test set.
        if len(self.data) >= 2:
            self.test_feat = RealFeatures(self.data[1].T)

        # Optional parameters with their defaults, each cast to the expected type.
        self.k = int(method_param["k"]) if "k" in method_param else 3
        self.distance = (str(method_param["distance"])
                         if "distance" in method_param else "Euclidean")
        self.solver = (str(method_param["solver"])
                       if "solver" in method_param else "Brute")
        self.degree = (float(method_param["degree"])
                       if "degree" in method_param else 3)
def evaluation_clustering(features=fea, ground_truth=gnd_raw, ncenters=10):
    """Cluster *features* and score the result against *ground_truth*.

    Runs the module-level run_clustering/assign_labels helpers, then computes
    clustering accuracy (after best label mapping) and mutual information.
    Returns (ground-truth labels, predicted labels, accuracy).
    """
    from shogun import ClusteringAccuracy, ClusteringMutualInformation
    from shogun import MulticlassLabels
    from shogun import Math

    # fixed seed for reproducible clustering
    Math.init_random(1)

    centroids = run_clustering(features, ncenters)
    gnd_hat = assign_labels(features, centroids, ncenters)
    gnd = MulticlassLabels(ground_truth)

    accuracy_eval = ClusteringAccuracy()
    accuracy_eval.best_map(gnd_hat, gnd)
    accuracy = accuracy_eval.evaluate(gnd_hat, gnd)

    mi_eval = ClusteringMutualInformation()
    mutual_info = mi_eval.evaluate(gnd_hat, gnd)

    # TODO mutual information does not work with serialization
    #return gnd, gnd_hat, accuracy, MIEval, mutual_info
    return gnd, gnd_hat, accuracy
Beispiel #11
0
def mkl_multiclass_1(fm_train_real, fm_test_real, label_train_multiclass, C,
                     num_threads=1, mkl_epsilon=0.001):
    """Train an MKL multiclass classifier over a bank of Gaussian kernels.

    fm_train_real / fm_test_real: train/test feature matrices (column-major).
    label_train_multiclass: training labels.
    C: MKL regularization constant.
    num_threads, mkl_epsilon: new keyword parameters with conservative
        defaults -- the original body read these names as module globals
        that are not defined anywhere visible, raising NameError.
    Returns the predicted test labels.
    """
    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # 21 Gaussian subkernels with widths 2**-9 .. 2**11, each paired with
    # its own copy of the feature objects.
    for i in range(-10, 11):
        feats_train.append_feature_obj(RealFeatures(fm_train_real))
        feats_test.append_feature_obj(RealFeatures(fm_test_real))
        kernel.append_kernel(GaussianKernel(pow(2, i + 1)))

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)
    mkl.set_epsilon(1e-2)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(1)

    mkl.train()

    # re-init the kernel on (train, test) so apply() predicts the test set
    kernel.init(feats_train, feats_test)

    out = mkl.apply().get_labels()
    return out
Beispiel #12
0
def metric_lmnn(train_fname=traindat,
                test_fname=testdat,
                label_train_fname=label_traindat,
                k=3):
    """Learn an LMNN metric and classify the test set with KNN under it.

    Returns (trained LMNN object, predicted test labels), or None when
    shogun is not importable.
    """
    try:
        from shogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return

    # wrap features and labels into Shogun objects
    train_features = RealFeatures(CSVFile(train_fname))
    test_features = RealFeatures(CSVFile(test_fname))
    train_labels = MulticlassLabels(CSVFile(label_train_fname))

    # learn the metric
    lmnn = LMNN(train_features, train_labels, k)
    lmnn.train()

    # classify with KNN using the learned distance
    knn = KNN(k, lmnn.get_distance(), train_labels)
    knn.train()
    predictions = knn.apply(test_features).get_labels()

    return lmnn, predictions
Beispiel #13
0
    def RunRandomForestShogun():
      """Train/apply a shogun random forest; return [time, predictions] or [-1].

      Relies on closure variables: self, options, Timer, Log, SplitTrainData,
      LoadDataset, RealFeatures, MulticlassLabels.
      """
      totalTimer = Timer()

      Log.Info("Loading dataset", self.verbose)
      trainData, labels = SplitTrainData(self.dataset)
      trainData = RealFeatures(trainData.T)
      labels = MulticlassLabels(labels)
      testData = RealFeatures(LoadDataset(self.dataset[1]).T)

      # num_trees is mandatory; options are pop()ed so anything left over
      # can be reported as unknown below.
      if "num_trees" in options:
        self.numTrees = int(options.pop("num_trees"))
      else:
        Log.Fatal("Required parameter 'num_trees' not specified!")
        raise Exception("missing parameter")

      self.form = 1
      if "dimensions" in options:
        self.form = int(options.pop("dimensions"))

      if len(options) > 0:
        Log.Fatal("Unknown parameters: " + str(options))
        raise Exception("unknown parameters")

      try:
        with totalTimer:
          self.model = self.BuildModel(trainData, labels, options)
          # Run the Random Forest Classifier on the test dataset.
          self.predictions = self.model.apply_multiclass(testData).get_labels()
      except Exception as e:
        # best-effort benchmark protocol: failure is reported as [-1]
        return [-1]

      time = totalTimer.ElapsedTime()
      return [time, self.predictions]
Beispiel #14
0
        def RunDTCShogun():
            """Train/apply a shogun CART tree; return [time, predictions] or [-1].

            Relies on closure variables: self, options, Timer, Log,
            SplitTrainData, LoadDataset, RealFeatures, MulticlassLabels.
            """
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            trainData, labels = SplitTrainData(self.dataset)
            trainData = RealFeatures(trainData.T)
            labels = MulticlassLabels(labels)
            testData = RealFeatures(LoadDataset(self.dataset[1]).T)

            # This runner takes no options; anything present is an error.
            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            try:
                with totalTimer:
                    self.model = self.BuildModel(trainData, labels, options)
                    # Run the CARTree Classifier on the test dataset.
                    self.predictions = self.model.apply_multiclass(
                        testData).get_labels()
            except Exception as e:
                # best-effort benchmark protocol: failure is reported as [-1]
                return [-1]

            time = totalTimer.ElapsedTime()
            if len(self.dataset) > 1:
                return [time, self.predictions]

            return [time]
Beispiel #15
0
def multiclass_c45classifiertree(train=traindat,
                                 test=testdat,
                                 labels=label_traindat,
                                 ft=feattypes):
    """Train, prune and apply a C4.5 classifier tree.

    train/test/labels: CSV file names; ft: per-feature type vector.
    Returns (tree, test predictions, prediction certainties), or None when
    the required modules cannot be imported.
    """
    try:
        from shogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the
    # ratio 2/3 to 1/3.  (The original sliced subset[1:n] and subset[1+n:],
    # which silently dropped samples 0 and n from both subsets.)
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    split_point = int(subset.size / 3)
    vsubset = subset[:split_point]
    trsubset = subset[split_point:]

    # C4.5 Tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)

    c.prune_tree(feats_train, train_labels)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
Beispiel #16
0
 def BuildModel(self, data, responses):
     """Create and train a multiclass logistic regression model.

     data: sample matrix, one row per sample -- transposed because shogun's
         RealFeatures is column-major (presumably a numpy 2-D array; confirm).
     responses: per-sample labels, wrapped in MulticlassLabels.
     self.z is the regularization parameter; self.max_iter, when not None,
     caps the optimizer's iterations.  Returns the trained model.
     """
     # Create and train the classifier.
     model = MulticlassLogisticRegression(self.z, RealFeatures(data.T),
                                          MulticlassLabels(responses))
     if self.max_iter is not None:
         model.set_max_iter(self.max_iter)
     model.train()
     return model
Beispiel #17
0
        def RunLDAShogun():
            """Train shogun's MCLDA and time it; return elapsed seconds or -1.

            Relies on closure variables: self, options, Timer, Log,
            LoadDataset, SplitTrainData, RealFeatures, MulticlassLabels, MCLDA.
            """
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the test file.
            try:
                if len(self.dataset) > 1:
                    testSet = LoadDataset(self.dataset[1])

                # Use the last row of the training set as the responses.
                trainSet, trainLabels = SplitTrainData(self.dataset)
                # shogun's MCLDA class requires the labels to be in
                # {0,1,2,...,num_classes-1}: build the mapping and its
                # inverse, then rewrite the labels in place.
                distinctLabels = list(set(trainLabels))
                mapping = {label: idx
                           for idx, label in enumerate(distinctLabels)}
                reverseMapping = {idx: label
                                  for label, idx in mapping.items()}
                for i in range(len(trainLabels)):
                    trainLabels[i] = mapping[trainLabels[i]]

                trainFeat = RealFeatures(trainSet.T)
                trainLabels = MulticlassLabels(trainLabels)
                # Gather optional parameters.
                if "tolerance" in options:
                    self.tolerance = float(options.pop("tolerance"))

                if "store" in options:
                    self.store = bool(options.pop("store"))

                if (len(options) > 0):
                    Log.Fatal("Unknown parameters: " + str(options))
                    raise Exception("unknown parameters")

                with totalTimer:
                    self.model = MCLDA(trainFeat, trainLabels, self.tolerance,
                                       self.store)
                    self.model.train()

                # Predict only when a separate test set exists.  (The original
                # guarded on len(self.dataset) > 0, which referenced the
                # unbound testSet for single-file datasets.)
                if (len(self.dataset) > 1):
                    self.predictions = self.model.apply_multiclass(
                        RealFeatures(testSet.T))
                    self.predictions = self.predictions.get_labels()
                    # reverse map the predicted labels to actual labels
                    for i in range(len(self.predictions)):
                        self.predictions[i] = reverseMapping[
                            self.predictions[i]]

            except Exception as e:
                Log.Info("Exception: " + str(e))
                return -1

            time = totalTimer.ElapsedTime()
            return time
Beispiel #18
0
    def _svm_new(self, kernel_width, c, epsilon):
        """Create a fresh GMNPSVM on the loaded training data (self.x, self.y).

        kernel_width: Gaussian kernel width; c: regularization constant;
        epsilon: optimizer convergence tolerance.  Stores the model in
        self.svm; raises if no training data has been loaded.
        """
        # identity comparison per PEP 8; `== None` invokes __eq__, which
        # numpy arrays overload element-wise
        if self.x is None or self.y is None:
            raise Exception("No training data loaded.")

        x = RealFeatures(self.x)
        y = MulticlassLabels(self.y)

        self.svm = GMNPSVM(c, GaussianKernel(x, x, kernel_width), y)
        self.svm.set_epsilon(epsilon)
Beispiel #19
0
        def RunAllKnnShogun():
            """Run shogun all-k-nearest-neighbors; return elapsed seconds or -1.

            Relies on closure variables: self, options, Timer, Log, np,
            RealFeatures, MulticlassLabels, EuclideanDistance, SKNN.
            """
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the query
            # file.
            try:
                Log.Info("Loading dataset", self.verbose)
                if len(self.dataset) == 2:
                    referenceData = np.genfromtxt(self.dataset[0],
                                                  delimiter=',')
                    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
                    # BUGFIX: the original built RealFeatures from the unbound
                    # name queryFeat instead of queryData.
                    queryFeat = RealFeatures(queryData.T)
                else:
                    referenceData = np.genfromtxt(self.dataset, delimiter=',')

                # Labels are the last column of the reference data.
                labels = MulticlassLabels(
                    referenceData[:, (referenceData.shape[1] - 1)])
                referenceData = referenceData[:, :-1]

                with totalTimer:
                    # Get all the parameters.
                    if "k" in options:
                        k = int(options.pop("k"))
                        if (k < 1 or k > referenceData.shape[0]):
                            # BUGFIX: k is an int here; the original called
                            # k.group(1) as if it were a regex match.
                            Log.Fatal("Invalid k: " + str(k) +
                                      "; must be greater than 0" +
                                      " and less or equal than " +
                                      str(referenceData.shape[0]))
                            return -1
                    else:
                        Log.Fatal(
                            "Required option: Number of furthest neighbors to find."
                        )
                        return -1

                    if len(options) > 0:
                        Log.Fatal("Unknown parameters: " + str(options))
                        raise Exception("unknown parameters")

                    referenceFeat = RealFeatures(referenceData.T)
                    distance = EuclideanDistance(referenceFeat, referenceFeat)

                    # Perform All K-Nearest-Neighbors.
                    model = SKNN(k, distance, labels)
                    model.train()

                    if len(self.dataset) == 2:
                        out = model.apply(queryFeat).get_labels()
                    else:
                        out = model.apply(referenceFeat).get_labels()
            except Exception as e:
                return -1

            return totalTimer.ElapsedTime()
def assign_labels(data, centroids, ncenters):
    """Label every point in *data* with the index of its nearest centroid.

    Implemented as a 1-NN classifier whose "training set" is the centroids,
    labelled 0..ncenters-1.  Returns the shogun label object produced by
    KNN.apply().
    """
    from shogun import EuclideanDistance
    from shogun import RealFeatures, MulticlassLabels
    from shogun import KNN
    from numpy import arange

    centroid_labels = MulticlassLabels(arange(0., ncenters))
    points = RealFeatures(data)
    centers = RealFeatures(centroids)

    nn = KNN(1, EuclideanDistance(centers, centers), centroid_labels)
    nn.train()
    return nn.apply(points)
Beispiel #21
0
def metric_lmnn_statistics(
        k=3,
        fname_features='../../data/fm_train_multiclass_digits.dat.gz',
        fname_labels='../../data/label_train_multiclass_digits.dat'):
    """Train LMNN on the multiclass digits data set and plot the objective.

    k: number of target neighbours used by LMNN.
    fname_features: gzipped feature matrix, read via load_compressed_features
        (defined elsewhere in this module).
    fname_labels: CSV file with one label per example.
    Shows a matplotlib figure of the objective per iteration; returns None.
    """
    try:
        from shogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        # print() function: the Python 2 print statement used originally is
        # a SyntaxError under Python 3.
        print('Error importing shogun or other required modules. Please, verify their installation.')
        return

    # shogun RealFeatures expects one column per example, hence the transpose
    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    assert (features.get_num_vectors() == labels.get_num_labels())

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    print('Training LMNN, this will take about two minutes...')
    lmnn.train()
    print('Training done!')

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel('Iterations')
    pyplot.ylabel('LMNN objective')
    pyplot.title(
        'LMNN objective during training for the multiclass digits data set')

    pyplot.show()
Beispiel #22
0
def classifier_multiclassliblinear(fm_train_real, fm_test_real,
                                   label_train_multiclass, C, num_threads=1):
    """Train MulticlassLibLinear and return the predicted test labels.

    fm_train_real / fm_test_real: train/test feature matrices (column-major).
    label_train_multiclass: training labels.
    C: regularization constant.
    num_threads: new keyword parameter (default 1) -- the original body read
        an undefined module-level global `num_threads`, raising NameError.
    """
    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    labels = MulticlassLabels(label_train_multiclass)

    classifier = MulticlassLibLinear(C, feats_train, labels)
    classifier.parallel.set_num_threads(num_threads)
    classifier.train()

    label_pred = classifier.apply(feats_test)
    out = label_pred.get_labels()
    return out
def classifier_multiclassliblinear (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,width=2.1,C=1,epsilon=1e-5):
    """Train MulticlassLibLinear; optionally report accuracy on test labels.

    Returns the predicted test labels.  width and epsilon are accepted for
    interface compatibility with sibling examples but are not used here.
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import MulticlassLibLinear

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    machine = MulticlassLibLinear(C, train_feats, train_labels)
    machine.train()

    predicted = machine.apply(test_feats)
    out = predicted.get_labels()

    # When ground truth for the test set is supplied, print the accuracy.
    if label_test_multiclass is not None:
        from shogun import MulticlassAccuracy
        test_labels = MulticlassLabels(label_test_multiclass)
        acc = MulticlassAccuracy().evaluate(predicted, test_labels)
        print('Accuracy = %.4f' % acc)

    return out
Beispiel #24
0
def mkl_multiclass(fm_train_real, fm_test_real, label_train_multiclass, width,
                   C, epsilon, num_threads, mkl_epsilon, mkl_norm):
    """Train an MKL multiclass classifier over Gaussian/linear/poly kernels.

    Each subkernel gets its own copy of the train and test feature objects
    inside CombinedFeatures; MKL learns the kernel weights.  Returns the
    predicted test labels.
    """
    from shogun import CombinedFeatures, RealFeatures, MulticlassLabels
    from shogun import CombinedKernel, GaussianKernel, LinearKernel, PolyKernel
    from shogun import MKLMulticlass

    kernel = CombinedKernel()
    feats_train = CombinedFeatures()
    feats_test = CombinedFeatures()

    # one (train features, test features, subkernel) triple per kernel type
    for subkernel in (GaussianKernel(10, width),
                      LinearKernel(),
                      PolyKernel(10, 2)):
        feats_train.append_feature_obj(RealFeatures(fm_train_real))
        feats_test.append_feature_obj(RealFeatures(fm_test_real))
        kernel.append_kernel(subkernel)

    kernel.init(feats_train, feats_train)

    labels = MulticlassLabels(label_train_multiclass)

    mkl = MKLMulticlass(C, kernel, labels)
    mkl.set_epsilon(epsilon)
    mkl.parallel.set_num_threads(num_threads)
    mkl.set_mkl_epsilon(mkl_epsilon)
    mkl.set_mkl_norm(mkl_norm)
    mkl.train()

    # re-init the kernel on (train, test) so apply() predicts the test set
    kernel.init(feats_train, feats_test)

    return mkl.apply().get_labels()
Beispiel #25
0
def evaluation_clustering_simple(n_data=100, sqrt_num_blobs=4, distance=5):
    """Cluster synthetic Gaussian blobs and evaluate accuracy / mutual info.

    n_data: number of streamed samples.
    sqrt_num_blobs: blob grid side length (ncenters = sqrt_num_blobs**2).
    distance: spacing between blob means on the grid.
    Returns (ground-truth MulticlassLabels, accuracy, mutual information).
    Relies on the module-level helpers run_clustering/assign_labels and on
    numpy's array/abs being in scope (presumably via a star import at the
    top of the module -- confirm).
    """
    from shogun import ClusteringAccuracy, ClusteringMutualInformation
    from shogun import MulticlassLabels, GaussianBlobsDataGenerator
    from shogun import Math

    # reproducable results
    Math.init_random(1)

    # produce sone Gaussian blobs to cluster
    ncenters = sqrt_num_blobs**2
    stretch = 1
    angle = 1
    gen = GaussianBlobsDataGenerator(sqrt_num_blobs, distance, stretch, angle)
    features = gen.get_streamed_features(n_data)
    X = features.get_feature_matrix()

    # compute approximate "ground truth" labels via taking the closest blob mean
    coords = array(range(0, sqrt_num_blobs * distance, distance))
    idx_0 = [abs(coords - x).argmin() for x in X[0]]
    idx_1 = [abs(coords - x).argmin() for x in X[1]]
    ground_truth = array(
        [idx_0[i] * sqrt_num_blobs + idx_1[i] for i in range(n_data)],
        dtype="float64")

    #for label in unique(ground_truth):
    #	indices=ground_truth==label
    #	plot(X[0][indices], X[1][indices], 'o')
    #show()

    centroids = run_clustering(features, ncenters)
    gnd_hat = assign_labels(features, centroids, ncenters)
    gnd = MulticlassLabels(ground_truth)

    # best_map permutes cluster ids to best match the ground truth before scoring
    AccuracyEval = ClusteringAccuracy()
    AccuracyEval.best_map(gnd_hat, gnd)

    accuracy = AccuracyEval.evaluate(gnd_hat, gnd)
    # in this case we know that the clustering has to be very good
    #print(('Clustering accuracy = %.4f' % accuracy))
    assert (accuracy > 0.8)

    MIEval = ClusteringMutualInformation()
    mutual_info = MIEval.evaluate(gnd_hat, gnd)
    #print(('Clustering mutual information = %.4f' % mutual_info))

    return gnd, accuracy, mutual_info
Beispiel #26
0
    def __init__(self, method_param, run_param):
        """Prepare data for a shogun Gaussian process classifier benchmark.

        method_param: settings dict; must contain "datasets".  Stored whole
            in self.method_param for later model construction.
        run_param: runner-level settings (unused here; kept for a uniform
            constructor interface across benchmark wrappers).
        """
        self.info = "SHOGUN_GPC (" + str(method_param) + ")"

        #Assemble run model parameter.
        self.data = load_dataset(method_param["datasets"], ["csv"])
        self.data_split = split_dataset(self.data[0])

        # shogun RealFeatures is column-major: one column per sample.
        self.train_features = RealFeatures(self.data_split[0].T)

        # Encode the labels into {0,1,2,3,......,num_classes-1}
        self.train_labels, self.label_map = label_encoder(self.data_split[1])
        self.train_labels = MulticlassLabels(self.train_labels)

        # A second dataset entry, when present, is the test set.
        if len(self.data) >= 2:
            self.test_features = RealFeatures(self.data[1].T)

        self.method_param = method_param
Beispiel #27
0
def classifier_larank(num_vec,
                      num_class,
                      distance,
                      C=0.9,
                      num_threads=1,
                      num_iter=5,
                      seed=1):
    """Train LaRank on synthetic, linearly separable multiclass data.

    Returns (predictions on the training features, trained svm,
    predicted label array).  num_threads and num_iter are accepted for
    interface compatibility but not used here.
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GaussianKernel
    from shogun import LaRank
    from shogun import Math_init_random

    # reproducible results
    Math_init_random(seed)
    random.seed(seed)

    # each sample's class is its index modulo num_class; shifting the
    # matching feature coordinate makes every class pair linearly separable
    label_train = array([x % num_class for x in range(num_vec)],
                        dtype="float64")
    label_test = array([x % num_class for x in range(num_vec)],
                       dtype="float64")
    fm_train = array(random.randn(num_class, num_vec))
    fm_test = array(random.randn(num_class, num_vec))
    for col in range(num_vec):
        fm_train[int(label_train[col]), col] += distance
        fm_test[int(label_test[col]), col] += distance

    feats_train = RealFeatures(fm_train)
    feats_test = RealFeatures(fm_test)

    width = 2.1
    kernel = GaussianKernel(feats_train, feats_train, width)

    epsilon = 1e-5
    labels = MulticlassLabels(label_train)

    svm = LaRank(C, kernel, labels)
    svm.set_batch_mode(False)
    svm.set_epsilon(epsilon)
    svm.train()
    out = svm.apply(feats_test).get_labels()
    predictions = svm.apply()
    return predictions, svm, predictions.get_labels()
Beispiel #28
0
def classifier_gmnpsvm(fm_train_real, fm_test_real, label_train_multiclass, C,
                       width=2.1, epsilon=1e-5, num_threads=1):
    """Train a GMNP multiclass SVM with a Gaussian kernel.

    fm_train_real / fm_test_real: train/test feature matrices (column-major).
    label_train_multiclass: training labels.
    C: regularization constant.
    width, epsilon, num_threads: new keyword parameters with the defaults
        used by sibling examples in this module -- the original body read
        these names as undefined module-level globals (NameError).  The
        original's leftover timing code (forcing the full kernel matrix)
        has been removed.
    Returns the predicted test labels.
    """
    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)
    kernel = GaussianKernel(feats_train, feats_train, width)

    labels = MulticlassLabels(label_train_multiclass)

    svm = GMNPSVM(C, kernel, labels)
    svm.set_epsilon(epsilon)
    svm.parallel.set_num_threads(num_threads)
    svm.train(feats_train)

    out = svm.apply(feats_test).get_labels()
    return out
def main(actual, predicted):
    """Evaluate multiclass SVM predictions against ground truth.

    actual: path to an SVMLight file with the true labels/features.
    predicted: path to a text file with one predicted label (float) per line.
    Logs the accuracy and prints the confusion matrix.
    """
    LOGGER.info("SVM Multiclass evaluator")

    # Load SVMLight dataset
    feats, labels = get_features_and_labels(LibSVMFile(actual))

    # Load predicted labels (one float per line)
    with open(predicted, 'r') as f:
        predicted_labels_arr = np.array([float(l) for l in f])
        predicted_labels = MulticlassLabels(predicted_labels_arr)

    # Evaluate accuracy
    multiclass_measures = MulticlassAccuracy()
    LOGGER.info("Accuracy = %s" %
                multiclass_measures.evaluate(labels, predicted_labels))
    LOGGER.info("Confusion matrix:")
    res = multiclass_measures.get_confusion_matrix(labels, predicted_labels)
    # print() function: the Python 2 statement `print res` used originally
    # is a SyntaxError under Python 3.
    print(res)
Beispiel #30
0
        def RunSVMShogun():
            """Build an SVM via self.BuildModel and time train+apply.

            Returns elapsed seconds, or -1 on any failure.  Relies on closure
            variables: self, options, Timer, Log, SplitTrainData, LoadDataset,
            RealFeatures, MulticlassLabels.
            """
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            trainData, labels = SplitTrainData(self.dataset)
            trainData = RealFeatures(trainData.T)
            labels = MulticlassLabels(labels)
            testData = RealFeatures(LoadDataset(self.dataset[1]).T)

            try:
                with totalTimer:
                    self.model = self.BuildModel(trainData, labels, options)
                    # Run Support vector machines on the test dataset.
                    self.model.apply(testData).get_labels()
            except Exception as e:
                # best-effort benchmark protocol: failure is reported as -1
                return -1

            return totalTimer.ElapsedTime()
Beispiel #31
0
def classifier_multiclassmachine (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,width=2.1,C=1,epsilon=1e-5):
    """Train a one-vs-rest kernel multiclass machine with a LibSVM base learner.

    A Gaussian kernel of the given width is shared across the per-class
    binary machines.  C is accepted for interface compatibility but not
    used here.  Returns the predicted test labels.
    """
    from shogun import RealFeatures, MulticlassLabels
    from shogun import GaussianKernel
    from shogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    kernel = GaussianKernel(train_feats, train_feats, width)

    labels = MulticlassLabels(label_train_multiclass)

    base_svm = LibSVM()
    base_svm.set_epsilon(epsilon)
    machine = KernelMulticlassMachine(MulticlassOneVsRestStrategy(), kernel,
                                      base_svm, labels)
    machine.train()

    # re-init the kernel on (train, test) so apply() predicts the test set
    kernel.init(train_feats, test_feats)
    return machine.apply().get_labels()
def evaluation_cross_validation_multiclass_storage (traindat=traindat, label_traindat=label_traindat):
    """Cross-validate an MKL multiclass SVM and inspect stored per-fold results.

    Runs 3 repetitions of stratified 3-fold cross-validation with a
    ParameterObserverCV attached, then pulls the first fold of the first
    observation and prints the ROC and F1 measure for class 0 treated as
    a binary problem. (Python 2 example: uses print statements.)
    """
    from shogun import CrossValidation, CrossValidationResult
    from shogun import ParameterObserverCV
    from shogun import MulticlassAccuracy, F1Measure
    from shogun import StratifiedCrossValidationSplitting
    from shogun import MulticlassLabels
    from shogun import RealFeatures, CombinedFeatures
    from shogun import GaussianKernel, CombinedKernel
    from shogun import MKLMulticlass
    from shogun import Statistics, MSG_DEBUG, Math
    from shogun import ROCEvaluation

    # fixed seed so fold assignment and results are reproducible
    Math.init_random(1)

    # training data, combined features all on same data (one sub-feature
    # object per sub-kernel appended below)
    features=RealFeatures(traindat)
    comb_features=CombinedFeatures()
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    comb_features.append_feature_obj(features)
    labels=MulticlassLabels(label_traindat)

    # kernel, different Gaussians combined; MKL learns the kernel weights
    kernel=CombinedKernel()
    kernel.append_kernel(GaussianKernel(10, 0.1))
    kernel.append_kernel(GaussianKernel(10, 1))
    kernel.append_kernel(GaussianKernel(10, 2))

    # create mkl using libsvm, due to a mem-bug, interleaved is not possible
    svm=MKLMulticlass(1.0,kernel,labels);
    svm.set_kernel(kernel);

    # stratified splitting into 3 folds keeps the class proportions of the
    # label vector in each fold (preferable to plain splitting for
    # classification problems)
    splitting_strategy=StratifiedCrossValidationSplitting(labels, 3)

    # evaluation method
    evaluation_criterium=MulticlassAccuracy()

    # cross-validation instance
    cross_validation=CrossValidation(svm, comb_features, labels,
        splitting_strategy, evaluation_criterium)
    cross_validation.set_autolock(False)

    # append cross validation parameter observer; must be subscribed
    # before evaluate() so fold results get recorded
    multiclass_storage=ParameterObserverCV()
    cross_validation.subscribe_to_parameters(multiclass_storage)
    cross_validation.set_num_runs(3)

    # perform cross-validation
    result=cross_validation.evaluate()

    # get first observation and first fold
    obs = multiclass_storage.get_observations()[0]
    fold = obs.get_folds_results()[0]

    # get fold ROC for first class: reduce the multiclass predictions and
    # ground truth to binary labels for class 0, then evaluate ROC
    eval_ROC = ROCEvaluation()
    pred_lab_binary = MulticlassLabels.obtain_from_generic(fold.get_test_result()).get_binary_for_class(0)
    true_lab_binary = MulticlassLabels.obtain_from_generic(fold.get_test_true_result()).get_binary_for_class(0)
    eval_ROC.evaluate(pred_lab_binary, true_lab_binary)
    print eval_ROC.get_ROC()

    # get fold evaluation result (F1 on the same binarized labels)
    acc_measure = F1Measure()
    print acc_measure.evaluate(pred_lab_binary, true_lab_binary)
Beispiel #33
0
def features_io (fm_train_real, label_train_twoclass):
	"""Round-trip shogun features and labels through several file formats.

	Saves sparse features (binary, LibSVM), dense features (binary, HDF5,
	CSV) and multiclass labels (CSV, binary, HDF5) to temporary files and
	loads each back, demonstrating the serialization API.

	Returns the tuple (feats, feats2, lab, lab2) of original and reloaded
	dense features and labels.

	NOTE(review): the label_train_twoclass parameter is never used — the
	labels written out are the hard-coded array [0,1,2,3] below; confirm
	whether callers rely on this signature before changing it.
	"""
	import numpy
	from shogun import SparseRealFeatures, RealFeatures, MulticlassLabels
	from shogun import GaussianKernel
	from shogun import LibSVMFile, CSVFile, BinaryFile, HDF5File
	from tempfile import NamedTemporaryFile

	feats=SparseRealFeatures(fm_train_real)
	feats2=SparseRealFeatures()

	# sparse features: save to binary format...
	tmp_fm_train_sparsereal_bin = NamedTemporaryFile(suffix='sparsereal.bin')
	f=BinaryFile(tmp_fm_train_sparsereal_bin.name, "w")
	feats.save(f)

	# ...and to LibSVM ascii format
	tmp_fm_train_sparsereal_ascii = NamedTemporaryFile(suffix='sparsereal.ascii')
	f=LibSVMFile(tmp_fm_train_sparsereal_ascii.name, "w")
	feats.save(f)

	# load both back into the empty sparse feature object
	f=BinaryFile(tmp_fm_train_sparsereal_bin.name)
	feats2.load(f)

	f=LibSVMFile(tmp_fm_train_sparsereal_ascii.name)
	feats2.load(f)

	# dense features: rebind feats/feats2 for the dense round-trip
	feats=RealFeatures(fm_train_real)
	feats2=RealFeatures()

	tmp_fm_train_real_bin = NamedTemporaryFile(suffix='real.bin')
	f=BinaryFile(tmp_fm_train_real_bin.name, "w")
	feats.save(f)

	tmp_fm_train_real_h5 = NamedTemporaryFile(suffix='real.h5')
	f=HDF5File(tmp_fm_train_real_h5.name, "w", "/data/doubles")
	feats.save(f)

	tmp_fm_train_real_ascii = NamedTemporaryFile(suffix='real.ascii')
	f=CSVFile(tmp_fm_train_real_ascii.name, "w")
	feats.save(f)

	f=BinaryFile(tmp_fm_train_real_bin.name)
	feats2.load(f)
	#print("diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))

	f=CSVFile(tmp_fm_train_real_ascii.name)
	feats2.load(f)
	#print("diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))

	# labels: save the fixed 4-class label vector in three formats
	lab=MulticlassLabels(numpy.array([0.0,1.0,2.0,3.0]))
	lab2=MulticlassLabels()
	tmp_label_train_twoclass_ascii = NamedTemporaryFile(suffix='twoclass.ascii')
	f=CSVFile(tmp_label_train_twoclass_ascii.name, "w")
	lab.save(f)

	tmp_label_train_twoclass_bin = NamedTemporaryFile(suffix='twoclass.bin')
	f=BinaryFile(tmp_label_train_twoclass_bin.name, "w")
	lab.save(f)

	tmp_label_train_real_h5 = NamedTemporaryFile(suffix='real.h5')
	f=HDF5File(tmp_label_train_real_h5.name, "w", "/data/labels")
	lab.save(f)

	# load labels back from CSV and binary
	f=CSVFile(tmp_label_train_twoclass_ascii.name)
	lab2.load(f)

	f=BinaryFile(tmp_label_train_twoclass_bin.name)
	lab2.load(f)

	# load dense features and labels back from their HDF5 datasets
	f=HDF5File(tmp_fm_train_real_h5.name, "r", "/data/doubles")
	feats2.load(f)
	#print(feats2.get_feature_matrix())
	f=HDF5File(tmp_label_train_real_h5.name, "r", "/data/labels")
	lab2.load(f)
	#print(lab2.get_labels())

	return feats, feats2, lab, lab2