Example #1
0
def classifier_multiclass_shareboost (fm_train_real=traindat,fm_test_real=testdat,label_train_multiclass=label_traindat,label_test_multiclass=label_testdat,lawidth=2.1,C=1,epsilon=1e-5):
    """Train a ShareBoost multiclass classifier and return its test predictions."""
    from modshogun import RealFeatures, RealSubsetFeatures, MulticlassLabels
    from modshogun import ShareBoost

    train_feats = RealFeatures(fm_train_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    # ShareBoost selects at most min(n_features - 1, 30) active features.
    n_active = min(fm_train_real.shape[0] - 1, 30)
    booster = ShareBoost(train_feats, train_labels, n_active)
    booster.train()

    # Restrict the test features to the feature subset ShareBoost selected.
    test_feats = RealSubsetFeatures(RealFeatures(fm_test_real),
                                    booster.get_activeset())
    predictions = booster.apply(test_feats)
    out = predictions.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        truth = MulticlassLabels(label_test_multiclass)
        acc = MulticlassAccuracy().evaluate(predictions, truth)

    return out
def classifier_multiclassliblinear_modular(
        fm_train_real=traindat,
        fm_test_real=testdat,
        label_train_multiclass=label_traindat,
        label_test_multiclass=label_testdat,
        width=2.1,
        C=1,
        epsilon=1e-5):
    """Train a MulticlassLibLinear machine and return predicted test labels."""
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import MulticlassLibLinear

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    machine = MulticlassLibLinear(C, train_feats, train_labels)
    machine.train()

    predictions = machine.apply(test_feats)
    out = predictions.get_labels()

    # Report accuracy when ground-truth test labels were supplied.
    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        truth = MulticlassLabels(label_test_multiclass)
        acc = MulticlassAccuracy().evaluate(predictions, truth)
        print('Accuracy = %.4f' % acc)

    return out
def classifier_multiclasslogisticregression_modular(
        fm_train_real=traindat,
        fm_test_real=testdat,
        label_train_multiclass=label_traindat,
        label_test_multiclass=label_testdat,
        z=1,
        epsilon=1e-5):
    """Train multiclass logistic regression and return predicted test labels."""
    from modshogun import RealFeatures, MulticlassLabels
    try:
        from modshogun import MulticlassLogisticRegression
    except ImportError:
        print("recompile shogun with Eigen3 support")
        return

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    # z is the regularization constant passed through to the model.
    machine = MulticlassLogisticRegression(z, train_feats, train_labels)
    machine.train()

    predictions = machine.apply(test_feats)
    out = predictions.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        truth = MulticlassLabels(label_test_multiclass)
        acc = MulticlassAccuracy().evaluate(predictions, truth)
        print('Accuracy = %.4f' % acc)

    return out
def classifier_multiclass_relaxedtree(fm_train_real=traindat,
                                      fm_test_real=testdat,
                                      label_train_multiclass=label_traindat,
                                      label_test_multiclass=label_testdat,
                                      lawidth=2.1,
                                      C=1,
                                      epsilon=1e-5):
    """Train a RelaxedTree multiclass machine and return its test predictions."""
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import RelaxedTree, MulticlassLibLinear
    from modshogun import GaussianKernel

    train_feats = RealFeatures(fm_train_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    tree = RelaxedTree()
    # A LibLinear machine estimates the confusion matrix used to build the tree.
    tree.set_machine_for_confusion_matrix(MulticlassLibLinear())
    tree.set_kernel(GaussianKernel())
    tree.set_labels(train_labels)
    tree.train(train_feats)

    predictions = tree.apply_multiclass(RealFeatures(fm_test_real))
    out = predictions.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        truth = MulticlassLabels(label_test_multiclass)
        acc = MulticlassAccuracy().evaluate(predictions, truth)
        print('Accuracy = %.4f' % acc)

    return out
def classifier_multiclasslinearmachine_modular(
        fm_train_real=traindat,
        fm_test_real=testdat,
        label_train_multiclass=label_traindat,
        label_test_multiclass=label_testdat,
        width=2.1,
        C=1,
        epsilon=1e-5):
    """Train a linear one-vs-one multiclass machine; return test predictions.

    Fix: the original called ``apply()`` with no argument, predicting on the
    *training* features (leaving ``feats_test`` unused) and then scoring those
    predictions against the *test* labels. The machine is now applied to
    ``fm_test_real`` so predictions and ground truth refer to the same data.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine, MulticlassOneVsOneStrategy, MulticlassOneVsRestStrategy

    feats_train = RealFeatures(fm_train_real)
    feats_test = RealFeatures(fm_test_real)

    labels = MulticlassLabels(label_train_multiclass)

    # Base binary learner shared by all one-vs-one sub-problems.
    classifier = LibLinear(L2R_L2LOSS_SVC)
    classifier.set_epsilon(epsilon)
    classifier.set_bias_enabled(True)
    mc_classifier = LinearMulticlassMachine(MulticlassOneVsOneStrategy(),
                                            feats_train, classifier, labels)

    mc_classifier.train()
    # Predict on the held-out test features (was: apply() on training data).
    label_pred = mc_classifier.apply(feats_test)
    out = label_pred.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        labels_test = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc = evaluator.evaluate(label_pred, labels_test)
        print('Accuracy = %.4f' % acc)

    return out
def evaluation_multiclassaccuracy_modular(ground_truth, predicted):
    """Return the multiclass accuracy of `predicted` against `ground_truth`."""
    from modshogun import MulticlassLabels
    from modshogun import MulticlassAccuracy

    truth = MulticlassLabels(ground_truth)
    preds = MulticlassLabels(predicted)

    # MulticlassAccuracy expects (predicted, ground_truth) order.
    return MulticlassAccuracy().evaluate(preds, truth)
def classifier_multiclass_ecoc_random(fm_train_real=traindat,
                                      fm_test_real=testdat,
                                      label_train_multiclass=label_traindat,
                                      label_test_multiclass=label_testdat,
                                      lawidth=2.1,
                                      C=1,
                                      epsilon=1e-5):
    """Compare random dense vs. random sparse ECOC encoders.

    Returns (sparse_predictions, dense_predictions) on the test features.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from modshogun import ECOCStrategy, ECOCRandomSparseEncoder, ECOCRandomDenseEncoder, ECOCHDDecoder
    from modshogun import Math_init_random
    # Fixed seed so the randomly generated code books are reproducible.
    Math_init_random(12345)

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    # Base binary learner shared by both ECOC machines.
    base = LibLinear(L2R_L2LOSS_SVC)
    base.set_epsilon(epsilon)
    base.set_bias_enabled(True)

    dense_strategy = ECOCStrategy(ECOCRandomDenseEncoder(), ECOCHDDecoder())
    sparse_strategy = ECOCStrategy(ECOCRandomSparseEncoder(), ECOCHDDecoder())

    dense_machine = LinearMulticlassMachine(dense_strategy, train_feats,
                                            base, train_labels)
    dense_machine.train()
    label_dense = dense_machine.apply(test_feats)
    out_dense = label_dense.get_labels()

    sparse_machine = LinearMulticlassMachine(sparse_strategy, train_feats,
                                             base, train_labels)
    sparse_machine.train()
    label_sparse = sparse_machine.apply(test_feats)
    out_sparse = label_sparse.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        truth = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc_dense = evaluator.evaluate(label_dense, truth)
        acc_sparse = evaluator.evaluate(label_sparse, truth)
        print('Random Dense Accuracy  = %.4f' % acc_dense)
        print('Random Sparse Accuracy = %.4f' % acc_sparse)

    return out_sparse, out_dense
def classifier_multiclass_ecoc_ovr(fm_train_real=traindat,
                                   fm_test_real=testdat,
                                   label_train_multiclass=label_traindat,
                                   label_test_multiclass=label_testdat,
                                   lawidth=2.1,
                                   C=1,
                                   epsilon=1e-5):
    """Compare plain one-vs-rest with its ECOC formulation.

    Returns (ecoc_predictions, ovr_predictions) on the test features.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from modshogun import ECOCStrategy, ECOCOVREncoder, ECOCLLBDecoder, MulticlassOneVsRestStrategy

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    # Base binary learner shared by both machines.
    base = LibLinear(L2R_L2LOSS_SVC)
    base.set_epsilon(epsilon)
    base.set_bias_enabled(True)

    # Plain one-vs-rest machine.
    ovr_machine = LinearMulticlassMachine(MulticlassOneVsRestStrategy(),
                                          train_feats, base, train_labels)
    ovr_machine.train()
    label_mc = ovr_machine.apply(test_feats)
    out_mc = label_mc.get_labels()

    # The same problem expressed as an ECOC code book.
    ecoc_machine = LinearMulticlassMachine(
        ECOCStrategy(ECOCOVREncoder(), ECOCLLBDecoder()),
        train_feats, base, train_labels)
    ecoc_machine.train()
    label_ecoc = ecoc_machine.apply(test_feats)
    out_ecoc = label_ecoc.get_labels()

    # Disagreement count between the two formulations (kept for parity
    # with the original example; not printed).
    n_diff = (out_mc != out_ecoc).sum()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        truth = MulticlassLabels(label_test_multiclass)
        evaluator = MulticlassAccuracy()
        acc_mc = evaluator.evaluate(label_mc, truth)
        acc_ecoc = evaluator.evaluate(label_ecoc, truth)

    return out_ecoc, out_mc
Example #9
0
 def fit(self, X, labels):
   """Fit shogun LMNN on (X, labels); store the learned transform in self.L.

   Assumes X is (n_samples, n_features) -- shogun's RealFeatures expects
   features-by-samples, hence the transpose. TODO confirm against callers.
   """
   self.X = X
   # Identity transform: both the initial value of self.L and the explicit
   # LMNN initialization when PCA is disabled below.
   self.L = np.eye(X.shape[1])
   # shogun requires float64 multiclass labels.
   labels = MulticlassLabels(labels.astype(np.float64))
   self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.params['k'])
   self._lmnn.set_maxiter(self.params['max_iter'])
   self._lmnn.set_obj_threshold(self.params['convergence_tol'])
   self._lmnn.set_regularization(self.params['regularization'])
   self._lmnn.set_stepsize(self.params['learn_rate'])
   if self.params['use_pca']:
     # No explicit init: shogun chooses its own (presumably PCA-based)
     # initialization -- TODO confirm against shogun LMNN docs.
     self._lmnn.train()
   else:
     self._lmnn.train(self.L)
   self.L = self._lmnn.get_linear_transform()
   return self
Example #10
0
 def fit(self, X, labels, verbose=False):
   """Fit shogun LMNN on (X, labels); store the learned transform in self.L.

   `verbose` is accepted for interface compatibility but unused here.
   Assumes X is (n_samples, n_features) -- RealFeatures expects
   features-by-samples, hence the transpose. TODO confirm against callers.
   """
   self.X = X
   # Identity transform: both the initial value of self.L and the explicit
   # LMNN initialization when PCA is disabled below.
   self.L = np.eye(X.shape[1])
   # shogun requires float64 multiclass labels.
   labels = MulticlassLabels(labels.astype(np.float64))
   self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, self.params['k'])
   self._lmnn.set_maxiter(self.params['max_iter'])
   self._lmnn.set_obj_threshold(self.params['convergence_tol'])
   self._lmnn.set_regularization(self.params['regularization'])
   self._lmnn.set_stepsize(self.params['learn_rate'])
   if self.params['use_pca']:
     # No explicit init: shogun chooses its own (presumably PCA-based)
     # initialization -- TODO confirm against shogun LMNN docs.
     self._lmnn.train()
   else:
     self._lmnn.train(self.L)
   self.L = self._lmnn.get_linear_transform()
   return self
def evaluation_clustering(features=fea, ground_truth=gnd_raw, ncenters=10):
    """Cluster `features`, map cluster ids to labels and score the result.

    Returns (ground_truth_labels, predicted_labels, accuracy).
    """
    from modshogun import ClusteringAccuracy, ClusteringMutualInformation
    from modshogun import MulticlassLabels
    from modshogun import Math

    # Fixed seed so the clustering results are reproducible.
    Math.init_random(1)

    centroids = run_clustering(features, ncenters)
    predicted = assign_labels(features, centroids, ncenters)
    truth = MulticlassLabels(ground_truth)

    accuracy_eval = ClusteringAccuracy()
    # best_map permutes cluster ids to best match the ground truth.
    accuracy_eval.best_map(predicted, truth)
    accuracy = accuracy_eval.evaluate(predicted, truth)

    mi_eval = ClusteringMutualInformation()
    mutual_info = mi_eval.evaluate(predicted, truth)

    # Mutual information is computed but not returned: it does not
    # survive serialization (see original TODO).
    return truth, predicted, accuracy
Example #12
0
def metric_lmnn_modular(train_fname=traindat,
                        test_fname=testdat,
                        label_train_fname=label_traindat,
                        k=3):
    """Learn an LMNN metric from CSV data and classify the test set with KNN.

    Returns (trained_lmnn, predicted_test_labels), or None if shogun is
    unavailable.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, LMNN, KNN, CSVFile
    except ImportError:
        return

    # Load features and labels straight from the CSV files.
    train_feats = RealFeatures(CSVFile(train_fname))
    test_feats = RealFeatures(CSVFile(test_fname))
    train_labels = MulticlassLabels(CSVFile(label_train_fname))

    # Learn the distance metric with LMNN.
    lmnn = LMNN(train_feats, train_labels, k)
    lmnn.train()

    # Classify with KNN under the learned distance.
    knn = KNN(k, lmnn.get_distance(), train_labels)
    knn.train()
    output = knn.apply(test_feats).get_labels()

    return lmnn, output
Example #13
0
        def RunNBCShogun():
            """Benchmark a Naive Bayes classifier built via self.BuildModel.

            Times feature construction, training and prediction; returns
            [time, predictions] (or [time] for a single-file dataset), or
            [-1] on any failure. Timer, Log, np and `options` come from the
            enclosing scope.
            """
            totalTimer = Timer()
            self.predictions = None
            Log.Info("Loading dataset", self.verbose)
            try:
                # Load train and test dataset.
                trainData = np.genfromtxt(self.dataset[0], delimiter=',')
                testData = np.genfromtxt(self.dataset[1], delimiter=',')

                # Labels are the last column of the training set.
                labels = MulticlassLabels(trainData[:,
                                                    (trainData.shape[1] - 1)])

                with totalTimer:
                    # Transform into features (shogun expects
                    # features-by-samples, hence the transposes).
                    trainFeat = RealFeatures(trainData[:, :-1].T)
                    testFeat = RealFeatures(testData.T)

                    # Create and train the classifier.
                    self.model = self.BuildModel(trainFeat, labels, options)

                    # Run Naive Bayes Classifier on the test dataset.
                    self.predictions = self.model.apply_multiclass(
                        testFeat).get_labels()

            except Exception as e:
                # Any failure is reported to the benchmark harness as -1.
                return [-1]

            time = totalTimer.ElapsedTime()
            if len(self.dataset) > 1:
                return [time, self.predictions]

            return [time]
Example #14
0
    def fit(self, feats, labels):
        """Project data with the space model, then learn an LMNN transform.

        Stores the trained LMNN on self.metric_model, the learned matrix on
        self.linear_transform and the column-normalised projection on
        self.projected_data. Returns (eigenvecs, projected_data).
        Assumes self.space is features-by-samples after the space model --
        TODO confirm.
        """
        self.eigenvecs, self.space = self.space_model.fit(feats, labels)
        feat = RealFeatures(self.space)
        # shogun requires float64 multiclass labels.
        self.metric_model = shogun_LMNN(
            feat, MulticlassLabels(labels.astype(np.float64)), self.k)
        self.metric_model.set_maxiter(1000)
        self.metric_model.set_regularization(0.50)
        self.metric_model.set_obj_threshold(0.001)
        self.metric_model.set_stepsize(1e-7)

        #pdb.set_trace()
        # Initialise LMNN from the identity transform.
        L = np.eye(self.space.shape[1])
        self.metric_model.train(L)

        # Training statistics are fetched but only used by the (commented)
        # diagnostic plots below.
        stats = self.metric_model.get_statistics()
        #plt.plot(stats.obj.get())
        #plt.grid(True)
        #plt.show()
        self.linear_transform = self.metric_model.get_linear_transform()

        # Project and L2-normalise each column of the projected data.
        self.projected_data = np.dot(self.linear_transform, self.space)
        norms = np.linalg.norm(self.projected_data, axis=0)
        self.projected_data /= norms
        # Fit the data with PCA first.
        # pdb.set_trace()
        return self.eigenvecs, self.projected_data
def classifier_multiclassmachine_modular(fm_train_real=traindat,
                                         fm_test_real=testdat,
                                         label_train_multiclass=label_traindat,
                                         width=2.1,
                                         C=1,
                                         epsilon=1e-5):
    """Kernel one-vs-rest multiclass machine (LibSVM base); return test labels."""
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import GaussianKernel
    from modshogun import LibSVM, KernelMulticlassMachine, MulticlassOneVsRestStrategy

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    kernel = GaussianKernel(train_feats, train_feats, width)

    train_labels = MulticlassLabels(label_train_multiclass)

    base_svm = LibSVM()
    base_svm.set_epsilon(epsilon)
    machine = KernelMulticlassMachine(MulticlassOneVsRestStrategy(),
                                      kernel, base_svm, train_labels)
    machine.train()

    # Re-initialise the kernel against the test features before applying.
    kernel.init(train_feats, test_feats)
    return machine.apply().get_labels()
Example #16
0
        def RunDTCShogun(q):
            """Benchmark a decision-tree (CARTree) classifier.

            Builds the model via self.BuildModel, times train+predict, and
            puts the elapsed time (or -1 on failure) onto queue `q`. Timer,
            Log and `options` come from the enclosing scope.
            """
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            trainData, labels = SplitTrainData(self.dataset)
            # shogun expects features-by-samples, hence the transposes.
            trainData = RealFeatures(trainData.T)
            labels = MulticlassLabels(labels)
            testData = RealFeatures(LoadDataset(self.dataset[1]).T)

            # This runner accepts no extra options.
            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            try:
                with totalTimer:
                    self.model = self.BuildModel(trainData, labels, options)
                    # Run the CARTree Classifier on the test dataset.
                    self.model.apply_multiclass(testData).get_labels()
            except Exception as e:
                # Report failure to the benchmark harness.
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)

            return time
Example #17
0
 def BuildModel(self, data, responses):
     """Build and train a shogun Perceptron.

     data: samples-by-features array (transposed for RealFeatures, which
     expects features-by-samples). responses: multiclass label vector.
     Returns the trained model.
     """
     # Create and train the classifier.
     model = Perceptron(RealFeatures(data.T), MulticlassLabels(responses))
     # Cap iterations only when the caller configured a limit.
     if self.iterations:
         model.set_max_iter(self.iterations)
     model.train()
     return model
Example #18
0
    def RunRandomForestShogun(q):
      """Benchmark a Random Forest classifier built via self.BuildModel.

      Parses -n (number of trees) and -f (attributes sampled per split)
      from `options`, times train+predict, and puts the elapsed time
      (or -1 on failure) onto queue `q`.

      Fix: the regex patterns are now raw strings; "\\d" in a plain string
      is an invalid escape sequence (SyntaxWarning on modern Python).
      """
      totalTimer = Timer()

      Log.Info("Loading dataset", self.verbose)
      trainData, labels = SplitTrainData(self.dataset)
      # shogun expects features-by-samples, hence the transposes.
      trainData = RealFeatures(trainData.T)
      labels = MulticlassLabels(labels)
      testData = RealFeatures(LoadDataset(self.dataset[1]).T)

      # Number of Trees.
      n = re.search(r"-n (\d+)", options)
      # Number of attributes to be chosen randomly to select from.
      f = re.search(r"-f (\d+)", options)

      self.form = 1 if not f else int(f.group(1))
      self.numTrees = 10 if not n else int(n.group(1))

      try:
        with totalTimer:
          self.model = self.BuildModel(trainData, labels, options)
          # Run the Random Forest Classifier on the test dataset.
          self.model.apply_multiclass(testData).get_labels()
      except Exception as e:
        # Report failure to the benchmark harness.
        q.put(-1)
        return -1

      time = totalTimer.ElapsedTime()
      q.put(time)
      return time
Example #19
0
        def RunNBCShogun(q):
            """Benchmark shogun's GaussianNaiveBayes.

            Times feature construction, training and prediction; puts the
            elapsed time (or -1 on failure) onto queue `q`. Timer, Log and
            np come from the enclosing scope.
            """
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            try:
                # Load train and test dataset.
                trainData = np.genfromtxt(self.dataset[0], delimiter=',')
                testData = np.genfromtxt(self.dataset[1], delimiter=',')

                # Labels are the last column of the training set.
                labels = MulticlassLabels(trainData[:,
                                                    (trainData.shape[1] - 1)])

                with totalTimer:
                    # Transform into features (shogun expects
                    # features-by-samples, hence the transposes).
                    trainFeat = RealFeatures(trainData[:, :-1].T)
                    testFeat = RealFeatures(testData.T)

                    # Create and train the classifier.
                    nbc = GaussianNaiveBayes(trainFeat, labels)
                    nbc.train()

                    # Run Naive Bayes Classifier on the test dataset.
                    nbc.apply(testFeat).get_labels()
            except Exception as e:
                # Report failure to the benchmark harness.
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
def multiclass_randomforest_modular(train=traindat,
                                    test=testdat,
                                    labels=label_traindat,
                                    ft=feattypes):
    """Train a 20-tree RandomForest from CSV data.

    Returns (trained_forest, test_predictions), or None if shogun is
    unavailable.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, RandomForest, MajorityVote
    except ImportError:
        print("Could not import Shogun modules")
        return

    # Wrap features and labels into Shogun objects.
    train_feats = RealFeatures(CSVFile(train))
    test_feats = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # 20 trees, one attribute sampled per split; majority vote combines trees.
    forest = RandomForest(train_feats, train_labels, 20, 1)
    forest.set_feature_types(ft)
    forest.set_combination_rule(MajorityVote())
    forest.train()

    # Classify test data.
    predictions = forest.apply_multiclass(test_feats).get_labels()

    return forest, predictions
Example #21
0
        def RunRandomForestShogun():
            """Benchmark a Random Forest classifier built via self.BuildModel.

            Reads 'num_trees' (required) and 'dimensions' (optional,
            presumably attributes sampled per split -- confirm) from
            `options`, times train+predict, and returns [time, predictions]
            or [-1] on failure.
            """
            totalTimer = Timer()

            Log.Info("Loading dataset", self.verbose)
            trainData, labels = SplitTrainData(self.dataset)
            # shogun expects features-by-samples, hence the transposes.
            trainData = RealFeatures(trainData.T)
            labels = MulticlassLabels(labels)
            testData = RealFeatures(LoadDataset(self.dataset[1]).T)

            if "num_trees" in options:
                self.numTrees = int(options.pop("num_trees"))
            else:
                Log.Fatal("Required parameter 'num_trees' not specified!")
                raise Exception("missing parameter")

            self.form = 1
            if "dimensions" in options:
                self.form = int(options.pop("dimensions"))

            # Anything left in options is unrecognised.
            if len(options) > 0:
                Log.Fatal("Unknown parameters: " + str(options))
                raise Exception("unknown parameters")

            try:
                with totalTimer:
                    self.model = self.BuildModel(trainData, labels, options)
                    # Run the Random Forest Classifier on the test dataset.
                    self.predictions = self.model.apply_multiclass(
                        testData).get_labels()
            except Exception as e:
                # Report failure to the benchmark harness.
                return [-1]

            time = totalTimer.ElapsedTime()
            return [time, self.predictions]
def classifier_multiclassocas_modular (num_vec=10,num_class=3,distance=15,width=2.1,C=1,epsilon=1e-5,seed=1):
	"""Generate a linearly separable toy problem and train MulticlassOCAS.

	Returns (predicted_test_labels, trained_classifier).

	Fix: the class labels are float64 (as required by MulticlassLabels),
	but were also used directly as NumPy row indices; modern NumPy raises
	IndexError for non-integer indices, so they are cast with int() first.
	"""
	from modshogun import RealFeatures, MulticlassLabels
	from modshogun import MulticlassOCAS
	from modshogun import Math_init_random

	# reproducible results
	random.seed(seed)
	Math_init_random(seed)

	# generate some training data where each class pair is linearly separable
	label_train=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
	label_test=array([mod(x,num_class) for x in range(num_vec)],dtype="float64")
	fm_train=array(random.randn(num_class,num_vec))
	fm_test=array(random.randn(num_class,num_vec))
	for i in range(len(label_train)):
		# Shift the feature of each vector's own class to separate classes.
		# int() cast: float indices are rejected by NumPy >= 1.12.
		fm_train[int(label_train[i]),i]+=distance
		fm_test[int(label_test[i]),i]+=distance

	feats_train=RealFeatures(fm_train)
	feats_test=RealFeatures(fm_test)

	labels=MulticlassLabels(label_train)

	classifier = MulticlassOCAS(C,feats_train,labels)
	classifier.train()

	out = classifier.apply(feats_test).get_labels()
	return out,classifier
Example #23
0
    def fit(self, feats, labels):
        """Learn an LMNN linear transform on (feats, labels).

        feats is assumed to be (n_samples, n_features); it is transposed
        because RealFeatures expects features-by-samples. Stores the trained
        LMNN on self.metric_model and the learned matrix on
        self.linear_transform. Returns self.
        """
        self.X_tr = feats
        self.y_train = labels

        # shogun requires float64 data and labels.
        feats = feats.astype(np.float64)
        feat = RealFeatures(feats.T)
        self.metric_model = shogun_LMNN(
            feat, MulticlassLabels(labels.astype(np.float64)), self.k)
        self.metric_model.set_maxiter(self.max_iter)
        self.metric_model.set_regularization(self.regularization)
        self.metric_model.set_obj_threshold(self.convergence_tol)
        self.metric_model.set_stepsize(self.learn_rate)

        # No explicit initial transform: shogun chooses its own
        # initialization -- TODO confirm against shogun LMNN docs.
        self.metric_model.train()

        # Training statistics are fetched but only used by the (commented)
        # diagnostic plots below.
        stats = self.metric_model.get_statistics()
        #pdb.set_trace()
        #plt.plot(stats.obj.get())
        #plt.grid(True)
        #plt.show()
        self.linear_transform = self.metric_model.get_linear_transform()

        #self.projected_data = np.dot(self.linear_transform, feats.T)
        #norms = np.linalg.norm(self.projected_data, axis=0)
        #self.projected_data /= norms
        # Fit the data with PCA first.
        # pdb.set_trace()
        return self
def multiclass_c45classifiertree_modular(train=traindat,
                                         test=testdat,
                                         labels=label_traindat,
                                         ft=feattypes):
    """Train and prune a C4.5 tree; return (tree, predictions, certainty).

    Fix: the train/validation split used true division (``subset.size / 3``),
    which yields a float on Python 3 and raises TypeError when used as a
    slice bound -- floor division (``//``) is used instead.
    """
    try:
        from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
        from numpy import random, int32
    except ImportError:
        print("Could not import Shogun and/or numpy modules")
        return

    # wrap features and labels into Shogun objects
    feats_train = RealFeatures(CSVFile(train))
    feats_test = RealFeatures(CSVFile(test))
    train_labels = MulticlassLabels(CSVFile(labels))

    # divide train dataset into training and validation subsets in the ratio 2/3 to 1/3
    subset = int32(random.permutation(feats_train.get_num_vectors()))
    # Floor division keeps the slice bounds integral on Python 3.
    vsubset = subset[1:subset.size // 3]
    trsubset = subset[1 + subset.size // 3:subset.size]

    # C4.5 Tree formation using training subset
    train_labels.add_subset(trsubset)
    feats_train.add_subset(trsubset)

    c = C45ClassifierTree()
    c.set_labels(train_labels)
    c.set_feature_types(ft)
    c.train(feats_train)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # prune tree using validation subset
    train_labels.add_subset(vsubset)
    feats_train.add_subset(vsubset)

    c.prune_tree(feats_train, train_labels)

    train_labels.remove_subset()
    feats_train.remove_subset()

    # Classify test data
    output = c.apply_multiclass(feats_test).get_labels()
    output_certainty = c.get_certainty_vector()

    return c, output, output_certainty
Example #25
0
 def BuildModel(self, data, responses):
   """Build and train a multiclass logistic regression model.

   data: samples-by-features array (transposed for RealFeatures, which
   expects features-by-samples). responses: multiclass label vector.
   self.z is the regularization constant. Returns the trained model.
   """
   # Create and train the classifier.
   model = MulticlassLogisticRegression(self.z, RealFeatures(data.T),
       MulticlassLabels(responses))
   # Cap iterations only when the caller configured a limit.
   if self.max_iter is not None:
     model.set_max_iter(self.max_iter);
   model.train()
   return model
Example #26
0
        def RunAllKnnShogun(q):
            """Benchmark shogun's KNN over the reference (and optional query) set.

            Parses -k from `options`, trains KNN with a Euclidean distance,
            and puts the elapsed time (or -1 on failure) onto queue `q`.

            Fixes: the query features were built from the not-yet-defined
            name ``queryFeat`` (NameError) -- they are now built from
            ``queryData``; the invalid-k error message called ``.group(1)``
            on the already-int ``k`` -- the match object is now kept in a
            separate name; the regex uses a raw string.
            """
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the query
            # file.
            try:
                Log.Info("Loading dataset", self.verbose)
                if len(self.dataset) == 2:
                    referenceData = np.genfromtxt(self.dataset[0],
                                                  delimiter=',')
                    queryData = np.genfromtxt(self.dataset[1], delimiter=',')
                    # Was RealFeatures(queryFeat.T): name used before assignment.
                    queryFeat = RealFeatures(queryData.T)
                else:
                    referenceData = np.genfromtxt(self.dataset, delimiter=',')

                # Labels are the last column of the dataset.
                labels = MulticlassLabels(
                    referenceData[:, (referenceData.shape[1] - 1)])
                referenceData = referenceData[:, :-1]

                with totalTimer:
                    # Get all the parameters.
                    k_match = re.search(r"-k (\d+)", options)
                    if not k_match:
                        Log.Fatal(
                            "Required option: Number of furthest neighbors to find."
                        )
                        q.put(-1)
                        return -1
                    else:
                        k = int(k_match.group(1))
                        if (k < 1 or k > referenceData.shape[0]):
                            Log.Fatal("Invalid k: " + k_match.group(1) +
                                      "; must be greater than 0" +
                                      " and less or equal than " +
                                      str(referenceData.shape[0]))
                            q.put(-1)
                            return -1

                    referenceFeat = RealFeatures(referenceData.T)
                    distance = EuclideanDistance(referenceFeat, referenceFeat)

                    # Perform All K-Nearest-Neighbors.
                    model = SKNN(k, distance, labels)
                    model.train()

                    if len(self.dataset) == 2:
                        out = model.apply(queryFeat).get_labels()
                    else:
                        out = model.apply(referenceFeat).get_labels()
            except Exception as e:
                # Report failure to the benchmark harness.
                q.put(-1)
                return -1

            time = totalTimer.ElapsedTime()
            q.put(time)
            return time
Example #27
0
        def RunLDAShogun():
            """Benchmark shogun's MCLDA (multiclass LDA).

            Remaps arbitrary labels onto {0..num_classes-1} (required by
            MCLDA), times training, predicts on the test set and maps the
            predictions back to the original labels. Returns the elapsed
            training time, or -1 on failure.
            """
            totalTimer = Timer()

            # Load input dataset.
            # If the dataset contains two files then the second file is the test file.
            try:
                if len(self.dataset) > 1:
                    testSet = LoadDataset(self.dataset[1])

                # Use the last row of the training set as the responses.
                trainSet, trainLabels = SplitTrainData(self.dataset)
                # if the labels are not in {0,1,2,...,num_classes-1}, map them to this set and store the mapping
                # shogun's MCLDA class requires the labels to be in {0,1,2,...,num_classes-1}
                distinctLabels = list(set(trainLabels))
                mapping = {}
                reverseMapping = {}
                idx = 0
                for label in distinctLabels:
                    mapping[label] = idx
                    reverseMapping[idx] = label
                    idx += 1
                for i in range(len(trainLabels)):
                    trainLabels[i] = mapping[trainLabels[i]]

                # shogun expects features-by-samples, hence the transpose.
                trainFeat = RealFeatures(trainSet.T)
                trainLabels = MulticlassLabels(trainLabels)
                # Gather optional parameters.
                if "tolerance" in options:
                    self.tolerance = float(options.pop("tolerance"))

                if "store" in options:
                    self.store = bool(options.pop("store"))

                if (len(options) > 0):
                    Log.Fatal("Unknown parameters: " + str(options))
                    raise Exception("unknown parameters")

                # Only model construction and training are timed.
                with totalTimer:
                    self.model = MCLDA(trainFeat, trainLabels, self.tolerance,
                                       self.store)
                    self.model.train()

                if (len(self.dataset) > 0):
                    self.predictions = self.model.apply_multiclass(
                        RealFeatures(testSet.T))
                    self.predictions = self.predictions.get_labels()
                    # reverse map the predicted labels to actual labels
                    for i in range(len(self.predictions)):
                        self.predictions[i] = reverseMapping[
                            self.predictions[i]]

            except Exception as e:
                Log.Info("Exception: " + str(e))
                return -1

            time = totalTimer.ElapsedTime()
            return time
Example #28
0
    def _svm_new(self, kernel_width, c, epsilon):
        """Create a GMNP SVM with a Gaussian kernel over the loaded data.

        Stores the machine on self.svm. Raises Exception if no training
        data (self.x / self.y) has been loaded yet.

        Fix: `is None` replaces `== None` -- when self.x is a NumPy array,
        `== None` broadcasts to an element-wise boolean array, which is
        ambiguous inside `or`.
        """
        if self.x is None or self.y is None:
            raise Exception("No training data loaded.")

        x = RealFeatures(self.x)
        y = MulticlassLabels(self.y)

        self.svm = GMNPSVM(c, GaussianKernel(x, x, kernel_width), y)
        self.svm.set_epsilon(epsilon)
Example #29
0
def main():
    """Learn a Mahalanobis metric with LMNN on a libSVM-format dataset.

    Reads the training file name from argv[1], shuffles the examples,
    keeps the first 5000, trains Shogun's LMNN and saves the learnt
    metric matrix M = L^T L to model.npy.
    """
    # Get training file name from the command line
    traindatafile = sys.argv[1]

    # The training file is in libSVM format
    with open(traindatafile, mode="r") as myFile:
        lines = myFile.readlines()

    random.shuffle(lines)
    # Context manager (instead of the original open(...).writelines(...))
    # guarantees the temp file is flushed and closed before
    # load_svmlight_file reads it back.
    with open("tempdata.dat", 'w') as tmpFile:
        tmpFile.writelines(lines)

    tr_data = load_svmlight_file("tempdata.dat")
    #To randomly select 5000 points

    Xtr = tr_data[0].toarray()
    # Converts sparse matrices to dense
    Ytr = tr_data[1]
    # The trainig labels

    Xtr = Xtr[:5000]
    Ytr = Ytr[:5000]
    # Cast data to Shogun format to work with LMNN
    features = RealFeatures(Xtr.T)
    labels = MulticlassLabels(Ytr.astype(np.float64))

    #print(Xtr.shape)
    ### Do magic stuff here to learn the best metric you can ###
    kmax = 25  #inductive bias
    values = list(range(1, kmax + 1))
    k = predict(Xtr, Ytr, values)
    # Number of target neighbours per example - tune this using validation
    #print(k)
    # Initialize the LMNN package
    print("K : "),
    print(k)

    # NOTE(review): this discards the k tuned by predict() above and
    # hard-codes 5 — confirm whether the override is intentional.
    k = 5
    lmnn = LMNN(features, labels, k)
    init_transform = np.eye(Xtr.shape[1])

    # Choose an appropriate timeout
    lmnn.set_maxiter(25000)
    lmnn.train(init_transform)

    # Let LMNN do its magic and return a linear transformation
    # corresponding to the Mahalanobis metric it has learnt
    L = lmnn.get_linear_transform()
    M = np.matrix(np.dot(L.T, L))

    print("LMNN done")
    #print(M)
    # Save the model for use in testing phase
    # Warning: do not change this file name
    np.save("model.npy", M)
    def load_train(self):
        """Load the stored image/label data and fit a k-NN classifier.

        NOTE(review): despite the name, this reads self.test_images and
        self.test_labels — confirm that is the intended data split.
        """
        images, targets = self.load(self.test_images, self.test_labels)
        self.test_images = images
        self.test_labels = targets

        # Wrap data in Shogun containers (features are column-major, hence
        # the transpose) and train a KNN with Euclidean distance.
        shogun_labels = MulticlassLabels(self.test_labels)
        shogun_feats = RealFeatures(self.test_images.T)
        self.knn = KNN(self.k, EuclideanDistance(), shogun_labels)
        self.knn.train(shogun_feats)
Example #31
0
def classifier_multiclass_ecoc_discriminant(
        fm_train_real=traindat,
        fm_test_real=testdat,
        label_train_multiclass=label_traindat,
        label_test_multiclass=label_testdat,
        lawidth=2.1,
        C=1,
        epsilon=1e-5):
    """Multiclass classification via an ECOC discriminant code matrix.

    Trains a LinearMulticlassMachine whose binary sub-problems come from
    an ECOCDiscriminantEncoder / Hamming-distance decoder pair, using
    LibLinear (L2-regularized L2-loss SVC) as the base binary learner.
    If test labels are given, prints the multiclass accuracy.

    Returns the predicted labels for fm_test_real as a numpy array.
    """
    from modshogun import RealFeatures, MulticlassLabels
    from modshogun import LibLinear, L2R_L2LOSS_SVC, LinearMulticlassMachine
    from modshogun import ECOCStrategy, ECOCDiscriminantEncoder, ECOCHDDecoder

    train_feats = RealFeatures(fm_train_real)
    test_feats = RealFeatures(fm_test_real)
    train_labels = MulticlassLabels(label_train_multiclass)

    # Base binary learner shared by every ECOC sub-problem.
    base_svm = LibLinear(L2R_L2LOSS_SVC)
    base_svm.set_epsilon(epsilon)
    base_svm.set_bias_enabled(True)

    # Learn the discriminant code matrix from the training data.
    code_encoder = ECOCDiscriminantEncoder()
    code_encoder.set_features(train_feats)
    code_encoder.set_labels(train_labels)
    code_encoder.set_sffs_iterations(50)

    machine = LinearMulticlassMachine(
        ECOCStrategy(code_encoder, ECOCHDDecoder()), train_feats, base_svm,
        train_labels)
    machine.train()

    predicted = machine.apply(test_feats)
    out = predicted.get_labels()

    if label_test_multiclass is not None:
        from modshogun import MulticlassAccuracy
        true_labels = MulticlassLabels(label_test_multiclass)
        acc = MulticlassAccuracy().evaluate(predicted, true_labels)
        print('Accuracy = %.4f' % acc)

    return out
Example #32
0
def load_data(num_train_samples=7291, m_data_dict=data_dict):
	"""Wrap the USPS-style digit data dict in Shogun feature/label objects.

	Takes the first num_train_samples training examples from m_data_dict
	(keys 'xTr'/'yTr'/'xTe'/'yTe', features stored column-wise) and returns
	(train_features, train_labels, test_features, test_labels).
	"""
	from modshogun import RealFeatures, MulticlassLabels
	import numpy

	train_vec = m_data_dict['yTr'][0][:num_train_samples].astype(numpy.float64)
	train_labels = MulticlassLabels(train_vec)
	test_vec = m_data_dict['yTe'][0].astype(numpy.float64)
	# Fixed: this line previously had a stray space before its tab indent,
	# which is a TabError under Python 3.
	test_labels = MulticlassLabels(test_vec)
	# print() calls (not py2 print statements) for consistency with the rest
	# of the file; single-argument form prints identically on py2 and py3.
	print("#train_labels = " + str(train_labels.get_num_labels()))
	print("#test_labels  = " + str(test_labels.get_num_labels()))

	train_mat = m_data_dict['xTr'][:,:num_train_samples].astype(numpy.float64)
	train_features = RealFeatures(train_mat)
	test_mat = m_data_dict['xTe'].astype(numpy.float64)
	test_features = RealFeatures(test_mat)
	print("#train_vectors = " + str(train_features.get_num_vectors()))
	print("#test_vectors  = " + str(test_features.get_num_vectors()))
	print("data dimension = " + str(test_features.get_num_features()))

	return train_features, train_labels, test_features, test_labels
def multiclass_c45classifiertree_modular(train=traindat,test=testdat,labels=label_traindat,ft=feattypes):
	"""Train, prune and apply a C4.5 decision tree on CSV data.

	Splits the training set 2:1 into a training subset (tree induction)
	and a validation subset (pruning), then classifies the test set.

	Returns (tree, predicted_labels, certainty_vector), or None if the
	required modules are missing.
	"""
	try:
		from modshogun import RealFeatures, MulticlassLabels, CSVFile, C45ClassifierTree
		from numpy import random, int32
	except ImportError:
		print("Could not import Shogun and/or numpy modules")
		return

	# wrap features and labels into Shogun objects
	feats_train=RealFeatures(CSVFile(train))
	feats_test=RealFeatures(CSVFile(test))
	train_labels=MulticlassLabels(CSVFile(labels))

	# divide train dataset into training and validation subsets in the ratio 2/3 to 1/3.
	# Floor division (//) is required: under Python 3, `size/3` is a float and
	# float slice indices raise TypeError.
	# NOTE(review): index 0 of the permutation is used by neither subset —
	# confirm whether the 1-based slicing is intentional.
	subset=int32(random.permutation(feats_train.get_num_vectors()))
	vsubset=subset[1:subset.size//3]
	trsubset=subset[1+subset.size//3:subset.size]

	# C4.5 Tree formation using training subset
	train_labels.add_subset(trsubset)
	feats_train.add_subset(trsubset)

	c=C45ClassifierTree()
	c.set_labels(train_labels)
	c.set_feature_types(ft)
	c.train(feats_train)

	train_labels.remove_subset()
	feats_train.remove_subset()

	# prune tree using validation subset
	train_labels.add_subset(vsubset)
	feats_train.add_subset(vsubset)

	c.prune_tree(feats_train,train_labels)

	train_labels.remove_subset()
	feats_train.remove_subset()

	# Classify test data
	output=c.apply_multiclass(feats_test).get_labels()
	output_certainty=c.get_certainty_vector()

	return c,output,output_certainty
def metric_lmnn_statistics(
    k=3,
    fname_features="../../data/fm_train_multiclass_digits.dat.gz",
    fname_labels="../../data/label_train_multiclass_digits.dat",
):
    """Train LMNN on the multiclass digits data and plot its objective.

    Loads features/labels from the given files, trains LMNN with k target
    neighbours, and plots the objective value per iteration. Returns None;
    also returns None early if required modules cannot be imported.
    """
    try:
        from modshogun import LMNN, CSVFile, RealFeatures, MulticlassLabels, MSG_DEBUG
        import matplotlib.pyplot as pyplot
    except ImportError:
        # print() calls replace the Python-2-only print statements, matching
        # the py3 style used elsewhere in this file (same output either way).
        print("Error importing modshogun or other required modules. Please, verify their installation.")
        return

    features = RealFeatures(load_compressed_features(fname_features).T)
    labels = MulticlassLabels(CSVFile(fname_labels))

    # 	print 'number of examples = %d' % features.get_num_vectors()
    # 	print 'number of features = %d' % features.get_num_features()

    assert features.get_num_vectors() == labels.get_num_labels()

    # train LMNN
    lmnn = LMNN(features, labels, k)
    lmnn.set_correction(100)
    # 	lmnn.io.set_loglevel(MSG_DEBUG)
    print("Training LMNN, this will take about two minutes...")
    lmnn.train()
    print("Training done!")

    # plot objective obtained during training
    statistics = lmnn.get_statistics()

    pyplot.plot(statistics.obj.get())
    pyplot.grid(True)
    pyplot.xlabel("Iterations")
    pyplot.ylabel("LMNN objective")
    pyplot.title("LMNN objective during training for the multiclass digits data set")

    pyplot.show()
def features_io_modular (fm_train_real, label_train_twoclass):
	"""Round-trip Shogun features and labels through several file formats.

	Saves sparse/dense real features and multiclass labels to Binary,
	LibSVM, CSV and HDF5 files under tmp/, loads each back, deletes the
	temporary files, and returns (feats, feats2, lab, lab2) — the last
	saved originals and the last loaded copies.

	NOTE(review): assumes a writable tmp/ directory already exists.
	"""
	import numpy
	from modshogun import SparseRealFeatures, RealFeatures, MulticlassLabels
	from modshogun import GaussianKernel
	from modshogun import LibSVMFile, CSVFile, BinaryFile, HDF5File

	# --- sparse features: save to binary + LibSVM ASCII, then reload ---
	feats=SparseRealFeatures(fm_train_real)
	feats2=SparseRealFeatures()

	f=BinaryFile("tmp/fm_train_sparsereal.bin","w")
	feats.save(f)

	f=LibSVMFile("tmp/fm_train_sparsereal.ascii","w")
	feats.save(f)

	f=BinaryFile("tmp/fm_train_sparsereal.bin")
	feats2.load(f)

	f=LibSVMFile("tmp/fm_train_sparsereal.ascii")
	feats2.load(f)

	# --- dense features: save to binary + HDF5 + CSV, then reload ---
	feats=RealFeatures(fm_train_real)
	feats2=RealFeatures()

	f=BinaryFile("tmp/fm_train_real.bin","w")
	feats.save(f)

	f=HDF5File("tmp/fm_train_real.h5","w", "/data/doubles")
	feats.save(f)

	f=CSVFile("tmp/fm_train_real.ascii","w")
	feats.save(f)

	f=BinaryFile("tmp/fm_train_real.bin")
	feats2.load(f)
	#print("diff binary", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))

	f=CSVFile("tmp/fm_train_real.ascii")
	feats2.load(f)
	#print("diff ascii", numpy.max(numpy.abs(feats2.get_feature_matrix().flatten()-fm_train_real.flatten())))

	# --- labels: save a fixed 4-class label vector to CSV/binary/HDF5 ---
	lab=MulticlassLabels(numpy.array([0.0,1.0,2.0,3.0]))
	lab2=MulticlassLabels()
	f=CSVFile("tmp/label_train_twoclass.ascii","w")
	lab.save(f)

	f=BinaryFile("tmp/label_train_twoclass.bin","w")
	lab.save(f)

	f=HDF5File("tmp/label_train_real.h5","w", "/data/labels")
	lab.save(f)

	f=CSVFile("tmp/label_train_twoclass.ascii")
	lab2.load(f)

	f=BinaryFile("tmp/label_train_twoclass.bin")
	lab2.load(f)

	# --- HDF5 reload of both features and labels ---
	f=HDF5File("tmp/fm_train_real.h5","r", "/data/doubles")
	feats2.load(f)
	#print(feats2.get_feature_matrix())
	f=HDF5File("tmp/label_train_real.h5","r", "/data/labels")
	lab2.load(f)
	#print(lab2.get_labels())

	#clean up
	import os
	for f in ['tmp/fm_train_sparsereal.bin','tmp/fm_train_sparsereal.ascii',
			'tmp/fm_train_real.bin','tmp/fm_train_real.h5','tmp/fm_train_real.ascii',
			'tmp/label_train_real.h5', 'tmp/label_train_twoclass.ascii','tmp/label_train_twoclass.bin']:
		os.unlink(f)
	return feats, feats2, lab, lab2
Example #36
0
 def __init__(self, X, labels, k=3):
   """Wrap Shogun's LMNN around the data.

   X: sample matrix, one row per example (transposed for Shogun, which
   stores features column-wise). labels: per-row class labels, cast to
   float64 as MulticlassLabels requires. k: number of target neighbours.
   Initializes self.L to the identity transform; training happens later.
   """
   self.X = X
   self.L = np.eye(X.shape[1])
   # Shogun expects double-precision labels and column-major features.
   labels = MulticlassLabels(labels.astype(np.float64))
   self._lmnn = shogun_LMNN(RealFeatures(X.T), labels, k)
Example #37
0
def plot_data(x,y,axis):
	"""Scatter-plot each class in x on *axis*, one colour per class.

	x: (n, 2) point coordinates; y: per-point class labels; colours are
	taken from the module-level COLS indexed by class order.
	"""
	for class_idx, class_label in enumerate(numpy.unique(y)):
		points = x[y == class_label]
		axis.scatter(points[:, 0], points[:, 1], s=50,
		             facecolors='none', edgecolors=COLS[class_idx])

def plot_neighborhood_graph(x, nn, axis):
	"""Draw a segment from each point to its nearest neighbour on *axis*.

	x: (n, 2) point coordinates; nn: neighbour-index matrix where nn[1, i]
	is the nearest neighbour of point i (row 0 is the point itself).

	NOTE(review): line colours come from the module-level globals ``y`` and
	``COLS``, not from parameters — confirm callers keep ``y`` in scope.
	"""
	# range() replaces the Python-2-only xrange(), which is a NameError on
	# Python 3; behaviour is otherwise identical.
	for i in range(x.shape[0]):
		xs = [x[i,0], x[nn[1,i], 0]]
		ys = [x[i,1], x[nn[1,i], 1]]
		axis.plot(xs, ys, COLS[int(y[i])])

# Demo script: visualise KNN neighbourhoods on the "sandwich" toy data,
# then train LMNN on the same data (continues past this excerpt).
figure, axarr = pyplot.subplots(3, 1)
x, y = sandwich_data()

# Shogun stores features column-wise, hence the transpose.
features = RealFeatures(x.T)
labels = MulticlassLabels(y)

print('%d vectors with %d features' % (features.get_num_vectors(), features.get_num_features()))
assert(features.get_num_vectors() == labels.get_num_labels())

# 2-nearest-neighbour classifier under the plain Euclidean metric.
distance = EuclideanDistance(features, features)
k = 2
knn = KNN(k, distance, labels)

# Top panel: raw data plus nearest-neighbour graph before metric learning.
plot_data(x, y, axarr[0])
plot_neighborhood_graph(x, knn.nearest_neighbors(), axarr[0])
axarr[0].set_aspect('equal')
axarr[0].set_xlim(-6, 4)
axarr[0].set_ylim(-3, 2)

lmnn = LMNN(features, labels, k)