def setUp(self):
    """Load two 4-class random datasets for training and test."""
    # Different random states so the test set differs from the train set
    self.tr = CDLRandom(n_classes=4, n_clusters_per_class=1,
                        random_state=50000).load()
    self.ts = CDLRandom(n_classes=4, n_clusters_per_class=1,
                        random_state=10000).load()
def test_alignment(self):
    """Train a Sec-SVM and a standard SVM on the same data and check
    that their predictions coincide.

    FIX: the original assigned ``svm_pred = sec_svm.predict(ds.X)``,
    so the final assertion compared Sec-SVM against itself and the
    test could never fail. The prediction now comes from ``svm``.
    """
    ds = CDLRandom(n_samples=100, n_features=500, n_redundant=0,
                   n_informative=10, n_clusters_per_class=1,
                   random_state=0).load()

    self.logger.info("Train Sec SVM")
    sec_svm = CClassifierSecSVM(C=1, eta=0.1, eps=1e-2, lb=-0.1, ub=0.5)
    sec_svm.verbose = 2
    sec_svm.fit(ds.X, ds.Y)

    self.logger.info("Train SVM")
    svm = CClassifierSVM(C=1)
    svm.fit(ds.X, ds.Y)

    self._compute_alignment(ds, sec_svm, svm)

    svm_pred = svm.predict(ds.X)  # was sec_svm.predict (self-comparison)
    secsvm_pred = sec_svm.predict(ds.X)

    self.logger.info("SVM pred:\n{:}".format(svm_pred))
    self.logger.info("Sec-SVM pred:\n{:}".format(secsvm_pred))

    self.assert_array_almost_equal(secsvm_pred, svm_pred)
def _dataset_creation_blobs(self):
    """Build normalized train/test sets plus their PyTorch data loaders."""
    self.logger.info("\tTest dataset creation")

    # generate synthetic data
    full_ds = CDLRandom(n_samples=self.n_samples_tr + self.n_samples_ts,
                        n_classes=self.n_classes,
                        n_features=self.n_features,
                        n_redundant=0, n_clusters_per_class=1,
                        class_sep=2, random_state=0).load()

    # Split in training and test
    self.tr, self.ts = CTrainTestSplit(
        train_size=self.n_samples_tr,
        test_size=self.n_samples_ts,
        random_state=0).split(full_ds)

    # Normalize the data: fit on training, apply the same scaling to test
    normalizer = CNormalizerMinMax()
    self.tr.X = normalizer.fit_transform(self.tr.X)
    self.ts.X = normalizer.transform(self.ts.X)

    # Training loader is shuffled, test loader keeps the sample order
    self._tr_loader = CDataLoaderPyTorch(
        self.tr.X, self.tr.Y, self.batch_size,
        shuffle=True, transform=None).get_loader()
    self._ts_loader = CDataLoaderPyTorch(
        self.ts.X, self.ts.Y, self.batch_size,
        shuffle=False, transform=None).get_loader()
def test_openworldkfold(self):
    """Open World K-Fold must reproduce the precomputed fold indices."""
    ds = CDLRandom(n_classes=3, n_samples=14,
                   n_informative=3, random_state=0).load()

    self.logger.info("Testing Open World K-Fold")

    kf = CDataSplitterOpenWorldKFold(
        num_folds=2, n_train_samples=4,
        random_state=5000).compute_indices(ds)

    # Expected per-fold training and test indices
    expected_tr = [CArray([0, 4, 8, 12]), CArray([1, 3, 9, 13])]
    expected_ts = [CArray([1, 2, 3, 5, 6, 7, 9, 10, 11, 13]),
                   CArray([0, 2, 4, 5, 6, 7, 8, 10, 11, 12])]

    self.assertEqual(len(kf.tr_idx), 2)
    self.assertEqual(len(kf.ts_idx), 2)

    self.logger.info("DS classes:\n{:}".format(ds.Y))

    for fold in range(kf.num_folds):
        self.logger.info(
            "{:} fold:\nTR CLASSES {:}\nTR {:} {:}\nTS {:} {:}".format(
                fold, kf.tr_classes[fold],
                kf.tr_idx[fold], ds.Y[kf.tr_idx[fold]],
                kf.ts_idx[fold], ds.Y[kf.ts_idx[fold]]))
        self.assert_array_equal(expected_tr[fold], kf.tr_idx[fold])
        self.assert_array_equal(expected_ts[fold], kf.ts_idx[fold])
def setUp(self):
    """Prepare two score sets from one SVM and two ROC objects
    (one raw, one averaged over the two repetitions)."""
    self.ds_loader = CDLRandom(n_features=1000, n_redundant=200,
                               n_informative=250, n_clusters_per_class=2)
    self.ds1 = self.ds_loader.load()
    self.ds2 = self.ds_loader.load()

    self.y1 = self.ds1.Y
    self.y2 = self.ds2.Y

    self.svm = CClassifierSVM(C=1e-7).fit(self.ds1.X, self.ds1.Y)

    _, self.s1 = self.svm.predict(self.ds1.X, return_decision_function=True)
    _, self.s2 = self.svm.predict(self.ds2.X, return_decision_function=True)

    # Keep only the score of the positive class, flattened
    self.s1 = self.s1[:, 1].ravel()
    self.s2 = self.s2[:, 1].ravel()

    # Roc with not computed average (2 repetitions)
    self.roc_nomean = CRoc()
    self.roc_nomean.compute([self.y1, self.y2], [self.s1, self.s2])

    # Roc with average (2 repetitions)
    self.roc_wmean = CRoc()
    self.roc_wmean.compute([self.y1, self.y2], [self.s1, self.s2])
    self.roc_wmean.average()
def test_params_multiclass(self):
    """Parameter estimation for multiclass classifiers.

    The original ran two byte-identical estimation passes (with and
    without a 'min-max' preprocessor); they are folded into one loop
    over the preprocess option. Grid, expected best parameters and
    verbosity are unchanged.
    """
    # Create dummy dataset (we want a test different from train)
    tr = CDLRandom(n_classes=4, n_clusters_per_class=1,
                   random_state=50000).load()

    xval_parameters = {'C': [1, 10, 100], 'kernel.gamma': [0.1, 1]}
    expected = {'C': 10.0, 'kernel.gamma': 0.1}

    for preprocess in (None, 'min-max'):
        if preprocess is not None:
            self.logger.info("Testing with preprocessor")
        # Fresh kernel each pass, exactly as the duplicated code did
        multiclass = CClassifierMulticlassOVA(
            CClassifierSVM, C=1, kernel=CKernel.create('rbf'),
            preprocess=preprocess)
        multiclass.verbose = 1
        self._run_multiclass(tr, multiclass, xval_parameters, expected)
def setUp(self):
    """Build a 2D dataset (dense and sparse) and one SVM per kernel."""
    # generate synthetic data
    self.dataset = CDLRandom(n_features=2, n_redundant=0, n_informative=1,
                             n_clusters_per_class=1, random_state=1).load()
    self.dataset_sparse = self.dataset.tosparse()

    kernel_types = (None, CKernelLinear, CKernelRBF, CKernelPoly)
    # None means "use the classifier's default kernel"
    self.svms = [CClassifierSVM(
        kernel=None if k is None else k()) for k in kernel_types]
    self.logger.info("Testing SVM with kernel functions: %s",
                     str(kernel_types))

    for svm in self.svms:
        # Enabling debug output for each classifier
        svm.verbose = 2

    self.logger.info("." * 50)
    self.logger.info("Number of Patterns: %s",
                     str(self.dataset.num_samples))
    self.logger.info("Features: %s", str(self.dataset.num_features))
def setUp(self):
    """Test for init and fit methods.

    FIX: corrected typo in the log message ("kernel unctions" ->
    "kernel functions"). No behavioral change otherwise.
    """
    # TODO: remove this filter when `kernel` parameter is removed from
    # Ridge Classifier
    self.logger.filterwarnings("ignore",
                               message="`kernel` parameter.*",
                               category=DeprecationWarning)

    # generate synthetic data
    self.dataset = CDLRandom(n_features=100, n_redundant=20,
                             n_informative=25, n_clusters_per_class=2,
                             random_state=0).load()
    self.dataset.X = CNormalizerMinMax().fit_transform(self.dataset.X)

    kernel_types = (None, CKernelLinear, CKernelRBF, CKernelPoly)
    self.ridges = [
        CClassifierRidge(kernel=kernel() if kernel is not None else None)
        for kernel in kernel_types]

    self.logger.info("Testing RIDGE with kernel functions: %s",
                     str(kernel_types))

    for ridge in self.ridges:
        ridge.verbose = 2  # Enabling debug output for each classifier
        ridge.fit(self.dataset)
def setUp(self):
    """Create a 4-class 2D dataset, normalize it and fit an OVA SVM."""
    import numpy as np
    np.random.seed(12345678)

    # generate synthetic data
    self.ds = CDLRandom(n_classes=3, n_features=2, n_redundant=0,
                        n_clusters_per_class=1, class_sep=1,
                        random_state=0).load()

    # Add a new class modifying one of the existing clusters:
    # samples in the upper-right region get a brand-new label
    region = (self.ds.X[:, 0] > 0).logical_and(self.ds.X[:, 1] > 1)
    self.ds.Y[region.ravel()] = self.ds.num_classes

    self.kernel = CKernelRBF(gamma=10)

    # Data normalization
    self.normalizer = CNormalizerMinMax()
    self.ds.X = self.normalizer.fit_transform(self.ds.X)

    self.multiclass = CClassifierMulticlassOVA(
        classifier=CClassifierSVM, class_weight='balanced',
        preprocess=None, kernel=self.kernel)
    self.multiclass.verbose = 0

    # Training and classification
    self.multiclass.fit(self.ds.X, self.ds.Y)
    self.y_pred, self.score_pred = self.multiclass.predict(
        self.ds.X, return_decision_function=True)
def test_stratifiedkfold(self):
    """Stratified K-Fold must reproduce sklearn's fold indices."""
    ds = CDLRandom(n_samples=10, random_state=0).load()

    self.logger.info("Testing Stratified K-Fold")

    kf = CDataSplitterStratifiedKFold(
        num_folds=2, random_state=5000).compute_indices(ds)

    import sklearn
    # NOTE(review): lexicographic version compare; fine for the versions
    # in scope but would misorder e.g. '0.100' — confirm before bumping.
    if sklearn.__version__ < '0.22':
        # TODO: REMOVE AFTER BUMPING DEPS
        # v0.22 changed the model to fix an issue related test set size
        # https://github.com/scikit-learn/scikit-learn/pull/14704
        expected_tr = [CArray([4, 5, 6, 9]),
                       CArray([0, 1, 2, 3, 7, 8])]
        expected_ts = [CArray([0, 1, 2, 3, 7, 8]),
                       CArray([4, 5, 6, 9])]
    else:
        expected_tr = [CArray([1, 2, 7, 8, 9]),
                       CArray([0, 3, 4, 5, 6])]
        expected_ts = [CArray([0, 3, 4, 5, 6]),
                       CArray([1, 2, 7, 8, 9])]

    self.assertEqual(len(kf.tr_idx), 2)
    self.assertEqual(len(kf.ts_idx), 2)

    self.logger.info("DS classes:\n{:}".format(ds.Y))

    for fold in range(kf.num_folds):
        self.logger.info("{:} fold: \nTR {:} \nTS {:}"
                         "".format(fold, kf.tr_idx[fold], kf.ts_idx[fold]))
        self.assert_array_equal(expected_tr[fold], kf.tr_idx[fold])
        self.assert_array_equal(expected_ts[fold], kf.ts_idx[fold])
def test_shuffle(self):
    """Shuffle splitter must reproduce the precomputed random indices."""
    ds = CDLRandom(n_samples=10, random_state=0).load()

    self.logger.info("Testing Shuffle ")

    kf = CDataSplitterShuffle(
        num_folds=2, train_size=0.2,
        random_state=5000).compute_indices(ds)

    # Expected per-fold training and test indices
    expected_tr = [CArray([1, 2]), CArray([9, 3])]
    expected_ts = [CArray([6, 4, 7, 0, 3, 9, 5, 8]),
                   CArray([7, 5, 4, 0, 8, 2, 6, 1])]

    self.assertEqual(len(kf.tr_idx), 2)
    self.assertEqual(len(kf.ts_idx), 2)

    self.logger.info("DS classes:\n{:}".format(ds.Y))

    for fold in range(kf.num_folds):
        self.logger.info("{:} fold: \nTR {:} \nTS {:}"
                         "".format(fold, kf.tr_idx[fold], kf.ts_idx[fold]))
        self.assert_array_equal(expected_tr[fold], kf.tr_idx[fold])
        self.assert_array_equal(expected_ts[fold], kf.ts_idx[fold])
def test_plot_decision_function(self):
    """Test plot of multiclass classifier decision function."""
    # generate synthetic data
    ds = CDLRandom(n_classes=3, n_features=2, n_redundant=0,
                   n_clusters_per_class=1, class_sep=1,
                   random_state=0).load()

    multiclass = CClassifierMulticlassOVA(
        classifier=CClassifierSVM,
        class_weight='balanced', preprocess='min-max')

    # Training and classification
    multiclass.fit(ds.X, ds.Y)
    y_pred, score_pred = multiclass.predict(
        ds.X, return_decision_function=True)

    def plot_hyperplane(img, clf, min_v, max_v, linestyle, label):
        """Plot the hyperplane associated to the OVA clf."""
        # make sure the line is long enough
        xx = CArray.linspace(min_v - 5, max_v + 5)
        # get the separating hyperplane: w0*x + w1*y + b = 0
        yy = -(clf.w[0] * xx + clf.b) / clf.w[1]
        img.sp.plot(xx, yy, linestyle, label=label)

    fig = CFigure(height=7, width=8)
    fig.sp.title('{:} ({:})'.format(multiclass.__class__.__name__,
                                    multiclass.classifier.__name__))

    x_bounds, y_bounds = ds.get_bounds()

    styles = ['go-', 'yp--', 'rs-.', 'bD--', 'c-.', 'm-', 'y-.']

    for c_idx, c in enumerate(ds.classes):
        style = styles[c_idx]
        # Plot boundary and predicted label for each OVA classifier
        plot_hyperplane(fig, multiclass._binary_classifiers[c_idx],
                        x_bounds[0], x_bounds[1], style,
                        'Boundary\nfor class {:}'.format(c))
        # True samples of class c (small filled markers)
        fig.sp.scatter(ds.X[ds.Y == c, 0], ds.X[ds.Y == c, 1],
                       s=40, c=style[0])
        # Predicted samples of class c (large hollow markers)
        fig.sp.scatter(ds.X[y_pred == c, 0], ds.X[y_pred == c, 1],
                       s=160, edgecolors=style[0],
                       facecolors='none', linewidths=2)

    # Plotting multiclass decision function
    fig.sp.plot_decision_regions(multiclass, n_grid_points=100,
                                 grid_limits=ds.get_bounds(offset=5))

    fig.sp.xlim(x_bounds[0] - .5 * x_bounds[1],
                x_bounds[1] + .5 * x_bounds[1])
    fig.sp.ylim(y_bounds[0] - .5 * y_bounds[1],
                y_bounds[1] + .5 * y_bounds[1])

    fig.sp.legend(loc=4)  # lower, right

    fig.show()
def test_plot(self):
    """Compare the classifiers graphically."""
    ds = CDLRandom(n_features=2, n_redundant=0, n_informative=2,
                   n_clusters_per_class=1, random_state=0).load()
    ds.X = CNormalizerMinMax().fit_transform(ds.X)

    # Plot with the first (default-kernel) ridge and save to the figs dir
    fig = self._test_plot(self.ridges[0], ds)
    out_path = fm.join(fm.abspath(__file__), 'figs',
                       'test_c_classifier_ridge.pdf')
    fig.savefig(out_path)
def test_train_test_split(self):
    """Unshuffled train/test split must keep the original sample order,
    for both dense and sparse datasets."""
    ds = CDLRandom(n_samples=10, random_state=0).load()

    splitter = CTrainTestSplit(train_size=0.5, random_state=0,
                               shuffle=False)

    tr_idx, ts_idx = splitter.compute_indices(ds)

    self.logger.info("TR IDX:\n{:}".format(tr_idx))
    self.logger.info("TS IDX:\n{:}".format(ts_idx))

    # Without shuffling, the first half is train, the second half test
    expected_tr_idx = CArray([0, 1, 2, 3, 4])
    expected_ts_idx = CArray([5, 6, 7, 8, 9])

    self.assertIsInstance(tr_idx, CArray)
    self.assertIsInstance(ts_idx, CArray)

    self.assertFalse((tr_idx != expected_tr_idx).any())
    self.assertFalse((ts_idx != expected_ts_idx).any())

    tr, ts = splitter.split(ds)

    expected_tr = ds[tr_idx, :]
    expected_ts = ds[ts_idx, :]

    self.assertIsInstance(tr, CDataset)
    self.assertIsInstance(ts, CDataset)

    self.assertFalse((tr.X != expected_tr.X).any())
    self.assertFalse((tr.Y != expected_tr.Y).any())
    self.assertFalse((ts.X != expected_ts.X).any())
    self.assertFalse((ts.Y != expected_ts.Y).any())

    self.logger.info("Testing splitting of sparse dataset")

    sparse_ds = CDLRandom(n_samples=10, random_state=0).load().tosparse()

    splitter = CTrainTestSplit(train_size=0.25, random_state=0,
                               shuffle=False)
    tr, ts = splitter.split(sparse_ds)

    self.assertEqual(2, tr.num_samples)
    self.assertEqual(8, ts.num_samples)

    # Sparsity must be preserved by the split
    self.assertTrue(tr.issparse)
    self.assertTrue(ts.issparse)
def setUp(self):
    """Fit a default SVM on a normalized 2D random dataset."""
    self.clf = CClassifierSVM()
    self.dataset = CDLRandom(n_features=2, n_redundant=0,
                             n_informative=1,
                             n_clusters_per_class=1).load()
    # Scale features to [0, 1] before training
    self.dataset.X = CNormalizerMinMax().fit_transform(self.dataset.X)
    self.clf.fit(self.dataset.X, self.dataset.Y)
def setUp(self):
    """Fit an OVA SVM on a 3-class dataset and store its predictions."""
    self.ds = CDLRandom(n_classes=3, n_samples=50, random_state=0,
                        n_informative=3).load()

    self.logger.info("Fit an SVM and classify dataset...")
    self.ova = CClassifierMulticlassOVA(CClassifierSVM)
    self.ova.fit(self.ds.X, self.ds.Y)
    self.labels, self.scores = self.ova.predict(
        self.ds.X, return_decision_function=True)
def setUp(self):
    """Test for init and fit methods."""
    self.dataset = CDLRandom(n_features=2, n_redundant=0,
                             n_informative=1,
                             n_clusters_per_class=1).load()
    # Scale features to [0, 1]
    self.dataset.X = CNormalizerMinMax().fit_transform(self.dataset.X)
    self.nc = CClassifierNearestCentroid()
def setUp(self):
    """Train an SVM on a small random dataset and keep its predictions."""
    self.ds = CDLRandom(n_samples=50, random_state=0).load()

    self.logger.info("Train an SVM and classify dataset...")
    self.svm = CClassifierSVM()
    self.svm.fit(self.ds.X, self.ds.Y)
    self.labels, self.scores = self.svm.predict(
        self.ds.X, return_decision_function=True)
def setUp(self):
    """Load a random dataset where every feature is informative."""
    self.n_classes = 3
    self.n_features = 5
    # n_informative == n_features, so there are no redundant features
    self.ds = CDLRandom(n_classes=self.n_classes,
                        n_features=self.n_features,
                        n_informative=self.n_features,
                        n_redundant=0).load()
    self.logger.info("num_samples: {}, num_classes: {:}".format(
        self.ds.num_samples, self.ds.num_classes))
def setUpClass(cls):
    """Load a seeded 2D dataset once for the whole test class,
    in both dense and sparse form."""
    CUnitTest.setUpClass()
    cls.seed = 2
    cls.ds = CDLRandom(n_features=2, n_redundant=0, n_informative=2,
                       n_clusters_per_class=1,
                       random_state=cls.seed).load()
    cls.ds_sparse = cls.ds.tosparse()
def setUp(self):
    """Test for init and fit methods."""
    # generate synthetic data
    self.dataset = CDLRandom(n_features=2, n_redundant=0,
                             n_informative=1, n_clusters_per_class=1,
                             random_state=99).load()
    # Scale features to [0, 1]
    self.dataset.X = CNormalizerMinMax().fit_transform(self.dataset.X)

    self.logger.info("Testing classifier creation ")
    self.log = CClassifierLogistic(random_state=99)
def test_preprocess(self):
    """Test classifier with preprocessors inside."""
    ds = CDLRandom().load()

    # All linear transformations
    self._test_preprocess(
        ds, self.nc,
        ['min-max', 'mean-std'],
        [{'feature_range': (-1, 1)}, {}])

    # Mixed linear/nonlinear transformations
    self._test_preprocess(ds, self.nc, ['pca', 'unit-norm'], [{}, {}])
def setUp(self):
    """Split a 3-class 2D dataset in half and fit a 3-NN classifier."""
    full_ds = CDLRandom(n_samples=100, n_classes=3, n_features=2,
                        n_redundant=0, n_informative=2,
                        n_clusters_per_class=1,
                        random_state=10000).load()
    # First 50 samples for training, remaining 50 for testing
    self.dataset = full_ds[:50, :]
    self.test = full_ds[50:, :]

    self.logger.info("Initializing KNeighbors Classifier... ")
    self.knn = CClassifierKNN(n_neighbors=3)
    self.knn.fit(self.dataset)
def _create_tr_ts():
    """Create BLOBS training and test sets."""
    # Feature count matches a flattened 3x224x224 image
    ds = CDLRandom(n_samples=30, n_features=3 * 224 * 224).load()

    # Split in training and test
    tr, ts = CTrainTestSplit(train_size=10, test_size=20,
                             random_state=0).split(ds)

    # Fit the scaler on training data, then apply it to both sets
    normalizer = CNormalizerMinMax()
    tr.X = normalizer.fit_transform(tr.X)
    ts.X = normalizer.transform(ts.X)

    return tr, ts
def setUp(self):
    """Load train/test data from one loader and build an RBF SVM."""
    # Create dummy dataset (we want a test different from train)
    loader = CDLRandom(random_state=50000)
    self.training_dataset = loader.load()
    self.test_dataset = loader.load()

    # CREATE CLASSIFIERS
    self.svm = CClassifierSVM(kernel=CKernel.create('rbf'))
    self.svm.verbose = 1

    self.logger.info(
        "Using kernel {:}".format(self.svm.kernel.class_type))
def setUp(self):
    """Build two differently-seeded datasets, one fitted SVM and a ROC."""
    # Two loaders with different seeds so the datasets differ
    self.dl1 = CDLRandom(n_features=1000, n_redundant=200,
                         n_informative=250, n_clusters_per_class=2,
                         random_state=0)
    self.dl2 = CDLRandom(n_features=1000, n_redundant=200,
                         n_informative=250, n_clusters_per_class=2,
                         random_state=1000)
    self.ds1 = self.dl1.load()
    self.ds2 = self.dl2.load()

    self.svm = CClassifierSVM(C=1e-7).fit(self.ds1.X, self.ds1.Y)

    self.y1, self.s1 = self.svm.predict(
        self.ds1.X, return_decision_function=True)
    self.y2, self.s2 = self.svm.predict(
        self.ds2.X, return_decision_function=True)

    self.roc = CRoc()
def setUp(self):
    """Attach an ISO-8601 timestamp header to a small random dataset."""
    self.ds = CDLRandom(n_samples=10, random_state=0).load()

    # One timestamp per sample, unsorted on purpose
    timestamps = CArray(['2016-02-17T10:35:58',
                         '2014-04-04T22:24:22',
                         '2016-08-07T17:10:36',
                         '2014-05-22T11:02:58',
                         '2016-07-01T07:12:34',
                         '2016-01-03T13:10:38',
                         '2014-07-28T23:42:00',
                         '2014-07-08T09:42:42',
                         '2016-05-06T18:38:08',
                         '2015-11-03T21:07:04'])

    self.ds.header = CDatasetHeader(
        timestamp=timestamps, timestamp_fmt='%Y-%m-%dT%H:%M:%S')
def test_preprocess(self):
    """Test classifier with preprocessors inside."""
    ds = CDLRandom().load()

    # All linear transformations with gradient implemented
    linear_chain = ['min-max', 'mean-std']
    linear_args = [{'feature_range': (-1, 1)}, {}]
    self._test_preprocess(ds, self.log, linear_chain, linear_args)
    self._test_preprocess_grad(ds, self.log, linear_chain, linear_args)

    self.logger.info("The following case will skip the gradient test")
    # Mixed linear/nonlinear transformations without gradient
    self._test_preprocess(ds, self.log, ['pca', 'unit-norm'], [{}, {}])
def test_preprocess(self):
    """Test classifier with preprocessors inside."""
    ds = CDLRandom().load()
    clf = CClassifierRidge()

    # All linear transformations with gradient implemented
    linear_chain = ['min-max', 'mean-std']
    linear_args = [{'feature_range': (-1, 1)}, {}]
    self._test_preprocess(ds, clf, linear_chain, linear_args)
    self._test_preprocess_grad(ds, clf, linear_chain, linear_args)

    # Mixed linear/nonlinear transformations without gradient
    self._test_preprocess(ds, clf, ['pca', 'unit-norm'], [{}, {}])
def setUpClass(cls):
    """Train a small PyTorch MLP on random data and wrap it in a
    CNormalizerDNN shared by all tests of this class."""
    cls.ds = CDLRandom(n_samples=40, n_classes=3,
                       n_features=20, n_informative=15,
                       random_state=0).load()

    # 20 -> 40 -> 3 fully-connected network with SGD training
    model = mlp(input_dims=20, hidden_dims=(40,), output_dims=3)
    loss = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=1e-1)
    cls.net = CClassifierPyTorch(model=model, loss=loss,
                                 optimizer=optimizer, random_state=0,
                                 epochs=10, pretrained=True)
    cls.net.fit(cls.ds.X, cls.ds.Y)

    cls.norm = CNormalizerDNN(net=cls.net)

    CPreProcessTestCases.setUpClass()