Ejemplo n.º 1
0
    def run(self, x, y, ds_init=None, *args, **kargs):
        """Run the attack on every sample flagged by `is_attack_class`.

        Each attackable sample is optimized independently through
        `self._run`; the optimized point replaces the original one in a
        copy of the input dataset and is then classified through
        `self.problem.model_wrapper`.

        Parameters
        ----------
        x : array-like
            Input samples, converted to a 2-D CArray.
        y : array-like
            Labels of the samples, converted to a 2-D CArray.
        ds_init : object with an `X` attribute or None, optional
            If given, `ds_init.X` supplies the starting point of each
            optimization. Default None: start from the sample itself.
        *args, **kargs
            Extra arguments forwarded unchanged to `self._run`.

        Returns
        -------
        y_pred : CArray
            Label predicted for each optimized point.
        scores : CArray
            Two-column array holding the first row of the decision
            function returned for each optimized point.
        adv_ds : CDataset
            Copy of the input data where the attacked rows have been
            overwritten with the optimized points.
        f_obj : scalar
            Mean of the objective values returned by `self._run`.

        """
        x = CArray(x).atleast_2d()
        y = CArray(y).atleast_2d()
        if ds_init is None:
            x_init = None
        else:
            x_init = CArray(ds_init.X).atleast_2d()

        # Restrict the attack to the samples that can be manipulated
        attackable = self.is_attack_class(y)
        attack_rows = CArray(attackable.find(attackable)).ravel()
        n_attacked = attack_rows.size

        # The returned dataset starts as a deep copy of the inputs
        adv_ds = CDataset(x.deepcopy(), y.deepcopy())

        # Remember that sparse data is being processed
        if x.issparse is True:
            self._issparse = True

        # Per-sample outputs: objective value, predicted label, scores
        fs_opt = CArray.zeros(n_attacked, )
        y_pred = CArray.zeros(n_attacked, )
        scores = CArray.zeros((n_attacked, 2))

        for pos in range(n_attacked):
            row = attack_rows[pos].item()  # dataset row being attacked

            start = x[row, :] if x_init is None else x_init[row, :]
            x_opt, f_opt = self._run(
                x[row, :], y[row], *args, x_init=start, **kargs)

            progress = "Point: {:}/{:}, dmax:{:}, f(x):{:}, eval:{:}/{:}"
            self.logger.info(progress.format(
                row, x.shape[0], self._dmax, f_opt, self.f_eval,
                self.grad_eval))

            opt_len = x_opt.shape[-1]
            if opt_len > adv_ds.X.shape[-1]:
                # CDataset cannot hold rows of different lengths, so the
                # whole pattern matrix is widened (new cells are set to 256)
                adv_ds.X = adv_ds.X.resize((adv_ds.X.shape[0], opt_len),
                                           256)
            adv_ds.X[row, :min(adv_ds.X.shape[-1], opt_len)] = x_opt

            fs_opt[pos] = f_opt
            y_p, score = self.problem.model_wrapper.predict(
                x_opt, return_decision_function=True)
            scores[pos, :] = score[0, :]
            y_pred[pos] = y_p

        # Mean objective value over the attacked points
        f_obj = fs_opt.mean()

        return y_pred, scores, adv_ds, f_obj
    def fit(self, dataset, n_jobs=1):
        """Fit the classifier on a training dataset.

        When a preprocess is set, it is fitted on the training patterns
        and its output is used for training in place of the raw patterns.

        Parameters
        ----------
        dataset : CDataset
            Training set: patterns and their corresponding labels.
        n_jobs : int, optional
            Number of parallel workers, forwarded to `_fit`. Default 1.

        Returns
        -------
        trained_cls : CClassifier
            Whatever `_fit` returns for the (possibly transformed) data.

        """
        self._n_features = dataset.num_features

        preprocessor = self.preprocess
        patterns = dataset.X
        if preprocessor is not None:
            # Fit the preprocess on the raw patterns, train on its output
            patterns = preprocessor.fit_transform(dataset.X)

        training_set = CDataset(patterns, dataset.Y)
        return self._fit(training_set, n_jobs=n_jobs)
Ejemplo n.º 3
0
    def load(self):
        """Load the scikit-learn toy dataset selected by `self.toy`.

        The matching `sklearn.datasets.load_*` function is imported and
        called while holding the class-level lock.

        Returns
        -------
        dataset : CDataset
            The whole toy dataset if `self.class_list` is None, otherwise
            the subset produced by `_select_classes` for that class list.

        Raises
        ------
        ValueError
            If `self.toy` is not one of 'iris', 'digits', 'boston',
            'diabetes'.

        """
        with CDLRandomToy.__lock:
            if self.toy == 'iris':
                from sklearn.datasets import load_iris
                toy_data = load_iris()
            elif self.toy == 'digits':
                from sklearn.datasets import load_digits
                toy_data = load_digits()
            elif self.toy == 'boston':
                # NOTE(review): load_boston was removed in scikit-learn 1.2,
                # so on recent releases this import raises ImportError.
                from sklearn.datasets import load_boston
                toy_data = load_boston()
            elif self.toy == 'diabetes':
                from sklearn.datasets import load_diabetes
                toy_data = load_diabetes()
            else:
                raise ValueError("toy dataset {:} is not available.".format(
                    self.toy))

        patterns = CArray(toy_data.data)
        labels = CArray(toy_data.target)

        if self.class_list is None:
            return CDataset(patterns, labels)

        # Keep only the samples of the requested classes
        return self._select_classes(self.class_list, patterns, labels)
Ejemplo n.º 4
0
    def _select_classes(self, class_list, patterns, labels):
        """Build a dataset holding only the samples of the given classes.

        Samples are gathered class by class, in the order of `class_list`.
        If `self.zero_one` is True the selected labels are then remapped:
        the smallest requested class becomes 0 and the other one becomes 1
        (a single requested class becomes 0).

        Parameters
        ----------
        class_list : list
            Labels of the classes to keep. It is not modified.
        patterns : CArray
            Patterns of the full dataset, one sample per row.
        labels : CArray
            Labels of the full dataset.

        Returns
        -------
        CDataset
            Dataset with the selected patterns and (possibly remapped)
            labels.

        Raises
        ------
        ValueError
            If `self.zero_one` is True and more than 2 classes are
            requested.

        """
        # Fail before doing any selection work if the request is invalid
        if self.zero_one is True and len(class_list) > 2:
            raise ValueError("you are try to convert to 0 1 label for a "
                             "dataset with more than 2 classes")

        sel_patterns = None
        sel_labels = None

        for single_class in class_list:
            this_class_pat_idx = labels.find(labels == single_class)

            if sel_patterns is None:
                sel_patterns = patterns[this_class_pat_idx, :]
                sel_labels = labels[this_class_pat_idx]
            else:
                sel_patterns = sel_patterns.append(
                    patterns[this_class_pat_idx, :], axis=0)
                sel_labels = sel_labels.append(labels[this_class_pat_idx])

        if self.zero_one is True:
            # sorted() works on a copy, so the caller's list (often
            # self.class_list) keeps its original order
            ordered_classes = sorted(class_list)
            # All masks are computed BEFORE any label is overwritten:
            # otherwise samples just relabelled to 0 would be caught by the
            # second assignment whenever the larger class label is 0
            masks = [sel_labels == cls for cls in ordered_classes]
            for new_label, mask in enumerate(masks):
                sel_labels[mask] = new_label

        return CDataset(sel_patterns, sel_labels)
Ejemplo n.º 5
0
    def _fit(self, x, y):
        """Fit one One-Vs-All binary classifier per class.

        Parameters
        ----------
        x : CArray
            Training patterns, shape (n_samples, n_features).
        y : CArray
            Class labels, shape (n_samples,).

        Returns
        -------
        trained_cls : CClassifierMulticlassOVA
            This instance, after its binary classifiers have been trained.

        """
        # One binary classifier for each distinct label found in y
        n_labels = y.unique().size
        self.prepare(n_labels)

        # Train every one-vs-all classifier using the configured workers
        trained = parfor2(_fit_one_ova, self.classes.size, self.n_jobs,
                          self, CDataset(x, y), self.verbose)
        self._binary_classifiers = trained

        return self
Ejemplo n.º 6
0
    def _fit(self, x, y):
        """Fit one One-Vs-One binary classifier per pair of classes.

        Parameters
        ----------
        x : CArray
            Training patterns, shape (n_samples, n_features).
        y : CArray
            Class labels, shape (n_samples,).

        Returns
        -------
        trained_cls : CClassifierMulticlassOVO
            This instance, after its binary classifiers have been trained.

        """
        n_classes = y.unique().size

        # Every unordered pair of class indices gets its own classifier,
        # i.e. n_classes * (n_classes - 1) / 2 classifiers in total
        class_pairs = list(combinations(range(n_classes), 2))

        self.prepare(len(class_pairs))
        self._clf_pair_idx = class_pairs

        # Train every one-vs-one classifier using the configured workers
        trained = parfor2(_fit_one_ovo, self.num_classifiers, self.n_jobs,
                          self, CDataset(x, y), self.verbose)
        self._binary_classifiers = trained

        return self
Ejemplo n.º 7
0
    def binarize_subset(tr_class_idx, vs_class_idx, dataset):
        """Extract the binary problem `tr_class_idx` vs `vs_class_idx`.

        Only the samples of the two classes are kept; their labels are
        produced by `get_labels_ovr` with the target class as reference.

        Parameters
        ----------
        tr_class_idx : int
            Position of the target class inside `dataset.classes`.
        vs_class_idx : int
            Position of the opposing class inside `dataset.classes`.
        dataset : CDataset
            Dataset the subset is extracted from.

        Returns
        -------
        bin_subset : CDataset
            Binarized subset, carrying the header of the input dataset.

        """
        target_label = dataset.classes[tr_class_idx]
        opposing_label = dataset.classes[vs_class_idx]

        target_rows = dataset.Y.find(dataset.Y == target_label)
        opposing_rows = dataset.Y.find(dataset.Y == opposing_label)

        # Target samples first, opposing samples after
        # (assumes find() returns lists, so `+` concatenates - TODO confirm)
        pair_ds = dataset[target_rows + opposing_rows, :]

        # get_labels_ovr is reused instead of duplicating the binarization
        pair_x = pair_ds.X
        pair_y = pair_ds.get_labels_ovr(target_label)
        return CDataset(pair_x, pair_y, header=dataset.header)
    def _clf_poisoning(self):
        """Compute a poisoning point from {xc, yc} and measure its effect.

        The attack is run from `self.xc`/`self.yc`, its statistics are
        logged, and a copy of `self.classifier` is retrained on the
        training set extended with the poisoning point. The test error
        before and after poisoning is logged.

        Returns
        -------
        pois_clf
            Copy of the classifier retrained on the poisoned training set.
        xc
            Poisoning point returned by the attack.

        """
        attack = self.poisoning
        log = self.logger.info

        xc = attack._run(self.xc, self.yc)

        log("Starting score: " + str(attack.f_seq[0]))
        log("Final score: " + str(attack.f_seq[-1]))
        log("x*: " + str(xc))
        log("Point sequence: " + str(attack.x_seq))
        log("Score sequence: : " + str(attack.f_seq))
        log("Fun Eval: " + str(attack.f_eval))
        log("Grad Eval: " + str(attack.grad_eval))

        metric = CMetric.create('accuracy')

        def _test_error(model):
            # Classification error of `model` on the test set
            labels, _ = model.predict(self.ts.X,
                                      return_decision_function=True)
            accuracy = metric.performance_score(y_true=self.ts.Y,
                                                y_pred=labels)
            return 1 - accuracy

        log("Error on testing data: " + str(_test_error(self.classifier)))

        # Retrain a copy of the classifier with the poisoning point added
        tr = self.tr.append(CDataset(xc, self.yc))
        pois_clf = self.classifier.deepcopy()
        pois_clf.fit(tr.X, tr.Y)

        log("Error on testing data (poisoned): " +
            str(_test_error(pois_clf)))

        return pois_clf, xc
    def test_pretrained(self):
        """Test wrapping of pretrained models."""
        from sklearn import datasets, svm
        from secml.core.exceptions import NotFittedError

        iris = datasets.load_iris()
        samples, targets = iris.data, iris.target

        sk_model = svm.SVC(kernel='linear')

        # Predicting through a wrapper around an unfitted model must fail
        with self.assertRaises(NotFittedError):
            wrapper = CClassifierSkLearn(sk_model)
            wrapper.predict(CArray(samples))

        # Reference labels: the model trained directly with scikit-learn
        sk_model.fit(samples, targets)
        y_pred = sk_model.predict(samples)

        # Labels obtained by training a fresh model through the wrapper
        wrapper = CClassifierSkLearn(svm.SVC(kernel='linear'))
        wrapper.fit(CDataset(samples, targets))
        y_pred_secml = wrapper.predict(CArray(samples))

        self.logger.info(
            "Predicted labels by pretrained model:\n{:}".format(y_pred))
        self.logger.info(
            "Predicted labels by our fit:\n{:}".format(y_pred_secml))

        self.assert_array_equal(y_pred, y_pred_secml)
Ejemplo n.º 10
0
    def load(self):
        """Generate a random classification dataset.

        The loader attributes are forwarded, name by name, to
        `sklearn.datasets.make_classification`.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_classification

        # Each generator argument is read from the attribute of same name
        param_names = (
            'n_samples', 'n_features', 'n_informative', 'n_redundant',
            'n_repeated', 'n_classes', 'n_clusters_per_class', 'weights',
            'flip_y', 'class_sep', 'hypercube', 'shift', 'scale',
            'random_state',
        )
        params = {name: getattr(self, name) for name in param_names}

        patterns, labels = make_classification(**params)
        return CDataset(patterns, labels)
Ejemplo n.º 11
0
 def objective_function(self, xc, yc):
     """Evaluate the attack objective for the poisoning points (xc, yc).

     A copy of `self.clf` is retrained on `self.tr` extended with the
     poisoning points, then used to predict the labels of `self.ts.X`.

     Returns the mean predicted label over the samples where
     `self.unprivileged()` equals 1, divided by the mean predicted label
     over the samples where it equals 0.
     """
     # Retrain a copy of the classifier on the poisoned training set
     retrained = self.clf.deepcopy()
     retrained.fit(self.tr.append(CDataset(xc, yc)))

     predictions = retrained.predict(self.ts.X)
     group = self.unprivileged()

     mean_group_one = predictions[group == 1].mean()
     mean_group_zero = predictions[group == 0].mean()
     return mean_group_one / mean_group_zero
Ejemplo n.º 12
0
def filter_transform(ds, labels, n_ds=None, transform=img_to_tensor, bin_label=False):
    """Keep the samples of `ds` whose label is in `labels` and transform them.

    Parameters
    ----------
    ds : CDataset
        Source dataset.
    labels : sequence
        Labels to keep.
    n_ds : int or None, optional
        If given, that many of the kept samples are drawn at random,
        without replacement. Default None: keep them all.
    transform : callable, optional
        Applied to the selected patterns before building the result.
    bin_label : bool, optional
        If True, labels become the result of comparing them with
        `labels[0]`. Default False.

    Returns
    -------
    CDataset
        Transformed patterns with integer labels.

    """
    keep = []
    for position, label in enumerate(ds.Y):
        if label in labels:
            keep.append(position)

    if n_ds is not None:
        # Random subsample of the matching positions
        keep = CArray(np.random.choice(a=keep, size=n_ds, replace=False))

    selected_x = ds.X[keep, :]
    selected_y = ds.Y[keep]
    if bin_label:
        selected_y = selected_y == labels[0]

    return CDataset(x=transform(selected_x), y=selected_y.astype(int))
Ejemplo n.º 13
0
    def fit(self, dataset, n_jobs=1):
        """Trains the classifier.

        If a preprocess has been specified, it is fitted on the training
        patterns and its output is used for training.

        For multiclass case see `.CClassifierMulticlass`.

        Parameters
        ----------
        dataset : CDataset
            Training set. Must be a :class:`.CDataset` instance with
            patterns data and corresponding labels.
        n_jobs : int
            Number of parallel workers to use for training the classifier.
            Default 1. Only forwarded to `_fit` if `_fit` accepts it.

        Returns
        -------
        trained_cls : CClassifier
            Instance of the classifier trained using input dataset.

        Raises
        ------
        TypeError
            If `dataset` is not a CDataset.

        """
        import inspect

        if not isinstance(dataset, CDataset):
            raise TypeError(
                "training set should be provided as a CDataset object.")

        # Storing dataset classes
        self._classes = dataset.classes
        self._n_features = dataset.num_features

        data_x = dataset.X
        # Transform data if a preprocess is defined
        if self.preprocess is not None:
            data_x = self.preprocess.fit_transform(dataset.X)

        # Decide up front whether _fit supports parallelization. Catching
        # TypeError around the training call would also swallow genuine
        # TypeErrors raised while training and silently train a second time
        try:
            fit_params = inspect.signature(self._fit).parameters.values()
        except (TypeError, ValueError):
            fit_params = None  # signature cannot be introspected

        if fit_params is None:
            # Last resort: try with n_jobs and retry without it
            try:
                self._fit(CDataset(data_x, dataset.Y), n_jobs=n_jobs)
            except TypeError:
                self._fit(CDataset(data_x, dataset.Y))
        elif any(p.name == 'n_jobs' or
                 p.kind is inspect.Parameter.VAR_KEYWORD
                 for p in fit_params):
            self._fit(CDataset(data_x, dataset.Y), n_jobs=n_jobs)
        else:
            # Parallelization is not supported by this classifier
            self._fit(CDataset(data_x, dataset.Y))

        return self
Ejemplo n.º 14
0
def test_poison(clf, tr, val, ts, x_poison, y_poison):
    """Retrain a copy of `clf` on poisoned data and evaluate it.

    The copy is re-initialised with `init()`, fitted on `tr` extended with
    the poisoning samples, and scored with `test_clf` on the test set
    first and on the validation set afterwards.

    Returns
    -------
    tuple
        (retrained classifier, test score, validation score).

    """
    poisoned_clf = clf.deepcopy()
    poisoned_clf.init()

    # Training set with the poisoning samples appended
    poisoned_tr = tr.append(CDataset(x_poison, y_poison))
    poisoned_clf.fit(poisoned_tr)

    score_on_test = test_clf(poisoned_clf, ts)
    score_on_val = test_clf(poisoned_clf, val)
    return poisoned_clf, score_on_test, score_on_val
Ejemplo n.º 15
0
    def load(self):
        """Generate a dataset of random binary patterns and labels.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset: `n_samples` x `n_features`
            patterns and `n_samples` labels, all drawn with
            `CArray.randint(2, ...)`.

        """
        n_samples, n_features = self.n_samples, self.n_features

        # Patterns are drawn before labels
        patterns = CArray.randint(2, shape=(n_samples, n_features))
        labels = CArray.randint(2, shape=(1, n_samples))

        return CDataset(patterns, labels)
    def test_save_and_load_svmlight_file(self):
        """Testing libsvm dataset loading and saving.

        The dataset built from `self.patterns` and `self.labels` is dumped
        to a file, loaded back as is, and loaded again with the all-zero
        columns removed. The file is deleted even if an assertion fails.
        """
        import errno

        self.logger.info("Testing libsvm dataset loading and saving...")

        test_file = fm.join(fm.abspath(__file__), "myfile.libsvm")

        def _remove_test_file():
            # Delete the test file; a missing file is not an error
            try:
                fm.remove_file(test_file)
            except (OSError, IOError) as e:
                if e.errno != errno.ENOENT:
                    raise

        # Cleaning test file
        _remove_test_file()

        try:
            self.logger.info("Patterns saved:\n{:}".format(self.patterns))
            self.logger.info("Labels saved:\n{:}".format(self.labels))

            CDataLoaderSvmLight.dump(CDataset(self.patterns, self.labels),
                                     test_file)

            new_dataset = CDataLoaderSvmLight().load(test_file)

            self.assertFalse((new_dataset.X != self.patterns).any())
            self.assertFalse((new_dataset.Y != self.labels).any())

            # load data but now remove all zero features (columns)
            new_dataset = CDataLoaderSvmLight().load(test_file,
                                                     remove_all_zero=True)

            self.logger.info("Patterns loaded:\n{:}".format(new_dataset.X))
            self.logger.info("Labels loaded:\n{:}".format(new_dataset.Y))
            self.logger.info("Mapping back:\n{:}".format(
                new_dataset.header.idx_mapping))

            self.assertTrue(new_dataset.X.issparse)
            self.assertTrue(new_dataset.Y.isdense)
            self.assertTrue(new_dataset.header.idx_mapping.isdense)

            # non-zero elements should be unchanged
            self.assertEqual(self.patterns.nnz, new_dataset.X.nnz)
            new_nnz_data = new_dataset.X.nnz_data
            self.assertFalse(
                (self.patterns.nnz_data != new_nnz_data.sort()).any())

            # With idx_mapping we should be able to reconstruct original data
            original = CArray.zeros(self.patterns.shape, sparse=True)
            original[:, new_dataset.header.idx_mapping] = new_dataset.X
            self.assertFalse((self.patterns != original).any())
        finally:
            # Cleaning test file, also when one of the checks above fails
            _remove_test_file()
Ejemplo n.º 17
0
    def fit_forward(self, x, y=None, caching=False):
        """Fit the estimator and return cross-validated forward scores.

        To avoid returning over-fitted scores on the training set, a
        5-fold cross validation is run on the training data: for each
        fold the estimator is fitted on the training part and the scores
        of the held-out part are stored. The estimator is finally fitted
        on the whole training set.

        Parameters
        ----------
        x : CArray
            Array with shape (n_samples, n_features) to be transformed and
            to be used for training.
        y : CArray or None, optional
            Array of shape (n_samples,) containing the class labels.
        caching : bool
            If True, `_forward_preprocess` is called on `x` with
            `caching=True` after the final fit.

        Returns
        -------
        CArray
            Held-out scores, one row per sample and one column per class.

        See Also
        --------
        fit : fit the preprocessor.
        forward : run forward function on input data.

        """
        # NOTE(review): `y` defaults to None but is indexed inside the loop
        # below, so None looks unsupported here - confirm with callers.
        splitter = CDataSplitterKFold(num_folds=5, random_state=0)
        kfold = splitter.compute_indices(CDataset(x, y))

        # NOTE(review): `self.classes` is read before any fit performed by
        # this method - presumably it must already be set; verify.
        scores = CArray.zeros(shape=(x.shape[0], self.classes.size))

        # TODO: samples can be first preprocessed and cached, if required.
        #  then we can use _fit and _forward to work on the preprocessed data
        for fold in range(kfold.num_folds):
            train_rows = kfold.tr_idx[fold]
            held_out_rows = kfold.ts_idx[fold]
            self.fit(x[train_rows, :], y[train_rows])
            scores[held_out_rows, :] = self.forward(
                x[held_out_rows, :], caching=False)

        # Final fit on the whole training set
        self.fit(x, y)

        # Cache x if required
        if caching is True:
            self._forward_preprocess(x, caching=True)

        return scores
Ejemplo n.º 18
0
    def load(self):
        """Generate a two-moons dataset with `sklearn.datasets.make_moons`.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_moons

        moons_params = dict(n_samples=self.n_samples,
                            noise=self.noise,
                            random_state=self.random_state)
        patterns, labels = make_moons(**moons_params)

        return CDataset(patterns, labels)
Ejemplo n.º 19
0
 def __init__(self, problem: CBlackBoxProblem, is_debug: bool = False):
     """Set up the attack around a black-box problem.

     The classifier held by `problem.model_wrapper` is passed to
     `CAttackEvasion.__init__` twice (as its first two classifier
     arguments), together with a fixed two-sample dataset as
     `surrogate_data` and no target label.

     Parameters
     ----------
     problem : CBlackBoxProblem
         Problem to attack; it must expose `model_wrapper`.
     is_debug : bool, optional
         Stored as `self.is_debug`. Default False.

     """
     wrapped_clf = problem.model_wrapper.classifier
     # Two-sample dataset handed over as surrogate data
     surrogate = CDataset(CArray([[0], [1]]), CArray([0, 1]))

     CAttackEvasion.__init__(
         self, wrapped_clf, wrapped_clf,
         surrogate_data=surrogate, y_target=None)

     self.problem = problem
     # Containers filled while the attack runs
     self.confidences_ = []
     self.changes_per_iterations_ = []
     self.model_wrapper = problem.model_wrapper
     self.is_debug = is_debug
     self._original_x = None
     self.minimization_result_ = []
Ejemplo n.º 20
0
    def load(self):
        """Generate circle patterns and compute their targets.

        Patterns come from `sklearn.datasets.make_circles`; the labels it
        generates are discarded and replaced by the output of
        `self._dts_function` evaluated on the patterns.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_circles

        circles_params = dict(n_samples=self.n_samples,
                              noise=self.noise,
                              factor=self.factor,
                              random_state=self.random_state)
        # Only the patterns (first returned item) are used
        patterns = make_circles(**circles_params)[0]

        targets = self._dts_function(patterns)
        return CDataset(patterns, targets)
Ejemplo n.º 21
0
    def load(self):
        """Generate 2-feature blob patterns and compute their targets.

        Patterns come from `sklearn.datasets.make_blobs`; the labels it
        generates are discarded and replaced by the output of
        `self._dts_function` evaluated on the patterns as a CArray.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_blobs

        blobs_params = dict(n_samples=self.n_samples,
                            n_features=2,
                            centers=self.centers,
                            cluster_std=self.cluster_std,
                            random_state=self.random_state)
        # Only the patterns (first returned item) are used
        patterns = make_blobs(**blobs_params)[0]

        targets = self._dts_function(CArray(patterns))
        return CDataset(patterns, targets)
    def _update_poisoned_clf(self, clf=None, tr=None,
                             train_normalizer=False):
        """Train a classifier on the training data plus the poisoning points.

        The poisoning points `self._xc` (labels `self._yc`) are appended to
        the training data and a deep copy of the classifier is fitted on
        the result. When the method is called without `clf` and `tr`, the
        last trained copy is reused as long as the hash of `self._xc` has
        not changed.

        Parameters
        ----------
        clf : classifier or None, optional
            Classifier to copy and retrain. Default None: `self._solver_clf`.
        tr : CDataset or None, optional
            Training data to poison. Default None: `self.surrogate_data`.
        train_normalizer : bool, optional
            Stored in the `retrain_normalizer` attribute of the copied
            classifier when `self.classifier.preprocess` is not None.
            Default False.

        Returns
        -------
        clf
            Classifier trained on the poisoned training data.
        tr : CDataset
            The poisoned training data.

        """
        # xc hashing is only valid if clf and tr do not change
        # (when calling update_poisoned_clf() without parameters)
        xc_hash_is_valid = clf is None and tr is None

        if clf is None:
            clf = self._solver_clf

        if tr is None:
            tr = self.surrogate_data

        tr = tr.append(CDataset(self._xc, self._yc))

        xc_hash = self._xc.sha1()

        # The cached classifier may only be reused for the default clf/tr:
        # with explicit arguments it could have been trained on a different
        # classifier or dataset even though xc is unchanged
        cache_hit = (xc_hash_is_valid and
                     self._xc_hash is not None and
                     self._xc_hash == xc_hash)

        if not cache_hit:
            # hash is stored only if update_poisoned_clf() is called w/out
            # pars; otherwise it is reset so the next default call retrains
            self._xc_hash = xc_hash if xc_hash_is_valid else None
            self._poisoned_clf = clf.deepcopy()

            # we assume that normalizer is not changing w.r.t xc!
            # so we avoid re-training the normalizer on dataset including xc
            if self.classifier.preprocess is not None:
                self._poisoned_clf.retrain_normalizer = train_normalizer

            self._poisoned_clf.fit(tr)

        return self._poisoned_clf, tr
    def test_openworldkfold_tr_class_skip(self):
        """Check that class 0 is never in training and always in test."""
        patterns = [[1, 2], [3, 4], [5, 6],
                    [10, 20], [30, 40], [50, 60],
                    [100, 200], [300, 400], [500, 600]]
        labels = [1, 2, 1, 2, 2, 0, 1, 0, 2]  # class 0 has 2 samples
        ds = CDataset(patterns, labels)

        # create 25 folds to increase the chance of getting the warning message
        n_folds = 25
        splitter = CDataSplitterOpenWorldKFold(
            num_folds=n_folds, n_train_samples=2, random_state=5000)
        kf = splitter.compute_indices(ds)

        for fold_indices in (kf.tr_idx, kf.ts_idx):
            self.assertEqual(len(fold_indices), n_folds)

        for fold_tr_idx, fold_ts_idx in kf:
            tr_labels = ds.Y[fold_tr_idx]
            self.assertTrue((tr_labels != 0).all())
            ts_labels = ds.Y[fold_ts_idx]
            self.assertTrue((ts_labels == 0).any())
Ejemplo n.º 24
0
    def load(self):
        """Generate a blobs dataset with `sklearn.datasets.make_blobs`.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_blobs

        # Each generator argument is read from the attribute of same name
        param_names = ('n_samples', 'n_features', 'centers', 'cluster_std',
                       'center_box', 'random_state')
        params = {name: getattr(self, name) for name in param_names}

        patterns, labels = make_blobs(**params)
        return CDataset(patterns, labels)
    def binarize_dataset(class_idx, dataset):
        """Build the dataset used by the binary classifier of one class.

        Patterns and header are taken from the input dataset; labels are
        those returned by `get_labels_ovr` for the selected class.

        Parameters
        ----------
        class_idx : int
            Position of the target class inside `dataset.classes`.
        dataset : CDataset
            Dataset to binarize.

        Returns
        -------
        bin_dataset : CDataset
            Binarized dataset.

        """
        target_label = dataset.classes[class_idx]
        ovr_labels = dataset.get_labels_ovr(target_label)
        return CDataset(dataset.X, ovr_labels, header=dataset.header)
    def load(self, min_faces_per_person=None, funneled=True, color=False):
        """Load the LFW dataset through `fetch_lfw_people`.

        Extra dataset attributes stored in the header:
         - 'img_w', 'img_h': size of the images in pixels.
         - 'y_names': tuple with the name string for each class.

        Parameters
        ----------
        min_faces_per_person : int or None, optional
            The extracted dataset will only retain pictures of people
            that have at least min_faces_per_person different pictures.
            Default None, so all db images are returned.
        funneled : bool, optional
            Download and use the images aligned with deep funneling.
            Default True.
        color : bool, optional
            Keep the 3 RGB channels instead of averaging them to a
            single gray level channel. Default False.

        Returns
        -------
        CDataset
            Image data, targets, and the header described above.

        """
        fetch_params = dict(data_home=SECML_DS_DIR,
                            funneled=funneled,
                            resize=1,
                            min_faces_per_person=min_faces_per_person,
                            color=color,
                            slice_=None,
                            download_if_missing=True)

        # Fetching (and the possible download) runs under the class lock
        with CDataLoaderLFW.__lock:
            lfw_people = fetch_lfw_people(**fetch_params)

        patterns = CArray(lfw_people.data)
        targets = CArray(lfw_people.target)

        # Axis 1 of `images` is used as height and axis 2 as width
        img_w = lfw_people.images.shape[2]
        img_h = lfw_people.images.shape[1]

        class_names = tuple(lfw_people.target_names.tolist())

        header = CDatasetHeader(img_w=img_w, img_h=img_h,
                                y_names=class_names)

        return CDataset(patterns, targets, header=header)
Ejemplo n.º 27
0
    def load(self):
        """Generate a regression dataset with `make_regression`.

        The loader attributes are forwarded, name by name, to
        `sklearn.datasets.make_regression`.

        Returns
        -------
        dataset : CDataset
            The randomly generated dataset.

        """
        from sklearn.datasets import make_regression

        # Each generator argument is read from the attribute of same name
        param_names = ('n_samples', 'n_features', 'n_informative',
                       'n_targets', 'bias', 'effective_rank',
                       'tail_strength', 'noise', 'random_state')
        params = {name: getattr(self, name) for name in param_names}

        patterns, labels = make_regression(**params)
        return CDataset(patterns, labels)
Ejemplo n.º 28
0
    def _fit(self, x, y):
        """Fit the KNeighbors classifier and keep the training data.

        The training dataset is stored in `self._tr` before delegating
        the actual training to `CClassifierSkLearn._fit`.

        Parameters
        ----------
        x : CArray
            Training patterns, shape (n_samples, n_features).
        y : CArray
            Class labels, shape (n_samples,).

        Returns
        -------
        CClassifierKNN
            Whatever `CClassifierSkLearn._fit` returns.

        """
        # Stored for later use (e.g. by the kneighbors() method)
        training_set = CDataset(x, y)
        self._tr = training_set

        trained = CClassifierSkLearn._fit(self, x, y)
        return trained
Ejemplo n.º 29
0
    def test_custom_attr(self):
        """Testing for custom attributes."""

        def check_scalar_attrs(params):
            # Immutable header attributes must be carried over unchanged
            self.assertEqual(params['id'], 'mydataset')
            self.assertEqual(params['age'], 34)

        header = CDatasetHeader(
            id='mydataset', age=34, colors=CArray([1, 2, 3]))
        ds = CDataset(self.X, self.Y, header=header)

        full_params = ds.header.get_params()
        check_scalar_attrs(full_params)
        self.assert_array_equal(full_params['colors'], CArray([1, 2, 3]))

        # Testing getitem. Immutable objects should be copied as they are.
        # Arrays should be indexed.
        ds_get = ds[[0, 2], :]
        sub_params = ds_get.header.get_params()
        self.assert_array_equal(ds_get.X, CArray([[1, 2, 3], [7, 8, 9]]))
        self.assert_array_equal(ds_get.Y, CArray([1, 2]))
        check_scalar_attrs(sub_params)
        self.assert_array_equal(sub_params['colors'], CArray([1, 3]))
Ejemplo n.º 30
0
    def test_margin(self):
        """Plot the SVM decision line with and without class weights.

        Two SVMs are fitted on the same unbalanced 2-D dataset, one with
        default settings and one with `class_weight={0: 1, 1: 10}`, and
        both decision lines are saved to a PDF figure. No assertion is
        made on the result.
        """
        self.logger.info("Testing margin separation of SVM...")

        import numpy as np

        # Two Gaussian clouds: 1000 points for class 0 centred on the
        # origin and 100 points for class 1 centred on (2, 2)
        rng = np.random.RandomState(0)
        n_samples_1 = 1000
        n_samples_2 = 100
        X = np.r_[1.5 * rng.randn(n_samples_1, 2),
                  0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
        y = [0] * (n_samples_1) + [1] * (n_samples_2)

        dataset = CDataset(X, y)

        # fit the model without class weights
        clf = CClassifierSVM()
        clf.fit(dataset.X, dataset.Y)

        # Decision line w[0]*x + w[1]*y + b = 0, rewritten as y = a*x - b/w[1]
        w = clf.w
        a = -w[0] / w[1]

        xx = CArray.linspace(-5, 5)
        yy = a * xx - clf.b / w[1]

        # fit the model again, weighting class 1 ten times more than class 0
        wclf = CClassifierSVM(class_weight={0: 1, 1: 10})
        wclf.fit(dataset.X, dataset.Y)

        # Same line equation for the weighted classifier
        ww = wclf.w
        wa = -ww[0] / ww[1]
        wyy = wa * xx - wclf.b / ww[1]

        # Both lines and the samples go on the same plot
        fig = CFigure(linewidth=1)
        fig.sp.plot(xx, yy.ravel(), 'k-', label='no weights')
        fig.sp.plot(xx, wyy.ravel(), 'k--', label='with weights')
        fig.sp.scatter(X[:, 0].ravel(), X[:, 1].ravel(), c=y)
        fig.sp.legend()

        fig.savefig(
            fm.join(fm.abspath(__file__), 'figs', 'test_c_classifier_svm.pdf'))