def run(self, x, y, ds_init=None, *args, **kargs):
        """Run the attack on every manipulable sample of a dataset.

        Parameters
        ----------
        x : CArray
            Data points, one per row.
        y : CArray
            Labels of the data points.
        ds_init : CDataset or None, optional
            Dataset whose `X` rows are used as starting points.
            If None, each sample is used as its own starting point.
        *args, **kargs
            Forwarded unchanged to `self._run`.

        Returns
        -------
        y_pred : CArray
            Label returned by `self.problem.model_wrapper.predict` for
            each manipulated sample.
        scores : CArray
            Decision-function row for each manipulated sample
            (two columns are allocated).
        adv_ds : CDataset
            Copy of the input data in which the manipulated rows have
            been overwritten with the attack result.
        f_obj
            Mean of the objective values returned by `self._run`.

        """
        data = CArray(x).atleast_2d()
        labels = CArray(y).atleast_2d()
        if ds_init is None:
            warm_start = None
        else:
            warm_start = CArray(ds_init.X).atleast_2d()

        # positions of the samples flagged by `is_attack_class`
        attackable = self.is_attack_class(labels)
        targets = CArray(attackable.find(attackable)).ravel()
        n_targets = targets.size

        adv_ds = CDataset(data.deepcopy(), labels.deepcopy())

        # remember that the input data is sparse
        if data.issparse is True:
            self._issparse = True

        # per-sample outputs, one entry for each manipulated sample
        fs_opt = CArray.zeros(n_targets)
        y_pred = CArray.zeros(n_targets)
        scores = CArray.zeros((n_targets, 2))

        for pos in range(n_targets):
            row = targets[pos].item()  # row of the sample to manipulate

            if warm_start is None:
                start = data[row, :]
            else:
                start = warm_start[row, :]
            x_opt, f_opt = self._run(
                data[row, :], labels[row], x_init=start, *args, **kargs)

            message = "Point: {:}/{:}, dmax:{:}, f(x):{:}, eval:{:}/{:}".format(
                row, data.shape[0], self._dmax, f_opt, self.f_eval,
                self.grad_eval)
            self.logger.info(message)

            opt_len = x_opt.shape[-1]
            if opt_len > adv_ds.X.shape[-1]:
                # CDataset cannot store rows of different lengths, so the
                # whole matrix is widened (resize is called with 256 as
                # its second argument - presumably the fill value,
                # TODO confirm)
                adv_ds.X = adv_ds.X.resize((adv_ds.X.shape[0], opt_len), 256)
            adv_ds.X[row, :min(adv_ds.X.shape[-1], opt_len)] = x_opt
            fs_opt[pos] = f_opt

            label, score = self.problem.model_wrapper.predict(
                x_opt, return_decision_function=True)
            scores[pos, :] = score[0, :]
            y_pred[pos] = label

        # mean objective value over the manipulated points
        return y_pred, scores, adv_ds, fs_opt.mean()
    def _run(self, xc, yc, idx=0):
        """Poison a single point.

        `xc` may hold a *set* of points; in that case `idx` selects the
        row that is manipulated by the attack.

        Parameters
        ----------
        xc : CArray
            Point(s) to start from. A deep copy is stored, the input is
            not modified by this method.
        yc
            Label(s) associated with `xc`, stored as-is.
        idx : int, optional
            Row of `xc` to be optimized. Default 0.

        Returns
        -------
        Whatever the solver's `maximize`/`minimize` call returns.

        """
        self._yc = yc
        self._xc = CArray(xc.deepcopy()).atleast_2d()
        self._idx = idx  # row of xc that is optimized

        # starting point handed to the solver
        self._x0 = self._xc[idx, :].ravel()

        self._init_solver()

        # without a target label the attack is indiscriminate (maximize),
        # otherwise it is targeted (minimize)
        optimize = (self._solver.maximize if self.y_target is None
                    else self._solver.minimize)
        solution = optimize(self._x0)

        self._solution_from_solver()

        return solution
Beispiel #3
0
    def _forward(self, x):
        """Apply the TF-IDF transform.

        Parameters
        ----------
        x : CArray
            Array with features to be transformed.

        Returns
        -------
        CArray
            Output of the wrapped sklearn TF-IDF transformer, additionally
            passed through the unit-norm transform when `self.norm` is
            set.

        """
        tfidf = CArray(self._sklearn_tfidf.transform(x.get_data()))

        if self.norm is None:  # no normalization requested
            return tfidf

        # keep a copy of the un-normalized tf-idf output
        # (needed for grad. comp.)
        self._cached_x_tfidf = tfidf.deepcopy()
        return self._unitnorm.transform(tfidf)
Beispiel #4
0
class CModuleTestCases(CUnitTest):
    """Unittests interface for CModule."""

    def setUp(self):
        """Create the dense and sparse fixtures shared by the tests."""
        dense = CArray([[1, 0, 0, 5], [2, 4, 0, 0], [3, 6, 0, 0]])
        self.array_dense = dense
        self.array_sparse = CArray(dense.deepcopy(), tosparse=True)

        self.labels = CArray([0, 1, 0])

        # The row [-4, 0, 6] is not used because of a bug found in the
        # sklearn normalizer, see:
        # https://github.com/scikit-learn/scikit-learn/issues/16632
        row = CArray([4, 0, 6])
        self.row_dense = row
        self.column_dense = row.deepcopy().T

        self.row_sparse = CArray(row.deepcopy(), tosparse=True)
        self.column_sparse = self.row_sparse.deepcopy().T

    @staticmethod
    def _create_chain(class_type_list, kwargs_list):
        """Build a chained module and the same modules unchained.

        Returns
        -------
        chain
            Last created module, with every previous module attached
            through the `preprocess` argument.
        modules : list
            The same modules created standalone, in creation order.

        """
        chain = None
        modules = []
        for pos, class_type in enumerate(class_type_list):
            kwargs = kwargs_list[pos]
            chain = CModule.create(class_type, preprocess=chain, **kwargs)
            modules.append(CModule.create(class_type, **kwargs))
        return chain, modules

    def _test_chain(self, x, class_type_list, kwargs_list, y=None):
        """Check that the chain and manual chaining yield the same output.

        Returns the output of the chained forward pass.

        """
        chain, modules = self._create_chain(class_type_list, kwargs_list)

        chain = chain.fit(x, y=y)
        self.logger.info("Preprocessors chain:\n{:}".format(chain))

        x_chain = chain.forward(x)
        self.logger.info("Trasformed X (chain):\n{:}".format(x_chain))

        # fit each standalone module on the output of the previous one
        x_manual = x
        for module in modules:
            module.fit(x_manual, y=y)
            x_manual = module.forward(x_manual)

        self.logger.info("Trasformed X (manual):\n{:}".format(x_manual))
        self.assert_allclose(x_chain, x_manual)

        return x_chain

    def _test_chain_gradient(self, x, class_type_list, kwargs_list, y=None):
        """Check that the chain gradient matches the manual composition.

        The gradient is computed on the second row of `x` and only for
        the last output of the chain. Returns the manually composed
        gradient.

        """
        chain, modules = self._create_chain(class_type_list, kwargs_list)

        chain = chain.fit(x, y=y)
        self.logger.info("module chain:\n{:}".format(chain))

        point = x[1, :]
        fwd_chain = chain.forward(point)  # this has size equal to n_outputs

        # select the gradient of the last output only
        weights = CArray.zeros(shape=(fwd_chain.size, ))
        weights[-1] = 1
        grad_chain = chain.gradient(point, w=weights)
        self.logger.info("chain.forward({:}):\n{:}".format(point, fwd_chain))
        self.logger.info("chain.gradient({:}):\n{:}".format(point, grad_chain))

        # fit the standalone modules, each on the previous one's output
        fit_input = x
        for module in modules:
            module.fit(fit_input, y=y)
            fit_input = module.forward(fit_input)

        # input received by each module when forwarding the single point
        inputs = [point]
        for module in modules[:-1]:
            inputs.append(module.forward(inputs[-1]))

        # backward pass: from the last module down to the first one
        grad = weights
        for module, module_input in zip(reversed(modules), reversed(inputs)):
            grad = module.gradient(module_input, w=grad)

        self.logger.info("chain.gradient({:}):\n{:}".format(point, grad))
        self.assert_allclose(grad_chain, grad)

        return grad
Beispiel #5
0
    def test_input_shape(self):
        """Test CArray.input_shape behavior."""
        nested = [[[2, 3], [22, 33]], [[4, 5], [44, 55]]]
        array = CArray(nested)
        array_s = CArray(nested, tosparse=True)
        ref_shape = (2, 2, 2)

        def check_reset(out):
            # input_shape is expected to match the object's own shape
            self.assertEqual(out.input_shape, out.shape)

        def check_kept(out):
            # input_shape is expected to be the one of the source array
            self.assertEqual(out.input_shape, ref_shape)

        # not propagate on getitem (as it returns new objects)
        check_reset(array[0:2, 0:2])
        check_reset(array_s[0:2, 0:2])

        # not propagate on other generic methods (as they return new objects)
        check_reset(array.astype(float))
        check_reset(array.unique())
        check_reset(array.all(axis=0))

        # not propagate on classmethods (es. concatenate/append)
        check_reset(CArray.concatenate(array, array, axis=0))
        check_reset(CArray.concatenate(array, array, axis=None))

        # should propagate on copy/deepcopy
        from copy import copy, deepcopy

        for clone in (copy, deepcopy):
            for source in (array, array_s):
                check_kept(clone(source))

        for source in (array, array_s):
            check_kept(source.deepcopy())

        # should propagate on setitem
        array_c = array.deepcopy()
        array_c[0:2, 0:2] = 200
        check_kept(array_c)

        array_c = array.deepcopy()
        array_c[0:2, 0:2] = CArray([[100, 200], [300, 400]])
        check_kept(array_c)

        array_c = array_s.deepcopy()
        array_c[0:2, 0:2] = CArray([[100, 200], [300, 400]])
        check_kept(array_c)

        # should propagate on todense/tosparse
        for source in (array, array_s):
            check_kept(source.tosparse())
            check_kept(source.todense())
class CPreProcessTestCases(CUnitTest):
    """Unittests interface for CPreProcess."""

    def setUp(self):
        """Create the dense and sparse fixtures shared by the tests."""
        dense = CArray([[1, 0, 0, 5], [2, 4, 0, 0], [3, 6, 0, 0]])
        self.array_dense = dense
        self.array_sparse = CArray(dense.deepcopy(), tosparse=True)

        # The row [-4, 0, 6] is not used because of a bug found in the
        # sklearn normalizer, see:
        # https://github.com/scikit-learn/scikit-learn/issues/16632
        row = CArray([4, 0, 6])
        self.row_dense = row
        self.column_dense = row.deepcopy().T

        self.row_sparse = CArray(row.deepcopy(), tosparse=True)
        self.column_sparse = self.row_sparse.deepcopy().T

    @staticmethod
    def _create_chain(pre_id_list, kwargs_list):
        """Build a chained preprocessor and the same ones unchained.

        Returns
        -------
        chain
            Last created preprocessor, with every previous one attached
            through the `preprocess` argument.
        pre_list : list
            The same preprocessors created standalone, in creation order.

        """
        chain = None
        pre_list = []
        for pos, pre_id in enumerate(pre_id_list):
            kwargs = kwargs_list[pos]
            chain = CPreProcess.create(pre_id, preprocess=chain, **kwargs)
            pre_list.append(CPreProcess.create(pre_id, **kwargs))

        return chain, pre_list

    def _test_chain(self, x, pre_id_list, kwargs_list, y=None):
        """Check that the chain and manual chaining yield the same output.

        When the chain implements `inverse_transform`, the reverted data
        is also compared with the original input. Returns the output of
        the chained transform.

        """
        chain, pre_list = self._create_chain(pre_id_list, kwargs_list)

        chain = chain.fit(x, y=y)
        self.logger.info("Preprocessors chain:\n{:}".format(chain))

        x_chain = chain.transform(x)
        self.logger.info("Trasformed X (chain):\n{:}".format(x_chain))

        # fit and apply each standalone preprocessor in sequence
        x_manual = x
        for pre in pre_list:
            x_manual = pre.fit_transform(x_manual, y=y)

        self.logger.info("Trasformed X (manual):\n{:}".format(x_manual))
        self.assert_allclose(x_chain, x_manual)

        # revert the transformation (not every preprocessor supports it)
        try:
            x_chain_revert = chain.inverse_transform(x_chain)
            self.logger.info("Reverted X (chain):\n{:}".format(x_chain_revert))
            self.logger.info("Original X:\n{:}".format(x))
            self.assert_array_almost_equal(x_chain_revert, x)
        except NotImplementedError:
            self.logger.info("inverse_transform not available")

        return x_chain

    def _test_chain_gradient(self, x, pre_id_list, kwargs_list, y=None):
        """Check that the chain gradient matches the manual composition.

        The gradient is computed on the second row of `x`. Returns the
        gradient computed by the chain.

        """
        chain, pre_list = self._create_chain(pre_id_list, kwargs_list)

        chain = chain.fit(x, y=y)
        self.logger.info("Preprocessors chain:\n{:}".format(chain))

        point = x[1, :]
        grad_chain = chain.gradient(point)
        self.logger.info(
            "gradient({:}) (chain):\n{:}".format(point, grad_chain))

        # fit the standalone preprocessors, each on the previous output
        fit_input = x
        for pre in pre_list:
            fit_input = pre.fit_transform(fit_input, y=y)

        # input received by each preprocessor for the single point
        inputs = [point]
        for pre in pre_list[:-1]:
            inputs.append(pre.transform(inputs[-1]))

        # backward pass: from the last preprocessor down to the first one
        grad = None
        for pre, pre_input in zip(reversed(pre_list), reversed(inputs)):
            grad = pre.gradient(pre_input, w=grad)

        self.logger.info("gradient({:}) (manual):\n{:}".format(point, grad))
        self.assert_allclose(grad_chain, grad)

        return grad_chain
    def run(self, x, y, ds_init=None, *args, **kargs):
        """Runs evasion on a dataset.

        Parameters
        ----------
        x : CArray
            Data points.
        y : CArray
            True labels.
        ds_init : CDataset
            Dataset for warm starts. If None, each sample is used as its
            own starting point.
        *args, **kargs
            Forwarded unchanged to `self._run`.

        Returns
        -------
        y_pred : CArray
            Labels predicted by `self.classifier` for all the samples of
            the manipulated dataset.
        scores : CArray
            Scores returned by `self.classifier` for all the samples of
            the manipulated dataset.
        adv_ds : CDataset
            Dataset of manipulated samples.
        f_obj
            Mean value of the objective function computed on each
            manipulated data point.

        """
        x = CArray(x).atleast_2d()
        y = CArray(y).atleast_2d()
        x_init = None
        if ds_init is not None:
            x_init = CArray(ds_init.X).atleast_2d()

        # rows flagged by `is_attack_class` are the only ones manipulated
        attack_mask = self.is_attack_class(y)
        attack_rows = CArray(attack_mask.find(attack_mask)).ravel()
        n_attack_rows = attack_rows.size

        adv_ds = CDataset(x.deepcopy(), y.deepcopy())

        # remember that the input data is sparse
        if x.issparse is True:
            self._issparse = True

        # objective value reached on each manipulated sample
        fs_opt = CArray.zeros(n_attack_rows)

        for pos in range(n_attack_rows):
            row = attack_rows[pos].item()

            if x_init is None:
                start = x[row, :]
            else:
                start = x_init[row, :]
            x_opt, f_opt = self._run(
                x[row, :], y[row], x_init=start, *args, **kargs)

            message = "Point: {:}/{:}, dmax:{:}, f(x):{:}, eval:{:}/{:}".format(
                row, x.shape[0], self._dmax, f_opt, self.f_eval,
                self.grad_eval)
            self.logger.info(message)

            adv_ds.X[row, :] = x_opt
            fs_opt[pos] = f_opt

        # classify the whole dataset once all points have been manipulated
        y_pred, scores = self.classifier.predict(
            adv_ds.X, return_decision_function=True)

        # the last element is the mean objective value on the evasion points
        return CArray(y_pred), scores, adv_ds, fs_opt.mean()
Beispiel #8
0
    def run(self, x, y, ds_init=None) -> Tuple[CArray, CArray, CDataset, Any]:
        """Runs the genetic algorithm on every manipulable sample.

        Parameters
        ----------
        x : CArray
            Input sample(s) to perturb, one per row.
        y : CArray
            Original class of each sample.
        ds_init : CDataset, optional
            Dataset whose `X` rows are used as initialization points.
            Default is None, in which case each sample is used as its
            own initialization point.

        Returns
        -------
        y_pred : CArray
            Label returned by `self.problem.model_wrapper.predict` for
            each manipulated sample.
        scores : CArray
            Decision-function row for each manipulated sample
            (two columns are allocated).
        adv_ds : CDataset
            Dataset containing the adversarial points.
        f_obj
            Mean value of the objective function over the manipulated
            samples.

        """
        samples = CArray(x).atleast_2d()
        classes = CArray(y).atleast_2d()
        init_points = (
            None if ds_init is None else CArray(ds_init.X).atleast_2d())

        # only consider samples that can be manipulated
        can_attack = self.is_attack_class(classes)
        chosen = CArray(can_attack.find(can_attack)).ravel()
        n_chosen = chosen.size

        adv_ds = CDataset(samples.deepcopy(), classes.deepcopy())

        # remember that the input data is sparse
        if samples.issparse is True:
            self._issparse = True

        # per-sample outputs, one entry for each manipulated sample
        fs_opt = CArray.zeros(n_chosen)
        y_pred = CArray.zeros(n_chosen)
        scores = CArray.zeros((n_chosen, 2))

        for pos in range(n_chosen):
            row = chosen[pos].item()  # row of the sample to manipulate

            if init_points is None:
                start = samples[row, :]
            else:
                start = init_points[row, :]
            x_opt, f_opt = self._run(
                samples[row, :], classes[row], x_init=start)

            self.logger.info("Point: {:}/{:}, f(x):{:}, eval:{:}/{:}".format(
                row, samples.shape[0], f_opt, self.f_eval, self.grad_eval))

            adv_len = x_opt.shape[-1]
            if adv_len > adv_ds.X.shape[-1]:
                # CDataset cannot store rows of different lengths, so the
                # whole matrix is widened (resize is called with 256 as
                # its second argument - presumably the fill value,
                # TODO confirm)
                adv_ds.X = adv_ds.X.resize((adv_ds.X.shape[0], adv_len), 256)
            adv_ds.X[row, :min(adv_ds.X.shape[-1], adv_len)] = x_opt
            fs_opt[pos] = f_opt

            label, score = self.problem.model_wrapper.predict(
                x_opt, return_decision_function=True)
            scores[pos, :] = score[0, :]
            y_pred[pos] = label

        # mean objective value over the manipulated points
        f_obj = fs_opt.mean()

        return y_pred, scores, adv_ds, f_obj
Beispiel #9
0
    def _euclidean_proj_simplex(self, v, s=1):
        """Compute the Euclidean projection on a positive simplex.

        Solves the optimisation problem (using the algorithm from [1]):

            min_w 0.5 * || w - v ||_2^2 ,
            s.t. \\sum_i w_i = s, w_i >= 0

        Parameters
        ----------
        v : CArray
            1-Dimensional vector.
        s : int, optional
            Radius of the simplex. Default 1.

        Returns
        -------
        w : CArray
            Euclidean projection of v on the simplex.

        Notes
        -----
        The vector is sorted, so the cost is O(n log(n)). For sparse
        input only the stored (non-zero) elements take part in the
        computation of the threshold and are shifted by it.

        The thresholding is applied with in-place operators on the
        flattened array; whether this also modifies the caller's array
        depends on `CArray(v).ravel()` returning a copy
        (NOTE(review): confirm).

        References
        ----------
        [1] Efficient Projections onto the l1-Ball for
            Learning in High Dimensions
            John Duchi, Shai Shalev-Shwartz, Yoram Singer,
            and Tushar Chandra.
            International Conference on Machine Learning (ICML 2008)
            http://www.cs.berkeley.edu/~jduchi/projects/DuchiSiShCh08.pdf

        """
        v = CArray(v).ravel()

        # already on the simplex: the best projection is v itself
        if v.sum() == s and (v >= 0).sum() == v.size:
            return v

        # sorted (decreasing) copy of v
        u = v.deepcopy()
        u.sort(inplace=True)
        u = u[::-1]

        # for sparse data only the non-null elements are considered
        if u.issparse:
            ranked = CArray(u.nnz_data).todense()
        else:
            ranked = u
        cssv = ranked.cumsum()

        # number of > 0 components of the optimal solution (minus one,
        # so that it can be used as an index into cssv)
        positions = CArray.arange(1, cssv.size+1)
        rho = (positions * ranked > (cssv - s)).sum() - 1

        # Lagrange multiplier associated to the simplex constraint
        theta = (cssv[rho] - s) / (rho + 1.0)

        # projection: shift v by theta and clip negative values to zero
        if v.issparse:
            shifted = CArray(v.nnz_data)
            shifted -= theta
            v[v.nnz_indices] = shifted
        else:
            v -= theta
        v[v < 0] = 0
        return v
class CArrayTestCases(CUnitTest):
    """Unittests interface for CArray."""
    def setUp(self):
        """Create the dense and sparse fixtures shared by the tests."""

        def sparse_of(dense):
            # sparse array built from a deep copy of a dense fixture
            return CArray(dense.deepcopy(), tosparse=True)

        # generic 3x4 matrix and a symmetric 3x3 matrix
        self.array_dense = CArray([[1, 0, 0, 5], [2, 4, 0, 0], [3, 6, 0, 0]])
        self.array_sparse = sparse_of(self.array_dense)

        self.array_dense_sym = CArray([[1, 2, 0], [2, 4, 6], [0, 6, 0]])
        self.array_sparse_sym = sparse_of(self.array_dense_sym)

        # matrices without zeros / made of zeros only
        self.array_dense_nozero = CArray(
            [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
        self.array_sparse_nozero = sparse_of(self.array_dense_nozero)

        self.array_dense_allzero = CArray([[0] * 4 for _ in range(3)])
        self.array_sparse_allzero = sparse_of(self.array_dense_allzero)

        # boolean matrices: mixed, all True, all False
        self.array_dense_bool = CArray([
            [True, False, True, True],
            [False, False, False, False],
            [True, True, True, True],
        ])
        self.array_sparse_bool = sparse_of(self.array_dense_bool)

        self.array_dense_bool_true = CArray([[True] * 4 for _ in range(3)])
        self.array_sparse_bool_true = sparse_of(self.array_dense_bool_true)

        self.array_dense_bool_false = CArray([[False] * 4 for _ in range(3)])
        self.array_sparse_bool_false = sparse_of(self.array_dense_bool_false)

        # row / column vectors
        self.row_flat_dense = CArray([4, 0, 6])
        self.row_dense = self.row_flat_dense.atleast_2d()
        self.column_dense = self.row_dense.deepcopy().T

        self.row_sparse = sparse_of(self.row_dense)
        self.column_sparse = self.row_sparse.deepcopy().T

        # single-element arrays: non-zero, zero, True, False
        self.single_flat_dense = CArray([4])
        self.single_dense = self.single_flat_dense.atleast_2d()
        self.single_sparse = sparse_of(self.single_dense)

        self.single_flat_dense_zero = CArray([0])
        self.single_dense_zero = self.single_flat_dense_zero.atleast_2d()
        self.single_sparse_zero = sparse_of(self.single_dense_zero)

        self.single_bool_flat_dense = CArray([True])
        self.single_bool_dense = self.single_bool_flat_dense.atleast_2d()
        self.single_bool_sparse = sparse_of(self.single_bool_dense)

        self.single_bool_flat_dense_false = CArray([False])
        self.single_bool_dense_false = \
            self.single_bool_flat_dense_false.atleast_2d()
        self.single_bool_sparse_false = sparse_of(self.single_bool_dense_false)

        # empty arrays
        self.empty_flat_dense = CArray([], tosparse=False)
        self.empty_dense = CArray([[]], tosparse=False)
        self.empty_sparse = CArray([], tosparse=True)

    def _test_multiple_eq(self, items_list):
        """Return True if all items are equal."""

        # We are going to compare the first element
        # with the second, the second with the third, etc.
        for item_idx, item in enumerate(items_list):
            if item_idx == len(items_list) - 1:
                break  # We checked all the elements
            self.assert_array_equal(item, items_list[item_idx + 1])

        # Every item is equal to each other, return True
        return True

    def _test_operator_cycle(self, totest_op, totest_items, totest_result):
        """Check if operator return the expected result on given items.

        totest_op: list of operators
        totest_items: list of items PAIR to test
        totest_result: list of expected result (class name) for each PAIR

        """
        for operator in totest_op:
            to_check = []
            for pair_idx, pair in enumerate(totest_items):
                class0 = type(pair[0]._data) if \
                    hasattr(pair[0], 'isdense') else type(pair[0])
                class1 = type(pair[1]._data) if \
                    hasattr(pair[1], 'isdense') else type(pair[1])
                self.logger.info("Operator {:} between {:} and {:}"
                                 "".format(operator.__name__, class0, class1))
                result = operator(pair[0], pair[1])
                self.assertIsInstance(result._data, totest_result[pair_idx])
                self.logger.info("Result: {:}".format(
                    result._data.__class__.__name__))
                to_check.append(result)
            self.assertTrue(self._test_multiple_eq(to_check))

    def _test_operator_notimplemented(self, totest_op, totest_items):
        """Check if operator is not implemented for given items.

        totest_op: list of operators
        totest_items: list of items PAIR to test

        """
        for operator in totest_op:
            for pair in totest_items:
                with self.assertRaises(NotImplementedError):
                    operator(pair[0], pair[1])