Example #1
    def _create_mnist_dataset(digits=[4, 9],
                              n_tr=100,
                              n_val=200,
                              n_ts=200,
                              seed=4):
        loader = CDataLoaderMNIST()

        tr = loader.load('training', digits=digits)
        ts = loader.load('testing', digits=digits, num_samples=n_ts)

        # sample train and validation subsets from the loaded training data
        # (note: the two subsets are drawn independently and may overlap)

        val_dts_idx = CArray.randsample(CArray.arange(0, tr.num_samples),
                                        n_val,
                                        random_state=seed)
        val = tr[val_dts_idx, :]

        tr_dts_idx = CArray.randsample(CArray.arange(0, tr.num_samples),
                                       n_tr,
                                       random_state=seed)
        tr = tr[tr_dts_idx, :]

        # scale pixel values from [0, 255] to [0, 1]
        tr.X /= 255.0
        val.X /= 255.0
        ts.X /= 255.0

        return tr, val, ts
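A minimal usage sketch of the helper above (assumes the secml MNIST loader can fetch the data; the expected sizes follow from the arguments):

    tr, val, ts = _create_mnist_dataset(digits=[4, 9], n_tr=100, n_val=200, n_ts=200)
    print(tr.num_samples, val.num_samples, ts.num_samples)  # 100 200 200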
Example #2
    def fit_forward(self, x, y=None, caching=False):
        """Fit estimator using data and then execute forward on the data.

        To avoid returning over-fitted scores on the training set, this method
        runs a 5-fold cross validation on training data and
        returns the validation scores.

        Parameters
        ----------
        x : CArray
            Array with shape (n_samples, n_features) to be transformed and
            to be used for training.
        y : CArray or None, optional
            Array of shape (n_samples,) containing the class labels.
            Can be None if not required by the algorithm.
        caching : bool, optional
            True if the preprocessed x should be cached for the backward pass.

        Returns
        -------
        CArray
            Transformed input data.

        See Also
        --------
        fit : fit the preprocessor.
        forward : run forward function on input data.

        """
        kfold = CDataSplitterKFold(num_folds=5,
                                   random_state=0).compute_indices(
                                       CDataset(x, y))

        scores = CArray.zeros(shape=(x.shape[0], self.classes.size))

        # TODO: samples can be first preprocessed and cached, if required.
        #  then we can use _fit and _forward to work on the preprocessed data
        for k in range(kfold.num_folds):
            tr_idx = kfold.tr_idx[k]
            ts_idx = kfold.ts_idx[k]
            self.fit(x[tr_idx, :], y[tr_idx])
            scores[ts_idx, :] = self.forward(x[ts_idx, :], caching=False)

        # train on the full training set after computing the xval scores
        self.fit(x, y)

        # cache x if required
        if caching:
            self._forward_preprocess(x, caching=True)

        return scores
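A minimal usage sketch of the method above (assumes it is available on a secml classifier such as CClassifierSVM; `tr` is an illustrative training set):

    from secml.ml.classifiers import CClassifierSVM

    clf = CClassifierSVM()
    # returns 5-fold cross-validated scores of shape (n_samples, n_classes)
    xval_scores = clf.fit_forward(tr.X, tr.Y)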
# Metric to use for training and performance evaluation
from secml.ml.peval.metrics import CMetricAccuracy
metric = CMetricAccuracy()

# Creation of the multiclass classifier
from secml.ml.classifiers import CClassifierSVM
from secml.ml.classifiers.multiclass import CClassifierMulticlassOVA
from secml.ml.kernel import CKernelRBF
clf = CClassifierMulticlassOVA(CClassifierSVM, kernel=CKernelRBF())

# Parameters for the Cross-Validation procedure
xval_params = {'C': [1e-2, 0.1, 1], 'kernel.gamma': [10, 100, 1e3]}

# Let's create a 3-Fold data splitter
random_state = 999  # hypothetical fixed seed; undefined in the original snippet

from secml.data.splitter import CDataSplitterKFold
xval_splitter = CDataSplitterKFold(num_folds=3, random_state=random_state)

# Select and set the best training parameters for the classifier
print("Estimating the best training parameters...")
best_params = clf.estimate_parameters(
    dataset=tr,
    parameters=xval_params,
    splitter=xval_splitter,
    metric='accuracy',
    perf_evaluator='xval'
)

print("The best training parameters are: ", best_params)

# We can now fit the classifier
clf.fit(tr)
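With the classifier fitted, the accuracy metric defined above can score it on held-out data (a sketch; assumes a test set `ts` was split off beforehand, which this snippet does not show):

y_pred = clf.predict(ts.X)
acc = metric.performance_score(y_true=ts.Y, y_pred=y_pred)
print("Accuracy on test set: {:.2%}".format(acc))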
def attack_keras_model(X, Y, S, nb_attack=25, dmax=0.1):
    """
    Generates an adversarial attack on a general model.

    :param X: Original inputs on which the model is trained
    :param Y: Original outputs on which the model is trained
    :param S: Original protected attributes on which the model is trained
    :return: Adversarial dataset (i.e. new data points + original input)
    """

    from secml.data import CDataset
    from secml.array import CArray
    import numpy as np

    # secML wants all dimensions to be homogeneous (X previously mixed float and int)
    data_set_encoded_secML = CArray(X, dtype=float, copy=True)
    data_set_encoded_secML = CDataset(data_set_encoded_secML, Y)

    n_tr = round(0.66 * X.shape[0])
    n_ts = X.shape[0] - n_tr

    logger.debug(X.shape)
    logger.debug(n_tr)
    logger.debug(n_ts)

    from secml.data.splitter import CTrainTestSplit
    splitter = CTrainTestSplit(train_size=n_tr, test_size=n_ts)

    # Use training set for the classifier and then pick points from an internal test set.
    tr_set_secML, ts_set_secML = splitter.split(data_set_encoded_secML)

    # tr_set_secML = CDataset(X_train,Y_train)
    # ts_set_secML = CDataset(X_test,Y_test)

    # Create a surrogate multiclass classifier
    from secml.ml.classifiers import CClassifierSVM
    from secml.ml.classifiers.multiclass import CClassifierMulticlassOVA
    from secml.ml.kernel import CKernelRBF
    clf = CClassifierMulticlassOVA(CClassifierSVM, kernel=CKernelRBF())

    # Parameters for the Cross-Validation procedure
    xval_params = {'C': [1e-4, 1e-3, 1e-2, 0.1, 1], 'kernel.gamma': [0.01, 0.1, 1, 10, 100, 1e3]}

    # Let's create a 3-Fold data splitter
    random_state = 999

    from secml.data.splitter import CDataSplitterKFold
    xval_splitter = CDataSplitterKFold(num_folds=3, random_state=random_state)

    # Select and set the best training parameters for the classifier
    logger.debug("Estimating the best training parameters...")
    best_params = clf.estimate_parameters(
        dataset=tr_set_secML,
        parameters=xval_params,
        splitter=xval_splitter,
        metric='accuracy',
        perf_evaluator='xval'
    )
    logger.debug("The best training parameters are: ", best_params)

    logger.debug(clf.get_params())
    logger.debug(clf.num_classifiers)

    # Metric to use for training and performance evaluation
    from secml.ml.peval.metrics import CMetricAccuracy
    metric = CMetricAccuracy()

    # Train the classifier
    clf.fit(tr_set_secML)
    logger.debug(clf.num_classifiers)

    # Compute predictions on a test set
    y_pred = clf.predict(ts_set_secML.X)

    # Evaluate the accuracy of the classifier
    acc = metric.performance_score(y_true=ts_set_secML.Y, y_pred=y_pred)

    logger.debug("Accuracy on test set: {:.2%}".format(acc))

    # Prepare attack configuration

    noise_type = 'l2'   # Type of perturbation 'l1' or 'l2'
    lb, ub = 0, 1       # Bounds of the attack space. Can be set to `None` for unbounded
    y_target = None     # None if `error-generic` or a class label for `error-specific`

    # Should be chosen depending on the optimization problem
    solver_params = {
        'eta': 0.1,         # grid search resolution
        'eta_min': 0.1,
        'eta_max': None,    # None should be ok
        'max_iter': 1000,
        'eps': 1e-2         # Tolerance on the stopping crit.
    }

    # Run attack

    from secml.adv.attacks.evasion import CAttackEvasionPGDLS
    pgd_ls_attack = CAttackEvasionPGDLS(
        classifier=clf,
        surrogate_classifier=clf,
        surrogate_data=tr_set_secML,
        distance=noise_type,
        dmax=dmax,
        lb=lb, ub=ub,
        solver_params=solver_params,
        y_target=y_target)

    nb_feat = X.shape[1]

    result_pts = np.empty([nb_attack, nb_feat])
    result_class = np.empty([nb_attack, 1])

    # pick a random test point as the starting point and run the attack
    import random
    for nb_iter in range(0, nb_attack):
        rn = random.randint(0, ts_set_secML.num_samples - 1)
        x0, y0 = ts_set_secML[rn, :].X, ts_set_secML[rn, :].Y

        try:
            y_pred_pgdls, _, adv_ds_pgdls, _ = pgd_ls_attack.run(x0, y0)
            adv_pt = adv_ds_pgdls.X.get_data()
            result_pts[nb_iter] = adv_pt
            result_class[nb_iter] = y_pred_pgdls.get_data()[0]
        except ValueError:
            logger.warning("value error on {}".format(nb_iter))

    return result_pts, result_class, ts_set_secML[:nb_attack, :].Y
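A hypothetical invocation of the function above (X, Y and S stand for the numpy feature matrix, label vector and protected attributes; the names are placeholders):

adv_pts, adv_labels, true_labels = attack_keras_model(X, Y, S, nb_attack=25, dmax=0.1)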
#split between train and test -> train = 66% of data
n_tr = data_smp_encoded_secML.num_samples
n_ts = raw_data_encoded_secML.num_samples

#from secml.data.splitter import CTrainTestSplit
#splitter = CTrainTestSplit(
#    train_size=n_tr, test_size=n_ts)
#tr, ts = splitter.split(raw_data_encoded_secML)

print("Data splitted into train and test")

# create a splitter for training data -> training and validation sets
from secml.data.splitter import CDataSplitterKFold
#default splitter for cross-val -> num_folds=3 and random_state=None
xval_splitter = CDataSplitterKFold()

##train a classifier (SVM)
from secml.ml.classifiers import CClassifierSVM
clf_lin = CClassifierSVM()

#from secml.ml.kernel import CKernelRBF
#clf_rbf = CClassifierSVM(kernel=CKernelRBF())

#problem seems linearly separable -> try a logistic regression classifier without any parameter estimation
from secml.ml.classifiers import CClassifierLogistic
#clf_l= CClassifierLogistic()

xval_lin_params = {'C': [0.01, 0.1, 1, 10, 100]}

# Select and set the best training parameters for the linear classifier
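A sketch of that final step, mirroring the estimate_parameters call shown earlier (`tr` stands for the training set prepared above, whose name this snippet does not show):

best_lin_params = clf_lin.estimate_parameters(
    dataset=tr,
    parameters=xval_lin_params,
    splitter=xval_splitter,
    metric='accuracy',
    perf_evaluator='xval')
print("The best parameters for the linear classifier are: ", best_lin_params)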