def _to_art_classifier(
    classifier: Union[tf.keras.Model, torch.nn.Module],
    nb_classes: int,
    input_shape: Tuple[int, ...],
) -> Union[TensorFlowV2Classifier, PyTorchClassifier]:
    """Wrap a framework-native model in the matching ART classifier.

    :param classifier: Model to wrap. Either a PyTorch module or a Keras model.
    :param nb_classes: Number of classes the model was trained with.
    :param input_shape: Shape of a single input data point.
    :return: ART classifier wrapping the given model.
    :raises TypeError: If the model is neither a ``torch.nn.Module`` nor a
        ``tf.keras.Model``.
    """
    # Keyword arguments that both ART wrappers are given.
    shared_kwargs = {
        "model": classifier,
        "nb_classes": nb_classes,
        "input_shape": input_shape,
    }

    # PyTorch is tested before TensorFlow; the PyTorch wrapper is built
    # without a loss function.
    if isinstance(classifier, torch.nn.Module):
        return PyTorchClassifier(loss=None, **shared_kwargs)

    if isinstance(classifier, tf.keras.Model):
        return TensorFlowV2Classifier(**shared_kwargs)

    raise TypeError(
        f"Expected classifier to be an instance of {str(torch.nn.Module)} or {str(tf.keras.Model)}, received {str(type(classifier))} instead."
    )
def adv_retrain(attack_name, dataset, model_name, nb_epochs=80, batch_size=512, overwrite=False):
    """Adversarially retrain a saved Keras model and report its accuracy.

    The pretrained model ``<MODEL_DIR><dataset>/<model_name>.h5`` is loaded,
    attacked with ``attack_name``, retrained with ART's ``AdversarialTrainer``
    and saved as ``adv_<model_name>_<attack_name>.h5`` in the same directory.
    Accuracy is printed before and after retraining, on clean samples, on the
    adversarial samples crafted against the original weights, and on fresh
    adversarial samples crafted against the retrained weights.

    :param attack_name: Key into ``param.attack_params`` selecting the attack.
    :param dataset: Dataset name; used for the model directory, for
        ``load_data`` and as the key into the classifier/attack parameters.
    :param model_name: Base name (without ``.h5``) of the pretrained model.
    :param nb_epochs: Number of adversarial-training epochs.
    :param batch_size: Batch size for both the attack and the training.
    :param overwrite: Retrain even when the defended model file already exists.
    """
    defended_model_path = "{}{}/".format(param.MODEL_DIR, dataset)
    filename = 'adv_' + model_name + '_' + attack_name + '.h5'
    fpath = os.path.join(defended_model_path, filename)

    # Nothing to do when a defended model is already on disk.
    if os.path.exists(fpath) and not overwrite:
        print("Adversarial defended model is already trained")
        print("please check at: ", fpath)
        return

    # Adversarially retrain the model.
    x_train, y_train, x_test, y_test = load_data(dataset)

    # Load keras pretrained model for the specific dataset
    model_path = "{}{}/{}.h5".format(param.MODEL_DIR, dataset, model_name)
    model = load_model(model_path)

    labels_true = np.argmax(y_test, axis=1)

    def accuracy_pct(scores):
        """Percentage of test samples whose arg-max score matches the true label."""
        return np.sum(np.argmax(scores, axis=1) == labels_true) / x_test.shape[0] * 100

    print('Accuracy test set: %.2f%%' % accuracy_pct(model.predict(x_test)))

    # The original code indexed the parameter tables with the undefined name
    # `dataset_name`; the function's own `dataset` argument is the intended key.
    classifier_param = param.classifier_params[dataset]
    classifier = TensorFlowV2Classifier(model=model, **classifier_param)

    # Copy before overriding so the shared configuration dict is not mutated.
    attack_param = dict(param.attack_params[attack_name][dataset])
    attack_param["batch_size"] = batch_size
    if attack_name not in [param.FGSM, param.BIM]:
        # some attacks don't have verbose parameter, e.g. bim
        attack_param["verbose"] = VERBOSE
    attack = call_function_by_attack_name(attack_name)(classifier, **attack_param)

    x_test_adv = attack.generate(x_test, y_test)
    print('Accuracy on original ' + attack_name + ' adversarial samples: %.2f%%' %
          accuracy_pct(classifier.predict(x_test_adv)))

    # Adversarial Training
    trainer = AdversarialTrainer(classifier, attack, ratio=1.0)
    trainer.fit(x_train, y_train, nb_epochs=nb_epochs, batch_size=batch_size)

    # Save model
    classifier.save(filename=filename, path=defended_model_path)

    # Evaluate the adversarially trained model on clean test set
    print('Accuracy test set: %.2f%%' % accuracy_pct(classifier.predict(x_test)))

    # Evaluate the adversarially trained model on original adversarial samples
    print('Accuracy on original ' + attack_name + ' adversarial samples: %.2f%%' %
          accuracy_pct(classifier.predict(x_test_adv)))

    # Evaluate the adversarially trained model on fresh adversarial samples
    # produced on the adversarially trained model
    x_test_adv = attack.generate(x_test, y_test)
    print('Accuracy on new ' + attack_name + ' adversarial samples: %.2f%%' %
          accuracy_pct(classifier.predict(x_test_adv)))
def fix_make_dummy_model():
    """
    Build a randomly initialised ART classifier plus a random embedding matrix for testing.

    :return: Tuple ``(classifier, model_weights)`` where ``model_weights`` is a
        ``(257, 8)`` array drawn from a standard normal distribution.
    """

    def build_embedding_classifier(cfg):
        """
        Keras model mapping an already-embedded input to a single output value.

        Two parallel strided convolutions (a ReLU filter branch and a sigmoid
        attention branch) are multiplied together, max-pooled over the
        sequence axis and passed through two dense layers.
        """
        embedded = tf.keras.layers.Input(shape=(cfg["maxlen"], cfg["embedding_size"]))

        # The two branches differ only in activation and layer name.
        conv_settings = {
            "filters": 128,
            "kernel_size": 500,
            "strides": 500,
            "use_bias": True,
            "padding": "valid",
        }
        filter_branch = tf.keras.layers.Conv1D(
            activation="relu", name="filt_layer", **conv_settings
        )(embedded)
        attention_branch = tf.keras.layers.Conv1D(
            activation="sigmoid", name="attn_layer", **conv_settings
        )(embedded)

        gated_features = tf.keras.layers.Multiply()([filter_branch, attention_branch])
        pooled = tf.keras.layers.GlobalMaxPooling1D()(gated_features)
        hidden = tf.keras.layers.Dense(128, activation="relu", name="dense_layer")(pooled)
        logit = tf.keras.layers.Dense(1, name="output_layer")(hidden)

        return tf.keras.Model(inputs=embedded, outputs=logit)

    config = {"maxlen": 2**20, "input_dim": 257, "embedding_size": 8}
    keras_model = build_embedding_classifier(config)

    # Random embedding table: one row per input token value, one column per embedding dimension.
    embedding_weights = np.random.normal(
        loc=0, scale=1.0, size=(config["input_dim"], config["embedding_size"])
    )

    art_classifier = TensorFlowV2Classifier(
        model=keras_model,
        nb_classes=2,
        loss_object=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        input_shape=(config["maxlen"], config["embedding_size"]),
    )

    return art_classifier, embedding_weights
def boundary_attack_run(model_to_attack, target_image, iterations=100):
    """
    Run the black-box boundary attack step by step against a single image.

    The attack is advanced one iteration at a time; after every step the
    current adversarial image is displayed and its degree of change relative
    to the target image is recorded.

    inputs:
    -model_to_attack (tensorflow Model instance): model that will be attacked
    -target_image (numpy array): image that will be attacked; the classifier
        is built with input_shape (32, 32, 3) and pixel values clipped to [0, 255]
    -iterations (int): maximum number of attack steps to run

    output:
    -degree_of_change (dict): keys: the step number,
        values: the degree of change between target and adversarial image
    """
    classifier = TensorFlowV2Classifier(
        model=model_to_attack,
        input_shape=(32, 32, 3),
        clip_values=(0, 255),
        nb_classes=10,
    )

    final_degree_of_change = {}

    # max_iter starts at 0 and is raised to `iter_step` after the first
    # generate() call, once the attack exposes its current step sizes.
    attack = BoundaryAttack(estimator=classifier, targeted=False, max_iter=0, delta=0.001, epsilon=0.01)
    iter_step = 1
    target = target_image
    x_adv = None

    for i in range(iterations):
        # Continue from the previous adversarial example (None on the first step).
        x_adv = attack.generate(x=np.array([target]), x_adv_init=x_adv)

        print(
            "Adversarial image at step %d." % (i * iter_step),
            "L2 error",
            np.linalg.norm(np.reshape(x_adv[0] - target, [-1])),
            "and class label %d." % np.argmax(classifier.predict(x_adv)[0]),
        )

        # The channel axis is reversed before display.
        plt.imshow(x_adv[0][..., ::-1].astype("int32"))
        plt.show(block=False)

        final_degree_of_change[i * iter_step] = degree_of_change([x_adv[0]], [target])

        if hasattr(attack, "curr_delta") and hasattr(attack, "curr_epsilon"):
            # Carry the adapted step sizes over into the next single-step run.
            attack.max_iter = iter_step
            attack.delta = attack.curr_delta
            attack.epsilon = attack.curr_epsilon
        else:
            break

    return final_degree_of_change
def Deepfool(points=2, steps=0.05):
    """Evaluate the module-level ``model`` against the NewtonFool attack and plot the results.

    Despite its name, this function crafts adversarial samples with ART's
    ``NewtonFool`` attack. The first 1000 samples of the module-level
    ``x_test`` / ``y_test`` are used.

    :param points: Number of epsilon values generated (``0.1, 0.3, 0.5, ...``).
        The first one is used as the x-position of the clean accuracy; one
        attack run is made for each remaining value.
    :param steps: Unused; kept for interface compatibility.

    NOTE(review): ``epsilon`` is only used for printing and plotting; it is
    never passed to ``NewtonFool``, so every run uses the same attack settings.
    """
    from art.attacks.evasion import NewtonFool
    from art.estimators.classification import TensorFlowV2Classifier

    loss_object = tf.keras.losses.SparseCategoricalCrossentropy()
    classifier = TensorFlowV2Classifier(model=model,
                                        nb_classes=10,
                                        input_shape=(28, 28, 1),
                                        loss_object=loss_object,
                                        clip_values=(0, 1),
                                        channels_first=False)

    # Evaluation subset and its integer labels, computed once.
    x_eval = x_test[:1000]
    y_eval = y_test[:1000]
    labels = np.argmax(y_eval, axis=1)
    n_eval = y_eval.shape[0]

    # X-axis values for the plots.
    epsilons = [0.2 * i + 0.1 for i in range(points)]

    preds = np.argmax(classifier.predict(x_eval), axis=1)
    acc = np.sum(preds == labels) / n_eval
    print("\nTest accuracy on normal sample: %.2f%% eps: %.2f" %
          (acc * 100, 0))

    accuracies = [acc]
    examples = []
    for epsilon in epsilons[1:]:
        # Craft adversarial samples with NewtonFool
        adv_crafter = NewtonFool(classifier)
        x_test_adv = adv_crafter.generate(x=x_eval, y=y_eval)

        # Evaluate the classifier on the adversarial examples
        preds = np.argmax(classifier.predict(x_test_adv), axis=1)
        acc = np.sum(preds == labels) / n_eval
        print("\nTest accuracy on adversarial sample: %.2f%% eps: %.2f" %
              (acc * 100, epsilon))
        accuracies.append(acc)

        # Keep up to five misclassified samples as (true label, predicted label, image).
        example = []
        for orig, adv, ex in zip(labels, preds, x_test_adv):
            if adv != orig:
                example.append((orig, adv, ex))
                if len(example) == 5:
                    break
        examples.append(example)

    plot_accuracies(epsilons, accuracies)
    plot_examples(epsilons[1:], examples)
def gen_adv_data(model, x, y, attack_name, dataset_name, batch_size=1024):
    """Generate adversarial samples for ``x`` with the configured attack.

    The root logger is set to CRITICAL while the classifier and attack run and
    is set to INFO afterwards, also when the attack raises.

    :param model: Keras model to wrap in a ``TensorFlowV2Classifier``.
    :param x: Input samples to perturb.
    :param y: Labels passed to the attack's ``generate``.
    :param attack_name: Key into ``attack_params`` selecting the attack.
    :param dataset_name: Key selecting the per-dataset classifier and attack
        parameters.
    :param batch_size: Batch size handed to the attack.
    :return: The adversarial samples returned by ``attack.generate``.
    """
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.CRITICAL)
    try:
        classifier_param = classifier_params[dataset_name]
        classifier = TensorFlowV2Classifier(model=model, **classifier_param)

        # Copy before overriding so the shared configuration dict is not mutated.
        attack_param = dict(attack_params[attack_name][dataset_name])
        attack_param["batch_size"] = batch_size
        if attack_name not in [param.FGSM, param.BIM]:
            # some attacks don't have verbose parameter, e.g. bim
            attack_param["verbose"] = VERBOSE

        attack = call_function_by_attack_name(attack_name)(classifier, **attack_param)
        return attack.generate(x=x, y=y)
    finally:
        # Same level the original code set after a successful run.
        root_logger.setLevel(logging.INFO)
with tf.GradientTape() as tape: predictions = model(images, training=True) loss = loss_object(labels, predictions) gradients = tape.gradient(loss, model.trainable_variables) optimizer.apply_gradients(zip(gradients, model.trainable_variables)) model = TensorFlowModel() loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True) # Step 3: Create the ART classifier classifier = TensorFlowV2Classifier( model=model, loss_object=loss_object, train_step=train_step, nb_classes=10, input_shape=(28, 28, 1), clip_values=(0, 1), ) # Step 4: Train the ART classifier classifier.fit(x_train, y_train, batch_size=64, nb_epochs=3) # Step 5: Evaluate the ART classifier on benign test examples predictions = classifier.predict(x_test) accuracy = np.sum( np.argmax(predictions, axis=1) == np.argmax(y_test, axis=1)) / len(y_test) print("Accuracy on benign test examples: {}%".format(accuracy * 100))
def __init__(
    self,
    estimator: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
    norm: Union[int, float, str] = np.inf,
    eps: float = 0.3,
    eps_step: float = 0.1,
    max_iter: int = 100,
    targeted: bool = False,
    nb_random_init: int = 5,
    batch_size: int = 32,
    loss_type: Optional[str] = None,
    verbose: bool = True,
):
    """
    Create a :class:`.AutoProjectedGradientDescent` instance.

    When `loss_type` is given, the estimator is re-wrapped in a new classifier of the same framework that uses the
    selected loss; with `loss_type=None` the estimator is used unchanged.

    :param estimator: A trained estimator.
    :param norm: The norm of the adversarial perturbation. Possible values: "inf", np.inf, 1 or 2.
    :param eps: Maximum perturbation that the attacker can introduce.
    :param eps_step: Attack step size (input variation) at each iteration.
    :param max_iter: The maximum number of iterations.
    :param targeted: Indicates whether the attack is targeted (True) or untargeted (False).
    :param nb_random_init: Number of random initialisations within the epsilon ball. For nb_random_init=0
        starting at the original input.
    :param batch_size: Size of the batch on which adversarial samples are generated.
    :param loss_type: Defines the loss to attack. Available options: None (Use loss defined by estimator),
        "cross_entropy", or "difference_logits_ratio"
    :param verbose: Show progress bars.
    :raises ValueError: If `loss_type` is not one of the predefined losses, if the estimator appears to predict
        probabilities where logits are required, or if the estimator type is not supported for the chosen loss.
    :raises NotImplementedError: For a `TensorFlowClassifier` with `loss_type="cross_entropy"` that appears to
        predict probabilities.
    """
    from art.estimators.classification import TensorFlowClassifier, TensorFlowV2Classifier, PyTorchClassifier

    if loss_type not in self._predefined_losses:
        raise ValueError(
            "The argument loss_type has an invalid value. The following options for `loss_type` are currently "
            "supported: {}".format(self._predefined_losses)
        )

    if loss_type is None:
        # Probe the estimator with a single all-ones sample to decide whether it outputs probabilities.
        if hasattr(estimator, "predict") and is_probability(
            estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=np.float32))
        ):
            raise ValueError(
                "AutoProjectedGradientDescent is expecting logits as estimator output, the provided "
                "estimator seems to predict probabilities."
            )

        # Keep the estimator's own loss.
        estimator_apgd = estimator
    else:
        if isinstance(estimator, TensorFlowClassifier):
            import tensorflow as tf

            if loss_type == "cross_entropy":
                if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                    raise NotImplementedError("Cross-entropy loss is not implemented for probability output.")

                # Loss tensor built from the estimator's existing output and label placeholders.
                self._loss_object = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(
                        y_pred=estimator._output, y_true=estimator._labels_ph, from_logits=True
                    )
                )

            elif loss_type == "difference_logits_ratio":
                if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. "
                        "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                    )

                # This combination is always rejected: the implementation kept in the comment block below
                # has not been validated.
                raise ValueError(
                    "The loss `difference_logits_ratio` has not been validate completely. It seems that the "
                    "commented implemented below is failing to selected the second largest logit for cases "
                    "where the largest logit is the true logit. For future work `difference_logits_ratio` and "
                    "loss_fn should return the same loss value."
                )

                # def difference_logits_ratio(y_true, y_pred):
                #     i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
                #     i_y_pred_arg = tf.argsort(y_pred, axis=1)
                #     # Not completely sure if the following line is correct.
                #     # `i_y_pred_arg[:, -2], i_y_pred_arg[:, -1]` seems closer to the output of `loss_fn` than
                #     # `i_y_pred_arg[:, -1], i_y_pred_arg[:, -2]`
                #     i_z_i = tf.where(i_y_pred_arg[:, -1] != i_y_true[:], i_y_pred_arg[:, -2],
                #                      i_y_pred_arg[:, -1])
                #
                #     z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
                #     z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
                #     z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                #     z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)
                #
                #     z_1 = tf.linalg.diag_part(z_1)
                #     z_3 = tf.linalg.diag_part(z_3)
                #     z_i = tf.linalg.diag_part(z_i)
                #     z_y = tf.linalg.diag_part(z_y)
                #
                #     dlr = -(z_y - z_i) / (z_1 - z_3)
                #
                #     return tf.reduce_mean(dlr)
                #
                # def loss_fn(y_true, y_pred):
                #     i_y_true = np.argmax(y_true, axis=1)
                #     i_y_pred_arg = np.argsort(y_pred, axis=1)
                #     i_z_i = np.where(i_y_pred_arg[:, -1] != i_y_true[:], i_y_pred_arg[:, -1],
                #                      i_y_pred_arg[:, -2])
                #
                #     z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                #     z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                #     z_i = y_pred[:, i_z_i]
                #     z_y = y_pred[:, i_y_true]
                #
                #     z_1 = np.diag(z_1)
                #     z_3 = np.diag(z_3)
                #     z_i = np.diag(z_i)
                #     z_y = np.diag(z_y)
                #
                #     dlr = -(z_y - z_i) / (z_1 - z_3)
                #
                #     return np.mean(dlr)
                #
                # self._loss_fn = loss_fn
                # self._loss_object = difference_logits_ratio(y_true=estimator._labels_ph,
                #                                             y_pred=estimator._output)

            # Re-wrap the same graph and session with the newly selected loss.
            estimator_apgd = TensorFlowClassifier(
                input_ph=estimator._input_ph,
                output=estimator._output,
                labels_ph=estimator._labels_ph,
                train=estimator._train,
                loss=self._loss_object,
                learning=estimator._learning,
                sess=estimator._sess,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
                feed_dict=estimator._feed_dict,
            )

        elif isinstance(estimator, TensorFlowV2Classifier):
            import tensorflow as tf

            if loss_type == "cross_entropy":
                # Unlike the other frameworks, probability outputs are accepted here by switching `from_logits`.
                if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                    self._loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
                else:
                    self._loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

            elif loss_type == "difference_logits_ratio":
                if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. "
                        "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                    )

                class difference_logits_ratio:
                    # Callable loss object computing the mean of -(z_y - z_i) / (z_1 - z_3) over the batch.
                    def __init__(self):
                        self.reduction = "mean"

                    def __call__(self, y_true, y_pred):
                        # Index of the true class per sample (arg-max over axis 1 of y_true).
                        i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
                        # Class indices sorted by ascending logit; column -1 is the largest logit.
                        i_y_pred_arg = tf.argsort(y_pred, axis=1)
                        # Per sample: the top-ranked class if it is not the true class, otherwise the
                        # second-ranked class.
                        i_z_i_list = list()

                        for i in range(y_true.shape[0]):
                            if i_y_pred_arg[i, -1] != i_y_true[i]:
                                i_z_i_list.append(i_y_pred_arg[i, -1])
                            else:
                                i_z_i_list.append(i_y_pred_arg[i, -2])

                        i_z_i = tf.stack(i_z_i_list)

                        # Each gather yields a (batch, batch) matrix; only its diagonal pairs sample k with
                        # its own index k, which `diag_part` extracts below.
                        z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
                        z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
                        z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                        z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)

                        z_1 = tf.linalg.diag_part(z_1)
                        z_3 = tf.linalg.diag_part(z_3)
                        z_i = tf.linalg.diag_part(z_i)
                        z_y = tf.linalg.diag_part(z_y)

                        dlr = -(z_y - z_i) / (z_1 - z_3)

                        return tf.reduce_mean(dlr)

                self._loss_fn = difference_logits_ratio()
                self._loss_object = difference_logits_ratio()

            # Re-wrap the same Keras model with the newly selected loss.
            estimator_apgd = TensorFlowV2Classifier(
                model=estimator.model,
                nb_classes=estimator.nb_classes,
                input_shape=estimator.input_shape,
                loss_object=self._loss_object,
                train_step=estimator._train_step,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
            )
        elif isinstance(estimator, PyTorchClassifier):
            import torch

            if loss_type == "cross_entropy":
                if is_probability(
                    estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=np.float32))
                ):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. If loss_type='cross_entropy' "
                        "the estimator has to to predict logits."
                    )

                self._loss_object = torch.nn.CrossEntropyLoss(reduction="mean")
            elif loss_type == "difference_logits_ratio":
                if is_probability(
                    estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=ART_NUMPY_DTYPE))
                ):
                    raise ValueError(
                        "The provided estimator seems to predict probabilities. "
                        "If loss_type='difference_logits_ratio' the estimator has to to predict logits."
                    )

                class difference_logits_ratio:
                    # Callable loss object; note the (y_pred, y_true) argument order, which differs from the
                    # TensorFlow v2 variant above.
                    def __init__(self):
                        self.reduction = "mean"

                    def __call__(self, y_pred, y_true):  # type: ignore
                        # Accept NumPy arrays as well as tensors.
                        if isinstance(y_true, np.ndarray):
                            y_true = torch.from_numpy(y_true)
                        if isinstance(y_pred, np.ndarray):
                            y_pred = torch.from_numpy(y_pred)

                        y_true = y_true.float()

                        i_y_true = torch.argmax(y_true, axis=1)
                        # Class indices sorted by ascending logit; column -1 is the largest logit.
                        i_y_pred_arg = torch.argsort(y_pred, axis=1)
                        # Per sample: the top-ranked class if it is not the true class, otherwise the
                        # second-ranked class.
                        i_z_i_list = list()

                        for i in range(y_true.shape[0]):
                            if i_y_pred_arg[i, -1] != i_y_true[i]:
                                i_z_i_list.append(i_y_pred_arg[i, -1])
                            else:
                                i_z_i_list.append(i_y_pred_arg[i, -2])

                        i_z_i = torch.stack(i_z_i_list)

                        # Column indexing yields (batch, batch) matrices; the diagonals taken below hold the
                        # per-sample values.
                        z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                        z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                        z_i = y_pred[:, i_z_i]
                        z_y = y_pred[:, i_y_true]

                        z_1 = torch.diagonal(z_1)
                        z_3 = torch.diagonal(z_3)
                        z_i = torch.diagonal(z_i)
                        z_y = torch.diagonal(z_y)

                        dlr = -(z_y - z_i) / (z_1 - z_3)

                        return torch.mean(dlr.float())

                self._loss_object = difference_logits_ratio()

            # Re-wrap the same PyTorch model with the newly selected loss; no optimizer is attached.
            estimator_apgd = PyTorchClassifier(
                model=estimator.model,
                loss=self._loss_object,
                input_shape=estimator.input_shape,
                nb_classes=estimator.nb_classes,
                optimizer=None,
                channels_first=estimator.channels_first,
                clip_values=estimator.clip_values,
                preprocessing_defences=estimator.preprocessing_defences,
                postprocessing_defences=estimator.postprocessing_defences,
                preprocessing=estimator.preprocessing,
                device_type=estimator._device,
            )

        else:
            raise ValueError("The loss type {} is not supported for the provided estimator.".format(loss_type))

    super().__init__(estimator=estimator_apgd)
    self.norm = norm
    self.eps = eps
    self.eps_step = eps_step
    self.max_iter = max_iter
    self.targeted = targeted
    self.nb_random_init = nb_random_init
    self.batch_size = batch_size
    self.loss_type = loss_type
    self.verbose = verbose
    self._check_params()
def natual(eps):
    """Train a small CNN on MNIST and print its clean and PGD-adversarial accuracy.

    :param eps: Maximum perturbation of the PGD attack; the attack step size is ``eps / 3``.
    """
    # Step 1: Load the MNIST dataset
    (x_train, y_train), (x_test, y_test), _min_px, _max_px = load_mnist()

    # Step 2: Create the model
    import tensorflow as tf
    from tensorflow.keras import Model
    from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D

    class TensorFlowModel(Model):
        """
        Standard TensorFlow model for unit testing.
        """

        def __init__(self):
            super().__init__()
            self.conv1 = Conv2D(filters=4, kernel_size=5, activation="relu")
            self.conv2 = Conv2D(filters=10, kernel_size=5, activation="relu")
            self.maxpool = MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="valid", data_format=None)
            self.flatten = Flatten()
            self.dense1 = Dense(100, activation="relu")
            self.logits = Dense(10, activation="linear")

        def call(self, x):
            """
            Call function to evaluate the model.

            :param x: Input to the model
            :return: Prediction of the model
            """
            # The same pooling layer is applied after each convolution.
            pipeline = (
                self.conv1,
                self.maxpool,
                self.conv2,
                self.maxpool,
                self.flatten,
                self.dense1,
                self.logits,
            )
            for stage in pipeline:
                x = stage(x)
            return x

    optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    def train_step(model, images, labels):
        # One optimisation step; `loss_object` is bound below, before training starts.
        with tf.GradientTape() as tape:
            predictions = model(images, training=True)
            loss = loss_object(labels, predictions)
        trainable = model.trainable_variables
        gradients = tape.gradient(loss, trainable)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    model = TensorFlowModel()
    loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

    # Step 3: Create the ART classifier
    classifier = TensorFlowV2Classifier(
        model=model,
        loss_object=loss_object,
        train_step=train_step,
        nb_classes=10,
        input_shape=(28, 28, 1),
        clip_values=(0, 1),
    )

    # Step 4: Train the ART classifier
    classifier.fit(x_train, y_train, batch_size=64, nb_epochs=10)

    def fraction_correct(samples):
        # Share of `samples` whose arg-max prediction equals the arg-max of y_test.
        scores = classifier.predict(samples)
        hits = np.argmax(scores, axis=1) == np.argmax(y_test, axis=1)
        return np.sum(hits) / len(y_test)

    # Step 5: Evaluate the ART classifier on benign test examples
    accuracy = fraction_correct(x_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))

    # Step 6: Generate adversarial test examples
    attack = ProjectedGradientDescent(estimator=classifier, eps=eps, eps_step=eps / 3, max_iter=20)
    x_test_adv = attack.generate(x=x_test)

    # Step 7: Evaluate the ART classifier on adversarial test examples
    accuracy = fraction_correct(x_test_adv)
    print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))
def main():
    """Train a contrastive encoder, fit a softmax head on it, then attack the result with PGD.

    Steps, all driven by the options returned from ``parse_option()``:

    1. Stage 1 trains ``Encoder`` + ``Projector`` with the selected contrastive loss.
    2. Optionally, PCA plots of the projected test set are saved under ``figs/``.
    3. Stage 2 trains a ``SoftmaxPred`` head on top of the encoder output; only
       the head's variables are updated.
    4. The encoder + head are wrapped in an ART classifier, evaluated on clean
       test data and on PGD adversarial examples.
    5. ``natual(args.eps)`` is run as a baseline.

    :raises ValueError: If the loss, optimizer or dataset option is not supported.
    """
    args = parse_option()
    print(args)

    # check args
    if args.loss not in LOSS_NAMES:
        raise ValueError('Unsupported loss function type {}'.format(args.loss))

    if args.optimizer == 'adam':
        optimizer1 = tf.keras.optimizers.Adam(learning_rate=args.lr_1)
    elif args.optimizer == 'lars':
        from lars_optimizer import LARSOptimizer
        # not compatible with tf2
        optimizer1 = LARSOptimizer(
            args.lr_1,
            exclude_from_weight_decay=['batch_normalization', 'bias'])
    elif args.optimizer == 'sgd':
        optimizer1 = tfa.optimizers.SGDW(learning_rate=args.lr_1,
                                         momentum=0.9,
                                         weight_decay=1e-4)
    else:
        # Fail early instead of hitting an unbound `optimizer1` inside stage 1.
        raise ValueError('Unsupported optimizer type {}'.format(args.optimizer))
    optimizer2 = tf.keras.optimizers.Adam(learning_rate=args.lr_2)

    model_name = '{}_model-bs_{}-lr_{}'.format(args.loss, args.batch_size_1,
                                               args.lr_1)

    # 0. Load data
    if args.data == 'mnist':
        mnist = tf.keras.datasets.mnist
    elif args.data == 'fashion_mnist':
        mnist = tf.keras.datasets.fashion_mnist
    else:
        # Fail early instead of hitting an unbound `mnist` below.
        raise ValueError('Unsupported dataset {}'.format(args.data))
    print('Loading {} data...'.format(args.data))
    # Integer labels come from the Keras dataset ...
    (_, y_train), (_, y_test) = mnist.load_data()
    # ... while the images come from `load_mnist()`.
    # NOTE(review): `load_mnist()` takes no dataset argument, so with
    # --data fashion_mnist the images and labels presumably come from
    # different datasets - confirm.
    (x_train, _), (x_test, _), _, _ = load_mnist()
    print(x_train.shape, x_test.shape)
    print(x_train.shape, y_train.shape)

    train_ds = tf.data.Dataset.from_tensor_slices(
        (x_train, y_train)).shuffle(5000).batch(args.batch_size_1)

    train_ds2 = tf.data.Dataset.from_tensor_slices(
        (x_train, y_train)).shuffle(5000).batch(args.batch_size_2)

    test_ds = tf.data.Dataset.from_tensor_slices(
        (x_test, y_test)).batch(args.batch_size_1)

    # 1. Stage 1: train encoder with the selected contrastive loss
    encoder = Encoder(normalize=True, activation=args.activation)
    projector = Projector(args.projection_dim,
                          normalize=True,
                          activation=args.activation)

    if args.loss == 'max_margin':

        def loss_func(z, y):
            return losses.max_margin_contrastive_loss(z,
                                                      y,
                                                      margin=args.margin,
                                                      metric=args.metric)
    elif args.loss == 'npairs':
        loss_func = losses.multiclass_npairs_loss
    elif args.loss == 'sup_nt_xent':

        def loss_func(z, y):
            return losses.supervised_nt_xent_loss(
                z,
                y,
                temperature=args.temperature,
                base_temperature=args.base_temperature)
    elif args.loss.startswith('triplet'):
        # Loss names look like 'triplet-<kind>'.
        triplet_kind = args.loss.split('-')[1]

        def loss_func(z, y):
            return losses.triplet_loss(z,
                                       y,
                                       kind=triplet_kind,
                                       margin=args.margin)

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    test_loss = tf.keras.metrics.Mean(name='test_loss')

    @tf.function
    # train step for the contrastive loss
    def train_step_stage1(x, y):
        '''
        x: data tensor, shape: (batch_size, data_dim)
        y: data labels, shape: (batch_size, )
        '''
        with tf.GradientTape() as tape:
            r = encoder(x, training=True)
            z = projector(r, training=True)
            loss = loss_func(z, y)

        gradients = tape.gradient(
            loss, encoder.trainable_variables + projector.trainable_variables)
        optimizer1.apply_gradients(
            zip(gradients,
                encoder.trainable_variables + projector.trainable_variables))
        train_loss(loss)

    @tf.function
    def test_step_stage1(x, y):
        r = encoder(x, training=False)
        z = projector(r, training=False)
        t_loss = loss_func(z, y)
        test_loss(t_loss)

    print('Stage 1 training ...')
    for epoch in range(args.epoch):
        # Reset the metrics at the start of the next epoch
        train_loss.reset_states()
        test_loss.reset_states()

        for x, y in train_ds:
            train_step_stage1(x, y)

        for x_te, y_te in test_ds:
            test_step_stage1(x_te, y_te)
        # Per-epoch loss printing is intentionally disabled for stage 1.

    if args.draw_figures:
        # projecting data with the trained encoder, projector
        x_tr_proj = projector(encoder(x_train))
        x_te_proj = projector(encoder(x_test))
        # convert tensor to np.array
        x_tr_proj = x_tr_proj.numpy()
        x_te_proj = x_te_proj.numpy()
        print(x_tr_proj.shape, x_te_proj.shape)

        # check learned embedding using PCA
        pca = PCA(n_components=2)
        pca.fit(x_tr_proj)
        x_te_proj_pca = pca.transform(x_te_proj)

        x_te_proj_pca_df = pd.DataFrame(x_te_proj_pca, columns=['PC1', 'PC2'])
        x_te_proj_pca_df['label'] = y_test
        # PCA scatter plot
        fig, ax = plt.subplots()
        # x/y are passed by keyword: recent seaborn releases reject them
        # as positional arguments.
        ax = sns.scatterplot(x='PC1',
                             y='PC2',
                             data=x_te_proj_pca_df,
                             palette='tab10',
                             hue='label',
                             linewidth=0,
                             alpha=0.6,
                             ax=ax)

        # Shrink the axes to make room for the legend on the right.
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        title = 'Data: {}\nEmbedding: {}\nbatch size: {}; LR: {}'.format(
            args.data, LOSS_NAMES[args.loss], args.batch_size_1, args.lr_1)
        ax.set_title(title)
        fig.savefig('figs/PCA_plot_{}_{}_embed.png'.format(
            args.data, model_name))

        # density plot for PCA
        g = sns.jointplot(x='PC1', y='PC2', data=x_te_proj_pca_df, kind="hex")
        plt.subplots_adjust(top=0.95)
        g.fig.suptitle(title)

        g.savefig('figs/Joint_PCA_plot_{}_{}_embed.png'.format(
            args.data, model_name))

    # Stage 2: freeze the learned representations and then learn a classifier
    # on a linear layer using a softmax loss
    softmax = SoftmaxPred()

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='train_ACC')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_acc = tf.keras.metrics.SparseCategoricalAccuracy(name='test_ACC')

    cce_loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)

    @tf.function
    # train step for the 2nd stage
    def train_step(model, x, y):
        '''
        x: data tensor, shape: (batch_size, data_dim)
        y: data labels, shape: (batch_size, )
        '''
        with tf.GradientTape() as tape:
            # layers[0] is the encoder (inference mode), layers[1] the softmax head.
            r = model.layers[0](x, training=False)
            y_preds = model.layers[1](r, training=True)
            loss = cce_loss_obj(y, y_preds)

        # freeze the encoder, only train the softmax layer
        gradients = tape.gradient(loss, model.layers[1].trainable_variables)
        optimizer2.apply_gradients(
            zip(gradients, model.layers[1].trainable_variables))
        train_loss(loss)
        train_acc(y, y_preds)

    @tf.function
    def test_step(x, y):
        r = encoder(x, training=False)
        y_preds = softmax(r, training=False)
        t_loss = cce_loss_obj(y, y_preds)
        test_loss(t_loss)
        test_acc(y, y_preds)

    if args.write_summary:
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        train_log_dir = 'logs/{}/{}/{}/train'.format(model_name, args.data,
                                                     current_time)
        test_log_dir = 'logs/{}/{}/{}/test'.format(model_name, args.data,
                                                   current_time)
        train_summary_writer = tf.summary.create_file_writer(train_log_dir)
        test_summary_writer = tf.summary.create_file_writer(test_log_dir)

    print('Stage 2 training ...')
    model = tf.keras.Sequential([encoder, softmax])
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    classifier = TensorFlowV2Classifier(
        model=model,
        loss_object=loss_object,
        train_step=train_step,
        nb_classes=10,
        input_shape=(28, 28, 1),
        clip_values=(0, 1),
    )

    # Training is driven by the manual loop below rather than by
    # classifier.fit(x_train, y_train, batch_size=256, nb_epochs=20).
    for epoch in range(args.epoch):
        # Reset the metrics at the start of the next epoch
        train_loss.reset_states()
        train_acc.reset_states()
        test_loss.reset_states()
        test_acc.reset_states()

        for x, y in train_ds2:
            train_step(model, x, y)

        if args.write_summary:
            with train_summary_writer.as_default():
                tf.summary.scalar('loss', train_loss.result(), step=epoch)
                tf.summary.scalar('accuracy', train_acc.result(), step=epoch)

        for x_te, y_te in test_ds:
            test_step(x_te, y_te)

        if args.write_summary:
            with test_summary_writer.as_default():
                tf.summary.scalar('loss', test_loss.result(), step=epoch)
                tf.summary.scalar('accuracy', test_acc.result(), step=epoch)

        template = 'Epoch {}, Loss: {}, Acc: {}, Test Loss: {}, Test Acc: {}'
        print(
            template.format(epoch + 1, train_loss.result(),
                            train_acc.result() * 100, test_loss.result(),
                            test_acc.result() * 100))

    predictions = classifier.predict(x_test)
    print(predictions.shape, y_test.shape)
    # y_test holds integer labels here, so it is compared to the arg-max directly.
    accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
    print("Accuracy on benign test examples: {}%".format(accuracy * 100))

    print('Stage 3 attacking ...')

    attack = ProjectedGradientDescent(estimator=classifier,
                                      eps=args.eps,
                                      eps_step=args.eps / 3,
                                      max_iter=20)
    x_test_adv = attack.generate(x=x_test)

    print('Stage 4 attacking ...')

    predictions = classifier.predict(x_test_adv)
    accuracy = np.sum(np.argmax(predictions, axis=1) == y_test) / len(y_test)
    print("Accuracy on adversarial test examples: {}%".format(accuracy * 100))

    # Baseline: the plain CNN trained and attacked with the same eps.
    natual(args.eps)
# Split each frame column-wise at index 25: the first 25 columns become the
# data, the remaining columns the labels.
train_data, train_labels = np.split(train.to_numpy(), [25], axis=1)
test_data, test_labels = np.split(test.to_numpy(), [25], axis=1)
# Arg-max over the label columns.
# NOTE(review): presumably the labels are one-hot encoded, making these the
# integer class ids - confirm.
non_encoded_test_labels = tf.argmax(test_labels, axis=1)
non_encoded_train_labels = tf.argmax(train_labels,axis=1)
print("Evaluating clean samples on clean model...\n")
# Baseline accuracy of the unmodified model on the clean test split.
_, orig_acc = new_model.evaluate(test_data, test_labels, verbose=1)
logger.info("Classifier with original training")
logger.info("Accuracy on clean test samples: %.2f%%", (orig_acc * 100))
logger.info("="*50)
print("Creating classifier...\n")
# Wrap the Keras model for ART; inputs are clipped to [0, 1].
adv_classifier = TensorFlowV2Classifier(model=new_model, loss_object=loss_object, train_step=train_step, nb_classes=5, input_shape=(1,25), clip_values=(0, 1))
print("Creating adversarial attack object...\n")
# Untargeted L-infinity FGSM attack with 27 random initialisations.
fgsm = FastGradientMethod(adv_classifier, norm=np.inf, eps=eps, eps_step=0.001, targeted=False, batch_size=2048, num_random_init=27)
print("Generating adversarial samples...\n")
def __init__(
    self,
    estimator: "CLASSIFIER_LOSS_GRADIENTS_TYPE",
    norm: Union[int, float, str] = np.inf,
    eps: float = 0.3,
    eps_step: float = 0.1,
    max_iter: int = 100,
    targeted: bool = False,
    nb_random_init: int = 5,
    batch_size: int = 32,
    loss_type: Optional[str] = None,
):
    """
    Create a :class:`.AutoProjectedGradientDescent` instance.

    The given estimator is re-wrapped in a new classifier of the same framework whose loss is selected by
    `loss_type`; that wrapper is what gets passed to the parent constructor.

    :param estimator: A trained estimator.
    :param norm: The norm of the adversarial perturbation. Possible values: "inf", np.inf, 1 or 2.
    :param eps: Maximum perturbation that the attacker can introduce.
    :param eps_step: Attack step size (input variation) at each iteration.
    :param max_iter: The maximum number of iterations.
    :param targeted: Indicates whether the attack is targeted (True) or untargeted (False).
    :param nb_random_init: Number of random initialisations within the epsilon ball. For nb_random_init=0
                           starting at the original input.
    :param batch_size: Size of the batch on which adversarial samples are generated.
    :param loss_type: Loss to optimise. One of `None` (reuse the loss object of `estimator`),
                      "cross_entropy" or "difference_logits_ratio".
    :raises NotImplementedError: If `loss_type="cross_entropy"` is requested for a `TensorFlowClassifier`
                                 that predicts probabilities.
    :raises ValueError: If `loss_type` is not one of the supported options, or if the selected loss needs
                        logits but the estimator seems to predict probabilities.
    """
    from art.estimators.classification import TensorFlowClassifier, TensorFlowV2Classifier, PyTorchClassifier

    # NOTE(review): when loss_type is None only self._loss_object is assigned below; self._loss_fn is left
    # unset on that path — confirm that nothing reads it in that configuration.

    if isinstance(estimator, TensorFlowClassifier):
        import tensorflow as tf

        if loss_type == "cross_entropy":
            if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                raise NotImplementedError("Cross-entropy loss is not implemented for probability output.")
            else:
                # Graph-mode loss attached to the estimator's existing placeholders.
                self._loss_object = tf.reduce_mean(
                    tf.keras.losses.categorical_crossentropy(
                        y_pred=estimator._output, y_true=estimator._labels_ph, from_logits=True
                    )
                )

                def loss_fn(y_true, y_pred):
                    # NumPy mirror of the graph loss above. Shift by the row maximum for numerical
                    # stability, build the log-softmax, then reduce over the class axis so that each
                    # sample contributes exactly one cross-entropy value before averaging.
                    y_pred_norm = y_pred - np.amax(y_pred, axis=1, keepdims=True)
                    log_softmax = y_pred_norm - np.log(np.sum(np.exp(y_pred_norm), axis=1, keepdims=True))
                    loss_value = -np.sum(y_true * log_softmax, axis=1)
                    return np.mean(loss_value)

                self._loss_fn = loss_fn
        elif loss_type == "difference_logits_ratio":
            if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                raise ValueError(
                    "The provided estimator seems to predict probabilities. If loss_type='difference_logits_ratio' "
                    "the estimator has to to predict logits."
                )
            else:

                def difference_logits_ratio(y_true, y_pred):
                    i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
                    # Ascending argsort: column -1 is the largest logit, -2 the runner-up, -3 the third.
                    i_y_pred_arg = tf.argsort(y_pred, axis=1)
                    # Largest logit among the classes other than the true one: the top logit when the
                    # prediction differs from the true class, otherwise the runner-up. Uses an explicit
                    # element-wise comparison and the same branch order as the NumPy mirror below.
                    i_z_i = tf.where(
                        tf.not_equal(i_y_pred_arg[:, -1], i_y_true),
                        i_y_pred_arg[:, -1],
                        i_y_pred_arg[:, -2],
                    )

                    # Gathering along axis 1 yields an (N, N) matrix; its diagonal holds, for each
                    # sample, the logit at that sample's own index.
                    z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
                    z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
                    z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                    z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)

                    z_1 = tf.linalg.diag_part(z_1)
                    z_3 = tf.linalg.diag_part(z_3)
                    z_i = tf.linalg.diag_part(z_i)
                    z_y = tf.linalg.diag_part(z_y)

                    dlr = -(z_y - z_i) / (z_1 - z_3)

                    return tf.reduce_mean(dlr)

                def loss_fn(y_true, y_pred):
                    # NumPy mirror of the graph-mode difference_logits_ratio above.
                    i_y_true = np.argmax(y_true, axis=1)
                    i_y_pred_arg = np.argsort(y_pred, axis=1)
                    i_z_i = np.where(i_y_pred_arg[:, -1] != i_y_true[:], i_y_pred_arg[:, -1], i_y_pred_arg[:, -2])

                    z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                    z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                    z_i = y_pred[:, i_z_i]
                    z_y = y_pred[:, i_y_true]

                    z_1 = np.diag(z_1)
                    z_3 = np.diag(z_3)
                    z_i = np.diag(z_i)
                    z_y = np.diag(z_y)

                    dlr = -(z_y - z_i) / (z_1 - z_3)

                    return np.mean(dlr)

                self._loss_fn = loss_fn
                self._loss_object = difference_logits_ratio(y_true=estimator._labels_ph, y_pred=estimator._output)
        elif loss_type is None:
            self._loss_object = estimator._loss_object
        else:
            raise ValueError(
                "The argument loss_type has an invalid value. The following options for loss_type are "
                "supported: {}".format([None, "cross_entropy", "difference_logits_ratio"])
            )

        estimator_apgd = TensorFlowClassifier(
            input_ph=estimator._input_ph,
            output=estimator._output,
            labels_ph=estimator._labels_ph,
            train=estimator._train,
            loss=self._loss_object,
            learning=estimator._learning,
            sess=estimator._sess,
            channels_first=estimator.channels_first,
            clip_values=estimator.clip_values,
            preprocessing_defences=estimator.preprocessing_defences,
            postprocessing_defences=estimator.postprocessing_defences,
            preprocessing=estimator.preprocessing,
            feed_dict=estimator._feed_dict,
        )

    elif isinstance(estimator, TensorFlowV2Classifier):
        import tensorflow as tf

        if loss_type == "cross_entropy":
            # Let the Keras loss know whether the model emits probabilities or logits.
            predicts_probabilities = is_probability(
                estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))
            )
            self._loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=not predicts_probabilities)
            self._loss_fn = self._loss_object
        elif loss_type == "difference_logits_ratio":
            if is_probability(estimator.predict(x=np.ones(shape=(1, *estimator.input_shape)))):
                raise ValueError(
                    "The provided estimator seems to predict probabilities. If loss_type='difference_logits_ratio' "
                    "the estimator has to to predict logits."
                )
            else:

                def difference_logits_ratio(y_true, y_pred):
                    i_y_true = tf.cast(tf.math.argmax(tf.cast(y_true, tf.int32), axis=1), tf.int32)
                    # Ascending argsort: column -1 is the largest logit, -2 the runner-up, -3 the third.
                    i_y_pred_arg = tf.argsort(y_pred, axis=1)
                    # Per sample: index of the largest logit that does not belong to the true class.
                    i_z_i_list = []

                    for i in range(y_true.shape[0]):
                        if i_y_pred_arg[i, -1] != i_y_true[i]:
                            i_z_i_list.append(i_y_pred_arg[i, -1])
                        else:
                            i_z_i_list.append(i_y_pred_arg[i, -2])

                    i_z_i = tf.stack(i_z_i_list)

                    z_1 = tf.gather(y_pred, i_y_pred_arg[:, -1], axis=1, batch_dims=0)
                    z_3 = tf.gather(y_pred, i_y_pred_arg[:, -3], axis=1, batch_dims=0)
                    z_i = tf.gather(y_pred, i_z_i, axis=1, batch_dims=0)
                    z_y = tf.gather(y_pred, i_y_true, axis=1, batch_dims=0)

                    z_1 = tf.linalg.diag_part(z_1)
                    z_3 = tf.linalg.diag_part(z_3)
                    z_i = tf.linalg.diag_part(z_i)
                    z_y = tf.linalg.diag_part(z_y)

                    dlr = -(z_y - z_i) / (z_1 - z_3)

                    return tf.reduce_mean(dlr)

                self._loss_fn = difference_logits_ratio
                self._loss_object = difference_logits_ratio
        elif loss_type is None:
            self._loss_object = estimator._loss_object
        else:
            raise ValueError(
                "The argument loss_type has an invalid value. The following options for loss_type are "
                "supported: {}".format([None, "cross_entropy", "difference_logits_ratio"])
            )

        estimator_apgd = TensorFlowV2Classifier(
            model=estimator.model,
            nb_classes=estimator.nb_classes,
            input_shape=estimator.input_shape,
            loss_object=self._loss_object,
            train_step=estimator._train_step,
            channels_first=estimator.channels_first,
            clip_values=estimator.clip_values,
            preprocessing_defences=estimator.preprocessing_defences,
            postprocessing_defences=estimator.postprocessing_defences,
            preprocessing=estimator.preprocessing,
        )

    elif isinstance(estimator, PyTorchClassifier):
        import torch

        if loss_type == "cross_entropy":
            if is_probability(
                estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=np.float32))
            ):
                raise ValueError(
                    "The provided estimator seems to predict probabilities. If loss_type='cross_entropy' "
                    "the estimator has to to predict logits."
                )
            else:

                def loss_fn(y_true, y_pred):
                    # torch's CrossEntropyLoss takes class indices as targets, hence the argmax over
                    # the label rows.
                    return torch.nn.CrossEntropyLoss()(
                        torch.from_numpy(y_pred), torch.from_numpy(np.argmax(y_true, axis=1))
                    )

                self._loss_fn = loss_fn
                self._loss_object = torch.nn.CrossEntropyLoss()
        elif loss_type == "difference_logits_ratio":
            if is_probability(
                estimator.predict(x=np.ones(shape=(1, *estimator.input_shape), dtype=ART_NUMPY_DTYPE))
            ):
                raise ValueError(
                    "The provided estimator seems to predict probabilities. If loss_type='difference_logits_ratio' "
                    "the estimator has to to predict logits."
                )
            else:

                # Parameter order is (y_pred, y_true), i.e. prediction first, unlike the TensorFlow variants.
                def difference_logits_ratio(y_pred, y_true):  # type: ignore
                    # Accept NumPy arrays as well as tensors.
                    if isinstance(y_true, np.ndarray):
                        y_true = torch.from_numpy(y_true)
                    if isinstance(y_pred, np.ndarray):
                        y_pred = torch.from_numpy(y_pred)

                    y_true = y_true.float()

                    i_y_true = torch.argmax(y_true, axis=1)
                    # Ascending argsort: column -1 is the largest logit, -2 the runner-up, -3 the third.
                    i_y_pred_arg = torch.argsort(y_pred, axis=1)
                    # Per sample: index of the largest logit that does not belong to the true class.
                    i_z_i_list = []

                    for i in range(y_true.shape[0]):
                        if i_y_pred_arg[i, -1] != i_y_true[i]:
                            i_z_i_list.append(i_y_pred_arg[i, -1])
                        else:
                            i_z_i_list.append(i_y_pred_arg[i, -2])

                    i_z_i = torch.stack(i_z_i_list)

                    z_1 = y_pred[:, i_y_pred_arg[:, -1]]
                    z_3 = y_pred[:, i_y_pred_arg[:, -3]]
                    z_i = y_pred[:, i_z_i]
                    z_y = y_pred[:, i_y_true]

                    z_1 = torch.diagonal(z_1)
                    z_3 = torch.diagonal(z_3)
                    z_i = torch.diagonal(z_i)
                    z_y = torch.diagonal(z_y)

                    dlr = -(z_y - z_i) / (z_1 - z_3)

                    return torch.mean(dlr.float())

                self._loss_fn = difference_logits_ratio
                self._loss_object = difference_logits_ratio
        elif loss_type is None:
            self._loss_object = estimator._loss_object
        else:
            raise ValueError(
                "The argument loss_type has an invalid value. The following options for loss_type are "
                "supported: {}".format([None, "cross_entropy", "difference_logits_ratio"])
            )

        estimator_apgd = PyTorchClassifier(
            model=estimator.model,
            loss=self._loss_object,
            input_shape=estimator.input_shape,
            nb_classes=estimator.nb_classes,
            optimizer=None,
            channels_first=estimator.channels_first,
            clip_values=estimator.clip_values,
            preprocessing_defences=estimator.preprocessing_defences,
            postprocessing_defences=estimator.postprocessing_defences,
            preprocessing=estimator.preprocessing,
            # Pass the device in string form ("cpu", "cuda:0", ...) rather than the device object
            # itself. NOTE(review): PyTorchClassifier is understood to compare device_type against
            # the string "cpu" — confirm against its constructor.
            device_type=str(estimator._device),
        )

    else:
        # Unsupported estimator type: no wrapper can be built, so None is handed to the parent
        # constructor as before.
        estimator_apgd = None

    super().__init__(estimator=estimator_apgd)
    self.norm = norm
    self.eps = eps
    self.eps_step = eps_step
    self.max_iter = max_iter
    self.targeted = targeted
    self.nb_random_init = nb_random_init
    self.batch_size = batch_size
    self.loss_type = loss_type
    self._check_params()
def test_1_tf(self):
    """
    Test with a TensorFlow Classifier.

    Builds a small convolutional Keras model, wraps it in a `TensorFlowV2Classifier` and a
    `DeepPartitionEnsemble`, then checks `predict`, `loss_gradient` and `fit`. The body only runs
    under TensorFlow 2; for any other major version the test returns without asserting anything.

    :return:
    """
    # Only the major version is needed. Parsing just the first component keeps the check working for
    # version strings with suffixes such as "2.4.0-rc1" or "2.3.0+cpu", where converting every
    # dot-separated component to int would raise ValueError.
    tf_major_version = int(tf.__version__.split(".")[0])
    if tf_major_version != 2:
        return

    # Get MNIST
    (x_train, y_train), (x_test, y_test) = self.mnist

    # Create a model from scratch
    from tensorflow.keras import Model
    from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D

    class TensorFlowModel(Model):
        """
        Standard TensorFlow model for unit testing.
        """

        def __init__(self):
            super(TensorFlowModel, self).__init__()
            self.conv1 = Conv2D(filters=4, kernel_size=5, activation="relu")
            self.conv2 = Conv2D(filters=10, kernel_size=5, activation="relu")
            self.maxpool = MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="valid", data_format=None)
            self.flatten = Flatten()
            self.dense1 = Dense(100, activation="relu")
            self.logits = Dense(10, activation="linear")

        def call(self, x):
            """
            Call function to evaluate the model.

            :param x: Input to the model
            :return: Prediction of the model
            """
            x = self.conv1(x)
            x = self.maxpool(x)
            x = self.conv2(x)
            x = self.maxpool(x)
            x = self.flatten(x)
            x = self.dense1(x)
            x = self.logits(x)
            return x

    optimizer = Adam(learning_rate=0.01)

    def train_step(model, images, labels):
        # `loss_object` is bound further down; it is resolved when train_step is called.
        with tf.GradientTape() as tape:
            predictions = model(images, training=True)
            loss = loss_object(labels, predictions)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    model = TensorFlowModel()
    loss_object = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

    classifier = TensorFlowV2Classifier(
        model=model,
        loss_object=loss_object,
        train_step=train_step,
        nb_classes=10,
        input_shape=(28, 28, 1),
        clip_values=(0, 1),
    )

    # Initialize DPA Classifier
    dpa = DeepPartitionEnsemble(
        classifiers=classifier,
        ensemble_size=ENSEMBLE_SIZE,
        channels_first=classifier.channels_first,
        clip_values=classifier.clip_values,
        preprocessing_defences=classifier.preprocessing_defences,
        postprocessing_defences=classifier.postprocessing_defences,
        preprocessing=classifier.preprocessing,
    )

    # Check basic functionality of DPA Classifier
    # check predict
    y_test_dpa = dpa.predict(x=x_test)
    self.assertEqual(y_test_dpa.shape, y_test.shape)
    # Each row of the ensemble output must sum to at most ENSEMBLE_SIZE.
    self.assertTrue((np.sum(y_test_dpa, axis=1) <= ENSEMBLE_SIZE * np.ones((NB_TEST,))).all())

    # loss gradient
    grad = dpa.loss_gradient(x=x_test, y=y_test, sampling=True)
    # unittest assertion instead of a bare assert: not stripped under -O and reports both shapes on failure.
    self.assertEqual(grad.shape, (10, 28, 28, 1))

    # fit
    dpa.fit(x=x_train, y=y_train)