Example #1
    def get_deepfool(self,
                     sess,
                     x,
                     predictions,
                     logits,
                     sample,
                     nb_candidate=10,
                     overshoot=0.03,
                     max_iter=30,
                     feed=None):
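        """
        Iterative DeepFool-style attack (variant without intermediate
        clipping). Following the companion deepfool_attack_L2 below,
        `predictions` is assumed to hold the model's top-`nb_candidate`
        logits and `self.grads` their symbolic gradients; `feed` is an
        optional extra feed dict.
        """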
        adv_x = copy.copy(sample)
        if feed is None:
            feed = {}  # feed is updated in place below, so it must be a dict
        # Initialize the loop variables
        current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
        if current.shape == ():
            current = np.array([current])
        w = np.squeeze(np.zeros(
            sample.shape[1:4]))  # same shape as original image
        r_tot = np.zeros(sample.shape)
        original = current  # use original label as the reference

        iteration = 0
        # Repeat this main loop until we have achieved misclassification
        while (np.any(current == original) and iteration < max_iter):
            feed.update({x: adv_x})
            gradients, predictions_val = sess.run([self.grads, predictions],
                                                  feed_dict=feed)
            for idx in range(sample.shape[0]):
                pert = np.inf
                if current[idx] != original[idx]:
                    continue
                for k in range(1, nb_candidate):
                    w_k = gradients[k][idx, ...] - gradients[0][idx, ...]
                    f_k = predictions_val[idx, k] - predictions_val[idx, 0]
                    # add a tiny constant (1e-30) to avoid a zero
                    # perturbation when f_k = 0
                    pert_k = (abs(f_k) + 1e-30) / np.linalg.norm(w_k.flatten())
                    if pert_k < pert:
                        pert = pert_k
                        w = w_k
                r_i = pert * w / np.linalg.norm(w)
                r_tot[idx, ...] = r_tot[idx, ...] + r_i

            # adv_x = np.clip(r_tot + sample, clip_min, clip_max)
            adv_x = r_tot + sample
            feed.update({x: adv_x})

            current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
            if current.shape == ():
                current = np.array([current])
            # Update loop variables
            iteration = iteration + 1

        # need to clip this image into the given range
        # adv_x = np.clip((1+overshoot)*r_tot + sample, clip_min, clip_max)
        adv_x = (1 + overshoot) * r_tot + sample
        return adv_x
def independent_single(x_test_cc):

    (x_train, y_train), (x_test, y_test) = cifar10.load_data()
    x_test = x_test.astype('float32') / 255
    x_train = x_train.astype('float32') / 255
    input_shape = x_train.shape[1:]

    sess = tf.Session()
    keras.backend.set_session(sess)

    model_input = Input(shape=input_shape)  
    model_dic = {}
    model_out = []
    model_logits = []
    for i in range(3):
        model_dic[str(i)] = lenet_v1(X_input=model_input, num_classes=10)
        model_out.append(model_dic[str(i)][3])
        model_logits.append(model_dic[str(i)][2])

    model = Model(inputs=model_input, outputs=model_out)

    model.load_weights(filepath)
    pred = model(model_input)

    final_pred_list = []
    clean_pred_list = []
    confidence_list = []
    entropy_list = []
    for i in range(N_numbers):
        #sess.run(tf.global_variables_initializer())
        # f = sess.run(final_features, feed_dict={model_input: x_test_cc})   # features


        # confidence / cross_entropy
        # en = -np.sum(soft * np.log2(soft))
        # entropy_list.append(en)
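        # With labels = softmax output p and logits = the matching pre-softmax
        # values, the cross-entropy below reduces to -sum_i p_i * log(p_i),
        # i.e. the predictive entropy of model i's output.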
        predictive_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=model_out[i], logits=model_logits[i])
        confidence = tf.math.reduce_max(model_out[i], axis=-1)
        pe_adv, conf_adv = sess.run([predictive_entropy, confidence], feed_dict={model_input: x_test_cc})
        entropy_list.append(pe_adv)
        confidence_list.append(conf_adv)

        # prediction
        final_pred = model_argmax(sess, model_input, pred[i], samples=x_test_cc)
        clean_pred = model_argmax(sess, model_input, pred[i], samples=x_test)
        final_pred_list.append(final_pred)
        clean_pred_list.append(clean_pred)

    return y_test, clean_pred_list, final_pred_list, entropy_list, confidence_list
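
A minimal call sketch; `x_adv` is hypothetical and stands for a batch of perturbed CIFAR-10 inputs with the same shape and scaling as x_test:

y_true, clean_preds, adv_preds, entropies, confidences = independent_single(x_adv)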
Example #3
def generate_attacks(save_path, file_path, x_set, y_set, attack, gamma,
                     first_index, last_index):
    """
    Applies the saliency map attack against the specified model.

    Parameters
    ----------
    save_path: str
        The path of the folder in which the crafted adversarial samples will be saved.
    file_path: str
        The path to the joblib file of the model to attack.
    x_set: numpy.ndarray
        The dataset input array.
    y_set: numpy.ndarray
        The dataset output array.
    attack: str
        The type of attack used (either "jsma", "wjsma" or "tjsma").
    gamma: float
        The maximum percentage of perturbed features.
    first_index: int
        The index of the first image attacked.
    last_index: int
        The index of the last image attacked.
    """

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    sess = tf.Session()

    img_rows, img_cols, channels = x_set.shape[1:4]
    nb_classes = y_set.shape[1]

    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))

    with sess.as_default():
        model = load(file_path)

    assert len(model.get_params()) > 0

    # Attack parameters. See SaliencyMapMethod for more information
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1,
        'gamma': gamma,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None,
        'attack': attack
    }

    preds = model(x)

    for sample_ind in range(first_index, last_index):
        results = pd.DataFrame()

        print('Attacking input %i/%i' % (sample_ind + 1, last_index))

        sample = x_set[sample_ind:(sample_ind + 1)]
        current_class = int(np.argmax(y_set[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        for target in target_classes:
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x, predictions = jsma.generate_np(sample, **jsma_params)

            res = int(model_argmax(sess, x, preds, adv_x) == target)

            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_set[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

        results['number_' + str(sample_ind) + '_' + str(current_class) + '_to_' + str(target)] = \
            np.concatenate((adv_x_reshape,
                            np.array([nb_changed, percent_perturb, res])))

        sample_vector = sample.reshape(-1)
        shape1 = sample_vector.shape[0]
        shape2 = results.shape[0]

        results['original_image_' + str(sample_ind)] = \
            np.concatenate((sample.reshape(-1), np.zeros((shape2 - shape1,))))

        results.to_csv(save_path + '/' + attack + '_image_' + str(sample_ind) +
                       '.csv',
                       index=False)
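
A hedged invocation sketch; the paths and array names here are illustrative, not part of the original script:

# x_set: (N, rows, cols, channels) float32 in [0, 1]; y_set: (N, nb_classes) one-hot.
generate_attacks(save_path='adv_out', file_path='models/cifar10.joblib',
                 x_set=x_test, y_set=y_test, attack='wjsma',
                 gamma=0.1, first_index=0, last_index=10)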
Example #4
def jsma(sess,
         x,
         predictions,
         grads,
         sample,
         target,
         theta,
         gamma,
         increase,
         nb_classes,
         clip_min,
         clip_max,
         verbose=False):
    """
    TensorFlow implementation of the Jacobian-based Saliency Map Attack (JSMA).
    :param sess: TF session
    :param x: the input placeholder
    :param predictions: the model's symbolic output (linear output,
        pre-softmax)
    :param grads: symbolic gradients of the model's predictions with
        respect to x
    :param sample: numpy array with sample input
    :param target: target class for sample input
    :param theta: delta for each feature adjustment
    :param gamma: a float between 0 and 1 indicating the maximum distortion
        percentage
    :param increase: boolean; true if we are increasing pixels, false otherwise
    :param nb_classes: integer indicating the number of classes in the model
    :param clip_min: optional parameter that can be used to set a minimum
                    value for components of the example returned
    :param clip_max: optional parameter that can be used to set a maximum
                    value for components of the example returned
    :param verbose: boolean; whether to print status updates or not
    :return: an adversarial sample
    """

    # Copy the source sample and define the maximum number of features
    # (i.e. the maximum number of iterations) that we may perturb
    adv_x = copy.copy(sample)
    # count the number of features. For MNIST, 1x28x28 = 784; for
    # CIFAR, 3x32x32 = 3072; etc.
    nb_features = np.prod(adv_x.shape[1:])
    # reshape sample for sake of standardization
    original_shape = adv_x.shape
    adv_x = np.reshape(adv_x, (1, nb_features))
    # compute maximum number of iterations
    max_iters = np.floor(nb_features * gamma / 2)
    if verbose:
        print('Maximum number of iterations: {0}'.format(max_iters))

    # Compute our initial search domain. We optimize the initial search domain
    # by removing all features that are already at their maximum values (if
    # increasing input features---otherwise, at their minimum value).
    if increase:
        search_domain = set(
            [i for i in xrange(nb_features) if adv_x[0, i] < clip_max])
    else:
        search_domain = set(
            [i for i in xrange(nb_features) if adv_x[0, i] > clip_min])

    # Initialize the loop variables
    iteration = 0
    adv_x_original_shape = np.reshape(adv_x, original_shape)
    current = model_argmax(sess,
                           x,
                           predictions,
                           adv_x_original_shape,
                           feed={K.learning_phase(): 0})

    # Repeat this main loop until we have achieved misclassification
    while (current != target and iteration < max_iters
           and len(search_domain) > 1):
        # Reshape the adversarial example
        adv_x_original_shape = np.reshape(adv_x, original_shape)

        # Compute the Jacobian components
        grads_target, grads_others = jacobian(sess,
                                              x,
                                              grads,
                                              target,
                                              adv_x_original_shape,
                                              nb_features,
                                              nb_classes,
                                              feed={K.learning_phase(): 0})

        # Compute the saliency map for each of our target classes
        # and return the two best candidate features for perturbation
        i, j, search_domain = saliency_map(grads_target, grads_others,
                                           search_domain, increase)

        # Apply the perturbation to the two input features selected previously
        adv_x = apply_perturbations(i, j, adv_x, increase, theta, clip_min,
                                    clip_max)

        # Update our current prediction by querying the model
        current = model_argmax(sess,
                               x,
                               predictions,
                               adv_x_original_shape,
                               feed={K.learning_phase(): 0})

        # Update loop variables
        iteration += 1

        # This process may take a while, so outputting progress regularly
        if iteration % 5 == 0 and verbose:
            msg = 'Current iteration: {0} - Current Prediction: {1}'
            print(msg.format(iteration, current))

    # Compute the ratio of pixels perturbed by the algorithm
    percent_perturbed = float(iteration * 2) / nb_features

    # Report success when the adversarial example is misclassified in the
    # target class
    if current == target:
        if verbose:
            print('Successful')
        return np.reshape(adv_x, original_shape), 1, percent_perturbed
    else:
        if verbose:
            print('Unsuccessful')
        return np.reshape(adv_x, original_shape), 0, percent_perturbed
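
A hedged wiring sketch: in this CleverHans-era API, `grads` is typically the list of per-class gradient ops built by the companion jacobian_graph helper; the call below is illustrative:

# grads = jacobian_graph(predictions, x, nb_classes)
# adv, success, pct = jsma(sess, x, predictions, grads, sample, target=2,
#                          theta=1., gamma=0.1, increase=True, nb_classes=10,
#                          clip_min=0., clip_max=1.)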
Example #5
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   train_dir="/tmp",
                   filename="mnist.ckpt",
                   load_model=False,
                   testing=False):
    keras.layers.core.K.set_learning_phase(0)
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")
    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)
    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         predictions,
                         X_test,
                         Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }

    # Train an MNIST model
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess,
                    x,
                    y,
                    predictions,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    save=True,
                    rng=rng)

    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[5500:]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    for i in xrange(4500):
        normalization(adv_x[i:(i + 1)])

    original_classified_wrong_number = 0
    disturbed_failure_number = 0
    test_number = 0
    TTP = 0
    TP = 0
    FN = 0
    FP = 0

    for i in range(len(adv_x)):
        current_class = int(np.argmax(Y_test[i + 5500]))
        oriPreTimeStart = time.time()
        currentXLabel = model_argmax(sess, x, predictions,
                                     X_test[i + 5500:(i + 5501)])
        currentXProbList = my_model_argmax(sess, x, predictions,
                                           X_test[i + 5500:(i + 5501)])
        oriPreTimeEnd = time.time()
        oriPreTime = oriPreTimeEnd - oriPreTimeStart
        if currentXLabel != current_class:
            original_classified_wrong_number += 1
            continue

        advPreTimeStart = time.time()
        currentAdvXLabel = model_argmax(sess, x, predictions, adv_x[i:(i + 1)])
        currentAdvXProbList = my_model_argmax(sess, x, predictions,
                                              adv_x[i:(i + 1)])
        advPreTimeEnd = time.time()
        advPreTime = advPreTimeEnd - advPreTimeStart

        if currentAdvXLabel == currentXLabel:
            disturbed_failure_number += 1
            continue

#         fig = plt.figure('test')
#         picOne = fig.add_subplot(121)
#         picOne.imshow(X_test[i+5500:(i+5501)].reshape((28,28)), cmap='gray')
#         picTwo = fig.add_subplot(122)
#         picTwo.imshow(adv_x[i:(i+1)].reshape((28,28)), cmap='gray')
#         plt.show()

        test_number += 1

        print('probabilities = %.4f ; %.4f' %
              (currentXProbList[currentXLabel],
               currentAdvXProbList[currentAdvXLabel]))

        tempX = np.reshape(X_test[i + 5500:(i + 5501)], (28, 28))
        test_x = np.array(tempX)

        oriFilteredPreTimeStart = time.time()
        currentX = np.reshape(X_test[i + 5500:(i + 5501)], (28, 28))
        imageEntropy = oneDEntropy(test_x)
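        # Entropy-adaptive denoising: low-entropy images get coarse scalar
        # quantization, mid-entropy images a finer one, and high-entropy
        # images additionally get a cross mean filter, keeping whichever
        # filtered version stays closer to the input.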
        if imageEntropy < 4:
            current_x_res = scalarQuantization(currentX, 128)
        elif imageEntropy < 5:
            current_x_res = scalarQuantization(currentX, 64)
        else:
            current_x_ASQ = scalarQuantization(currentX, 43)
            current_x_ASQ_AMF = crossMeanFilterOperations(
                current_x_ASQ, 3, 25, 13)
            current_x_res = chooseCloserFilter(currentX, current_x_ASQ,
                                               current_x_ASQ_AMF)
        current_x_res = np.reshape(current_x_res, X_test[0:1].shape)
        current_x_res_label = model_argmax(sess, x, predictions, current_x_res)
        current_x_res_prob = my_model_argmax(sess, x, predictions,
                                             current_x_res)

        tempX2 = np.reshape(adv_x[i:(i + 1)], (28, 28))
        test_adv_x = np.array(tempX2)
        currentAdvX = np.reshape(adv_x[i:(i + 1)], (28, 28))
        imageEntropy2 = oneDEntropy(test_adv_x)
        print('%d: %.2f------%.2f' % (i, imageEntropy, imageEntropy2))
        if imageEntropy2 < 4:
            current_adv_x_res = scalarQuantization(currentAdvX, 128)
        elif imageEntropy2 < 5:
            current_adv_x_res = scalarQuantization(currentAdvX, 64)
        else:
            current_adv_x_ASQ = scalarQuantization(currentAdvX, 43)
            current_adv_x_ASQ_AMF = crossMeanFilterOperations(
                current_adv_x_ASQ, 3, 25, 13)
            current_adv_x_res = chooseCloserFilter(currentAdvX,
                                                   current_adv_x_ASQ,
                                                   current_adv_x_ASQ_AMF)
        current_adv_x_res = np.reshape(current_adv_x_res, X_test[0:1].shape)
        current_adv_x_res_label = model_argmax(sess, x, predictions,
                                               current_adv_x_res)
        current_adv_x_res_prob = my_model_argmax(sess, x, predictions,
                                                 current_adv_x_res)

        print('filtered Probs = %.4f ; %.4f' %
              (current_x_res_prob[current_x_res_label],
               current_adv_x_res_prob[current_adv_x_res_label]))

        if current_adv_x_res_label != currentAdvXLabel:
            TP += 1
            if current_adv_x_res_label == current_class:
                TTP += 1
        else:
            FN += 1
        if current_x_res_label != currentXLabel:
            FP += 1
        str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
            test_number, original_classified_wrong_number,
            disturbed_failure_number, TP, FN, FP, TTP)
        print(str1)

    Recall = TP / (TP + FN)
    Precision = TP / (TP + FP)
    tempStarStr = '********************************************************'
    recallStr = 'Recall = %.4f' % (Recall)
    precisionStr = 'Precision = %.4f' % (Precision)
    print(tempStarStr)
    print(recallStr)
    print(precisionStr)
    print(tempStarStr)
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   train_dir="/tmp",
                   filename="mnist.ckpt",
                   load_model=False,
                   testing=False):
    keras.layers.core.K.set_learning_phase(0)
    report = AccuracyReport()
    tf.set_random_seed(1234)
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         predictions,
                         X_test,
                         Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    # Train an MNIST model
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess,
                    x,
                    y,
                    predictions,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    save=True,
                    rng=rng)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[:4500]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    for i in xrange(4500):
        normalization(adv_x[i:(i + 1)])

    print('adversarial examples generation time = ', advGenTime, 'seconds')

    intervals = [128, 85, 64, 51, 43, 37, 32, 28, 26]
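    # Interval widths correspond to 2..10 quantization levels on [0, 255]:
    # intervals[k] is roughly 256 / (k + 2), matching the NBinterval printout.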

    for intervalIndex in range(9):
        startTime = time.time()
        print('NBinterval =  ', intervalIndex + 2, '; interval size = ',
              intervals[intervalIndex])
        original_classified_wrong_number = 0
        disturbed_failure_number = 0
        test_number = 0
        TTP = 0
        TP = 0
        FN = 0
        FP = 0

        for i in range(1000):
            current_class = int(np.argmax(Y_test[i]))

            currentXLabel = model_argmax(sess, x, predictions,
                                         X_test[i:(i + 1)])
            if currentXLabel != current_class:
                original_classified_wrong_number += 1
                continue

            currentAdvXLabel = model_argmax(sess, x, predictions,
                                            adv_x[i:(i + 1)])
            if currentAdvXLabel == currentXLabel:
                disturbed_failure_number += 1
                continue

            test_number += 1

            currentX = np.reshape(X_test[i:(i + 1)], (28, 28))
            currentX = scalarQuantization(currentX, intervals[intervalIndex])
            currentX = np.reshape(currentX, X_test[i:(i + 1)].shape)
            currentXFilteredLabel = model_argmax(sess, x, predictions,
                                                 currentX)

            currentAdvX = np.reshape(adv_x[i:(i + 1)], (28, 28))
            currentAdvX = scalarQuantization(currentAdvX,
                                             intervals[intervalIndex])
            currentAdvX = np.reshape(currentAdvX, X_test[i:(i + 1)].shape)
            currentAdvXFilteredLabel = model_argmax(sess, x, predictions,
                                                    currentAdvX)

            if currentAdvXFilteredLabel != currentAdvXLabel:
                TP += 1
                if currentAdvXFilteredLabel == current_class:
                    TTP += 1
            else:
                FN += 1
            if currentXFilteredLabel != currentXLabel:
                FP += 1

            if (i + 1) % 1000 == 0:
                str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
                    test_number, original_classified_wrong_number,
                    disturbed_failure_number, TP, FN, FP, TTP)
                print(str1)

        str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
            test_number, original_classified_wrong_number,
            disturbed_failure_number, TP, FN, FP, TTP)
        print(str1)

        endTime = time.time()
        print('lasting ', endTime - startTime, 'seconds')
        Recall = TP / (TP + FN)
        Precision = TP / (TP + FP)
        tempStarStr = '********************************************************'
        recallStr = 'Recall = %.4f' % (Recall)
        precisionStr = 'Precision = %.4f' % (Precision)
        print(tempStarStr)
        print(recallStr)
        print(precisionStr)
        print(tempStarStr)

    return report
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial
    :return:
    """

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")

    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    assert Y_train.shape[1] == 10.
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 32, 32, 3))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model(img_rows=32, img_cols=32, channels=3)
    predictions = model(x)
    print "Defined TensorFlow model graph."

    def evaluate():
        # Evaluate the accuracy of the CIFAR10 model on legitimate test
        # examples
        eval_params = {'batch_size': FLAGS.batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              predictions,
                              X_test,
                              Y_test,
                              args=eval_params)
        assert X_test.shape[0] == 10000, X_test.shape
        print('Test accuracy on legitimate test examples: ' + str(accuracy))

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': FLAGS.nb_epochs,
        'batch_size': FLAGS.batch_size,
        'learning_rate': FLAGS.learning_rate
    }
    model_train(sess,
                x,
                y,
                predictions,
                X_train,
                Y_train,
                evaluate=evaluate,
                args=train_params)

    # Craft adversarial examples using Fast Gradient Sign Method (FGSM)
    # adv_x = fgsm(x, predictions, eps=0.3)

    mim = MIM(model, back='tf', sess=sess)
    mim_params = {
        'eps_iter': 0.06,
        'eps': 0.3,
        'nb_iter': 10,
        'ord': 2,
        'decay_factor': 1.0
    }

    adv_x = mim.generate(x, **mim_params)

    eval_params = {'batch_size': FLAGS.batch_size}
    X_test_adv, = batch_eval(sess, [x], [adv_x], [X_test], args=eval_params)
    assert X_test_adv.shape[0] == 10000, X_test_adv.shape
    accuracy = model_eval(sess,
                          x,
                          y,
                          predictions,
                          X_test_adv,
                          Y_test,
                          args=eval_params)
    print('Test accuracy on adversarial examples: ' + str(accuracy))

    from scipy.misc import imsave
    path = '/home/neale/repos/adversarial-toolbox/images/adversarials/mim/cifar/symmetric/'
    """
    for i, (real, adv) in enumerate(zip(X_test, X_test_adv)):
        imsave(path+'adv/adv_{}.png'.format(i), adv)
    """
    preds = model_argmax(sess, x, predictions, X_test_adv)
    print(Y_test.shape)
    print(preds.shape)
    count = 0
    for i in range(len(preds)):
        if np.argmax(Y_test[i]) == preds[i]:
            # imsave(path+'real/im_{}.png'.format(i), X_test[i])
            # imsave(path+'adv/adv_{}.png'.format(i), X_test_adv[i])
            count += 1
    print "saved ", count
Example #8
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001, train_dir="/tmp",
                   filename="mnist.ckpt", load_model=False,
                   testing=False):
    keras.layers.core.K.set_learning_phase(0)
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")
    # Create TF session and set as Keras backend session
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)
    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)
    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))
    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, predictions, X_test, Y_test, args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)
    
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    
    # Train an MNIST model
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess, x, y, predictions, X_train, Y_train, evaluate=evaluate,
                    args=train_params, save=True, rng=rng)

    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2,
                   'clip_min': 0.,
                   'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
    adv_x = sess.run(adv_x, feed_dict={x: X_test[4500:5500]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    for i in xrange(1000):
        normalization(adv_x[i:(i + 1)])
    
    original_classified_wrong_number = 0
    disturbed_failure_number = 0
    NbLowEntropy = 0
    NbMidEntropy = 0
    NbHighEntropy = 0
    lowTP = 0
    lowFN = 0
    lowFP = 0
    midTP = 0
    midFN = 0
    midFP = 0
    highTP = 0
    highFN = 0
    highFP = 0
    
    for i in range(len(adv_x)):
        current_class = int(np.argmax(Y_test[4500 + i]))
        oriPreTimeStart = time.time()
        currentXLabel = model_argmax(sess, x, predictions,
                                     X_test[i + 4500:(i + 4501)])
        currentXProbList = my_model_argmax(sess, x, predictions,
                                           X_test[i + 4500:(i + 4501)])
        oriPreTimeEnd = time.time()
        oriPreTime = oriPreTimeEnd - oriPreTimeStart
        if currentXLabel != current_class:
            original_classified_wrong_number += 1
            continue

        advPreTimeStart = time.time()
        currentAdvXLabel = model_argmax(sess, x, predictions, adv_x[i:(i + 1)])
        currentAdvXProbList = my_model_argmax(sess, x, predictions,
                                              adv_x[i:(i + 1)])
        advPreTimeEnd = time.time()
        advPreTime = advPreTimeEnd - advPreTimeStart

        if currentAdvXLabel == currentXLabel:
            disturbed_failure_number += 1
            continue

        tempX = np.reshape(X_test[i + 4500:(i + 4501)], (28, 28))
        test_x = np.array(tempX)

        oriFilteredPreTimeStart = time.time()
        currentX = np.reshape(X_test[i + 4500:(i + 4501)], (28, 28))
        imageEntropy = oneDEntropy(test_x)
        if imageEntropy < 4:
            NbLowEntropy += 1
            current_x_res = scalarQuantization(currentX, 128)
            current_x_res = np.reshape(current_x_res, X_test[0:1].shape)
            current_x_res_label = model_argmax(sess, x, predictions,
                                               current_x_res)
            if current_x_res_label != current_class:
                lowFP += 1
        elif imageEntropy < 5:
            NbMidEntropy += 1
            current_x_res = scalarQuantization(currentX, 64)
            current_x_res = np.reshape(current_x_res, X_test[0:1].shape)
            current_x_res_label = model_argmax(sess, x, predictions,
                                               current_x_res)
            if current_x_res_label != current_class:
                midFP += 1
        else:
            NbHighEntropy += 1
            current_x_res = scalarQuantization(currentX, 43)
            current_x_res = np.reshape(current_x_res, X_test[0:1].shape)
            current_x_res_label = model_argmax(sess, x, predictions,
                                               current_x_res)
            if current_x_res_label != current_class:
                highFP += 1

        
        tempX2 = np.reshape(adv_x[i:(i + 1)], (28, 28))
        test_adv_x = np.array(tempX2)
        currentAdvX = np.reshape(adv_x[i:(i + 1)], (28, 28))
        imageEntropy2 = oneDEntropy(test_adv_x)
        print('%d: %.2f------%.2f' % (i, imageEntropy, imageEntropy2))
        if imageEntropy2 < 4:
            NbLowEntropy += 1
            current_adv_x_res = scalarQuantization(currentAdvX, 128)
            current_adv_x_res = np.reshape(current_adv_x_res,
                                           X_test[0:1].shape)
            current_adv_x_res_label = model_argmax(sess, x, predictions,
                                                   current_adv_x_res)
            if current_adv_x_res_label != currentAdvXLabel:
                lowTP += 1
            else:
                lowFN += 1
        elif imageEntropy2 < 5:
            NbMidEntropy += 1
            current_adv_x_res = scalarQuantization(currentAdvX, 64)
            current_adv_x_res = np.reshape(current_adv_x_res,
                                           X_test[0:1].shape)
            current_adv_x_res_label = model_argmax(sess, x, predictions,
                                                   current_adv_x_res)
            if current_adv_x_res_label != currentAdvXLabel:
                midTP += 1
            else:
                midFN += 1
        else:
            NbHighEntropy += 1
            current_adv_x_res = scalarQuantization(currentAdvX, 43)
            current_adv_x_res = np.reshape(current_adv_x_res,
                                           X_test[0:1].shape)
            current_adv_x_res_label = model_argmax(sess, x, predictions,
                                                   current_adv_x_res)
            if current_adv_x_res_label != currentAdvXLabel:
                highTP += 1
            else:
                highFN += 1

        str1 = '%d-%d' % (original_classified_wrong_number,
                          disturbed_failure_number)
        lowstr = '%d : lowTP = %d; lowFN = %d; lowFP = %d' % (
            NbLowEntropy, lowTP, lowFN, lowFP)
        midstr = '%d : midTP = %d; midFN = %d; midFP = %d' % (
            NbMidEntropy, midTP, midFN, midFP)
        highstr = '%d : highTP = %d; highFN = %d; highFP = %d' % (
            NbHighEntropy, highTP, highFN, highFP)
        print(str1)
        print(lowstr)
        print(midstr)
        print(highstr)
    
    lowRecall = lowTP * 1.0 / (lowTP + lowFN)
    lowPrecision = lowTP * 1.0 / (lowTP + lowFP)
    midRecall = midTP * 1.0 / (midTP + midFN)
    midPrecision = midTP * 1.0 / (midTP + midFP)
    highRecall = highTP * 1.0 / (highTP + highFN)
    highPrecision = highTP * 1.0 / (highTP + highFP)

    print ("lowRecall: ",lowRecall)
    print ("lowPrecision: ",lowPrecision)
    print ("midRecall: ",midRecall)
    print ("midPrecision: ",midPrecision)   
    print ("highRecall: ",highRecall)
    print ("highPrecision: ",highPrecision)
def deepfool_attack_L2(sess,
                       x,
                       predictions,
                       logits,
                       grads,
                       sample,
                       nb_candidate,
                       overshoot,
                       max_iter,
                       clip_min,
                       clip_max,
                       feed=None):
    """
    TensorFlow implementation of DeepFool.
    Paper link: https://arxiv.org/pdf/1511.04599.pdf. (As noted in "On
    Detecting Adversarial Perturbations", DeepFool has L2 and L-infinity
    versions.)
    :param sess: TF session
    :param x: The input placeholder
    :param predictions: The model's sorted symbolic output of logits, only the
                       top nb_candidate classes are contained
    :param logits: The model's unnormalized output tensor (the input to
                   the softmax layer)
    :param grads: Symbolic gradients of the top nb_candidate classes, produced
                 from gradient_graph
    :param sample: Numpy array with sample input
    :param nb_candidate: The number of classes to test against, i.e.,
                         DeepFool only considers nb_candidate classes when
                         attacking (which speeds it up). These classes are
                         chosen according to the prediction confidence at
                         attack time.
    :param overshoot: A termination criterion to prevent vanishing updates
    :param max_iter: Maximum number of iteration for DeepFool
    :param clip_min: Minimum value for components of the example returned
    :param clip_max: Maximum value for components of the example returned
    :return: Adversarial examples
    """
    adv_x = copy.copy(sample)
    # Initialize the loop variables
    iteration = 0
    current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
    if current.shape == ():
        current = np.array([current])
    w = np.squeeze(np.zeros(sample.shape[1:]))  # same shape as original image
    r_tot = np.zeros(sample.shape)
    original = current  # use original label as the reference

    _logger.debug(
        "Starting DeepFool attack up to %s iterations", max_iter)
    # Repeat this main loop until we have achieved misclassification
    while (np.any(current == original) and iteration < max_iter):

        # if iteration % 5 == 0 and iteration > 0:
        #     _logger.info("Attack result at iteration %s is %s", iteration, current)
        gradients, predictions_val = sess.run([grads, predictions],
                                              feed_dict={x: adv_x})
        for idx in range(sample.shape[0]):
            pert = np.inf
            if current[idx] != original[idx]:
                continue
            for k in range(1, nb_candidate):
                w_k = gradients[idx, k, ...] - gradients[idx, 0, ...]
                f_k = predictions_val[idx, k] - predictions_val[idx, 0]
                # adding value 0.00001 to prevent f_k = 0
                pert_k = (abs(f_k) + 0.00001) / np.linalg.norm(w_k.flatten())
                if pert_k < pert:
                    pert = pert_k
                    w = w_k
            r_i = pert * w / np.linalg.norm(w.flatten())
            r_tot[idx, ...] = r_tot[idx, ...] + r_i

        adv_x = np.clip(r_tot + sample, clip_min, clip_max)
        current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
        if current.shape == ():
            current = np.array([current])
        # Update loop variables
        iteration = iteration + 1

    # need more revision, including info like how many succeed
    # _logger.info("Attack result at iteration %s is %s", iteration, current)
    _logger.info("%s out of %s become adversarial examples at iteration %s",
                 sum(current != original),
                 sample.shape[0],
                 iteration)
    # need to clip this image into the given range
    adv_x = np.clip((1 + overshoot) * r_tot + sample, clip_min, clip_max)
    return adv_x
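
A hedged setup sketch for the tensors this function expects, mirroring how the CleverHans DeepFool class wires its graph (names are illustrative):

# top_preds = tf.reshape(tf.nn.top_k(logits, k=nb_candidate)[0],
#                        [-1, nb_candidate])
# grads = tf.stack(jacobian_graph(top_preds, x, nb_candidate), axis=1)
# adv = deepfool_attack_L2(sess, x, top_preds, logits, grads, x_batch,
#                          nb_candidate=10, overshoot=0.02, max_iter=50,
#                          clip_min=0., clip_max=1.)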
        def do_jsma():
            print('Crafting ' + str(source_samples) + ' * ' +
                  str(nb_classes - 1) + ' adversarial examples')

            # Keep track of success (adversarial example classified in target)
            results = np.zeros((nb_classes, source_samples), dtype='i')

            # Rate of perturbed features for each test set example and target class
            perturbations = np.zeros((nb_classes, source_samples), dtype='f')

            # Initialize our array for grid visualization
            grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
            grid_viz_data = np.zeros(grid_shape, dtype='f')

            # Instantiate a SaliencyMapMethod attack object
            jsma = SaliencyMapMethod(model, back='tf', sess=sess)
            jsma_params = {
                'theta': 1.,
                'gamma': 0.1,
                'clip_min': 0.,
                'clip_max': 1.,
                'y_target': None
            }

            figure = None
            # Loop over the samples we want to perturb into adversarial examples
            for sample_ind in xrange(0, source_samples):
                print('--------------------------------------')
                print('Attacking input %i/%i' %
                      (sample_ind + 1, source_samples))
                sample = X_test[sample_ind:(sample_ind + 1)]

                # We want to find an adversarial example for each possible target class
                # (i.e. all classes that differ from the label given in the dataset)
                current_class = int(np.argmax(Y_test[sample_ind]))
                target_classes = other_classes(nb_classes, current_class)

                # For the grid visualization, keep original images along the diagonal
                grid_viz_data[current_class,
                              current_class, :, :, :] = np.reshape(
                                  sample, (img_rows, img_cols, channels))

                # Loop over all target classes
                for target in target_classes:
                    print('Generating adv. example for target class %i' %
                          target)

                    # This call runs the Jacobian-based saliency map approach
                    one_hot_target = np.zeros((1, nb_classes),
                                              dtype=np.float32)
                    one_hot_target[0, target] = 1
                    jsma_params['y_target'] = one_hot_target
                    adv_x = jsma.generate_np(sample, **jsma_params)

                    # Check if success was achieved
                    res = int(model_argmax(sess, x, preds, adv_x) == target)

                    # Compute the number of modified features
                    adv_x_reshape = adv_x.reshape(-1)
                    test_in_reshape = X_test[sample_ind].reshape(-1)
                    nb_changed = np.where(
                        adv_x_reshape != test_in_reshape)[0].shape[0]
                    percent_perturb = float(nb_changed) / adv_x.reshape(
                        -1).shape[0]

                    # Display the original and adversarial images side-by-side
                    if FLAGS.viz_enabled:
                        figure = pair_visual(
                            np.reshape(sample, (img_rows, img_cols)),
                            np.reshape(adv_x, (img_rows, img_cols)), figure)

                    # Add our adversarial example to our grid data
                    grid_viz_data[target, current_class, :, :, :] = np.reshape(
                        adv_x, (img_rows, img_cols, channels))

                    # Update the arrays for later analysis
                    results[target, sample_ind] = res
                    perturbations[target, sample_ind] = percent_perturb

            print('--------------------------------------')

            # Compute the number of adversarial examples that were successfully found
            nb_targets_tried = ((nb_classes - 1) * source_samples)
            succ_rate = float(np.sum(results)) / nb_targets_tried
            print('Avg. rate of successful adv. examples {0:.4f}'.format(
                succ_rate))
            report.clean_train_adv_eval = 1. - succ_rate

            # Compute the average distortion introduced by the algorithm
            percent_perturbed = np.mean(perturbations)
            print('Avg. rate of perturbed features {0:.4f}'.format(
                percent_perturbed))

            # Compute the average distortion introduced for successful samples only
            percent_perturb_succ = np.mean(perturbations * (results == 1))
            print('Avg. rate of perturbed features for successful '
                  'adversarial examples {0:.4f}'.format(percent_perturb_succ))
            if FLAGS.viz_enabled:
                import matplotlib.pyplot as plt
                plt.close(figure)
                _ = grid_visual(grid_viz_data)

            return report
  def attack(self, x_val, targets):
    """
    Perform the attack on the given instance for the given targets.
    """

    def lbfgs_objective(adv_x, self, targets, oimgs, CONST):
      # returns the function value and the gradient for fmin_l_bfgs_b
      loss, grad = self.sess.run(
          [self.loss, self.grad],
          feed_dict={
              self.x: adv_x.reshape(oimgs.shape),
              self.targeted_label: targets,
              self.ori_img: oimgs,
              self.const: CONST
          })
      return loss, grad.flatten().astype(float)

    # begin the main part for the attack
    from scipy.optimize import fmin_l_bfgs_b
    oimgs = np.clip(x_val, self.clip_min, self.clip_max)
    CONST = np.ones(self.batch_size) * self.initial_const

    # set the lower and upper bounds accordingly
    lower_bound = np.zeros(self.batch_size)
    upper_bound = np.ones(self.batch_size) * 1e10

    # set the box constraints for the optimization function
    clip_min = self.clip_min * np.ones(oimgs.shape[:])
    clip_max = self.clip_max * np.ones(oimgs.shape[:])
    clip_bound = list(zip(clip_min.flatten(), clip_max.flatten()))

    # placeholders for the best l2 and instance attack found so far
    o_bestl2 = [1e10] * self.batch_size
    o_bestattack = np.copy(oimgs)

    for outer_step in range(self.binary_search_steps):
      _logger.debug("  Binary search step %s of %s",
                    outer_step, self.binary_search_steps)

      # On the last binary search step (if we run many steps), repeat the
      # search once with CONST set to the upper bound.
      if self.repeat and outer_step == self.binary_search_steps - 1:
        CONST = upper_bound

      # optimization function
      adv_x, _, __ = fmin_l_bfgs_b(
          lbfgs_objective,
          oimgs.flatten().astype(float),
          args=(self, targets, oimgs, CONST),
          bounds=clip_bound,
          maxiter=self.max_iterations,
          iprint=0)

      adv_x = adv_x.reshape(oimgs.shape)
      assert np.amax(adv_x) <= self.clip_max and \
          np.amin(adv_x) >= self.clip_min, \
          'fmin_l_bfgs_b returns are invalid'

      # adjust the best result (i.e., the adversarial example with the
      # smallest perturbation in terms of L_2 norm) found so far
      preds = np.atleast_1d(
          utils_tf.model_argmax(self.sess, self.x, self.logits,
                                adv_x))
      _logger.debug("predicted labels are %s", preds)

      l2s = np.zeros(self.batch_size)
      for i in range(self.batch_size):
        l2s[i] = np.sum(np.square(adv_x[i] - oimgs[i]))

      for e, (l2, pred, ii) in enumerate(zip(l2s, preds, adv_x)):
        if l2 < o_bestl2[e] and pred == np.argmax(targets[e]):
          o_bestl2[e] = l2
          o_bestattack[e] = ii

      # adjust the constant as needed
      for e in range(self.batch_size):
        if preds[e] == np.argmax(targets[e]):
          # success, divide const by two
          upper_bound[e] = min(upper_bound[e], CONST[e])
          if upper_bound[e] < 1e9:
            CONST[e] = (lower_bound[e] + upper_bound[e]) / 2
        else:
          # failure, either multiply by 10 if no solution found yet
          #          or do binary search with the known upper bound
          lower_bound[e] = max(lower_bound[e], CONST[e])
          if upper_bound[e] < 1e9:
            CONST[e] = (lower_bound[e] + upper_bound[e]) / 2
          else:
            CONST[e] *= 10

      _logger.debug("  Successfully generated adversarial examples "
                    "on %s of %s instances.",
                    sum(upper_bound < 1e9), self.batch_size)
      o_bestl2 = np.array(o_bestl2)
      mean = np.mean(np.sqrt(o_bestl2[o_bestl2 < 1e9]))
      _logger.debug("   Mean successful distortion: {:.4g}".format(mean))

    # return the best solution found
    o_bestl2 = np.array(o_bestl2)
    return o_bestattack
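
A hedged call sketch; `lbfgs` stands for an instance of the enclosing attack class. Each call runs binary_search_steps rounds of L-BFGS-B, shrinking or growing the per-example constant CONST depending on whether the target class was hit:

# adv_batch = lbfgs.attack(x_val=x_batch, targets=y_target_one_hot)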
Example #12
def mnist_tutorial(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_epochs=6, batch_size=128,
                   learning_rate=0.001,
                   clean_train=True,
                   testing=False,
                   backprop_through_attack=False,
                   nb_filters=64):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: perform normal training on clean examples only
                        before performing adversarial training.
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3,
                   'clip_min': 0.,
                   'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    if clean_train:
        model = make_basic_cnn(nb_filters=nb_filters)
        preds = model.get_probs(x)

        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_test, Y_test, args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)
        model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,
                    args=train_params, rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(
                sess, x, y, preds, X_train, Y_train, args=eval_params)
            report.train_clean_train_clean_eval = acc

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)
        adv_x = fgsm.generate(x, **fgsm_params)
        print(adv_x)
        preds_adv = model.get_probs(adv_x)

        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        # Symbolically define a boolean mask of shape (batch,) that is True
        # where the adversarial example is misclassified, i.e. where the
        # attack succeeded
        if LooseVersion(tf.__version__) >= LooseVersion('1.0.0'):
            correct_preds = tf.not_equal(tf.argmax(y, axis=-1),
                                         tf.argmax(preds_adv, axis=-1))
        else:
            correct_preds = tf.not_equal(tf.argmax(y, axis=tf.rank(y) - 1),
                                         tf.argmax(preds_adv,
                                                   axis=tf.rank(preds_adv) - 1))
        success_adv_x = tf.boolean_mask(adv_x, correct_preds)
        success_clean_x = tf.boolean_mask(x, correct_preds)
        success_clean_y = tf.boolean_mask(y, correct_preds)
        fgsm_adv_x, fgsm_clean_x, fgsm_clean_y = sess.run(
            [success_adv_x, success_clean_x, success_clean_y],
            feed_dict={x: X_test, y: Y_test})
        np.savez('adversarial_fgsm', adv_examples=fgsm_adv_x,
                 adv_clean_labels=fgsm_clean_y, adv_clean_examples=fgsm_clean_x)
        print("the shape of adversarial examples we save is ", np.shape(fgsm_adv_x))
        print("the shape of clean targets we save is ", np.shape(fgsm_clean_y))
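        # the saved archive can be reloaded later in a separate script, e.g.:
        #   data = np.load('adversarial_fgsm.npz')
        #   fgsm_adv_x = data['adv_examples']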

        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on adversarial examples fgsm: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc
        adv_x_test_for_save = sess.run(adv_x, {x: X_test})
        np.savez("adv_test_fgsm_data.npz", adv_examples=adv_x_test_for_save,
                 adv_clean_labels=Y_test, adv_clean_examples=X_test)
        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess, x, y, preds_adv, X_train,
                             Y_train, args=eval_par)
            report.train_clean_train_adv_eval = acc

        print("Repeating the process, using adversarial training")
    # Redefine TF model graph
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)
    adv_x_2 = fgsm2.generate(x, **fgsm_params)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)
    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_test, Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess, x, y, preds_2_adv, X_test,
                              Y_test, args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    # Perform and evaluate adversarial training
    print("pred_adv", preds_2_adv.get_shape())
    model_train(sess, x, y, preds_2, X_train, Y_train,
                predictions_adv=preds_2_adv, evaluate=evaluate_2,
                args=train_params, rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess, x, y, preds_2, X_train, Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess, x, y, preds_2_adv, X_train,
                              Y_train, args=eval_params)
        report.train_adv_train_adv_eval = accuracy



    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    source_samples = 10000
    nb_classes = 10
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model_2, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    
    # create an array for storing adv examples
    adv_examples = np.empty([1,28,28,1])
    # for target labels
    adv_targets = np.empty([1,10])
    # corresponding clean/correct label
    adv_clean_labels = np.empty([1,10])
    # corresponding clean data
    adv_clean_examples = np.empty([1,28,28,1])
        
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind+1)] # generate from testing data

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind])) # generate from testing data
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        # grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
        #     sample, (img_rows, img_cols, channels))
        
        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            # create the one-hot target label
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)
            # print('adv_x\'shape is ', np.shape(adv_x)) # (1,28,28,1)
            
            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)
            # if succeeds
            if res == 1:
                # append new adv_x to adv_examples array; append sample too,
                # so the clean-example count matches the number of adv. examples
                adv_examples = np.append(adv_examples, adv_x, axis=0)
                adv_targets = np.append(adv_targets, one_hot_target, axis=0)
                adv_clean_labels = np.append(adv_clean_labels, np.expand_dims(Y_test[sample_ind],axis=0), axis=0) # generate from testing data
                adv_clean_examples = np.append(adv_clean_examples, sample, axis=0)

            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb
    print('--------------------------------------')
    adv_examples = adv_examples[1:,:,:,:]
    adv_targets = adv_targets[1:,:]
    adv_clean_labels = adv_clean_labels[1:,:]
    adv_clean_examples = adv_clean_examples[1:,:,:,:]
    np.savez('adversarial_jsma_actual_full', adv_examples=adv_examples,
             adv_targets=adv_targets, adv_clean_labels=adv_clean_labels,
             adv_clean_examples=adv_clean_examples)
    print(np.shape(adv_targets)[0], "adversarial examples have been saved.")
    
    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
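    # e.g., with source_samples = 10000 and nb_classes = 10 this is
    # 9 * 10000 = 90000 attack attempts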
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_test_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))


    return report
def deepfool_attack(sess, x, predictions, logits, sample, nb_candidate=10,
                    overshoot=0.02, max_iter=50, clip_min=0.0, clip_max=1.0,
                    feed=None):
    """
    TensorFlow implementation of DeepFool.
    Paper link: https://arxiv.org/pdf/1511.04599.pdf
    :param sess: TF session
    :param x: The input placeholder
    :param predictions: The model's sorted symbolic output of logits; only
                        the top nb_candidate classes are contained
    :param logits: The model's unnormalized output tensor (the input to
                   the softmax layer)
    :param sample: Numpy array with sample input
    :param nb_candidate: The number of classes to test against, i.e.,
                         DeepFool only considers nb_candidate classes when
                         attacking (which speeds up the attack)
    :param overshoot: A termination criterion to prevent vanishing updates
    :param max_iter: Maximum number of iterations for DeepFool
    :param clip_min: Minimum value for components of the example returned
    :param clip_max: Maximum value for components of the example returned
    :return: an adversarial sample
    """
    import copy

    adv_x = copy.copy(sample)
    # Initialize the loop variables
    iteration = 0
    current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
    if current.shape == ():
        current = np.array([current])
    w = np.squeeze(np.zeros(sample.shape[1:4]))  # same shape as original image
    r_tot = np.zeros(sample.shape)
    original = current  # use original label as the reference

    grads = jacobian_graph(predictions, x, nb_candidate)

    # Repeat this main loop until we have achieved misclassification
    while (np.any(current == original) and iteration < max_iter):
        feed.update({x: adv_x})
        gradients = sess.run(grads, feed_dict=feed)
        predictions_val = sess.run(predictions, feed_dict=feed)
        for idx in range(sample.shape[0]):
            pert = np.inf
            if current[idx] != original[idx]:
                continue
            for k in range(1, nb_candidate):
                w_k = gradients[k][idx, ...] - gradients[0][idx, ...]
                f_k = predictions_val[idx, k] - predictions_val[idx, 0]
                # add a tiny constant (1e-30) to avoid a zero numerator when f_k = 0
                pert_k = (abs(f_k) + 1e-30) / np.linalg.norm(w_k.flatten())
                if pert_k < pert:
                    pert = pert_k
                    w = w_k
            r_i = pert*w/np.linalg.norm(w)
            r_tot[idx, ...] = r_tot[idx, ...] + r_i

        # adv_x = np.clip(r_tot + sample, clip_min, clip_max)
        adv_x = r_tot + sample
        feed.update({x: adv_x})

        current = utils_tf.model_argmax(sess, x, logits, adv_x, feed=feed)
        if current.shape == ():
            current = np.array([current])
        # Update loop variables
        iteration = iteration + 1

    # apply the overshoot; clipping to [clip_min, clip_max] is intentionally
    # left disabled here
    # adv_x = np.clip((1+overshoot)*r_tot + sample, clip_min, clip_max)
    adv_x = (1 + overshoot) * r_tot + sample
    return adv_x
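
# A minimal usage sketch for deepfool_attack above. `sess`, `x`, `logits`
# and `X_test` are assumed to come from a trained TF1 graph like the ones in
# these examples; this helper is illustrative and not part of the original.
def run_deepfool_example(sess, x, logits, X_test, nb_candidate=10):
    # deepfool_attack expects the sorted top-nb_candidate logits
    predictions = tf.nn.top_k(logits, k=nb_candidate)[0]
    sample = X_test[0:1].astype(np.float32)
    return deepfool_attack(sess, x, predictions, logits, sample,
                           nb_candidate=nb_candidate, overshoot=0.02,
                           max_iter=50, feed={})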
Example #14
    def jsma(self, sess, x, predictions, grads, sample, target, theta, gamma,
             clip_min, clip_max, feed=None):
        """
        TensorFlow implementation of the JSMA (see https://arxiv.org/abs/1511.07528
        for details about the algorithm design choices).
        :param sess: TF session
        :param x: the input placeholder
        :param predictions: the model's symbolic output (the attack expects the
                    probabilities, i.e., the output of the softmax, but will
                    also work with logits typically)
        :param grads: symbolic gradients
        :param sample: numpy array with sample input
        :param target: target class for sample input
        :param theta: delta for each feature adjustment
        :param gamma: a float between 0 and 1 indicating the maximum distortion
            percentage
        :param clip_min: minimum value for components of the example returned
        :param clip_max: maximum value for components of the example returned
        :return: an adversarial sample
        """

        # Copy the source sample and define the maximum number of features
        # (i.e. the maximum number of iterations) that we may perturb
        adv_x = copy.copy(sample)
        # count the number of features. For MNIST, 1x28x28 = 784; for
        # CIFAR-10, 3x32x32 = 3072; etc.
        nb_features = np.prod(adv_x.shape[1:])
        # reshape sample for sake of standardization
        original_shape = adv_x.shape
        adv_x = np.reshape(adv_x, (1, nb_features))
        # compute maximum number of iterations
        max_iters = np.floor(nb_features * gamma / 2)
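        # each JSMA iteration perturbs two features, so this caps the total
        # distortion at a gamma fraction of all features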

        # Find number of classes based on grads
        nb_classes = len(grads)

        increase = bool(theta > 0)

        # Compute our initial search domain. We optimize the initial search domain
        # by removing all features that are already at their maximum values (if
        # increasing input features---otherwise, at their minimum value).
        if increase:
            search_domain = set([i for i in range(nb_features)
                                if adv_x[0, i] < clip_max])
        else:
            search_domain = set([i for i in range(nb_features)
                                if adv_x[0, i] > clip_min])

        # Initialize the loop variables
        iteration = 0
        adv_x_original_shape = np.reshape(adv_x, original_shape)
        current = utils_tf.model_argmax(sess, x, predictions, adv_x_original_shape,
                                        feed=feed)

        # charlee: Used to log when the model gets confused
        orig_label = current
        confused_at = 0
        success_at = 0

        # consecutive-hit counters; they reset on any miss, and a single hit
        # is enough to mark the model as confused / the attack as successful
        confuse_count = 0
        success_count = 0

        logger.debug("Starting JSMA attack up to {} iterations".format(max_iters))
        # Repeat this main loop until we have achieved misclassification
        while (success_at == 0 and iteration < max_iters and
               len(search_domain) > 1):
            # Reshape the adversarial example
            adv_x_original_shape = np.reshape(adv_x, original_shape)

            # Compute the Jacobian components
            grads_target, grads_others = jacobian(sess, x, grads, target,
                                                adv_x_original_shape,
                                                nb_features, nb_classes,
                                                feed=feed)

            if iteration % ((max_iters + 1) // 5) == 0 and iteration > 0:
                logger.debug("Iteration {} of {}".format(iteration,
                                                        int(max_iters)))
            # Compute the saliency map for each of our target classes
            # and return the two best candidate features for perturbation
            i, j, search_domain = saliency_map(
                grads_target, grads_others, search_domain, increase)

            # Apply the perturbation to the two input features selected previously
            adv_x = apply_perturbations(
                i, j, adv_x, increase, theta, clip_min, clip_max)

            # Update our current prediction by querying the model
            current = utils_tf.model_argmax(sess, x, predictions,
                                            adv_x_original_shape, feed=feed)

            # Update loop variables
            iteration = iteration + 1

            # charlee: Record the iteration at which the model first gets confused
            if current != orig_label and confused_at == 0:
                confuse_count += 1
                if confuse_count >= 1:
                    confused_at = iteration
            else:
                confuse_count = 0

            if current == target:
                success_count += 1
                if success_count >= 1:
                    logger.info("Attack succeeded using {} iterations".format(iteration))
                    success_at = iteration
            else:
                success_count = 0

        if success_at == 0:
            logger.info(("Failed to find adversarial example " +
                        "after {} iterations").format(iteration))

        # Convert iteration counts to ratios of perturbed features
        # (each iteration perturbs two features)
        percent_perturbed = float(iteration * 2) / nb_features
        confused_at = float(confused_at * 2) / nb_features
        success_at = float(success_at * 2) / nb_features

        # Report success when the adversarial example is misclassified in the
        # target class
        return (np.reshape(adv_x, original_shape), percent_perturbed,
                confused_at, success_at, orig_label, current)
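
# For reference, a brute-force numpy sketch of the pixel-pair selection that
# the saliency_map helper performs in the increase case; the real helper is
# vectorized, so this O(n^2) loop is illustrative only. grads_target and
# grads_others are assumed to be flat arrays of length nb_features:
def saliency_pair_sketch(grads_target, grads_others, search_domain):
    best_score = -np.inf
    best_pair = (None, None)
    domain = sorted(search_domain)
    for a_idx in range(len(domain)):
        for b_idx in range(a_idx + 1, len(domain)):
            p, q = domain[a_idx], domain[b_idx]
            alpha = grads_target[p] + grads_target[q]
            beta = grads_others[p] + grads_others[q]
            # a useful pair raises the target class while suppressing the rest
            if alpha > 0 and beta < 0 and -alpha * beta > best_score:
                best_score = -alpha * beta
                best_pair = (p, q)
    return best_pair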
def mnist_tutorial_jsma(train_start=0,
                        train_end=60000,
                        test_start=0,
                        test_end=10000,
                        viz_enabled=True,
                        nb_epochs=6,
                        batch_size=128,
                        nb_classes=10,
                        source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(4254264)

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    # X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
    #                                               train_end=train_end,
    #                                               test_start=test_start,
    #                                               test_end=test_end)

    # Get notMNIST data
    # with np.load("notmnist.npz") as data:
    #     X_train, Y_train, X_test, Y_test = data['examples_train'], data['labels_train'], data['examples_test'], data['labels_test']

    # Load MNIST data from a local .npz archive
    with np.load("mnist.npz") as data:
        X_train, Y_train, X_test, Y_test = (data['X_train'], data['Y_train'],
                                            data['X_test'], data['Y_test'])
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    # Define TF model graph
    model_path = "./"
    model_name = "clean_trained_mnist_model"
    model = make_basic_cnn(nb_classes=nb_classes)
    if tf_model_load(sess, file_path=os.path.join(model_path, model_name)):
        print(model_name, " reloaded.")
    preds = model.get_probs(x)
    # print('shape is', preds.get_shape())

    # clean_train = True
    # if clean_train:
    #     train_params = {
    #         'nb_epochs': nb_epochs,
    #         'batch_size': batch_size,
    #         'learning_rate': learning_rate
    #     }
    #     model_path = "./"
    #     model_name = "clean_trained__model_notmnist"
    #     rng = np.random.RandomState([1989, 12, 13])
    #     model = make_basic_cnn()
    #     preds = model.get_probs(x)
    #
    #     def evaluate():
    #         # Evaluate the accuracy of the MNIST model on legitimate test
    #         # examples
    #         eval_params = {'batch_size': batch_size}
    #         acc = model_eval(
    #             sess, x, y, preds, X_test, Y_test, args=eval_params)
    #         report.clean_train_clean_eval = acc
    #         assert X_test.shape[0] == test_end - test_start, X_test.shape
    #         print('Test accuracy on legitimate examples: %0.4f' % acc)
    #     model_train(sess, x, y, preds, X_train, Y_train, evaluate=evaluate,args=train_params, rng=rng)
    #
    #     save_path = os.path.join(model_path, model_name)
    #     saver = tf.train.Saver()
    #     saver.save(sess, save_path)
    #     _logger.info("Completed model training and saved at: " + str(save_path))
    # print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    # train_params = {
    #     'nb_epochs': nb_epochs,
    #     'batch_size': batch_size,
    #     'learning_rate': learning_rate,
    #     'train_dir': model_path,
    #     'filename': model_name
    # }
    # sess.run(tf.global_variables_initializer())
    # rng = np.random.RandomState([2017, 8, 30])
    # model_train(sess, x, y, preds, X_train, Y_train, save=True, args=train_params,
    #             rng=rng)
    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    # report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')
    # Keep track of misclassification (adversarial example classified as
    # any class other than the correct one)
    results2 = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    # grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    # grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {
        'theta': 1,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }

    figure = None
    rng = np.random.RandomState([1358, 23, 234])
    index_shuf = list(range(len(X_test)))
    rng.shuffle(index_shuf)
    X_test = X_test[index_shuf]
    Y_test = Y_test[index_shuf]

    # create a dictionary to keep track of the occurrence of each class
    # create a 2D array to keep track of successful attacks
    occurence = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0}
    # 10:0, 11:0, 12:0, 13:0, 14:0, 15:0, 16:0, 17:0, 18:0, 19:0}
    rate_table = np.zeros((nb_classes, nb_classes), dtype='f')

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # add one to current class occurence
        occurence[current_class] += 1

        # For the grid visualization, keep original images along the diagonal
        # grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
        #     sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)
            # misclassified as any class other than the correct one
            res2 = int(model_argmax(sess, x, preds, adv_x) != current_class)
            # if success, add one to successful rate table
            if res == 1:
                rate_table[current_class, target] += 1.

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            # if viz_enabled:
            #     figure = pair_visual(
            #         np.reshape(sample, (img_rows, img_cols)),
            #         np.reshape(adv_x, (img_rows, img_cols)), figure)

            # Add our adversarial example to our grid data
            # grid_viz_data[target, current_class, :, :, :] = np.reshape(
            #     adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            results2[target, sample_ind] = res2
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Close TF session
    sess.close()

    # Compute success rate of each letter attacking each target
    for cur in range(nb_classes):
        if occurence[cur] != 0:
            rate_table[cur, :] /= float(occurence[cur])
    print("The table of rate of successful attacking is shown below")
    print(rate_table)
    print("the number of occurrence of each class is ", occurence)

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    # misclassify
    succ_rate2 = float(np.sum(results2)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    print(
        'Avg. rate of misclassified adv. examples {0:.4f}'.format(succ_rate2))
    # report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Finally, block & display a grid of all the adversarial examples
    # if viz_enabled:
    #     import matplotlib.pyplot as plt
    #     plt.close(figure)
    #     _ = grid_visual(grid_viz_data)

    return report
Example #16
def mnist_tutorial_jsma(
    train_start=0,
    train_end=60000,
    test_start=0,
    test_end=10000,
    viz_enabled=VIZ_ENABLED,
    nb_epochs=NB_EPOCHS,
    batch_size=BATCH_SIZE,
    source_samples=SOURCE_SAMPLES,
    learning_rate=LEARNING_RATE,
):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(
        train_start=train_start,
        train_end=train_end,
        test_start=test_start,
        test_end=test_end,
    )
    x_train, y_train = mnist.get_set("train")
    x_test, y_test = mnist.get_set("test")

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN("model1", nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        "nb_epochs": nb_epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {"batch_size": batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print("Test accuracy on legitimate test examples: {0}".format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print("Crafting " + str(source_samples) + " * " + str(nb_classes - 1) +
          " adversarial examples")

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype="i")

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype="f")

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype="f")

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        "theta": 1.0,
        "gamma": 0.1,
        "clip_min": 0.0,
        "clip_max": 1.0,
        "y_target": None,
    }

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print("--------------------------------------")
        print("Attacking input %i/%i" % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print("Generating adv. example for target class %i" % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params["y_target"] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)),
                    figure,
                )

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print("--------------------------------------")

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = (nb_classes - 1) * source_samples
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print("Avg. rate of successful adv. examples {0:.4f}".format(succ_rate))
    report.clean_train_adv_eval = 1.0 - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations[np.where(perturbations != 0)])
    print("Avg. rate of perturbed features {0:.4f}".format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(
        perturbations[np.where(perturbations != 0)] *
        (results[np.where(perturbations != 0)] == 1))
    print("Avg. rate of perturbed features for successful "
          "adversarial examples {0:.4f}".format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt

        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
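
# Hypothetical quick run of the tutorial above (VIZ_ENABLED, NB_EPOCHS, etc.
# are defaults defined elsewhere in the original tutorial file):
#   report = mnist_tutorial_jsma(viz_enabled=False, nb_epochs=1,
#                                source_samples=2)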
Example #17
def minist_fgsm_saliency(
    train_start=0,
    train_end=10,
    test_start=0,
    test_end=5,
    nb_epochs=2,
    batch_size=128,
    learning_rate=0.001,
    clean_train=True,
    testing=False,
    backprop_through_attack=False,
    nb_filters=64,
    nb_classes=10,
    source_samples=10,
):
    """
    MNIST cleverhans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param clean_train: if true, perform normal training on clean examples
                        only before performing adversarial training
    :param testing: if true, complete an AccuracyReport for unit tests
                    to verify that performance is adequate
    :param backprop_through_attack: If True, backprop through adversarial
                                    example construction process during
                                    adversarial training.
    :return: an AccuracyReport object
    """

    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Create TF session
    sess = tf.Session()

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    # label smoothing: each of the nine zeros becomes 0.1 / 9 and the
    # one-hot 1 becomes 0.9
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # placeholder for y_target --> for saliency tensor
    y_target = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = "models/mnist"
    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    fgsm_params = {'eps': 0.3, 'clip_min': 0., 'clip_max': 1.}
    rng = np.random.RandomState([2017, 8, 30])

    ###########################################################################
    # Training the CNN model using TensorFlow: model --> base model
    ###########################################################################
    model = make_basic_cnn(nb_filters=nb_filters)
    preds = model.get_probs(x)

    if clean_train:
        # the base CNN model and its predictions were already created above
        def evaluate():
            # Evaluate the accuracy of the MNIST model on legitimate test
            # examples
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_test,
                             Y_test,
                             args=eval_params)
            report.clean_train_clean_eval = acc
            assert X_test.shape[0] == test_end - test_start, X_test.shape
            print('Test accuracy on legitimate examples: %0.4f' % acc)

        ###########################################################################
        # Train the base model
        ###########################################################################
        # training the basic model, using train_params
        model_train(sess,
                    x,
                    y,
                    preds,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    rng=rng)

        # Calculate training error
        if testing:
            eval_params = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds,
                             X_train,
                             Y_train,
                             args=eval_params)
            report.train_clean_train_clean_eval = acc

        ###########################################################################
        # Generate FGSM Adversarial based on model, and
        # Compute Base Model Accuracy
        ###########################################################################

        # Initialize the Fast Gradient Sign Method (FGSM) attack object and
        # graph
        fgsm = FastGradientMethod(model, sess=sess)

        # todo: follow the paper and run Cleverhans Output?
        fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}
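        # passing the true labels via 'y' makes FGSM use ground truth rather
        # than the model's own predictions; the CleverHans default (no 'y')
        # uses the model's predictions to avoid the label-leaking effect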

        #adv_x = fgsm.generate(x, **fgsm_params)
        adv_x = fgsm.generate(x, **fgsm_params_y)
        preds_adv = model.get_probs(adv_x)
        # Evaluate the accuracy of the MNIST model on adversarial examples
        eval_par = {'batch_size': batch_size}
        acc = model_eval(sess, x, y, preds_adv, X_test, Y_test, args=eval_par)
        print('Test accuracy on FGSM adversarial examples: %0.4f\n' % acc)
        report.clean_train_adv_eval = acc

        # Calculate training error
        if testing:
            eval_par = {'batch_size': batch_size}
            acc = model_eval(sess,
                             x,
                             y,
                             preds_adv,
                             X_train,
                             Y_train,
                             args=eval_par)
            report.train_clean_train_adv_eval = acc

        ###########################################################################
        # Generate Saliency Map Adversarial Example and
        # Compute base model accuracy (only 10)
        ###########################################################################
        print("Saliency Map Attack On The Base Model")
        print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
              ' adversarial examples')

        # Instantiate a SaliencyMapMethod attack object --> modify y_target for each test_data again
        jsma = SaliencyMapMethod(model, back='tf', sess=sess)
        jsma_params = {
            'theta': 1.,
            'gamma': 0.1,
            'clip_min': 0.,
            'clip_max': 1.,
            'y_target': None
        }

        # Keep track of success (adversarial example classified in target)
        # Need this info to compute the success rate
        results = np.zeros((nb_classes, source_samples), dtype='i')

        # each sample will get nb_classes - 1 adversarial examples

        # adv_x_set: accumulator for all the adversarial x variations
        # adv_y_target: the corresponding one-hot target labels

        adv_x_set = None
        adv_y_target = None

        # we also accumulate the clean samples and their true labels
        x_train_saliency = None
        y_train_saliency = None

        for sample_ind in xrange(0, source_samples):
            print('--------------------------------------')
            print('Saliency Attacking input %i/%i' %
                  (sample_ind + 1, source_samples))
            sample = X_train[sample_ind:(sample_ind + 1)]
            y_sample = Y_train[sample_ind:(sample_ind + 1)]

            current_class = int(np.argmax(Y_train[sample_ind]))
            target_classes = other_classes(nb_classes, current_class)

            # Loop over all target classes
            for target in target_classes:
                print('Generating adv. example for target class %i' % target)

                # Create x_train_saliency, corresponding to y_train_saliency
                if x_train_saliency is not None:
                    x_train_saliency = np.concatenate(
                        (x_train_saliency, sample), axis=0)
                    y_train_saliency = np.concatenate(
                        (y_train_saliency, y_sample), axis=0)
                else:
                    x_train_saliency = sample
                    y_train_saliency = y_sample
                    print("sample shape: ", x_train_saliency.shape)
                    print("y_sample shape: ", y_train_saliency.shape)

                # This call runs the Jacobian-based saliency map approach
                one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
                one_hot_target[0, target] = 1
                jsma_params['y_target'] = one_hot_target

                adv_x_np = jsma.generate_np(sample, **jsma_params)

                # Add to adv_x_set / adv_y_target; append both at the end so
                # the examples stay aligned with their target labels
                if adv_x_set is not None:
                    adv_y_target = np.concatenate(
                        (adv_y_target, one_hot_target), axis=0)
                    adv_x_set = np.concatenate((adv_x_set, adv_x_np), axis=0)
                else:
                    adv_y_target = one_hot_target
                    adv_x_set = adv_x_np
                    print("adv_y_target shape(one-hot-encoding): ",
                          adv_y_target.shape)
                    print("adv_x_set(np) shape: ", adv_x_np.shape)

                # Check if success was achieved
                res = int(model_argmax(sess, x, preds, adv_x_np) == target)

                # Update the arrays for later analysis
                results[target, sample_ind] = res

        print('--------------------------------------')
        # Compute the number of adversarial examples that were successfully found
        nb_targets_tried = ((nb_classes - 1) * source_samples)
        succ_rate = float(np.sum(results)) / nb_targets_tried
        print('Avg. rate of successful Saliency adv. examples {0:.4f}'.format(
            succ_rate))
        report.clean_train_adv_eval = 1. - succ_rate

        # here we have successfully stacked up x_adversarial_set, y_correct_set
        # these can be used to provide training to our model now
        print("\n\n\n*****************************")
        print("Checking x_adv_set shape: ", adv_x_set.shape)
        print("Checking correct_y_set shape: ", adv_y_target.shape)

        print("x_training_saliency shape:", x_train_saliency.shape)
        print("y_training_saliency shape:", y_train_saliency.shape)

        # now construct model_3 and define its symbolic output
        model_3 = make_basic_cnn(nb_filters=nb_filters)
        # preds_3 maps the placeholder input x to model_3's output
        preds_3 = model_3(x)

        # jsma3 = SaliencyMapMethod(model_3, sess=sess)
        #
        # jsma_params = {'theta': 1., 'gamma': 0.1,
        #                'clip_min': 0., 'clip_max': 1.,
        #                'y_target': y_target}
        #
        # # create adv_saliency set tensor, using x_train data and jsma_params containing adv_y_target
        # adv_jsma = jsma3.generate(x, jsma_params)
        # # create adv preds tensor
        # preds_jsma_adv = model_3(adv_jsma)

        # define saliency training model accuracy
        def evaluate_saliency():
            # Accuracy of adversarially trained model on legitimate test inputs
            eval_params = {'batch_size': batch_size}
            accuracy = model_eval(sess,
                                  x,
                                  y,
                                  preds_3,
                                  x_train_saliency,
                                  y_train_saliency,
                                  args=eval_params)
            print('Test accuracy on legitimate examples: %0.4f' % accuracy)
            report.adv_train_clean_eval = accuracy

        ###########################################################################
        # MODEL Train for Saliency Map
        ###########################################################################
        # Train model_3 on the clean samples collected during the saliency
        # attack (note: pass the preds_3 tensor and the evaluate callback
        # itself, not its return value)
        model_train(sess,
                    x,
                    y,
                    preds_3,
                    x_train_saliency,
                    y_train_saliency,
                    evaluate=evaluate_saliency,
                    args=train_params,
                    rng=rng)

        # TODO: use JSMA to craft adversarial test or training data?

    # Redefine the TF model graph for FGSM adversarial training
    model_2 = make_basic_cnn(nb_filters=nb_filters)
    preds_2 = model_2(x)
    fgsm2 = FastGradientMethod(model_2, sess=sess)

    # parameter for FGSM
    fgsm_params_y = {'eps': 0.3, 'y': y, 'clip_min': 0., 'clip_max': 1.}
    adv_x_2 = fgsm2.generate(x, **fgsm_params_y)
    if not backprop_through_attack:
        # For the fgsm attack used in this tutorial, the attack has zero
        # gradient so enabling this flag does not change the gradient.
        # For some other attacks, enabling this flag increases the cost of
        # training, but gives the defender the ability to anticipate how
        # the attacker will change their strategy in response to updates to
        # the defender's parameters.
        adv_x_2 = tf.stop_gradient(adv_x_2)
    preds_2_adv = model_2(adv_x_2)

    def evaluate_2():
        # Accuracy of adversarially trained model on legitimate test inputs
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on legitimate examples: %0.4f' % accuracy)
        report.adv_train_clean_eval = accuracy

        # Accuracy of the adversarially trained model on adversarial examples
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_test,
                              Y_test,
                              args=eval_params)
        print('Test accuracy on adversarial examples: %0.4f' % accuracy)
        report.adv_train_adv_eval = accuracy

    ###########################################################################
    # MODEL Train for FGSM
    ###########################################################################
    # Perform and evaluate adversarial training with the FGSM model
    model_train(sess,
                x,
                y,
                preds_2,
                X_train,
                Y_train,
                predictions_adv=preds_2_adv,
                evaluate=evaluate_2,
                args=train_params,
                rng=rng)

    # Calculate training errors
    if testing:
        eval_params = {'batch_size': batch_size}
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_clean_eval = accuracy
        accuracy = model_eval(sess,
                              x,
                              y,
                              preds_2_adv,
                              X_train,
                              Y_train,
                              args=eval_params)
        report.train_adv_train_adv_eval = accuracy

    return report
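
# Hypothetical quick smoke-test invocation (parameter values chosen for
# speed, not part of the original):
#   report = minist_fgsm_saliency(nb_epochs=1, source_samples=2)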
def mnist_tutorial(train_start=0,
                   train_end=60000,
                   test_start=0,
                   test_end=10000,
                   nb_epochs=6,
                   batch_size=128,
                   learning_rate=0.001,
                   train_dir="/tmp",
                   filename="mnist.ckpt",
                   load_model=False,
                   testing=False):
    """
    MNIST CleverHans tutorial
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param learning_rate: learning rate for training
    :param train_dir: Directory storing the saved model
    :param filename: Filename to save model under
    :param load_model: True for load, False for not load
    :param testing: if true, test error is calculated
    :return: an AccuracyReport object
    """
    keras.layers.core.K.set_learning_phase(0)
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()
    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)
    if not hasattr(backend, "tf"):
        raise RuntimeError("This tutorial requires keras to be configured"
                           " to use the TensorFlow backend.")
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' to "
              "'th', temporarily setting to 'tf'")
    # Create TF session and set as Keras backend session


#     gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)
#     config = tf.ConfigProto(gpu_options=gpu_options)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    keras.backend.set_session(sess)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Use label smoothing
    assert Y_train.shape[1] == 10
    label_smooth = .1
    Y_train = Y_train.clip(label_smooth / 9., 1. - label_smooth)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    predictions = model(x)
    print("Defined TensorFlow model graph.")

    def evaluate():
        # Evaluate the accuracy of the MNIST model on legitimate test examples
        eval_params = {'batch_size': batch_size}
        acc = model_eval(sess,
                         x,
                         y,
                         predictions,
                         X_test,
                         Y_test,
                         args=eval_params)
        report.clean_train_clean_eval = acc
        assert X_test.shape[0] == test_end - test_start, X_test.shape
        print('Test accuracy on legitimate examples: %0.4f' % acc)

    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'train_dir': train_dir,
        'filename': filename
    }
    # Train an MNIST model
    ckpt = tf.train.get_checkpoint_state(train_dir)
    ckpt_path = False if ckpt is None else ckpt.model_checkpoint_path

    rng = np.random.RandomState([2017, 8, 30])
    if load_model and ckpt_path:
        saver = tf.train.Saver()
        saver.restore(sess, ckpt_path)
        print("Model loaded from: {}".format(ckpt_path))
    else:
        print("Model was not loaded, training from scratch.")
        model_train(sess,
                    x,
                    y,
                    predictions,
                    X_train,
                    Y_train,
                    evaluate=evaluate,
                    args=train_params,
                    save=True,
                    rng=rng)

    # Initialize the Fast Gradient Sign Method (FGSM) attack object and graph
    wrap = KerasModelWrapper(model)
    advGenTimeStart = time.time()
    fgsm = FastGradientMethod(wrap, sess=sess)
    fgsm_params = {'eps': 0.2, 'clip_min': 0., 'clip_max': 1.}
    adv_x = fgsm.generate(x, **fgsm_params)
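    # eps = 0.2 is the L-inf perturbation budget on [0, 1] pixels. Note that
    # fgsm.generate only builds a symbolic graph; the sess.run below is what
    # actually computes the adversarial images for the first 4500 test inputs.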
    adv_x = sess.run(adv_x, feed_dict={x: X_test[:4500]})
    advGenTimeEnd = time.time()
    advGenTime = advGenTimeEnd - advGenTimeStart

    for i in xrange(4500):
        normalization(adv_x[i:(i + 1)])
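    # normalization is defined elsewhere in this file; presumably it rescales
    # each adversarial image in place (its return value is discarded above).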

    print('adversarial examples generation time = ', advGenTime, 'seconds')
    crosses = [
        np.array([[0, 1, 0], [1, 1, 1], [0, 1, 0]]),
        np.array([[0, 0, 1, 0, 0], [0, 0, 1, 0, 0], [1, 1, 1, 1, 1],
                  [0, 0, 1, 0, 0], [0, 0, 1, 0, 0]]),
        np.array([[0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0],
                  [0, 0, 0, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1],
                  [0, 0, 0, 1, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0],
                  [0, 0, 0, 1, 0, 0, 0]]),
        np.array([
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [1, 1, 1, 1, 1, 1, 1, 1, 1],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
            [0, 0, 0, 0, 1, 0, 0, 0, 0],
        ])
    ]
    coefficient = [5, 9, 13, 17]
    # Cross filter test; kernel sizes 3, 5, 7, 9 (coefficient = the number of ones in each kernel)
    kernelIndex = -1
    for kernelSize in xrange(3, 10, 2):
        startTime = time.time()
        original_classified_wrong_number = 0
        disturbed_failure_number = 0
        test_number = 0
        TTP = 0  # detected adversarial examples whose filtered label recovers the true class
        TP = 0  # adversarial examples whose label changes after filtering (detected)
        FN = 0  # adversarial examples whose label is unchanged by filtering (missed)
        FP = 0  # clean examples whose label changes after filtering (false alarm)

        start = (kernelSize - 1) // 2
        end = 28 - start
        kernelIndex += 1
        print('cross filter')
        print(crosses[kernelIndex])
        for i in range(4500):
            current_class = int(np.argmax(Y_test[i]))

            currentXLabel = model_argmax(sess, x, predictions,
                                         X_test[i:(i + 1)])
            if currentXLabel != current_class:
                original_classified_wrong_number += 1
                continue

            currentAdvXLabel = model_argmax(sess, x, predictions,
                                            adv_x[i:(i + 1)])
            if currentAdvXLabel == currentXLabel:
                disturbed_failure_number += 1
                continue

            test_number += 1

            currentX = np.reshape(X_test[i:(i + 1)], (28, 28))
            currentX = diamondAndCrossFilterOperations(
                currentX, crosses[kernelIndex], start, end,
                coefficient[kernelIndex])
            currentX = np.reshape(currentX, X_test[i:(i + 1)].shape)
            currentXFilteredLabel = model_argmax(sess, x, predictions,
                                                 currentX)

            currentAdvX = np.reshape(adv_x[i:(i + 1)], (28, 28))
            currentAdvX = diamondAndCrossFilterOperations(
                currentAdvX, crosses[kernelIndex], start, end,
                coefficient[kernelIndex])
            currentAdvX = np.reshape(currentAdvX, X_test[i:(i + 1)].shape)
            currentAdvXFilteredLabel = model_argmax(sess, x, predictions,
                                                    currentAdvX)

            if currentAdvXFilteredLabel != currentAdvXLabel:
                TP += 1
                if currentAdvXFilteredLabel == current_class:
                    TTP += 1
            else:
                FN += 1
            if currentXFilteredLabel != currentXLabel:
                FP += 1

            if (i + 1) % 1000 == 0:
                str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
                    test_number, original_classified_wrong_number,
                    disturbed_failure_number, TP, FN, FP, TTP)
                print(str1)

        str1 = '%d-%d-%d: TP = %d; FN = %d; FP = %d; TTP = %d' % (
            test_number, original_classified_wrong_number,
            disturbed_failure_number, TP, FN, FP, TTP)
        print(str1)

        endTime = time.time()
        print('lasting ', endTime - startTime, 'seconds')
        Recall = TP / float(TP + FN)
        Precision = TP / float(TP + FP)
        tempStarStr = '********************************************************'
        recallStr = 'Recall = %.4f' % (Recall)
        precisionStr = 'Precision = %.4f' % (Precision)
        print(tempStarStr)
        print(recallStr)
        print(precisionStr)
        print(tempStarStr)

    return report
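
# The helper diamondAndCrossFilterOperations used above is not shown in this
# listing. A minimal sketch with the same call signature, assuming it replaces
# each interior pixel by the mean of the pixels selected by the binary cross
# kernel (coefficient = number of ones in that kernel):
import numpy as np

def cross_mean_filter(image, kernel, start, end, coefficient):
    out = image.copy()
    k = (kernel.shape[0] - 1) // 2
    for r in range(start, end):
        for c in range(start, end):
            # Average the pixels under the cross-shaped kernel
            patch = image[r - k:r + k + 1, c - k:c + k + 1]
            out[r, c] = np.sum(patch * kernel) / float(coefficient)
    return out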
Example #19
0
	# Only target the normal class
	for target in [0]:
		if current_class == 0:
			break

		print('Generating adv. example for target class {} for sample {}'.format(target, sample_ind), end='\r')

		# Run the Jacobian-based saliency map approach
		one_hot_target = np.zeros((1, FLAGS.nb_classes), dtype=np.float32)
		one_hot_target[0, target] = 1
		jsma_params['y_target'] = one_hot_target
		adv_x = jsma.generate_np(sample, **jsma_params)

		# Check if success was achieved
		res = int(model_argmax(sess, x, predictions, adv_x) == target)

		# Compute number of modified features
		adv_x_reshape = adv_x.reshape(-1)
		test_in_reshape = X_test_scaled[sample_ind].reshape(-1)
		nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
		percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

		X_adv[sample_ind] = adv_x
		results[target, sample_ind] = res
		perturbations[target, sample_ind] = percent_perturb

print()
print(X_adv.shape)

print("=========================== Evaluation of MLP Performance ==============================")
Example #20
0
def mnist_tutorial_jsma(train_start=0,
                        train_end=60000,
                        test_start=0,
                        test_end=10000,
                        viz_enabled=VIZ_ENABLED,
                        nb_epochs=NB_EPOCHS,
                        batch_size=BATCH_SIZE,
                        source_samples=SOURCE_SAMPLES,
                        learning_rate=LEARNING_RATE):
    """
  MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param viz_enabled: (boolean) activate plots of adversarial examples
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param nb_classes: number of output classes
  :param source_samples: number of test inputs to attack
  :param learning_rate: learning rate for training
  :return: an AccuracyReport object
  """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session.
    # num_threads can be set to limit intra-op parallelism; with None this is
    # equivalent to a plain tf.Session().
    num_threads = None
    if num_threads:
        config_args = dict(intra_op_parallelism_threads=1)
    else:
        config_args = {}
    sess = tf.Session(config=tf.ConfigProto(**config_args))
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    mnist = MNIST(train_start=train_start,
                  train_end=train_end,
                  test_start=test_start,
                  test_end=test_end)
    x_train, y_train = mnist.get_set('train')
    x_test, y_test = mnist.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]
    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64

    # Define TF model graph
    model = make_basic_picklable_cnn()

    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.reshape(x_train, [60000, 28, 28]), y_train))
    dataset = dataset.batch(32)
    val_dataset = tf.data.Dataset.from_tensor_slices(
        (tf.reshape(x_test, [10000, 28, 28]), y_test))
    val_dataset = val_dataset.batch(32)
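    # Note: these tf.data pipelines are constructed but never consumed; the
    # train() call below is fed the raw numpy arrays directly.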

    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    if TRAIN_NEW == 1:
        with sess.as_default():
            train(sess, loss, x_train, y_train, args=train_params, rng=rng)
            save("test.joblib", model)
    else:
        with sess.as_default():
            model = load("test.joblib")  # load the previously saved model
        assert len(model.get_params()) > 0
        preds = model.get_logits(x)
        loss = CrossEntropy(model, smoothing=0.1)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }
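    # theta: amount each selected feature is perturbed per JSMA iteration;
    # gamma: maximum fraction of input features allowed to be modified.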

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    seed(SEED)
    for sample_ind in xrange(0, source_samples):
        img = randint(0, 10000)
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        # Attack a randomly drawn test image instead of x_test[sample_ind]
        sample = x_test[img:(img + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[img]))  # label of the random sample
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))
        tn = 0
        totc = 0
        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[img].reshape(-1)  # compare against the randomly drawn sample that was attacked
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]
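            # percent_perturb: fraction of input pixels the attack modified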
            diff = np.array(adv_x - sample)
            #print(np.sum(diff))
            diff = np.reshape(diff, (28, 28))
            diff = diff * 255
            cv2.imwrite("test.png", diff)
            diff = cv2.imread("test.png")
            diff = cv2.cvtColor(diff, cv2.COLOR_BGR2GRAY)
            neighbors = 0
            tc = 0
            # Count modified pixels (tc) and, for each, how many of its valid
            # 8-connected neighbors were also modified. A bounds check folds
            # together the interior, edge, and corner cases, including the
            # diagonal neighbors at the corners.
            for i in range(28):
                for j in range(28):
                    if diff[i, j] > 0:
                        tc = tc + 1
                        totc = totc + 1
                        for di in (-1, 0, 1):
                            for dj in (-1, 0, 1):
                                if di == 0 and dj == 0:
                                    continue
                                ni, nj = i + di, j + dj
                                if 0 <= ni < 28 and 0 <= nj < 28 and diff[ni, nj] > 0:
                                    neighbors = neighbors + 1

            # print(tc)
            # print(neighbors)
            tn = tn + neighbors
            # if tc > 0:
            #     print(neighbors / tc)
            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)
            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb
            #print(perturbations[target, sample_ind])

    print('--------------------------------------')

    print("average neighbors per modified pixel ", tn / totc)
    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.8f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)

    s = perturbations.shape
    myPert = np.empty(0)
    myResults = np.empty(0)
    for i in range(s[0]):
        for j in range(s[1]):
            if perturbations[i][j] > 0:
                myPert = np.append(myPert, perturbations[i][j])
                myResults = np.append(myResults, results[i][j])
    min_perturbed = np.min(myPert)
    max_perturbed = np.max(myPert)

    s2 = myResults.shape
    final = np.empty(0)
    for i in range(s2[0]):
        if myResults[i] > 0:
            final = np.append(final, myPert[i])
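    # Equivalent vectorized form of the two collection loops above:
    #   mask = perturbations > 0
    #   myPert, myResults = perturbations[mask], results[mask]
    #   final = myPert[myResults > 0]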

    print('Avg. rate of perturbed features {0:.8f}'.format(percent_perturbed))
    print('MIN of perturbed features {0:.8f}'.format(min_perturbed))
    print('MAX of perturbed features {0:.8f}'.format(max_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    min_perturb_succ = np.min(final)
    max_perturb_succ = np.max(final)
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(percent_perturb_succ))
    print('Min of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(min_perturb_succ))
    print('Max of perturbed features for successful '
          'adversarial examples {0:.8f}'.format(max_perturb_succ))

    #Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
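
# The per-pixel neighbor counting above can also be done in one shot with a
# 3x3 convolution; a minimal sketch, assuming SciPy is available:
import numpy as np
from scipy.ndimage import convolve

def count_modified_neighbors(diff):
    # Binary mask of modified pixels
    mask = (diff > 0).astype(np.int32)
    kernel = np.ones((3, 3), dtype=np.int32)
    kernel[1, 1] = 0  # exclude the center pixel itself
    # For every pixel, the number of modified 8-neighbors (zero-padded edges)
    counts = convolve(mask, kernel, mode='constant', cval=0)
    # Return (total neighbor count over modified pixels, number of modified pixels)
    return int((counts * mask).sum()), int(mask.sum())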
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    x_train, y_train, x_test, y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols,
                                          nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelBasicCNN('model1', nb_classes, nb_filters)
    preds = model.get_logits(x)
    loss = LossCrossEntropy(model, smoothing=0.1)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x, y, x_train, y_train, args=train_params,
          rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, nchannels)),
                    np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
Example #22
0
def main(argv=None):
    """
    CIFAR10 CleverHans tutorial
    :return:
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # CIFAR10-specific dimensions
    img_rows = 32
    img_cols = 32
    channels = 3
    nb_classes = 10

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    sess = tf.Session()

    set_log_level(logging.DEBUG)

    # Get CIFAR10 test data
    X_train, Y_train, X_test, Y_test = data_cifar10()

    # Sanity-check the number of classes (label smoothing is not applied here)
    assert Y_train.shape[1] == 10

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, channels))

    y = tf.placeholder(tf.float32, shape=(None, 10))

    model_path = FLAGS.model_path
    nb_samples = FLAGS.nb_samples

    from cnn_models import make_basic_cnn
    model = make_basic_cnn('fp_',
                           input_shape=(None, img_rows, img_cols, channels),
                           nb_filters=FLAGS.nb_filters)

    preds = model(x)
    print("Defined TensorFlow model graph with %d parameters" % model.n_params)

    rng = np.random.RandomState([2017, 8, 30])

    def evaluate(eval_params):
        # Evaluate the model on legitimate test examples
        acc = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
        return acc

    model_load(sess, model_path)
    print('Restored model from %s' % model_path)
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = evaluate(eval_params)
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(nb_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, nb_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, nb_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    from cleverhans.attacks import SaliencyMapMethod
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'gamma': FLAGS.gamma,
        'theta': 1.,
        'symbolic_impl': True,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }
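    # symbolic_impl=True selects the graph-based JSMA implementation instead
    # of the per-iteration numpy loop.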
    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in range(0, nb_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, nb_samples))
        sample = X_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, channels)),
                    np.reshape(adv_x, (img_rows, img_cols, channels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * nb_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)
def gtsrb_blackbox(train_start=0, train_end=60000, test_start=0,
                   test_end=10000, nb_classes=NB_CLASSES,
                   batch_size=BATCH_SIZE, learning_rate=LEARNING_RATE,
                   nb_epochs=NB_EPOCHS, holdout=HOLDOUT, data_aug=DATA_AUG,
                   nb_epochs_s=NB_EPOCHS_S, lmbda=LMBDA,
                   aug_batch_size=AUG_BATCH_SIZE):
    """
    MNIST tutorial for the black-box attack from arxiv.org/abs/1602.02697
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :return: a dictionary with:
             * black-box model accuracy on test set
             * substitute model accuracy on test set
             * black-box model accuracy on adversarial examples transferred
               from the substitute model
    """

    # Set logging level to see debug information
    set_log_level(logging.DEBUG)

    # Dictionary used to keep track and return key accuracies
    accuracies = {}

    # Perform tutorial setup
    assert setup_tutorial()

    # Create TF session
    sess = tf.Session()

    t1 = time.time()
    x_train, y_train, x_VAL, y_VAL, x_test, y_test = read_gtsrb_dataset()
    print('Data reading time:', time.time() - t1, 'seconds')

    # Initialize substitute training set reserved for adversary
    x_sub = x_test[:holdout]

    savefigfromarray(x_sub[0], filename='my2.ppm')
    #y_sub = np.argmax(y_test[:holdout], axis=1)
    y_sub = y_test[:holdout]

    print(x_sub.shape)
    print(y_sub.shape)
    print(x_train.shape)
    print(y_train.shape)
    print(x_test.shape)
    print(y_test.shape)

    # Redefine test set as remaining samples unavailable to adversaries
    x_test = x_test[holdout:]
    y_test = y_test[holdout:]

    # Obtain Image parameters
    nchannels, img_rows, img_cols = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, nchannels, img_rows, img_cols))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    # Seed random number generator so tutorial is reproducible
    rng = np.random.RandomState([2017, 8, 30])

    # Simulate the black-box model locally
    print("Loading the black-box model.")
    t1 = time.time()
    prep_bbox_out = prep_bbox(sess, x, y, x_train, y_train, x_test, y_test,
                              nb_epochs, batch_size, learning_rate,
                              rng, nb_classes, img_rows, img_cols, nchannels)
    model, bbox_preds, accuracies['bbox'] = prep_bbox_out
    print(bbox_preds.shape)
    print('Oracle loading time:', time.time() - t1, 'seconds')

    # Evaluate the oracle on noised test samples (load a cached noisy set if present)
    rand_x_test, rand_y_test = [], y_test
    try:
        rand_x_test = np.load('rand_x_test.npy')
    except IOError:
        for itest in range(len(x_test)):
            rand_x_test.append(add_gaussian_noise(x_test[itest], std=0.1))
        rand_x_test = np.array(rand_x_test)
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, bbox_preds, rand_x_test, rand_y_test, args=eval_params)
    accuracies['oracle on noise'] = acc

    # Train substitute using method from https://arxiv.org/abs/1602.02697
    print("Training the substitute model.")
    t1 = time.time()

    train_sub_out = train_sub(sess, x, y, bbox_preds, x_train, y_train,
                              nb_classes, nb_epochs_s, batch_size,
                              learning_rate, data_aug, lmbda, aug_batch_size,
                              rng, img_rows, img_cols, nchannels)

    print('Substitute training time:', time.time() - t1, 'seconds')

    model_sub, preds_sub = train_sub_out
    print(preds_sub.shape)
    # Evaluate the substitute model on clean test examples
    eval_params = {'batch_size': batch_size}
    acc = model_eval(sess, x, y, preds_sub, x_train, y_train, args=eval_params)
    accuracies['sub'] = acc
    print('sub on clean test {0}'.format(acc))

    # Initialize the Fast Gradient Sign Method (FGSM) attack object.
    fgsm_par = {'eps': 0.3, 'ord': np.inf, 'clip_min': 0., 'clip_max': 1.}
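    # ord=np.inf selects the L-inf (sign) variant of FGSM with budget eps=0.3.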
    fgsm = FastGradientMethod(model_sub, sess=sess)

    # Craft adversarial examples using the substitute
    t1 = time.time()
    eval_params = {'batch_size': batch_size}
    x_adv_sub = fgsm.generate(x, **fgsm_par)
    print('Adversarial example crafting time:', time.time() - t1, 'seconds')

    # Evaluate the accuracy of the "black-box" model on adversarial examples
    accuracy = model_eval(sess, x, y, model.get_logits(x_adv_sub),
                          x_test, y_test, args=eval_params)
    print('Test accuracy of oracle on adversarial examples generated '
          'using the substitute: ' + str(accuracy))
    accuracies['bbox_on_sub_adv_ex'] = accuracy

    # Visualize one example:
    x_adv_sub_0 = x_adv_sub.eval(session=sess, feed_dict={x: x_test[0].reshape(1, 3, 48, 48)})
    print('ONE EXAMPLE: shape = {0}'.format(x_adv_sub_0.shape))
    print('symbolic x_adv_sub: shape = {0}'.format(x_adv_sub.shape))
    np.save('x_adv_sub_0', x_adv_sub_0)

    ###########################################################################
    # Visualize adversarial examples as a grid of pictures.
    ###########################################################################
    source_samples = 10
    img_rows = 48
    img_cols = 48
    nchannels = 3
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    # jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    # jsma_params = {'theta': 1., 'gamma': 0.1,
    #                'clip_min': 0., 'clip_max': 1.,
    #                'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, nchannels))

        # Loop over the first three target classes only
        for target in target_classes[:3]:
            print('Generating adv. example for target class %i' % target)

            # JSMA is commented out here; untargeted FGSM is run instead
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            # jsma_params['y_target'] = one_hot_target
            # adv_x = jsma.generate_np(sample, **jsma_params)
            adv_x = fgsm.generate_np(sample, **fgsm_par)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds_sub, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            fig1 = pair_visual(
                np.reshape(sample, (img_rows, img_cols, nchannels)),
                np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # Add our adversarial example to our grid data; the chained
            # assignment makes fig2 the raw image array, not a matplotlib figure
            fig2 = grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb
    fig1.savefig('fig1.png')
    np.save('fig2', fig2)  # np.save writes an array (fig2.npy), not an image

    print('--------------------------------------')

    return accuracies
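
# add_gaussian_noise and savefigfromarray used above are helpers not shown in
# this listing. A minimal sketch of the former, assuming pixel values in [0, 1]:
import numpy as np

def add_gaussian_noise(image, std=0.1):
    # Zero-mean Gaussian perturbation, clipped back to the valid pixel range
    return np.clip(image + np.random.normal(0.0, std, image.shape), 0.0, 1.0)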
def main(argv=None):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :return:
    """
    # Disable the Keras learning phase since we will be serving through TensorFlow
    keras.layers.core.K.set_learning_phase(0)

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Image dimension ordering should follow the TensorFlow convention
    if keras.backend.image_dim_ordering() != 'tf':
        keras.backend.set_image_dim_ordering('tf')
        print("INFO: '~/.keras/keras.json' sets 'image_dim_ordering' "
              "to 'th', temporarily setting to 'tf'")

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    keras.backend.set_session(sess)
    print("Created TensorFlow session and set Keras backend.")

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist()
    print("Loaded MNIST test data.")

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = cnn_model()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model if it does not exist in the train_dir folder
    saver = tf.train.Saver()
    save_path = os.path.join(FLAGS.train_dir, FLAGS.filename)
    if os.path.isfile(save_path):
        saver.restore(sess, os.path.join(FLAGS.train_dir, FLAGS.filename))
    else:
        train_params = {
            'nb_epochs': FLAGS.nb_epochs,
            'batch_size': FLAGS.batch_size,
            'learning_rate': FLAGS.learning_rate
        }
        model_train(sess, x, y, preds, X_train, Y_train,
                    args=train_params)
        saver.save(sess, save_path)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': FLAGS.batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test,
                          args=eval_params)
    assert X_test.shape[0] == 10000, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(FLAGS.source_samples) + ' * ' +
          str(FLAGS.nb_classes-1) + ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((FLAGS.nb_classes, FLAGS.source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((FLAGS.nb_classes, FLAGS.source_samples),
                             dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (FLAGS.nb_classes,
                  FLAGS.nb_classes,
                  FLAGS.img_rows,
                  FLAGS.img_cols,
                  FLAGS.nb_channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Define the SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)

    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, FLAGS.source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, FLAGS.source_samples))

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(FLAGS.nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            X_test[sample_ind:(sample_ind+1)],
            (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, FLAGS.nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params = {'theta': 1., 'gamma': 0.1,
                           'nb_classes': FLAGS.nb_classes, 'clip_min': 0.,
                           'clip_max': 1., 'targets': y,
                           'y_val': one_hot_target}
            adv_x = jsma.generate_np(X_test[sample_ind:(sample_ind+1)],
                                     **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if FLAGS.viz_enabled:
                if 'figure' not in vars():
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind+1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x,
                                   (FLAGS.img_rows, FLAGS.img_cols)))
                else:
                    figure = pair_visual(
                        np.reshape(X_test[sample_ind:(sample_ind+1)],
                                   (FLAGS.img_rows, FLAGS.img_cols)),
                        np.reshape(adv_x, (FLAGS.img_rows,
                                   FLAGS.img_cols)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (FLAGS.img_rows, FLAGS.img_cols, FLAGS.nb_channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((FLAGS.nb_classes - 1) * FLAGS.source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations * (results == 1))
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if FLAGS.viz_enabled:
        _ = grid_visual(grid_viz_data)
def mnist_tutorial_jsma(train_start=0, train_end=60000, test_start=0,
                        test_end=10000, viz_enabled=True, nb_epochs=6,
                        batch_size=128, nb_classes=10, source_samples=10,
                        learning_rate=0.001):
    """
    MNIST tutorial for the Jacobian-based saliency map approach (JSMA)
    :param train_start: index of first training set example
    :param train_end: index of last training set example
    :param test_start: index of first test set example
    :param test_end: index of last test set example
    :param viz_enabled: (boolean) activate plots of adversarial examples
    :param nb_epochs: number of epochs to train model
    :param batch_size: size of training batches
    :param nb_classes: number of output classes
    :param source_samples: number of test inputs to attack
    :param learning_rate: learning rate for training
    :return: an AccuracyReport object
    """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # MNIST-specific dimensions
    img_rows = 28
    img_cols = 28
    channels = 1

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get MNIST test data
    X_train, Y_train, X_test, Y_test = data_mnist(train_start=train_start,
                                                  train_end=train_end,
                                                  test_start=test_start,
                                                  test_end=test_end)

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, 28, 28, 1))
    y = tf.placeholder(tf.float32, shape=(None, 10))

    # Define TF model graph
    model = make_basic_cnn()
    preds = model(x)
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train an MNIST model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    model_train(sess, x, y, preds, X_train, Y_train, args=train_params,
                rng=rng)

    # Evaluate the accuracy of the MNIST model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, X_test, Y_test, args=eval_params)
    assert X_test.shape[0] == test_end - test_start, X_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes-1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, nb_classes, img_rows, img_cols, channels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, back='tf', sess=sess)
    jsma_params = {'theta': 1., 'gamma': 0.1,
                   'clip_min': 0., 'clip_max': 1.,
                   'y_target': None}

    figure = None
    # Loop over the samples we want to perturb into adversarial examples
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = X_test[sample_ind:(sample_ind+1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(Y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
            sample, (img_rows, img_cols, channels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)

            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = X_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]

            # Display the original and adversarial images side-by-side
            if viz_enabled:
                figure = pair_visual(
                    np.reshape(sample, (img_rows, img_cols, channels)),
                    np.reshape(adv_x, (img_rows, img_cols, channels)), figure)

            # Add our adversarial example to our grid data
            grid_viz_data[target, current_class, :, :, :] = np.reshape(
                adv_x, (img_rows, img_cols, channels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
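    # results holds a 0/1 flag per (target class, sample) pair, so this mean
    # is the targeted-attack success rate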
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations[results == 1])
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Close TF session
    sess.close()

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        import matplotlib.pyplot as plt
        plt.close(figure)
        _ = grid_visual(grid_viz_data)

    return report
Example #26
0
def cifar10_tutorial_jsma(train_start=0,
                          train_end=60000,
                          test_start=0,
                          test_end=10000,
                          viz_enabled=VIZ_ENABLED,
                          nb_epochs=NB_EPOCHS,
                          batch_size=BATCH_SIZE,
                          source_samples=SOURCE_SAMPLES,
                          learning_rate=LEARNING_RATE,
                          model_path=MODEL_PATH,
                          noise_output=NOISE_OUTPUT):
    """
  CIFAR10 tutorial for the Jacobian-based saliency map approach (JSMA)
  :param train_start: index of first training set example
  :param train_end: index of last training set example
  :param test_start: index of first test set example
  :param test_end: index of last test set example
  :param viz_enabled: (boolean) activate plots of adversarial examples
  :param nb_epochs: number of epochs to train model
  :param batch_size: size of training batches
  :param nb_classes: number of output classes
  :param source_samples: number of test inputs to attack
  :param learning_rate: learning rate for training
  :return: an AccuracyReport object
  """
    # Object used to keep track of (and return) key accuracies
    report = AccuracyReport()

    # Set TF random seed to improve reproducibility
    tf.set_random_seed(1234)

    # Create TF session and set as Keras backend session
    sess = tf.Session()
    print("Created TensorFlow session.")

    set_log_level(logging.DEBUG)

    # Get CIFAR10 test data
    cifar10 = CIFAR10(train_start=train_start,
                      train_end=train_end,
                      test_start=test_start,
                      test_end=test_end)
    x_train, y_train = cifar10.get_set('train')
    x_test, y_test = cifar10.get_set('test')

    # Obtain Image Parameters
    img_rows, img_cols, nchannels = x_train.shape[1:4]
    nb_classes = y_train.shape[1]

    # Define input TF placeholder
    x = tf.placeholder(tf.float32, shape=(None, img_rows, img_cols, nchannels))
    y = tf.placeholder(tf.float32, shape=(None, nb_classes))

    nb_filters = 64
    # Define TF model graph
    model = ModelAllConvolutional('model1',
                                  nb_classes,
                                  nb_filters,
                                  input_shape=[32, 32, 3])
    preds = model.get_logits(x)
    loss = CrossEntropy(model, smoothing=0.1)
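    # smoothing=0.1 softens the one-hot training labels toward the uniform
    # distribution (label-smoothing regularization)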
    print("Defined TensorFlow model graph.")

    ###########################################################################
    # Training the model using TensorFlow
    ###########################################################################

    # Train a CIFAR10 model
    train_params = {
        'nb_epochs': nb_epochs,
        'batch_size': batch_size,
        'learning_rate': learning_rate,
        'filename': os.path.split(model_path)[-1]
    }
    sess.run(tf.global_variables_initializer())
    rng = np.random.RandomState([2017, 8, 30])
    train(sess, loss, x_train, y_train, args=train_params, rng=rng)

    # Evaluate the accuracy of the CIFAR10 model on legitimate test examples
    eval_params = {'batch_size': batch_size}
    accuracy = model_eval(sess, x, y, preds, x_test, y_test, args=eval_params)
    assert x_test.shape[0] == test_end - test_start, x_test.shape
    print('Test accuracy on legitimate test examples: {0}'.format(accuracy))
    report.clean_train_clean_eval = accuracy

    ###########################################################################
    # Craft adversarial examples using the Jacobian-based saliency map approach
    ###########################################################################
    print('Crafting ' + str(source_samples) + ' * ' + str(nb_classes - 1) +
          ' adversarial examples')

    # Keep track of success (adversarial example classified in target)
    results = np.zeros((nb_classes, source_samples), dtype='i')

    # Rate of perturbed features for each test set example and target class
    perturbations = np.zeros((nb_classes, source_samples), dtype='f')

    # Initialize our array for grid visualization
    grid_shape = (nb_classes, 1, img_rows, img_cols, nchannels)
    grid_viz_data = np.zeros(grid_shape, dtype='f')

    # Instantiate a SaliencyMapMethod attack object
    jsma = SaliencyMapMethod(model, sess=sess)
    jsma_params = {
        'theta': 1.,
        'gamma': 0.1,
        'clip_min': 0.,
        'clip_max': 1.,
        'y_target': None
    }
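    # With gamma = 0.1, at most 10% of the 32 * 32 * 3 = 3072 input features
    # (roughly 307 values) may be modified per image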
    # Loop over the samples we want to perturb into adversarial examples
    adv_all = np.zeros((nb_classes, img_rows, img_cols, nchannels), dtype='f')
    sample_all = np.zeros((nb_classes, img_rows, img_cols, nchannels),
                          dtype='f')
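    # Both arrays are indexed by source class and overwritten on every target
    # iteration, so only the adversarial example for the last target class of
    # each source class is kept for the visualization below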
    for sample_ind in xrange(0, source_samples):
        print('--------------------------------------')
        print('Attacking input %i/%i' % (sample_ind + 1, source_samples))
        sample = x_test[sample_ind:(sample_ind + 1)]

        # We want to find an adversarial example for each possible target class
        # (i.e. all classes that differ from the label given in the dataset)
        current_class = int(np.argmax(y_test[sample_ind]))
        target_classes = other_classes(nb_classes, current_class)

        # For the grid visualization, keep original images along the diagonal
        # grid_viz_data[current_class, current_class, :, :, :] = np.reshape(
        #     sample, (img_rows, img_cols, nchannels))

        # Loop over all target classes
        for target in target_classes:
            print('Generating adv. example for target class %i' % target)
            # This call runs the Jacobian-based saliency map approach
            one_hot_target = np.zeros((1, nb_classes), dtype=np.float32)
            one_hot_target[0, target] = 1
            jsma_params['y_target'] = one_hot_target
            adv_x = jsma.generate_np(sample, **jsma_params)
            adv_all[current_class] = adv_x
            sample_all[current_class] = sample

            # Check if success was achieved
            res = int(model_argmax(sess, x, preds, adv_x) == target)

            # Compute the number of modified features
            adv_x_reshape = adv_x.reshape(-1)
            test_in_reshape = x_test[sample_ind].reshape(-1)
            nb_changed = np.where(adv_x_reshape != test_in_reshape)[0].shape[0]
            percent_perturb = float(nb_changed) / adv_x.reshape(-1).shape[0]
            # Display the original and adversarial images side-by-side
            # if viz_enabled:
            #   figure = pair_visual(
            #       np.reshape(sample, (img_rows, img_cols, nchannels)),
            #       np.reshape(adv_x, (img_rows, img_cols, nchannels)), figure)

            # # Add our adversarial example to our grid data
            # grid_viz_data[target, current_class, :, :, :] = np.reshape(
            #     adv_x, (img_rows, img_cols, nchannels))

            # Update the arrays for later analysis
            results[target, sample_ind] = res
            perturbations[target, sample_ind] = percent_perturb

    print('--------------------------------------')

    # Compute the number of adversarial examples that were successfully found
    nb_targets_tried = ((nb_classes - 1) * source_samples)
    succ_rate = float(np.sum(results)) / nb_targets_tried
    print('Avg. rate of successful adv. examples {0:.4f}'.format(succ_rate))
    report.clean_train_adv_eval = 1. - succ_rate

    # Compute the average distortion introduced by the algorithm
    percent_perturbed = np.mean(perturbations)
    print('Avg. rate of perturbed features {0:.4f}'.format(percent_perturbed))

    # Compute the average distortion introduced for successful samples only
    percent_perturb_succ = np.mean(perturbations[results == 1])
    print('Avg. rate of perturbed features for successful '
          'adversarial examples {0:.4f}'.format(percent_perturb_succ))

    # Compute the average L2 norm of the perturbations
    l2_norm = np.mean(np.sum((adv_all - sample_all)**2, axis=(1, 2, 3))**.5)
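    # (the sum over the pixel axes gives each image's squared L2 norm; taking
    # the square root and then the mean averages the per-image norms)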
    print('Avg. L_2 norm of perturbations {0:.4f}'.format(l2_norm))

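    # Fill the single-column visualization grid: one row per source class,
    # containing either the raw perturbation (adv - original) or the
    # adversarial image itself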
    for i in range(nb_classes):
        if noise_output:
            image = adv_all[i] - sample_all[i]
        else:
            image = adv_all[i]
        grid_viz_data[i, 0] = image

    # Close TF session
    sess.close()

    def save_visual(data, path):
        """
    Modified version of cleverhans.plot.pyplot
    """
        import matplotlib.pyplot as plt

        figure = plt.figure()
        # figure.canvas.set_window_title('Cleverhans: Grid Visualization')

        # Add the images to the plot
        num_cols = data.shape[0]
        num_rows = data.shape[1]
        num_channels = data.shape[4]
        for y in range(num_rows):
            for x in range(num_cols):
                figure.add_subplot(num_rows, num_cols,
                                   (x + 1) + (y * num_cols))
                plt.axis('off')

                if num_channels == 1:
                    plt.imshow(data[x, y, :, :, 0], cmap='gray')
                else:
                    plt.imshow(data[x, y, :, :, :])

        # Draw the plot and return
        plt.savefig(path)

    # Finally, block & display a grid of all the adversarial examples
    if viz_enabled:
        if noise_output:
            image_name = "output/jsma_cifar10_noise.png"
        else:
            image_name = "output/jsma_cifar10.png"
        save_visual(grid_viz_data, image_name)

    return report
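

# A minimal entry-point sketch (hypothetical, not part of the original
# tutorial), assuming the module-level defaults referenced in the signature
# above (NB_EPOCHS, BATCH_SIZE, SOURCE_SAMPLES, ...) are defined as in the
# cleverhans tutorial scripts, and that an 'output/' directory exists for
# the saved figures:
if __name__ == '__main__':
    cifar10_tutorial_jsma(nb_epochs=NB_EPOCHS,
                          batch_size=BATCH_SIZE,
                          source_samples=SOURCE_SAMPLES,
                          learning_rate=LEARNING_RATE,
                          viz_enabled=VIZ_ENABLED,
                          noise_output=NOISE_OUTPUT)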