Example #1
    def __init__(
            self,
            predict_fn: Union[Callable, tf.keras.Model, 'keras.Model'],
            shape: Tuple[int, ...],
            distance_fn: str = 'l1',
            target_proba: float = 1.0,
            target_class: Union[str, int] = 'other',
            max_iter: int = 1000,
            early_stop: int = 50,
            lam_init: float = 1e-1,
            max_lam_steps: int = 10,
            tol: float = 0.05,
            learning_rate_init: float = 0.1,
            feature_range: Union[Tuple, str] = (-1e10, 1e10),
            eps: Union[float, np.ndarray] = 0.01,  # feature-wise epsilons
            init: str = 'identity',
            decay: bool = True,
            write_dir: str = None,
            debug: bool = False,
            sess: tf.Session = None) -> None:
        """
        Initialize counterfactual explanation method based on Wachter et al. (2017).
        Paper: https://arxiv.org/abs/1711.00399

        Parameters
        ----------
        predict_fn
            Keras or TensorFlow model or any other model's prediction function returning class probabilities
        shape
            Shape of input data starting with batch size
        distance_fn
            Distance function to use in the loss term
        target_proba
            Target probability for the counterfactual to reach
        target_class
            Target class for the counterfactual to reach, one of 'other', 'same' or an integer denoting
            desired class membership for the counterfactual instance
        max_iter
            Maximum number of iterations to run the gradient descent for (inner loop)
        early_stop
            Number of steps after which to terminate gradient descent if all or none of the found instances are solutions
        lam_init
            Initial regularization constant for the prediction part of the Wachter loss
        max_lam_steps
            Maximum number of times to adjust the regularization constant (outer loop) before terminating the search
        tol
            Tolerance for the counterfactual target probability
        learning_rate_init
            Initial learning rate for each outer loop of lambda
        feature_range
            Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
            numpy arrays with dimension (1 x nb of features) for feature-wise ranges
        eps
            Gradient step sizes used in calculating numerical gradients, defaults to a single value for all
            features, but can be passed an array for feature-wise step sizes
        init
            Initialization method for the search of counterfactuals, currently must be 'identity'
        decay
            Flag to decay learning rate to zero for each outer loop over lambda
        write_dir
            Directory to write TensorBoard files to
        debug
            Flag to write TensorBoard summaries for debugging
        sess
            Optional TensorFlow session that will be used if passed instead of creating or inferring one internally
        """
        super().__init__(meta=copy.deepcopy(DEFAULT_META_CF))
        # get params for storage in meta
        params = locals()
        remove = ['self', 'predict_fn', 'sess', '__class__']
        for key in remove:
            params.pop(key)
        self.meta['params'].update(params)

        self.data_shape = shape
        self.batch_size = shape[0]
        self.target_class = target_class

        # options for the optimizer
        self.max_iter = max_iter
        self.lam_init = lam_init
        self.tol = tol
        self.max_lam_steps = max_lam_steps
        self.early_stop = early_stop

        self.eps = eps
        self.init = init
        self.feature_range = feature_range
        self.target_proba_arr = target_proba * np.ones(self.batch_size)

        self.debug = debug

        # check if the passed object is a model and get session
        is_model, is_keras, model_sess = _check_keras_or_tf(predict_fn)
        self.meta['params'].update(is_model=is_model, is_keras=is_keras)

        # if session provided, use it
        if isinstance(sess, tf.Session):
            self.sess = sess
        else:
            self.sess = model_sess

        if is_model:  # Keras or TF model
            self.model = True
            self.predict_fn = predict_fn.predict  # type: ignore # array function
            self.predict_tn = predict_fn  # tensor function

        else:  # black-box model
            self.predict_fn = predict_fn
            self.predict_tn = None
            self.model = False

        self.n_classes = self.predict_fn(np.zeros(shape)).shape[1]

        # flag to keep track if explainer is fit or not
        self.fitted = False

        # set up graph session for optimization (counterfactual search)
        with tf.variable_scope('cf_search', reuse=tf.AUTO_REUSE):

            # define variables for original and candidate counterfactual instances, target labels and lambda
            self.orig = tf.get_variable('original',
                                        shape=shape,
                                        dtype=tf.float32)
            self.cf = tf.get_variable(
                'counterfactual',
                shape=shape,
                dtype=tf.float32,
                constraint=lambda x: tf.clip_by_value(x, feature_range[0],
                                                      feature_range[1]))
            # the following will be a 1-hot encoding of the target class (as predicted by the model)
            self.target = tf.get_variable('target',
                                          shape=(self.batch_size,
                                                 self.n_classes),
                                          dtype=tf.float32)

            # constant target probability and global step variable
            self.target_proba = tf.constant(target_proba *
                                            np.ones(self.batch_size),
                                            dtype=tf.float32,
                                            name='target_proba')
            self.global_step = tf.Variable(0.0,
                                           trainable=False,
                                           name='global_step')

            # lambda hyperparameter - placeholder instead of variable as annealed in first epoch
            self.lam = tf.placeholder(tf.float32,
                                      shape=(self.batch_size,),
                                      name='lam')

            # define placeholders that will be assigned to relevant variables
            self.assign_orig = tf.placeholder(tf.float32,
                                              shape,
                                              name='assign_orig')
            self.assign_cf = tf.placeholder(tf.float32,
                                            shape,
                                            name='assign_cf')
            self.assign_target = tf.placeholder(tf.float32,
                                                shape=(self.batch_size,
                                                       self.n_classes),
                                                name='assign_target')

            # L1 distance and MAD constants
            # TODO: MADs?
            ax_sum = list(np.arange(1, len(self.data_shape)))
            if distance_fn == 'l1':
                self.dist = tf.reduce_sum(tf.abs(self.cf - self.orig),
                                          axis=ax_sum,
                                          name='l1')
            else:
                logger.exception('Distance metric %s not supported',
                                 distance_fn)
                raise ValueError(f'Distance metric {distance_fn} not supported')

            # distance loss
            self.loss_dist = self.lam * self.dist

            # prediction loss
            if not self.model:
                # will need to calculate gradients numerically
                self.loss_opt = self.loss_dist
            else:
                # autograd gradients throughout
                self.pred_proba = self.predict_tn(self.cf)

                # 3 cases for target_class
                if target_class == 'same':
                    self.pred_proba_class = tf.reduce_max(
                        self.target * self.pred_proba, 1)
                elif target_class == 'other':
                    self.pred_proba_class = tf.reduce_max(
                        (1 - self.target) * self.pred_proba, 1)
                elif target_class in range(self.n_classes):
                    # if class is specified, this is known in advance
                    self.pred_proba_class = tf.reduce_max(
                        tf.one_hot(
                            target_class, self.n_classes, dtype=tf.float32) *
                        self.pred_proba, 1)
                else:
                    logger.exception('Target class %s unknown', target_class)
                    raise ValueError(f'Target class {target_class} unknown')

                self.loss_pred = tf.square(self.pred_proba_class -
                                           self.target_proba)

                self.loss_opt = self.loss_pred + self.loss_dist

            # optimizer
            if decay:
                self.learning_rate = tf.train.polynomial_decay(
                    learning_rate_init,
                    self.global_step,
                    self.max_iter,
                    0.0,
                    power=1.0)
            else:
                self.learning_rate = tf.convert_to_tensor(learning_rate_init)

            # TODO optional argument to change type, learning rate scheduler
            opt = tf.train.AdamOptimizer(self.learning_rate)

            # first compute gradients, then apply them
            self.compute_grads = opt.compute_gradients(self.loss_opt,
                                                       var_list=[self.cf])
            self.grad_ph = tf.placeholder(shape=shape,
                                          dtype=tf.float32,
                                          name='grad_cf')
            grad_and_var = [(self.grad_ph, self.cf)]
            self.apply_grads = opt.apply_gradients(
                grad_and_var, global_step=self.global_step)

        # variables to initialize
        self.setup = []  # type: list
        self.setup.append(self.orig.assign(self.assign_orig))
        self.setup.append(self.cf.assign(self.assign_cf))
        self.setup.append(self.target.assign(self.assign_target))

        self.tf_init = tf.variables_initializer(var_list=tf.global_variables(
            scope='cf_search'))

        # tensorboard
        if write_dir is not None:
            self.writer = tf.summary.FileWriter(write_dir,
                                                tf.get_default_graph())
            self.writer.add_graph(tf.get_default_graph())

        # return templates
        self.instance_dict = dict.fromkeys(
            ['X', 'distance', 'lambda', 'index', 'class', 'proba', 'loss'])
        self.return_dict = copy.deepcopy(DEFAULT_DATA_CF)
        self.return_dict['all'] = {i: [] for i in range(self.max_lam_steps)}
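
The constructor above only builds the TF1 search graph; the counterfactual search itself runs in `explain`. Below is a minimal usage sketch (not part of the original example), assuming alibi's `CounterFactual` explainer and a hypothetical trained Keras classifier saved at 'model.h5'; all paths, shapes and hyperparameter values are illustrative.

import numpy as np
import tensorflow as tf
from alibi.explainers import CounterFactual

tf.compat.v1.disable_eager_execution()  # the explainer relies on TF1-style graphs

model = tf.keras.models.load_model('model.h5')  # hypothetical classifier
shape = (1, 28, 28, 1)                          # batch size first, as required

cf = CounterFactual(model, shape=shape, target_proba=0.9, target_class='other',
                    max_iter=1000, lam_init=1e-1, max_lam_steps=10,
                    learning_rate_init=0.1, tol=0.05)

X = np.random.rand(*shape).astype(np.float32)  # instance to explain
explanation = cf.explain(X)  # best counterfactual plus per-lambda intermediate results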
Example #2
    def __init__(self,
                 predict: Union[Callable, tf.keras.Model, 'keras.Model'],
                 shape: tuple,
                 kappa: float = 0.,
                 beta: float = .1,
                 feature_range: tuple = (-1e10, 1e10),
                 gamma: float = 0.,
                 ae_model: Union[tf.keras.Model, 'keras.Model'] = None,
                 enc_model: Union[tf.keras.Model, 'keras.Model'] = None,
                 theta: float = 0.,
                 use_kdtree: bool = False,
                 learning_rate_init: float = 1e-2,
                 max_iterations: int = 1000,
                 c_init: float = 10.,
                 c_steps: int = 10,
                 eps: tuple = (1e-3, 1e-3),
                 clip: tuple = (-1000., 1000.),
                 update_num_grad: int = 1,
                 write_dir: str = None,
                 sess: tf.compat.v1.Session = None) -> None:
        """
        Initialize prototypical counterfactual method.
        Paper: https://arxiv.org/abs/1907.02584

        Parameters
        ----------
        predict
            Keras or TensorFlow model or any other model's prediction function returning class probabilities
        shape
            Shape of input data starting with batch size
        kappa
            Confidence parameter for the attack loss term
        beta
            Regularization constant for L1 loss term
        feature_range
            Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
            numpy arrays with dimension (1 x nb of features) for feature-wise ranges
        gamma
            Regularization constant for optional auto-encoder loss term
        ae_model
            Optional auto-encoder model used for loss regularization
        enc_model
            Optional encoder model used to guide instance perturbations towards a class prototype
        theta
            Constant for the prototype search loss term
        use_kdtree
            Whether to use k-d trees for the prototype loss term if no encoder is available
        learning_rate_init
            Initial learning rate of optimizer
        max_iterations
            Maximum number of iterations for finding a counterfactual
        c_init
            Initial value to scale the attack loss term
        c_steps
            Number of iterations to adjust the constant scaling the attack loss term
        eps
            If numerical gradients are used to compute dL/dx = (dL/dp) * (dp/dx), then eps[0] is used to
            calculate dL/dp and eps[1] is used for dp/dx. eps[0] and eps[1] can be a combination of float values
            and numpy arrays. For eps[0], the array dimension should be (1 x nb of prediction categories) and
            for eps[1] it should be (1 x nb of features)
        clip
            Tuple with min and max clip ranges for both the numerical gradients and the gradients
            obtained from the TensorFlow graph
        update_num_grad
            If numerical gradients are used, they will be updated every update_num_grad iterations
        write_dir
            Directory to write TensorBoard files to
        sess
            Optional TensorFlow session that will be used if passed instead of creating or inferring one internally
        """
        self.predict = predict

        # check whether the model, encoder and auto-encoder are Keras or TF models and get session
        is_model, is_model_keras, model_sess = _check_keras_or_tf(predict)
        is_ae, is_ae_keras, ae_sess = _check_keras_or_tf(ae_model)
        is_enc, is_enc_keras, enc_sess = _check_keras_or_tf(enc_model)
        # TODO: check ae, enc and model are all compatible

        # if session provided, use it
        if isinstance(sess, tf.compat.v1.Session):
            self.sess = sess
        else:
            self.sess = model_sess

        if is_model:  # Keras or TF model
            self.model = True
            self.classes = self.sess.run(
                self.predict(
                    tf.convert_to_tensor(np.zeros(shape),
                                         dtype=tf.float32))).shape[1]
        else:  # black-box model
            self.model = False
            self.classes = self.predict(np.zeros(shape)).shape[1]

        if is_enc:
            self.enc_model = True
        else:
            self.enc_model = False

        if is_ae:
            self.ae_model = True
        else:
            self.ae_model = False

        if use_kdtree and self.enc_model:
            logger.warning(
                'Both an encoder and k-d trees enabled. Using the encoder for the prototype loss term.'
            )

        if use_kdtree or self.enc_model:
            self.enc_or_kdtree = True
        else:
            self.enc_or_kdtree = False

        self.shape = shape
        self.kappa = kappa
        self.beta = beta
        self.gamma = gamma
        self.theta = theta
        self.ae = ae_model
        self.enc = enc_model
        self.use_kdtree = use_kdtree
        self.batch_size = shape[0]
        self.max_iterations = max_iterations
        self.c_init = c_init
        self.c_steps = c_steps
        self.update_num_grad = update_num_grad
        self.eps = eps
        self.clip = clip
        self.write_dir = write_dir

        # define tf variables for original and perturbed instances, and target labels
        self.orig = tf.Variable(np.zeros(shape), dtype=tf.float32, name='orig')
        self.adv = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv')
        self.adv_s = tf.Variable(np.zeros(shape),
                                 dtype=tf.float32,
                                 name='adv_s')
        self.target = tf.Variable(np.zeros((self.batch_size, self.classes)),
                                  dtype=tf.float32,
                                  name='target')

        # variable for target class proto
        if self.enc_model:
            self.shape_enc = self.enc.predict(np.zeros(shape)).shape
        else:
            self.shape_enc = shape

        self.target_proto = tf.Variable(np.zeros(self.shape_enc),
                                        dtype=tf.float32,
                                        name='target_proto')

        # define tf variable for constant used in FISTA optimization
        self.const = tf.Variable(np.zeros(self.batch_size),
                                 dtype=tf.float32,
                                 name='const')
        self.global_step = tf.Variable(0.0,
                                       trainable=False,
                                       name='global_step')

        # define placeholders that will be assigned to relevant variables
        self.assign_orig = tf.placeholder(tf.float32,
                                          shape,
                                          name='assign_orig')
        self.assign_adv = tf.placeholder(tf.float32, shape, name='assign_adv')
        self.assign_adv_s = tf.placeholder(tf.float32,
                                           shape,
                                           name='assign_adv_s')
        self.assign_target = tf.placeholder(tf.float32,
                                            (self.batch_size, self.classes),
                                            name='assign_target')
        self.assign_const = tf.placeholder(tf.float32, [self.batch_size],
                                           name='assign_const')
        self.assign_target_proto = tf.placeholder(tf.float32,
                                                  self.shape_enc,
                                                  name='assign_target_proto')

        # define conditions and values for element-wise shrinkage thresholding
        with tf.name_scope('shrinkage_thresholding') as scope:
            cond = [
                tf.cast(
                    tf.greater(tf.subtract(self.adv_s, self.orig), self.beta),
                    tf.float32),
                tf.cast(
                    tf.less_equal(tf.abs(tf.subtract(self.adv_s, self.orig)),
                                  self.beta), tf.float32),
                tf.cast(
                    tf.less(tf.subtract(self.adv_s, self.orig),
                            tf.negative(self.beta)), tf.float32)
            ]
            upper = tf.minimum(tf.subtract(self.adv_s, self.beta),
                               tf.cast(feature_range[1], tf.float32))
            lower = tf.maximum(tf.add(self.adv_s, self.beta),
                               tf.cast(feature_range[0], tf.float32))
            self.assign_adv = tf.multiply(cond[0], upper) + tf.multiply(
                cond[1], self.orig) + tf.multiply(cond[2], lower)

        # perturbation update and vector projection on correct feature range set
        with tf.name_scope('perturbation_y') as scope:
            self.zt = tf.divide(self.global_step,
                                self.global_step + tf.cast(3, tf.float32))
            self.assign_adv_s = self.assign_adv + tf.multiply(
                self.zt, self.assign_adv - self.adv)
            # map to feature space
            self.assign_adv_s = tf.minimum(
                self.assign_adv_s, tf.cast(feature_range[1], tf.float32))
            self.assign_adv_s = tf.maximum(
                self.assign_adv_s, tf.cast(feature_range[0], tf.float32))

        # assign counterfactual of step k+1 to k
        with tf.name_scope('update_adv') as scope:
            self.adv_updater = tf.assign(self.adv, self.assign_adv)
            self.adv_updater_s = tf.assign(self.adv_s, self.assign_adv_s)

        # from perturbed instance, derive deviation delta
        with tf.name_scope('update_delta') as scope:
            self.delta = self.orig - self.adv
            self.delta_s = self.orig - self.adv_s

        # define L1 and L2 loss terms; L1+L2 is later used as an optimization constraint for FISTA
        ax_sum = list(np.arange(1, len(shape)))
        with tf.name_scope('loss_l1_l2') as scope:
            self.l2 = tf.reduce_sum(tf.square(self.delta), axis=ax_sum)
            self.l2_s = tf.reduce_sum(tf.square(self.delta_s), axis=ax_sum)
            self.l1 = tf.reduce_sum(tf.abs(self.delta), axis=ax_sum)
            self.l1_s = tf.reduce_sum(tf.abs(self.delta_s), axis=ax_sum)
            self.l1_l2 = self.l2 + tf.multiply(self.l1, self.beta)
            self.l1_l2_s = self.l2_s + tf.multiply(self.l1_s, self.beta)

            # sum losses
            self.loss_l1 = tf.reduce_sum(self.l1)
            self.loss_l1_s = tf.reduce_sum(self.l1_s)
            self.loss_l2 = tf.reduce_sum(self.l2)
            self.loss_l2_s = tf.reduce_sum(self.l2_s)

        with tf.name_scope('loss_ae') as scope:
            # gamma * AE loss
            if self.ae_model:
                self.loss_ae = self.gamma * tf.square(
                    tf.norm(self.ae(self.adv) - self.adv))
                self.loss_ae_s = self.gamma * tf.square(
                    tf.norm(self.ae(self.adv_s) - self.adv_s))
            else:  # no auto-encoder available
                self.loss_ae = tf.constant(0.)
                self.loss_ae_s = tf.constant(0.)

        with tf.name_scope('loss_attack') as scope:
            if not self.model:
                self.loss_attack = tf.placeholder(tf.float32)
            elif self.c_init == 0. and self.c_steps == 1:  # prediction loss term not used
                # make predictions on perturbed instance
                self.pred_proba = self.predict(self.adv)
                self.pred_proba_s = self.predict(self.adv_s)

                self.loss_attack = tf.constant(0.)
                self.loss_attack_s = tf.constant(0.)
            else:
                # make predictions on perturbed instance
                self.pred_proba = self.predict(self.adv)
                self.pred_proba_s = self.predict(self.adv_s)

                # probability of target label prediction
                self.target_proba = tf.reduce_sum(
                    self.target * self.pred_proba, 1)
                target_proba_s = tf.reduce_sum(self.target * self.pred_proba_s,
                                               1)

                # max probability of non target label prediction
                self.nontarget_proba_max = tf.reduce_max(
                    (1 - self.target) * self.pred_proba -
                    (self.target * 10000), 1)
                nontarget_proba_max_s = tf.reduce_max(
                    (1 - self.target) * self.pred_proba_s -
                    (self.target * 10000), 1)

                # loss term f(x,d)
                loss_attack = tf.maximum(
                    0.0,
                    -self.nontarget_proba_max + self.target_proba + self.kappa)
                loss_attack_s = tf.maximum(
                    0.0, -nontarget_proba_max_s + target_proba_s + self.kappa)

                # c * f(x,d)
                self.loss_attack = tf.reduce_sum(self.const * loss_attack)
                self.loss_attack_s = tf.reduce_sum(self.const * loss_attack_s)

        with tf.name_scope('loss_prototype') as scope:
            if self.enc_model:
                self.loss_proto = self.theta * tf.square(
                    tf.norm(self.enc(self.adv) - self.target_proto))
                self.loss_proto_s = self.theta * tf.square(
                    tf.norm(self.enc(self.adv_s) - self.target_proto))
            elif self.use_kdtree:
                self.loss_proto = self.theta * tf.square(
                    tf.norm(self.adv - self.target_proto))
                self.loss_proto_s = self.theta * tf.square(
                    tf.norm(self.adv_s - self.target_proto))
            else:  # no encoder available and no k-d trees used
                self.loss_proto = tf.constant(0.)
                self.loss_proto_s = tf.constant(0.)

        with tf.name_scope('loss_combined') as scope:
            # no need for L1 term in loss to optimize when using FISTA
            if self.model:
                self.loss_opt = self.loss_attack_s + self.loss_l2_s + self.loss_ae_s + self.loss_proto_s
            else:  # separate numerical computation of loss attack gradient
                self.loss_opt = self.loss_l2_s + self.loss_ae_s + self.loss_proto_s

            # add L1 term to overall loss; this is not the loss that will be directly optimized
            self.loss_total = (self.loss_attack + self.loss_l2 + self.loss_ae +
                               tf.multiply(self.beta, self.loss_l1) +
                               self.loss_proto)

        with tf.name_scope('training') as scope:
            self.learning_rate = tf.train.polynomial_decay(learning_rate_init,
                                                           self.global_step,
                                                           self.max_iterations,
                                                           0,
                                                           power=0.5)
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            start_vars = set(x.name for x in tf.global_variables())

            # first compute, then apply grads
            self.compute_grads = optimizer.compute_gradients(
                self.loss_opt, var_list=[self.adv_s])
            self.grad_ph = tf.placeholder(tf.float32, name='grad_adv_s')
            # get the last 'adv_s' variable in case the explainer is
            # re-initialized and a new graph is created
            var = [
                tvar for tvar in tf.trainable_variables()
                if tvar.name.startswith('adv_s')
            ][-1]
            grad_and_var = [(self.grad_ph, var)]
            self.apply_grads = optimizer.apply_gradients(
                grad_and_var, global_step=self.global_step)
            end_vars = tf.global_variables()
            new_vars = [x for x in end_vars if x.name not in start_vars]

        # variables to initialize
        self.setup = []  # type: list
        self.setup.append(self.orig.assign(self.assign_orig))
        self.setup.append(self.target.assign(self.assign_target))
        self.setup.append(self.const.assign(self.assign_const))
        self.setup.append(self.adv.assign(self.assign_adv))
        self.setup.append(self.adv_s.assign(self.assign_adv_s))
        self.setup.append(self.target_proto.assign(self.assign_target_proto))

        self.init = tf.variables_initializer(
            var_list=[self.global_step] + [self.adv_s] + [self.adv] + new_vars)

        if self.write_dir is not None:
            self.writer = tf.summary.FileWriter(write_dir,
                                                tf.get_default_graph())
            self.writer.add_graph(tf.get_default_graph())
        else:
            self.writer = None
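
As above, a short usage sketch may help (not from the original source): it assumes alibi's `CounterFactualProto` class and hypothetical trained classifier, auto-encoder and encoder models; `fit` builds the class prototypes before `explain` is called.

import numpy as np
import tensorflow as tf
from alibi.explainers import CounterFactualProto

tf.compat.v1.disable_eager_execution()  # TF1-style graph construction

model = tf.keras.models.load_model('model.h5')  # hypothetical classifier
ae = tf.keras.models.load_model('ae.h5')        # hypothetical auto-encoder
enc = tf.keras.models.load_model('enc.h5')      # hypothetical encoder

shape = (1, 28, 28, 1)
cf = CounterFactualProto(model, shape, beta=.1, gamma=100., theta=100.,
                         ae_model=ae, enc_model=enc,
                         max_iterations=500, c_init=1., c_steps=5)

X_train = np.random.rand(256, 28, 28, 1).astype(np.float32)  # stand-in training data
cf.fit(X_train)  # builds class prototypes from the (encoded) training set
explanation = cf.explain(np.random.rand(*shape).astype(np.float32))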
Example #3
    def __init__(self,
                 predict: Union[Callable, tf.keras.Model, 'keras.Model'],
                 mode: str,
                 shape: tuple,
                 kappa: float = 0.,
                 beta: float = .1,
                 feature_range: tuple = (-1e10, 1e10),
                 gamma: float = 0.,
                 ae_model: Union[tf.keras.Model, 'keras.Model'] = None,
                 learning_rate_init: float = 1e-2,
                 max_iterations: int = 1000,
                 c_init: float = 10.,
                 c_steps: int = 10,
                 eps: tuple = (1e-3, 1e-3),
                 clip: tuple = (-100., 100.),
                 update_num_grad: int = 1,
                 no_info_val: Union[float, np.ndarray] = None,
                 write_dir: str = None,
                 sess: tf.Session = None) -> None:
        """
        Initialize contrastive explanation method.
        Paper: https://arxiv.org/abs/1802.07623

        Parameters
        ----------
        predict
            Keras or TensorFlow model or any other model's prediction function returning class probabilities
        mode
            Find pertinent negatives ('PN') or pertinent positives ('PP')
        shape
            Shape of input data starting with batch size
        kappa
            Confidence parameter for the attack loss term
        beta
            Regularization constant for L1 loss term
        feature_range
            Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
            numpy arrays with dimension (1 x nb of features) for feature-wise ranges
        gamma
            Regularization constant for optional auto-encoder loss term
        ae_model
            Optional auto-encoder model used for loss regularization
        learning_rate_init
            Initial learning rate of optimizer
        max_iterations
            Maximum number of iterations for finding a PN or PP
        c_init
            Initial value to scale the attack loss term
        c_steps
            Number of iterations to adjust the constant scaling the attack loss term
        eps
            If numerical gradients are used to compute dL/dx = (dL/dp) * (dp/dx), then eps[0] is used to
            calculate dL/dp and eps[1] is used for dp/dx. eps[0] and eps[1] can be a combination of float values
            and numpy arrays. For eps[0], the array dimension should be (1 x nb of prediction categories) and
            for eps[1] it should be (1 x nb of features)
        clip
            Tuple with min and max clip ranges for both the numerical gradients and the gradients
            obtained from the TensorFlow graph
        update_num_grad
            If numerical gradients are used, they will be updated every update_num_grad iterations
        no_info_val
            Global or feature-wise value considered as containing no information
        write_dir
            Directory to write TensorBoard files to
        sess
            Optional TensorFlow session that will be used if passed instead of creating or inferring one internally
        """
        super().__init__(meta=copy.deepcopy(DEFAULT_META_CEM))
        # get params for storage in meta
        params = locals()
        remove = ['self', 'predict', 'ae_model', 'sess', '__class__']
        for key in remove:
            params.pop(key)
        self.meta['params'].update(params)
        self.predict = predict

        # check whether the model and the auto-encoder are Keras or TF models and get session
        is_model, is_model_keras, model_sess = _check_keras_or_tf(predict)
        is_ae, is_ae_keras, ae_sess = _check_keras_or_tf(ae_model)
        # TODO: check ae and model are compatible
        self.meta['params'].update(is_model=is_model, is_model_keras=is_model_keras, is_ae=is_ae,
                                   is_ae_keras=is_ae_keras)

        # if session provided, use it
        if isinstance(sess, tf.Session):
            self.sess = sess
        else:
            self.sess = model_sess

        if is_model:  # Keras or TF model
            self.model = True
            classes = self.sess.run(self.predict(tf.convert_to_tensor(np.zeros(shape), dtype=tf.float32))).shape[1]
        else:
            self.model = False
            classes = self.predict(np.zeros(shape)).shape[1]

        self.mode = mode
        self.shape = shape
        self.kappa = kappa
        self.beta = beta
        self.gamma = gamma
        self.ae = ae_model
        self.batch_size = shape[0]
        self.max_iterations = max_iterations
        self.c_init = c_init
        self.c_steps = c_steps
        self.update_num_grad = update_num_grad
        self.eps = eps
        self.clip = clip
        self.write_dir = write_dir
        if isinstance(no_info_val, float):
            self.no_info_val = np.ones(shape) * no_info_val
        else:
            self.no_info_val = no_info_val

        # values regarded as containing no information
        # PNs will deviate away from these values while PPs will gravitate towards them
        self.no_info = tf.Variable(np.zeros(shape), dtype=tf.float32, name='no_info')

        # define tf variables for original and perturbed instances, and target labels
        self.orig = tf.Variable(np.zeros(shape), dtype=tf.float32, name='orig')
        self.adv = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv')  # delta(k)
        self.adv_s = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv_s')  # y(k)
        self.target = tf.Variable(np.zeros((self.batch_size, classes)), dtype=tf.float32, name='target')

        # define tf variable for constant used in FISTA optimization
        self.const = tf.Variable(np.zeros(self.batch_size), dtype=tf.float32, name='const')
        self.global_step = tf.Variable(0.0, trainable=False, name='global_step')

        # define placeholders that will be assigned to relevant variables
        self.assign_orig = tf.placeholder(tf.float32, shape, name='assign_orig')
        self.assign_adv = tf.placeholder(tf.float32, shape, name='assign_adv')
        self.assign_adv_s = tf.placeholder(tf.float32, shape, name='assign_adv_s')
        self.assign_target = tf.placeholder(tf.float32, (self.batch_size, classes), name='assign_target')
        self.assign_const = tf.placeholder(tf.float32, [self.batch_size], name='assign_const')
        self.assign_no_info = tf.placeholder(tf.float32, shape, name='assign_no_info')

        # define conditions and values for element-wise shrinkage thresholding (eq.7)
        with tf.name_scope('shrinkage_thresholding') as scope:
            cond = [tf.cast(tf.greater(tf.subtract(self.adv_s, self.orig), self.beta), tf.float32),
                    tf.cast(tf.less_equal(tf.abs(tf.subtract(self.adv_s, self.orig)), self.beta), tf.float32),
                    tf.cast(tf.less(tf.subtract(self.adv_s, self.orig), tf.negative(self.beta)), tf.float32)]
            upper = tf.minimum(tf.subtract(self.adv_s, self.beta), tf.cast(feature_range[1], tf.float32))
            lower = tf.maximum(tf.add(self.adv_s, self.beta), tf.cast(feature_range[0], tf.float32))
            self.assign_adv = tf.multiply(cond[0], upper) + tf.multiply(cond[1], self.orig) + tf.multiply(cond[2],
                                                                                                          lower)

        # perturbation update for delta and vector projection on correct set depending on PP or PN (eq.5)
        # delta(k) = adv; delta(k+1) = assign_adv
        with tf.name_scope('perturbation_delta') as scope:
            proj_d = [tf.cast(tf.greater(tf.abs(tf.subtract(self.assign_adv, self.no_info)),
                                         tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32),
                      tf.cast(tf.less_equal(tf.abs(tf.subtract(self.assign_adv, self.no_info)),
                                            tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32)]
            if self.mode == "PP":
                self.assign_adv = tf.multiply(proj_d[1], self.assign_adv) + tf.multiply(proj_d[0], self.orig)
            elif self.mode == "PN":
                self.assign_adv = tf.multiply(proj_d[0], self.assign_adv) + tf.multiply(proj_d[1], self.orig)

        # perturbation update and vector projection on correct set for y: y(k+1) = assign_adv_s (eq.6)
        with tf.name_scope('perturbation_y') as scope:
            self.zt = tf.divide(self.global_step, self.global_step + tf.cast(3, tf.float32))  # k/(k+3) in (eq.6)
            self.assign_adv_s = self.assign_adv + tf.multiply(self.zt, self.assign_adv - self.adv)
            proj_d_s = [tf.cast(tf.greater(tf.abs(tf.subtract(self.assign_adv_s, self.no_info)),
                                           tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32),
                        tf.cast(tf.less_equal(tf.abs(tf.subtract(self.assign_adv_s, self.no_info)),
                                              tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32)]
            if self.mode == "PP":
                self.assign_adv_s = tf.multiply(proj_d_s[1], self.assign_adv_s) + tf.multiply(proj_d_s[0], self.orig)
            elif self.mode == "PN":
                self.assign_adv_s = tf.multiply(proj_d_s[0], self.assign_adv_s) + tf.multiply(proj_d_s[1], self.orig)

        # delta(k) <- delta(k+1);  y(k) <- y(k+1)
        with tf.name_scope('update_adv') as scope:
            self.adv_updater = tf.assign(self.adv, self.assign_adv)
            self.adv_updater_s = tf.assign(self.adv_s, self.assign_adv_s)

        # from perturbed instance, derive deviation delta
        with tf.name_scope('update_delta') as scope:
            self.delta = self.orig - self.adv
            self.delta_s = self.orig - self.adv_s

        # define L1 and L2 loss terms; L1+L2 is later used as an optimization constraint for FISTA
        ax_sum = list(np.arange(1, len(shape)))
        with tf.name_scope('loss_l1_l2') as scope:
            self.l2 = tf.reduce_sum(tf.square(self.delta), axis=ax_sum)
            self.l2_s = tf.reduce_sum(tf.square(self.delta_s), axis=ax_sum)
            self.l1 = tf.reduce_sum(tf.abs(self.delta), axis=ax_sum)
            self.l1_s = tf.reduce_sum(tf.abs(self.delta_s), axis=ax_sum)
            self.l1_l2 = self.l2 + tf.multiply(self.l1, self.beta)
            self.l1_l2_s = self.l2_s + tf.multiply(self.l1_s, self.beta)

            # sum losses
            self.loss_l1 = tf.reduce_sum(self.l1)
            self.loss_l1_s = tf.reduce_sum(self.l1_s)
            self.loss_l2 = tf.reduce_sum(self.l2)
            self.loss_l2_s = tf.reduce_sum(self.l2_s)

        with tf.name_scope('loss_ae') as scope:
            # gamma * AE loss
            if self.mode == "PP" and callable(self.ae):
                self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.delta) - self.delta))
                self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.delta_s) - self.delta_s))
            elif self.mode == "PN" and callable(self.ae):
                self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.adv) - self.adv))
                self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.adv_s) - self.adv_s))
            else:  # no auto-encoder available
                self.loss_ae = tf.constant(0.)
                self.loss_ae_s = tf.constant(0.)

        with tf.name_scope('loss_attack') as scope:
            if not self.model:
                self.loss_attack = tf.placeholder(tf.float32)
            else:
                # make predictions on perturbed instance (PN) or delta (PP)
                if self.mode == "PP":
                    self.pred_proba = self.predict(self.delta)
                    self.pred_proba_s = self.predict(self.delta_s)
                elif self.mode == "PN":
                    self.pred_proba = self.predict(self.adv)
                    self.pred_proba_s = self.predict(self.adv_s)

                # probability of target label prediction
                self.target_proba = tf.reduce_sum(self.target * self.pred_proba, 1)
                target_proba_s = tf.reduce_sum(self.target * self.pred_proba_s, 1)

                # max probability of non target label prediction
                self.nontarget_proba_max = tf.reduce_max((1 - self.target) * self.pred_proba - (self.target * 10000), 1)
                nontarget_proba_max_s = tf.reduce_max((1 - self.target) * self.pred_proba_s - (self.target * 10000), 1)

                # loss term f(x,d) for PP (eq.4) and PN (eq.2)
                if self.mode == "PP":
                    loss_attack = tf.maximum(0.0, self.nontarget_proba_max - self.target_proba + self.kappa)
                    loss_attack_s = tf.maximum(0.0, nontarget_proba_max_s - target_proba_s + self.kappa)
                elif self.mode == "PN":
                    loss_attack = tf.maximum(0.0, -self.nontarget_proba_max + self.target_proba + self.kappa)
                    loss_attack_s = tf.maximum(0.0, -nontarget_proba_max_s + target_proba_s + self.kappa)

                # c * f(x,d)
                self.loss_attack = tf.reduce_sum(self.const * loss_attack)
                self.loss_attack_s = tf.reduce_sum(self.const * loss_attack_s)

        with tf.name_scope('loss_combined') as scope:
            # no need for L1 term in loss to optimize when using FISTA
            if self.model:
                self.loss_opt = self.loss_attack_s + self.loss_l2_s + self.loss_ae_s
            else:  # separate numerical computation of loss attack gradient
                self.loss_opt = self.loss_l2_s + self.loss_ae_s

            # add L1 term to overall loss; this is not the loss that will be directly optimized
            self.loss_total = self.loss_attack + self.loss_l2 + self.loss_ae + tf.multiply(self.beta, self.loss_l1)

        with tf.name_scope('training') as scope:
            self.learning_rate = tf.train.polynomial_decay(learning_rate_init, self.global_step,
                                                           self.max_iterations, 0, power=0.5)
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            start_vars = set(x.name for x in tf.global_variables())

            # first compute, then apply grads
            self.compute_grads = optimizer.compute_gradients(self.loss_opt, var_list=[self.adv_s])
            self.grad_ph = tf.placeholder(tf.float32, name='grad_adv_s')
            # get the last 'adv_s' variable in case the explainer is
            # re-initialized and a new graph is created
            var = [tvar for tvar in tf.trainable_variables() if tvar.name.startswith('adv_s')][-1]
            grad_and_var = [(self.grad_ph, var)]
            self.apply_grads = optimizer.apply_gradients(grad_and_var, global_step=self.global_step)
            end_vars = tf.global_variables()
            new_vars = [x for x in end_vars if x.name not in start_vars]

        # variables to initialize
        self.setup = []  # type: list
        self.setup.append(self.orig.assign(self.assign_orig))
        self.setup.append(self.target.assign(self.assign_target))
        self.setup.append(self.const.assign(self.assign_const))
        self.setup.append(self.adv.assign(self.assign_adv))
        self.setup.append(self.adv_s.assign(self.assign_adv_s))
        self.setup.append(self.no_info.assign(self.assign_no_info))

        self.init = tf.variables_initializer(var_list=[self.global_step] + [self.adv_s] + [self.adv] + new_vars)

        if self.write_dir is not None:
            self.writer = tf.summary.FileWriter(write_dir, tf.get_default_graph())
            self.writer.add_graph(tf.get_default_graph())
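
A matching usage sketch for CEM (again illustrative, not from the source): mode='PN' searches for a pertinent negative, mode='PP' for a pertinent positive, and no_info_val should reflect the uninformative background value of the scaled input features.

import numpy as np
import tensorflow as tf
from alibi.explainers import CEM

tf.compat.v1.disable_eager_execution()  # TF1-style graph construction

model = tf.keras.models.load_model('model.h5')  # hypothetical classifier

shape = (1, 28, 28, 1)
cem = CEM(model, mode='PN', shape=shape, kappa=0., beta=.1,
          max_iterations=1000, c_init=10., c_steps=10,
          no_info_val=-1.)  # e.g. background value for inputs scaled to [-1, 1]

X = np.random.rand(*shape).astype(np.float32)
explanation = cem.explain(X)  # pertinent negative for the instance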
Example #4
def test_blackbox_check_keras_or_tf_no_keras_import():
    with mock.patch.dict('sys.modules', {'keras': None}):
        is_model, is_keras, sess = _check_keras_or_tf(blackbox_model)
        assert not is_model
        assert not is_keras
Example #5
def test_keras_bb_check_keras_or_tf():
    is_model, is_keras, sess = _check_keras_or_tf(blackbox_keras)
    assert not is_model
    assert not is_keras
Example #6
def test_tf_check_keras_or_tf():
    is_model, is_keras, sess = _check_keras_or_tf(tf_model)
    assert is_model
    assert not is_keras
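
Examples #4-#6 exercise `_check_keras_or_tf`, which distinguishes plain callables, Keras predict methods passed as black boxes, and actual tf.keras models (`is_keras` flags the standalone `keras` package, not `tf.keras`). The fixtures themselves are not shown in the source; the sketch below gives purely hypothetical stand-ins consistent with the assertions.

import numpy as np
import tensorflow as tf

def blackbox_model(X: np.ndarray) -> np.ndarray:
    # any callable returning class probabilities is treated as a black box
    return np.tile([0.5, 0.5], (X.shape[0], 1))

# a real tf.keras model -> expected is_model=True, is_keras=False
tf_model = tf.keras.Sequential(
    [tf.keras.layers.Dense(2, activation='softmax', input_shape=(4,))])

# passing only the bound predict method hides the model object, so it is
# treated as a black box -> expected is_model=False, is_keras=False
blackbox_keras = tf_model.predict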