Example #1
    def get_min_eigvec(self, loss, vars):
        iterations = 10
        eps = 3
        v = [self._get_initial_vector(vars)]
        eigvals = []
        grad = self._list_to_tensor(tf.gradients(loss, vars))
        for i in range(iterations):
            # Power iteration with the shifted Hessian
            v_new = self._list_to_tensor(
                _hessian_vector_product(loss, vars,
                                        self._tensor_to_list(v[i], vars)))
            v.append(eps * v[i] - v_new)
            v[i + 1] = self._normalize(v[i + 1])

            # Get corresponding eigenvalue
            eigval = tf.reduce_sum(
                tf.multiply(
                    v[i],
                    self._list_to_tensor(
                        _hessian_vector_product(
                            loss, vars, self._tensor_to_list(v[i], vars)))))
            eigvals.append(eigval)

        idx = iterations - 1  #tf.cast(tf.argmin(eigvals[3:iterations-1]), tf.int32)
        e = tf.gather(eigvals, idx)
        v = tf.gather(v, idx)

        _sign = -tf.sign(tf.reduce_sum(tf.multiply(grad, v)))
        v *= _sign

        return v, e
Example #2
    def _second_order(self) -> None:
        with tf.name_scope(self._name + '_second_order'):
            self._vjp = tf.gradients(self._predictions_fn_tensor,
                                     self._input_var,
                                     self._dummy_var,
                                     name='vjp')[0]
            self._jvpz = tf.gradients(self._vjp,
                                      self._dummy_var,
                                      tf.stop_gradient(self._z),
                                      name='jvpz')[0]

            if self._diag_hessian_fn is not None:
                self._hjvpz = self._diag_hessian_fn_tensor * self._jvpz
            else:
                self._hjvpz = _hessian_vector_product(
                    ys=[self._loss_fn_tensor],
                    xs=[self._predictions_fn_tensor],
                    v=[self._jvpz])[0]

            # J^T H J z
            self._jhjvpz = tf.gradients(self._predictions_fn_tensor,
                                        self._input_var,
                                        self._hjvpz + self._jloss,
                                        name='jhjvpz')[0]

            self._precond_this_iter = 1.
            if self._diag_precond_t is not None:
                self._precond_this_iter = 1 / (self._diag_precond_t +
                                               self._damping_factor)
            self._deltaz = self._precond_this_iter * (
                self._jhjvpz + self._damping_factor * self._z)
            self._grads_tensor = tf.gradients(self._loss_fn_tensor,
                                              self._input_var)[0]
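Example #3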
 def testHessianVectorProduct(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that HessianVectorProduct matches multiplication by the
     # explicit Hessian.
     # Specifically, the Hessian of f(x) = x^T A x is
     # H = A + A^T.
     # We expect HessianVectorProduct(f(x), x, v) to be H v.
     m = 4
     rng = np.random.RandomState([1, 2, 3])
     mat_value = rng.randn(m, m).astype("float32")
     v_value = rng.randn(m, 1).astype("float32")
     x_value = rng.randn(m, 1).astype("float32")
     hess_value = mat_value + mat_value.T
     hess_v_value = np.dot(hess_value, v_value)
     for use_gpu in [False, True]:
         with self.test_session(use_gpu=use_gpu):
             mat = constant_op.constant(mat_value)
             v = constant_op.constant(v_value)
             x = constant_op.constant(x_value)
             mat_x = math_ops.matmul(mat, x, name="Ax")
             x_mat_x = math_ops.matmul(array_ops.transpose(x),
                                       mat_x,
                                       name="xAx")
             hess_v = gradients_impl._hessian_vector_product(
                 x_mat_x, [x], [v])[0]
             hess_v_actual = hess_v.eval()
         self.assertAllClose(hess_v_value, hess_v_actual)
Example #4
    def init_gradients(self):
        self.loss_gradient = tf.gradients(self.loss, self.policy.params_list)
        self.hvp = _hessian_vector_product(self.kl_div,
                                           self.policy.params_list, self.v_plh)

        self.grad_plh = [
            tf.placeholder(tf.float32, shape=s, name="grad_plh_%d" % i)
            for i, s in zip(range(len(self.policy.params_shapes)),
                            self.policy.params_shapes)
        ]
Example #5
    def __init__(self, h, xs):
        self.h = h
        self.xs = xs
        self.v = tf.placeholder(tf.float32, lst_to_vec(xs).shape[0])

        # The `v` arg to _hessian_vector_product is a list of tensors with the same structure as `xs`,
        # but the CG algorithm will give us a flat vector, so we need to wrangle `v` into the correct form.
        vec_as_list = vec_to_lst(self.v, xs)
        self.fv_product = lst_to_vec(
            _hessian_vector_product(h, xs, vec_as_list))
        self.feed_dict = None
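
The snippet above assumes `lst_to_vec` and `vec_to_lst` helpers defined elsewhere in the source. A minimal sketch of what they would have to do, flattening a list of tensors into one vector for the CG solver and splitting it back into the structure of `xs` (the names are reused here only as an assumption, and fully defined shapes for `xs` are assumed), could be:

    import numpy as np
    import tensorflow as tf

    def lst_to_vec(lst):
        # Flatten a list of tensors into a single 1-D vector.
        return tf.concat([tf.reshape(t, [-1]) for t in lst], axis=0)

    def vec_to_lst(vec, xs):
        # Split a flat vector back into tensors shaped like the entries of `xs`.
        sizes = [int(np.prod(x.get_shape().as_list())) for x in xs]
        parts = tf.split(vec, sizes)
        return [tf.reshape(p, x.get_shape()) for p, x in zip(parts, xs)]
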
Example #6
    def __init__(self, workspace, feeder, loss_op_train, loss_op_test, x_placeholder, y_placeholder,
                 test_feed_options=None, train_feed_options=None, trainable_variables=None):
        self.workspace = workspace
        self.feeder = feeder
        self.x_placeholder = x_placeholder
        self.y_placeholder = y_placeholder
        self.test_feed_options = test_feed_options if test_feed_options else dict()
        self.train_feed_options = train_feed_options if train_feed_options else dict()

        if trainable_variables is None:
            trainable_variables = (
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) +
                tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))

        self.loss_op_train = loss_op_train
        self.grad_op_train = tf.gradients(loss_op_train, trainable_variables)
        self.grad_op_test = tf.gradients(loss_op_test, trainable_variables)

        self.v_cur_estimated = [tf.placeholder(tf.float32, shape=a.get_shape()) for a in trainable_variables]
        self.v_test_grad = [tf.placeholder(tf.float32, shape=a.get_shape()) for a in trainable_variables]
        self.v_ihvp = tf.placeholder(tf.float64, shape=[None])
        self.v_param_damping = tf.placeholder(tf.float32)
        self.v_param_scale = tf.placeholder(tf.float32)
        self.v_param_total_trainset = tf.placeholder(tf.float64)

        self.inverse_hvp = None
        self.trainable_variables = trainable_variables

        with tf.name_scope('darkon_ihvp'):
            self.hessian_vector_op = _hessian_vector_product(loss_op_train, trainable_variables, self.v_cur_estimated)
            self.estimation_op = [
                a + (b * self.v_param_damping) - (c / self.v_param_scale)
                for a, b, c in zip(self.v_test_grad, self.v_cur_estimated, self.hessian_vector_op)
            ]

        with tf.name_scope('darkon_grad_diff'):
            flatten_inverse_hvp = tf.reshape(self.v_ihvp, shape=(-1, 1))
            flatten_grads = tf.concat([tf.reshape(a, (-1,)) for a in self.grad_op_train], 0)
            flatten_grads = tf.reshape(flatten_grads, shape=(1, -1,))
            flatten_grads = tf.cast(flatten_grads, tf.float64)
            flatten_grads /= self.v_param_total_trainset
            self.grad_diff_op = tf.matmul(flatten_grads, flatten_inverse_hvp)

        self.ihvp_config = {
            'scale': 1e4,
            'damping': 0.01,
            'num_repeats': 1,
            'recursion_batch_size': 10,
            'recursion_depth': 10000
        }

        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
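
For context, `estimation_op` above is one step of the standard iterative inverse-Hessian-vector-product recursion, v_new = test_grad + damping * v_cur - HVP(v_cur) / scale, evaluated on a fresh training mini-batch at every step. A rough, hypothetical driver loop (the names `influence`, `sess`, `test_grad`, and `next_train_batch` are assumptions, not part of the class shown) might look like:

    # Sketch only: iterate the inverse-HVP recursion defined by `estimation_op`.
    cur_estimate = [g.copy() for g in test_grad]   # start from the test gradient
    cfg = influence.ihvp_config
    for _ in range(cfg['recursion_depth']):
        xb, yb = next_train_batch(cfg['recursion_batch_size'])
        feed = {influence.x_placeholder: xb,
                influence.y_placeholder: yb,
                influence.v_param_damping: 1.0 - cfg['damping'],  # assumed damping convention
                influence.v_param_scale: cfg['scale']}
        feed.update(dict(zip(influence.v_cur_estimated, cur_estimate)))
        feed.update(dict(zip(influence.v_test_grad, test_grad)))
        cur_estimate = sess.run(influence.estimation_op, feed_dict=feed)
    # Dividing the final estimate by `scale` gives a (damped) approximation of H^{-1} * test_grad.
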
Example #8
  def testInvalidSecondGradient(self):
    inputs = np.random.randn(2, 2, 3).astype(np.float32)
    inputs_t = constant_op.constant(inputs)
    labels = SimpleSparseTensorFrom([[0, 1], [1, 0]])
    seq_lens = np.array([2, 2], dtype=np.int32)
    v = [1.0]

    with self.session(use_gpu=False):
      loss = _ctc_loss_v2(
          inputs=inputs_t, labels=labels, sequence_length=seq_lens)
      # Taking this second gradient should fail, since it is not
      # yet supported.
      with self.assertRaisesRegex(LookupError, "explicitly disabled"):
        _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
Example #9
  def testInvalidSecondGradient(self):
    inputs = np.random.randn(2, 2, 3).astype(np.float32)
    inputs_t = constant_op.constant(inputs)
    labels = SimpleSparseTensorFrom([[0, 1], [1, 0]])
    seq_lens = np.array([2, 2], dtype=np.int32)
    v = [1.0]

    with self.session(use_gpu=False):
      loss = _ctc_loss_v2(
          inputs=inputs_t, labels=labels, sequence_length=seq_lens)
      # Taking this second gradient should fail, since it is not
      # yet supported.
      with self.assertRaisesRegexp(LookupError,
                                   "explicitly disabled"):
        _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
Example #10
    def _setupHessianVectorProduct(self, jvp_fn: Callable[[tf.Tensor],
                                                          tf.Tensor],
                                   x: tf.Tensor,
                                   v_constant: tf.Tensor) -> tf.Tensor:
        predictions_this = self._predictions_fn(v_constant)
        if self._diag_hessian_fn is None:
            loss_this = self._loss_fn(predictions_this)
            hjvp = _hessian_vector_product(ys=[loss_this],
                                           xs=[predictions_this],
                                           v=[jvp_fn(x)])
        else:
            hjvp = self._diag_hessian_fn(predictions_this) * jvp_fn(x)

        jhjvp = tf.gradients(predictions_this, v_constant, hjvp)[0]
        return jhjvp
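
Read in the listing's own notation (see the "J^T H J z" comments elsewhere in these examples), and assuming `jvp_fn(x)` computes the Jacobian-vector product J x, with J the Jacobian of the predictions with respect to `v_constant` and H the Hessian of the loss with respect to the predictions, the returned tensor is the generalized Gauss-Newton vector product

    jhjvp = J^T H (J x) = (J^T H J) x

where the `_diag_hessian_fn` branch simply replaces H by a diagonal approximation.
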
Example #11
 def body(it, randv, eig_est, eig_est_prev, tfconst):
     #hv_op_tmp = gradients_impl._hessian_vector_product(self.objective, [self.img], [randv])[0]-10*randv
     hv_op_tmp = gradients_impl._hessian_vector_product(self.objective, [self.img], [randv])[0]-tf.multiply(tfconst,randv)
     hv_op_rs = tf.reshape(hv_op_tmp, (tf.shape(hv_op_tmp)[0],-1))
     hv_norm_op = tf.norm(hv_op_rs, axis = 1, keepdims=True)
     hv_op_rs_normalize = hv_op_rs/hv_norm_op
     hv_op = tf.reshape(hv_op_rs_normalize, tf.shape(hv_op_tmp))
     
     randv_rs = tf.reshape(randv, (tf.shape(randv)[0],-1))
     randv_norm_op = tf.norm(randv_rs, axis = 1)
     vhv_op = tf.reduce_sum(tf.multiply(randv_rs,hv_op_rs),axis=1)
     eig_est_prev = eig_est
     eig_est = vhv_op/tf.square(randv_norm_op) 
     
     return (it+1, hv_op, eig_est, eig_est_prev, tfconst)
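
The `body` above performs one step of (shifted) power iteration to estimate an extreme Hessian eigenvalue through `_hessian_vector_product`. For reference, a self-contained toy version of the same idea on a small quadratic (all names below are ours, not from the snippet) could be:

    import tensorflow as tf
    from tensorflow.python.ops import gradients_impl

    A = tf.constant([[2.0, 1.0], [1.0, 3.0]])
    x = tf.get_variable('x_toy', initializer=tf.ones([2, 1]))
    loss = 0.5 * tf.matmul(tf.matmul(x, A, transpose_a=True), x)  # Hessian w.r.t. x is A

    v = tf.get_variable('v_toy', initializer=tf.random_normal([2, 1]))
    hv = gradients_impl._hessian_vector_product(loss, [x], [v])[0]
    power_step = tf.assign(v, hv / tf.norm(hv))  # one (unshifted) power-iteration step
    eig_est = tf.reduce_sum(v * hv)              # Rayleigh quotient; v is unit-norm after a step

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for _ in range(50):
            sess.run(power_step)
        print(sess.run(eig_est))  # approaches the largest eigenvalue of A, about 3.62
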
Example #12
    def __init__(self,
                 workspace,
                 feeder,
                 loss_op_train,
                 loss_op_test,
                 x_placeholder,
                 y_placeholder,
                 test_feed_options=None,
                 train_feed_options=None,
                 trainable_variables=None):
        self.workspace = workspace
        self.feeder = feeder
        self.x_placeholder = x_placeholder
        self.y_placeholder = y_placeholder
        self.test_feed_options = test_feed_options if test_feed_options else dict()
        self.train_feed_options = train_feed_options if train_feed_options else dict()

        if trainable_variables is None:
            trainable_variables = (
                tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) +
                tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))

        self.loss_op_train = loss_op_train
        self.grad_op_train = tf.gradients(loss_op_train, trainable_variables)
        self.grad_op_test = tf.gradients(loss_op_test, trainable_variables)

        self.v_placeholder = [
            tf.placeholder(tf.float32, shape=a.get_shape())
            for a in trainable_variables
        ]
        self.hessian_vector_op = _hessian_vector_product(
            loss_op_train, trainable_variables, self.v_placeholder)
        self.inverse_hvp = None
        self.trainable_variables = trainable_variables

        self.ihvp_config = {
            'scale': 1e4,
            'damping': 0.01,
            'num_repeats': 1,
            'recursion_batch_size': 10,
            'recursion_depth': 10000
        }

        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
Example #13
 def testHessianVectorProduct(self):
   # Manually compute the Hessian explicitly for a low-dimensional problem
   # and check that HessianVectorProduct matches multiplication by the
   # explicit Hessian.
   # Specifically, the Hessian of f(x) = x^T A x is
   # H = A + A^T.
   # We expect HessianVectorProduct(f(x), x, v) to be H v.
   m = 4
   rng = np.random.RandomState([1, 2, 3])
   mat_value = rng.randn(m, m).astype("float32")
   v_value = rng.randn(m, 1).astype("float32")
   x_value = rng.randn(m, 1).astype("float32")
   hess_value = mat_value + mat_value.T
   hess_v_value = np.dot(hess_value, v_value)
   for use_gpu in [False, True]:
     with self.test_session(use_gpu=use_gpu):
       mat = constant_op.constant(mat_value)
       v = constant_op.constant(v_value)
       x = constant_op.constant(x_value)
       mat_x = math_ops.matmul(mat, x, name="Ax")
       x_mat_x = math_ops.matmul(array_ops.transpose(x), mat_x, name="xAx")
       hess_v = gradients_impl._hessian_vector_product(x_mat_x, [x], [v])[0]
       hess_v_actual = hess_v.eval()
     self.assertAllClose(hess_v_value, hess_v_actual)
Example #14
    def _second_order(self) -> None:
        with tf.name_scope(self._name + '_second_order'):
            self._vjp = tf.gradients(self._predictions_fn_tensor,
                                     self._input_var,
                                     self._dummy_var,
                                     name='vjp')[0]
            self._jvpz = tf.gradients(self._vjp,
                                      self._dummy_var,
                                      tf.stop_gradient(self._z),
                                      name='jvpz')[0]

            if self._diag_hessian_fn is not None:
                self._hjvpz = self._diag_hessian_fn_tensor * self._jvpz
            else:
                # I have commented out my implementation of the hessian-vector product.
                # Using the tensorflow implementation instead.
                #self._hjvpz = tf.gradients(tf.gradients(self._loss_fn_tensor,
                #                                       self._predictions_fn_tensor)[0][None, :]
                #                          @ self._jvpz[:,None], self._predictions_fn_tensor,
                #                          stop_gradients=self._jvpz)[0]
                self._hjvpz = _hessian_vector_product(
                    ys=[self._loss_fn_tensor],
                    xs=[self._predictions_fn_tensor],
                    v=[self._jvpz])[0]

            # J^T H J z
            self._jhjvpz = tf.gradients(self._predictions_fn_tensor,
                                        self._input_var,
                                        self._hjvpz + self._jloss,
                                        name='jhjvpz')[0]

            self._deltaz = self._jhjvpz + self._damping_factor * self._z

            self._grad_t = tf.gradients(self._loss_fn_tensor,
                                        self._input_var,
                                        name='grad')[0]
Example #15
    def __init__(self,
                 feeder,
                 model,
                 workspace='./influence-workspace',
                 trainable_variables=None):
        self.workspace = workspace
        self.feeder = feeder
        self.x_placeholder = model.input
        self.y_placeholder = K.placeholder(shape=model.output.shape)
        self.test_feed_options = dict()
        self.train_feed_options = dict()

        # If the model's loss is a string, make sure it refers to a Keras loss function.
        if model.loss in kerasLossDict.keys():
            loss_op_train = kerasLossDict[model.loss](self.y_placeholder,
                                                      model.output)
            loss_op_test = kerasLossDict[model.loss](self.y_placeholder,
                                                     model.output)
        else:
            loss_op_train = model.loss(self.y_placeholder, model.output)
            loss_op_test = model.loss(self.y_placeholder, model.output)

        if trainable_variables:
            trainable_variables = trainable_variables
        else:
            trainable_variables = model.trainable_weights

        self.loss_op_train = loss_op_train
        self.grad_op_train = K.gradients(loss_op_train, trainable_variables)
        self.grad_op_test = K.gradients(loss_op_test, trainable_variables)

        self.v_cur_estimated = [
            tf.placeholder(tf.float32, shape=a.get_shape())
            for a in trainable_variables
        ]
        self.v_test_grad = [
            tf.placeholder(tf.float32, shape=a.get_shape())
            for a in trainable_variables
        ]
        self.v_ihvp = tf.placeholder(tf.float64, shape=[None])
        self.v_param_damping = tf.placeholder(tf.float32)
        self.v_param_scale = tf.placeholder(tf.float32)
        self.v_param_total_trainset = tf.placeholder(tf.float64)

        self.inverse_hvp = None
        self.trainable_variables = trainable_variables

        with tf.name_scope('model_ihvp'):
            self.hessian_vector_op = _hessian_vector_product(
                loss_op_train, trainable_variables, self.v_cur_estimated)
            self.estimation_op = [
                a + (b * self.v_param_damping) - (c / self.v_param_scale)
                for a, b, c in zip(self.v_test_grad, self.v_cur_estimated,
                                   self.hessian_vector_op)
            ]

        with tf.name_scope('model_grad_diff'):
            flatten_inverse_hvp = tf.reshape(self.v_ihvp, shape=(-1, 1))
            flatten_grads = tf.concat(
                [tf.reshape(a, (-1, )) for a in self.grad_op_train], 0)
            flatten_grads = tf.reshape(flatten_grads, shape=(1, -1))
            flatten_grads = tf.cast(flatten_grads, tf.float64)
            flatten_grads /= self.v_param_total_trainset
            self.grad_diff_op = tf.matmul(flatten_grads, flatten_inverse_hvp)

        self.ihvp_config = {
            'scale': 1e4,
            'damping': 0.01,
            'num_repeats': 1,
            'recursion_batch_size': 10,
            'recursion_depth': 10000
        }

        if not os.path.exists(self.workspace):
            os.makedirs(self.workspace)
Example #16
    def compute_gradients(self,
                          loss,
                          var_list=None,
                          aggregation_method=None,
                          colocate_gradients_with_ops=False,
                          device='/cpu:0'):
        """Compute gradients of `loss` for the variables in `var_list`.
    This is the first part of `minimize()`.  It returns a list
    of (gradient, variable) pairs where "gradient" is the gradient
    for "variable".  Note that "gradient" can be a `Tensor`, an
    `IndexedSlices`, or `None` if there is no gradient for the
    given variable.
    Args:
      loss: A Tensor containing the value to minimize or a callable taking
        no arguments which returns the value to minimize. When eager execution
        is enabled it must be a callable.
      var_list: Optional list or tuple of `tf.Variable` to update to minimize
        `loss`.  Defaults to the list of variables collected in the graph
        under the key `GraphKeys.TRAINABLE_VARIABLES`.
      aggregation_method: Specifies the method used to combine gradient terms.
        Valid values are defined in the class `AggregationMethod`.
      colocate_gradients_with_ops: If True, try colocating gradients with
        the corresponding op.
      device: which device to compute the variables dot product on.
    Returns:
      A list of (gradient, variable) pairs. Variable is always present, but
      gradient can be `None`.
    Raises:
      TypeError: If `var_list` contains anything other than `Variable` objects.
      ValueError: If some arguments are invalid.
      NotImplementedError: If called with eager execution enabled, or with an
        unknown loss name.
    @compatibility(eager)
    Not compatible.
    @end_compatibility
    """
        if callable(loss):
            raise NotImplementedError('Eager execution is not available yet')

        if self._autolambda:
            self._lambda = tf.reshape(tf.cond(tf.equal(tf.mod(self._step, self._auto_step),0),\
                self._autolam, lambda: self._lambda),[])
            self._loss = loss

        # Get trainable variables
        if var_list is None:
            var_list = (
                variables.trainable_variables() +
                ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        else:
            var_list = nest.flatten(var_list)

        # pylint: disable=protected-access
        var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)

        # Check if we have anything to optimize
        if not var_list:
            raise ValueError("No variables to optimize.")

        # TODO enable more variables mode maybe fix z device placement
        var_refs = var_list

        # Init momentum vector
        mu = 1.
        self._z = []
        self._zdic = {}
        for i in range(len(var_refs)):
            self._z.append(
                tf.get_variable("z%03d" % (i),
                                shape=var_refs[i].get_shape(),
                                caching_device=var_refs[i].device,
                                initializer=tf.zeros_initializer()))
            self._zdic[var_refs[i].name] = self._z[i]
        # Do two GD steps: first update z (the linear-system state), then use the
        # result (whitened gradient estimate) to update the parameters w.
        #
        # zdelta = grad_z{ 0.5 * z' * (mu * J' * Hl * J + lambda * I) * z - z' * J' * Jl' }
        #        = (mu * J' * Hl * J + lambda * I) * z - J' * Jl'
        #        =  mu * J' * Hl * J * z + lambda * z - J' * Jl'
        #
        # znew = momentum * z - beta * zdelta
        #
        # wnew = w - lr * znew
        #

        # Assert that pre_loss is a single tensorflow tensor for simplicity

        if not isinstance(self._pre_loss, Tensor):
            raise NotImplementedError(
                'Optimizer not yet working with vector of logits')

        delta_z = []
        if not self._hessian:
            Jz = fmad_prod(self._pre_loss, var_refs, self._z)

            # Evaluate Hessian loss and gradient
            Jz_ = self._hessian_grad_loss(self._loss_name, self._pre_loss,
                                          loss, Jz)
            Jl = tf.gradients(loss, self._pre_loss)[0]
            Jz_ = mu * Jz_

            # Backpropagate Jz_ - Jl
            h_term = tf.gradients(self._pre_loss, var_refs, Jz_ + Jl)
            for i in range(len(var_refs)):
                delta_z.append(h_term[i] + self._lambda * self._z[i])
        else:
            # Compute gradient w.r.t the loss
            grad = tf.gradients(loss, var_refs)

            # TensorFlow built-in function to compute Hessian-vector products
            h_term = _hessian_vector_product(loss, var_refs, self._z)
            for i in range(len(var_refs)):
                delta_z.append(h_term[i] + self._lambda * self._z[i] + grad[i])

        # Autoparam
        if self._autoparam:
            if not self._hessian:
                Jdz = fmad_prod(self._pre_loss, var_refs, delta_z)

                Jdz_ = self._hessian_grad_loss(self._loss_name, self._pre_loss,
                                               loss, Jdz)

                with tf.device(device):
                    A11 = mu * tf.matmul(tf.reshape(Jdz, [1, -1]),
                                         tf.reshape(Jdz_, [-1, 1]))
                    A12 = mu * tf.matmul(tf.reshape(Jz, [1, -1]),
                                         tf.reshape(Jdz_, [-1, 1]))
                    A22 = mu * tf.matmul(tf.reshape(Jz, [1, -1]),
                                         tf.reshape(Jz_, [-1, 1]))

                    b1 = tf.matmul(tf.reshape(Jl, [1, -1]),
                                   tf.reshape(Jdz, [-1, 1]))
                    b2 = tf.matmul(tf.reshape(Jl, [1, -1]),
                                   tf.reshape(Jz, [-1, 1]))

                    for i in range(len(var_refs)):
                        # compute the system we want to invert
                        z_vec = tf.reshape(self._z[i], [1, -1])
                        dz_vec = tf.reshape(delta_z[i], [1, -1])

                        A11 = A11 + tf.matmul(dz_vec, dz_vec,
                                              transpose_b=True) * self._lambda
                        A12 = A12 + tf.matmul(dz_vec, z_vec,
                                              transpose_b=True) * self._lambda
                        A22 = A22 + tf.matmul(z_vec, z_vec,
                                              transpose_b=True) * self._lambda
            else:
                # TensorFlow built-in function to compute Hessian-vector products
                h_term_dz = _hessian_vector_product(loss, var_refs, delta_z)

                with tf.device(device):
                    A11, A12, A22 = 0, 0, 0
                    b1, b2 = 0, 0

                    for i in range(len(var_refs)):
                        # compute the system we want to invert
                        z_vec = tf.reshape(self._z[i], [1, -1])
                        dz_vec = tf.reshape(delta_z[i], [1, -1])

                        hz_vec = tf.reshape(h_term[i], [1, -1])
                        hdz_vec = tf.reshape(h_term_dz[i], [1, -1])

                        A11 = A11 + tf.matmul(
                            hdz_vec, dz_vec, transpose_b=True) + tf.matmul(
                                dz_vec, dz_vec,
                                transpose_b=True) * self._lambda
                        A12 = A12 + tf.matmul(
                            hdz_vec, z_vec, transpose_b=True) + tf.matmul(
                                dz_vec, z_vec, transpose_b=True) * self._lambda
                        A22 = A22 + tf.matmul(
                            hz_vec, z_vec, transpose_b=True) + tf.matmul(
                                z_vec, z_vec, transpose_b=True) * self._lambda

                        b1 = b1 + tf.matmul(tf.reshape(grad[i], [1, -1]),
                                            tf.reshape(dz_vec, [-1, 1]))
                        b2 = b2 + tf.matmul(tf.reshape(grad[i], [1, -1]),
                                            tf.reshape(z_vec, [-1, 1]))

            # compute beta and momentum coefficient
            A = tf.concat([tf.concat([A11, A12], 0),
                           tf.concat([A12, A22], 0)], 1)
            b = tf.concat([b1, b2], 0)

            # Solve linear system
            m_b = tf.matrix_solve_ls(A,
                                     b,
                                     l2_regularizer=self._autoparam_reg,
                                     fast=False)
            self._M = -0.5 * tf.reduce_sum(m_b * b)

            m_b = tf.unstack(m_b, axis=0)

            beta = -tf.to_float(m_b[0])
            self._momentum = -tf.to_float(m_b[1])
        else:
            beta = -self._beta

        # Update gradient
        for i in range(len(var_refs)):
            # delta_z handle the momentum update
            delta_z[i] = beta * delta_z[i]

        grads_and_vars = list(zip(delta_z, var_list))
        self._assert_valid_dtypes([
            v for g, v in grads_and_vars
            if g is not None and v.dtype != dtypes.resource
        ])
        return grads_and_vars
Example #17
def get_acc_for_nonzero_gaussian_perturbed_two_layer_model_MNIST(mu, sigma=.1, const_multiplier=1., n_tot_iters=5000, n_fisher_iters=2000, record_tensorboard=False, regularizer_mode='hvp'):
    import tensorflow as tf
    from tensorflow.examples.tutorials.mnist import input_data
    from tensorflow.python.ops import gradients_impl
    import numpy as np
    tf.reset_default_graph()
    mnist = input_data.read_data_sets('/tmp/data', one_hot=True)

    x = tf.placeholder(tf.float32, shape = (None, 784), name='Inputs')
    y = tf.placeholder(tf.float32, shape = (None, 10), name='Labels')
    gamma = tf.placeholder(tf.float32, shape = (), name='reg_constant')
    nwts = 7840
    # wts = tf.get_variable('Weights',shape= (784,10), initializer = tf.random_normal_initializer(stddev=.001))
    w = tf.get_variable(name='w', shape=[784, 512], initializer=tf.contrib.layers.xavier_initializer())
    w2 = tf.get_variable(name='w2', shape = [512, 10], initializer = tf.contrib.layers.xavier_initializer())
    bias1 = tf.get_variable('bias1',shape= (512), initializer = tf.random_normal_initializer(stddev=.1))
    bias2 = tf.get_variable('bias2',shape= (10), initializer = tf.random_normal_initializer(stddev=.1))

    w_pert = tf.placeholder(tf.float32, shape=(784,512))
    w_pert2 = tf.placeholder(tf.float32, shape=(512,10))
    # 0.1000    0.1292    0.1668    0.2154    0.2783    0.3594    0.4642    0.5995    0.7743    1.0000

    # w_pert = tf.stop_gradient(w + shift_pctage*w)
    perturbation = tf.stop_gradient(w - w_pert)
    perturbation2 = tf.stop_gradient(w2 - w_pert2)


    layer_1_out = tf.nn.relu(tf.matmul(x, w) + bias1)

    logits = tf.matmul(layer_1_out, w2) + bias2
    y_ = tf.nn.softmax(logits)
    correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
    accuracy = tf.stop_gradient(tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels = y, logits = logits))


    optimizer = tf.train.AdamOptimizer()
    ce_grads = tf.gradients(loss, [w, w2, bias1,bias2])
    ce_grads_w1 = ce_grads[0]
    ce_grads_w2 = ce_grads[1]



    # print(vars)
    tf.summary.histogram('weights1', w)
    tf.summary.histogram('weights2', w2)
    tf.summary.histogram('pertweights1', w_pert)
    tf.summary.histogram('pertweights2', w_pert2)

    if regularizer_mode == 'hvp_adam':
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, w2, bias1, bias2]))
        hvp1 = gradients_impl._hessian_vector_product(loss, [w], [perturbation])
        hvp2 = gradients_impl._hessian_vector_product(loss, [w2], [perturbation2])

        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2

        reg_grad1 = gamma * 2.0 * hvp1 + diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = gamma * 2.0 * hvp2 + diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        train_op_reg = optimizer.apply_gradients(zip([reg_grad1, reg_grad2], [w, w2]))

    elif regularizer_mode == 'diag_adam':
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, w2, bias1, bias2]))
        vars = optimizer.variables()
        v_2 = vars[-1]
        v_1 = vars[-3]

        hvp1 = tf.multiply(v_1 ,perturbation)
        hvp2 = tf.multiply(v_2 ,perturbation2)

        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2

        reg_grad1 = gamma * 2.0 * hvp1 + diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = gamma * 2.0 * hvp2 + diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        train_op_reg = optimizer.apply_gradients(zip([reg_grad1, reg_grad2], [w, w2]))

    elif regularizer_mode == 'l2_adam':
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, w2, bias1, bias2]))
        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2

        reg_grad1 = diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        train_op_reg = optimizer.apply_gradients(zip([reg_grad1, reg_grad2], [w, w2]))

    elif regularizer_mode == 'hvp':
        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2
        hvp1 = gradients_impl._hessian_vector_product(loss, [w], [perturbation])
        hvp2 = gradients_impl._hessian_vector_product(loss, [w2], [perturbation2])

        reg_grad1 = gamma * 2.0 * hvp1 + diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = gamma * 2.0 * hvp2 + diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        tot_grads1 = ce_grads_w1 + reg_grad1
        tot_grads2 = ce_grads_w2 + reg_grad2
        train_op = optimizer.apply_gradients(zip([tot_grads1, tot_grads2, ce_grads[2], ce_grads[3]], [w, w2, bias1, bias2]))
        train_op_reg = tf.no_op()

    elif regularizer_mode == 'l2':
        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2

        reg_grad1 = diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        tot_grads1 = ce_grads_w1 + reg_grad1
        tot_grads2 = ce_grads_w2 + reg_grad2
        train_op = optimizer.apply_gradients(zip([tot_grads1, tot_grads2, ce_grads[2], ce_grads[3]], [w, w2, bias1, bias2]))
        train_op_reg = tf.no_op()
    else:
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, w2, bias1, bias2]))
        train_op_reg = tf.no_op()

    tf.summary.histogram('ce_gradient1', ce_grads_w1)
    tf.summary.histogram('ce_gradient2', ce_grads_w2)

    if const_multiplier>0.:
        print('USING REGULARIZATION')

        tf.summary.histogram('regularizer_gradient1', reg_grad1)
        tf.summary.histogram('regularizer_gradient2', reg_grad2)
        tf.summary.histogram('diagonal_load1', diag_load_amt1)
        tf.summary.histogram('diagonal_load2', diag_load_amt2)

        tf.summary.scalar('loss_gamma', gamma)
    else:
        print('NO REGULARIZATION')
        train_op_reg = tf.no_op()


    n_iters = n_tot_iters
    batch_size = 1024
    n_fisher_iters= n_fisher_iters
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)

    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    if record_tensorboard:
        summary_writer = tf.summary.FileWriter('./logs/two_layer_zero_mean', sess.graph)
        summary_op = tf.summary.merge_all()

    lossval=[]
    accval=[]
    sess.run(tf.global_variables_initializer())

    regularizer_const=0.
    w_pert_ = np.zeros([784, 512])
    w_pert2_ = np.zeros([512, 10])

    for i in range(0, n_iters):
        x_batch, y_batch = mnist.train.next_batch(batch_size)
        if i<=(n_iters-n_fisher_iters):
            regularizer_const=0.
        else:
            regularizer_const=.1*const_multiplier

        _, __, l, acc, w_ = sess.run([train_op, train_op_reg, loss, accuracy, w,], feed_dict={x: x_batch, y: y_batch, gamma:regularizer_const, w_pert:w_pert_, w_pert2:w_pert2_})

        if record_tensorboard:
            summ, _, __, l, acc, w_ = sess.run([summary_op, train_op, train_op_reg, loss, accuracy, w], feed_dict={x: x_batch, y: y_batch, gamma:regularizer_const, w_pert:w_pert_, w_pert2:w_pert2_})

        if record_tensorboard:
            summary_writer.add_summary(summ, i)
        lossval.append(l)
        accval.append(acc)

        if i == n_iters-n_fisher_iters:
            print('SAVING OPTIMAL ML WEIGHTS FROM END OF TRAINING')
            w_, w2_ = sess.run([w, w2])

        if i >= n_iters-n_fisher_iters and regularizer_const>0.:
            w_pert_ = w_ + np.random.normal(mu, sigma, size = [784, 512])
            w_pert2_ = w2_ + np.random.normal(mu, sigma, size = [512, 10])

        if i == n_iters - 1:
            print('USING PERTURBATIONS ON WEIGHTS AT END OF ALL ITERATIONS')
            w_, w2_ = sess.run([w, w2])
            # w_pert_ = w_
            # w_pert2_ = w2_
            # w_pert_ = w_ + np.random.normal(mu, sigma, size = [784, 512])

            # w_pert2_ = w2_ + np.random.normal(mu, sigma, size = [512, 10])


        if i%200==0:
            print('\nIteration: '+str(i)+'\nAccuracy: '+str(acc)+'\nLoss: '+str(l)+'\n')

    regularizer_const = 0.

    # perturbed_test_set = mnist.test.images+np.random.normal(0.,stddev, np.shape(mnist.test.images))
    w_pert_ = w_ + np.random.normal(mu, sigma, size = [784, 512])
    w_pert2_ = w2_ + np.random.normal(mu, sigma, size = [512, 10])

    x_testcv = mnist.test.images
    y_testcv = mnist.test.labels
    x_cv = x_testcv[0:5000,:]
    x_test = x_testcv[5000:,:]

    y_cv = y_testcv[0:5000,:]
    y_test = y_testcv[5000:,:]
    up_acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
    print('UNPERTURBED Test accuracy %g' % up_acc)
    sess.run(tf.assign(w, w_pert), feed_dict={gamma:regularizer_const, w_pert:w_pert_, w_pert2: w_pert2_})
    sess.run(tf.assign(w2, w_pert2_), feed_dict={gamma:regularizer_const, w_pert:w_pert_, w_pert2: w_pert2_})

    pert_acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels, gamma:regularizer_const, w_pert:w_pert_, w_pert2:w_pert2_})
    # pert_acc = sess.run(accuracy, feed_dict={x: perturbed_test_set, y: mnist.test.labels})
    print('PERTURBED test accuracy %g' % pert_acc)
    # summary_writer.close()
    sess.close()

    return up_acc, pert_acc
Example #18
    def _param_updates(self) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:

        with tf.name_scope(self._name + '_param_updates'):

            # This is for the beta and rho updates
            self._jvpdz = tf.gradients(self._vjp,
                                       self._dummy_var,
                                       tf.stop_gradient(self._deltaz),
                                       name='jvpdz')[0]

            if self._diag_hessian_fn is not None:
                #self._hjvpdz = self._diag_hessian_fn(self._predictions_fn_tensor) * self._jvpdz
                self._hjvpdz = self._diag_hessian_fn_tensor * self._jvpdz
            else:
                #self._hjvpdz = tf.gradients(tf.gradients(self._loss_fn_tensor,
                #                                       self._predictions_fn_tensor)[0][None, :]
                #                          @ self._jvpdz[:,None], self._predictions_fn_tensor,
                #                          stop_gradients=self._jvpdz)[0]
                self._hjvpdz = _hessian_vector_product(
                    ys=[self._loss_fn_tensor],
                    xs=[self._predictions_fn_tensor],
                    v=[self._jvpdz])[0]

            a11 = tf.reduce_sum(self._hjvpdz * self._jvpdz)
            a12 = tf.reduce_sum(self._jvpz * self._hjvpdz)
            a22 = tf.reduce_sum(self._jvpz * self._hjvpz)

            b1 = tf.reduce_sum(self._jloss * self._jvpdz)
            b2 = tf.reduce_sum(self._jloss * self._jvpz)

            a11 = a11 + tf.reduce_sum(
                self._deltaz * self._deltaz * self._damping_factor)
            a12 = a12 + tf.reduce_sum(
                self._deltaz * self._z * self._damping_factor)
            a22 = a22 + tf.reduce_sum(self._z * self._z * self._damping_factor)

            A = tf.stack([[a11, a12], [a12, a22]])
            b = tf.stack([b1, b2])

            # Cannot use vanilla matrix inverse because the matrix is sometimes singular
            #m_b = tf.reshape(tf.matrix_inverse(A)  @ b[:, None], [-1])

            # I am using 1e-15 for rcond instead of the default value.
            # While this is a less robust choice, using a higher value of rcond seems to output approximate
            # inverse values which slow down the optimization significantly.
            # Instead, choosing a low value sometimes produces very bad outputs, but we can take care of that
            # using an additional update condition based on the change of the loss function,
            # by requiring that the loss function always decrease.

            def _two_by_two_pinv_sol():
                A_inv = tf.linalg.pinv(A, rcond=1e-15)
                m_b = tf.reshape(A_inv @ b[:, None], [-1])
                #with tf.control_dependencies([tf.print(m_b)]):
                #    m_b_0 = tf.clip_by_value(m_b[0], clip_value_min=1e-5, clip_value_max=1.0)
                #    m_b_1 = tf.clip_by_value(m_b[1], clip_value_min=-np.inf, clip_value_max=-1e-5)
                #    m_b = tf.stack([m_b_0, m_b_1])
                #m_b = tf.reshape(m_b, [-1])
                #m_b = tf.reshape(tf.linalg.lstsq(A, b[:,None], fast=False), [-1])
                #for i in range(10):
                #    db = A @ m_b[:,None] - b[:,None]
                #    m_db = tf.reshape(tf.linalg.lstsq(A, db, fast=False), [-1])
                #    m_b = m_b - m_db
                return m_b

            def _zero_z_sol():
                return tf.stack([b[0] / A[0, 0], 0.])

            m_b = tf.cond(tf.equal(b2, 0.), _zero_z_sol, _two_by_two_pinv_sol)
            beta = m_b[0]
            rho = -m_b[1]
            M = -0.5 * tf.reduce_sum(m_b * b)
            #with tf.control_dependencies([tf.print(M)]):
            #    M = M + 0.
        return beta, rho, M
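Example #19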
for indicator, layer_wt, qlayer_wt in zip(regularize_list, trainable_weights,
                                          qweight_list):
    perturbation_list.append(tf.stop_gradient(layer_wt - qlayer_wt))

for indicator, perturbation_vec in zip(regularize_list, perturbation_list):
    if indicator:
        perturbations_for_hvp.append(perturbation_vec)

print('list of perturbation tensors')
print(perturbation_list)
# exit()

# Compute the Hessian-vector product and the diagonal regularizer here
hvp_list = []

hvp_list = gradients_impl._hessian_vector_product(loss, regularized_weights,
                                                  perturbations_for_hvp)
hessian_vector_product = []

layer_diag_load_amt = []
for layer_hvp, layer_perturbation in zip(hvp_list, perturbations_for_hvp):
    layer_diag_load_amt.append(gamma * .1 * layer_perturbation)

for layer_hvp, diag_load_amt in zip(hvp_list, layer_diag_load_amt):
    hessian_vector_product.append(gamma * 2.0 * layer_hvp + diag_load_amt)

total_grads = []
i = 0
for indicator, layer_grad in zip(regularize_list, ce_grads):
    if indicator:
        total_grads.append(layer_grad + hessian_vector_product[i])
        # total_grads.append(layer_grad)# + hessian_vector_product[i])
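Example #20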
def get_acc_for_gaussian_perturbed_logistic_model_MNIST(
        mu, sigma=.1, const_multiplier=1., record_tensorboard=False):
    import tensorflow as tf
    from tensorflow.examples.tutorials.mnist import input_data
    from tensorflow.python.ops import gradients_impl
    import numpy as np
    tf.reset_default_graph()
    mnist = input_data.read_data_sets('/tmp/data', one_hot=True)

    x = tf.placeholder(tf.float32, shape=(None, 784), name='Inputs')
    y = tf.placeholder(tf.float32, shape=(None, 10), name='Labels')
    gamma = tf.placeholder(tf.float32, shape=(), name='reg_constant')
    nwts = 7840
    w = tf.get_variable(name='w',
                        shape=[784, 10],
                        initializer=tf.contrib.layers.xavier_initializer())
    bias1 = tf.get_variable(
        'bias1',
        shape=(10),
        initializer=tf.random_normal_initializer(stddev=.1))

    w_pert = tf.placeholder(tf.float32, shape=(784, 10))
    # w_pert2 = tf.placeholder(tf.float32, shape=(512,10))
    # 0.1000    0.1292    0.1668    0.2154    0.2783    0.3594    0.4642    0.5995    0.7743    1.0000

    perturbation = tf.stop_gradient(w - w_pert)

    logits = tf.matmul(x, w) + bias1
    y_ = tf.nn.softmax(logits)
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.stop_gradient(
        tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))

    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))

    optimizer = tf.train.AdamOptimizer()
    ce_grads = tf.gradients(loss, [w, bias1])

    if const_multiplier > 0.:
        print('USING REGULARIZATION')

        ce_grads_w1 = ce_grads[0]
        hvp1 = gradients_impl._hessian_vector_product(loss, [w],
                                                      [perturbation])

        diag_load_amt1 = gamma * .01 * perturbation

        # reg_grad1 = gamma * 2.0 * hvp1 + diag_load_amt1
        reg_grad1 = diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))

        tot_grads1 = ce_grads_w1 + reg_grad1
        tf.summary.histogram('regularizer_gradient1', reg_grad1)
        tf.summary.histogram('diagonal_load1', diag_load_amt1)
        tf.summary.histogram('ce_gradient1', ce_grads_w1)
        tf.summary.histogram('ce_gradient1_sq', tf.square(ce_grads_w1))
        tf.summary.scalar('loss_gamma', gamma)
        train_op = optimizer.apply_gradients(
            zip([tot_grads1, ce_grads[1]], [w, bias1]))
    else:
        print('NO REGULARIZATION')
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, bias1]))

    n_iters = 5000
    batch_size = 512
    n_fisher_iters = 1000
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    tf.summary.histogram('weights1', w)
    tf.summary.histogram('pertweights1', w_pert)

    lossval = []
    accval = []

    if record_tensorboard:
        summary_writer = tf.summary.FileWriter('./logs/logistic_adam_v',
                                               sess.graph)
        summary_op = tf.summary.merge_all()
    sess.run(tf.global_variables_initializer())

    w_pert_ = np.zeros([784, 10])

    for i in range(0, n_iters):
        x_batch, y_batch = mnist.train.next_batch(batch_size)

        if i <= (n_iters - n_fisher_iters):
            regularizer_const = 0.
        else:
            regularizer_const = .1 * const_multiplier

        _, l, acc, w_ = sess.run([
            train_op,
            loss,
            accuracy,
            w,
        ],
                                 feed_dict={
                                     x: x_batch,
                                     y: y_batch,
                                     gamma: regularizer_const,
                                     w_pert: w_pert_
                                 })

        if record_tensorboard:
            summ, _, l, acc, w_ = sess.run(
                [summary_op, train_op, loss, accuracy, w],
                feed_dict={
                    x: x_batch,
                    y: y_batch,
                    gamma: regularizer_const,
                    w_pert: w_pert_
                })

        if record_tensorboard:
            summary_writer.add_summary(summ, i)
        lossval.append(l)
        accval.append(acc)

        if i == n_iters - n_fisher_iters:
            print('SAVING OPTIMAL ML WEIGHTS FROM END OF TRAINING')
            w_ = sess.run([w])

        if i >= n_iters - n_fisher_iters and regularizer_const > 0.:
            w_pert_ = w_ + np.random.normal(mu, sigma, size=[784, 10])

        if i == n_iters - 1:
            print('USING PERTURBATIONS ON WEIGHTS AT END OF ALL ITERATIONS')
            w_ = sess.run([w])

        if i % 200 == 0:
            print('\nIteration: ' + str(i) + '\nAccuracy: ' + str(acc) +
                  '\nLoss: ' + str(l) + '\n')

    regularizer_const = 0.

    w_pert_ = np.array(w_).reshape(784, 10) + np.random.normal(
        mu, sigma, size=[784, 10])

    x_testcv = mnist.test.images
    y_testcv = mnist.test.labels
    x_cv = x_testcv[0:5000, :]
    x_test = x_testcv[5000:, :]

    y_cv = y_testcv[0:5000, :]
    y_test = y_testcv[5000:, :]
    up_acc = sess.run(accuracy,
                      feed_dict={
                          x: mnist.test.images,
                          y: mnist.test.labels
                      })
    print('UNPERTURBED Test accuracy %g' % up_acc)
    sess.run(tf.assign(w, w_pert),
             feed_dict={
                 gamma: regularizer_const,
                 w_pert: w_pert_
             })

    pert_acc = sess.run(accuracy,
                        feed_dict={
                            x: mnist.test.images,
                            y: mnist.test.labels,
                            gamma: regularizer_const,
                            w_pert: w_pert_
                        })
    # pert_acc = sess.run(accuracy, feed_dict={x: perturbed_test_set, y: mnist.test.labels})
    print('PERTURBED test accuracy %g' % pert_acc)
    # summary_writer.close()
    sess.close()

    return up_acc, pert_acc
Example #21
    def _param_updates(self) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:

        with tf.name_scope(self._name + '_param_updates'):

            # This is for the beta and rho updates
            ## I think the preconditioning cancels out during the matrix multiplication.
            ## But I am keeping this here in case it helps stabilize the matrix inverse.
            self._jvpdz = tf.gradients(self._vjp,
                                       self._dummy_var,
                                       tf.stop_gradient(self._deltaz),
                                       name='jvpdz')[0]

            if self._diag_hessian_fn is not None:
                self._hjvpdz = self._diag_hessian_fn_tensor * self._jvpdz
            else:
                self._hjvpdz = _hessian_vector_product(
                    ys=[self._loss_fn_tensor],
                    xs=[self._predictions_fn_tensor],
                    v=[self._jvpdz])[0]

            v1 = tf.reduce_sum(self._diag_hessian_fn_tensor) / tf.reduce_sum(
                tf.abs(self._diag_hessian_fn_tensor))

            a110 = tf.reduce_sum(self._hjvpdz * self._jvpdz)
            a12 = tf.reduce_sum(self._jvpz * self._hjvpdz)
            a22 = tf.reduce_sum(self._jvpz * self._hjvpz)

            b1 = tf.reduce_sum(self._jloss * self._jvpdz)
            b2 = tf.reduce_sum(self._jloss * self._jvpz)

            a11 = a110 + tf.reduce_sum(
                self._deltaz * self._deltaz * self._damping_factor)
            a12 = a12 + tf.reduce_sum(
                self._deltaz * self._z * self._damping_factor)
            a22 = a22 + tf.reduce_sum(self._z * self._z * self._damping_factor)

            A = tf.stack([[a11, a12], [a12, a22]])
            b = tf.stack([b1, b2])

            # Cannot use vanilla matrix inverse because the matrix is sometimes singular
            # m_b = tf.reshape(tf.matrix_inverse(A)  @ b[:, None], [-1])

            # I am using 1e-15 for rcond instead of the default value.
            # While this is a less robust choice, using a higher value of rcond seems to output approximate
            # inverse values which slow down the optimization significantly.
            # Instead, choosing a low value sometimes produces very bad outputs, but we can take care of that
            # using an additional update condition based on the change of the loss function,
            # by requiring that the loss function always decrease.

            def _two_by_two_pinv_sol():
                A_inv = tf.linalg.pinv(A, rcond=1e-15)
                m_b = tf.reshape(A_inv @ b[:, None], [-1])
                # m_b = tf.reshape(tf.linalg.lstsq(A, b[:,None], fast=False), [-1])
                # for i in range(2):
                #    db = A @ m_b[:,None] - b[:,None]
                #    m_db = tf.reshape(tf.linalg.lstsq(A, db, fast=False), [-1])
                #    m_b = m_b - m_db
                return m_b

            def _zero_z_sol():
                return tf.stack([b[0] / A[0, 0], 0.])

            m_b = tf.cond(tf.equal(b2, 0.), _zero_z_sol, _two_by_two_pinv_sol)
            beta = m_b[0]
            rho = -m_b[1]
            M = -0.5 * tf.reduce_sum(m_b * b)
            #dot_prod = tf.reduce_sum(self._grad * self._deltaz / tf.linalg.norm(self._grad) / tf.linalg.norm(self._deltaz))
            #with tf.control_dependencies(
            #    [tf.print('beta', beta, 'rho', rho, 'M', M, 'b2', b2, "b1", b1, "a110", a110, "a11", a11, "v1", v1,
            #              "dot_prod", dot_prod)]):
            #    M = M + 0.

        return beta, rho, M
Example #22
    def load_model(self,
                   dataset="mnist",
                   model_name="2-layer",
                   activation="relu",
                   model=None,
                   batch_size=0,
                   compute_slope=False,
                   order=1):
        """
        model: if set to None, load the model specified by dataset and model_name; otherwise use the given model directly.
        dataset: mnist, cifar or imagenet; mnist and cifar are recommended as a starting point.
        model_name: possible options are 2-layer, distilled, and normal.
        """
        from setup_cifar import CIFAR, CIFARModel, TwoLayerCIFARModel
        from setup_mnist import MNIST, MNISTModel, TwoLayerMNISTModel
        from nlayer_model import NLayerModel
        from setup_imagenet import ImageNet, ImageNetModel

        # If set to True, we use the logit-layer output instead of the probability output;
        # the logit layer's gradients are usually larger and more stable.
        output_logits = True
        self.dataset = dataset
        self.model_name = model_name

        if model is None:
            print('Loading model...')
            if dataset == "mnist":
                self.batch_size = 1024
                if model_name == "2-layer":
                    model = TwoLayerMNISTModel("models/mnist_2layer",
                                               self.sess, not output_logits)
                elif model_name == "normal":
                    if activation == "relu":
                        model = MNISTModel("models/mnist", self.sess,
                                           not output_logits)
                    else:
                        print("actviation = {}".format(activation))
                        model = MNISTModel("models/mnist_cnn_7layer_" +
                                           activation,
                                           self.sess,
                                           not output_logits,
                                           activation=activation)
                        time.sleep(5)

                elif model_name == "brelu":
                    model = MNISTModel("models/mnist_brelu",
                                       self.sess,
                                       not output_logits,
                                       use_brelu=True)
                elif model_name == "distilled":
                    model = MNISTModel("models/mnist-distilled-100", self.sess,
                                       not output_logits)
                else:
                    # specify model parameters as N,M,opts
                    model_params = model_name.split(",")
                    if len(model_params) < 3:
                        raise RuntimeError("incorrect model option: " + model_name)
                    numlayer = int(model_params[0])
                    nhidden = int(model_params[1])
                    modelfile = "models/mnist_{}layer_relu_{}_{}".format(
                        numlayer, nhidden, model_params[2])
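                    # e.g. (hypothetical values) model_name = "3,1024,best" gives
                    # modelfile = "models/mnist_3layer_relu_1024_best"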
                    print("loading", modelfile)
                    model = NLayerModel([nhidden] * (numlayer - 1), modelfile)
            elif dataset == "cifar":
                self.batch_size = 1024
                if model_name == "2-layer":
                    model = TwoLayerCIFARModel("models/cifar_2layer",
                                               self.sess, not output_logits)
                elif model_name == "normal":
                    if activation == "relu":
                        model = CIFARModel("models/cifar", self.sess,
                                           not output_logits)
                    else:
                        model = CIFARModel("models/cifar_cnn_7layer_" +
                                           activation,
                                           self.sess,
                                           not output_logits,
                                           activation=activation)
                elif model_name == "brelu":
                    model = CIFARModel("models/cifar_brelu",
                                       self.sess,
                                       not output_logits,
                                       use_brelu=True)
                elif model_name == "distilled":
                    model = CIFARModel("models/cifar-distilled-100", self.sess,
                                       not output_logits)
                else:
                    # specify model parameters as N,M,opts
                    model_params = model_name.split(",")
                    if len(model_params) < 3:
                        raise RuntimeError("incorrect model option: " + model_name)
                    numlayer = int(model_params[0])
                    nhidden = int(model_params[1])
                    modelfile = "models/cifar_{}layer_relu_{}_{}".format(
                        numlayer, nhidden, model_params[2])
                    print("loading", modelfile)
                    model = NLayerModel([nhidden] * (numlayer - 1),
                                        modelfile,
                                        image_size=32,
                                        image_channel=3)
            elif dataset == "imagenet":
                self.batch_size = 32
                model = ImageNetModel(self.sess,
                                      use_softmax=not output_logits,
                                      model_name=model_name,
                                      create_prediction=False)
            else:
                raise (RuntimeError("dataset unknown"))

        #print("*** Loaded model successfully")

        self.model = model
        self.compute_slope = compute_slope
        if batch_size != 0:
            self.batch_size = batch_size

        ## placeholders: self.img, self.true_label, self.target_label
        # img is the placeholder for image input
        self.img = tf.placeholder(shape=[
            None, model.image_size, model.image_size, model.num_channels
        ],
                                  dtype=tf.float32)
        # output is the output tensor of the entire network
        self.output = model.predict(self.img)
        # create the graph to compute gradient
        # get the desired true label and target label
        self.true_label = tf.placeholder(dtype=tf.int32, shape=[])
        self.target_label = tf.placeholder(dtype=tf.int32, shape=[])
        true_output = self.output[:, self.true_label]
        target_output = self.output[:, self.target_label]
        # get the difference
        self.objective = true_output - target_output
        # get the gradient(deprecated arguments)
        self.grad_op = tf.gradients(self.objective, self.img)[0]
        # compute gradient norm: (in computation graph, so is faster)
        grad_op_rs = tf.reshape(self.grad_op, (tf.shape(self.grad_op)[0], -1))
        self.grad_2_norm_op = tf.norm(grad_op_rs, axis=1)
        self.grad_1_norm_op = tf.norm(grad_op_rs, ord=1, axis=1)
        self.grad_inf_norm_op = tf.norm(grad_op_rs, ord=np.inf, axis=1)
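
        # Hedged usage sketch (hypothetical feed values img_batch, c, j): the norm
        # ops defined above can be evaluated directly, e.g.
        #   self.sess.run([self.grad_op, self.grad_2_norm_op],
        #                 feed_dict={self.img: img_batch,
        #                            self.true_label: c, self.target_label: j})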

        ### Lily: added Hessian-vector product calculation here for 2nd order bound:
        if order == 2:
            ## _hessian_vector_product(ys, xs, v): returns a list of tensors containing the product between the Hessian and v
            ## ys: a scalar value, a tensor, or a list of tensors to be summed to yield a scalar
            ## xs: a list of tensors over which the Hessian is constructed
            ## v: a list of tensors, with the same shapes as xs, to multiply by the Hessian
            ## (a standalone usage sketch follows this method)
            # self.randv: shape = (Nimg,28,28,1) (the v in _hessian_vector_product)
            self.randv = tf.placeholder(shape=[
                None, model.image_size, model.image_size, model.num_channels
            ],
                                        dtype=tf.float32)
            # hv_op_tmp: shape = (Nimg,28,28,1) for mnist, same as self.img (the xs in _hessian_vector_product)
            hv_op_tmp = gradients_impl._hessian_vector_product(
                self.objective, [self.img], [self.randv])[0]
            # hv_op_rs: reshape hv_op_tmp to hv_op_rs whose shape = (Nimg, 784) for mnist
            hv_op_rs = tf.reshape(hv_op_tmp, (tf.shape(hv_op_tmp)[0], -1))
            # self.hv_norm_op: norm of hessian vector product, keep shape = (Nimg,1) using keepdims
            self.hv_norm_op = tf.norm(hv_op_rs, axis=1, keepdims=True)
            # hv_op_rs_normalize: normalize Hv to Hv/||Hv||, shape = (Nimg, 784)
            hv_op_rs_normalize = hv_op_rs / self.hv_norm_op
            # self.hv_op: reshape hv_op_rs_normalize to shape = (Nimg,28,28,1)
            self.hv_op = tf.reshape(hv_op_rs_normalize, tf.shape(hv_op_tmp))

            ## reshape randv and compute its norm
            # shape: (Nimg, 784)
            randv_rs = tf.reshape(self.randv, (tf.shape(self.randv)[0], -1))
            # shape: (Nimg,)
            self.randv_norm_op = tf.norm(randv_rs, axis=1)
            ## compute v'Hv: use un-normalized Hv (hv_op_tmp, hv_op_rs)
            # element-wise multiplication and then sum over axis = 1 (now shape: (Nimg,))
            self.vhv_op = tf.reduce_sum(tf.multiply(randv_rs, hv_op_rs),
                                        axis=1)
            ## compute Rayleigh quotient: v'Hv/v'v (estimated largest eigenvalue), shape: (Nimg,)
            # note: self.vhv_op and self.randv_norm_op must have the same shape (either (Nimg,) or (Nimg,1))
            self.eig_est = self.vhv_op / tf.square(self.randv_norm_op)

            ## Lily added the tf.while_loop below to compute the eigenvalue inside the computation graph
            # loop condition for computing the largest absolute / largest negative eigenvalue
            def cond(it, randv, eig_est, eig_est_prev, tfconst):
                norm_diff = tf.norm(eig_est - eig_est_prev, axis=0)
                return tf.logical_and(it < 500, norm_diff > 0.001)

            # compute largest abs eigenvalue: tfconst = 0
            # compute largest neg eigenvalue: tfconst = 10
            def body(it, randv, eig_est, eig_est_prev, tfconst):
                #hv_op_tmp = gradients_impl._hessian_vector_product(self.objective, [self.img], [randv])[0]-10*randv
                hv_op_tmp = gradients_impl._hessian_vector_product(
                    self.objective, [self.img], [randv])[0] - tf.multiply(
                        tfconst, randv)
                hv_op_rs = tf.reshape(hv_op_tmp, (tf.shape(hv_op_tmp)[0], -1))
                hv_norm_op = tf.norm(hv_op_rs, axis=1, keepdims=True)
                hv_op_rs_normalize = hv_op_rs / hv_norm_op
                hv_op = tf.reshape(hv_op_rs_normalize, tf.shape(hv_op_tmp))

                randv_rs = tf.reshape(randv, (tf.shape(randv)[0], -1))
                randv_norm_op = tf.norm(randv_rs, axis=1)
                vhv_op = tf.reduce_sum(tf.multiply(randv_rs, hv_op_rs), axis=1)
                eig_est_prev = eig_est
                eig_est = vhv_op / tf.square(randv_norm_op)

                return (it + 1, hv_op, eig_est, eig_est_prev, tfconst)

            it = tf.constant(0)
            # compute largest abs eigenvalue
            result = tf.while_loop(
                cond, body,
                [it, self.randv, self.vhv_op, self.eig_est,
                 tf.constant(0.0)])
            # compute largest neg eigenvalue
            self.shiftconst = tf.placeholder(shape=(), dtype=tf.float32)
            result_1 = tf.while_loop(
                cond, body,
                [it, self.randv, self.vhv_op, self.eig_est, self.shiftconst])

            # computing largest abs eig value and save result
            self.it = result[0]
            self.while_hv_op = result[1]
            self.while_eig = result[2]

            # computing largest neg eig value and save result
            self.it_1 = result_1[0]
            #self.while_eig_1 = tf.add(result_1[2], tfconst)
            self.while_eig_1 = tf.add(result_1[2], result_1[4])

            show_tensor_op = False
            if show_tensor_op:
                print("====================")
                print("Define hessian_vector_product operator: ")
                print("hv_op_tmp = {}".format(hv_op_tmp))
                print("hv_op_rs = {}".format(hv_op_rs))
                print("self.hv_norm_op = {}".format(self.hv_norm_op))
                print("hv_op_rs_normalize = {}".format(hv_op_rs_normalize))
                print("self.hv_op = {}".format(self.hv_op))
                print("self.grad_op = {}".format(self.grad_op))
                print("randv_rs = {}".format(randv_rs))
                print("self.randv_norm_op = {}".format(self.randv_norm_op))
                print("self.vhv_op = {}".format(self.vhv_op))
                print("self.eig_est = {}".format(self.eig_est))
                print("====================")

        return self.img, self.output
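
A standalone, hedged sketch of the _hessian_vector_product call documented above, assuming TF 1.x (where the helper lives in tensorflow.python.ops.gradients_impl) and a toy scalar objective f over a flat placeholder x; it shows one power-iteration step and the Rayleigh quotient v'Hv / v'v used as the eigenvalue estimate:

import tensorflow as tf
from tensorflow.python.ops.gradients_impl import _hessian_vector_product

x = tf.placeholder(tf.float32, shape=[784])   # flattened input (toy example)
v = tf.placeholder(tf.float32, shape=[784])   # current power-iteration vector
f = tf.reduce_sum(tf.sin(x) * x)              # any scalar objective works here

hv = _hessian_vector_product(f, [x], [v])[0]              # H v, same shape as x
rayleigh = tf.reduce_sum(v * hv) / tf.reduce_sum(v * v)   # v'Hv / v'v (eigenvalue estimate)
v_next = hv / tf.norm(hv)                                 # normalized vector for the next iteration
# Shifting, as in the while_loop above, targets the most negative eigenvalue:
hv_shifted = hv - 10.0 * v                                # power iteration on (H - 10 I)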
        def adam_fn(loss=loss,
                    timestep=t,
                    conv_layer_list=conv_layer_list,
                    fc_layer_list=fc_layer_list):

            new_t = timestep.assign(timestep + 1)

            conv_wgrad_fisher_gradient = []
            fc_wgrad_fisher_gradient = []

            # get the list of conv-layer weights (binarized or real) and biases;
            # here "wgrad" refers to either the weights or the binarized weights
            conv_layer_wgrad_list = []
            conv_layer_bias_list = []

            for convlayer in conv_layer_list:
                conv_layer_bias_list.append(convlayer.bias)
                if convlayer.binary:
                    conv_layer_wgrad_list.append(convlayer.wb)
                else:
                    conv_layer_wgrad_list.append(convlayer.weight)

                if convlayer.fisher and convlayer.binary:
                    # _hessian_vector_product returns a list; take its single
                    # element to get the (Hessian @ perturbation) tensor.
                    conv_wgrad_fisher_gradient.append(
                        gamma * convlayer.fisherconst * 2.0 *
                        gradients_impl._hessian_vector_product(
                            loss, [convlayer.wb],
                            [convlayer.perturbation])[0])
                elif convlayer.fisher:
                    conv_wgrad_fisher_gradient.append(
                        gamma * convlayer.fisherconst * 2.0 *
                        gradients_impl._hessian_vector_product(
                            loss, [convlayer.weight],
                            [convlayer.perturbation])[0])
                else:
                    conv_wgrad_fisher_gradient.append(0.)
            # get the list of fc-layer weights (binarized or real) and biases
            fc_layer_wgrad_list = []
            fc_layer_bias_list = []
            # for fclayer in fc_layer_list:
            #     fc_layer_wgrad_list.append(fclayer.wb)
            #     fc_layer_bias_list.append(fclayer.bias)

            for fclayer in fc_layer_list:
                fc_layer_bias_list.append(fclayer.bias)
                if fclayer.binary:
                    fc_layer_wgrad_list.append(fclayer.wb)
                else:
                    fc_layer_wgrad_list.append(fclayer.weight)

                if fclayer.fisher and fclayer.binary:
                    # As above, index [0] to unwrap the single-element list
                    # returned by _hessian_vector_product.
                    fc_wgrad_fisher_gradient.append(
                        gamma * fclayer.fisherconst * 2.0 *
                        gradients_impl._hessian_vector_product(
                            loss, [fclayer.wb],
                            [fclayer.perturbation])[0] +
                        gamma * fclayer.fisherconst * 2.0 *
                        fclayer.perturbation)
                elif fclayer.fisher:
                    fc_wgrad_fisher_gradient.append(
                        gamma * fclayer.fisherconst * 0.05 *
                        gradients_impl._hessian_vector_product(
                            loss, [fclayer.weight],
                            [fclayer.perturbation])[0] +
                        gamma * fclayer.fisherconst * 0.05 *
                        fclayer.perturbation)
                else:
                    fc_wgrad_fisher_gradient.append(0.)

            print(fc_layer_wgrad_list)
            print(len(fc_layer_wgrad_list))
            # exit()
            # Calculate gradients wrt conv layer wb
            conv_layer_wgrad_grads = tf.gradients(loss, conv_layer_wgrad_list)
            # Calculate gradients wrt fc layer wb
            fc_layer_wgrad_grads = tf.gradients(loss, fc_layer_wgrad_list)
            # Calculate gradients wrt conv layer biases
            conv_layer_bias_grads = tf.gradients(loss, conv_layer_bias_list)
            # Calculate gradients wrt fc layer biases
            fc_layer_bias_grads = tf.gradients(loss, fc_layer_bias_list)

            conv_layer_w_gradient_tot = []
            fc_layer_w_gradient_tot = []

            i = 0
            for conv_w_grad, conv_w_fisher_grad in zip(
                    conv_layer_wgrad_grads, conv_wgrad_fisher_gradient):
                conv_layer_w_gradient_tot.append(conv_w_grad +
                                                 conv_w_fisher_grad)
                if record_tensorboard:
                    tf.summary.histogram('conv_grads_layer' + str(i),
                                         conv_w_grad)
                    tf.summary.histogram('conv_fishergrad_layer' + str(i),
                                         conv_w_fisher_grad)
                i += 1

            for fc_w_grad, fc_w_fisher_grad in zip(fc_layer_wgrad_grads,
                                                   fc_wgrad_fisher_gradient):
                fc_layer_w_gradient_tot.append(fc_w_grad + fc_w_fisher_grad)
                if record_tensorboard:
                    tf.summary.histogram('fc_grads_layer' + str(i), fc_w_grad)
                    tf.summary.histogram('fc_fishergrad_layer' + str(i),
                                         fc_w_fisher_grad)
                i += 1

            # FOR CONV LAYERS:
            new_conv_m_wgrad = []
            new_conv_v_wgrad = []

            new_conv_m_b = []
            new_conv_v_b = []

            new_fc_m_wgrad = []
            new_fc_v_wgrad = []

            new_fc_m_b = []
            new_fc_v_b = []

            # Calculate the Adam moments m and v for the conv-layer weights
            for grad, layer in zip(conv_layer_w_gradient_tot, conv_layer_list):
                # new_conv_m_wgrad.append( layer.m_w.assign(tf.squeeze(beta1 * layer.m_w + (1 - beta1) * grad)))
                new_conv_m_wgrad.append(
                    layer.m_w.assign(beta1 * layer.m_w + (1 - beta1) * grad))
                # new_conv_v_wgrad.append( layer.v_w.assign(tf.squeeze(beta2 * layer.v_w + (1 - beta2) * grad ** 2)))
                new_conv_v_wgrad.append(
                    layer.v_w.assign(beta2 * layer.v_w +
                                     (1 - beta2) * grad**2))

            # Calculate the Adam moments m and v for the conv-layer biases
            for grad, layer in zip(conv_layer_bias_grads, conv_layer_list):
                new_conv_m_b.append(
                    layer.m_b.assign(beta1 * layer.m_b + (1 - beta1) * grad))
                new_conv_v_b.append(
                    layer.v_b.assign(beta2 * layer.v_b +
                                     (1 - beta2) * grad**2))

            #FOR FC LAYERS:
            for grad, layer in zip(fc_layer_w_gradient_tot, fc_layer_list):
                new_fc_m_wgrad.append(
                    layer.m_w.assign(
                        tf.squeeze(beta1 * layer.m_w + (1 - beta1) * grad)))
                new_fc_v_wgrad.append(
                    layer.v_w.assign(
                        tf.squeeze(beta2 * layer.v_w + (1 - beta2) * grad**2)))

            # Calculate the Adam moments m and v for the fc-layer biases
            for grad, layer in zip(fc_layer_bias_grads, fc_layer_list):
                new_fc_m_b.append(
                    layer.m_b.assign(beta1 * layer.m_b + (1 - beta1) * grad))
                new_fc_v_b.append(
                    layer.v_b.assign(beta2 * layer.v_b +
                                     (1 - beta2) * grad**2))

            #CALCULATE UPDATES:
            conv_updates_wgrad = []
            conv_updates_bias = []
            fc_updates_wgrad = []
            fc_updates_bias = []

            #For Conv layers wts
            for m, v in zip(new_conv_m_wgrad, new_conv_v_wgrad):
                conv_updates_wgrad.append(m / (tf.sqrt(v) + epsilon))

            #For conv layer bias
            for m, v in zip(new_conv_m_b, new_conv_v_b):
                conv_updates_bias.append(m / (tf.sqrt(v) + epsilon))

            #For FC layer wts
            for m, v in zip(new_fc_m_wgrad, new_fc_v_wgrad):
                fc_updates_wgrad.append(m / (tf.sqrt(v) + epsilon))

            #For FC layer bias
            for m, v in zip(new_fc_m_b, new_fc_v_b):
                fc_updates_bias.append(m / (tf.sqrt(v) + epsilon))

            return conv_updates_wgrad, conv_updates_bias, fc_updates_wgrad, fc_updates_bias, new_t
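
A hypothetical caller sketch (learning_rate and the layer.weight / layer.bias variables are assumptions, not part of the original code) showing how the per-layer Adam directions returned by adam_fn might be applied; the raw m / (sqrt(v) + eps) direction is used as returned, and no bias-correction terms are added in this sketch:

        conv_w_upd, conv_b_upd, fc_w_upd, fc_b_upd, new_t = adam_fn()

        train_ops = [new_t]
        for layer, w_upd, b_upd in zip(conv_layer_list, conv_w_upd, conv_b_upd):
            # Scale each Adam direction by the step size and subtract it in place.
            train_ops.append(layer.weight.assign_sub(learning_rate * w_upd))
            train_ops.append(layer.bias.assign_sub(learning_rate * b_upd))
        for layer, w_upd, b_upd in zip(fc_layer_list, fc_w_upd, fc_b_upd):
            train_ops.append(layer.weight.assign_sub(learning_rate * w_upd))
            train_ops.append(layer.bias.assign_sub(learning_rate * b_upd))
        train_op = tf.group(*train_ops)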