def get_min_eigvec(self, loss, vars):
    iterations = 10
    eps = 3
    v = [self._get_initial_vector(vars)]
    eigvals = []
    grad = self._list_to_tensor(tf.gradients(loss, vars))
    for i in range(iterations):
        # Power iteration with the shifted Hessian
        v_new = self._list_to_tensor(
            _hessian_vector_product(loss, vars, self._tensor_to_list(v[i], vars)))
        v.append(eps * v[i] - v_new)
        v[i + 1] = self._normalize(v[i + 1])
        # Get corresponding eigenvalue
        eigval = tf.reduce_sum(
            tf.multiply(
                v[i],
                self._list_to_tensor(
                    _hessian_vector_product(loss, vars, self._tensor_to_list(v[i], vars)))))
        eigvals.append(eigval)
    idx = iterations - 1  # tf.cast(tf.argmin(eigvals[3:iterations-1]), tf.int32)
    e = tf.gather(eigvals, idx)
    v = tf.gather(v, idx)
    _sign = -tf.sign(tf.reduce_sum(tf.multiply(grad, v)))
    v *= _sign
    return v, e
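The helpers used above (`_get_initial_vector`, `_list_to_tensor`, `_tensor_to_list`, `_normalize`) are not shown in this snippet. A hypothetical sketch of what they might look like, assuming they simply flatten/unflatten per-variable tensors into a single vector:

import numpy as np
import tensorflow as tf

# Hypothetical helpers (not from the original source), assumed by get_min_eigvec above.
def _list_to_tensor(self, tensors):
    # Concatenate a list of per-variable tensors into one flat vector.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def _tensor_to_list(self, tensor, vars):
    # Split a flat vector back into tensors shaped like each variable in `vars`.
    pieces, offset = [], 0
    for var in vars:
        size = int(np.prod(var.shape.as_list()))
        pieces.append(tf.reshape(tensor[offset:offset + size], var.shape))
        offset += size
    return pieces

def _normalize(self, tensor):
    # Rescale to unit Euclidean norm.
    return tensor / tf.norm(tensor)

def _get_initial_vector(self, vars):
    # Random unit-norm starting vector for the power iteration.
    total = sum(int(np.prod(v.shape.as_list())) for v in vars)
    return self._normalize(tf.random_normal([total]))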
def _second_order(self) -> None:
    with tf.name_scope(self._name + '_second_order'):
        self._vjp = tf.gradients(self._predictions_fn_tensor, self._input_var,
                                 self._dummy_var, name='vjp')[0]
        self._jvpz = tf.gradients(self._vjp, self._dummy_var,
                                  tf.stop_gradient(self._z), name='jvpz')[0]
        if self._diag_hessian_fn is not None:
            self._hjvpz = self._diag_hessian_fn_tensor * self._jvpz
        else:
            self._hjvpz = _hessian_vector_product(
                ys=[self._loss_fn_tensor],
                xs=[self._predictions_fn_tensor],
                v=[self._jvpz])[0]
        # J^T H J z
        self._jhjvpz = tf.gradients(self._predictions_fn_tensor, self._input_var,
                                    self._hjvpz + self._jloss, name='jhjvpz')[0]
        self._precond_this_iter = 1.
        if self._diag_precond_t is not None:
            self._precond_this_iter = 1 / (self._diag_precond_t + self._damping_factor)
        self._deltaz = self._precond_this_iter * (self._jhjvpz + self._damping_factor * self._z)
        self._grads_tensor = tf.gradients(self._loss_fn_tensor, self._input_var)[0]
def testHessianVectorProduct(self):
    # Manually compute the Hessian explicitly for a low-dimensional problem
    # and check that HessianVectorProduct matches multiplication by the
    # explicit Hessian.
    # Specifically, the Hessian of f(x) = x^T A x is
    # H = A + A^T.
    # We expect HessianVectorProduct(f(x), x, v) to be H v.
    m = 4
    rng = np.random.RandomState([1, 2, 3])
    mat_value = rng.randn(m, m).astype("float32")
    v_value = rng.randn(m, 1).astype("float32")
    x_value = rng.randn(m, 1).astype("float32")
    hess_value = mat_value + mat_value.T
    hess_v_value = np.dot(hess_value, v_value)
    for use_gpu in [False, True]:
        with self.test_session(use_gpu=use_gpu):
            mat = constant_op.constant(mat_value)
            v = constant_op.constant(v_value)
            x = constant_op.constant(x_value)
            mat_x = math_ops.matmul(mat, x, name="Ax")
            x_mat_x = math_ops.matmul(array_ops.transpose(x), mat_x, name="xAx")
            hess_v = gradients_impl._hessian_vector_product(x_mat_x, [x], [v])[0]
            hess_v_actual = hess_v.eval()
        self.assertAllClose(hess_v_value, hess_v_actual)
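For reference, the quantity being tested is computed without ever materializing the Hessian: the usual Hessian-vector-product trick is two backward passes. A minimal sketch of that idea (my own illustration, not the library's code) in the same TF 1.x graph API:

import tensorflow as tf

def _hvp_sketch(y, xs, vs):
    # First backward pass: g = dy/dx for each x in xs.
    grads = tf.gradients(y, xs)
    # Dot each gradient with v, treating v as a constant.
    grad_dot_v = [tf.reduce_sum(g * tf.stop_gradient(v)) for g, v in zip(grads, vs)]
    # Second backward pass: d/dx (g . v) = H v.
    return tf.gradients(grad_dot_v, xs)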
def init_gradients(self):
    self.loss_gradient = tf.gradients(self.loss, self.policy.params_list)
    self.hvp = _hessian_vector_product(self.kl_div, self.policy.params_list, self.v_plh)
    self.grad_plh = [
        tf.placeholder(tf.float32, shape=s, name="grad_plh_%d" % i)
        for i, s in zip(range(len(self.policy.params_shapes)), self.policy.params_shapes)
    ]
def __init__(self, h, xs):
    self.h = h
    self.xs = xs
    self.v = tf.placeholder(tf.float32, lst_to_vec(xs).shape[0])
    # The `v` arg to _hessian_vector_product is a list of tensors with same structure as `xs`,
    # but the cg alg will give us a vector. Thus need to wrangle `v` into correct form.
    vec_as_list = vec_to_lst(self.v, xs)
    self.fv_product = lst_to_vec(_hessian_vector_product(h, xs, vec_as_list))
    self.feed_dict = None
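The `lst_to_vec` / `vec_to_lst` helpers referenced above are not part of this snippet. A hypothetical sketch of what such flatten/unflatten utilities typically look like, under the assumption that the tensors in `xs` have static shapes:

import numpy as np
import tensorflow as tf

# Hypothetical helpers (assumed, not from the original source).
def lst_to_vec(tensors):
    # Flatten a list of tensors into a single 1-D vector.
    return tf.concat([tf.reshape(t, [-1]) for t in tensors], axis=0)

def vec_to_lst(vec, like_tensors):
    # Split a flat vector back into pieces shaped like each tensor in `like_tensors`.
    pieces, offset = [], 0
    for t in like_tensors:
        size = int(np.prod(t.shape.as_list()))
        pieces.append(tf.reshape(vec[offset:offset + size], tf.shape(t)))
        offset += size
    return pieces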
def __init__(self, workspace, feeder, loss_op_train, loss_op_test, x_placeholder,
             y_placeholder, test_feed_options=None, train_feed_options=None,
             trainable_variables=None):
    self.workspace = workspace
    self.feeder = feeder
    self.x_placeholder = x_placeholder
    self.y_placeholder = y_placeholder
    self.test_feed_options = test_feed_options if test_feed_options else dict()
    self.train_feed_options = train_feed_options if train_feed_options else dict()

    if trainable_variables is None:
        trainable_variables = (
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) +
            tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))

    self.loss_op_train = loss_op_train
    self.grad_op_train = tf.gradients(loss_op_train, trainable_variables)
    self.grad_op_test = tf.gradients(loss_op_test, trainable_variables)

    self.v_cur_estimated = [tf.placeholder(tf.float32, shape=a.get_shape())
                            for a in trainable_variables]
    self.v_test_grad = [tf.placeholder(tf.float32, shape=a.get_shape())
                        for a in trainable_variables]
    self.v_ihvp = tf.placeholder(tf.float64, shape=[None])
    self.v_param_damping = tf.placeholder(tf.float32)
    self.v_param_scale = tf.placeholder(tf.float32)
    self.v_param_total_trainset = tf.placeholder(tf.float64)

    self.inverse_hvp = None
    self.trainable_variables = trainable_variables

    with tf.name_scope('darkon_ihvp'):
        self.hessian_vector_op = _hessian_vector_product(loss_op_train,
                                                         trainable_variables,
                                                         self.v_cur_estimated)
        self.estimation_op = [
            a + (b * self.v_param_damping) - (c / self.v_param_scale)
            for a, b, c in zip(self.v_test_grad, self.v_cur_estimated, self.hessian_vector_op)
        ]

    with tf.name_scope('darkon_grad_diff'):
        flatten_inverse_hvp = tf.reshape(self.v_ihvp, shape=(-1, 1))
        flatten_grads = tf.concat([tf.reshape(a, (-1,)) for a in self.grad_op_train], 0)
        flatten_grads = tf.reshape(flatten_grads, shape=(1, -1,))
        flatten_grads = tf.cast(flatten_grads, tf.float64)
        flatten_grads /= self.v_param_total_trainset
        self.grad_diff_op = tf.matmul(flatten_grads, flatten_inverse_hvp)

    self.ihvp_config = {
        'scale': 1e4,
        'damping': 0.01,
        'num_repeats': 1,
        'recursion_batch_size': 10,
        'recursion_depth': 10000
    }

    if not os.path.exists(self.workspace):
        os.makedirs(self.workspace)
def testInvalidSecondGradient(self):
    inputs = np.random.randn(2, 2, 3).astype(np.float32)
    inputs_t = constant_op.constant(inputs)
    labels = SimpleSparseTensorFrom([[0, 1], [1, 0]])
    seq_lens = np.array([2, 2], dtype=np.int32)
    v = [1.0]
    with self.session(use_gpu=False):
        loss = _ctc_loss_v2(inputs=inputs_t, labels=labels, sequence_length=seq_lens)
        # Taking this second gradient should fail, since it is not
        # yet supported.
        with self.assertRaisesRegex(LookupError, "explicitly disabled"):
            _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
def _setupHessianVectorProduct(self,
                               jvp_fn: Callable[[tf.Tensor], tf.Tensor],
                               x: tf.Tensor,
                               v_constant: tf.Tensor) -> tf.Tensor:
    predictions_this = self._predictions_fn(v_constant)
    if self._diag_hessian_fn is None:
        loss_this = self._loss_fn(predictions_this)
        hjvp = _hessian_vector_product(ys=[loss_this],
                                       xs=[predictions_this],
                                       v=[jvp_fn(x)])
    else:
        hjvp = self._diag_hessian_fn(predictions_this) * jvp_fn(x)
    jhjvp = tf.gradients(predictions_this, v_constant, hjvp)[0]
    return jhjvp
def body(it, randv, eig_est, eig_est_prev, tfconst):
    #hv_op_tmp = gradients_impl._hessian_vector_product(self.objective, [self.img], [randv])[0]-10*randv
    hv_op_tmp = gradients_impl._hessian_vector_product(
        self.objective, [self.img], [randv])[0] - tf.multiply(tfconst, randv)
    hv_op_rs = tf.reshape(hv_op_tmp, (tf.shape(hv_op_tmp)[0], -1))
    hv_norm_op = tf.norm(hv_op_rs, axis=1, keepdims=True)
    hv_op_rs_normalize = hv_op_rs / hv_norm_op
    hv_op = tf.reshape(hv_op_rs_normalize, tf.shape(hv_op_tmp))
    randv_rs = tf.reshape(randv, (tf.shape(randv)[0], -1))
    randv_norm_op = tf.norm(randv_rs, axis=1)
    vhv_op = tf.reduce_sum(tf.multiply(randv_rs, hv_op_rs), axis=1)
    eig_est_prev = eig_est
    eig_est = vhv_op / tf.square(randv_norm_op)
    return (it + 1, hv_op, eig_est, eig_est_prev, tfconst)
def __init__(self, workspace, feeder, loss_op_train, loss_op_test, x_placeholder,
             y_placeholder, test_feed_options=None, train_feed_options=None,
             trainable_variables=None):
    self.workspace = workspace
    self.feeder = feeder
    self.x_placeholder = x_placeholder
    self.y_placeholder = y_placeholder
    self.test_feed_options = test_feed_options if test_feed_options else dict()
    self.train_feed_options = train_feed_options if train_feed_options else dict()

    if trainable_variables is None:
        trainable_variables = (
            tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES) +
            tf.get_collection(tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))

    self.loss_op_train = loss_op_train
    self.grad_op_train = tf.gradients(loss_op_train, trainable_variables)
    self.grad_op_test = tf.gradients(loss_op_test, trainable_variables)

    self.v_placeholder = [
        tf.placeholder(tf.float32, shape=a.get_shape())
        for a in trainable_variables
    ]
    self.hessian_vector_op = _hessian_vector_product(loss_op_train,
                                                     trainable_variables,
                                                     self.v_placeholder)
    self.inverse_hvp = None
    self.trainable_variables = trainable_variables

    self.ihvp_config = {
        'scale': 1e4,
        'damping': 0.01,
        'num_repeats': 1,
        'recursion_batch_size': 10,
        'recursion_depth': 10000
    }

    if not os.path.exists(self.workspace):
        os.makedirs(self.workspace)
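Graphs like the two constructors above are usually driven by a LiSSA-style recursion that estimates an inverse Hessian-vector product by repeatedly running the HVP op on mini-batches. A hedged sketch of that driver loop, assuming a helper `run_hvp` that feeds `cur_estimate` into `self.v_placeholder` together with a training batch and runs `self.hessian_vector_op` (names and signature are illustrative only, not the library's actual API):

import numpy as np

# Hypothetical sketch of the damped Neumann-series (LiSSA) recursion that such a graph drives:
#   v <- g + (1 - damping) * v - (H v) / scale, repeated `depth` times, then rescaled.
def estimate_inverse_hvp(sess, run_hvp, test_grad, damping=0.01, scale=1e4, depth=10000):
    cur_estimate = [np.copy(g) for g in test_grad]   # start from the test gradient
    for _ in range(depth):
        hvp = run_hvp(sess, cur_estimate)            # H v evaluated on a sampled mini-batch
        cur_estimate = [g + (1 - damping) * v - h / scale
                        for g, v, h in zip(test_grad, cur_estimate, hvp)]
    return [v / scale for v in cur_estimate]         # approximate H^{-1} g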
def _second_order(self) -> None:
    with tf.name_scope(self._name + '_second_order'):
        self._vjp = tf.gradients(self._predictions_fn_tensor, self._input_var,
                                 self._dummy_var, name='vjp')[0]
        self._jvpz = tf.gradients(self._vjp, self._dummy_var,
                                  tf.stop_gradient(self._z), name='jvpz')[0]
        if self._diag_hessian_fn is not None:
            self._hjvpz = self._diag_hessian_fn_tensor * self._jvpz
        else:
            # I have commented out my implementation of the hessian-vector product.
            # Using the tensorflow implementation instead.
            #self._hjvpz = tf.gradients(tf.gradients(self._loss_fn_tensor,
            #                                        self._predictions_fn_tensor)[0][None, :]
            #                           @ self._jvpz[:, None], self._predictions_fn_tensor,
            #                           stop_gradients=self._jvpz)[0]
            self._hjvpz = _hessian_vector_product(
                ys=[self._loss_fn_tensor],
                xs=[self._predictions_fn_tensor],
                v=[self._jvpz])[0]
        # J^T H J z
        self._jhjvpz = tf.gradients(self._predictions_fn_tensor, self._input_var,
                                    self._hjvpz + self._jloss, name='jhjvpz')[0]
        self._deltaz = self._jhjvpz + self._damping_factor * self._z
        self._grad_t = tf.gradients(self._loss_fn_tensor, self._input_var, name='grad')[0]
def __init__(self, feeder, model, workspace='./influence-workspace',
             trainable_variables=None):
    self.workspace = workspace
    self.feeder = feeder
    self.x_placeholder = model.input
    self.y_placeholder = K.placeholder(shape=model.output.shape)
    self.test_feed_options = dict()
    self.train_feed_options = dict()

    # If the model's loss is a string, make sure it refers to a Keras loss function.
    if model.loss in kerasLossDict.keys():
        loss_op_train = kerasLossDict[model.loss](self.y_placeholder, model.output)
        loss_op_test = kerasLossDict[model.loss](self.y_placeholder, model.output)
    else:
        loss_op_train = model.loss(self.y_placeholder, model.output)
        loss_op_test = model.loss(self.y_placeholder, model.output)

    if not trainable_variables:
        trainable_variables = model.trainable_weights

    self.loss_op_train = loss_op_train
    self.grad_op_train = K.gradients(loss_op_train, trainable_variables)
    self.grad_op_test = K.gradients(loss_op_test, trainable_variables)

    self.v_cur_estimated = [
        tf.placeholder(tf.float32, shape=a.get_shape())
        for a in trainable_variables
    ]
    self.v_test_grad = [
        tf.placeholder(tf.float32, shape=a.get_shape())
        for a in trainable_variables
    ]
    self.v_ihvp = tf.placeholder(tf.float64, shape=[None])
    self.v_param_damping = tf.placeholder(tf.float32)
    self.v_param_scale = tf.placeholder(tf.float32)
    self.v_param_total_trainset = tf.placeholder(tf.float64)

    self.inverse_hvp = None
    self.trainable_variables = trainable_variables

    with tf.name_scope('model_ihvp'):
        self.hessian_vector_op = _hessian_vector_product(
            loss_op_train, trainable_variables, self.v_cur_estimated)
        self.estimation_op = [
            a + (b * self.v_param_damping) - (c / self.v_param_scale)
            for a, b, c in zip(self.v_test_grad, self.v_cur_estimated,
                               self.hessian_vector_op)
        ]

    with tf.name_scope('model_grad_diff'):
        flatten_inverse_hvp = tf.reshape(self.v_ihvp, shape=(-1, 1))
        flatten_grads = tf.concat([tf.reshape(a, (-1,)) for a in self.grad_op_train], 0)
        flatten_grads = tf.reshape(flatten_grads, shape=(1, -1,))
        flatten_grads = tf.cast(flatten_grads, tf.float64)
        flatten_grads /= self.v_param_total_trainset
        self.grad_diff_op = tf.matmul(flatten_grads, flatten_inverse_hvp)

    self.ihvp_config = {
        'scale': 1e4,
        'damping': 0.01,
        'num_repeats': 1,
        'recursion_batch_size': 10,
        'recursion_depth': 10000
    }

    if not os.path.exists(self.workspace):
        os.makedirs(self.workspace)
def compute_gradients(self, loss, var_list=None, aggregation_method=None,
                      colocate_gradients_with_ops=False, device='/cpu:0'):
    """Compute gradients of `loss` for the variables in `var_list`.

    This is the first part of `minimize()`. It returns a list of
    (gradient, variable) pairs where "gradient" is the gradient for "variable".
    Note that "gradient" can be a `Tensor`, an `IndexedSlices`, or `None` if
    there is no gradient for the given variable.

    Args:
      loss: A Tensor containing the value to minimize or a callable taking no
        arguments which returns the value to minimize. When eager execution is
        enabled it must be a callable.
      var_list: Optional list or tuple of `tf.Variable` to update to minimize
        `loss`. Defaults to the list of variables collected in the graph under
        the key `GraphKeys.TRAINABLE_VARIABLES`.
      aggregation_method: Specifies the method used to combine gradient terms.
        Valid values are defined in the class `AggregationMethod`.
      colocate_gradients_with_ops: If True, try colocating gradients with the
        corresponding op.
      device: Which device to compute the variables' dot product on.

    Returns:
      A list of (gradient, variable) pairs. Variable is always present, but
      gradient can be `None`.

    Raises:
      TypeError: If `var_list` contains anything else than `Variable` objects.
      ValueError: If some arguments are invalid.
      NotImplementedError: If called with eager execution enabled or with an
        unknown loss name.

    @compatibility(eager)
    Not compatible.
    @end_compatibility
    """
    if callable(loss):
        raise NotImplementedError('Eager execution is not available yet')

    if self._autolambda:
        self._lambda = tf.reshape(
            tf.cond(tf.equal(tf.mod(self._step, self._auto_step), 0),
                    self._autolam, lambda: self._lambda), [])

    self._loss = loss

    # Get trainable variables
    if var_list is None:
        var_list = (
            variables.trainable_variables() +
            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
    else:
        var_list = nest.flatten(var_list)
    # pylint: disable=protected-access
    var_list += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)

    # Check if we have anything to optimize
    if not var_list:
        raise ValueError("No variables to optimize.")

    # TODO: enable more-variables mode; maybe fix z device placement
    var_refs = var_list

    # Init momentum vector
    mu = 1.
    self._z = []
    self._zdic = {}
    for i in range(len(var_refs)):
        self._z.append(
            tf.get_variable("z%03d" % (i),
                            shape=var_refs[i].get_shape(),
                            caching_device=var_refs[i].device,
                            initializer=tf.zeros_initializer()))
        self._zdic[var_refs[i].name] = self._z[i]

    # Do two GD steps: first update z (the linear-system state), then use the
    # result (whitened gradient estimate) to update the parameters w.
    #
    # zdelta = grad{||(mu * J' * Hl * J + lambda * I) * z - J' * Jl'||^2}
    #        = (mu * J' * Hl * J + lambda * I) * z - J' * Jl'
    #        = mu * J' * Hl * J * z + lambda * z - J' * Jl'
    #
    # znew = momentum * z - beta * zdelta
    #
    # wnew = w - lr * znew

    # Assert that pre_loss is a single TensorFlow tensor, for simplicity
    if not isinstance(self._pre_loss, Tensor):
        raise NotImplementedError('Optimizer not yet working with vector of logits')

    delta_z = []
    if not self._hessian:
        Jz = fmad_prod(self._pre_loss, var_refs, self._z)
        # Evaluate Hessian loss and gradient
        Jz_ = self._hessian_grad_loss(self._loss_name, self._pre_loss, loss, Jz)
        Jl = tf.gradients(loss, self._pre_loss)[0]
        Jz_ = mu * Jz_
        # Backpropagate Jz_ - Jl
        h_term = tf.gradients(self._pre_loss, var_refs, Jz_ + Jl)
        for i in range(len(var_refs)):
            delta_z.append(h_term[i] + self._lambda * self._z[i])
    else:
        # Compute gradient w.r.t. the loss
        grad = tf.gradients(loss, var_refs)
        # TensorFlow built-in function to compute Hessian-vector products
        h_term = _hessian_vector_product(loss, var_refs, self._z)
        for i in range(len(var_refs)):
            delta_z.append(h_term[i] + self._lambda * self._z[i] + grad[i])

    # Autoparam
    if self._autoparam:
        if not self._hessian:
            Jdz = fmad_prod(self._pre_loss, var_refs, delta_z)
            Jdz_ = self._hessian_grad_loss(self._loss_name, self._pre_loss, loss, Jdz)
            with tf.device(device):
                A11 = mu * tf.matmul(tf.reshape(Jdz, [1, -1]), tf.reshape(Jdz_, [-1, 1]))
                A12 = mu * tf.matmul(tf.reshape(Jz, [1, -1]), tf.reshape(Jdz_, [-1, 1]))
                A22 = mu * tf.matmul(tf.reshape(Jz, [1, -1]), tf.reshape(Jz_, [-1, 1]))
                b1 = tf.matmul(tf.reshape(Jl, [1, -1]), tf.reshape(Jdz, [-1, 1]))
                b2 = tf.matmul(tf.reshape(Jl, [1, -1]), tf.reshape(Jz, [-1, 1]))
                for i in range(len(var_refs)):
                    # Compute the system we want to invert
                    z_vec = tf.reshape(self._z[i], [1, -1])
                    dz_vec = tf.reshape(delta_z[i], [1, -1])
                    A11 = A11 + tf.matmul(dz_vec, dz_vec, transpose_b=True) * self._lambda
                    A12 = A12 + tf.matmul(dz_vec, z_vec, transpose_b=True) * self._lambda
                    A22 = A22 + tf.matmul(z_vec, z_vec, transpose_b=True) * self._lambda
        else:
            # TensorFlow built-in function to compute Hessian-vector products
            h_term_dz = _hessian_vector_product(loss, var_refs, delta_z)
            with tf.device(device):
                A11, A12, A22 = 0, 0, 0
                b1, b2 = 0, 0
                for i in range(len(var_refs)):
                    # Compute the system we want to invert
                    z_vec = tf.reshape(self._z[i], [1, -1])
                    dz_vec = tf.reshape(delta_z[i], [1, -1])
                    hz_vec = tf.reshape(h_term[i], [1, -1])
                    hdz_vec = tf.reshape(h_term_dz[i], [1, -1])
                    A11 = A11 + tf.matmul(hdz_vec, dz_vec, transpose_b=True) + \
                        tf.matmul(dz_vec, dz_vec, transpose_b=True) * self._lambda
                    A12 = A12 + tf.matmul(hdz_vec, z_vec, transpose_b=True) + \
                        tf.matmul(dz_vec, z_vec, transpose_b=True) * self._lambda
                    A22 = A22 + tf.matmul(hz_vec, z_vec, transpose_b=True) + \
                        tf.matmul(z_vec, z_vec, transpose_b=True) * self._lambda
                    b1 = b1 + tf.matmul(tf.reshape(grad[i], [1, -1]),
                                        tf.reshape(dz_vec, [-1, 1]))
                    b2 = b2 + tf.matmul(tf.reshape(grad[i], [1, -1]),
                                        tf.reshape(z_vec, [-1, 1]))

        # Compute beta and momentum coefficient
        A = tf.concat([tf.concat([A11, A12], 0), tf.concat([A12, A22], 0)], 1)
        b = tf.concat([b1, b2], 0)
        # Solve linear system
        m_b = tf.matrix_solve_ls(A, b, l2_regularizer=self._autoparam_reg, fast=False)
        self._M = -0.5 * tf.reduce_sum(m_b * b)
        m_b = tf.unstack(m_b, axis=0)
        beta = -tf.to_float(m_b[0])
        self._momentum = -tf.to_float(m_b[1])
    else:
        beta = -self._beta

    # Update gradient
    for i in range(len(var_refs)):
        # delta_z handles the momentum update
        delta_z[i] = beta * delta_z[i]

    grads_and_vars = list(zip(delta_z, var_list))
    self._assert_valid_dtypes([
        v for g, v in grads_and_vars
        if g is not None and v.dtype != dtypes.resource
    ])
    return grads_and_vars
def get_acc_for_nonzero_gaussian_perturbed_two_layer_model_MNIST(
        mu, sigma=.1, const_multiplier=1., n_tot_iters=5000, n_fisher_iters=2000,
        record_tensorboard=False, regularizer_mode='hvp'):
    import tensorflow as tf
    from tensorflow.examples.tutorials.mnist import input_data
    from tensorflow.python.ops import gradients_impl
    import numpy as np

    tf.reset_default_graph()
    mnist = input_data.read_data_sets('/tmp/data', one_hot=True)

    x = tf.placeholder(tf.float32, shape=(None, 784), name='Inputs')
    y = tf.placeholder(tf.float32, shape=(None, 10), name='Labels')
    gamma = tf.placeholder(tf.float32, shape=(), name='reg_constant')

    nwts = 7840
    # wts = tf.get_variable('Weights', shape=(784, 10), initializer=tf.random_normal_initializer(stddev=.001))
    w = tf.get_variable(name='w', shape=[784, 512],
                        initializer=tf.contrib.layers.xavier_initializer())
    w2 = tf.get_variable(name='w2', shape=[512, 10],
                         initializer=tf.contrib.layers.xavier_initializer())
    bias1 = tf.get_variable('bias1', shape=(512),
                            initializer=tf.random_normal_initializer(stddev=.1))
    bias2 = tf.get_variable('bias2', shape=(10),
                            initializer=tf.random_normal_initializer(stddev=.1))

    w_pert = tf.placeholder(tf.float32, shape=(784, 512))
    w_pert2 = tf.placeholder(tf.float32, shape=(512, 10))

    # 0.1000 0.1292 0.1668 0.2154 0.2783 0.3594 0.4642 0.5995 0.7743 1.0000
    # w_pert = tf.stop_gradient(w + shift_pctage*w)
    perturbation = tf.stop_gradient(w - w_pert)
    perturbation2 = tf.stop_gradient(w2 - w_pert2)

    layer_1_out = tf.nn.relu(tf.matmul(x, w) + bias1)
    logits = tf.matmul(layer_1_out, w2) + bias2
    y_ = tf.nn.softmax(logits)

    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.stop_gradient(tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))

    optimizer = tf.train.AdamOptimizer()
    ce_grads = tf.gradients(loss, [w, w2, bias1, bias2])
    ce_grads_w1 = ce_grads[0]
    ce_grads_w2 = ce_grads[1]
    # print(vars)

    tf.summary.histogram('weights1', w)
    tf.summary.histogram('weights2', w2)
    tf.summary.histogram('pertweights1', w_pert)
    tf.summary.histogram('pertweights2', w_pert2)

    if regularizer_mode == 'hvp_adam':
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, w2, bias1, bias2]))
        hvp1 = gradients_impl._hessian_vector_product(loss, [w], [perturbation])
        hvp2 = gradients_impl._hessian_vector_product(loss, [w2], [perturbation2])
        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2
        reg_grad1 = gamma * 2.0 * hvp1 + diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = gamma * 2.0 * hvp2 + diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        train_op_reg = optimizer.apply_gradients(zip([reg_grad1, reg_grad2], [w, w2]))
    elif regularizer_mode == 'diag_adam':
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, w2, bias1, bias2]))
        vars = optimizer.variables()
        v_2 = vars[-1]
        v_1 = vars[-3]
        hvp1 = tf.multiply(v_1, perturbation)
        hvp2 = tf.multiply(v_2, perturbation2)
        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2
        reg_grad1 = gamma * 2.0 * hvp1 + diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = gamma * 2.0 * hvp2 + diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        train_op_reg = optimizer.apply_gradients(zip([reg_grad1, reg_grad2], [w, w2]))
    elif regularizer_mode == 'l2_adam':
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, w2, bias1, bias2]))
        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2
        reg_grad1 = diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        train_op_reg = optimizer.apply_gradients(zip([reg_grad1, reg_grad2], [w, w2]))
    elif regularizer_mode == 'hvp':
        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2
        hvp1 = gradients_impl._hessian_vector_product(loss, [w], [perturbation])
        hvp2 = gradients_impl._hessian_vector_product(loss, [w2], [perturbation2])
        reg_grad1 = gamma * 2.0 * hvp1 + diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = gamma * 2.0 * hvp2 + diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        tot_grads1 = ce_grads_w1 + reg_grad1
        tot_grads2 = ce_grads_w2 + reg_grad2
        train_op = optimizer.apply_gradients(
            zip([tot_grads1, tot_grads2, ce_grads[2], ce_grads[3]], [w, w2, bias1, bias2]))
        train_op_reg = tf.no_op()
    elif regularizer_mode == 'l2':
        diag_load_amt1 = gamma * .005 * perturbation
        diag_load_amt2 = gamma * .005 * perturbation2
        reg_grad1 = diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        reg_grad2 = diag_load_amt2
        reg_grad2 = tf.reshape(reg_grad2, tf.shape(w2))
        tot_grads1 = ce_grads_w1 + reg_grad1
        tot_grads2 = ce_grads_w2 + reg_grad2
        train_op = optimizer.apply_gradients(
            zip([tot_grads1, tot_grads2, ce_grads[2], ce_grads[3]], [w, w2, bias1, bias2]))
        train_op_reg = tf.no_op()
    else:
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, w2, bias1, bias2]))
        train_op_reg = tf.no_op()

    tf.summary.histogram('ce_gradient1', ce_grads_w1)
    tf.summary.histogram('ce_gradient2', ce_grads_w2)

    if const_multiplier > 0.:
        print('USING REGULARIZATION')
        tf.summary.histogram('regularizer_gradient1', reg_grad1)
        tf.summary.histogram('regularizer_gradient2', reg_grad2)
        tf.summary.histogram('diagonal_load1', diag_load_amt1)
        tf.summary.histogram('diagonal_load2', diag_load_amt2)
        tf.summary.scalar('loss_gamma', gamma)
    else:
        print('NO REGULARIZATION')
        train_op_reg = tf.no_op()

    n_iters = n_tot_iters
    batch_size = 1024
    n_fisher_iters = n_fisher_iters
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    if record_tensorboard:
        summary_writer = tf.summary.FileWriter('./logs/two_layer_zero_mean', sess.graph)
    summary_op = tf.summary.merge_all()

    lossval = []
    accval = []
    sess.run(tf.global_variables_initializer())
    regularizer_const = 0.
    w_pert_ = np.zeros([784, 512])
    w_pert2_ = np.zeros([512, 10])

    for i in range(0, n_iters):
        x_batch, y_batch = mnist.train.next_batch(batch_size)
        if i <= (n_iters - n_fisher_iters):
            regularizer_const = 0.
        else:
            regularizer_const = .1 * const_multiplier

        _, __, l, acc, w_ = sess.run(
            [train_op, train_op_reg, loss, accuracy, w],
            feed_dict={x: x_batch, y: y_batch, gamma: regularizer_const,
                       w_pert: w_pert_, w_pert2: w_pert2_})
        if record_tensorboard:
            summ, _, __, l, acc, w_ = sess.run(
                [summary_op, train_op, train_op_reg, loss, accuracy, w],
                feed_dict={x: x_batch, y: y_batch, gamma: regularizer_const,
                           w_pert: w_pert_, w_pert2: w_pert2_})
        if record_tensorboard:
            summary_writer.add_summary(summ, i)

        lossval.append(l)
        accval.append(acc)

        if i == n_iters - n_fisher_iters:
            print('SAVING OPTIMAL ML WEIGHTS FROM END OF TRAINING')
            w_, w2_ = sess.run([w, w2])
        if i >= n_iters - n_fisher_iters and regularizer_const > 0.:
            w_pert_ = w_ + np.random.normal(mu, sigma, size=[784, 512])
            w_pert2_ = w2_ + np.random.normal(mu, sigma, size=[512, 10])
        if i == n_iters - 1:
            print('USING PERTURBATIONS ON WEIGHTS AT END OF ALL ITERATIONS')
            w_, w2_ = sess.run([w, w2])
            # w_pert_ = w_
            # w_pert2_ = w2_
            # w_pert_ = w_ + np.random.normal(mu, sigma, size=[784, 512])
            # w_pert2_ = w2_ + np.random.normal(mu, sigma, size=[512, 10])
        if i % 200 == 0:
            print('\nIteration: ' + str(i) + '\nAccuracy: ' + str(acc) + '\nLoss: ' + str(l) + '\n')

    regularizer_const = 0.
    # perturbed_test_set = mnist.test.images + np.random.normal(0., stddev, np.shape(mnist.test.images))
    w_pert_ = w_ + np.random.normal(mu, sigma, size=[784, 512])
    w_pert2_ = w2_ + np.random.normal(mu, sigma, size=[512, 10])

    x_testcv = mnist.test.images
    y_testcv = mnist.test.labels
    x_cv = x_testcv[0:5000, :]
    x_test = x_testcv[5000:, :]
    y_cv = y_testcv[0:5000, :]
    y_test = y_testcv[5000:, :]

    up_acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
    print('UNPERTURBED Test accuracy %g' % up_acc)

    sess.run(tf.assign(w, w_pert),
             feed_dict={gamma: regularizer_const, w_pert: w_pert_, w_pert2: w_pert2_})
    sess.run(tf.assign(w2, w_pert2_),
             feed_dict={gamma: regularizer_const, w_pert: w_pert_, w_pert2: w_pert2_})
    pert_acc = sess.run(accuracy,
                        feed_dict={x: mnist.test.images, y: mnist.test.labels,
                                   gamma: regularizer_const, w_pert: w_pert_,
                                   w_pert2: w_pert2_})
    # pert_acc = sess.run(accuracy, feed_dict={x: perturbed_test_set, y: mnist.test.labels})
    print('PERTURBED test accuracy %g' % pert_acc)
    # summary_writer.close()
    sess.close()
    return up_acc, pert_acc
def _param_updates(self) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    with tf.name_scope(self._name + '_param_updates'):
        # This is for the beta and rho updates
        self._jvpdz = tf.gradients(self._vjp, self._dummy_var,
                                   tf.stop_gradient(self._deltaz), name='jvpdz')[0]

        if self._diag_hessian_fn is not None:
            #self._hjvpdz = self._diag_hessian_fn(self._predictions_fn_tensor) * self._jvpdz
            self._hjvpdz = self._diag_hessian_fn_tensor * self._jvpdz
        else:
            #self._hjvpdz = tf.gradients(tf.gradients(self._loss_fn_tensor,
            #                                         self._predictions_fn_tensor)[0][None, :]
            #                            @ self._jvpdz[:, None], self._predictions_fn_tensor,
            #                            stop_gradients=self._jvpdz)[0]
            self._hjvpdz = _hessian_vector_product(
                ys=[self._loss_fn_tensor],
                xs=[self._predictions_fn_tensor],
                v=[self._jvpdz])[0]

        a11 = tf.reduce_sum(self._hjvpdz * self._jvpdz)
        a12 = tf.reduce_sum(self._jvpz * self._hjvpdz)
        a22 = tf.reduce_sum(self._jvpz * self._hjvpz)
        b1 = tf.reduce_sum(self._jloss * self._jvpdz)
        b2 = tf.reduce_sum(self._jloss * self._jvpz)

        a11 = a11 + tf.reduce_sum(self._deltaz * self._deltaz * self._damping_factor)
        a12 = a12 + tf.reduce_sum(self._deltaz * self._z * self._damping_factor)
        a22 = a22 + tf.reduce_sum(self._z * self._z * self._damping_factor)

        A = tf.stack([[a11, a12], [a12, a22]])
        b = tf.stack([b1, b2])

        # Cannot use vanilla matrix inverse because the matrix is sometimes singular
        #m_b = tf.reshape(tf.matrix_inverse(A) @ b[:, None], [-1])

        # I am using 1e-15 for rcond instead of the default value.
        # While this is a less robust choice, using a higher value of rcond seems to output
        # approximate inverse values which slow down the optimization significantly.
        # Instead, choosing a low value sometimes produces very bad outputs, but we can take
        # care of that using an additional update condition based on the change of the loss
        # function, by requiring that the loss function always decrease.
        def _two_by_two_pinv_sol():
            A_inv = tf.linalg.pinv(A, rcond=1e-15)
            m_b = tf.reshape(A_inv @ b[:, None], [-1])
            #with tf.control_dependencies([tf.print(m_b)]):
            #    m_b_0 = tf.clip_by_value(m_b[0], clip_value_min=1e-5, clip_value_max=1.0)
            #    m_b_1 = tf.clip_by_value(m_b[1], clip_value_min=-np.inf, clip_value_max=-1e-5)
            #    m_b = tf.stack([m_b_0, m_b_1])
            #m_b = tf.reshape(m_b, [-1])
            #m_b = tf.reshape(tf.linalg.lstsq(A, b[:, None], fast=False), [-1])
            #for i in range(10):
            #    db = A @ m_b[:, None] - b[:, None]
            #    m_db = tf.reshape(tf.linalg.lstsq(A, db, fast=False), [-1])
            #    m_b = m_b - m_db
            return m_b

        def _zero_z_sol():
            return tf.stack([b[0] / A[[0, 0]], 0.])

        m_b = tf.cond(tf.equal(b2, 0.), _zero_z_sol, _two_by_two_pinv_sol)
        beta = m_b[0]
        rho = -m_b[1]
        M = -0.5 * tf.reduce_sum(m_b * b)
        #with tf.control_dependencies([tf.print(M)]):
        #    M = M + 0.
        return beta, rho, M
for indicator, layer_wt, qlayer_wt in zip(regularize_list, trainable_weights, qweight_list):
    perturbation_list.append(tf.stop_gradient(layer_wt - qlayer_wt))

for indicator, perturbation_vec in zip(regularize_list, perturbation_list):
    if indicator:
        perturbations_for_hvp.append(perturbation_vec)

print('list of perturbation tensors')
print(perturbation_list)
# exit()

# Compute the hessian-vector product and the diagonal regularizer here
hvp_list = []
hvp_list = gradients_impl._hessian_vector_product(loss, regularized_weights,
                                                  perturbations_for_hvp)
hessian_vector_product = []
layer_diag_load_amt = []
for layer_hvp, layer_perturbation in zip(hvp_list, perturbations_for_hvp):
    layer_diag_load_amt.append(gamma * .1 * layer_perturbation)
for layer_hvp, diag_load_amt in zip(hvp_list, layer_diag_load_amt):
    hessian_vector_product.append(gamma * 2.0 * layer_hvp + diag_load_amt)

total_grads = []
i = 0
for indicator, layer_grad in zip(regularize_list, ce_grads):
    if indicator:
        total_grads.append(layer_grad + hessian_vector_product[i])
        # total_grads.append(layer_grad)  # + hessian_vector_product[i])
def get_acc_for_gaussian_perturbed_logistic_model_MNIST(
        mu, sigma=.1, const_multiplier=1., record_tensorboard=False):
    import tensorflow as tf
    from tensorflow.examples.tutorials.mnist import input_data
    from tensorflow.python.ops import gradients_impl
    import numpy as np

    tf.reset_default_graph()
    mnist = input_data.read_data_sets('/tmp/data', one_hot=True)

    x = tf.placeholder(tf.float32, shape=(None, 784), name='Inputs')
    y = tf.placeholder(tf.float32, shape=(None, 10), name='Labels')
    gamma = tf.placeholder(tf.float32, shape=(), name='reg_constant')

    nwts = 7840
    w = tf.get_variable(name='w', shape=[784, 10],
                        initializer=tf.contrib.layers.xavier_initializer())
    bias1 = tf.get_variable('bias1', shape=(10),
                            initializer=tf.random_normal_initializer(stddev=.1))

    w_pert = tf.placeholder(tf.float32, shape=(784, 10))
    # w_pert2 = tf.placeholder(tf.float32, shape=(512, 10))
    # 0.1000 0.1292 0.1668 0.2154 0.2783 0.3594 0.4642 0.5995 0.7743 1.0000
    perturbation = tf.stop_gradient(w - w_pert)

    logits = tf.matmul(x, w) + bias1
    y_ = tf.nn.softmax(logits)
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.stop_gradient(tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))

    optimizer = tf.train.AdamOptimizer()
    ce_grads = tf.gradients(loss, [w, bias1])

    if const_multiplier > 0.:
        print('USING REGULARIZATION')
        ce_grads_w1 = ce_grads[0]
        hvp1 = gradients_impl._hessian_vector_product(loss, [w], [perturbation])
        diag_load_amt1 = gamma * .01 * perturbation
        # reg_grad1 = gamma * 2.0 * hvp1 + diag_load_amt1
        reg_grad1 = diag_load_amt1
        reg_grad1 = tf.reshape(reg_grad1, tf.shape(w))
        tot_grads1 = ce_grads_w1 + reg_grad1
        tf.summary.histogram('regularizer_gradient1', reg_grad1)
        tf.summary.histogram('diagonal_load1', diag_load_amt1)
        tf.summary.histogram('ce_gradient1', ce_grads_w1)
        tf.summary.histogram('ce_gradient1_sq', tf.square(ce_grads_w1))
        tf.summary.scalar('loss_gamma', gamma)
        train_op = optimizer.apply_gradients(zip([tot_grads1, ce_grads[1]], [w, bias1]))
    else:
        print('NO REGULARIZATION')
        train_op = optimizer.apply_gradients(zip(ce_grads, [w, bias1]))

    n_iters = 5000
    batch_size = 512
    n_fisher_iters = 1000
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    tf.summary.histogram('weights1', w)
    tf.summary.histogram('pertweights1', w_pert)

    lossval = []
    accval = []
    if record_tensorboard:
        summary_writer = tf.summary.FileWriter('./logs/logistic_adam_v', sess.graph)
    summary_op = tf.summary.merge_all()
    sess.run(tf.global_variables_initializer())

    w_pert_ = np.zeros([784, 10])
    for i in range(0, n_iters):
        x_batch, y_batch = mnist.train.next_batch(batch_size)
        if i <= (n_iters - n_fisher_iters):
            regularizer_const = 0.
        else:
            regularizer_const = .1 * const_multiplier

        _, l, acc, w_ = sess.run(
            [train_op, loss, accuracy, w],
            feed_dict={x: x_batch, y: y_batch, gamma: regularizer_const, w_pert: w_pert_})
        if record_tensorboard:
            summ, _, l, acc, w_ = sess.run(
                [summary_op, train_op, loss, accuracy, w],
                feed_dict={x: x_batch, y: y_batch, gamma: regularizer_const, w_pert: w_pert_})
        if record_tensorboard:
            summary_writer.add_summary(summ, i)

        lossval.append(l)
        accval.append(acc)

        if i == n_iters - n_fisher_iters:
            print('SAVING OPTIMAL ML WEIGHTS FROM END OF TRAINING')
            w_ = sess.run([w])
        if i >= n_iters - n_fisher_iters and regularizer_const > 0.:
            w_pert_ = w_ + np.random.normal(mu, sigma, size=[784, 10])
        if i == n_iters - 1:
            print('USING PERTURBATIONS ON WEIGHTS AT END OF ALL ITERATIONS')
            w_ = sess.run([w])
        if i % 200 == 0:
            print('\nIteration: ' + str(i) + '\nAccuracy: ' + str(acc) + '\nLoss: ' + str(l) + '\n')

    regularizer_const = 0.
    w_pert_ = np.array(w_).reshape(784, 10) + np.random.normal(mu, sigma, size=[784, 10])

    x_testcv = mnist.test.images
    y_testcv = mnist.test.labels
    x_cv = x_testcv[0:5000, :]
    x_test = x_testcv[5000:, :]
    y_cv = y_testcv[0:5000, :]
    y_test = y_testcv[5000:, :]

    up_acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
    print('UNPERTURBED Test accuracy %g' % up_acc)

    sess.run(tf.assign(w, w_pert), feed_dict={gamma: regularizer_const, w_pert: w_pert_})
    pert_acc = sess.run(accuracy,
                        feed_dict={x: mnist.test.images, y: mnist.test.labels,
                                   gamma: regularizer_const, w_pert: w_pert_})
    # pert_acc = sess.run(accuracy, feed_dict={x: perturbed_test_set, y: mnist.test.labels})
    print('PERTURBED test accuracy %g' % pert_acc)
    # summary_writer.close()
    sess.close()
    return up_acc, pert_acc
def _param_updates(self) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor]:
    with tf.name_scope(self._name + '_param_updates'):
        # This is for the beta and rho updates
        ## I think the preconditioning cancels out during the matrix multiplication.
        ## But I am keeping this here in case it helps stabilize the matrix inverse.
        self._jvpdz = tf.gradients(self._vjp, self._dummy_var,
                                   tf.stop_gradient(self._deltaz), name='jvpdz')[0]

        if self._diag_hessian_fn is not None:
            self._hjvpdz = self._diag_hessian_fn_tensor * self._jvpdz
        else:
            self._hjvpdz = _hessian_vector_product(
                ys=[self._loss_fn_tensor],
                xs=[self._predictions_fn_tensor],
                v=[self._jvpdz])[0]

        v1 = tf.reduce_sum(self._diag_hessian_fn_tensor) / tf.reduce_sum(
            tf.abs(self._diag_hessian_fn_tensor))

        a110 = tf.reduce_sum(self._hjvpdz * self._jvpdz)
        a12 = tf.reduce_sum(self._jvpz * self._hjvpdz)
        a22 = tf.reduce_sum(self._jvpz * self._hjvpz)
        b1 = tf.reduce_sum(self._jloss * self._jvpdz)
        b2 = tf.reduce_sum(self._jloss * self._jvpz)

        a11 = a110 + tf.reduce_sum(self._deltaz * self._deltaz * self._damping_factor)
        a12 = a12 + tf.reduce_sum(self._deltaz * self._z * self._damping_factor)
        a22 = a22 + tf.reduce_sum(self._z * self._z * self._damping_factor)

        A = tf.stack([[a11, a12], [a12, a22]])
        b = tf.stack([b1, b2])

        # Cannot use vanilla matrix inverse because the matrix is sometimes singular
        # m_b = tf.reshape(tf.matrix_inverse(A) @ b[:, None], [-1])

        # I am using 1e-15 for rcond instead of the default value.
        # While this is a less robust choice, using a higher value of rcond seems to output
        # approximate inverse values which slow down the optimization significantly.
        # Instead, choosing a low value sometimes produces very bad outputs, but we can take
        # care of that using an additional update condition based on the change of the loss
        # function, by requiring that the loss function always decrease.
        def _two_by_two_pinv_sol():
            A_inv = tf.linalg.pinv(A, rcond=1e-15)
            m_b = tf.reshape(A_inv @ b[:, None], [-1])
            # m_b = tf.reshape(tf.linalg.lstsq(A, b[:, None], fast=False), [-1])
            # for i in range(2):
            #     db = A @ m_b[:, None] - b[:, None]
            #     m_db = tf.reshape(tf.linalg.lstsq(A, db, fast=False), [-1])
            #     m_b = m_b - m_db
            return m_b

        def _zero_z_sol():
            return tf.stack([b[0] / A[[0, 0]], 0.])

        m_b = tf.cond(tf.equal(b2, 0.), _zero_z_sol, _two_by_two_pinv_sol)
        beta = m_b[0]
        rho = -m_b[1]
        M = -0.5 * tf.reduce_sum(m_b * b)
        #dot_prod = tf.reduce_sum(self._grad * self._deltaz / tf.linalg.norm(self._grad) / tf.linalg.norm(self._deltaz))
        #with tf.control_dependencies(
        #    [tf.print('beta', beta, 'rho', rho, 'M', M, 'b2', b2, "b1", b1, "a110", a110,
        #              "a11", a11, "v1", v1, "dot_prod", dot_prod)]):
        #    M = M + 0.
        return beta, rho, M
def load_model(self, dataset="mnist", model_name="2-layer", activation="relu",
               model=None, batch_size=0, compute_slope=False, order=1):
    """
    model: if set to None, then load dataset with model_name. Otherwise use the model directly.
    dataset: mnist, cifar and imagenet. recommend to use mnist and cifar as a starting point.
    model_name: possible options are 2-layer, distilled, and normal
    """
    from setup_cifar import CIFAR, CIFARModel, TwoLayerCIFARModel
    from setup_mnist import MNIST, MNISTModel, TwoLayerMNISTModel
    from nlayer_model import NLayerModel
    from setup_imagenet import ImageNet, ImageNetModel

    # if set this to true, we will use the logit layer output instead of probability
    # the logit layer's gradients are usually larger and more stable
    output_logits = True
    self.dataset = dataset
    self.model_name = model_name

    if model is None:
        print('Loading model...')
        if dataset == "mnist":
            self.batch_size = 1024
            if model_name == "2-layer":
                model = TwoLayerMNISTModel("models/mnist_2layer", self.sess, not output_logits)
            elif model_name == "normal":
                if activation == "relu":
                    model = MNISTModel("models/mnist", self.sess, not output_logits)
                else:
                    print("activation = {}".format(activation))
                    model = MNISTModel("models/mnist_cnn_7layer_" + activation,
                                       self.sess, not output_logits, activation=activation)
                    time.sleep(5)
            elif model_name == "brelu":
                model = MNISTModel("models/mnist_brelu", self.sess, not output_logits,
                                   use_brelu=True)
            elif model_name == "distilled":
                model = MNISTModel("models/mnist-distilled-100", self.sess, not output_logits)
            else:
                # specify model parameters as N,M,opts
                model_params = model_name.split(",")
                if len(model_params) < 3:
                    raise (RuntimeError("incorrect model option" + model_name))
                numlayer = int(model_params[0])
                nhidden = int(model_params[1])
                modelfile = "models/mnist_{}layer_relu_{}_{}".format(numlayer, nhidden,
                                                                     model_params[2])
                print("loading", modelfile)
                model = NLayerModel([nhidden] * (numlayer - 1), modelfile)
        elif dataset == "cifar":
            self.batch_size = 1024
            if model_name == "2-layer":
                model = TwoLayerCIFARModel("models/cifar_2layer", self.sess, not output_logits)
            elif model_name == "normal":
                if activation == "relu":
                    model = CIFARModel("models/cifar", self.sess, not output_logits)
                else:
                    model = CIFARModel("models/cifar_cnn_7layer_" + activation,
                                       self.sess, not output_logits, activation=activation)
            elif model_name == "brelu":
                model = CIFARModel("models/cifar_brelu", self.sess, not output_logits,
                                   use_brelu=True)
            elif model_name == "distilled":
                model = CIFARModel("models/cifar-distilled-100", self.sess, not output_logits)
            else:
                # specify model parameters as N,M,opts
                model_params = model_name.split(",")
                if len(model_params) < 3:
                    raise (RuntimeError("incorrect model option" + model_name))
                numlayer = int(model_params[0])
                nhidden = int(model_params[1])
                modelfile = "models/cifar_{}layer_relu_{}_{}".format(numlayer, nhidden,
                                                                     model_params[2])
                print("loading", modelfile)
                model = NLayerModel([nhidden] * (numlayer - 1), modelfile,
                                    image_size=32, image_channel=3)
        elif dataset == "imagenet":
            self.batch_size = 32
            model = ImageNetModel(self.sess, use_softmax=not output_logits,
                                  model_name=model_name, create_prediction=False)
        else:
            raise (RuntimeError("dataset unknown"))

    #print("*** Loaded model successfully")
    self.model = model
    self.compute_slope = compute_slope
    if batch_size != 0:
        self.batch_size = batch_size

    ## placeholders: self.img, self.true_label, self.target_label
    # img is the placeholder for image input
    self.img = tf.placeholder(
        shape=[None, model.image_size, model.image_size, model.num_channels],
        dtype=tf.float32)
    # output is the output tensor of the entire network
    self.output = model.predict(self.img)

    # create the graph to compute gradient
    # get the desired true label and target label
    self.true_label = tf.placeholder(dtype=tf.int32, shape=[])
    self.target_label = tf.placeholder(dtype=tf.int32, shape=[])
    true_output = self.output[:, self.true_label]
    target_output = self.output[:, self.target_label]
    # get the difference
    self.objective = true_output - target_output
    # get the gradient (deprecated arguments)
    self.grad_op = tf.gradients(self.objective, self.img)[0]
    # compute gradient norm: (in computation graph, so is faster)
    grad_op_rs = tf.reshape(self.grad_op, (tf.shape(self.grad_op)[0], -1))
    self.grad_2_norm_op = tf.norm(grad_op_rs, axis=1)
    self.grad_1_norm_op = tf.norm(grad_op_rs, ord=1, axis=1)
    self.grad_inf_norm_op = tf.norm(grad_op_rs, ord=np.inf, axis=1)

    ### Lily: added Hessian-vector product calculation here for 2nd order bound:
    if order == 2:
        ## _hessian_vector_product(ys, xs, v): returns a list of tensors containing the product between the Hessian and v
        ## ys: a scalar value or a tensor or a list of tensors to be summed to yield a scalar
        ## xs: a list of tensors that we should construct the Hessian over
        ## v: a list of tensors with the same shape as xs that we want to multiply by the Hessian

        # self.randv: shape = (Nimg,28,28,1) (the v in _hessian_vector_product)
        self.randv = tf.placeholder(
            shape=[None, model.image_size, model.image_size, model.num_channels],
            dtype=tf.float32)
        # hv_op_tmp: shape = (Nimg,28,28,1) for mnist, same as self.img (the xs in _hessian_vector_product)
        hv_op_tmp = gradients_impl._hessian_vector_product(
            self.objective, [self.img], [self.randv])[0]
        # hv_op_rs: reshape hv_op_tmp to hv_op_rs whose shape = (Nimg, 784) for mnist
        hv_op_rs = tf.reshape(hv_op_tmp, (tf.shape(hv_op_tmp)[0], -1))
        # self.hv_norm_op: norm of hessian vector product, keep shape = (Nimg,1) using keepdims
        self.hv_norm_op = tf.norm(hv_op_rs, axis=1, keepdims=True)
        # hv_op_rs_normalize: normalize Hv to Hv/||Hv||, shape = (Nimg, 784)
        hv_op_rs_normalize = hv_op_rs / self.hv_norm_op
        # self.hv_op: reshape hv_op_rs_normalize to shape = (Nimg,28,28,1)
        self.hv_op = tf.reshape(hv_op_rs_normalize, tf.shape(hv_op_tmp))

        ## reshape randv and compute its norm
        # shape: (Nimg, 784)
        randv_rs = tf.reshape(self.randv, (tf.shape(self.randv)[0], -1))
        # shape: (Nimg,)
        self.randv_norm_op = tf.norm(randv_rs, axis=1)
        ## compute v'Hv: use un-normalized Hv (hv_op_tmp, hv_op_rs)
        # element-wise multiplication and then sum over axis = 1 (now shape: (Nimg,))
        self.vhv_op = tf.reduce_sum(tf.multiply(randv_rs, hv_op_rs), axis=1)
        ## compute Rayleigh quotient: v'Hv/v'v (estimated largest eigenvalue), shape: (Nimg,)
        # note: self.vhv_op and self.randv_norm_op have to be in the same dimension (either (Nimg,) or (Nimg,1))
        self.eig_est = self.vhv_op / tf.square(self.randv_norm_op)

        ## Lily added the tf.while to compute the eigenvalue in the computational graph later
        # cond for computing largest abs/neg eigenvalue
        def cond(it, randv, eig_est, eig_est_prev, tfconst):
            norm_diff = tf.norm(eig_est - eig_est_prev, axis=0)
            return tf.logical_and(it < 500, norm_diff > 0.001)

        # compute largest abs eigenvalue: tfconst = 0
        # compute largest neg eigenvalue: tfconst = 10
        def body(it, randv, eig_est, eig_est_prev, tfconst):
            #hv_op_tmp = gradients_impl._hessian_vector_product(self.objective, [self.img], [randv])[0]-10*randv
            hv_op_tmp = gradients_impl._hessian_vector_product(
                self.objective, [self.img], [randv])[0] - tf.multiply(tfconst, randv)
            hv_op_rs = tf.reshape(hv_op_tmp, (tf.shape(hv_op_tmp)[0], -1))
            hv_norm_op = tf.norm(hv_op_rs, axis=1, keepdims=True)
            hv_op_rs_normalize = hv_op_rs / hv_norm_op
            hv_op = tf.reshape(hv_op_rs_normalize, tf.shape(hv_op_tmp))
            randv_rs = tf.reshape(randv, (tf.shape(randv)[0], -1))
            randv_norm_op = tf.norm(randv_rs, axis=1)
            vhv_op = tf.reduce_sum(tf.multiply(randv_rs, hv_op_rs), axis=1)
            eig_est_prev = eig_est
            eig_est = vhv_op / tf.square(randv_norm_op)
            return (it + 1, hv_op, eig_est, eig_est_prev, tfconst)

        it = tf.constant(0)
        # compute largest abs eigenvalue
        result = tf.while_loop(
            cond, body, [it, self.randv, self.vhv_op, self.eig_est, tf.constant(0.0)])
        # compute largest neg eigenvalue
        self.shiftconst = tf.placeholder(shape=(), dtype=tf.float32)
        result_1 = tf.while_loop(
            cond, body, [it, self.randv, self.vhv_op, self.eig_est, self.shiftconst])

        # computing largest abs eig value and save result
        self.it = result[0]
        self.while_hv_op = result[1]
        self.while_eig = result[2]

        # computing largest neg eig value and save result
        self.it_1 = result_1[0]
        #self.while_eig_1 = tf.add(result_1[2], tfconst)
        self.while_eig_1 = tf.add(result_1[2], result_1[4])

        show_tensor_op = False
        if show_tensor_op:
            print("====================")
            print("Define hessian_vector_product operator: ")
            print("hv_op_tmp = {}".format(hv_op_tmp))
            print("hv_op_rs = {}".format(hv_op_rs))
            print("self.hv_norm_op = {}".format(self.hv_norm_op))
            print("hv_op_rs_normalize = {}".format(hv_op_rs_normalize))
            print("self.hv_op = {}".format(self.hv_op))
            print("self.grad_op = {}".format(self.grad_op))
            print("randv_rs = {}".format(randv_rs))
            print("self.randv_norm_op = {}".format(self.randv_norm_op))
            print("self.vhv_op = {}".format(self.vhv_op))
            print("self.eig_est = {}".format(self.eig_est))
            print("====================")

    return self.img, self.output
def adam_fn(loss=loss, timestep=t, conv_layer_list=conv_layer_list, fc_layer_list=fc_layer_list):
    new_t = timestep.assign(timestep + 1)

    conv_wgrad_fisher_gradient = []
    fc_wgrad_fisher_gradient = []

    # get list of conv layer binarized wts, biases; here "wgrad" refers to either the weights
    # or the binarized weights
    conv_layer_wgrad_list = []
    conv_layer_bias_list = []
    for convlayer in conv_layer_list:
        conv_layer_bias_list.append(convlayer.bias)
        if convlayer.binary:
            conv_layer_wgrad_list.append(convlayer.wb)
        else:
            conv_layer_wgrad_list.append(convlayer.weight)
        if convlayer.fisher and convlayer.is_binary:
            conv_wgrad_fisher_gradient.append(
                gamma * convlayer.fisherconst * 2.0 *
                gradients_impl._hessian_vector_product(
                    loss, [convlayer.wb], [convlayer.perturbation]))
        elif convlayer.fisher:
            conv_wgrad_fisher_gradient.append(
                gamma * convlayer.fisherconst * 2.0 *
                gradients_impl._hessian_vector_product(
                    loss, [convlayer.weight], [convlayer.perturbation]))
        else:
            conv_wgrad_fisher_gradient.append(0.)

    # get list of fc layer binarized wts, biases
    fc_layer_wgrad_list = []
    fc_layer_bias_list = []
    # for fclayer in fc_layer_list:
    #     fc_layer_wgrad_list.append(fclayer.wb)
    #     fc_layer_bias_list.append(fclayer.bias)
    for fclayer in fc_layer_list:
        fc_layer_bias_list.append(fclayer.bias)
        if fclayer.binary:
            fc_layer_wgrad_list.append(fclayer.wb)
        else:
            fc_layer_wgrad_list.append(fclayer.weight)
        if fclayer.fisher and fclayer.binary:
            fc_wgrad_fisher_gradient.append(
                gamma * fclayer.fisherconst * 2.0 *
                gradients_impl._hessian_vector_product(
                    loss, [fclayer.wb], [fclayer.perturbation]) +
                gamma * fclayer.fisherconst * 2.0 * fclayer.perturbation)
        elif fclayer.fisher:
            fc_wgrad_fisher_gradient.append(
                gamma * fclayer.fisherconst * 0.05 *
                gradients_impl._hessian_vector_product(
                    loss, [fclayer.weight], [fclayer.perturbation]) +
                gamma * fclayer.fisherconst * 0.05 * fclayer.perturbation)
        else:
            fc_wgrad_fisher_gradient.append(0.)

    print(fc_layer_wgrad_list)
    print(len(fc_layer_wgrad_list))
    # exit()

    # Calculate gradients wrt conv layer wb
    conv_layer_wgrad_grads = tf.gradients(loss, conv_layer_wgrad_list)
    # Calculate gradients wrt fc layer wb
    fc_layer_wgrad_grads = tf.gradients(loss, fc_layer_wgrad_list)
    # Calculate gradients wrt conv layer bias
    conv_layer_bias_grads = tf.gradients(loss, conv_layer_bias_list)
    # Calculate gradients wrt fc layer bias
    fc_layer_bias_grads = tf.gradients(loss, fc_layer_bias_list)

    conv_layer_w_gradient_tot = []
    fc_layer_w_gradient_tot = []
    i = 0
    for conv_w_grad, conv_w_fisher_grad in zip(conv_layer_wgrad_grads,
                                               conv_wgrad_fisher_gradient):
        conv_layer_w_gradient_tot.append(conv_w_grad + conv_w_fisher_grad)
        if record_tensorboard:
            tf.summary.histogram('conv_grads_layer' + str(i), conv_w_grad)
            tf.summary.histogram('conv_fishergrad_layer' + str(i), conv_w_fisher_grad)
        i += 1
    for fc_w_grad, fc_w_fisher_grad in zip(fc_layer_wgrad_grads, fc_wgrad_fisher_gradient):
        fc_layer_w_gradient_tot.append(fc_w_grad + fc_w_fisher_grad)
        if record_tensorboard:
            tf.summary.histogram('fc_grads_layer' + str(i), fc_w_grad)
            tf.summary.histogram('fc_fishergrad_layer' + str(i), fc_w_fisher_grad)
        i += 1

    # FOR CONV LAYERS:
    new_conv_m_wgrad = []
    new_conv_v_wgrad = []
    new_conv_m_b = []
    new_conv_v_b = []
    new_fc_m_wgrad = []
    new_fc_v_wgrad = []
    new_fc_m_b = []
    new_fc_v_b = []

    # Calculate m and v from adam for the wts
    for grad, layer in zip(conv_layer_w_gradient_tot, conv_layer_list):
        # new_conv_m_wgrad.append(layer.m_w.assign(tf.squeeze(beta1 * layer.m_w + (1 - beta1) * grad)))
        new_conv_m_wgrad.append(
            layer.m_w.assign(beta1 * layer.m_w + (1 - beta1) * grad))
        # new_conv_v_wgrad.append(layer.v_w.assign(tf.squeeze(beta2 * layer.v_w + (1 - beta2) * grad ** 2)))
        new_conv_v_wgrad.append(
            layer.v_w.assign(beta2 * layer.v_w + (1 - beta2) * grad**2))

    # Calculate m and v from adam for the biases
    for grad, layer in zip(conv_layer_bias_grads, conv_layer_list):
        new_conv_m_b.append(layer.m_b.assign(beta1 * layer.m_b + (1 - beta1) * grad))
        new_conv_v_b.append(layer.v_b.assign(beta2 * layer.v_b + (1 - beta2) * grad**2))

    # FOR FC LAYERS:
    for grad, layer in zip(fc_layer_w_gradient_tot, fc_layer_list):
        new_fc_m_wgrad.append(
            layer.m_w.assign(tf.squeeze(beta1 * layer.m_w + (1 - beta1) * grad)))
        new_fc_v_wgrad.append(
            layer.v_w.assign(tf.squeeze(beta2 * layer.v_w + (1 - beta2) * grad**2)))

    # Calculate m and v from adam for the biases
    for grad, layer in zip(fc_layer_bias_grads, fc_layer_list):
        new_fc_m_b.append(layer.m_b.assign(beta1 * layer.m_b + (1 - beta1) * grad))
        new_fc_v_b.append(layer.v_b.assign(beta2 * layer.v_b + (1 - beta2) * grad**2))

    # CALCULATE UPDATES:
    conv_updates_wgrad = []
    conv_updates_bias = []
    fc_updates_wgrad = []
    fc_updates_bias = []

    # For conv layer wts
    for m, v in zip(new_conv_m_wgrad, new_conv_v_wgrad):
        conv_updates_wgrad.append(m / (tf.sqrt(v) + epsilon))
    # For conv layer bias
    for m, v in zip(new_conv_m_b, new_conv_v_b):
        conv_updates_bias.append(m / (tf.sqrt(v) + epsilon))
    # For FC layer wts
    for m, v in zip(new_fc_m_wgrad, new_fc_v_wgrad):
        fc_updates_wgrad.append(m / (tf.sqrt(v) + epsilon))
    # For FC layer bias
    for m, v in zip(new_fc_m_b, new_fc_v_b):
        fc_updates_bias.append(m / (tf.sqrt(v) + epsilon))

    return conv_updates_wgrad, conv_updates_bias, fc_updates_wgrad, fc_updates_bias, new_t