def test_hv_with_builtin():
    iris = load_iris()
    x = tf.placeholder(tf.float32, name='x')
    y = tf.placeholder(tf.float32, name='y')
    model = LinearModel(x, 4, 3)

    net_w, net_out = vectorize_model(model.var_list, model.inp[-1])
    v = tf.constant(np.ones(net_w.tensor.get_shape()), dtype=tf.float32)  # vector of ones of the right shape

    ce_builtin = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=net_out, labels=y)
    )  # this is the builtin function advertised in tensorflow for computing cross-entropy loss with softmax output

    ce_standard = tf.reduce_mean(
        -tf.reduce_sum(y * tf.log(tf.nn.softmax(net_out)), reduction_indices=[1])  # this is the standard CE loss
    )

    hvp_builtin = hvp(ce_builtin, net_w.tensor, v)  # WITH PREVIOUS VERSIONS (r0.11) WAS 0. NOW RAISES ERROR
    # UPDATE r1.2: now it's working!
    hessian_builtin = tf.hessians(ce_builtin, net_w.tensor)[0]

    hvp_standard = hvp(ce_standard, net_w.tensor, v)
    hessian_standard = tf.hessians(ce_standard, net_w.tensor)[0]

    def training_supplier():
        return {x: iris.train.data, y: iris.train.target}

    ts = tf.train.GradientDescentOptimizer(.1).minimize(ce_standard, var_list=model.var_list)

    with tf.Session().as_default() as ss:
        tf.global_variables_initializer().run()

        print('builtin, standard:',
              ss.run([ce_builtin, ce_standard], feed_dict=training_supplier()))

        for _ in range(2000):
            ts.run(feed_dict=training_supplier())

        print('builtin',
              ss.run([hvp_builtin, hessian_builtin], feed_dict=training_supplier()))  # output is wrongly 0.
        print('standard',
              ss.run([hvp_standard, hessian_standard], feed_dict=training_supplier()))
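# A sketch of an explicit consistency check that could be appended to the test above,
# assuming the vectorized weights net_w.tensor are 1-D so that tf.hessians returns an
# [n, n] matrix and hvp returns an [n] vector (hypothetical additions, not part of the
# original test; they reuse the tensors defined in it):
explicit_hvp_standard = tf.reshape(
    tf.matmul(hessian_standard, tf.reshape(v, [-1, 1])), [-1])
hvp_mismatch = tf.reduce_max(
    tf.abs(explicit_hvp_standard - tf.reshape(hvp_standard, [-1])))
# ss.run(hvp_mismatch, feed_dict=training_supplier()) should be close to zero; running
# the same check against ce_builtin shows whether the builtin softmax cross-entropy op
# propagates correct second derivatives.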
def _test_hv(self, param_optimizer, debug_jac=False, iterations=100):
    tf.set_random_seed(0)
    np.random.seed(0)

    iris, x, y, model, model_w, model_y, error, accuracy = iris_logistic_regression(
        param_optimizer.get_augmentation_multiplier())

    eta = tf.Variable(.001, name='eta')
    dyn = param_optimizer.create(model_w, eta, loss=error, _debug_jac_z=debug_jac)
    # rho = tf.Variable([.1, .01], name='rho')
    # tr_error = error
    # # + rho[0]*tf.reduce_sum(model_w.tensor**2) \
    # # + rho[1]*tf.abs(tf.reduce_sum(model_w.tensor))

    tr_sup = lambda s=None: {x: iris.train.data, y: iris.train.target}

    from rfho.utils import hvp
    z = tf.ones(model_w.get_shape())
    hv = hvp(error, model_w.tensor, z)

    with tf.Session().as_default() as ss:
        tf.global_variables_initializer().run()
        for t in range(iterations):
            ss.run(dyn.assign_ops, feed_dict=tr_sup())
        return hv.eval(feed_dict=tr_sup())
def _jac_z(z):
    return ZMergedMatrix(
        hvp(
            integral,
            w,  # MergedVariable.get_tensor(w),
            z.tensor))
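# Note (an inferred reading, not stated in the code): hvp(integral, w, z) computes the
# Hessian of `integral` w.r.t. w applied to z. If `integral` is defined elsewhere so that
# its gradient equals the update map of the dynamics, e.g. for plain gradient descent
#     integral = 0.5 * ||w||^2 - lr * loss   =>   grad(integral) = w - lr * grad(loss),
# then this single hvp call returns the Jacobian-vector product of the dynamics,
#     d(w - lr * grad(loss))/dw . z = z - lr * H z,
# where H is the Hessian of the loss w.r.t. w.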
def jac_z(z):
    r, u = z.var_list(Vl_Mode.TENSOR)

    assert loss is not None, 'Should specify loss to use jac_z'

    hessian_r_product = hvp(loss=loss, w=w_base, v=r)
    print('hessian_r_product', hessian_r_product)

    res = [
        r - lr * mu * u - lr * hessian_r_product,
        hessian_r_product + mu * u
    ]
    print('res', res)

    return ZMergedMatrix(res)
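# The two rows of `res` are consistent with heavy-ball / momentum dynamics
#     w_{k+1} = w_k - lr * (mu * m_k + grad(loss)(w_k)),
#     m_{k+1} = mu * m_k + grad(loss)(w_k).
# Differentiating w.r.t. (w, m) and applying the result to z = (r, u) gives
#     [ (I - lr * H) r - lr * mu * u ,  H r + mu * u ],
# which matches `res`, with H r computed once as `hessian_r_product`. (This is a
# reading of the code; the exact dynamics are defined elsewhere in the optimizer.)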
def _jac_z(z):
    if _debug_jac_z:  # this would likely take a very long time to compile for large systems
        d = dynamics.get_shape().as_list()[0]
        d2 = d // 2

        jac_1_1 = tf.stack([tf.gradients(w_base_k[i], w_base)[0] for i in range(d2)])
        jac_2_1 = tf.stack([tf.gradients(m_k[i], w_base)[0] for i in range(d2)])
        # jac_1 = tf.concat([jac_1_1, jac_2_1], axis=0)

        jac_1_2 = tf.stack([tf.gradients(w_base_k[i], m)[0] for i in range(d2)])
        jac_2_2 = tf.stack([tf.gradients(m_k[i], m)[0] for i in range(d2)])
        # jac_2 = tf.concat([jac_1_2, jac_2_2], axis=0)
        #
        # jac = tf.concat([jac_1, jac_2], axis=1, name='Jacobian')
        # mul = tf.matmul(jac, z.tensor)
        #
        # return ZMergedMatrix([
        #     mul[:d2, :],
        #     mul[d2, :]
        # ])

        r, u = z.var_list(VlMode.TENSOR)
        return ZMergedMatrix([
            tf.matmul(jac_1_1, r) + tf.matmul(jac_1_2, u),
            tf.matmul(jac_2_1, r) + tf.matmul(jac_2_2, u)
        ])
    else:
        r, u = z.var_list(VlMode.TENSOR)

        assert loss is not None, 'Should specify loss to use jac_z'

        hessian_r_product = hvp(loss=loss, w=w_base, v=r)
        # print('hessian_r_product', hessian_r_product)

        res = [
            r - lr * mu * u - lr * hessian_r_product,
            hessian_r_product + mu * u
        ]
        return ZMergedMatrix(res)
def test_hvp(self):
    """
    Test for Hessian-vector product
    :return:
    """
    print('test 1')
    d = 20

    x = tf.Variable(tf.random_normal([d]))
    # noinspection PyTypeChecker
    fx = 3 * tf.reduce_sum(x ** 3)
    vec = tf.Variable(tf.ones([d]))

    res = hvp(fx, x, vec)

    with tf.Session().as_default() as ss:
        ss.run(tf.global_variables_initializer())
        hessian = 18. * np.eye(d) * ss.run(x)
        self.assertLess(
            np.linalg.norm(ss.run(res) - hessian.dot(ss.run(vec))), 1e-5)
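# For reference, a minimal double-backprop implementation of a Hessian-vector product;
# rfho.utils.hvp may differ in details (e.g. handling of merged/vectorized variables),
# so this is a sketch rather than the library's code. In the test above,
# fx = 3 * sum(x^3) has Hessian diag(18 * x), hence the expected value
# 18. * np.eye(d) * ss.run(x).
import tensorflow as tf

def hvp_sketch(loss, w, v):
    """Return H v, where H is the Hessian of `loss` w.r.t. `w`, via H v = d/dw <grad(loss), v>."""
    grad = tf.gradients(loss, w)[0]
    return tf.gradients(tf.reduce_sum(grad * tf.stop_gradient(v)), w)[0]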
def test_hv_matrix(self):
    """
    Test for Hessian-vector product with a matrix of vectors
    :return:
    """
    print('test 2')
    d = 20

    x = tf.Variable(tf.random_normal([d]))
    # noinspection PyTypeChecker
    fx = 3 * tf.reduce_sum(x ** 3)
    vec = tf.Variable(tf.ones([d, 2]))

    res = tf.stack([
        hvp(fx, x, vec[:, k])
        for k in range(vec.get_shape().as_list()[1])
    ], axis=1)

    with tf.Session().as_default() as ss:
        ss.run(tf.global_variables_initializer())
        hessian = np.eye(d) * ss.run(x) * 18.
        self.assertLess(
            np.linalg.norm(ss.run(res) - hessian.dot(ss.run(vec))), 1e-5)
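# Stacking the per-column products [H v_1, ..., H v_k] along axis 1, as above, yields
# the same matrix as H V without ever materializing the d x d Hessian, which is why the
# result can be compared directly against hessian.dot(ss.run(vec)).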
def _jac_z(z):
    if _debug_jac_z:  # this would likely take a very long time to compile for large systems
        d = dynamics.get_shape().as_list()[0] // 3

        r, u, s = z.var_list(VlMode.TENSOR)

        j11 = tf.stack([tf.gradients(w_base_k[i], w_base)[0] for i in range(d)])
        j12 = tf.stack([tf.gradients(w_base_k[i], m)[0] for i in range(d)])
        j13 = tf.stack([tf.gradients(w_base_k[i], v)[0] for i in range(d)])
        j1 = tf.concat([j11, j12, j13], axis=1)
        jz1 = tf.matmul(j11, r) + tf.matmul(j12, u) + tf.matmul(j13, s)

        # second block
        j21 = tf.stack([tf.gradients(m_k[i], w_base)[0] for i in range(d)])
        j22 = tf.stack([tf.gradients(m_k[i], m)[0] for i in range(d)])
        j23 = tf.stack([tf.gradients(m_k[i], v)[0] for i in range(d)])
        j2 = tf.concat([j21, j22, j23], axis=1)
        jz2 = tf.matmul(j21, r) + tf.matmul(j22, u) + tf.matmul(j23, s)

        # third block
        j31 = tf.stack([tf.gradients(v_k[i], w_base)[0] for i in range(d)])
        j32 = tf.stack([tf.gradients(v_k[i], m)[0] for i in range(d)])
        j33 = tf.stack([tf.gradients(v_k[i], v)[0] for i in range(d)])
        j3 = tf.concat([j31, j32, j33], axis=1)
        jz3 = tf.matmul(j31, r) + tf.matmul(j32, u) + tf.matmul(j33, s)

        tf.concat([j1, j2, j3], axis=0, name='Jacobian')

        return ZMergedMatrix([jz1, jz2, jz3])
    else:
        assert loss is not None, 'Should specify loss to use jac_z'

        r, u, s = z.var_list(VlMode.TENSOR)

        with tf.name_scope('Jac_Z'):
            hessian_r_product = hvp(loss=loss, w=w_base, v=r, name='hessian_r_product')
            # hessian_r_product = hvp(loss=loss, w=w.tensor, v=z.tensor, name='hessian_r_product')[:d, :d]

            j_11_r_tilde = l_diag_mul(pre_j_11_out, hessian_r_product, name='j_11_r_tilde')
            j_11_r = tf.identity(j_11_r_tilde + r, 'j_11_r')

            j_12_u_hat = tf.identity(-lr_k * beta1 / v_tilde_k, name='j_12_u_hat')
            j_12_u = l_diag_mul(j_12_u_hat, u, name='j_12_u')

            j_13_s_hat = tf.identity(lr_k * beta2 * m_k / (2 * v_k_eps_32), name='j_13_s_hat')
            j_13_s = l_diag_mul(j_13_s_hat, s, name='j_13_s')

            jac_z_1 = tf.identity(j_11_r + j_12_u + j_13_s, name='jac_z_1')
            # end of first block

            j_21_r = tf.identity((1. - beta1) * hessian_r_product, name='j_21_r')
            j_22_u = tf.identity(beta1 * u, name='j_22_u')
            # j_23_s = tf.zeros_like(s)  # would be zero

            jac_z_2 = tf.identity(j_21_r + j_22_u, name='jac_z_2')
            # end of second block

            j_31_r = l_diag_mul(pre_j_31_out, hessian_r_product, name='j_31_r')
            # j_32_u = tf.zeros_like(u)  # would be zero
            j_33_s = tf.identity(beta2 * s, name='j_33_s')
            jac_z_3 = tf.identity(j_31_r + j_33_s, name='jac_z_3')

            res = [jac_z_1, jac_z_2, jac_z_3]
            # print('res', res)

            return ZMergedMatrix(res)
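# Block structure of the Jacobian-vector product computed in the closed-form branch
# above, with z = (r, u, s) (read directly from the code; lr_k, beta1, beta2, m_k,
# v_tilde_k, v_k_eps_32, pre_j_11_out and pre_j_31_out are defined elsewhere in the
# optimizer):
#     jac_z_1 = (I + diag(pre_j_11_out) H) r
#               + diag(-lr_k * beta1 / v_tilde_k) u
#               + diag(lr_k * beta2 * m_k / (2 * v_k_eps_32)) s
#     jac_z_2 = (1 - beta1) H r + beta1 * u          # J_23 block is zero
#     jac_z_3 = diag(pre_j_31_out) H r + beta2 * s   # J_32 block is zero
# where H r is the single Hessian-vector product `hessian_r_product`; the vanishing
# (2,3) and (3,2) blocks correspond to the commented-out tf.zeros_like lines.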