def test_hv_with_builtin():
    iris = load_iris()
    x = tf.placeholder(tf.float32, name='x')
    y = tf.placeholder(tf.float32, name='y')
    model = LinearModel(x, 4, 3)
    net_w, net_out = vectorize_model(model.var_list, model.inp[-1])
    # Vector of ones with the same shape as the flattened weights.
    v = tf.constant(np.ones(net_w.tensor.get_shape()), dtype=tf.float32)

    # Built-in TensorFlow op for cross-entropy loss with a softmax output.
    ce_builtin = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=net_out, labels=y))
    # Cross-entropy loss written out explicitly.
    ce_standard = tf.reduce_mean(
        -tf.reduce_sum(y * tf.log(tf.nn.softmax(net_out)), reduction_indices=[1]))

    # With previous versions (r0.11) this was 0, later it raised an error;
    # as of r1.2 it works.
    hvp_builtin = hvp(ce_builtin, net_w.tensor, v)
    hessian_builtin = tf.hessians(ce_builtin, net_w.tensor)[0]

    hvp_standard = hvp(ce_standard, net_w.tensor, v)
    hessian_standard = tf.hessians(ce_standard, net_w.tensor)[0]

    def training_supplier():
        return {x: iris.train.data, y: iris.train.target}

    ts = tf.train.GradientDescentOptimizer(.1).minimize(
        ce_standard, var_list=model.var_list)

    with tf.Session().as_default() as ss:
        tf.global_variables_initializer().run()
        print('builtin, standard:',
              ss.run([ce_builtin, ce_standard], feed_dict=training_supplier()))
        for _ in range(2000):
            ts.run(feed_dict=training_supplier())
        # On old versions this output was wrongly 0.
        print('builtin',
              ss.run([hvp_builtin, hessian_builtin], feed_dict=training_supplier()))
        print('standard',
              ss.run([hvp_standard, hessian_standard], feed_dict=training_supplier()))
def gradient(ckpt_file_path):
    detection_graph = tf.Graph()
    with tf.Session(graph=detection_graph) as sess:
        saver = tf.train.import_meta_graph(ckpt_file_path)
        saver.restore(sess, "../model/normal_cifar/normal_Cifar-19900")
        graph = tf.get_default_graph()
        x_input = graph.get_tensor_by_name('x:0')
        y_input = graph.get_tensor_by_name('y:0')
        keep_prob = graph.get_tensor_by_name('keep_prob:0')
        logits = graph.get_tensor_by_name('logits:0')

        max_index = tf.argmax(logits[0])
        gradient_op = tf.gradients(logits[0][max_index], x_input)
        hess_op = tf.hessians(logits[0][max_index], x_input)

        # xx, j, y and save() are assumed to be defined in the enclosing module.
        gradient = sess.run(gradient_op,
                            {x_input: xx[j], y_input: y, keep_prob: 1})
        hess_2 = sess.run(hess_op,
                          {x_input: xx[j], y_input: y, keep_prob: 1})

        gradient = np.array(gradient).reshape(1, 3072)
        hess_2 = np.array(hess_2).reshape(3072, 3072)
        save(gradient, "(1).xlsx")
        save(hess_2, "(2).xlsx")
def test_natural_gradient(self):
    """Test random natural gradient cases."""
    with tf.Graph().as_default():
        with tf.Session() as sess:
            for size in range(3, 9):
                dist = NaturalSoftmax(size, epsilon=0)
                softmax = CategoricalSoftmax(size)
                param_row = tf.constant(np.random.normal(size=(size,)), dtype=tf.float64)
                params = tf.stack([param_row])
                one_hot = np.zeros((1, size))
                one_hot[0, 1] = 1
                samples = tf.constant(one_hot, dtype=tf.float64)

                kl_div = softmax.kl_divergence(tf.stop_gradient(params), params)
                hessian = sess.run(tf.hessians(kl_div, param_row)[0])
                gradient = sess.run(
                    tf.gradients(softmax.log_prob(params, samples), params)[0][0])
                expected = np.matmul(np.array([gradient]), np.linalg.pinv(hessian))[0]
                actual = sess.run(
                    tf.gradients(dist.log_prob(params, samples), params)[0][0])
                self.assertTrue(np.allclose(actual, expected))
def _log_likelihood(self, i_batch, dsetname, data_tensor, batch_info,
                    omit_grads=tuple(), second_order=False, **params):
    # Stack the params to create a single node to differentiate with respect to.
    grad_par_stack = tf.stack(
        [params[k] for k in self.param_names if k not in omit_grads])

    # Retrieve individual params from the stacked node,
    # then add back the params we do not differentiate w.r.t.
    params_unstacked = dict(zip(
        [x for x in self.param_names if x not in omit_grads],
        tf.unstack(grad_par_stack)))
    for k in omit_grads:
        params_unstacked[k] = params[k]

    # Forward computation
    ll = self._log_likelihood_inner(i_batch, params_unstacked, dsetname,
                                    data_tensor, batch_info)

    # Autodifferentiation. This is why we use tensorflow:
    grad = tf.gradients(ll, grad_par_stack)[0]
    if second_order:
        return ll, grad, tf.hessians(ll, grad_par_stack)[0]
    return ll, grad, None
def constrained_bestfit(self, objective, constrained_mu, data, pdf, init_pars, par_bounds):
    # Build the graph.
    data = self.tb.astensor(data)
    nuis_pars = [
        self.tb.astensor([p])
        for i, p in enumerate(init_pars)
        if i != pdf.config.poi_index
    ]
    poi_par = self.tb.astensor([constrained_mu])
    nuis_cat = self.tb.concatenate(nuis_pars)
    pars = self.tb.concatenate([nuis_cat[:0], poi_par, nuis_cat[0:]])
    objective = objective(pars, data, pdf)

    hessian = tf.hessians(objective, nuis_cat)[0]
    gradient = tf.gradients(objective, nuis_cat)[0]
    invhess = tf.linalg.inv(hessian)
    update = tf.transpose(
        tf.matmul(invhess, tf.transpose(tf.stack([gradient]))))[0]

    # Run Newton's method.
    best_fit_nuis = [
        x for i, x in enumerate(init_pars) if i != pdf.config.poi_index
    ]
    for i in range(1000):
        up = self.tb.session.run(update, feed_dict={nuis_cat: best_fit_nuis})
        best_fit_nuis = best_fit_nuis - up
        if np.abs(np.max(up)) < 1e-4:
            break

    best_fit = best_fit_nuis.tolist()
    best_fit.insert(pdf.config.poi_index, constrained_mu)
    return best_fit
def initialize(self, *args, **kwargs):
    # Store latent variables in a temporary attribute; MAP will
    # optimize `PointMass` random variables, which subsequently
    # optimizes mean parameters of the normal approximations.
    latent_vars_normal = self.latent_vars.copy()
    self.latent_vars = {z: PointMass(params=qz.loc)
                        for z, qz in six.iteritems(latent_vars_normal)}

    super(Laplace, self).initialize(*args, **kwargs)

    hessians = tf.hessians(self.loss, list(six.itervalues(self.latent_vars)))
    self.finalize_ops = []
    for z, hessian in zip(six.iterkeys(self.latent_vars), hessians):
        qz = latent_vars_normal[z]
        if isinstance(qz, (MultivariateNormalDiag, Normal)):
            scale_var = get_variables(qz.variance())[0]
            scale = 1.0 / tf.diag_part(hessian)
        else:  # qz is MultivariateNormalTriL
            scale_var = get_variables(qz.covariance())[0]
            scale = tf.matrix_inverse(tf.cholesky(hessian))

        self.finalize_ops.append(scale_var.assign(scale))

    self.latent_vars = latent_vars_normal.copy()
    del latent_vars_normal
def getHess(f, x_value):
    x = tf.placeholder(tf.float32, shape=len(x_value))
    f_hess = tf.hessians(f(x), x)
    sess = tf.Session()
    f_h = sess.run(f_hess, feed_dict={x: x_value})
    return f_h[0]
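# A minimal usage sketch for getHess, assuming TensorFlow 1.x graph mode;
# `quadratic` is a hypothetical test function introduced only for illustration.
import numpy as np
import tensorflow as tf

def quadratic(x):
    # f(x) = sum_i a_i * x_i^2 with a = (1, 2, 3), so the Hessian is 2 * diag(a).
    a = tf.constant([1.0, 2.0, 3.0])
    return tf.reduce_sum(a * x * x)

hess = getHess(quadratic, np.array([0.5, -1.0, 2.0], dtype=np.float32))
print(hess)  # expected: approximately diag(2, 4, 6)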
def G_tot_nn_unconstr(X_m, X_T, x_scaling_1, x_scaling_2,
                      T_scaling_1, T_scaling_2, weights, n_hidden=2):
    # Assuming the convex_nn architecture.
    n_weights = 7 + (n_hidden - 1) * 10 + 8
    weights_1 = weights[:n_weights]
    # fixed_bias is assumed to be defined at module level.
    weights_2 = weights[-n_weights + 1:] + fixed_bias

    # Divide microstructural features into compositions and phase fraction:
    X_1 = X_m[:, 0:1]
    X_2 = X_m[:, 1:2]
    f = X_m[:, 2:]

    # Temperature must be scaled differently for each input.
    X_T_1 = (X_T - T_scaling_1["mean"]) / T_scaling_1["std"]
    X_T_2 = (X_T - T_scaling_2["mean"]) / T_scaling_2["std"]

    # Phase 2 composition is calculated from phase 1 composition and f.
    X_2 = (X_2 - x_scaling_2["mean"]) / x_scaling_2["std"]
    X_1 = (X_1 - x_scaling_1["mean"]) / x_scaling_1["std"]

    G_nn_1 = convex_nn(X_T_1, X_1, weights_1)
    G_nn_2 = convex_nn(X_T_2, X_2, weights_2)
    G_tot = G_nn_1 * (1. - f) + G_nn_2 * f

    G_tot_grad = tf.gradients(G_tot, X_m, stop_gradients=X_m)
    G_tot_hess = tf.einsum("ijkl->ijl", tf.hessians(G_tot, X_m)[0])

    return G_tot, G_tot_grad, G_tot_hess
def initialize(self, *args, **kwargs):
    # Store latent variables in a temporary attribute; MAP will
    # optimize ``PointMass`` random variables, which subsequently
    # optimizes mean parameters of the normal approximations.
    latent_vars_normal = self.latent_vars.copy()
    self.latent_vars = {z: PointMass(params=qz.loc)
                        for z, qz in six.iteritems(latent_vars_normal)}

    super(Laplace, self).initialize(*args, **kwargs)

    hessians = tf.hessians(self.loss, list(six.itervalues(self.latent_vars)))
    self.finalize_ops = []
    for z, hessian in zip(six.iterkeys(self.latent_vars), hessians):
        qz = latent_vars_normal[z]
        if isinstance(qz, (MultivariateNormalDiag, Normal)):
            scale_var = get_variables(qz.variance())[0]
            scale = 1.0 / tf.diag_part(hessian)
        else:  # qz is MultivariateNormalTriL
            scale_var = get_variables(qz.covariance())[0]
            scale = tf.matrix_inverse(tf.cholesky(hessian))

        self.finalize_ops.append(scale_var.assign(scale))

    self.latent_vars = latent_vars_normal.copy()
    del latent_vars_normal
def _test_loss(sample_shape_fn):
    our_loss_fn = CrossEntropyLoss()
    single_vector_inputs = (len(sample_shape_fn()) == 1
                            and sample_shape_fn() == sample_shape_fn())
    length = [None] if not single_vector_inputs else [sample_shape_fn()[0]]

    targets_ph = tf.placeholder(tf.float64, length * len(sample_shape_fn()))
    logits_ph = tf.placeholder(tf.float64, length * len(sample_shape_fn()))
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=targets_ph, logits=logits_ph)
    grad = tf.gradients(loss, logits_ph)
    if single_vector_inputs:
        ihvp = tf.reshape(
            tf.matrix_solve(
                tf.hessians(loss, logits_ph)[0] + tf.eye(length[0], dtype=tf.float64),
                tf.reshape(grad, (-1, 1))),
            (-1,))

    session = tf.Session()
    for _ in xrange(1000):
        shape = sample_shape_fn()
        targets = np.random.rand(*shape)
        targets = targets ** 2
        targets /= np.sum(targets, axis=-1, keepdims=True)
        logits = np.random.rand(*shape)

        our_loss = our_loss_fn(targets, logits)
        our_grad = our_loss_fn.gradient(targets, logits)
        if single_vector_inputs:
            our_ihvp = our_loss_fn.ihvp(targets, logits, l2_reg=1)

        feed_dict = {targets_ph: targets, logits_ph: logits}
        true_loss = session.run(loss, feed_dict=feed_dict)
        true_gradient = session.run(grad, feed_dict=feed_dict)
        if single_vector_inputs:
            true_ihvp = session.run(ihvp, feed_dict=feed_dict)

        assert np.allclose(our_loss, true_loss) and np.allclose(our_grad, true_gradient)
        if single_vector_inputs:
            assert np.allclose(our_ihvp, true_ihvp)
def finetune_and_test_hessian(self, input_pts, output_pts, num_steps, test_input_pts, inp_tau):
    """Return the Hessian at the adapted parameter value for uncertainty estimates."""
    pred = self.forward_pass(input_pts, self.theta)
    loss = mse(pred, output_pts)
    grad = tf.gradients(loss, list(self.theta.values()))
    grad = dict(zip(self.theta.keys(), grad))
    phi = dict(zip(self.theta.keys(),
                   [self.theta[key] - alpha * grad[key] for key in self.theta.keys()]))

    for _ in range(num_steps - 1):  # not executed when num_steps == 1
        pred = self.forward_pass(input_pts, phi)
        loss = mse(pred, output_pts)
        grad = tf.gradients(loss, list(phi.values()))
        grad = dict(zip(phi.keys(), grad))
        phi = dict(zip(phi.keys(),
                       [phi[key] - alpha * grad[key] for key in phi.keys()]))

    # Splice in flat_params so the Hessian is taken w.r.t. a single flat vector.
    keys, vals = zip(*[(k, v) for k, v in phi.items()])
    flat_params = tf.squeeze(tensors_to_column(vals))
    phi = column_to_tensors(vals, flat_params)
    phi = {keys[i]: phi[i] for i in range(len(phi))}

    adapted_pred = self.forward_pass(input_pts, phi)
    adapted_mse = mse(adapted_pred, output_pts)
    log_pr_hessian = tf.hessians(adapted_mse, flat_params)
    log_prior_hessian = tf.eye(1761) * inp_tau
    hessian = tf.add(log_pr_hessian, log_prior_hessian)

    test_pred = self.forward_pass(test_input_pts, phi)
    return test_pred, flat_params, hessian
def flathess(loss, var_list, clip_norm=None):
    # Pseudocode:
    # We just need to get the Hessians of the policy.
    # Step 1: get all the Hessians.
    # Step 2: get the Hessians in the namespace of the policy and reshape them
    #         properly into a matrix - can probably flatten each of the first
    #         n/2 dimensions and then concatenate them into a block.
    # Step 3: concatenate them all blockwise.
    hessians = tf.hessians(loss, var_list)  # TODO: is this right?
    if clip_norm is not None:
        IPython.embed()
        hessians = [
            tf.clip_by_norm(hessian, clip_norm=clip_norm) for hessian in hessians
        ]
    for i in range(len(hessians)):
        # Reshape. TODO: write this more cleanly as a list comprehension?
        hessian = hessians[i]
        shape = [int(s) for s in hessian.shape]
        dims = int(np.sqrt(reduce(mul, shape, 1)))  # assumes symmetry
        # TODO: verify this is the correct reshaping direction.
        hessians[i] = tf.reshape(hessian, [dims, dims])
    return block_diagonal(hessians)
def __init__(self, icb, train_documents, train_labels, leaf_method, weights=None):
    self.icb = icb
    if weights is None:
        self.weights_num = np.ones_like(train_labels, dtype=np.float64)
    else:
        self.weights_num = np.array(weights, dtype=np.float64)

    self.sess = tf.Session()
    self.sess.run(tf.global_variables_initializer())

    self.train_documents = train_documents
    self.train_labels = train_labels

    self.weights = tf.placeholder(tf.float64, shape=train_labels.shape, name='weights')
    self.x = tf.placeholder(tf.float64, shape=train_documents.shape, name='x')
    self.y = tf.placeholder(tf.float64, shape=train_labels.shape, name='y')

    self.approxes = []
    self.approxes.append(tf.zeros_like(train_labels, dtype=tf.float64))
    self.leaf_values = []
    self.leaf_values_grads = []

    for t in xrange(len(icb.trees)):
        a = self.approxes[-1]
        loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(
            labels=self.y, logits=a, name='loss_step_%s' % str(t)))
        grads = tf.gradients(loss, a)[0]
        hessians = tf.diag_part(tf.hessians(loss, a)[0])

        leaf_doc_idxs = [sorted(list(l)) for l in icb.trees[t]._document_idxs_for_leaves]
        doc_leaf_idxs = [0] * len(train_labels)
        for l, leaf_idxs in enumerate(leaf_doc_idxs):
            for i in leaf_idxs:
                doc_leaf_idxs[i] = l
        doc_leaf_idxs = tf.constant(doc_leaf_idxs, dtype=tf.int32)

        leaf_values_lst = []
        for l in xrange(len(icb.trees[t].leaf_values)):
            leaf_mask = tf.equal(doc_leaf_idxs, l)
            leaf_gradients = tf.boolean_mask(grads, leaf_mask)
            leaf_hessians = tf.boolean_mask(hessians, leaf_mask)
            leaf_weights = tf.boolean_mask(self.weights, leaf_mask)
            if leaf_method == 'Gradient':
                leaf_values_lst.append(-tf.divide(
                    tf.reduce_sum(tf.multiply(leaf_weights, leaf_gradients)),
                    tf.reduce_sum(leaf_weights) + icb.trees[t].l2_reg_coef
                ) * icb.trees[t].learning_rate)
            else:
                leaf_values_lst.append(-tf.divide(
                    tf.reduce_sum(tf.multiply(leaf_weights, leaf_gradients)),
                    tf.reduce_sum(tf.multiply(leaf_weights, leaf_hessians)) + icb.trees[t].l2_reg_coef
                ) * icb.trees[t].learning_rate)
        leaf_values = tf.stack(leaf_values_lst)
        self.leaf_values.append(leaf_values)

        tree_predictions = tf.gather(leaf_values, doc_leaf_idxs)
        self.approxes.append(a + tree_predictions)

        leaf_value_grad = []
        for lv in leaf_values_lst:
            lvg = tf.gradients(lv, self.weights)[0]
            leaf_value_grad.append(lvg)
        self.leaf_values_grads.append(leaf_value_grad)

    self.train_idx = tf.placeholder(tf.int32, shape=[])
    train_prediction_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=self.y[self.train_idx:self.train_idx + 1],
        logits=self.approxes[-1][self.train_idx:self.train_idx + 1]))
    self.train_prediction_loss_grad = tf.gradients(train_prediction_loss, self.weights)[0]
def loss_func(self, model, S_interior, t_interior, Smin_boundary, tmin_boundary,
              Smax_boundary, tmax_boundary, S_terminal, t_terminal,
              use_fd_hessian, use_L2_err):
    '''
    Compute total loss for training.
    Note: only the geometric-average boundary condition is considered.

    Args:
        model: DGMNet model object
        t_interior: sampled time points in the interior of the function's domain
        S_interior: sampled space points in the interior of the function's domain
        t_terminal: sampled time points at terminal point (vector of terminal times)
        S_terminal: sampled space points at terminal time
    '''
    # Loss term #1: PDE
    # Compute function value and derivatives at current sampled points.
    V = model(S_interior, t_interior)
    V_t = tf.gradients(V, t_interior)[0]
    V_s = tf.gradients(V, S_interior)[0]
    S_mean = tf.reduce_mean(S_interior)
    if use_fd_hessian:  # deprecated
        V_ss = (fd_hessian(model, S_interior, t_interior, 1.5e-6 * S_mean) +
                fd_hessian(model, S_interior, t_interior, 1.5e-7 * S_mean) +
                fd_hessian(model, S_interior, t_interior, 1.5e-8 * S_mean)) / 3
    else:
        V_ss = tf.hessians(V, S_interior)[0]
        V_ss = tf.reduce_sum(V_ss, axis=2)
    cov_Vss = tf.multiply(V_ss, self.cov_mat)
    sec_ord = tf.map_fn(
        lambda i: tf.tensordot(S_interior[i],
                               tf.linalg.matvec(cov_Vss[i], S_interior[i]), 1) / 2,
        tf.range(tf.shape(S_interior)[0]), dtype=tf.float64)
    first_ord = tf.reduce_sum(
        tf.multiply(tf.multiply(V_s, S_interior), self.ir - self.dividend_vec), axis=1)
    diff_V = tf.reshape(V_t, [-1]) + sec_ord + first_ord - self.ir * tf.reshape(V, [-1])

    # Compute the average L2-norm of the differential operator.
    L1 = tf.reduce_mean(tf.math.square(diff_V))

    # Loss term #2: boundary condition
    V_minboundary = model(Smin_boundary, tmin_boundary)
    real_minboundary = tf.map_fn(
        lambda x: GeometricAvg_tf(self.dim, x[:-1], self.payoff_func.strike,
                                  self.domain.T - x[-1], self.ir, self.vol_vec,
                                  self.dividend_vec[0], self.corr_mat).european_option_price(),
        tf.concat([Smin_boundary, tmin_boundary], 1))
    L2min = tf.reduce_mean(
        tf.math.square(tf.reshape(V_minboundary, [-1]) - real_minboundary))

    V_maxboundary = model(Smax_boundary, tmax_boundary)
    real_maxboundary = tf.map_fn(
        lambda x: GeometricAvg_tf(self.dim, x[:-1], self.payoff_func.strike,
                                  self.domain.T - x[-1], self.ir, self.vol_vec,
                                  self.dividend_vec[0], self.corr_mat).european_option_price(),
        tf.concat([Smax_boundary, tmax_boundary], 1))
    L2max = tf.reduce_mean(
        tf.math.square(tf.reshape(V_maxboundary, [-1]) - real_maxboundary))

    # Loss term #3: initial/terminal condition
    target_payoff = self.payoff_func(S_terminal)
    fitted_payoff = tf.reshape(model(S_terminal, t_terminal), [-1])
    if use_L2_err:
        L3 = tf.reduce_mean(tf.math.square(fitted_payoff - target_payoff))
    else:
        L3 = tf.reduce_mean(tf.math.abs(fitted_payoff - target_payoff))

    return L1, L2min, L2max, L3
def main(args):
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        global x_test, y_test
        EPOCH = args.epoch
        BATCHSIZE = args.batch_size
        model_chosen = args.model_type
        if model_chosen == '1':
            model = model1
        elif model_chosen == '2':
            model = model2

        model.compile(optimizer=tf.train.AdamOptimizer(**ADAMPARAM),
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        model.fit(x_train, y_train, epochs=EPOCH, batch_size=BATCHSIZE)
        test_result = model.evaluate(x_test, y_test)
        train_result = model.evaluate(x_train, y_train)
        print('training loss:%f' % train_result[0])
        print('training accuracy:%f' % train_result[1])
        print('testing loss:%f' % test_result[0])
        print('testing accuracy:%f' % test_result[1])

        x_test = tf.convert_to_tensor(x_test, dtype=tf.float32)
        y_pred = model.apply(x_test)
        y_test = tf.convert_to_tensor(y_test)
        # model_weights = tf.concat([tf.reshape(i, [-1]) for i in model.trainable_variables], axis=0)
        loss = tf.keras.losses.categorical_crossentropy(y_test, y_pred)
        # grad = tf.gradients(loss, model.trainable_variables)

        hess = tf.hessians(loss, model.trainable_variables)
        print(len(hess))
        hess_norm = []
        for i in hess:
            norm = tf.norm(i, 2)
            hess_norm.append(norm)
        hess_norm = sess.run(hess_norm)
        sharpness = max(hess_norm) * (1e-8) / 2 / (1 + test_result[0])

        with open('sharpness.csv', 'a') as f:
            print(args.model_type, end=',', file=f)
            print(args.batch_size, end=',', file=f)
            print(train_result[0], end=',', file=f)
            print(train_result[1], end=',', file=f)
            print(test_result[0], end=',', file=f)
            print(test_result[1], end=',', file=f)
            print(sharpness, file=f)
    return
def get_hessian(self, layer_num=-2):
    self.hessian = tf.hessians(self.cost, self.params[layer_num])[0]
    # Flatten the (in, out, in, out) Hessian into a square matrix.
    shape = (self.params[layer_num].shape[0] * self.params[layer_num].shape[1],
             self.params[layer_num].shape[0] * self.params[layer_num].shape[1])
    self.hessian = tf.reshape(self.hessian, shape=shape)
    return self.hessian
def compute_ghm(energy_op, x, params):
    """Computes gradients, hessians, and mixed partials in one go."""
    grads = densify(tf.gradients(energy_op, x)[0])
    hess = densify(tf.hessians(energy_op, x)[0])
    mp = list_jacobian(grads, params)
    return grads, hess, mp
def eval_hess(self):
    if self.hess_op is None:
        self.hess_op = tf.hessians(self.meanloss, self.parameters)
    self.v_hess = self.sess.run(
        self.hess_op,
        feed_dict={
            self.images: self.datax,
            self.label: self.datay,
            self.dropout_keep_prob: self.dp
        })
def get_hession_matrix_mutiply_v(v, x):
    loss = loss_func(x, tf.stop_gradient(x))
    old_shape = v.get_shape()
    num_elements = old_shape.num_elements()
    H = tf.hessians(loss, x)
    H = tf.reshape(H, [num_elements, num_elements])
    v = tf.reshape(v, [num_elements, 1])
    out = tf.matmul(H, v)
    return tf.reshape(out, old_shape.as_list()), H
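# A hedged alternative sketch: the same Hessian-vector product can be formed
# without materializing the full Hessian, using double backprop (the gradient
# of <grad(loss), v> w.r.t. x). Assumes TensorFlow 1.x graph mode and the same
# externally defined loss_func used in the snippet above.
def hvp_double_backprop(v, x):
    loss = loss_func(x, tf.stop_gradient(x))
    grad = tf.gradients(loss, x)[0]
    # <grad, v> is a scalar; differentiating it again w.r.t. x gives H v.
    grad_v = tf.reduce_sum(grad * tf.stop_gradient(v))
    return tf.gradients(grad_v, x)[0]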
def test_hessian_gradient_2(self):
    dim = 10
    w1_t = tf.Variable(np.random.randn(dim).astype(np.float32))
    w2_t = tf.Variable(np.random.randn(dim).astype(np.float32))
    w1w1_t = tf.reduce_sum(w1_t * w1_t)
    w1w2_t = tf.reduce_sum(w1_t * w2_t)
    w2w2_t = tf.reduce_sum(w2_t * w2_t)
    L_t = 0.3 * w1w1_t + 0.1 * w1w2_t - 0.2 * w2w2_t \
        + 0.15 * w1w1_t * w1w1_t \
        - 0.45 * w1w1_t * w2w2_t \
        + 0.23 * w1w2_t * w1w1_t

    grad_t = tf.gradients(L_t, [w1_t, w2_t])
    H11_t = tf.hessians(L_t, w1_t)[0]
    H22_t = tf.hessians(L_t, w2_t)[0]
    H12_t = [tf.gradients(grad_t[0][i], w2_t)[0] for i in range(dim)]
    H21_t = [tf.gradients(grad_t[1][i], w1_t)[0] for i in range(dim)]

    actual_Hg_t = self._compute_hess_grad(L_t, [w1_t, w2_t])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        grads = sess.run(grad_t)
        H11 = sess.run(H11_t)
        H22 = sess.run(H22_t)
        H12 = np.stack(sess.run(H12_t))
        H21 = np.stack(sess.run(H21_t))

        H = np.zeros((2 * dim, 2 * dim))
        H[:dim, :dim] = H11
        H[dim:, dim:] = H22
        H[:dim, dim:] = H12
        H[dim:, :dim] = H21

        grad = np.zeros(2 * dim)
        grad[:dim] = grads[0]
        grad[dim:] = grads[1]

        expected_Hg = H.dot(grad)
        actual_Hg = np.concatenate(sess.run(actual_Hg_t))
        self.assertTrue(np.allclose(expected_Hg, actual_Hg, rtol=1e-3))
def FIM_F(g, X):
    hessian = tf.hessians(self.KL_divergence, X)
    X_size = tf.size(X)
    hessian = tf.reshape(hessian, [X_size, X_size]) + epsilon * tf.eye(X_size)
    FIM = tf.linalg.inv(hessian)
    return tf.reshape(tf.linalg.matvec(FIM, tf.reshape(g, [-1])), tf.shape(X))
def simu_hessian():
    X = tf.svd(tf.random.normal(shape=(n, p)))[1]
    f = tf.matmul(X, beta)
    y = f + tf.random.normal(shape=(n, 1))
    loss = tf.reduce_mean(
        (y - w[0] * tf.matmul(
            X, w[1:] / tf.math.sqrt(tf.matmul(tf.transpose(w[1:]), w[1:])))) ** 2)
    # dl_dv = tf.gradients(loss, v)
    # d2l_dvdg = tf.gradients(dl_dv, g)
    d2l_dvdg = tf.hessians(loss, w)
    return d2l_dvdg
def hessians(ys, xs, no_backprop=False):
    if not no_backprop:
        return tf.squeeze(tf.hessians(ys, xs)[0], axis=[0, 2])
    grads = tf.gradients(ys, xs)[0][0]
    # Note: it is important to use parallel_iterations=None here, to avoid an
    # in-graph while loop inside the jacobians computation, which itself is an
    # in-graph while loop. This is more efficient since we do not have a large
    # number of parameters, but can use many samples to compute gradient
    # estimator variance.
    return tf.squeeze(jacobians(grads, xs, parallel_iterations=None), axis=1)
def loss_fcn_gradient_hessian(video_indices, **kwargs):
    """Compute the loss function, gradient and the Hessian."""
    variable = kwargs['model_tensor']
    loss = loss_fcn_dense(**kwargs)['loss']
    g = tf.gradients(loss, variable)[0]
    g = tf.gather(g, axis=1, indices=video_indices)
    h = tf.hessians(loss, variable)[0]
    h = tf.gather(h, axis=1, indices=video_indices)
    h = tf.gather(h, axis=4, indices=video_indices)
    return {'loss': loss, 'gradient': g, 'hessian': h}
def split_and_hessian(self, out_node, innode):
    out_nodes = tf.split(out_node, 1, axis=1)
    hessian_node = []
    for o_node in out_nodes:
        hessian_node.append(tf.stack(tf.hessians(o_node, innode)))
    new_dim = len(hessian_node[0].shape.as_list()) + 1
    new_dim = list(range(new_dim))
    new_dim[0] = 1
    new_dim[1] = 0
    return tf.transpose(tf.stack(hessian_node), perm=new_dim)
def hess_elemwise(F, X):
    # Return the element-wise Hessian of F w.r.t. X in the following form:
    # H = d^2F/dX^2, where H[i, j, k] = dF_i / (dx_j dx_k).
    # F needs to be a rank-1 tensor for this to work,
    # so a reshape operation is needed beforehand.

    # Turn F into an iterable list of scalar tensors.
    Ftemp = tf.unstack(F)
    hess = [tf.hessians(f, X) for f in Ftemp]  # also works if F is a python list
    # Stack back into a single tensor with the extra list dimension squeezed out.
    H = tf.squeeze(tf.stack(hess, axis=0), axis=1)
    return H
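# A minimal usage sketch for hess_elemwise, assuming TensorFlow 1.x graph mode;
# the two-output F below is a hypothetical test case used only for illustration.
import tensorflow as tf

X = tf.placeholder(tf.float32, shape=[3])
F = tf.stack([tf.reduce_sum(X ** 2), tf.reduce_sum(tf.sin(X))])  # rank-1, two entries
H = hess_elemwise(F, X)  # shape (2, 3, 3): one 3x3 Hessian per entry of F

with tf.Session() as sess:
    # Expected: H[0] = 2 * I, H[1] = diag(-sin(X)).
    print(sess.run(H, feed_dict={X: [1.0, 2.0, 3.0]}))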
def ML_point(self, task):
    with tf.name_scope("ML_point"):
        task_phi = task
        input_pts, output_pts = sample_linear_task_pts(
            np.random.choice([5, 7, 10, 15, 18, 20, 400, 500, 630, 800]),
            task_phi, noise=np.random.uniform(0.1, 10.))
        phi = {}

        with tf.name_scope("train"):
            # Initialize phi with the first gradient update.
            pred = self.forward_pass(input_pts, self.theta)
            loss = mse(pred, output_pts)
            loss = tf.Print(loss, [loss])
            grad = tf.gradients(loss, list(self.theta.values()))
            # phi = dict(zip(self.theta.keys(), [self.theta[key] + 0. for key in self.theta.keys()]))
            # keys, vals = zip(*[(k, v) for k, v in phi.items()])
            # og_flat_params = tf.squeeze(tensors_to_column(vals))
            grad = dict(zip(self.theta.keys(), grad))
            phi = dict(zip(self.theta.keys(),
                           [self.theta[key] - alpha * grad[key]
                            for key in self.theta.keys()]))
            keys, vals = zip(*[(k, v) for k, v in phi.items()])
            flat_params = tf.squeeze(tensors_to_column(vals))
            phi = column_to_tensors(vals, flat_params)
            phi = {keys[i]: phi[i] for i in range(len(phi))}

        with tf.name_scope("test"):
            test_input_pts, test_output_pts = sample_linear_task_pts(M, task_phi)
            test_pred = self.forward_pass(test_input_pts, phi)
            test_mse = mse(test_pred, test_output_pts)
            log_pr_hessian = tf.hessians(test_mse, flat_params)
            log_prior_hessian = tf.eye(n_fc + 1) * tau
            hessian = tf.add(log_prior_hessian, log_pr_hessian)
            test_mse = tf.Print(test_mse,
                                [test_mse, tf.linalg.logdet(hessian)],
                                message="Sanity")
            loss = tf.cond(tf.equal(self.use_hess, tf.constant(True)),
                           lambda: tf.add(test_mse, tf.linalg.logdet(hessian)),
                           lambda: test_mse)
            # test_mse = tf.Print(test_mse, [log_pr_hessian], message="Log Pr Hessian")
            # test_mse = tf.Print(test_mse, [tf.linalg.logdet(hessian)], message="Log det")
    return loss
def add_hess_ops(function, inputs, graph_dictionary):
    """Adds ops to calculate and diagonalize the hessian
    to the graph and graph_dictionary.
    """
    with tf.variable_scope("hessian"):
        hessian_matrix = tf.hessians(function, inputs, name="hessians_output")[0]
        eigenvalues, eigenvectors = tf.self_adjoint_eig(hessian_matrix)

    graph_dictionary.update({
        "hessian_matrix": hessian_matrix,
        "eigenvalues": eigenvalues,
        "eigenvectors": eigenvectors,
    })
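# A minimal usage sketch for add_hess_ops, assuming TensorFlow 1.x graph mode;
# the quadratic below is a hypothetical test function used only for illustration.
import tensorflow as tf

graph_dictionary = {}
x = tf.placeholder(tf.float32, shape=[2], name="x")
f = 3.0 * x[0] ** 2 + x[0] * x[1] + 2.0 * x[1] ** 2  # scalar function of x
add_hess_ops(f, x, graph_dictionary)

with tf.Session() as sess:
    evals = sess.run(graph_dictionary["eigenvalues"], feed_dict={x: [1.0, -1.0]})
    print(evals)  # eigenvalues of the constant Hessian [[6, 1], [1, 4]]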
def hessian(data):
    """Helper function that computes the hessian for a given gene.

    :param data: tuple (X_t, size_factors_t, params_t)
    """
    # Extract input data:
    X_t, size_factors_t, params_t = data
    size_factors = tf.transpose(size_factors_t)  # observations x features
    X = tf.transpose(X_t)  # observations x features
    params = tf.transpose(params_t)  # design_params x features

    a_split, b_split = tf.split(params, tf.TensorShape([p_shape_a, p_shape_b]))

    # Define the model graph based on which the likelihood is evaluated
    # and with which the hessian is computed:
    model = BasicModelGraph(X=X,
                            design_loc=design_loc,
                            design_scale=design_scale,
                            constraints_loc=constraints_loc,
                            constraints_scale=constraints_scale,
                            a_var=a_split,
                            b_var=b_split,
                            dtype=dtype,
                            size_factors=size_factors)

    # Compute the hessian of the model for the given gene:
    if self._compute_hess_a and self._compute_hess_b:
        H = tf.hessians(model.log_likelihood, params)
    elif self._compute_hess_a and not self._compute_hess_b:
        H = tf.hessians(model.log_likelihood, a_split)
    elif not self._compute_hess_a and self._compute_hess_b:
        H = tf.hessians(model.log_likelihood, b_split)
    else:
        raise ValueError("either require hess_a or hess_b")

    return H
def test_full_hessian(self):
    dim1 = 10
    dim2 = 15
    w1_t = tf.Variable(np.random.randn(dim1).astype(np.float32))
    w2_t = tf.Variable(np.random.randn(dim2).astype(np.float32))
    w1w1_t = tf.reduce_sum(w1_t * w1_t)
    w2w2_t = tf.reduce_sum(w2_t * w2_t)
    L_t = 0.3 * w1w1_t - 0.2 * w2w2_t \
        + 0.15 * w1w1_t * w1w1_t - 0.45 * w1w1_t * w2w2_t

    grad_t = tf.gradients(L_t, [w1_t, w2_t])
    H11_t = tf.hessians(L_t, w1_t)[0]
    H22_t = tf.hessians(L_t, w2_t)[0]
    H12_t = [tf.gradients(grad_t[0][i], w2_t)[0] for i in range(dim1)]
    H21_t = [tf.gradients(grad_t[1][i], w1_t)[0] for i in range(dim2)]

    hess_blocks_t = tfutils.hessian_tensor_blocks(L_t, [w1_t, w2_t])

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        H11 = sess.run(H11_t)
        H22 = sess.run(H22_t)
        H12 = np.stack(sess.run(H12_t))
        H21 = np.stack(sess.run(H21_t))

        H = np.zeros((dim1 + dim2, dim1 + dim2))
        H[:dim1, :dim1] = H11
        H[dim1:, dim1:] = H22
        H[:dim1, dim1:] = H12
        H[dim1:, :dim1] = H21

        hess_blocks = sess.run(hess_blocks_t)
        actual_hess = tfutils.hessian_combine_blocks(hess_blocks)
        self.assertTrue(np.allclose(actual_hess, H))
def fisher6(num_sample=1000, meanf=None, cov=None):
    if meanf is None:
        mean = var
    else:
        mean = meanf(var)
    mgd = tf.contrib.distributions.MultivariateNormalFullCovariance(
        loc=mean, covariance_matrix=cov)
    sample = tf.stop_gradient(mgd.sample(num_sample))
    lnp = lnunnormalp(sample, mean, cov)
    lnpoverp = idn(lnp)
    kl = tf.log(tf.reduce_mean(lnpoverp)) - tf.reduce_mean(tf.log(lnpoverp))
    fisher = tf.hessians(kl, var)
    return fisher
def testSecondGradient(self):
    images_placeholder = tf.placeholder(tf.float32, shape=(3, 2))
    labels_placeholder = tf.placeholder(tf.int32, shape=(3))
    weights = tf.Variable(tf.truncated_normal([2], stddev=1.0))
    weights_with_zeros = tf.pack([tf.zeros([2]), weights], axis=1)
    logits = tf.matmul(images_placeholder, weights_with_zeros)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits, labels_placeholder)
    loss = tf.reduce_mean(cross_entropy)

    # Taking the second gradient should fail, since it is not
    # yet supported.
    with self.assertRaisesRegexp(LookupError,
                                 ".*No gradient defined.*PreventGradient.*"):
        _ = tf.hessians(loss, [weights])
def testSecondGradient(self):
    with self.test_session():
        l = tf.constant([0.0, 0.0, 1.0, 0.0,
                         1.0, 0.0, 0.0, 0.0,
                         0.0, 0.5, 0.0, 0.5],
                        shape=[12], dtype=tf.float64, name="l")
        f = tf.constant([0.1, 0.2, 0.3, 0.4,
                         0.1, 0.4, 0.9, 1.6,
                         0.1, 0.8, 2.7, 6.4],
                        shape=[12], dtype=tf.float64, name="f")
        x = tf.nn.softmax_cross_entropy_with_logits(f, l, name="xent")
        loss = tf.reduce_mean(x)

        # Taking the second gradient should fail, since it is not
        # yet supported.
        with self.assertRaisesRegexp(LookupError,
                                     ".*No gradient defined.*PreventGradient.*"):
            _ = tf.hessians(loss, [f])
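# Hedged follow-up sketch: per the version note in the first snippet, newer
# releases (r1.2 and later) define the second gradient of softmax cross-entropy,
# so on those versions an equivalent Hessian computation is expected to succeed
# rather than raise LookupError. Assumes TensorFlow 1.x graph mode.
import tensorflow as tf

logits = tf.constant([[0.1, 0.2, 0.3, 0.4]], dtype=tf.float64)
labels = tf.constant([[0.0, 0.0, 1.0, 0.0]], dtype=tf.float64)
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=logits))
hess = tf.hessians(loss, logits)[0]  # expected shape (1, 4, 1, 4)

with tf.Session() as sess:
    print(sess.run(hess))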