def initialize_mod_binary_MERA(phys_dim, chi, dtype=tf.float64):
    """
    Build random starting tensors for a modified binary MERA.

    One bottom layer of uniformly random tensors is created and passed to
    `increase_bond_dimension_by_adding_layers`, whose result is returned
    together with two identity-initialised top density matrices.

    Parameters:
    -------------------
    phys_dim: int
        Hilbert space dimension of the bottom layer
    chi: int
        maximum bond dimension (forwarded as `chi_new`)
    dtype: tensorflow dtype
        dtype of the MERA tensors

    Returns:
    -------------------
    (wC, vC, uC, rhoAB, rhoBA)
    wC, vC, uC: list of tf.Tensor
    rhoAB, rhoBA: tf.Tensor of shape (chi_top, chi_top, chi_top, chi_top),
        where chi_top is the size of axis 2 of the last tensor in wC
    """
    rank3_shape = [phys_dim, phys_dim, phys_dim]
    rank4_shape = [phys_dim, phys_dim, phys_dim, phys_dim]
    # Created in the same order as the keyword arguments are consumed.
    w_seed = [tf.random_uniform(shape=rank3_shape, dtype=dtype)]
    v_seed = [tf.random_uniform(shape=rank3_shape, dtype=dtype)]
    u_seed = [tf.random_uniform(shape=rank4_shape, dtype=dtype)]
    wC, vC, uC = increase_bond_dimension_by_adding_layers(
        chi_new=chi, wC=w_seed, vC=v_seed, uC=u_seed)

    chi_top = wC[-1].shape[2]
    rho_shape = (chi_top, chi_top, chi_top, chi_top)
    rhoAB = tf.reshape(tf.eye(chi_top * chi_top, dtype=dtype), rho_shape)
    rhoBA = tf.reshape(tf.eye(chi_top * chi_top, dtype=dtype), rho_shape)
    return wC, vC, uC, rhoAB, rhoBA
def test_with_tensors(self):
    """Contract two diagonal 2x2 nodes over both shared edges.

    The nodes hold 2*I and 3*I, so the fully contracted value is 12.
    The network is validated with `check_correct` after every step.
    """
    network = tensornetwork.TensorNetwork()
    node_t = network.add_node(tf.eye(2) * 2, name="T")
    node_a = network.add_node(tf.eye(2) * 3, name="A")
    first_edge = network.connect(node_t[0], node_a[0], "edge")
    second_edge = network.connect(node_t[1], node_a[1], "edge2")
    network.check_correct()

    network.contract(first_edge)
    network.check_correct()

    result = network.contract(second_edge)
    network.check_correct()
    self.assertAlmostEqual(result.get_tensor().numpy(), 12.0)
def _build_predict(self, Xnew, full_cov=False):
    """
    Predict the latent function at the new inputs Xnew.

    Returns the predictive mean (with the mean function at Xnew added) and
    the predictive variance. With `full_cov=True` the variance is a full
    covariance over Xnew, otherwise only its diagonal; either way it is
    tiled once per column of self.Y. The derivation of the individual terms
    is given in the associated SGPR notebook.
    """
    n_inducing = len(self.feature)
    residual = self.Y - self.mean_function(self.X)
    Kuf = self.feature.Kuf(self.kern, self.X)
    Kuu = self.feature.Kuu(self.kern, jitter=settings.numerics.jitter_level)
    Kus = self.feature.Kuf(self.kern, Xnew)
    noise_std = tf.sqrt(self.likelihood.variance)

    chol_uu = tf.cholesky(Kuu)
    A = tf.matrix_triangular_solve(chol_uu, Kuf, lower=True) / noise_std
    B = tf.matmul(A, A, transpose_b=True) + tf.eye(
        n_inducing, dtype=settings.float_type)
    chol_B = tf.cholesky(B)
    A_residual = tf.matmul(A, residual)
    c = tf.matrix_triangular_solve(chol_B, A_residual, lower=True) / noise_std

    # Test-point cross-covariance pushed through both Cholesky factors.
    through_uu = tf.matrix_triangular_solve(chol_uu, Kus, lower=True)
    through_B = tf.matrix_triangular_solve(chol_B, through_uu, lower=True)
    mean = tf.matmul(through_B, c, transpose_a=True)

    if full_cov:
        var = (self.kern.K(Xnew)
               + tf.matmul(through_B, through_B, transpose_a=True)
               - tf.matmul(through_uu, through_uu, transpose_a=True))
        multiples = tf.stack([1, 1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 2), multiples)
    else:
        var = (self.kern.Kdiag(Xnew)
               + tf.reduce_sum(tf.square(through_B), 0)
               - tf.reduce_sum(tf.square(through_uu), 0))
        multiples = tf.stack([1, tf.shape(self.Y)[1]])
        var = tf.tile(tf.expand_dims(var, 1), multiples)
    return mean + self.mean_function(Xnew), var
def _build_likelihood(self):
    r"""
    q_alpha, q_lambda are variational parameters, size N x R

    This method computes the variational lower bound on the likelihood,
    which is:

        E_{q(F)} [ \log p(Y|F) ] - KL[ q(F) || p(F)]

    with

        q(f) = N(f | K alpha + mean, [K^-1 + diag(square(lambda))]^-1) .

    Returns:
        Scalar tensor: summed variational expectations minus the KL term.
    """
    # NOTE: the docstring is a raw string because it contains `\log`; in a
    # normal string literal `\l` is an invalid escape sequence.
    K = self.kern.K(self.X)
    K_alpha = tf.matmul(K, self.q_alpha)
    f_mean = K_alpha + self.mean_function(self.X)

    # compute the variance for each of the outputs
    # I is one identity matrix per latent function.
    I = tf.tile(tf.expand_dims(tf.eye(self.num_data, dtype=settings.float_type), 0),
                [self.num_latent, 1, 1])
    # A = I + (lambda lambda^T) * K, element-wise, one matrix per latent.
    A = I + tf.expand_dims(tf.transpose(self.q_lambda), 1) * \
        tf.expand_dims(tf.transpose(self.q_lambda), 2) * K
    L = tf.cholesky(A)
    Li = tf.matrix_triangular_solve(L, I)
    tmp = Li / tf.expand_dims(tf.transpose(self.q_lambda), 1)
    f_var = 1. / tf.square(self.q_lambda) - tf.transpose(tf.reduce_sum(tf.square(tmp), 1))

    # some statistics about A are used in the KL
    A_logdet = 2.0 * tf.reduce_sum(tf.log(tf.matrix_diag_part(L)))
    trAi = tf.reduce_sum(tf.square(Li))

    KL = 0.5 * (A_logdet + trAi - self.num_data * self.num_latent
                + tf.reduce_sum(K_alpha * self.q_alpha))

    v_exp = self.likelihood.variational_expectations(f_mean, f_var, self.Y)
    return tf.reduce_sum(v_exp) - KL
def body(self, features):
    """Encode the inputs and, if targets are present, build a pairing loss.

    Without a 'targets' entry only the string embedding is returned.
    Otherwise both embeddings are L2-normalised, an all-vs-all cosine
    distance matrix is formed, and every (string, code) pair is scored with
    a two-column sigmoid cross-entropy whose labels come from the identity
    matrix (diagonal entries are the positive pairs).

    Returns:
        `string_embedding`, or `(string_embedding, {'training': loss})`.
    """
    with tf.variable_scope('string_embedding'):
        string_embedding = self.encode(features, 'inputs')

    if 'targets' not in features:
        return string_embedding

    with tf.variable_scope('code_embedding'):
        code_embedding = self.encode(features, 'targets')

    string_unit = tf.nn.l2_normalize(string_embedding, axis=1)
    code_unit = tf.nn.l2_normalize(code_embedding, axis=1)

    # All-vs-all cosine distances, flattened row-major into one column.
    similarity = tf.matmul(string_unit, code_unit, transpose_b=True)
    cosine_dist = 1.0 - similarity
    dist_column = tf.reshape(cosine_dist, [-1, 1])

    # Identity matrix marks the positive pairs; flattened the same way.
    pair_labels = tf.eye(tf.shape(cosine_dist)[0], dtype=tf.int32)
    pair_labels = tf.reshape(pair_labels, [-1])

    logits = tf.concat([1.0 - dist_column, dist_column], axis=1)
    labels = tf.one_hot(pair_labels, 2)

    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                   logits=logits)
    return string_embedding, {'training': loss}
def testDimensionGuardDynamicShape(self):
    """A 4x4 input with unknown static shape must fail the LKJ(3) check."""
    lkj = tfd.LKJ(dimension=3, concentration=[1., 4.], validate_args=True)
    with self.assertRaisesOpError('dimension mismatch'):
        # shape=None hides the static shape, so the check runs at op time.
        wrong_size_input = tf.placeholder_with_default(tf.eye(4), shape=None)
        self.evaluate(lkj.log_prob(wrong_size_input))
def maximum_mean_discrepancy(k_xx, k_yy, k_xy):
    """Return the MMD estimate computed from three kernel matrices.

    The diagonals of `k_xx` and `k_yy` are removed before averaging over the
    n*(n-1) off-diagonal entries; `k_xy` is averaged over all entries. The
    squared estimate is clamped at zero before the square root is taken.
    """
    n_x = tf.cast(tf.shape(k_xx)[0], dtype=tf.float32)
    n_y = tf.cast(tf.shape(k_yy)[0], dtype=tf.float32)

    # Zero out the self-similarity terms on each diagonal.
    xx_off_diag = k_xx - tf.multiply(k_xx, tf.eye(tf.shape(k_xx)[0]))
    yy_off_diag = k_yy - tf.multiply(k_yy, tf.eye(tf.shape(k_yy)[0]))

    mean_xx = tf.reduce_sum(xx_off_diag) / (n_x * (n_x - 1))
    mean_yy = tf.reduce_sum(yy_off_diag) / (n_y * (n_y - 1))
    mean_xy = tf.reduce_mean(k_xy)

    mmd_squared = mean_xx + mean_yy - 2 * mean_xy
    return tf.sqrt(tf.maximum(mmd_squared, 0))
def test_sample_mvn(session_tf, cov_structure, num_samples):
    """
    Sample from N = 10,000 two-dimensional Gaussians with unit mean and unit
    (co)variance, then check the output shape and that the empirical mean
    and covariance of all draws match the true values to one decimal place.
    """
    N, D = 10000, 2
    means = tf.ones((N, D), dtype=float_type)
    if cov_structure == "full":
        covs = tf.eye(D, batch_shape=[N], dtype=float_type)
    elif cov_structure == "diag":
        covs = tf.ones((N, D), dtype=float_type)

    draws_op = _sample_mvn(means, covs, cov_structure, num_samples=num_samples)
    draws = session_tf.run(draws_op)

    expected_shape = (N, D) if num_samples is None else (num_samples, N, D)
    assert draws.shape == expected_shape

    flat_draws = draws.reshape(-1, D)
    empirical_mean = np.mean(flat_draws, axis=0)
    empirical_cov = np.cov(flat_draws, rowvar=False)
    np.testing.assert_array_almost_equal(empirical_mean, [1., 1.], decimal=1)
    np.testing.assert_array_almost_equal(empirical_cov, [[1., 0.], [0., 1.]], decimal=1)
def _build_predict(self, Xnew, full_cov=False):
    """
    Xnew is a data matrix holding the points at which to predict.

    This method computes

        p(F* | Y )

    where F* are points on the GP at Xnew and Y are noisy observations at X.
    Returns the predictive mean and variance; the variance is a full
    covariance when `full_cov` is True, else its diagonal, and is tiled once
    per column of self.Y.
    """
    K_cross = self.kern.K(self.X, Xnew)
    K_noisy = self.kern.K(self.X) + tf.eye(
        tf.shape(self.X)[0], dtype=settings.float_type) * self.likelihood.variance
    chol = tf.cholesky(K_noisy)
    A = tf.matrix_triangular_solve(chol, K_cross, lower=True)
    V = tf.matrix_triangular_solve(chol, self.Y - self.mean_function(self.X))
    fmean = tf.matmul(A, V, transpose_a=True) + self.mean_function(Xnew)

    if full_cov:
        fvar = self.kern.K(Xnew) - tf.matmul(A, A, transpose_a=True)
        multiples = tf.stack([1, 1, tf.shape(self.Y)[1]])
        fvar = tf.tile(tf.expand_dims(fvar, 2), multiples)
    else:
        fvar = self.kern.Kdiag(Xnew) - tf.reduce_sum(tf.square(A), 0)
        fvar = tf.tile(tf.reshape(fvar, (-1, 1)), [1, tf.shape(self.Y)[1]])
    return fmean, fvar
def radial_symmetry(self, d_cutoff, d, atom_numbers):
    """ Radial Symmetry Function

    Evaluates Gaussians of the distances `d`, centred on `radial_length`
    evenly spaced values in [0, radial_cutoff], weights them by `d_cutoff`
    and sums over axis 2. When `atomic_number_differentiated` is set, one
    such sum is produced per entry of `atom_cases` (selected through a
    one-hot embedding of `atom_numbers`) and the results are concatenated.
    """
    one_hot_table = tf.eye(np.max(self.atom_cases) + 1)
    atom_one_hot = tf.nn.embedding_lookup(one_hot_table, atom_numbers)

    centers = np.linspace(0., self.radial_cutoff, self.radial_length)
    widths = np.ones_like(centers) * 3 / (centers[1] - centers[0])**2
    Rs = tf.cast(np.reshape(centers, (1, 1, 1, -1)), tf.float32)
    ita = tf.cast(np.reshape(widths, (1, 1, 1, -1)), tf.float32)
    length = ita.get_shape().as_list()[-1]

    # Repeat the inputs along a new last axis, once per Gaussian centre.
    d_cutoff = tf.stack([d_cutoff] * length, axis=3)
    d = tf.stack([d] * length, axis=3)

    out = tf.exp(-ita * tf.square(d - Rs)) * d_cutoff
    if not self.atomic_number_differentiated:
        return tf.reduce_sum(out, axis=2)

    per_type = []
    for atom_type in self.atom_cases:
        type_mask = atom_one_hot[:, :, atom_type]
        type_mask = tf.expand_dims(tf.expand_dims(type_mask, axis=1), axis=3)
        per_type.append(tf.reduce_sum(out * type_mask, axis=2))
    return tf.concat(per_type, axis=2)
def _update_ortho(self,v,i):
    """Build an assign op that applies an iterative update to variable `v`.

    Only acts when the shape of `v` has four entries and the first two are
    equal; otherwise returns None. The result written back is
    `(1 + decay) * v - decay * w`, where `w` is `v` after the iterations
    below and `decay` defaults to 0.01.

    NOTE(review): the parameter `i` is not read; the loop below reuses the
    name as its index.
    NOTE(review): the loop extent was reconstructed from whitespace-collapsed
    source -- confirm that the transpose back to the original axis order
    belongs after the loop rather than inside it.
    """
    s = self.gan.ops.shape(v)
    if len(s) == 4 and s[0] == s[1]:
        w=v
        newv = []
        #s = self.ops.shape(v_transpose)
        #identity = tf.reshape(identity, [s[0],s[1],1,1])
        #identity = tf.tile(identity, [1,1,s[2],s[3]])
        decay = self.config.decay or 0.01
        # Move the two equal-sized axes to the end.
        w = tf.transpose(w, perm=[2,3,0,1])
        for i in range(self.config.iterations or 3):
            wt = tf.transpose(w, perm=[1,0,2,3])
            # Flatten to a batch of s[0] x s[1] matrices for the matmul.
            w2 = tf.reshape(w,[-1, s[0],s[1]])
            wt2 = tf.reshape(wt,[-1, s[0],s[1]])
            wtw = tf.matmul(wt2,w2)
            # Identity tiled and reshaped to the shape of w.
            eye = tf.eye(s[0],s[1])
            eye = tf.tile(eye, [1,s[2]*s[3]])
            eye = tf.reshape(eye, self.gan.ops.shape(w))
            wtw = tf.reshape(wtw, self.gan.ops.shape(w))
            qk = eye - wtw
            # Element-wise product (not a matrix product).
            w = w * (eye + 0.5*qk)
        w = tf.transpose(w, perm=[2,3,0,1])
        newv = w
        newv=(1.0+decay)*v - decay*(newv)
        # NOTE(review): uses self.ops here but self.gan.ops above.
        newv = tf.reshape(newv,self.ops.shape(v))
        return tf.assign(v, newv)
    else:
        return None
def _get_fldj_numerical(self, bijector, x, event_ndims, eps=1.e-6,
                        input_to_vector=tfb.Identity,
                        output_to_vector=tfb.Identity):
    """Numerically approximate the forward log det Jacobian of a bijector.

    Args:
      bijector: the bijector whose Jacobian we wish to approximate
      x: the value for which we want to approximate the Jacobian
      event_ndims: number of dimensions in an event
      eps: epsilon to add when forming (f(x+eps)-f(x)) / eps
      input_to_vector: a bijector that maps the input value to a vector.
        A bijector class (such as the default) is instantiated with no
        arguments before use.
      output_to_vector: a bijector that maps the output value to a vector.
        A bijector class (such as the default) is instantiated with no
        arguments before use.

    Returns:
      A numerical approximation to the log det Jacobian of bijector.forward
      evaluated at x.
    """
    # The signature defaults are the `tfb.Identity` *class*; calling
    # `.forward(x)` on a class binds `x` to `self` and fails. Instantiate
    # any class that was passed so the defaults are usable.
    if isinstance(input_to_vector, type):
        input_to_vector = input_to_vector()
    if isinstance(output_to_vector, type):
        output_to_vector = output_to_vector()

    x_vector = input_to_vector.forward(x)
    n = tf.shape(x_vector)[-1]
    # Row i of the result is x_vector perturbed by eps in coordinate i.
    x_plus_eps_vector = x_vector + eps * tf.eye(n, dtype=x_vector.dtype)
    x_plus_eps = input_to_vector.inverse(x_plus_eps_vector)

    f_x = bijector.forward(x)
    f_x_vector = output_to_vector.forward(f_x)
    f_x_plus_eps = bijector.forward(x_plus_eps)
    f_x_plus_eps_vector = output_to_vector.forward(f_x_plus_eps)

    # Forward finite differences.
    jacobian_numerical = (f_x_plus_eps_vector - f_x_vector) / eps
    return (
        tf.log(tf.abs(tf.matrix_determinant(jacobian_numerical))) +
        input_to_vector.forward_log_det_jacobian(x, event_ndims=event_ndims) -
        output_to_vector.forward_log_det_jacobian(f_x, event_ndims=event_ndims))
def initial_state(self, batch_size, trainable=False):
    """Creates the initial memory.

    Every memory row should start out unique, so the memory begins as an
    identity matrix with `self._mem_slots` rows. Its columns are then
    zero-padded or truncated so that the result has size
    (batch_size, self._mem_slots, self._mem_size).

    Args:
      batch_size: The size of the batch.
      trainable: Whether the initial state is trainable. Not read by this
        method.

    Returns:
      init_state: A truncated or padded matrix of size
        (batch_size, self._mem_slots, self._mem_size).
    """
    slots, width = self._mem_slots, self._mem_size
    memory = tf.eye(slots, batch_shape=[batch_size])

    if width > slots:
        # Too narrow: append zero columns.
        extra_columns = width - slots
        zeros = tf.zeros((batch_size, slots, extra_columns))
        memory = tf.concat([memory, zeros], -1)
    elif width < slots:
        # Too wide: keep only the first `width` columns.
        memory = memory[:, :, :width]

    return memory
def testMultivariateNormalNd(self, event_size, num_samples):
    """Plot a NUTS chain and independent draws for a standard normal.

    Runs `num_samples` sampler steps (seeded 0..num_samples-1) on a
    zero-mean, identity-covariance Gaussian and saves a scatter plot of the
    first two coordinates, then does the same for independent samples. No
    assertions are made; the test only produces the two image files.
    """
    def standard_normal():
        return tfd.MultivariateNormalFullCovariance(
            loc=tf.zeros(event_size),
            covariance_matrix=tf.eye(event_size))

    def target_log_prob_fn(event):
        return standard_normal().log_prob(event)

    state = tf.zeros(event_size)
    chain = []
    for seed in range(num_samples):
        [state], _, _ = no_u_turn_sampler.kernel(
            target_log_prob_fn=target_log_prob_fn,
            current_state=[state],
            step_size=[0.3],
            seed=seed)
        state_np = state.numpy()
        chain.append([state_np[0], state_np[1]])

    chain = np.array(chain)
    plt.scatter(chain[:, 0], chain[:, 1])
    savefig("projection_chain_{}d_normal_{}_steps.png".format(
        event_size, num_samples))
    plt.close()

    independent = standard_normal().sample(num_samples, seed=4).numpy()
    plt.scatter(independent[:, 0], independent[:, 1])
    savefig("projection_independent_{}d_normal_{}_samples.png".format(
        event_size, num_samples))
    plt.close()
def test_non_batch_2x2(self):
    """tf.eye(2) must match np.eye(2) in static shape and in value."""
    size = 2
    dtype = np.float32
    expected = np.eye(size).astype(dtype)
    with self.test_session():
        identity = tf.eye(size, dtype=dtype)
        self.assertAllEqual((size, size), identity.get_shape())
        self.assertAllEqual(expected, identity.eval())
def distance_cutoff(self, d, cutoff, flags):
    """ Generate distance matrix with trainable cutoff

    Returns the cosine taper 0.5 * (cos(pi * d / cutoff) + 1) multiplied by
    a mask that is zero on the diagonal and wherever
    `flags * sign(cutoff - d)` is not positive.
    """
    # Cutoff with threshold Rc: entries beyond the cutoff clamp to zero.
    inside_cutoff = tf.nn.relu(flags * tf.sign(cutoff - d))
    # Zero the diagonal of the max_atoms x max_atoms matrix.
    off_diagonal = tf.expand_dims((1 - tf.eye(self.max_atoms)), 0)
    mask = inside_cutoff * off_diagonal
    taper = 0.5 * (tf.cos(np.pi * d / cutoff) + 1)
    return taper * mask
def sample_fast_rcnn_targets(boxes, gt_boxes, gt_labels):
    """
    Sample some ROIs from all proposals for training.
    #fg is guaranteed to be > 0, because ground truth boxes are added as RoIs.

    Args:
        boxes: nx4 region proposals, floatbox
        gt_boxes: mx4, floatbox
        gt_labels: m, int32

    Returns:
        sampled_boxes: tx4 floatbox, the rois
        sampled_labels: t labels, in [0, #class-1]. Positive means foreground.
        fg_inds_wrt_gt: #fg indices, each in range [0, m-1].
            It contains the matching GT of each foreground roi.
    """
    iou = pairwise_iou(boxes, gt_boxes)     # nxm
    proposal_metrics(iou)

    # Ground-truth boxes join the proposal pool; each one gets IoU 1 with
    # itself through the appended identity block.
    boxes = tf.concat([boxes, gt_boxes], axis=0)    # (n+m) x 4
    gt_self_iou = tf.eye(tf.shape(gt_boxes)[0])
    iou = tf.concat([iou, gt_self_iou], axis=0)     # (n+m) x m
    # #proposal=n+m from now on

    def _pick_fg_and_bg(iou):
        """Randomly choose foreground and background proposal indices."""
        is_fg = tf.reduce_max(iou, axis=1) >= cfg.FRCNN.FG_THRESH

        fg_candidates = tf.reshape(tf.where(is_fg), [-1])
        fg_quota = int(cfg.FRCNN.BATCH_PER_IM * cfg.FRCNN.FG_RATIO)
        num_fg = tf.minimum(fg_quota, tf.size(fg_candidates), name='num_fg')
        chosen_fg = tf.random_shuffle(fg_candidates)[:num_fg]

        bg_candidates = tf.reshape(tf.where(tf.logical_not(is_fg)), [-1])
        num_bg = tf.minimum(
            cfg.FRCNN.BATCH_PER_IM - num_fg,
            tf.size(bg_candidates), name='num_bg')
        chosen_bg = tf.random_shuffle(bg_candidates)[:num_bg]

        add_moving_summary(num_fg, num_bg)
        return chosen_fg, chosen_bg

    fg_inds, bg_inds = _pick_fg_and_bg(iou)
    # fg,bg indices w.r.t proposals

    best_gt_per_proposal = tf.argmax(iou, axis=1)   # #proposal, each in 0~m-1
    fg_inds_wrt_gt = tf.gather(best_gt_per_proposal, fg_inds)   # num_fg

    # indices w.r.t all n+m proposal boxes
    selected = tf.concat([fg_inds, bg_inds], axis=0)
    ret_boxes = tf.gather(boxes, selected)

    fg_labels = tf.gather(gt_labels, fg_inds_wrt_gt)
    bg_labels = tf.zeros_like(bg_inds, dtype=tf.int64)
    ret_labels = tf.concat([fg_labels, bg_labels], axis=0)

    # stop the gradient -- they are meant to be training targets
    sampled_boxes = tf.stop_gradient(ret_boxes, name='sampled_proposal_boxes')
    sampled_labels = tf.stop_gradient(ret_labels, name='sampled_labels')
    return sampled_boxes, sampled_labels, tf.stop_gradient(fg_inds_wrt_gt)
def _forward(self, x):
    """Return cholesky(inv(x)^H inv(x)) for the lower-triangular factor `x`.

    The inverse is obtained by a triangular solve against a batched
    identity, run under the control dependencies from `self._assertions(x)`.
    """
    with tf.control_dependencies(self._assertions(x)):
        shape = tf.shape(x)
        batched_identity = tf.eye(
            shape[-1],
            batch_shape=shape[:-2],
            dtype=x.dtype.base_dtype)
        # Note `matrix_triangular_solve` implicitly zeros upper triangular of `x`.
        x_inverse = tf.matrix_triangular_solve(x, batched_identity)
        gram = tf.matmul(x_inverse, x_inverse, adjoint_a=True)
        return tf.cholesky(gram)
def get_self_correlated_mat(num_out_A, scope=None, reuse=None):
    """Return `W W^T + I` for a `num_out_A` x `num_out_A` variable W.

    The variable 'pa_corr' is fetched through `get_var` inside the variable
    scope `scope` (default 'Self_Correlated_mat') with a variance-scaling
    initializer and an L2 regularizer.
    """
    initializer = tf.contrib.layers.variance_scaling_initializer(
        factor=0.1, mode='FAN_AVG', uniform=True, dtype=tf.float32)
    with tf.variable_scope(scope or 'Self_Correlated_mat', reuse=reuse):
        weights = get_var(
            'pa_corr',
            shape=[num_out_A, num_out_A],
            initializer_fn=initializer,
            regularizer_fn=tf.contrib.layers.l2_regularizer(scale=3e-4))
        gram = tf.matmul(weights, weights, transpose_b=True)
        return gram + tf.eye(num_out_A)
def build(self):
    """ Parameters for the Gaussian

    Forms every (Rs, ita) combination from `Rs_init` and `ita_init`, stores
    them as flat float32 constants, records their count in `self.length`,
    and creates an identity matrix used as the atom-number embedding.
    """
    self.length = len(self.Rs_init) * len(self.ita_init)

    Rs_grid, ita_grid = np.meshgrid(self.Rs_init, self.ita_init)
    self.Rs = tf.constant(Rs_grid.flatten(), dtype=tf.float32)
    self.ita = tf.constant(ita_grid.flatten(), dtype=tf.float32)

    embedding_size = max(self.atom_number_cases) + 1
    self.atom_number_embedding = tf.eye(embedding_size)
def _identity(self):
    """Return identity matrices batched over the shape of `concentration`."""
    batch_shape = tf.shape(self.concentration)
    identity = tf.eye(
        num_rows=self.dimension,
        batch_shape=batch_shape,
        dtype=self.concentration.dtype)
    # set_shape only necessary because tf.eye doesn't do it itself: b/111413915
    static_batch = identity.shape[:-2]
    identity.set_shape(
        static_batch.concatenate([self.dimension, self.dimension]))
    return identity
def test_non_batch_0x2(self):
    """tf.eye with zero rows and two columns must match np.eye(0, 2)."""
    rows, columns = 0, 2
    dtype = np.int64
    expected = np.eye(rows, columns).astype(dtype)
    with self.test_session():
        identity = tf.eye(rows, num_columns=columns, dtype=dtype)
        self.assertAllEqual((rows, columns), identity.get_shape())
        self.assertAllEqual(expected, identity.eval())
def ising_hamiltonian(N, dtype):
    """Return a list of N-1 two-site terms built from X, Z and the identity.

    Each of the first N-2 entries is `-X(x)X - Z(x)I`; the last entry
    additionally subtracts `I(x)Z`. Every term is a rank-4 tensor whose
    axes are reordered with the permutation (0, 2, 1, 3).
    """
    pauli_x = tf.convert_to_tensor([[0.0, 1.0], [1.0, 0.0]], dtype=dtype)
    pauli_z = tf.convert_to_tensor([[1.0, 0.0], [0.0, -1.0]], dtype=dtype)
    identity = tf.eye(2, dtype=dtype)

    bulk = (-tf.tensordot(pauli_x, pauli_x, axes=0)
            - tf.tensordot(pauli_z, identity, axes=0))
    boundary = bulk - tf.tensordot(identity, pauli_z, axes=0)

    axis_order = (0, 2, 1, 3)
    bulk = tf.transpose(bulk, axis_order)
    boundary = tf.transpose(boundary, axis_order)

    return [bulk] * (N - 2) + [boundary]
def entanglement_specs_1site(isos_012):
    """Return the eigenvalue spectra of the descended one-site state.

    Starting from an identity matrix at the top, the state is pushed down
    one layer at a time with `descend_state_1site`; after each step its
    eigenvalues are recorded (cast to the real dtype). The returned list is
    ordered like `isos_012`, i.e. index 0 belongs to layer 0.
    """
    state = tf.eye(isos_012[-1].shape[0], dtype=isos_012[0][0].dtype)
    top_down_specs = []
    for iso_012 in reversed(isos_012):
        iso_021 = tf.transpose(iso_012, (0, 2, 1))
        state = descend_state_1site(state, iso_012, iso_021)
        eigenvalues = tf.linalg.eigvalsh(state)
        eigenvalues = tf.cast(eigenvalues, eigenvalues.dtype.real_dtype)
        top_down_specs.append(eigenvalues)
    # Collected from the top layer downwards; flip to bottom-up order.
    return top_down_specs[::-1]
def Kuu(feat, kern, *, jitter=0.0):
    """Return the inducing-point covariance `Kzz` plus `jitter` on the diagonal.

    Uses the sliced inducing locations and scales of `feat`, the kernel's
    lengthscales and variance, and the feature's own scaled squared-distance
    helper `_cust_square_dist`.
    """
    with params_as_tensors_for(feat, kern):
        Zmu, Zlen = kern._slice(feat.Z, feat.scales)
        combined_sq = tf.square(kern.lengthscales + Zlen)
        pairwise_scale = tf.sqrt(
            tf.expand_dims(combined_sq, 0)
            + tf.expand_dims(combined_sq, 1)
            - tf.square(kern.lengthscales))
        sq_dist = feat._cust_square_dist(Zmu, Zmu, pairwise_scale)
        Kzz = kern.variance * tf.exp(-sq_dist / 2) * tf.reduce_prod(
            kern.lengthscales / pairwise_scale, 2)
        Kzz += jitter * tf.eye(len(feat), dtype=settings.float_type)
        return Kzz
def angular_symmetry(self, d_cutoff, d, atom_numbers, coordinates):
    """ Angular Symmetry Function

    For every atom triple the angle between the two neighbour vectors is
    computed, an angular term and a radial Gaussian are evaluated on a grid
    of (Rs, thetas) values, weighted by both cutoff factors, and summed over
    the two neighbour axes. When `atomic_number_differentiated` is set the
    sum is taken separately for each unordered pair of atom types and the
    results are concatenated.
    """
    max_atoms = self.max_atoms
    one_hot_table = tf.eye(np.max(self.atom_cases) + 1)
    atom_one_hot = tf.nn.embedding_lookup(one_hot_table, atom_numbers)

    Rs = np.linspace(0., self.angular_cutoff, self.angular_length)
    ita = 3 / (Rs[1] - Rs[0])**2
    thetas = np.linspace(0., np.pi, self.angular_length)
    zeta = float(self.angular_length**2)

    def _as_last_axis(values):
        """Flatten a grid into the last axis of a rank-5 float32 tensor."""
        return tf.cast(np.reshape(values, (1, 1, 1, 1, -1)), tf.float32)

    ita, zeta, Rs, thetas = np.meshgrid(ita, zeta, Rs, thetas)
    zeta = _as_last_axis(zeta)
    ita = _as_last_axis(ita)
    Rs = _as_last_axis(Rs)
    thetas = _as_last_axis(thetas)
    length = zeta.get_shape().as_list()[-1]

    def _per_atom(tensor, axis):
        """Repeat `tensor` max_atoms times along a new axis."""
        return tf.stack([tensor] * max_atoms, axis=axis)

    def _per_grid_point(tensor):
        """Repeat `tensor` once per grid point along a new axis 4."""
        return tf.stack([tensor] * length, axis=4)

    vector_distances = _per_atom(coordinates, 1) - _per_atom(coordinates, 2)
    R_ij = _per_atom(d, 3)
    R_ik = _per_atom(d, 2)
    f_R_ij = _per_atom(d_cutoff, 3)
    f_R_ik = _per_atom(d_cutoff, 2)

    # Define angle theta = arccos(R_ij(Vector) dot R_ik(Vector)/R_ij(distance)/R_ik(distance))
    vector_mul = tf.reduce_sum(
        _per_atom(vector_distances, 3) * _per_atom(vector_distances, 2),
        axis=4)
    vector_mul = vector_mul * tf.sign(f_R_ij) * tf.sign(f_R_ik)
    theta = tf.acos(tf.math.divide(vector_mul, R_ij * R_ik + 1e-5))

    R_ij = _per_grid_point(R_ij)
    R_ik = _per_grid_point(R_ik)
    f_R_ij = _per_grid_point(f_R_ij)
    f_R_ik = _per_grid_point(f_R_ik)
    theta = _per_grid_point(theta)

    out_tensor = (tf.pow((1. + tf.cos(theta - thetas)) / 2., zeta)
                  * tf.exp(-ita * tf.square((R_ij + R_ik) / 2. - Rs))
                  * f_R_ij * f_R_ik * 2)

    if not self.atomic_number_differentiated:
        return tf.reduce_sum(out_tensor, axis=(2, 3))

    per_pair = []
    for id_j, atom_type_j in enumerate(self.atom_cases):
        for atom_type_k in self.atom_cases[id_j:]:
            pair_mask = (_per_atom(atom_one_hot[:, :, atom_type_j], 2)
                         * _per_atom(atom_one_hot[:, :, atom_type_k], 1))
            pair_mask = tf.expand_dims(
                tf.expand_dims(pair_mask, axis=1), axis=4)
            per_pair.append(
                tf.reduce_sum(out_tensor * pair_mask, axis=(2, 3)))
    return tf.concat(per_pair, axis=2)
def test_1x3_batch_4x4(self):
    """Every matrix of a [1, 3]-batched tf.eye(4) must equal np.eye(4)."""
    size = 4
    batch_shape = [1, 3]
    dtype = np.float32
    expected = np.eye(size).astype(dtype)
    with self.test_session():
        identity = tf.eye(size, batch_shape=batch_shape, dtype=dtype)
        self.assertAllEqual(batch_shape + [size, size], identity.get_shape())
        identity_value = identity.eval()
        # Visit every batch index pair in row-major order.
        for i, j in np.ndindex(*batch_shape):
            self.assertAllEqual(expected, identity_value[i, j, :, :])
def random_tree_tn_uniform(Ds, dtype, top_rank=1):
    """Return one rank-3 tensor per layer of a tree tensor network.

    `Ds` lists the bond dimension below each layer and `top_rank` is
    appended as the dimension above the last one. A layer whose upper
    dimension equals the square of its lower dimension gets an identity
    matrix; any other layer gets `random_isometry`. Each matrix is reshaped
    to (upper, lower, lower).
    """
    dims = Ds + [top_rank]
    isos = []
    for lower, upper in zip(dims[:-1], dims[1:]):
        if upper == lower**2:
            matrix = tf.eye(upper, dtype=dtype)
        else:
            matrix = random_isometry(upper, lower**2, dtype)
        isos.append(tf.reshape(matrix, (upper, lower, lower)))
    return isos
def _dense_ham_term(H):
    """Assemble a dense rank-4 two-site term from `H = (h1, (h2L, h2R))`.

    The one-site part `h1` is paired with an identity of matching size, and
    each (left, right) pair from `h2L`, `h2R` is added as an outer product
    with the same index layout.
    """
    one_site, (left_factors, right_factors) = H
    identity = tf.eye(one_site.shape[0], dtype=one_site.dtype)
    outer_product = [(-1, -3), (-2, -4)]

    h = tensornetwork.ncon([one_site, identity], outer_product)
    for left, right in zip(left_factors, right_factors):
        h += tensornetwork.ncon([left, right], outer_product)
    return h
def _build_likelihood(self):
    r"""
    Build the tensor for the log marginal likelihood

        \log p(Y | theta).

    The training covariance is the kernel matrix plus the likelihood
    variance on the diagonal; its Cholesky factor is passed to
    `multivariate_normal` and the per-column results are summed.
    """
    K_noisy = self.kern.K(self.X) + tf.eye(
        tf.shape(self.X)[0], dtype=settings.float_type) * self.likelihood.variance
    chol = tf.cholesky(K_noisy)
    prior_mean = self.mean_function(self.X)
    # (R,) log-likelihoods for each independent dimension of Y
    per_output_logpdf = multivariate_normal(self.Y, prior_mean, chol)
    return tf.reduce_sum(per_output_logpdf)
def setup_image_loss(self):
    """Build the image likelihood loss and the segment-length regulariser.

    The predicted curve is resampled with `sample_equdistance`, turned into
    line segments, and rendered as a soft per-pixel foreground probability
    on a 224x224 grid spanning [-6, 6] in both directions. Foreground and
    background colours are each modelled by a full-covariance Gaussian whose
    mean and covariance are probability-weighted image statistics. The loss
    is the negative log of the resulting two-component mixture likelihood.

    Sets: image_pred, norm_rgb, dist, sigma, pix_prob, render,
    prob_positive, prob_negative, image_loss_e, image_loss, reg_loss_e,
    reg_loss.

    NOTE(review): assumes self.rgb is [batch, 224, 224, 3] with values in
    0..255 and self.pred yields [batch, 64, 2] after resampling -- confirm
    against the caller.
    """
    # Pixel-centre coordinates of the render grid.
    pix_x = tf.lin_space(-6.0,6.0,224)
    pix_y = tf.lin_space(-6.0,6.0,224)
    pix_x, pix_y = tf.meshgrid(pix_x, pix_y)
    pix = tf.stack([pix_x,pix_y], axis=-1)

    # batch of [1,63] gaussian distributions in R2, one for each edge
    self.image_pred = - sample_equdistance(self.pred, 64) # annotated configuration need to be negated to match render
    self.norm_rgb = self.rgb/255
    # Midpoint and half-vector of every segment between consecutive points.
    center = (self.image_pred[:,1:,:]+self.image_pred[:,:-1,:]) / 2.0
    diff = (self.image_pred[:,1:,:]-self.image_pred[:,:-1,:]) / 2.0
    # Half-length of each segment.
    dist = tf.norm(diff, axis=2)
    self.dist = dist
    pix = tf.reshape(pix, [224,224,1,1,2]) # to broadcast
    pix_to_center = pix - center
    # perpendicular distance to segments
    # cross(pix-center, diff) / dist
    pix_to_segment_p = (pix_to_center[:,:,:,:,0]*diff[:,:,1]-pix_to_center[:,:,:,:,1]*diff[:,:,0]) / dist
    # longitudinal distance to segments
    # max{ abs[ dot(pix-center, diff) / dist ] - dist, 0 }
    pix_to_segment_l = (pix_to_center[:,:,:,:,0]*diff[:,:,0]+pix_to_center[:,:,:,:,1]*diff[:,:,1]) / dist
    pix_to_segment_l = tf.maximum(tf.abs(pix_to_segment_l)-dist, 0)
    #sigma = tf.constant(0.1, dtype=tf.float32)
    # Trainable width of the soft segment profile, floored at 0.1.
    self.sigma = tf.get_variable('sigma',dtype=tf.float32, shape=(), initializer=tf.constant_initializer(0.1))
    sigma = tf.maximum(self.sigma, 0.1)
    pix_prob = tf.exp((-tf.square(pix_to_segment_p)-tf.square(pix_to_segment_l))/(2*sigma*sigma))
    # NOTE(review): sum_pix_prob is only referenced by the commented-out
    # regulariser below.
    sum_pix_prob = tf.reduce_sum(pix_prob, axis=3)
    # self.reg_loss_e = tf.reduce_mean(sum_pix_prob*sum_pix_prob, axis=[0,1]) - \
    #     tf.reduce_mean(pix_prob*pix_prob, axis=[0,1,3])*tf.cast(tf.shape(pix_prob)[3], tf.float32)
    # self.reg_loss_e = self.reg_loss_e * 10
    # A pixel's foreground probability is that of its best-matching segment.
    pix_prob = tf.reduce_max(pix_prob, axis=3) # shape should be [224,224,batch]
    pix_prob = tf.transpose(pix_prob, perm=[2,0,1]) # [batch, 224,224]
    pix_prob = tf.clip_by_value(pix_prob, 0, 1)
    pix_prob = tf.expand_dims(pix_prob, axis=3)
    self.pix_prob=pix_prob # debug

    # simple case, assume foreground and background color is known
    #pix_positive = tf.constant([1,0,0], dtype=tf.float32) # red pixel
    #pix_negative = tf.constant([1,1,1], dtype=tf.float32) # white pixel

    # harder case, foreground / background color by averaging prediction
    mean_positive = tf.reduce_mean(pix_prob, axis=[1,2], keep_dims=True) # B x 1 x 1 x 1
    mean_negative = tf.reduce_mean(1-pix_prob, axis=[1,2], keep_dims=True)
    # Probability-weighted mean colours.
    pix_positive = tf.reduce_mean(pix_prob*self.norm_rgb, axis=[1,2], keep_dims=True) / mean_positive
    pix_negative = tf.reduce_mean((1-pix_prob)*self.norm_rgb, axis=[1,2], keep_dims=True) / mean_negative

    # render is for visualization, not in loss
    self.render = pix_prob * pix_positive + (1-pix_prob) * pix_negative
    self.render = tf.clip_by_value(self.render, 0, 1)

    mean_positive = tf.squeeze(mean_positive, axis=[1,2,3])
    mean_negative = tf.squeeze(mean_negative, axis=[1,2,3])
    pix_prob = tf.squeeze(pix_prob, axis=3)

    pix_prob = tf.expand_dims(tf.expand_dims(pix_prob, -1), -1)
    # annoying reshaping to broadcast correctly
    mean_positive = tf.expand_dims(tf.expand_dims(mean_positive, -1), -1)
    mean_negative = tf.expand_dims(tf.expand_dims(mean_negative, -1), -1)
    # Probability-weighted colour covariances (outer products of residuals).
    cov_positive = tf.reduce_mean(pix_prob*tf.matmul(
        tf.expand_dims(self.norm_rgb-pix_positive, -1),
        tf.expand_dims(self.norm_rgb-pix_positive, -1), transpose_b=True), axis=[1,2]) / mean_positive
    cov_negative = tf.reduce_mean((1-pix_prob)*tf.matmul(
        tf.expand_dims(self.norm_rgb-pix_negative, -1),
        tf.expand_dims(self.norm_rgb-pix_negative, -1), transpose_b=True), axis=[1,2]) / mean_negative

    # A small multiple of the identity is added to each covariance.
    positive_gaussian = tfp.distributions.MultivariateNormalFullCovariance(
        loc=tf.squeeze(pix_positive, axis=[1,2]),
        covariance_matrix=cov_positive+tf.eye(3)*0.0001)
    negative_gaussian = tfp.distributions.MultivariateNormalFullCovariance(
        loc=tf.squeeze(pix_negative, axis=[1,2]),
        covariance_matrix=cov_negative+tf.eye(3)*0.0001)

    mean_positive = tf.reshape(mean_positive, [1,1,-1])
    mean_negative = tf.reshape(mean_negative, [1,1,-1])
    pix_prob = tf.transpose(tf.squeeze(pix_prob, axis=[3,4]), perm=[1,2,0])
    image_reshape = tf.transpose(self.norm_rgb, perm=[1,2,0,3]) # H x W x B x 3
    prob_positive = mean_positive * positive_gaussian.prob(image_reshape) # H x W x B
    prob_negative = mean_negative * negative_gaussian.prob(image_reshape)
    # Floor both terms before they enter the log below.
    prob_positive = tf.maximum(prob_positive, 1e-30)
    prob_negative = tf.maximum(prob_negative, 1e-30)

    # for debugging
    self.pix_prob=pix_prob
    self.prob_positive=prob_positive
    self.prob_negative=prob_negative

    self.image_loss_e = - tf.reduce_mean(tf.log(pix_prob*prob_positive + (1-pix_prob)*prob_negative), axis=[0,1]) # image loss per instance
    self.image_loss = tf.reduce_mean(self.image_loss_e)

    # Mean squared half-length of the segments, scaled by 4.
    self.reg_loss_e = tf.reduce_mean(dist*dist, axis=1)*4.0 # - tf.reduce_mean(tf.reduce_mean(dist,axis=[1])*tf.reduce_mean(dist,axis=[1]), axis=0)*0.95
    self.reg_loss = tf.reduce_mean(self.reg_loss_e)
def contextual_attention(f, b, mask=None, ksize=3, stride=1, rate=1,
                         fuse_k=3, softmax_scale=10., training=True, fuse=True):
    """ Contextual attention layer implementation.

    Contextual attention is first introduced in publication:
        Generative Image Inpainting with Contextual Attention, Yu et al.

    Args:
        f: Input feature to match (foreground).
        b: Input feature for match (background).
        mask: Input mask for b, indicating patches not available. When None,
            an all-zero mask is used.
        ksize: Kernel size for contextual attention.
        stride: Stride for extracting patches from b.
        rate: Dilation for matching; f, b and mask are downscaled by this
            factor before matching.
        fuse_k: Size of the identity kernel used to fuse scores.
        softmax_scale: Scaled softmax for attention.
        training: Indicating if current graph is training or inference.
            Not read by this function.
        fuse: Whether to run the score-fusing convolutions.

    Returns:
        (y, flow): the reconstructed feature, with the static shape of the
        input f, and the image produced by `flow_to_image_tf` from the
        attention offsets.

    NOTE(review): the batch size of f and b must be statically known, since
    it is used as the split count -- confirm callers guarantee this.
    """
    # get shapes
    raw_fs = tf.shape(f)
    raw_int_fs = f.get_shape().as_list()
    raw_int_bs = b.get_shape().as_list()
    # extract patches from background with stride and rate
    kernel = 2*rate
    raw_w = tf.extract_image_patches(
        b, [1,kernel,kernel,1], [1,rate*stride,rate*stride,1], [1,1,1,1], padding='SAME')
    raw_w = tf.reshape(raw_w, [raw_int_bs[0], -1, kernel, kernel, raw_int_bs[3]])
    raw_w = tf.transpose(raw_w, [0, 2, 3, 4, 1])  # transpose to b*k*k*c*hw
    # downscaling foreground option: downscaling both foreground and
    # background for matching and use original background for reconstruction.
    f = resize(f, scale=1./rate, func=tf.image.resize_nearest_neighbor)
    b = resize(b, to_shape=[int(raw_int_bs[1]/rate), int(raw_int_bs[2]/rate)], func=tf.image.resize_nearest_neighbor)  # https://github.com/tensorflow/tensorflow/issues/11651
    if mask is not None:
        mask = resize(mask, scale=1./rate, func=tf.image.resize_nearest_neighbor)
    fs = tf.shape(f)
    int_fs = f.get_shape().as_list()
    # One foreground tensor per batch element.
    f_groups = tf.split(f, int_fs[0], axis=0)
    # from t(H*W*C) to w(b*k*k*c*h*w)
    bs = tf.shape(b)
    int_bs = b.get_shape().as_list()
    w = tf.extract_image_patches(
        b, [1,ksize,ksize,1], [1,stride,stride,1], [1,1,1,1], padding='SAME')
    w = tf.reshape(w, [int_fs[0], -1, ksize, ksize, int_fs[3]])
    w = tf.transpose(w, [0, 2, 3, 4, 1])  # transpose to b*k*k*c*hw
    # process mask
    if mask is None:
        mask = tf.zeros([1, bs[1], bs[2], 1])
    m = tf.extract_image_patches(
        mask, [1,ksize,ksize,1], [1,stride,stride,1], [1,1,1,1], padding='SAME')
    m = tf.reshape(m, [1, -1, ksize, ksize, 1])
    m = tf.transpose(m, [0, 2, 3, 4, 1])  # transpose to b*k*k*c*hw
    m = m[0]
    # mm is 1 for patches whose mask mean is exactly zero, else 0.
    mm = tf.cast(tf.equal(tf.reduce_mean(m, axis=[0,1,2], keep_dims=True), 0.), tf.float32)
    w_groups = tf.split(w, int_bs[0], axis=0)
    raw_w_groups = tf.split(raw_w, int_bs[0], axis=0)
    y = []
    offsets = []
    k = fuse_k
    scale = softmax_scale
    # Identity kernel: the fuse convolution sums scores along a diagonal.
    fuse_weight = tf.reshape(tf.eye(k), [k, k, 1, 1])
    for xi, wi, raw_wi in zip(f_groups, w_groups, raw_w_groups):
        # conv for compare
        wi = wi[0]
        # Normalise each patch, with a floor on the norm.
        wi_normed = wi / tf.maximum(tf.sqrt(tf.reduce_sum(tf.square(wi), axis=[0,1,2])), 1e-4)
        yi = tf.nn.conv2d(xi, wi_normed, strides=[1,1,1,1], padding="SAME")

        # conv implementation for fuse scores to encourage large patches
        if fuse:
            yi = tf.reshape(yi, [1, fs[1]*fs[2], bs[1]*bs[2], 1])
            yi = tf.nn.conv2d(yi, fuse_weight, strides=[1,1,1,1], padding='SAME')
            yi = tf.reshape(yi, [1, fs[1], fs[2], bs[1], bs[2]])
            # Swap height and width, then fuse again along the other direction.
            yi = tf.transpose(yi, [0, 2, 1, 4, 3])
            yi = tf.reshape(yi, [1, fs[1]*fs[2], bs[1]*bs[2], 1])
            yi = tf.nn.conv2d(yi, fuse_weight, strides=[1,1,1,1], padding='SAME')
            yi = tf.reshape(yi, [1, fs[2], fs[1], bs[2], bs[1]])
            yi = tf.transpose(yi, [0, 2, 1, 4, 3])
        yi = tf.reshape(yi, [1, fs[1], fs[2], bs[1]*bs[2]])

        # softmax to match
        yi *= mm  # mask
        yi = tf.nn.softmax(yi*scale, 3)
        yi *= mm  # mask

        # Index of the best-matching patch, split into two coordinates.
        offset = tf.argmax(yi, axis=3, output_type=tf.int32)
        offset = tf.stack([offset // fs[2], offset % fs[2]], axis=-1)
        # deconv for patch pasting
        # 3.1 paste center
        wi_center = raw_wi[0]
        yi = tf.nn.conv2d_transpose(yi, wi_center, tf.concat([[1], raw_fs[1:]], axis=0), strides=[1,rate,rate,1]) / 4.
        y.append(yi)
        offsets.append(offset)
    y = tf.concat(y, axis=0)
    y.set_shape(raw_int_fs)
    offsets = tf.concat(offsets, axis=0)
    offsets.set_shape(int_bs[:3] + [2])
    # case1: visualize optical flow: minus current position
    h_add = tf.tile(tf.reshape(tf.range(bs[1]), [1, bs[1], 1, 1]), [bs[0], 1, bs[2], 1])
    w_add = tf.tile(tf.reshape(tf.range(bs[2]), [1, 1, bs[2], 1]), [bs[0], bs[1], 1, 1])
    offsets = offsets - tf.concat([h_add, w_add], axis=3)
    # to flow image
    flow = flow_to_image_tf(offsets)
    # # case2: visualize which pixels are attended
    # flow = highlight_flow_tf(offsets * tf.cast(mask, tf.int32))
    if rate != 1:
        flow = resize(flow, scale=rate, func=tf.image.resize_bilinear)
    return y, flow
def _a(self):
    """Return one matrix per action: 5 on the diagonal, 1 elsewhere.

    Every list entry is the same float32 tensor of size
    `_encoding_dim` x `_encoding_dim`.
    """
    identity = tf.eye(self._encoding_dim, dtype=tf.float32)
    shared_matrix = 1.0 + 4.0 * identity
    return [shared_matrix for _ in range(self._num_actions)]
def step_fn(inputs):
    """Per-replica evaluation step for an ensemble of forward passes.

    Runs `model` `FLAGS.ensemble_size` times on the batch, post-processes
    each member's logits with `mean_field_logits`, updates the per-member
    metrics, then updates the ensemble metrics (either `metrics` for the
    'clean' dataset or `corrupt_metrics` otherwise).

    Args:
      inputs: an `(images, labels)` pair.
    """
    images, labels = inputs
    member_logits = []
    member_stddevs = []

    for member in range(FLAGS.ensemble_size):
        outputs = model(images, training=False)
        if isinstance(outputs, tuple):
            # The model returned (logits, covmat); use both.
            logits, covmat = outputs
        else:
            # No covariance returned: fall back to an identity matrix.
            logits = outputs
            covmat = tf.eye(FLAGS.per_core_batch_size)
        if FLAGS.use_bfloat16:
            logits = tf.cast(logits, tf.float32)

        logits = mean_field_logits(
            logits, covmat, mean_field_factor=FLAGS.gp_mean_field_factor)
        member_stddevs.append(tf.sqrt(tf.linalg.diag_part(covmat)))
        member_logits.append(logits)

        member_probs = tf.nn.softmax(logits)
        member_loss = tf.keras.losses.sparse_categorical_crossentropy(
            labels, member_probs)
        metrics['test/nll_member_{}'.format(member)].update_state(
            member_loss)
        metrics['test/accuracy_member_{}'.format(member)].update_state(
            labels, member_probs)

    # Stacked logits are (num_samples, batch_size, num_classes).
    stacked_logits = tf.stack(member_logits, axis=0)
    stacked_stddevs = tf.stack(member_stddevs, axis=0)
    stddev = tf.reduce_mean(stacked_stddevs, axis=0)
    probs = tf.reduce_mean(tf.nn.softmax(stacked_logits), axis=0)

    tiled_labels = tf.broadcast_to(
        labels, [FLAGS.ensemble_size, labels.shape[0]])
    log_likelihoods = -tf.keras.losses.sparse_categorical_crossentropy(
        tiled_labels, stacked_logits, from_logits=True)
    # Negative log of the ensemble-averaged likelihood, averaged over the
    # batch.
    negative_log_likelihood = tf.reduce_mean(
        -tf.reduce_logsumexp(log_likelihoods, axis=[0]) +
        tf.math.log(float(FLAGS.ensemble_size)))

    if dataset_name == 'clean':
        sink = metrics
        nll_key = 'test/negative_log_likelihood'
        accuracy_key = 'test/accuracy'
        ece_key = 'test/ece'
        stddev_key = 'test/stddev'
    else:
        sink = corrupt_metrics
        nll_key = 'test/nll_{}'.format(dataset_name)
        accuracy_key = 'test/accuracy_{}'.format(dataset_name)
        ece_key = 'test/ece_{}'.format(dataset_name)
        stddev_key = 'test/stddev_{}'.format(dataset_name)

    sink[nll_key].update_state(negative_log_likelihood)
    sink[accuracy_key].update_state(labels, probs)
    sink[ece_key].update_state(labels, probs)
    sink[stddev_key].update_state(stddev)
import tensorflow as tf
import tensorflow_probability as tfp
from time import time

# Micro-benchmark: time `n` draws of Gaussian noise shaped like `prev_sample`
# using (1) a TFP multivariate normal and (2) `tf.random.normal`, and print
# the elapsed wall-clock seconds for each.
n_samples = 4096
n_electrons = 4
n = 100  # number of timed draws per method

mu = tf.zeros(3)
sig = 0.02
sigma = tf.eye(3) * sig  # 3x3 matrix with `sig` on the diagonal, passed as the covariance
sig = tf.sqrt(sig)  # rebound to its square root; used as `stddev` in method 2
prev_sample = tf.random.uniform((n_samples, n_electrons, 3))
step_gaussian = tfp.distributions.MultivariateNormalFullCovariance(mu, sigma)

# Method 1: one draw per leading index of `prev_sample`.
# NOTE(review): whether `sample` honours the `dtype` keyword is not visible
# from this file - confirm against the installed TFP version.
t0 = time()
for _ in range(n):
    x = step_gaussian.sample(prev_sample.shape[:-1], dtype=tf.float32)
print(time() - t0)

# Method 2: independent normals with the full shape of `prev_sample`.
t0 = time()
for _ in range(n):
    x = tf.random.normal(prev_sample.shape, stddev=sig)
print(time() - t0)
def grad(dL):
    """Custom gradient for the matching-based loss.

    Reads `pt_out`, `ground_distance`, `match`, `epsilon`, `recon_loss`
    and `x_out` from the enclosing scope. All linear algebra is done in
    float64 and cast back to float32 for the returned gradients.

    Args:
      dL: upstream gradient; it is expanded on trailing axes to scale both
        results.

    Returns:
      A two-element list: the (negated) gradient built from the solution of
      the regularised linear system `H g = f`, and the gradient of
      `recon_loss` with respect to `x_out`, each scaled by `dL`.
    """
    bones = tf.fill(tf.shape(pt_out), np.float64(1.))

    Mnew = tf.cast(tf.transpose(ground_distance, perm=[0, 2, 1]), tf.float64)
    T = tf.cast(tf.transpose(match, perm=[0, 2, 1]), tf.float64)
    Ttilde = T[:, :, :-1]  # drop the last column

    L = T * Mnew
    Ltilde = L[:, :, :-1]

    D1 = tf.linalg.diag(tf.reduce_sum(T, axis=-1))
    # The tiny constant keeps the reciprocal finite when a column sums to 0.
    D2 = tf.linalg.diag(
        1 / (tf.reduce_sum(Ttilde, axis=-2) + np.float64(1e-100)))

    # Shared by H and f, so compute it once.
    Ttilde_D2 = tf.matmul(Ttilde, D2)

    # Add a small diagonal piece to make sure H is invertible in edge cases.
    H = (D1
         - tf.matmul(Ttilde_D2, Ttilde, transpose_b=True)
         + epsilon * tf.eye(num_rows=tf.shape(bones)[-1],
                            batch_shape=[tf.shape(bones)[0]],
                            dtype=tf.float64))

    f = (-tf.reduce_sum(L, axis=-1)
         + tf.squeeze(
             tf.matmul(Ttilde_D2,
                       tf.expand_dims(tf.reduce_sum(Ltilde, axis=-2), -1)),
             axis=-1))

    # Solve H g = f directly instead of forming inv(H) explicitly: same
    # result mathematically, cheaper and numerically better behaved.
    g = tf.squeeze(tf.linalg.solve(H, tf.expand_dims(f, -1)), axis=-1)

    # Subtract the mean of g along the last axis.
    grad_pT = g - bones * tf.expand_dims(
        tf.reduce_sum(g, axis=-1), -1) / tf.cast(tf.shape(bones)[1],
                                                 tf.float64)

    grad_x_out = tf.gradients(recon_loss, x_out)[0]

    return [
        -tf.expand_dims(dL, -1) * tf.cast(grad_pT, tf.float32),
        tf.expand_dims(tf.expand_dims(dL, -1), -1)
        * tf.cast(grad_x_out, tf.float32),
    ]
def call(self, inputs):
    """Pool a graph: embed nodes, assign them to clusters, add two losses.

    Args:
      inputs: `[X, A]` or `[X, A, I]`. When `I` is given and is rank 2 only
        its first column is used.

    Returns:
      A list `[X_pooled, A_pooled]`, followed by the pooled `I` when `I`
      was supplied, followed by the assignment matrix when
      `self.return_mask` is true.

    Side effects:
      Sets `self.reduce_loss` and registers the link-prediction and entropy
      losses through `self.add_loss`.
    """
    if len(inputs) == 3:
        features, adjacency, segment_ids = inputs
        if K.ndim(segment_ids) == 2:
            segment_ids = segment_ids[:, 0]
    else:
        features, adjacency = inputs
        segment_ids = None

    n_nodes = K.shape(adjacency)[-1]

    # Losses are averaged to a scalar only in mixed or batch mode.
    mode = ops.autodetect_mode(adjacency, features)
    self.reduce_loss = mode in (ops.modes['M'], ops.modes['B'])

    adjacency_is_sparse = K.is_sparse(adjacency)

    # Adjacency with the identity added, then normalised.
    if adjacency_is_sparse:
        identity = tf.sparse.eye(n_nodes, dtype=adjacency.dtype)
        with_identity = tf.sparse.add(adjacency, identity)
    else:
        identity = tf.eye(n_nodes, dtype=adjacency.dtype)
        with_identity = adjacency + identity
    fltr = ops.normalize_A(with_identity)

    # Node embeddings.
    embeddings = ops.filter_dot(fltr, K.dot(features, self.kernel_emb))
    if self.activation is not None:
        embeddings = self.activation(embeddings)

    # Cluster assignment matrix; softmax over the last axis.
    assignment = ops.filter_dot(fltr, K.dot(features, self.kernel_pool))
    assignment = activations.softmax(assignment, axis=-1)

    # Link prediction loss: norm of (A - S S^T) over the last two axes.
    gram = ops.matmul_A_BT(assignment, assignment)
    if adjacency_is_sparse:
        residual = tf.sparse.add(adjacency, -gram)
    else:
        residual = adjacency - gram
    link_loss = tf.norm(residual, axis=(-1, -2))
    if self.reduce_loss:
        link_loss = K.mean(link_loss)
    self.add_loss(link_loss)

    # Entropy loss of the assignment rows.
    weighted_log = tf.multiply(assignment, K.log(assignment + K.epsilon()))
    entropy = tf.negative(tf.reduce_sum(weighted_log, axis=-1))
    entropy_loss = K.mean(entropy, axis=-1)
    if self.reduce_loss:
        entropy_loss = K.mean(entropy_loss)
    self.add_loss(entropy_loss)

    # Pooling.
    output = [
        ops.matmul_AT_B(assignment, embeddings),
        ops.matmul_AT_B_A(assignment, adjacency),
    ]

    if segment_ids is not None:
        segment_mean = tf.math.segment_mean(segment_ids, segment_ids)
        output.append(
            ops.repeat(segment_mean, tf.ones_like(segment_mean) * self.k))

    if self.return_mask:
        output.append(assignment)

    return output
def _build(self, x, y):
    """Create the recurrent-network variables, unroll it over time and
    define the losses.

    Args:
      x: input tensor; it is unstacked along axis 1 (one slice per step).
      y: label tensor; it is unstacked along axis 1 as well.

    Sets on `self`: `decay`, `fn`, the weight/bias/mask lists, `Wout`,
    `Wout_bias`, `predictions`, `error_loss`, `activity_loss`,
    `weight_loss`, `total_loss`, `states` and `logits`.
    """
    print('build start')
    opts = self.opts
    loss_from = opts.time_loss_start
    loss_to = opts.time_loss_end
    batch_size = opts.batch_size
    self.decay = opts.dt / opts.tau

    assert opts.activation_fn in [
        'relu', 'tanh', 'relu6', 'retanh', 'sigmoid'
    ], "Invalid nonlinearity"

    def _retanh(L):
        return tf.nn.relu(tf.nn.tanh(L))

    named_activations = {
        'sigmoid': tf.nn.sigmoid,
        'tanh': tf.tanh,
        'relu': tf.nn.relu,
        'relu6': tf.nn.relu6,
    }
    # Anything not in the table ('retanh') uses the rectified tanh.
    self.fn = named_activations.get(opts.activation_fn, _retanh)

    inputs_series = tf.unstack(x, axis=1)
    labels_series = tf.unstack(y, axis=1)

    # NOTE(review): this is the same list object as `opts.layer_size`, so
    # the insert below mutates the options in place - confirm that callers
    # (and `scan_fn`) rely on that before changing it.
    layer_size = opts.layer_size
    layer_size.insert(0, x.shape[-1])

    self.Wxh = []
    self.Whh = []
    self.Wh_bias = []
    self.Whh_mask = []
    self.recurrent_mask = []
    self.forward_mask = []

    for i, (prev, cur) in enumerate(zip(layer_size, layer_size[1:]), start=1):
        self.Wxh.append(
            tf.get_variable(f"input_weights_{i-1}", [prev, cur]))
        self.Whh.append(
            tf.get_variable(f"hidden_weights_{i-1}", [cur, cur]))
        self.Wh_bias.append(
            tf.Variable(tf.zeros([1, cur]), name=f"hidden_bias_{i-1}"))
        # Ones everywhere except the diagonal.
        self.Whh_mask.append(1 - tf.eye(cur))

    self.Wout = tf.get_variable("output_weights",
                                [layer_size[-1], y.shape[-1]])
    self.Wout_bias = tf.Variable(tf.zeros([1, y.shape[-1]]),
                                 name="output_bias")

    # Zero initial state for every layer after the input entry.
    next_state = [
        tf.zeros(shape=[batch_size, width], dtype=tf.float32)
        for width in layer_size[1:]
    ]

    state_series = []
    logit_series = []
    for current_input in inputs_series:
        next_state, next_logit = self.scan_fn(next_state, current_input, opts)
        state_series.append(next_state)
        logit_series.append(next_logit)

    self.predictions = [tf.nn.softmax(logit) for logit in logit_series]

    step_losses = [
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=lab, logits=log)
        for lab, log in zip(labels_series, logit_series)
    ]
    self.error_loss = tf.reduce_mean(step_losses[loss_from:loss_to])

    rnn_activity = tf.stack(
        [tf.stack(list(state), axis=2) for state in state_series], axis=1)
    # Penalise non-zero activity.
    self.activity_loss = opts.activity_alpha * tf.reduce_mean(
        tf.square(rnn_activity))

    recurrent_penalty = tf.reduce_mean(
        [tf.reduce_mean(tf.square(W)) for W in self.Whh])
    input_penalty = tf.reduce_mean(
        [tf.reduce_mean(tf.square(W)) for W in self.Wxh])
    self.weight_loss = opts.weight_alpha * (recurrent_penalty + input_penalty)

    self.total_loss = self.error_loss + self.weight_loss + self.activity_loss

    # NOTE(review): `layer_ix` (cumulative sizes) only determines how many
    # slices are taken; the slice bounds come from `layer_size` itself.
    # That looks unintended - confirm before relying on `self.states`.
    layer_ix = np.cumsum(layer_size)
    self.states = [
        rnn_activity[:, :, layer_size[i]:layer_size[i + 1]]
        for i in range(len(layer_ix) - 1)
    ]
    self.logits = tf.stack(logit_series, axis=1)
def levmarq(settings,
            x_train,
            y_train,
            mu_init=3.0,
            min_error=1e-10,
            max_steps=100,
            mu_multiply=10,
            mu_divide=10,
            m_into_epoch=10,
            verbose=False):
    """Train a small fully connected network with a damped Newton step.

    Each outer step stores the current parameters together with the Hessian
    and negative gradient of the mean-squared loss, then tries up to
    `m_into_epoch` updates `p += inv(H + mu * I) @ (-grad)`. A successful
    try (lower loss) divides `mu` by `mu_divide`; a failed try multiplies it
    by `mu_multiply` and restores the stored parameters.

    Args:
      settings: dict read for the keys "outs", "input_len", "inputs",
        "architecture" (list of hidden widths) and "activation" ("relu",
        "tanh"; anything else selects sigmoid).
      x_train, y_train: arrays fed to the `[input_len, inputs]` and
        `[input_len, outs]` placeholders.
      mu_init: initial damping value.
      min_error: stop once the loss is not above this value.
      max_steps: maximum number of outer steps.
      mu_multiply, mu_divide: damping adaptation factors.
      m_into_epoch: number of tries per outer step.
      verbose: print progress roughly five times over `max_steps`.

    Returns:
      `(history, params)`: the loss after every outer step as a NumPy array
      and the final flat parameter vector.

    The session is always closed and the default graph reset on exit, also
    when an exception escapes.
    """
    outs = settings["outs"]
    m = settings["input_len"]

    print(5 * "=" + ">Training info<" + 5 * "=", "\n")
    print("Settings: ")
    for i in settings:
        print(f" {i}:{settings[i]}")
    print("\ntf version: ", tf.__version__, "\n")
    print(
        f"shape X:\t{x_train.shape}\nshape y:\t{y_train.shape}\n m:\t{m}\n p:\t{outs}"
    )
    print("\n")

    x = tf.compat.v1.placeholder(tf.float64, shape=[m, settings["inputs"]])
    y = tf.compat.v1.placeholder(tf.float64, shape=[m, settings["outs"]])

    # Layer widths: input, hidden layers, output.
    nn = settings["architecture"]
    st = [x_train.shape[-1]] + nn + [y_train.shape[-1]]

    # One weight matrix followed by one bias row per layer.
    shapes = []
    for i in range(len(nn) + 1):
        shapes.append((st[i], st[i + 1]))
        shapes.append((1, st[i + 1]))
    sizes = [h * w for h, w in shapes]
    neurons_cnt = sum(sizes)
    print(
        f"Complex:\n [parameters]x[data lenth]\n {neurons_cnt}x{m}\n"
    )

    # `elif` matters here: with two independent `if`s the "relu" choice was
    # overwritten by the sigmoid fallback.
    if settings["activation"] == "relu":
        activation = tf.nn.relu
    elif settings["activation"] == "tanh":
        activation = tf.nn.tanh
    else:
        activation = tf.nn.sigmoid

    # All parameters live in one flat vector so that a single Hessian and
    # gradient cover the whole network.
    initializer = tf.contrib.layers.xavier_initializer()
    p = tf.Variable(initializer([neurons_cnt], dtype=tf.float64))
    parms = [
        tf.reshape(chunk, shape)
        for chunk, shape in zip(tf.split(p, sizes, 0), shapes)
    ]
    Ws = parms[0::2]
    bs = parms[1::2]

    # Feed forward; the output layer is linear.
    y_hat = x
    for i in range(len(nn)):
        y_hat = activation(tf.matmul(y_hat, Ws[i]) + bs[i])
    y_hat = tf.matmul(y_hat, Ws[-1]) + bs[-1]

    r = y - y_hat
    loss = tf.reduce_mean(tf.square(r))

    # Feed dict mapping placeholders to actual values.
    train_dict = {x: x_train, y: y_train}

    opt = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=1)
    mu = tf.compat.v1.placeholder(tf.float64, shape=[1])

    p_store = tf.Variable(tf.zeros([neurons_cnt], dtype=tf.float64))
    save_parms = tf.compat.v1.assign(p_store, p)
    restore_parms = tf.compat.v1.assign(p, p_store)

    I = tf.eye(neurons_cnt, dtype=tf.float64)

    # NOTE: despite the Gauss-Newton style names, these are the exact
    # Hessian and the negative gradient of the loss.
    jTj = tf.hessians(loss, p)[0]
    jTr = tf.reshape(-tf.gradients(loss, p)[0], shape=(neurons_cnt, 1))

    jTj_store = tf.Variable(
        tf.zeros((neurons_cnt, neurons_cnt), dtype=tf.float64))
    jTr_store = tf.Variable(tf.zeros((neurons_cnt, 1), dtype=tf.float64))
    save_jTj_jTr = [
        tf.compat.v1.assign(jTj_store, jTj),
        tf.compat.v1.assign(jTr_store, jTr)
    ]

    dx = tf.matmul(tf.linalg.inv(jTj_store + tf.multiply(mu, I)), jTr_store)
    dx = tf.squeeze(dx)
    # With learning_rate=1 applying the "gradient" -dx performs p += dx.
    lm = opt.apply_gradients([(-dx, p)])

    # Float dtype is required: mu is rescaled in place with `/=` and `*=`.
    train_dict[mu] = np.array([mu_init], dtype=np.float64)
    # Guard against a zero modulus when max_steps < 5.
    report_every = max(1, int(max_steps / 5))

    history = []
    step = 0
    session = tf.compat.v1.Session()
    try:
        session.run(tf.compat.v1.global_variables_initializer())
        current_loss = session.run(loss, train_dict)

        while current_loss > min_error and step < max_steps:
            step += 1
            if verbose and step % report_every == 0:
                print(
                    f'LM step: {step}, mu: {train_dict[mu][0]:.2e}, current loss: {current_loss:.2e}'
                )

            session.run(save_parms)
            session.run(save_jTj_jTr, train_dict)

            success = False
            for _ in range(m_into_epoch):
                session.run(lm, train_dict)
                new_loss = session.run(loss, train_dict)
                if new_loss < current_loss:
                    train_dict[mu] /= mu_divide
                    current_loss = new_loss
                    success = True
                    break
                train_dict[mu] *= mu_multiply
                session.run(restore_parms)

            history.append(current_loss)

            if not success:
                print(
                    f'LM failed to improve, on step {step:}, loss: {current_loss:.2e}\n'
                )
                return np.asarray(history), session.run(p)

        print(f'LevMarq ended on: {step:},\tfinal loss: {current_loss:.2e}\n')
        return np.asarray(history), session.run(p)
    finally:
        session.close()
        tf.compat.v1.reset_default_graph()
def __init__(self, num_features, l2reg=0.001):
    """Store the configuration and cache an identity matrix.

    Args:
      num_features: stored as `self.num_features`; also the size of the
        identity matrix kept in `self.eye`.
      l2reg: stored as `self.l2reg`.
    """
    self.num_features, self.l2reg = num_features, l2reg
    identity = tf.eye(num_features)
    self.eye = identity
def build_graph(Tlims,
                num_inducing_points=11,
                dim=1,
                alphas_init_val=1,
                gamma_init_val=1.,
                ag_poser='exp',
                m_init_val=0.1,
                lvech_init_val=None,
                stabilizer_value=0.01,
                kzz_stabilizer_value=1e-8,
                optimize_inducing_points=False,
                assert_correct_covariances=False):
    """Build the TensorFlow graph for the variational lower bound.

    Args:
      Tlims: region limits; must have shape `(dim, 2)`. Minimum and maximum
        along axis 1 are used as the integration bounds.
      num_inducing_points: number of inducing points.
      dim: input dimensionality.
      alphas_init_val, gamma_init_val: initial kernel hyperparameters
        (before the positivity transform is inverted).
      ag_poser: positivity transform for alphas/gamma: 'abs', 'square',
        None (identity) or anything else for exp/log.
      m_init_val: initial value of every entry of the variational mean.
      lvech_init_val: currently ignored - the code that used it is disabled
        and the Cholesky entries are always drawn from a normal with
        stddev 0.01.
      stabilizer_value: forwarded to `kl_term`.
      kzz_stabilizer_value: multiple of the identity added to K_zz.
      optimize_inducing_points: when true the inducing locations are a
        tanh-squashed variable instead of a placeholder.
      assert_correct_covariances: when true, build symmetry and
        non-negative-eigenvalue asserts for S.

    Returns:
      The 18-tuple `(lower_bound, merged, Z_ph, u_ph, X_ph, m, S, L_vech,
      interesting_gradient, K_zz_inv, alphas_base, gamma_base, K_zz, omegas,
      covariance_asserts, sig_t_sqr, sigsqr_lmr, logdet_dbg)`.
    """
    # ------------------------------------------------------------------
    # Placeholders
    # ------------------------------------------------------------------
    if not optimize_inducing_points:
        # TODO change back to None
        Z_ph = tf.placeholder(DTYPE, [None, None],
                              name='inducing_point_locations')
    u_ph = tf.placeholder(DTYPE, [], name='inducing_point_mean')
    X_ph = tf.placeholder(DTYPE, [None, None], name='input_data')

    # TODO: set constants as variables and create two optimizers with
    # var_lists to optimize with/without hyperparams
    C = tf.constant(0.57721566490153286, dtype=DTYPE)

    assert (Tlims.shape == (dim, 2))
    Tmins = tf.reduce_min(Tlims, axis=1)
    Tmaxs = tf.reduce_max(Tlims, axis=1)
    assert (Tmins.dtype == DTYPE)
    assert (len(Tmins.shape) == 1)

    # ------------------------------------------------------------------
    # Variables
    # ------------------------------------------------------------------
    if optimize_inducing_points:
        # Optimize the inducing point locations.
        # NOTE(review): the extent of this name scope could not be
        # recovered from the source layout; it only affects op names.
        with tf.name_scope('inducing_point_optimization'):
            omegas_init = (
                tf.random_uniform([num_inducing_points, dim], dtype=DTYPE) -
                0.5) * tf.constant(2, dtype=DTYPE)
            omegas = tf.Variable(omegas_init,
                                 dtype=DTYPE,
                                 name='ind_point_omegas')
            with tf.name_scope('omegas'):
                if dim == 1:
                    for z in range(num_inducing_points):
                        tf.summary.scalar('omegas_{}'.format(z),
                                          tf.squeeze(omegas[z]))
                elif dim == 2:
                    # TODO: add fancy 2d movement as images
                    print('omega 2d treatment not yet implemented')
                else:
                    print(
                        'omega summaries not available for dimensions higher than 2'
                    )
            dim_mean = tf.reduce_mean(Tlims, axis=1)
            dim_shifter = tf.subtract(Tmins, Tmaxs,
                                      name='ind_point_ranges') / 2
            dim_mean = tf.expand_dims(dim_mean, 0)
            dim_shifter = tf.expand_dims(dim_shifter, 0)
            assert (dim_mean.shape == (1, dim))
            assert (dim_shifter.shape == (1, dim))
            # tanh keeps every coordinate between Tmins and Tmaxs.
            Z_ph = tf.subtract(dim_mean,
                               dim_shifter * tf.tanh(omegas),
                               name='inducing_point_locations')
    else:
        omegas = None
    Tlims = tf.cast(Tlims, dtype=DTYPE)

    with tf.name_scope('kernel_hyperparameters'):
        # The poser function makes sure the values for alphas and gamma are
        # non-negative; its inverse maps the requested initial value back
        # to the unconstrained variable.
        if ag_poser == 'abs':
            tf_poser_fun = lambda x: tf.abs(x)
            tf_poser_fun_inv = lambda x: tf.abs(x)
        elif ag_poser == 'square':
            tf_poser_fun = lambda x: tf.square(x)
            tf_poser_fun_inv = lambda x: tf.sqrt(x)
        elif ag_poser is None:
            tf_poser_fun = lambda x: x
            tf_poser_fun_inv = lambda x: x
        else:  # default: exp
            tf_poser_fun = lambda x: tf.exp(x)
            tf_poser_fun_inv = lambda x: tf.log(x)

        # alphas
        alphas_init_val = tf.constant(alphas_init_val, dtype=DTYPE)
        alphas_init = tf.ones(
            [dim], dtype=DTYPE) * tf_poser_fun_inv(alphas_init_val)
        alphas_base = tf.Variable(alphas_init,
                                  name='variational_alphas',
                                  dtype=DTYPE)
        alphas = tf_poser_fun(alphas_base)
        with tf.name_scope('alphas'):
            for a in range(dim):
                tf.summary.scalar('alphas_{}'.format(a), alphas[a])

        # gamma
        gamma_init_val = tf.constant(gamma_init_val, dtype=DTYPE)
        gamma_init_val = tf_poser_fun_inv(gamma_init_val)
        gamma_base = tf.Variable(gamma_init_val,
                                 name='variational_gamma',
                                 dtype=DTYPE)
        gamma = tf_poser_fun(gamma_base)

        tf.summary.scalar('gamma_base', gamma_base)
        tf.summary.scalar('gamma', gamma)
        tf.summary.tensor_summary('alphas_base', alphas_base)
        tf.summary.tensor_summary('alphas', alphas)

    # Kernel at the inducing points, with a small diagonal term added
    # before inversion.
    K_zz = ard_kernel(Z_ph, Z_ph, gamma=gamma, alphas=alphas) + tf.eye(
        num_inducing_points, dtype=DTYPE) * kzz_stabilizer_value
    K_zz_inv = tf.matrix_inverse(K_zz)

    with tf.name_scope('variational_distribution_parameters'):
        # mean
        m_init = tf.ones([num_inducing_points], dtype=DTYPE) * m_init_val
        m = tf.Variable(m_init, name='variational_mean', dtype=DTYPE)

        # Covariance S is parameterised through the vectorised lower
        # triangle of L, with S = L L^T.
        vech_size = tf.cast(
            (num_inducing_points * (num_inducing_points + 1)) / 2, DTYPE_INT)
        vech_indices = tf.transpose(tf_tril_indices(num_inducing_points))

        # An initialisation from `lvech_init_val` (or a unit diagonal when
        # it is None) used to live here but is disabled.
        lvech_init_stddev = 0.01
        L_vech_init = tf.random_normal([vech_size],
                                       stddev=lvech_init_stddev,
                                       dtype=DTYPE)
        L_vech = tf.Variable(L_vech_init, dtype=DTYPE)
        L_shape = tf.constant([num_inducing_points, num_inducing_points])
        L_st = tf.SparseTensor(tf.to_int64(vech_indices), L_vech,
                               tf.to_int64(L_shape))
        # Scatter the vector into a dense lower-triangular matrix.
        L = tf.sparse_add(tf.zeros(L_shape, dtype=DTYPE), L_st)
        S = tf.matmul(L, tf.transpose(L), name='variational_covariance')

    with tf.name_scope('variational_dist_parameters'):
        tf.summary.histogram('mean_at_inducing_points', m)
        tf.summary.histogram('cov_at_inducing_points', S)

    with tf.name_scope('positive_definiteness_check'):
        kzz_eigvals, kzz_eigvecs = tf.linalg.eigh(K_zz)
        S_eigvals, S_eigvecs = tf.linalg.eigh(S)
        tf.summary.histogram('kzz', K_zz)
        tf.summary.histogram('kzz_eigenvalues', kzz_eigvals)
        tf.summary.histogram('S_eigenvalues', S_eigvals)

    with tf.name_scope('integration-over-region-T'):
        with tf.name_scope('psi_matrix'):
            psi_matrix = psi_term(Z_ph, Z_ph, alphas, gamma, Tmins, Tmaxs)
        with tf.name_scope('T_integral'):
            integral_over_T = T_Integral(m, S, K_zz_inv, psi_matrix, gamma,
                                         Tmins, Tmaxs)

    with tf.name_scope('expectation_at_datapoints'):
        with tf.name_scope('mu_and_sig_calculation'):
            mu_t, sig_t_sqr, sigsqr_lmr = mu_tilde_square(
                X_ph, Z_ph, S, m, K_zz_inv, alphas, gamma)
        with tf.name_scope('squaring_that_mu'):
            mu_t_square = tf.square(mu_t)
        exp_term = exp_at_datapoints(mu_t_square, sig_t_sqr, C)

    with tf.name_scope('KL-divergence'):
        kl_term_op, logdet_dbg = kl_term(m, S, K_zz, K_zz_inv, u_ph, L,
                                         stabilizer_value)

    with tf.name_scope('calculate_bound'):
        lower_bound = -integral_over_T + exp_term - kl_term_op

    tf.summary.scalar('variational_lower_bound', tf.squeeze(lower_bound))
    tf.summary.scalar('integral_over_T', tf.squeeze(integral_over_T))
    tf.summary.scalar('exp_term', tf.squeeze(exp_term))
    tf.summary.scalar('kl_div', kl_term_op)

    if assert_correct_covariances:
        # Assert a symmetric, positive semidefinite covariance matrix.
        S_symm_assert = tf.Assert(
            tf.reduce_all(tf.equal(S, tf.transpose(S))), [S])
        S_possemidef_assert = tf.Assert(
            tf.reduce_all(tf.greater_equal(S_eigvals, 0)), [S])
        covariance_asserts = [S_symm_assert, S_possemidef_assert]
    else:
        covariance_asserts = []

    interesting_gradient = tf.gradients(lower_bound, [exp_term])[0]
    merged = tf.summary.merge_all()

    return (lower_bound, merged, Z_ph, u_ph, X_ph, m, S, L_vech,
            interesting_gradient, K_zz_inv, alphas_base, gamma_base, K_zz,
            omegas, covariance_asserts, sig_t_sqr, sigsqr_lmr, logdet_dbg)
def inner_cca_objective(y_true, y_pred):
    """CCA loss: the negative total correlation between two views.

    It is the loss function of CCA as introduced in the original paper;
    there can be other formulations. Implemented on TensorFlow based on
    github@VahidooX's CCA loss for Theano.

    Reads `use_all_singular_values` and `outdim_size` from the enclosing
    scope.

    Args:
      y_true: ignored.
      y_pred: the two network outputs concatenated along axis 1; the first
        half of the columns is view 1, the second half view 2.

    Returns:
      The negated correlation (a scalar tensor), so that minimising it
      maximises correlation.
    """
    r1 = 1e-4
    r2 = 1e-4
    eps = 1e-12
    o1 = o2 = int(y_pred.shape[1] // 2)

    # Unpack (separate) the output of networks for view 1 and view 2;
    # after the transpose each is [dim, num_samples].
    H1 = tf.transpose(y_pred[:, 0:o1])
    H2 = tf.transpose(y_pred[:, o1:o1 + o2])

    m = tf.shape(H1)[1]

    # Centre every feature. Subtracting the row mean is the same as
    # H - (1/m) * H @ ones([m, m]) without building the m-by-m matrix.
    H1bar = H1 - tf.expand_dims(tf.reduce_mean(H1, axis=1), 1)
    H2bar = H2 - tf.expand_dims(tf.reduce_mean(H2, axis=1), 1)

    inv_m_minus_1 = tf.cast(tf.divide(1, m - 1), tf.float32)
    SigmaHat12 = inv_m_minus_1 * tf.matmul(
        H1bar, H2bar, transpose_b=True)  # [dim, dim]
    SigmaHat11 = inv_m_minus_1 * tf.matmul(
        H1bar, H1bar, transpose_b=True) + r1 * tf.eye(o1)
    SigmaHat22 = inv_m_minus_1 * tf.matmul(
        H2bar, H2bar, transpose_b=True) + r2 * tf.eye(o2)

    def _root_inverse(sigma):
        """Inverse matrix square root via eigendecomposition."""
        eigvals, eigvecs = tf.self_adjoint_eig(sigma)
        # Added to increase stability: keep eigenvalues larger than eps.
        positive = tf.where(tf.greater(eigvals, eps))
        eigvals = tf.gather_nd(eigvals, positive)
        # Squeeze only the index axis so a single surviving eigenvalue
        # still yields a rank-1 index vector (and a rank-2 eigvecs).
        eigvecs = tf.transpose(
            tf.nn.embedding_lookup(tf.transpose(eigvecs),
                                   tf.squeeze(positive, axis=1)))
        return tf.matmul(tf.matmul(eigvecs, tf.diag(eigvals**-0.5)),
                         eigvecs,
                         transpose_b=True)  # [dim, dim]

    SigmaHat11RootInv = _root_inverse(SigmaHat11)
    SigmaHat22RootInv = _root_inverse(SigmaHat22)

    Tval = tf.matmul(tf.matmul(SigmaHat11RootInv, SigmaHat12),
                     SigmaHat22RootInv)

    if use_all_singular_values:
        corr = tf.sqrt(tf.trace(tf.matmul(Tval, Tval, transpose_a=True)))
    else:
        [U, V] = tf.self_adjoint_eig(tf.matmul(Tval, Tval, transpose_a=True))
        U = tf.gather_nd(U, tf.where(tf.greater(U, eps)))
        kk = tf.reshape(tf.cast(tf.shape(U), tf.int32), [])
        # Use at most `outdim_size` of the surviving eigenvalues.
        K = tf.minimum(kk, outdim_size)
        w, _ = tf.nn.top_k(U, k=K)
        corr = tf.reduce_sum(tf.sqrt(w))

    return -corr
def eye_diff(x):
    """Subtract `mul` times an identity-shaped matrix from `x`.

    The identity has `shape[0]` rows and `shape[1]` columns, taken from the
    dynamic shape of `x`. `mul` comes from the enclosing scope.
    """
    dims = K.shape(x)
    scaled_identity = mul * tf.eye(dims[0], dims[1])
    return x - scaled_identity
def motion_field_consistency_loss(frame1transformed_pixelxy, mask, rotation1,
                                  translation1, rotation2, translation2):
    """Computes a cycle consistency loss between two motion maps.

    Given two rotation and translation maps (of two frames), and a mapping
    from one frame to the other, this function assists in imposing that the
    fields at frame 1 represent the opposite motion of the ones in frame 2.

    In other words: At any given pixel on frame 1, if we apply the
    translation and rotation designated at that pixel, we land on some pixel
    in frame 2, and if we apply the translation and rotation designated
    there, we land back at the original pixel at frame 1.

    Args:
      frame1transformed_pixelxy: A tf.Tensor of shape [B, H, W, 2]
        representing the motion-transformed location of each pixel in
        frame 1. It is assumed (but not verified) that
        frame1transformed_pixelxy was obtained by properly applying
        rotation1 and translation1 on the depth map of frame 1.
      mask: A tf.Tensor expressing the weight of each pixel in the
        calculation of the consistency loss. NOTE(review): previously
        documented as [B, H, W, 2], but it multiplies a quantity reduced
        over the last axis of the translation field - confirm the intended
        shape.
      rotation1: A tf.Tensor of shape [B, 3] representing rotation angles.
      translation1: A tf.Tensor of shape [B, H, W, 3] representing
        translation vectors.
      rotation2: A tf.Tensor of shape [B, 3] representing rotation angles.
      translation2: A tf.Tensor of shape [B, H, W, 3] representing
        translation vectors.

    Returns:
      A dictionary from string to tf.Tensor, with the following entries:
        rotation_error: A tf scalar, the rotation consistency error.
        translation_error: A tf scalar, the translation consistency error.
    """
    # Sample frame 2's translation field where frame 1's pixels land; no
    # gradient flows through the sampling locations.
    translation2resampled = tf.contrib.resampler.resampler(
        translation2, tf.stop_gradient(frame1transformed_pixelxy))

    rotation1field = tf.broadcast_to(_expand_dims_twice(rotation1, -2),
                                     tf.shape(translation1))
    rotation2field = tf.broadcast_to(_expand_dims_twice(rotation2, -2),
                                     tf.shape(translation2))
    rotation1matrix = transform_utils.matrix_from_angles(rotation1field)
    rotation2matrix = transform_utils.matrix_from_angles(rotation2field)

    # Composition of the two motions; for consistent fields the rotation
    # part should be the identity and the translation part zero.
    rot_unit, trans_zero = transform_utils.combine(rotation2matrix,
                                                   translation2resampled,
                                                   rotation1matrix,
                                                   translation1)
    eye = tf.eye(3, batch_shape=tf.shape(rot_unit)[:-2])

    # We normalize the product of rotations by the product of their norms,
    # to make the loss agnostic of their magnitudes, only wanting them to be
    # opposite in directions. Otherwise the loss has a tendency to drive the
    # rotations to zero.
    rot_error = tf.reduce_mean(tf.square(rot_unit - eye), axis=(3, 4))
    rot1_scale = tf.reduce_mean(tf.square(rotation1matrix - eye), axis=(3, 4))
    rot2_scale = tf.reduce_mean(tf.square(rotation2matrix - eye), axis=(3, 4))
    rot_error /= (1e-24 + rot1_scale + rot2_scale)
    rotation_error = tf.reduce_mean(rot_error)

    def norm(x):
        # Squared L2 norm over the last axis.
        return tf.reduce_sum(tf.square(x), axis=-1)

    # Here again, we normalize by the magnitudes, for the same reason.
    translation_error = tf.reduce_mean(
        mask * norm(trans_zero) /
        (1e-24 + norm(translation1) + norm(translation2)))

    return {
        'rotation_error': rotation_error,
        'translation_error': translation_error
    }
def construct_model(self):
    """Build the whole training graph in `self.sess.graph` and initialise it.

    Creates, in order: the prior tensors/variables, the input placeholders,
    the feature encodings, the flow bijector, the per-model posterior and
    posterior-predictive tensors, the loss, an Adam train op with global
    norm clipping at 5, the summary writer, the merged summaries and the
    saver. Finally runs the global variable initialiser in `self.sess`.
    """
    with self.sess.graph.as_default():
        feature_dim = self.config['nn_layers'][-1]

        # Priors.
        noise_cov = self.sigma_eps * tf.eye(self.y_dim)
        self.SigEps = tf.reshape(noise_cov,
                                 (1, 1, self.y_dim, self.y_dim))
        self.K = tf.get_variable(
            'K_init', shape=[feature_dim, self.y_dim])  # \bar{K}_0
        # Factor of \Lambda_0, initialised to the identity.
        self.L_asym = tf.get_variable('L_asym',
                                      initializer=tf.eye(feature_dim))
        self.L = tf.matmul(self.L_asym,
                           tf.matrix_transpose(self.L_asym))  # \Lambda_0

        # Context points: (M, N_context, x_dim / y_dim).
        self.context_x = tf.placeholder(tf.float32,
                                        shape=[self.M, None, self.x_dim],
                                        name="cx")
        self.context_y = tf.placeholder(tf.float32,
                                        shape=[self.M, None, self.y_dim],
                                        name="cy")

        # Query points: (M, N_test, x_dim / y_dim).
        self.x = tf.placeholder(tf.float32,
                                shape=[self.M, None, self.x_dim],
                                name="x")
        self.y = tf.placeholder(tf.float32,
                                shape=[self.M, None, self.y_dim],
                                name="y")

        # Encode x to phi(x), one model at a time.
        def _encode(points):
            return self.basis(points)

        self.context_phi = tf.map_fn(_encode,
                                     elems=self.context_x,
                                     dtype=tf.float32)
        self.phi = tf.map_fn(_encode, elems=self.x, dtype=tf.float32)

        # Invertible flow network.
        self.flow_bijector = self.build_flow()

        # Number of context points to use per model when computing the
        # posterior; defaults to all of them. Size (M,).
        self.num_models = tf.shape(self.context_y)[0]
        all_context = tf.shape(self.context_y)[1] * tf.ones(
            (self.num_models,), dtype=tf.int32)
        self.max_num_context = all_context
        self.num_context = tf.placeholder_with_default(
            self.max_num_context, shape=(None,))

        # Map (x, y) pairs to the latent space through the inverse flow,
        # conditioned on x.
        def _to_latent(xy):
            return self.flow_bijector.inverse(xy[1], x=xy[0])

        # self.context_z is (M, N_context, y_dim).
        self.context_z = tf.map_fn(_to_latent,
                                   elems=(self.context_x, self.context_y),
                                   dtype=tf.float32)

        # Posteriors.
        def _posterior(args):
            return self.batch_blr(*args)

        self.K_N, self.Linv_N = tf.map_fn(
            _posterior,
            elems=(self.context_phi, self.context_y, self.num_context),
            dtype=(tf.float32, tf.float32))

        # Posterior predictive in latent space.
        self.mu_N = batch_matmul(tf.matrix_transpose(self.K_N), self.phi)
        spread_fac = 1 + batch_quadform(self.Linv_N, self.phi)
        self.Sig_N = tf.expand_dims(spread_fac, axis=-1) * self.SigEps
        self.base = tfd.MultivariateNormalFullCovariance(
            loc=self.mu_N, covariance_matrix=self.Sig_N)

        # Map test data to latent space to evaluate the log likelihood.
        self.z = tf.map_fn(_to_latent,
                           elems=(self.x, self.y),
                           dtype=tf.float32)

        residual = self.z - tf.expand_dims(self.mu_N, axis=1)
        rmse_z = tf.reduce_mean(
            tf.sqrt(tf.reduce_sum(residual**2, axis=-1)))
        tf.summary.scalar("rmse_z", rmse_z)

        def _inverse_log_det(xy):
            return self.flow_bijector.inverse_log_det_jacobian(
                xy[1], event_ndims=1, x=xy[0])

        logdetinvJ = tf.map_fn(_inverse_log_det,
                               elems=(self.x, self.y),
                               dtype=tf.float32)

        # Change-of-variables negative log likelihood.
        self.loss = -self.base.log_prob(self.z) - logdetinvJ
        self.total_loss = tf.reduce_mean(self.loss)
        tf.summary.scalar("loss", self.total_loss)

        # Adam with gradients clipped to a global norm of 5.
        optimizer = tf.train.AdamOptimizer(self.config['learning_rate'])
        grads, variables = zip(*optimizer.compute_gradients(self.total_loss))
        grads, _ = tf.clip_by_global_norm(grads, 5.)
        self.train_op = optimizer.apply_gradients(zip(grads, variables))

        self.train_writer = tf.summary.FileWriter(
            'summaries/' + str(time.time()), self.sess.graph, flush_secs=10)
        self.merged = tf.summary.merge_all()

        self.saver = tf.train.Saver()

        self.sess.run(tf.global_variables_initializer())
def wct(content, style, alpha=1, eps=1e-8):
    '''TensorFlow version of the Whiten-Color Transform.

    Assumes that the content/style encodings have shape 1xHxWxC.
    See p.4 of the Universal Style Transfer paper for the corresponding
    equations: https://arxiv.org/pdf/1705.08086.pdf

    Args:
      content: content encoding, 1xHxWxC.
      style: style encoding, 1xHxWxC.
      alpha: blend weight; 1 returns the fully whiten-colored features, 0
        returns the original content features.
      eps: multiple of the identity added to both covariance matrices.

    Returns:
      The blended features with shape 1xHxWxC (H, W from `content`).
    '''
    # Remove the batch dim and reorder to CxHxW. Only axis 0 is squeezed so
    # that encodings with H == 1 or W == 1 keep their rank.
    content_t = tf.transpose(tf.squeeze(content, axis=0), (2, 0, 1))
    style_t = tf.transpose(tf.squeeze(style, axis=0), (2, 0, 1))

    Cc, Hc, Wc = tf.unstack(tf.shape(content_t))
    Cs, Hs, Ws = tf.unstack(tf.shape(style_t))

    # CxHxW -> CxH*W
    content_flat = tf.reshape(content_t, (Cc, Hc * Wc))
    style_flat = tf.reshape(style_t, (Cs, Hs * Ws))

    # Content covariance.
    # (`keep_dims` was renamed to `keepdims` in TensorFlow.)
    mc = tf.reduce_mean(content_flat, axis=1, keepdims=True)
    fc = content_flat - mc
    fcfc = tf.matmul(fc, fc, transpose_b=True) / (
        tf.cast(Hc * Wc, tf.float32) - 1.) + tf.eye(Cc) * eps

    # Style covariance.
    ms = tf.reduce_mean(style_flat, axis=1, keepdims=True)
    fs = style_flat - ms
    fsfs = tf.matmul(fs, fs, transpose_b=True) / (
        tf.cast(Hs * Ws, tf.float32) - 1.) + tf.eye(Cs) * eps

    # tf.svd is slower on GPU, see
    # https://github.com/tensorflow/tensorflow/issues/13603
    # A NumPy SVD wrapped in tf.py_func is a slower alternative that, per
    # the original author, won't segfault for ill-conditioned matrices.
    with tf.device('/cpu:0'):
        Sc, Uc, _ = tf.linalg.svd(fcfc)
        Ss, Us, _ = tf.linalg.svd(fsfs)

    # Filter small singular values.
    k_c = tf.reduce_sum(tf.cast(tf.greater(Sc, 1e-5), tf.int32))
    k_s = tf.reduce_sum(tf.cast(tf.greater(Ss, 1e-5), tf.int32))

    # Whiten content feature.
    Dc = tf.linalg.diag(tf.pow(Sc[:k_c], -0.5))
    fc_hat = tf.matmul(
        tf.matmul(tf.matmul(Uc[:, :k_c], Dc), Uc[:, :k_c], transpose_b=True),
        fc)

    # Color content with style.
    Ds = tf.linalg.diag(tf.pow(Ss[:k_s], 0.5))
    fcs_hat = tf.matmul(
        tf.matmul(tf.matmul(Us[:, :k_s], Ds), Us[:, :k_s], transpose_b=True),
        fc_hat)

    # Re-center with mean of style.
    fcs_hat = fcs_hat + ms

    # Blend whiten-colored feature with original content feature.
    blended = alpha * fcs_hat + (1 - alpha) * (fc + mc)

    # CxH*W -> CxHxW
    blended = tf.reshape(blended, (Cc, Hc, Wc))
    # CxHxW -> 1xHxWxC
    blended = tf.expand_dims(tf.transpose(blended, (1, 2, 0)), 0)

    return blended
def __call__(self, beta, theta, get_skin=False, name=None):
    """
    Obtain SMPL with shape (beta) & pose (theta) inputs.
    Theta includes the global rotation.

    Args:
      beta: N x 10
      theta: N x 72 (with 3-D axis-angle rep)
      get_skin: if True, also return the posed vertices and the
        per-joint rotation matrices.
      name: optional name for the enclosing tf.name_scope.

    Updates:
      self.J_transformed: N x 24 x 3 joint location after shaping
        & posing with beta and theta

    Returns:
      - joints: N x 19 or 14 x 3 joint locations depending on joint_type
      If get_skin is True, returns the tuple (verts, joints, Rs) instead:
      - verts: N x 6890 x 3
      - joints: as above
      - Rs: N x 24 x 3 x 3 rotation matrices
    """
    with tf.name_scope(name, "smpl_main", [beta, theta]):
        # The batch size is read from the static shape, so it must be
        # known at graph-construction time.
        num_batch = beta.shape[0].value

        # 1. Add shape blend shapes
        # (N x 10) x (10 x 6890*3) = N x 6890 x 3
        v_shaped = tf.reshape(
            tf.matmul(beta, self.shapedirs, name='shape_bs'),
            [-1, self.size[0], self.size[1]]) + self.v_template

        # 2. Infer shape-dependent joint locations.
        Jx = tf.matmul(v_shaped[:, :, 0], self.J_regressor)
        Jy = tf.matmul(v_shaped[:, :, 1], self.J_regressor)
        Jz = tf.matmul(v_shaped[:, :, 2], self.J_regressor)
        J = tf.stack([Jx, Jy, Jz], axis=2)

        # 3. Add pose blend shapes
        # N x 24 x 3 x 3
        Rs = tf.reshape(
            batch_rodrigues(tf.reshape(theta, [-1, 3])), [-1, 24, 3, 3])
        with tf.name_scope("lrotmin"):
            # Ignore global rotation.
            pose_feature = tf.reshape(Rs[:, 1:, :, :] - tf.eye(3),
                                      [-1, 207])

        # (N x 207) x (207, 20670) -> N x 6890 x 3
        v_posed = tf.reshape(
            tf.matmul(pose_feature, self.posedirs),
            [-1, self.size[0], self.size[1]]) + v_shaped

        # 4. Get the global joint location
        self.J_transformed, A = batch_global_rigid_transformation(
            Rs, J, self.parents)

        # 5. Do skinning:
        # W is N x 6890 x 24
        W = tf.reshape(
            tf.tile(self.weights, [num_batch, 1]), [num_batch, -1, 24])
        # (N x 6890 x 24) x (N x 24 x 16)
        T = tf.reshape(
            tf.matmul(W, tf.reshape(A, [num_batch, 24, 16])),
            [num_batch, -1, 4, 4])
        # Append a homogeneous coordinate of 1 to every posed vertex.
        v_posed_homo = tf.concat(
            [v_posed, tf.ones([num_batch, v_posed.shape[1].value, 1])], 2)
        v_homo = tf.matmul(T, tf.expand_dims(v_posed_homo, -1))

        verts = v_homo[:, :, :3, 0]

        # Get cocoplus or lsp joints:
        joint_x = tf.matmul(verts[:, :, 0], self.joint_regressor)
        joint_y = tf.matmul(verts[:, :, 1], self.joint_regressor)
        joint_z = tf.matmul(verts[:, :, 2], self.joint_regressor)
        joints = tf.stack([joint_x, joint_y, joint_z], axis=2)

        if get_skin:
            return verts, joints, Rs
        else:
            return joints
def call(self, x):
    """Return the negated correlation between the two halves of `x`.

    The columns of `x` are split in the middle into two views; their
    regularised covariance matrices are whitened and the correlation is
    accumulated from T = Sigma11^(-1/2) * Sigma12 * Sigma22^(-1/2).
    The result is stored in `self.corr` and its negation is returned.

    Reads `self.use_all_singular_values` and `self.cca_space_dim`.

    NOTE(review): `T` is a module defined outside this block (presumably a
    second tensor backend) and `my_eigen` is an external helper run through
    `tf.py_func` -- confirm both before changing anything here.
    """
    # Regularisation added to the diagonal of each auto-covariance.
    r1 = tf.constant([1e-4])
    r2 = tf.constant([1e-4])
    # Eigenvalues not greater than `eps` are discarded below.
    eps = tf.constant([1e-12])
    # Each view takes half of the columns of x.
    o1 = o2 = tf.shape(x)[1] // 2

    # After the transpose the second axis (size m) indexes the rows of x.
    H1 = T.transpose(x[:, 0:o1])
    H2 = T.transpose(x[:, o1:o1 + o2])

    one = tf.constant([1.0])
    m = tf.shape(H1)[1]
    m_float = tf.cast(m, 'float')

    # subtract the mean value (the mean is taken over the m samples)
    partition = tf.divide(one, m_float)
    H1bar = H1 - partition * tf.matmul(H1, tf.ones([m, m]))
    H2bar = H2 - partition * tf.matmul(H2, tf.ones([m, m]))

    # calculate the auto-covariance and cross-covariance
    partition2 = tf.divide(one, (m_float - 1))
    SigmaHat12 = partition2 * tf.matmul(H1bar, tf.transpose(H2bar))
    SigmaHat11 = partition2 * \
        tf.matmul(H1bar, tf.transpose(H1bar)) + r1 * tf.eye(o1)
    SigmaHat22 = partition2 * \
        tf.matmul(H2bar, tf.transpose(H2bar)) + r2 * tf.eye(o2)

    # calculate the root inverse of covariance matrices by using eigen decomposition
    # presumably my_eigen returns (eigenvalues, eigenvectors) -- TODO confirm
    [D1, V1] = tf.py_func(my_eigen, [SigmaHat11], [tf.float32, tf.float32])
    [D2, V2] = tf.py_func(my_eigen, [SigmaHat22], [tf.float32, tf.float32])

    # for stability: keep only the entries whose eigenvalue exceeds eps
    # NOTE(review): tf.gather selects along axis 0 of V1/V2 (rows); confirm
    # that this matches the eigenvector layout returned by my_eigen.
    D1_indices = tf.where(D1 > eps)
    D1_indices = tf.squeeze(D1_indices)
    V1 = tf.gather(V1, D1_indices)
    D1 = tf.gather(D1, D1_indices)

    D2_indices = tf.where(D2 > eps)
    D2_indices = tf.squeeze(D2_indices)
    V2 = tf.gather(V2, D2_indices)
    D2 = tf.gather(D2, D2_indices)

    # V * diag(D^(-1/2)) * V^T
    pow_value = tf.constant([-0.5])
    SigmaHat11RootInv = tf.matmul(
        tf.matmul(V1, tf.diag(tf.pow(D1, pow_value))), tf.transpose(V1))
    SigmaHat22RootInv = tf.matmul(
        tf.matmul(V2, tf.diag(tf.pow(D2, pow_value))), tf.transpose(V2))

    Tval = tf.matmul(tf.matmul(SigmaHat11RootInv, SigmaHat12),
                     SigmaHat22RootInv)

    if self.use_all_singular_values:
        # all singular values are used to calculate the correlation
        self.corr = tf.trace(T.sqrt(tf.matmul(tf.transpose(Tval), Tval)))
    else:
        # just the top self.cca_space_dim eigenvalues of Tval^T * Tval are used
        TT = tf.matmul(tf.transpose(Tval), Tval)
        U, V = tf.self_adjoint_eig(TT)
        U_sort, _ = tf.nn.top_k(U, self.cca_space_dim)
        self.corr = T.sum(T.sqrt(U_sort))

    # Negated so that minimising the returned value maximises the correlation.
    return -self.corr
def test_box_classification_loss_relative(self):
    """Logits peaking at the wrong class must yield the larger loss.

    The ground-truth class is 1 everywhere; the first set of logits peaks
    at index 1 and the second at index 4, so the second loss has to be
    strictly greater than the first.
    """
    input_fields = standard_fields.InputDataFields
    result_fields = standard_fields.DetectionResultFields

    def ones_column(dtype):
        # [2, 1] tensor filled with ones.
        return tf.reshape(tf.constant([1, 1], dtype=dtype), [2, 1])

    def batch_of_two(tensor):
        # Duplicate a per-example tensor along a new leading batch axis.
        return tf.stack([tensor, tensor], axis=0)

    def logits_from(row):
        # [2, 5] logits with the same row for both voxels.
        return tf.reshape(
            tf.constant([row, row], dtype=tf.float32), [2, 5])

    gt_classes = ones_column(tf.int32)
    gt_length = ones_column(tf.float32)
    gt_height = ones_column(tf.float32)
    gt_width = ones_column(tf.float32)
    gt_center = tf.reshape(
        tf.constant([1, 1, 1, 1, 1, 1], dtype=tf.float32), [2, 3])
    gt_rotation_matrix = tf.tile(
        tf.expand_dims(tf.eye(3), axis=0), [2, 1, 1])
    logits1 = logits_from([-2.0, 2.0, -3.0, -2.0, 0.0])
    logits2 = logits_from([-2.0, 0.0, -3.0, -2.0, 2.0])
    gt_instance_ids = ones_column(tf.int32)

    inputs = {
        input_fields.num_valid_voxels:
            tf.constant([2, 2], dtype=tf.int32),
        input_fields.object_class_voxels:
            batch_of_two(gt_classes),
        input_fields.object_length_voxels:
            batch_of_two(gt_length),
        input_fields.object_height_voxels:
            batch_of_two(gt_height),
        input_fields.object_width_voxels:
            batch_of_two(gt_width),
        input_fields.object_center_voxels:
            batch_of_two(gt_center),
        input_fields.object_rotation_matrix_voxels:
            batch_of_two(gt_rotation_matrix),
        input_fields.object_instance_id_voxels:
            batch_of_two(gt_instance_ids),
    }

    def predictions_with(logits):
        # Predicted boxes equal the ground truth; only the logits vary.
        return {
            result_fields.object_semantic_voxels:
                batch_of_two(logits),
            result_fields.object_length_voxels:
                batch_of_two(gt_length),
            result_fields.object_height_voxels:
                batch_of_two(gt_height),
            result_fields.object_width_voxels:
                batch_of_two(gt_width),
            result_fields.object_center_voxels:
                batch_of_two(gt_center),
            result_fields.object_rotation_matrix_voxels:
                batch_of_two(gt_rotation_matrix),
        }

    outputs1 = predictions_with(logits1)
    outputs2 = predictions_with(logits2)

    loss1 = classification_losses.box_classification_loss(
        inputs=inputs, outputs=outputs1)
    loss2 = classification_losses.box_classification_loss(
        inputs=inputs, outputs=outputs2)
    self.assertGreater(loss2.numpy(), loss1.numpy())
def gauss_kl(q_mu, q_sqrt, K=None, *, K_cholesky=None):
    """
    Compute the KL divergence KL[q || p] between

          q(x) = N(q_mu, q_sqrt^2)
    and
          p(x) = N(0, K)    if K is not None
          p(x) = N(0, I)    if K is None

    We assume L multiple independent distributions, given by the columns of
    q_mu and the first or last dimension of q_sqrt. Returns the *sum* of the
    divergences.

    q_mu is a matrix ([M, L]), each column contains a mean.

    q_sqrt can be a 3D tensor ([L, M, M]), each matrix within is a lower
        triangular square-root matrix of the covariance of q.
    q_sqrt can be a matrix ([M, L]), each column represents the diagonal of a
        square-root matrix of the covariance of q.

    K is the covariance of p (positive-definite matrix).  The K matrix can be
    passed either directly as `K`, or as its Cholesky factor, `K_cholesky`.  In
    either case, it can be a single matrix [M, M], in which case the sum of the
    L KL divergences is computed by broadcasting, or L different covariances
    [L, M, M].

    Note: if no K matrix is given (both `K` and `K_cholesky` are None),
    `gauss_kl` computes the KL divergence from p(x) = N(0, I) instead.

    Args:
        q_mu: means of q, [M, L].
        q_sqrt: square root of the covariance of q, [L, M, M] or [M, L].
        K: optional covariance of p, [M, M] or [L, M, M].
        K_cholesky: optional (keyword-only) Cholesky factor of the covariance
            of p, [M, M] or [L, M, M]; mutually exclusive with `K`.

    Returns:
        Scalar tensor: the sum of the L KL divergences.

    Raises:
        ValueError: if both `K` and `K_cholesky` are given.
    """

    if (K is not None) and (K_cholesky is not None):
        raise ValueError(
            "Ambiguous arguments: gauss_kl() must only be passed one of `K` or `K_cholesky`."
        )

    # "White" means the prior is N(0, I); "diag" means q_sqrt holds only the
    # diagonals of the square-root matrices.
    is_white = (K is None) and (K_cholesky is None)
    is_diag = len(q_sqrt.shape) == 2

    shape_constraints = [
        (q_mu, ["M", "L"]),
        (q_sqrt, (["M", "L"] if is_diag else ["L", "M", "M"])),
    ]
    if not is_white:
        if K is not None:
            shape_constraints.append(
                (K, (["L", "M", "M"] if len(K.shape) == 3 else ["M", "M"])))
        else:
            shape_constraints.append((K_cholesky, (["L", "M", "M"] if len(
                K_cholesky.shape) == 3 else ["M", "M"])))
    tf.debugging.assert_shapes(shape_constraints, message="gauss_kl() arguments")

    M, L = tf.shape(q_mu)[0], tf.shape(q_mu)[1]

    if is_white:
        alpha = q_mu  # [M, L]
    else:
        if K is not None:
            Lp = tf.linalg.cholesky(K)  # [L, M, M] or [M, M]
        elif K_cholesky is not None:
            Lp = K_cholesky  # [L, M, M] or [M, M]

        is_batched = len(Lp.shape) == 3

        # q_mu is rebound here; its number of elements (used for the constant
        # term below) is unchanged by the transpose/expand.
        q_mu = tf.transpose(
            q_mu)[:, :, None] if is_batched else q_mu  # [L, M, 1] or [M, L]
        alpha = tf.linalg.triangular_solve(Lp, q_mu, lower=True)  # [L, M, 1] or [M, L]

    if is_diag:
        Lq = Lq_diag = q_sqrt
        Lq_full = tf.linalg.diag(tf.transpose(q_sqrt))  # [L, M, M]
    else:
        Lq = Lq_full = tf.linalg.band_part(
            q_sqrt, -1, 0)  # force lower triangle # [L, M, M]
        Lq_diag = tf.linalg.diag_part(Lq)  # [L, M]

    # Mahalanobis term: μqᵀ Σp⁻¹ μq
    mahalanobis = tf.reduce_sum(tf.square(alpha))

    # Constant term: - L * M
    constant = -to_default_float(tf.size(q_mu, out_type=tf.int64))

    # Log-determinant of the covariance of q(x):
    logdet_qcov = tf.reduce_sum(tf.math.log(tf.square(Lq_diag)))

    # Trace term: tr(Σp⁻¹ Σq)
    if is_white:
        trace = tf.reduce_sum(tf.square(Lq))
    else:
        if is_diag and not is_batched:
            # K is [M, M] and q_sqrt is [M, L]: fast specialisation
            # Only the diagonal of K⁻¹ is needed because Σq is diagonal.
            LpT = tf.transpose(Lp)  # [M, M]
            Lp_inv = tf.linalg.triangular_solve(Lp,
                                                tf.eye(M, dtype=default_float()),
                                                lower=True)  # [M, M]
            K_inv = tf.linalg.diag_part(
                tf.linalg.triangular_solve(
                    LpT, Lp_inv, lower=False))[:, None]  # [M, M] -> [M, 1]
            trace = tf.reduce_sum(K_inv * tf.square(q_sqrt))
        else:
            # TODO: broadcast instead of tile when tf allows -- tf2.1 segfaults
            # (https://github.com/tensorflow/tensorflow/issues/37584).
            # See https://github.com/GPflow/GPflow/issues/1321
            Lp_full = Lp if is_batched else tf.tile(tf.expand_dims(Lp, 0), [L, 1, 1])
            LpiLq = tf.linalg.triangular_solve(Lp_full, Lq_full, lower=True)
            trace = tf.reduce_sum(tf.square(LpiLq))

    twoKL = mahalanobis + constant - logdet_qcov + trace

    # Log-determinant of the covariance of p(x):
    if not is_white:
        log_sqdiag_Lp = tf.math.log(tf.square(tf.linalg.diag_part(Lp)))
        sum_log_sqdiag_Lp = tf.reduce_sum(log_sqdiag_Lp)
        # If K is [L, M, M], num_latent_gps is no longer implicit, no need to multiply the single kernel logdet
        scale = 1.0 if is_batched else to_default_float(L)
        twoKL += scale * sum_log_sqdiag_Lp

    tf.debugging.assert_shapes(
        [(twoKL, ())], message="gauss_kl() return value")  # returns scalar
    return 0.5 * twoKL
def compute_reward(self, m, s):
    '''
    Reward function, calculating mean and variance of rewards, given
    mean and variance of state distribution, along with the target State
    and a weight matrix.

    Only the leading `self.state_dim` components of the state distribution
    enter the reward; any further components of `m` and `s` are ignored.

    Input m : [1, k]
    Input s : [k, k]

    Output M : [1, 1]
    Output S : [1, 1]
    '''
    # for robot arm: restrict the state distribution to the rewarded
    # components. The width comes from self.state_dim (instead of a
    # hard-coded constant) so it always agrees with the identity matrices
    # built below.
    dim = self.state_dim
    m = m[:, :dim]
    s = s[:dim, :dim]

    eye = tf.eye(dim, dtype=float_type)
    diff = m - self.t
    SW = s @ self.W

    # First moment of the reward.
    iSpW = tf.transpose(
        tf.matrix_solve(eye + SW, tf.transpose(self.W), adjoint=True))
    muR = tf.exp(-diff @ iSpW @ tf.transpose(diff) / 2) / \
        tf.sqrt(tf.linalg.det(eye + SW))

    # Second moment of the reward (same form with 2*SW).
    i2SpW = tf.transpose(
        tf.matrix_solve(eye + 2 * SW, tf.transpose(self.W), adjoint=True))
    r2 = tf.exp(-diff @ i2SpW @ tf.transpose(diff)) / \
        tf.sqrt(tf.linalg.det(eye + 2 * SW))

    # Variance = second moment - mean^2.
    sR = r2 - muR @ muR
    muR.set_shape([1, 1])
    sR.set_shape([1, 1])
    return muR, sR
def testActionBatchWithVariablesAndPolicyUpdate(self, batch_size,
                                                actions_from_reward_layer):
    """After `new_policy.update(policy)` both policies pick the same actions.

    Two policies are built from variables that differ by
    `_POLICY_VARIABLES_OFFSET`; once the second one is updated from the
    first, their actions on a batch must be equal.
    """
    cov_vars, data_vars, count_vars = [], [], []
    shifted_cov_vars, shifted_data_vars, shifted_count_vars = [], [], []

    for arm in range(1, self._num_actions + 1):
        cov_init = arm + 1 + 2 * arm * tf.eye(self._encoding_dim,
                                              dtype=tf.float32)
        cov_vars.append(tf.compat.v2.Variable(cov_init))

        data_init = tf.constant(arm * np.ones(self._encoding_dim),
                                dtype=tf.float32)
        data_vars.append(tf.compat.v2.Variable(data_init))

        count_init = tf.constant([1], dtype=tf.float32)
        count_vars.append(tf.compat.v2.Variable(count_init))

        # Variables for the new policy (they differ by an offset).
        shifted_cov_vars.append(
            tf.compat.v2.Variable(cov_init + _POLICY_VARIABLES_OFFSET))
        shifted_data_vars.append(
            tf.compat.v2.Variable(data_init + _POLICY_VARIABLES_OFFSET))
        shifted_count_vars.append(
            tf.compat.v2.Variable(count_init + _POLICY_VARIABLES_OFFSET))

    def build_policy(cov_matrix, data_vector, num_samples):
        # Both policies share every setting except their variables.
        return neural_linucb_policy.NeuralLinUCBPolicy(
            encoding_network=DummyNet(self._obs_spec),
            encoding_dim=self._encoding_dim,
            reward_layer=get_reward_layer(),
            actions_from_reward_layer=tf.constant(actions_from_reward_layer,
                                                  dtype=tf.bool),
            cov_matrix=cov_matrix,
            data_vector=data_vector,
            num_samples=num_samples,
            epsilon_greedy=0.0,
            time_step_spec=self._time_step_spec)

    policy = build_policy(cov_vars, data_vars, count_vars)
    new_policy = build_policy(shifted_cov_vars, shifted_data_vars,
                              shifted_count_vars)

    # Before the update only shape and dtype are expected to agree.
    action_step = policy.action(
        self._time_step_batch(batch_size=batch_size))
    new_action_step = new_policy.action(
        self._time_step_batch(batch_size=batch_size))
    self.assertEqual(action_step.action.shape,
                     new_action_step.action.shape)
    self.assertEqual(action_step.action.dtype,
                     new_action_step.action.dtype)

    self.evaluate(tf.compat.v1.global_variables_initializer())
    self.evaluate(new_policy.update(policy))

    # After the update the chosen actions themselves must match.
    action_fn = common.function_in_tf1()(policy.action)
    action_step = action_fn(self._time_step_batch(batch_size=batch_size))
    new_action_fn = common.function_in_tf1()(new_policy.action)
    new_action_step = new_action_fn(
        self._time_step_batch(batch_size=batch_size))

    actions_, new_actions_ = self.evaluate(
        [action_step.action, new_action_step.action])
    self.assertAllEqual(actions_, new_actions_)
def test_inv_update_thunks(self):
    """Ensures inverse update ops run once per global_step."""
    with self._graph.as_default(), self.test_session() as sess:
        fisher_estimator = estimator.FisherEstimatorRoundRobin(
            variables=[self.weights],
            layer_collection=self.layer_collection,
            damping=0.2,
            cov_ema_decay=0.0)

        # Build an op that updates exactly one inverse per global step.
        global_step = tf.train.get_or_create_global_step()
        thunk_groups = fisher_estimator.create_ops_and_vars_thunks()
        (cov_variable_thunks, _, inv_variable_thunks,
         inv_update_op_thunks) = thunk_groups

        # Calling the thunks creates the covariance and inverse variables.
        for make_cov_variable in cov_variable_thunks:
            make_cov_variable()
        for make_inv_variable in inv_variable_thunks:
            make_inv_variable()

        inv_matrices = []
        for fisher_factor in self.layer_collection.get_factors():
            inv_matrices.extend(
                fisher_factor._matpower_by_exp_and_damping.values())

        step_cases = []
        for step, update_thunk in enumerate(inv_update_op_thunks):
            step_cases.append((tf.equal(global_step, step), update_thunk))
        inv_update_op = tf.case(step_cases)
        increment_global_step = global_step.assign_add(1)

        sess.run(tf.global_variables_initializer())
        initial_inv_values = sess.run(inv_matrices)

        # Ensure there's one update per inverse matrix. This is true as long
        # as there's no fan-in/fan-out or parameter re-use.
        self.assertEqual(len(inv_matrices), len(inv_update_op_thunks))

        # Test is no-op if only 1 invariance matrix.
        assert len(inv_matrices) > 1

        # Assign each covariance matrix a value other than the identity.
        # This ensures that the inverse matrices are updated to something
        # different as well.
        assign_ops = []
        for fisher_factor in self.layer_collection.get_factors():
            cov_matrix = fisher_factor.get_cov()
            assign_ops.append(
                cov_matrix.assign(2 * tf.eye(int(cov_matrix.shape[0]))))
        sess.run(assign_ops)

        total = len(inv_matrices)
        for steps_done in range(total):
            # Compare new and old inverse values.
            current_inv_values = sess.run(inv_matrices)
            num_unchanged = sum(
                np.allclose(before, after)
                for before, after in zip(initial_inv_values,
                                         current_inv_values))

            # Ensure exactly one inverse matrix changes per step.
            self.assertEqual(num_unchanged, total - steps_done)

            # Run the inverse update op selected by the current global
            # step, then advance the step.
            sess.run(inv_update_op)
            sess.run(increment_global_step)
def construct_model(self):
    """Build the TF graph: priors, placeholders, flow, posterior, loss, training.

    Everything is created inside `self.sess.graph`. The method defines the
    prior tensors, the `context_y` / `y` / `num_context` inputs, the flow
    bijector, the per-model Gaussian posterior, the negative log-likelihood
    loss with an Adam train op, the summaries, the saver, and finally runs
    the global variable initializer.

    Raises:
        ValueError: if `self.sig_0` is a list (no initialisation is defined
            for that case).
    """
    with self.sess.graph.as_default():
        # build priors
        # isinstance() is required here: `self.sig_0 is list` compares with
        # the type object itself and is never true for an actual list.
        if isinstance(self.sig_0, list):
            raise ValueError('need to define inits for this case')

        # V0_inv is parameterised as V0_asym @ V0_asym^T with a trainable
        # V0_asym.
        self.V0_asym = tf.get_variable(
            'V0_asym', initializer=1/self.sig_0*tf.eye(self.y_dim))
        self.V0_inv = self.V0_asym @ tf.transpose(self.V0_asym)

        # S0_inv is fixed (not trainable).
        self.S0_inv = self.sig_0*tf.eye(self.y_dim)
        self.S0 = tf.linalg.inv(self.S0_inv)

        self.mu_0 = tf.constant(self.mu_0, dtype=tf.float32)

        # context_y: points available for context (M, N_context, y_dim)
        self.context_y = tf.placeholder(
            tf.float32, shape=[None, None, self.y_dim], name="cy")

        # y: query points (M, N_test, y_dim)
        self.y = tf.placeholder(
            tf.float32, shape=[None, None, self.y_dim], name="y")

        # build network
        self.flow_bijector = self.build_flow()

        # num_context: number of context points to use when computing the
        # posterior, size (M,). Defaults to all available context points.
        self.num_models = tf.shape(self.context_y)[0]
        self.max_num_context = tf.shape(self.context_y)[1] * tf.ones(
            (self.num_models,), dtype=tf.int32)
        self.num_context = tf.placeholder_with_default(
            self.max_num_context, shape=(None,))

        # map context data to latent space, one model at a time
        self.context_z = tf.map_fn(
            lambda y: self.flow_bijector.inverse(y),
            elems=self.context_y,
            dtype=tf.float32)

        # compute posteriors
        self.mu_N, self.V_N = tf.map_fn(
            lambda x: self.batch_gaussian_update(*x),
            elems=(self.context_z, self.num_context),
            dtype=(tf.float32, tf.float32))

        # posterior base distribution
        self.base = tfd.MultivariateNormalFullCovariance(
            loc=self.mu_N, covariance_matrix=self.V_N + self.S0)
        self.transformed_dist = tfd.TransformedDistribution(
            distribution=self.base, bijector=self.flow_bijector)

        # Swap the first two axes of y before evaluating log_prob.
        y_transposed = tf.transpose(self.y, perm=[1, 0, 2])
        self.loss = -(self.transformed_dist.log_prob(y_transposed))

        self.total_loss = tf.reduce_mean(self.loss)

        optimizer = tf.train.AdamOptimizer(self.config['learning_rate'])
        # Gradients are applied unmodified; gradient clipping
        # (tf.clip_by_global_norm) is currently disabled.
        gs, vs = zip(*optimizer.compute_gradients(self.total_loss))
        self.train_op = optimizer.apply_gradients(zip(gs, vs))

        norm_S0_inv = tf.reduce_mean(
            tf.norm(self.S0_inv, ord='fro', axis=(-2, -1)))
        tf.summary.scalar("norm_S0_inv", norm_S0_inv)
        norm_V0_inv = tf.reduce_mean(
            tf.norm(self.V0_inv, ord='fro', axis=(-2, -1)))
        tf.summary.scalar("norm_V0_inv", norm_V0_inv)

        # Summaries go to a fresh, timestamped directory per call.
        self.train_writer = tf.summary.FileWriter(
            'summaries/'+str(time.time()), self.sess.graph, flush_secs=10)
        self.merged = tf.summary.merge_all()

        self.saver = tf.train.Saver()

        self.sess.run(tf.global_variables_initializer())
def kl_term(m, S, K_zz, K_zz_inv, u_ovln, L, stabilizer_value):
    """Build the KL term 0.5 * (first + second - third + fourth).

    The four summands are
      first  = trace(K_zz_inv @ S)
      second = logdet(K_zz + eps*I) - logdet(S + eps*I), eps = stabilizer_value
      third  = number of entries of m (cast to DTYPE)
      fourth = mean_diff^T @ K_zz_inv @ mean_diff, with mean_diff = u_ovln - m

    Args:
        m: vector whose length gives the dimension used for `third`.
        S: square matrix, used in the trace and log-determinant terms.
        K_zz: square matrix, used in the log-determinant term.
        K_zz_inv: inverse of K_zz as supplied by the caller (it is not
            recomputed here).
        u_ovln: value broadcast against a ones-vector of the length of m.
        L: matrix whose diagonal is returned for inspection only.
        stabilizer_value: scale of the identity added to K_zz and S before
            taking the log-determinant.

    Returns:
        A pair `(kl, [S_logdet, alt_logdet_via_L])`.
        NOTE(review): despite its name, `alt_logdet_via_L` is just
        `tf.diag_part(L)`, not a log-determinant.
    """
    mean_diff = tf.expand_dims(
        u_ovln * tf.ones([tf.shape(m)[0]], dtype=DTYPE) - m, 1)

    first = tf.trace(tf.matmul(K_zz_inv, S), name='kl_first')

    # Log of the determinant ratio. Approaches that were tried:
    # 1. Naive determinants (tf.matrix_determinant): NaN, because the
    #    determinants get very large for big matrices.
    # 2. logdet via Cholesky (used here): the decomposition is not always
    #    possible, so a scaled identity is added to both matrices first.
    # 3. tf.linalg.slogdet: did not seem to have a gradient defined.
    with tf.name_scope('log_of_determinant_ratio'):
        posdef_stabilizer = tf.eye(
            tf.shape(K_zz)[0], dtype=DTYPE) * stabilizer_value
        with tf.name_scope('K_zz_logdet'):
            K_zz_logdet = tf.linalg.logdet(K_zz + posdef_stabilizer)
        with tf.name_scope('S_logdet'):
            S_logdet = tf.linalg.logdet(S + posdef_stabilizer)
        alt_logdet_via_L = tf.diag_part(L)
        second = tf.subtract(K_zz_logdet, S_logdet, name='kl_second')

    # A single cast covers float32 and float64 exactly like the former
    # tf.to_float / tf.to_double pair, and no longer leaves `third`
    # undefined for any other DTYPE.
    third = tf.cast(tf.shape(m)[0], DTYPE, name='kl_third')

    fourth = tf.squeeze(
        tf.matmul(tf.matmul(tf.transpose(mean_diff), K_zz_inv), mean_diff),
        name='kl_fourth')

    return 0.5 * (first + second - third + fourth), [
        S_logdet, alt_logdet_via_L
    ]
def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
                   d_head, d_inner, dropout, dropatt, attn_type,
                   bi_data, initializer, is_training, mem_len=None,
                   inp_q=None, mems=None,
                   same_length=False, clamp_len=-1, untie_r=False,
                   use_tpu=True, input_mask=None,
                   perm_mask=None, seg_id=None, reuse_len=None,
                   ff_activation='relu', target_mapping=None,
                   use_bfloat16=False, scope='transformer', **kwargs):
    """
    Defines a Transformer-XL computation graph with additional
    support for XLNet.

    Args:

    inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
    seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
    input_mask: float32 Tensor in shape [len, bsz], the input mask.
      0 for real tokens and 1 for padding.
    mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
      from previous batches. The length of the list equals n_layer.
      If None, no memory is used.
    perm_mask: float32 Tensor in shape [len, len, bsz].
      If perm_mask[i, j, k] = 0, i attend to j in batch k;
      if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
      If None, each position attends to all the others.
    target_mapping: float32 Tensor in shape [num_predict, len, bsz].
      If target_mapping[i, j, k] = 1, the i-th predict in batch k is
      on the j-th token.
      Only used during pretraining for partial prediction.
      Set to None during finetuning.
    inp_q: float32 Tensor in shape [len, bsz].
      1 for tokens with losses and 0 for tokens without losses.
      Only used during pretraining for two-stream attention.
      Set to None during finetuning.

    n_layer: int, the number of layers.
    d_model: int, the hidden size.
    n_head: int, the number of attention heads.
    d_head: int, the dimension size of each attention head.
    d_inner: int, the hidden size in feed-forward layers.
    ff_activation: str, "relu" or "gelu".
    untie_r: bool, whether to untie the biases in attention.
    n_token: int, the vocab size.
    attn_type: str, "uni" (causal mask) or "bi" (no causal mask).

    is_training: bool, whether in training mode.
    use_tpu: bool, whether TPUs are used.
    use_bfloat16: bool, use bfloat16 instead of float32.
    dropout: float, dropout rate.
    dropatt: float, dropout rate on attention probabilities.

    mem_len: int, the number of tokens to cache.
    reuse_len: int, the number of tokens in the currect batch to be cached
      and reused in the future.
    bi_data: bool, whether to use bidirectional input pipeline.
      Usually set to True during pretraining and False during finetuning.
    clamp_len: int, clamp all relative distances larger than clamp_len.
      -1 means no clamping.
    same_length: bool, whether to use the same attention length for each token.
    initializer: A tf initializer.
    scope: scope name for the computation graph.
    **kwargs: extra keyword arguments (such as init, init_range, init_std
      or summary_type); they are accepted and not used by this function.

    Returns:
      A tuple (output, new_mems, lookup_table, hidden_states, special_out):
      output: the last layer's output after dropout (query stream when
        inp_q is given, content stream otherwise).
      new_mems: list with one cached memory per layer.
      lookup_table: the embedding table returned by embedding_lookup.
      hidden_states: list holding, for every layer, its input transposed
        with perm [1, 0, 2], followed by the final output; entries are
        (content, query) tuples when inp_q is given.
      special_out: the second value returned by rel_multihead_attn for
        layer 0, or None when two-stream attention is used (inp_q given)
        or n_layer is 0.
    """
    tf.logging.info('memory input {}'.format(mems))
    tf_float = tf.bfloat16 if use_bfloat16 else tf.float32
    tf.logging.info('Use float type {}'.format(tf_float))

    new_mems = []
    with tf.variable_scope(scope):
        if untie_r:
            r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
                                       dtype=tf_float, initializer=initializer)
            r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
                                       dtype=tf_float, initializer=initializer)
        else:
            r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
                                       dtype=tf_float, initializer=initializer)
            r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
                                       dtype=tf_float, initializer=initializer)

        bsz = tf.shape(inp_k)[1]
        qlen = tf.shape(inp_k)[0]
        mlen = tf.shape(mems[0])[0] if mems is not None else 0
        klen = mlen + qlen

        ##### Attention mask
        # causal attention mask
        if attn_type == 'uni':
            attn_mask = _create_mask(qlen, mlen, tf_float, same_length)
            attn_mask = attn_mask[:, :, None, None]
        elif attn_type == 'bi':
            attn_mask = None
        else:
            raise ValueError('Unsupported attention type: {}'.format(attn_type))

        # data mask: input mask & perm mask
        if input_mask is not None and perm_mask is not None:
            data_mask = input_mask[None] + perm_mask
        elif input_mask is not None and perm_mask is None:
            data_mask = input_mask[None]
        elif input_mask is None and perm_mask is not None:
            data_mask = perm_mask
        else:
            data_mask = None

        if data_mask is not None:
            # all mems can be attended to
            mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
                                 dtype=tf_float)
            data_mask = tf.concat([mems_mask, data_mask], 1)
            if attn_mask is None:
                attn_mask = data_mask[:, :, :, None]
            else:
                attn_mask += data_mask[:, :, :, None]

        if attn_mask is not None:
            attn_mask = tf.cast(attn_mask > 0, dtype=tf_float)

        if attn_mask is not None:
            # non_tgt_mask equals attn_mask except that every position may
            # always attend to itself (the diagonal is cleared).
            non_tgt_mask = -tf.eye(qlen, dtype=tf_float)
            non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=tf_float),
                                      non_tgt_mask], axis=-1)
            non_tgt_mask = tf.cast(
                (attn_mask + non_tgt_mask[:, :, None, None]) > 0,
                dtype=tf_float)
        else:
            non_tgt_mask = None

        ##### Word embedding
        word_emb_k, lookup_table = embedding_lookup(
            x=inp_k,
            n_token=n_token,
            d_embed=d_model,
            initializer=initializer,
            use_tpu=use_tpu,
            dtype=tf_float,
            scope='word_embedding')

        if inp_q is not None:
            with tf.variable_scope('mask_emb'):
                mask_emb = tf.get_variable('mask_emb', [1, 1, d_model],
                                           dtype=tf_float)
                if target_mapping is not None:
                    word_emb_q = tf.tile(
                        mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
                else:
                    inp_q_ext = inp_q[:, :, None]
                    word_emb_q = \
                        inp_q_ext * mask_emb + (1 - inp_q_ext) * word_emb_k
        output_h = tf.layers.dropout(word_emb_k, dropout, training=is_training)
        if inp_q is not None:
            output_g = tf.layers.dropout(word_emb_q, dropout,
                                         training=is_training)
        else:
            output_g = None

        ##### Segment embedding
        if seg_id is not None:
            if untie_r:
                r_s_bias = tf.get_variable('r_s_bias',
                                           [n_layer, n_head, d_head],
                                           dtype=tf_float,
                                           initializer=initializer)
            else:
                # default case (tie)
                r_s_bias = tf.get_variable('r_s_bias', [n_head, d_head],
                                           dtype=tf_float,
                                           initializer=initializer)

            seg_embed = tf.get_variable('seg_embed',
                                        [n_layer, 2, n_head, d_head],
                                        dtype=tf_float,
                                        initializer=initializer)

            # Convert `seg_id` to one-hot `seg_mat`
            mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)
            cat_ids = tf.concat([mem_pad, seg_id], 0)

            # `1` indicates not in the same segment [qlen x klen x bsz]
            seg_mat = tf.cast(
                tf.logical_not(tf.equal(seg_id[:, None], cat_ids[None, :])),
                tf.int32)
            seg_mat = tf.one_hot(seg_mat, 2, dtype=tf_float)
        else:
            seg_mat = None

        ##### Positional encoding
        pos_emb = relative_positional_encoding(
            qlen, klen, d_model, clamp_len, attn_type, bi_data,
            bsz=bsz, dtype=tf_float)
        pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)

        ##### Attention layers
        if mems is None:
            mems = [None] * n_layer

        hidden_states = []
        # Only the single-stream branch produces `special`; keep the result
        # defined on the two-stream path (and for n_layer == 0) so that the
        # final return cannot fail on an unbound name.
        special_out = None
        for i in range(n_layer):
            # cache new mems
            new_mems.append(_cache_mem(output_h, mems[i], mem_len, reuse_len))

            # segment bias
            if seg_id is None:
                r_s_bias_i = None
                seg_embed_i = None
            else:
                r_s_bias_i = r_s_bias if not untie_r else r_s_bias[i]
                seg_embed_i = seg_embed[i]

            with tf.variable_scope('layer_{}'.format(i)):
                if inp_q is not None:
                    o = tf.transpose(output_h, [1, 0, 2])
                    q = tf.transpose(output_g, [1, 0, 2])
                    hidden_states.append((o, q))
                    output_h, output_g = two_stream_rel_attn(
                        h=output_h,
                        g=output_g,
                        r=pos_emb,
                        r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
                        r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask_h=non_tgt_mask,
                        attn_mask_g=attn_mask,
                        mems=mems[i],
                        target_mapping=target_mapping,
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer)
                    reuse = True
                else:
                    o = tf.transpose(output_h, [1, 0, 2])
                    hidden_states.append(o)
                    reuse = False

                    output_h, special = rel_multihead_attn(
                        h=output_h,
                        r=pos_emb,
                        r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
                        r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask=non_tgt_mask,
                        mems=mems[i],
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer,
                        reuse=reuse)
                    if i == 0:
                        special_out = special

                if inp_q is not None:
                    output_g = positionwise_ffn(
                        inp=output_g,
                        d_model=d_model,
                        d_inner=d_inner,
                        dropout=dropout,
                        kernel_initializer=initializer,
                        activation_type=ff_activation,
                        is_training=is_training)

                output_h = positionwise_ffn(
                    inp=output_h,
                    d_model=d_model,
                    d_inner=d_inner,
                    dropout=dropout,
                    kernel_initializer=initializer,
                    activation_type=ff_activation,
                    is_training=is_training,
                    reuse=reuse)

        if inp_q is not None:
            o = tf.transpose(output_h, [1, 0, 2])
            q = tf.transpose(output_g, [1, 0, 2])
            hidden_states.append((o, q))
            output = tf.layers.dropout(output_g, dropout,
                                       training=is_training)
        else:
            o = tf.transpose(output_h, [1, 0, 2])
            hidden_states.append(o)
            output = tf.layers.dropout(output_h, dropout,
                                       training=is_training)

        return output, new_mems, lookup_table, hidden_states, special_out
def _apply_dense(self, grad, var):
    """Build the update op for a single dense variable.

    The update rule is selected from the variable's name:

    * name contains ``'orthogonal_stiefel'`` and not ``'bias'``: a real
      Cayley-transform step,
    * name contains ``'unitary_stiefel'`` and not ``'bias'``: the complex
      Cayley-transform step; real and imaginary parts are read from and
      written back to indices 0 and 1 of the last axis of ``var``,
    * anything else: ``training_ops.apply_rms_prop``.

    Both Cayley branches compute ``W_new = C W`` with
    ``C = (I + lr/2 * A)^-1 (I - lr/2 * A)`` and ``A = G W^H - W G^H``,
    where ``G`` is the (optionally rms-normalised) gradient.

    Args:
        grad: gradient tensor belonging to ``var``.
        var: the variable to update.

    Returns:
        The op applying the update. For the Cayley branches this groups the
        variable assignment with the assignment of the ``rms`` slot.
    """
    rms = self.get_slot(var, "rms")
    mom = self.get_slot(var, "momentum")
    eps = self.get_slot(var, 'eps')
    tf.summary.scalar('grad_norm', tf.norm(grad))

    is_bias = 'bias' in var.name

    if 'orthogonal_stiefel' in var.name and not is_bias:
        with tf.variable_scope("orthogonal_update"):
            print('Appling an orthogonality preserving step to', var.name)
            # Running average of the squared gradient (RMSprop statistic).
            new_rms = self._decay_tensor * rms \
                + (1. - self._decay_tensor) * tf.square(grad)
            rms_assign_op = tf.assign(rms, new_rms)
            if self._nat_grad_normalization:
                # Normalise with the freshly computed statistic, exactly as
                # the unitary branch does. Reading the `rms` variable here
                # instead is unordered with respect to `rms_assign_op`, so
                # it could yield either the old or the new value.
                grad = grad / (tf.sqrt(new_rms) + eps)

            grad_shape = grad.get_shape().as_list()
            eye = tf.eye(grad_shape[0], dtype=tf.float32)
            G = grad
            W = var
            # Re-project the weights every `_qr_steps` global steps.
            if self._qr_steps is not None:
                W = tf.cond(tf.equal(tf.mod(self._global_step_tensor,
                                            self._qr_steps), 0),
                            lambda: self.re_unitarize(W),
                            lambda: W)
            # A is skew-symmetric by construction (A^T = -A).
            A = tf.matmul(G, tf.transpose(W)) - tf.matmul(W, tf.transpose(G))
            half_lr = self._learning_rate_tensor / 2.0
            cayleyDenom = eye + half_lr * A
            cayleyNumer = eye - half_lr * A
            C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer)
            W_new = tf.matmul(C, W)
            if self._debug:
                self._summary_C(C)
                self._summary_W(W)
            var_update_op = tf.assign(var, W_new)
            return tf.group(var_update_op, rms_assign_op)

    if 'unitary_stiefel' in var.name and not is_bias:
        with tf.variable_scope("unitary_update"):
            print('Appling an unitarity preserving step to', var.name)
            # Running average of the squared gradient (RMSprop statistic).
            new_rms = self._decay_tensor * rms \
                + (1. - self._decay_tensor) * tf.square(grad)
            rms_assign_op = tf.assign(rms, new_rms)
            if self._nat_grad_normalization:
                grad = grad / (tf.sqrt(new_rms) + eps)

            # The complex weight matrix must be square.
            grad_shape = grad.get_shape().as_list()
            assert grad_shape[0] == grad_shape[1]
            eye = tf.eye(grad_shape[0], dtype=tf.complex64)
            # Last axis holds (real, imaginary) parts.
            G = tf.complex(grad[:, :, 0], grad[:, :, 1])
            W = tf.complex(var[:, :, 0], var[:, :, 1])
            # Re-project the weights every `_qr_steps` global steps.
            if self._qr_steps is not None:
                W = tf.cond(tf.equal(tf.mod(self._global_step_tensor,
                                            self._qr_steps), 0),
                            lambda: self.re_unitarize(W),
                            lambda: W)
            # A is skew-Hermitian by construction (A^H = -A).
            A = tf.matmul(G, tf.conj(tf.transpose(W))) \
                - tf.matmul(W, tf.conj(tf.transpose(G)))
            # Lift lr/2 to a complex scalar so it can multiply A.
            learning_rate_scale = tf.complex(
                self._learning_rate_tensor / 2.0,
                tf.zeros_like(self._learning_rate_tensor))
            cayleyDenom = eye + learning_rate_scale * A
            cayleyNumer = eye - learning_rate_scale * A
            C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer)
            W_new = tf.matmul(C, W)
            if self._debug:
                self._summary_C(C)
                self._summary_W(W)
            # Split back into the stacked (real, imaginary) layout of `var`.
            W_array = tf.stack([tf.real(W_new), tf.imag(W_new)], -1)
            var_update_op = tf.assign(var, W_array)
            return tf.group(var_update_op, rms_assign_op)

    # Every other variable: the stock TensorFlow RMSprop kernel.
    print('Appling standard rmsprop to', var.name)
    return training_ops.apply_rms_prop(
        var, rms, mom,
        tf.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        tf.cast(self._decay_tensor, var.dtype.base_dtype),
        tf.cast(self._momentum_tensor, var.dtype.base_dtype),
        tf.cast(self._epsilon_tensor, var.dtype.base_dtype),
        grad, use_locking=False).op
def _summary_C(self, C):
    """Record how far ``C`` is from being unitary/orthogonal.

    Emits the norm of ``I - C^H C`` as a scalar summary named ``'I-C.HC'``;
    the value is zero when ``C`` is exactly unitary (or orthogonal, for a
    real ``C``).

    Args:
        C: matrix to check. Its static shape is unpacked into ``tf.eye``,
            so it is presumably 2-D with fully known dimensions.
    """
    dims = C.get_shape().as_list()
    identity_matrix = tf.eye(*dims, dtype=C.dtype)
    c_adjoint = tf.transpose(tf.conj(C))
    residual = identity_matrix - tf.matmul(c_adjoint, C)
    # Take the real part so a complex-typed norm still gives a real scalar.
    residual_norm = tf.real(tf.norm(residual))
    tf.summary.scalar('I-C.HC', residual_norm)
def identity(length, dtype=tf.float64):
    """Return a ``length`` x ``length`` identity matrix.

    Args:
        length: number of rows (and columns) of the matrix.
        dtype: dtype of the returned tensor; defaults to ``tf.float64``.

    Returns:
        The result of ``tf.eye`` for the given size and dtype.
    """
    identity_matrix = tf.eye(num_rows=length, dtype=dtype)
    return identity_matrix
def __init__(self, inputSize, outputSize):
    """Create a zero weight matrix and a one-hot basis over its entries.

    Attributes set:
        weight: float32 zeros of shape ``(inputSize, outputSize)``.
        nParams: ``tf.size(weight)``, i.e. the number of weight entries
            (kept as a tensor, not a Python int).
        inft: the ``nParams`` x ``nParams`` identity reshaped to
            ``(nParams, inputSize, outputSize)``, so ``inft[k]`` is a
            weight-shaped tensor that is 1 at flat position ``k`` and 0
            elsewhere.

    Args:
        inputSize: first dimension of the weight matrix.
        outputSize: second dimension of the weight matrix.
    """
    weight = tf.zeros((inputSize, outputSize), dtype=tf.float32)
    self.weight = weight

    n_params = tf.size(weight)
    self.nParams = n_params

    # One identity row per parameter, folded back into the weight's shape.
    one_hot_rows = tf.eye(n_params, dtype=tf.float32)
    basis_shape = (n_params, *weight.shape)
    self.inft = tf.reshape(one_hot_rows, basis_shape)