def _sample_conditional(Xnew, feat, kern, f, *, full_cov=False, full_output_cov=False, q_sqrt=None, white=False, num_samples=None):
    """
    Sample from the conditional distribution of a linearly mixed multi-output GP.

    The conditional mean and (diagonal) variance of the independent latent
    processes g are computed first, a sample is drawn from them, and the
    sample, mean and variance are then projected through the mixing matrix
    `kern.W`.

    Only marginal quantities are supported: `full_cov` and `full_output_cov`
    must both be False.

    :return: tuple (f_sample, f_mu, f_var); each is the "np"-indexed einsum
        projection of an "nl"-indexed latent quantity through the
        "pl"-indexed `kern.W`.
    :raises NotImplementedError: if `full_cov` or `full_output_cov` is truthy.
    """
    logger.debug("sample conditional: (MixedKernelSharedMof, MixedKernelSeparateMof), SeparateMixedMok")

    unsupported = (
        (full_cov, "full_cov not yet implemented"),
        (full_output_cov, "full_output_cov not yet implemented"),
    )
    for requested, message in unsupported:
        if requested:
            raise NotImplementedError(message)

    # Reuse the conditional registered for fully independent latent GPs.
    latent_conditional = conditional.dispatch(
        object, SeparateIndependentMof, SeparateIndependentMok, object)
    g_mu, g_var = latent_conditional(
        Xnew, feat, kern, f,
        full_cov=False, full_output_cov=False, q_sqrt=q_sqrt, white=white,
    )  # N x L, N x L
    g_sample = _sample_mvn(g_mu, g_var, "diag", num_samples=num_samples)  # N x L

    with params_as_tensors_for(kern):
        mixing = kern.W
        f_sample = tf.einsum("pl,nl->np", mixing, g_sample)
        f_mu = tf.einsum("pl,nl->np", mixing, g_mu)
        # Marginal variance of W g: because g_var is diagonal, the diagonal of
        # W diag(g_var_n) W^T reduces to  sum_l W_pl * g_var_nl * W_pl.
        f_var = tf.einsum("pl,nl,pl->np", mixing, g_var, mixing)

    return f_sample, f_mu, f_var
def _variance(self):
    """Per-step variance of the observations, treating each step as a mixture
    over hidden states weighted by the marginal hidden-state probabilities.

    Returns a tensor reshaped to batch_shape + [num_steps] + observation_event_shape.
    """
    with tf.control_dependencies(self._runtime_assertions):
        batch_shape = self.batch_shape_tensor()
        event_shape = self._observation_distribution.event_shape_tensor()
        batch_size = tf.reduce_prod(batch_shape)
        event_size = tf.reduce_prod(event_shape)

        # Target shape for per-state statistics:
        #   batch_shape num_states observation_event_shape
        per_state_shape = tf.concat(
            [batch_shape, [self._num_states], event_shape], axis=0)
        # Flattened per-state layout with a broadcastable step axis:
        #   batch_size 1 num_states observation_event_size
        flat_state_shape = [batch_size, 1, self._num_states, event_size]

        # hidden_probs :: num_steps batch_shape num_states
        hidden_probs = self._marginal_hidden_probs()
        # flat_probs :: num_steps batch_size num_states
        flat_probs = tf.reshape(
            hidden_probs, [self._num_steps, batch_size, self._num_states])

        state_means = tf.broadcast_to(
            self._observation_distribution.mean(), per_state_shape)
        flat_means = tf.reshape(state_means, flat_state_shape)

        # mixture_mean :: batch_size num_steps 1 observation_event_size
        mixture_mean = tf.einsum("ijk,jmkl->jiml", flat_probs, flat_means)

        state_variances = tf.broadcast_to(
            self._observation_distribution.variance(), per_state_shape)
        flat_variances = tf.reshape(state_variances, flat_state_shape)

        # For a mixture with probabilities p[i], component means mean[i] and
        # component variances var[i], the mixture variance is
        #
        #   var = sum_i p[i] * ((mean[i] - mean)**2 + var[i])
        #
        # (var[i] enters un-squared, as in the expression below).
        per_state_spread = (flat_means - mixture_mean)**2 + flat_variances
        # flat_variance :: batch_size num_steps observation_event_size
        flat_variance = tf.einsum("ijk,jikl->jil", flat_probs, per_state_spread)

        # result :: batch_shape num_steps observation_event_shape
        result_shape = tf.concat(
            [batch_shape, [self._num_steps], event_shape], axis=0)
        return tf.reshape(flat_variance, result_shape)
def _build_clp_multiplication(self, clp_kernel):
    """
    Apply the complex linear projection to the layer input and return the
    log of its magnitude.

    The last input axis is de-interleaved with stride 2: even indices are used
    as the real part, odd indices as the imaginary part. ``clp_kernel[0]`` and
    ``clp_kernel[1]`` are used as the real and imaginary kernels. The complex
    product is assembled from four real einsum contractions.

    :param clp_kernel: kernel tensor indexed as [2, F, P]. F must equal half of
      the last input dimension and P must equal ``self._nr_of_filters``; both
      conditions are enforced at run time.
    :return: ``safe_log(sqrt(real**2 + imag**2))`` of the projection, indexed "btp"
    """
    from TFUtil import safe_log
    input_placeholder = self.input_data.get_placeholder_as_batch_major()
    input_shape = tf.shape(input_placeholder)
    kernel_shape = tf.shape(clp_kernel)

    # In graph mode an assert op only runs if something depends on it, so the
    # checks are attached to the computation through control dependencies.
    shape_checks = [
        tf.assert_equal(kernel_shape[1], input_shape[2] // 2),
        tf.assert_equal(kernel_shape[2], self._nr_of_filters),
    ]
    with tf.control_dependencies(shape_checks):
        input_real = tf.strided_slice(input_placeholder, [0, 0, 0], input_shape, [1, 1, 2])
        input_imag = tf.strided_slice(input_placeholder, [0, 0, 1], input_shape, [1, 1, 2])
        # Use the kernel that was passed in (and validated above) rather than
        # reaching for an attribute that may differ from it.
        kernel_real = clp_kernel[0, :, :]
        kernel_imag = clp_kernel[1, :, :]
        # (a + ib)(c + id) = (ac - bd) + i(bc + ad)
        output_real = (tf.einsum('btf,fp->btp', input_real, kernel_real)
                       - tf.einsum('btf,fp->btp', input_imag, kernel_imag))
        output_imag = (tf.einsum('btf,fp->btp', input_imag, kernel_real)
                       + tf.einsum('btf,fp->btp', input_real, kernel_imag))
        output_uncompressed = tf.sqrt(tf.pow(output_real, 2) + tf.pow(output_imag, 2))
        output_compressed = safe_log(output_uncompressed)
    return output_compressed
def time_distributed_dense_layer(inputs, output_units, bias=True, activation=None, dropout=None, scope='time-distributed-dense-layer', reuse=False):
    """
    Projects every timestep of a sequence tensor through one shared dense layer,
    mapping [batch_size, max_seq_len, input_units] to
    [batch_size, max_seq_len, output_units].

    Args:
        inputs: Tensor of shape [batch size, max sequence length, input_units].
        output_units: Number of output units.
        bias: If truthy, a learned 'biases' vector is added after the projection.
        activation: Optional callable applied to the projected tensor.
        dropout: Dropout keep prob, passed to tf.nn.dropout; skipped when falsy.
        scope: Variable scope in which 'weights' and 'biases' are created.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Tensor of shape [batch size, max sequence length, output_units].

    """
    with tf.variable_scope(scope, reuse=reuse):
        kernel = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(inputs, -1), output_units],
        )
        # Contract the feature axis of every (batch, step) pair with the kernel.
        projected = tf.einsum('ijk,kl->ijl', inputs, kernel)

        if bias:
            offset = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[output_units],
            )
            projected = projected + offset

        if activation:
            projected = activation(projected)
        if dropout:
            projected = tf.nn.dropout(projected, dropout)
        return projected
def dense_word_embedding_from_chars(chars, embed_dim, bias=True, scope='dense-word-embed', reuse=False):
    """
    Builds word embeddings by applying a shared dense transformation to every
    character vector and max-pooling the result over the character axis.

    Args:
        chars: Tensor of shape [batch_size, word sequence length, char sequence length, alphabet size].
        embed_dim: Dimension of word embeddings.  Integer.
        bias: If truthy, a learned 'biases' vector is added before pooling.
        scope: Variable scope in which 'weights' and 'biases' are created.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Sequence of embedding vectors.  Tensor of shape [batch_size, word sequence length, embed_dim].

    """
    with tf.variable_scope(scope, reuse=reuse):
        char_features = tf.cast(chars, tf.float32)
        kernel = tf.get_variable(
            name='weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(char_features, -1), embed_dim],
        )
        per_char = tf.einsum('ijkl,lm->ijkm', char_features, kernel)
        if bias:
            offset = tf.get_variable(
                name='biases',
                initializer=tf.constant_initializer(),
                shape=[embed_dim],
            )
            per_char = per_char + offset
        # Axis 2 is the character axis; pooling over it leaves one vector per word.
        return tf.reduce_max(per_char, 2)
def maxpool_attentive_matching(a, b, a_lengths, b_lengths, max_seq_len, attention_func=dot_attention, attention_func_kwargs=None):
    """
    Matches each vector in a with a vector created by maxpooling over the weighted vectors in b.
    The weightings are determined by the attention matrix.  The attention matrix is
    computed using attention_func.

    Args:
        a: Input sequence a.  Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b.  Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a.  Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b.  Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b.  Integer.
        attention_func: Function used to calculate attention matrix.  Can be one of the following:
            multiplicative_attention, additive_attention, concat_attention, dot_attention,
            or cosine_attention.
        attention_func_kwargs: Optional dict of keyword arguments to pass to
            attention_func.  None (the default) means no extra arguments.

    Returns:
        Tensor of shape [batch_size, max_seq_len, input_size] consisting of the matching vectors for
        each timestep in a.

    """
    # A dict literal as a default would be a single object shared by all calls.
    if attention_func_kwargs is None:
        attention_func_kwargs = {}
    attn = attention_func(a, b, a_lengths, b_lengths, max_seq_len, **attention_func_kwargs)
    # weighted[i, j, k, :] = attn[i, j, k] * b[i, k, :]; the max is taken over k.
    weighted = tf.einsum('ijk,ikl->ijkl', attn, b)
    return tf.reduce_max(weighted, axis=2)
def _expectation(p, mean1, none1, mean2, none2, nghp=None):
    """
    Compute the expectation:
    expectation[n] = <m1(x_n)^T m2(x_n)>_p(x_n)
        - m1(.), m2(.) :: Linear mean functions

    With m_k(x) = A_k^T x + b_k the product expands into four terms, which are
    evaluated separately from the second moment E[x x^T] = cov + mu mu^T and
    the mean mu of p.

    :return: NxQ1xQ2
    """
    with params_as_tensors_for(mean1), params_as_tensors_for(mean2):
        mu = p.mu
        second_moment = p.cov + (mu[:, :, None] * mu[:, None, :])  # NxDxD

        quadratic = tf.einsum("iq,nij,jz->nqz", mean1.A, second_moment, mean2.A)  # NxQ1xQ2
        linear_with_b2 = tf.einsum("iq,ni,z->nqz", mean1.A, mu, mean2.b)  # NxQ1xQ2
        linear_with_b1 = tf.einsum("q,ni,iz->nqz", mean1.b, mu, mean2.A)  # NxQ1xQ2
        constant = mean1.b[:, None] * mean2.b[None, :]  # Q1xQ2

        return quadratic + linear_with_b2 + linear_with_b1 + constant
def additive_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150, scope='additive-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*a_i + W*b_j)).  v is a learnable vector and W is a learnable
    matrix shared between a and b. The rows of attn are softmax normalized.

    Args:
        a: Input sequence a.  Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b.  Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a.  Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b.  Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b.  Integer.
        hidden_units: Number of hidden units.  Integer.
        scope: Variable scope name.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Attention matrix.  Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        # Both projections share the 'dense' weights: created on the first
        # call, reused on the second.
        proj_a = time_distributed_dense_layer(a, hidden_units, bias=False, scope='dense', reuse=False)
        proj_b = time_distributed_dense_layer(b, hidden_units, bias=False, scope='dense', reuse=True)
        # Insert singleton axes so the sum below broadcasts over all (i, j) pairs.
        proj_a = tf.expand_dims(proj_a, 2)
        proj_b = tf.expand_dims(proj_b, 1)
        score_vector = tf.get_variable(
            name='dot_weights',
            initializer=tf.variance_scaling_initializer(),
            shape=[hidden_units],
        )
        pair_hidden = tf.nn.tanh(proj_a + proj_b)
        scores = tf.einsum('ijkl,l->ijk', pair_hidden, score_vector)
        # Subtract the row maximum before exponentiating.
        row_max = tf.reduce_max(scores, axis=2)
        scores = scores - tf.expand_dims(row_max, 2)
        weights = tf.exp(scores)
        weights = mask_attention_weights(weights, a_lengths, b_lengths, max_seq_len)
        row_total = tf.reduce_sum(weights, axis=2) + 1e-10
        return weights / tf.expand_dims(row_total, 2)
def test_invalid(self):
    """Every equation in `self.invalid_cases` must make tf.einsum raise ValueError."""
    for equation in self.invalid_cases:
        # Two float32 placeholders of shape (3, 4) serve as operands.
        operands = [tf.placeholder(tf.float32, shape=(3, 4)) for _ in range(2)]
        with self.assertRaises(ValueError):
            _ = tf.einsum(equation, *operands)
def test_dim_mismatch(self):
    """Each (equation, shapes) pair in `self.dim_mismatch_cases` must make
    tf.einsum raise ValueError."""
    for equation, operand_shapes in self.dim_mismatch_cases:
        operands = []
        for operand_shape in operand_shapes:
            operands.append(tf.placeholder(tf.float32, shape=operand_shape))
        with self.assertRaises(ValueError):
            _ = tf.einsum(equation, *operands)
def lookahead(self, t, z_prev):
    """Compute the 'lookahead' distribution, p(x_{t:T} | z_{t-1}).

    The mean and covariance are obtained by Gaussian conditioning on the
    scalar latent z_{t-1}, using the precomputed `sigma_zx`, `sigma_z` and
    `sigma_x` matrices.

    Args:
      t: A scalar Tensor int, the current timestep. Must be at least 1.
      z_prev: The latent state at time t-1. A Tensor of shape [batch_size].
    Returns:
      p(x_{t:T} | z_{t-1}) as a multivariate normal distribution.
    """
    z_prev = tf.convert_to_tensor(z_prev)
    prev = t - 1
    # Cross-covariance between z_{t-1} and the remaining observations x_{t:T}.
    cross_cov = self.sigma_zx[prev, t:]
    latent_var = self.sigma_z[prev, prev]
    cond_mean = tf.einsum("i,j->ij", z_prev, cross_cov) / latent_var
    explained = tf.einsum("i,j->ij", cross_cov, cross_cov) / latent_var
    cond_cov = self.sigma_x[t:, t:] - explained
    return tfd.MultivariateNormalFullCovariance(
        loc=cond_mean, covariance_matrix=cond_cov)
def not_fully_connected_layer(inputs, segment_count, segment_dim, num_kernels, nonlinearity=tf.nn.relu):
    """Apply one shared affine map to every segment of each input row.

    Each row of `inputs` is split into `segment_count` segments of size
    `segment_dim`. Every segment is multiplied by the same
    [segment_dim, num_kernels] weight matrix and shifted by the same bias.
    The per-segment outputs are concatenated back into one row and passed
    through `nonlinearity`.

    The batch size is inferred from `inputs`, so any batch size is accepted.

    Args:
        inputs: Tensor reshapeable to [batch, segment_count, segment_dim].
        segment_count: Number of segments per row.
        segment_dim: Number of features per segment.
        num_kernels: Number of output units produced for each segment.
        nonlinearity: Callable applied to the flattened affine output.

    Returns:
        Tuple (outputs, weights): outputs has shape
        [batch, segment_count * num_kernels]; weights is the shared weight
        variable.
    """
    # NOTE(review): the second positional argument of tf.Variable is
    # `trainable`, not `name`, so these strings do not name the variables.
    # Left as-is because renaming would change variable names in existing
    # checkpoints.
    weights = tf.Variable(
        tf.truncated_normal(
            [segment_dim, num_kernels],
            stddev=2. / (num_kernels + segment_dim) ** 0.5),
        'weights')
    biases = tf.Variable(tf.zeros([num_kernels]), 'biases')
    # -1 lets TensorFlow infer the batch dimension instead of assuming 50 rows.
    segmented = tf.reshape(inputs, [-1, segment_count, segment_dim])
    affine = tf.einsum('ijk,kl->ijl', segmented, weights) + biases
    flattened = tf.reshape(affine, [-1, segment_count * num_kernels])
    outputs = nonlinearity(flattened)
    return outputs, weights
def test_dim_mismatch(self):
    """Each (equation, shapes) pair in `self.dim_mismatch_cases` must make
    tf.einsum raise AssertionError instead of returning a tensor."""
    for equation, operand_shapes in self.dim_mismatch_cases:
        operands = []
        for operand_shape in operand_shapes:
            operands.append(tf.placeholder(tf.float32, shape=operand_shape))
        outcome = None
        try:
            outcome = tf.einsum(equation, *operands)
        except AssertionError:
            # Expected: the mismatch was rejected, so `outcome` stays None.
            pass
        assert outcome is None, "An exception should have been thrown."
def test_input_is_placeholder(self):
    """tf.einsum must accept placeholders whose contracted dimension is unknown
    at graph-construction time and produce the matrix product when fed."""
    with tf.Graph().as_default():
        row = tf.placeholder(tf.int32, shape=(1, None))
        column = tf.placeholder(tf.int32, shape=(None, 1))
        product = tf.einsum('ij,jk->ik', row, column)
        feeds = {
            row: [[1, 2, 3]],
            column: [[2], [1], [1]],
        }
        with tf.Session() as sess:
            actual = sess.run(product, feed_dict=feeds)
            # 1*2 + 2*1 + 3*1 == 7
            np.testing.assert_almost_equal([[7]], actual)
def __call__(self, inputs, state, scope=None):
    """Run one decoding step of the copy-augmented cell.

    Combines the wrapped cell's generation scores with copy scores computed
    against the encoder states, and returns the summed scores together with
    the next wrapper state.

    Args:
        inputs: Step input; concatenated along axis 1 with the selective read
            vector before being passed to the wrapped cell.
        state: A `CopyNetWrapperState` carrying `cell_state`, `last_ids` and
            `prob_c` from the previous step.
        scope: Forwarded to the wrapped cell.

    Returns:
        Tuple `(outputs, state)`: `outputs` is the sum of the padded generation
        scores and the scattered copy scores; `state` is the new
        `CopyNetWrapperState`.

    Raises:
        TypeError: If `state` is not a `CopyNetWrapperState`.
    """
    if not isinstance(state, CopyNetWrapperState):
        raise TypeError("Expected state to be instance of CopyNetWrapperState. "
                        "Received type %s instead." % type(state))
    last_ids = state.last_ids
    prob_c = state.prob_c
    cell_state = state.cell_state

    # 1.0 at encoder positions whose input id equals the previously emitted id.
    mask = tf.cast(tf.equal(tf.expand_dims(last_ids, 1), self._encoder_input_ids), tf.float32)
    mask_sum = tf.reduce_sum(mask, axis=1)
    # Normalise each row by its match count; rows with (near) zero matches are
    # kept as-is instead of being divided.
    mask = tf.where(tf.less(mask_sum, 1e-7), mask, mask / tf.expand_dims(mask_sum, 1))
    rou = mask * prob_c
    # Weighted sum of encoder states over the position axis.
    selective_read = tf.einsum("ijk,ij->ik", self._encoder_states, rou)
    inputs = tf.concat([inputs, selective_read], 1)
    outputs, cell_state = self._cell(inputs, cell_state, scope)
    generate_score = self._projection(outputs)

    # Copy score per encoder position: tanh(encoder_state . copy_weight) . cell output.
    copy_score = tf.einsum("ijk,km->ijm", self._encoder_states, self._copy_weight)
    copy_score = tf.nn.tanh(copy_score)
    copy_score = tf.einsum("ijm,im->ij", copy_score, outputs)
    encoder_input_mask = tf.one_hot(self._encoder_input_ids, self._vocab_size)
    # Multiplies each copy score by the sum of its one-hot row.
    expanded_copy_score = tf.einsum("ijn,ij->ij", encoder_input_mask, copy_score)

    # The raw scores are used directly; the joint softmax below is disabled.
    prob_g = generate_score
    prob_c = expanded_copy_score
    # mixed_score = tf.concat([generate_score, expanded_copy_score], 1)
    # probs = tf.nn.softmax(mixed_score)
    # prob_g = probs[:, :self._gen_vocab_size]
    # prob_c = probs[:, self._gen_vocab_size:]

    # Scatter per-position copy scores onto vocabulary ids (summing over positions).
    prob_c_one_hot = tf.einsum("ijn,ij->in", encoder_input_mask, prob_c)
    # Right-pad the generation scores with zeros up to the full vocabulary size.
    prob_g_total = tf.pad(prob_g, [[0, 0], [0, self._vocab_size - self._gen_vocab_size]])
    outputs = prob_c_one_hot + prob_g_total
    last_ids = tf.argmax(outputs, axis=-1, output_type=tf.int32)
    #prob_c.set_shape([None, self._encoder_state_size])
    state = CopyNetWrapperState(cell_state=cell_state, last_ids=last_ids, prob_c=prob_c)
    return outputs, state
def cl_loss_from_embedding(self,embedded,return_intermediate=False):
    """Build the typing prediction and its loss from embedded input.

    The BiLSTM output is flattened to rows of size 2*rnn_size and one all-zero
    row is appended. The mention representation is the L2-normalised sum of
    the rows selected by `self.entMentIndex`. The left and right context
    representations are attention-weighted sums of the rows selected by
    `self.entCtxLeftIndex` / `self.entCtxRightIndex`. The attention scores are
    dot products with the mention representation, softmax-normalised over the
    context positions.

    Args:
        embedded: Input passed to `self.layers['BiLSTM']`.
        return_intermediate: Unused; kept for interface compatibility.

    Returns:
        Tuple (prediction, loss): the sigmoid of the summed mention and context
        logits, and the mean 'figer' classification loss against
        `self.dense_outputdata`.
    """
    # NOTE(review): the device scope is assumed to cover the whole body -- confirm.
    with tf.device('/gpu:1'):
        output,_ = self.layers['BiLSTM'](embedded)
        flat_output = tf.reshape(output,[-1,2*self.args.rnn_size])
        zero_row = tf.constant(np.zeros((1,2*self.args.rnn_size),dtype=np.float32))
        output = tf.concat([flat_output, zero_row],0)

        input_f1 = tf.nn.l2_normalize(tf.reduce_sum(tf.nn.embedding_lookup(output,self.entMentIndex),1),1)

        f2_temp = tf.nn.embedding_lookup(output,self.entCtxLeftIndex)
        f3_temp = tf.nn.embedding_lookup(output,self.entCtxRightIndex)

        # Scores have shape [batch, context_len, 1] (batch matrix multiplication).
        ment_query = tf.expand_dims(input_f1,-1)
        f2_scores = tf.einsum('aij,ajk->aik', f2_temp, ment_query)
        f3_scores = tf.einsum('aij,ajk->aik', f3_temp, ment_query)

        # Normalise over the context positions. A softmax over the trailing
        # axis (size 1) would yield weights that are all exactly 1.0, i.e. no
        # attention at all, so the singleton axis is dropped for the softmax
        # and restored afterwards.
        # NOTE(review): positions that point at the appended zero row get
        # score 0 and therefore a non-zero weight -- confirm this is acceptable.
        f2_atten = tf.expand_dims(tf.nn.softmax(f2_scores[:,:,0]),-1)
        f3_atten = tf.expand_dims(tf.nn.softmax(f3_scores[:,:,0]),-1)

        # Attention-weighted sum of the context vectors: [batch, 2*rnn_size].
        input_f2 = tf.einsum('aij,ajk->aik',tf.transpose(f2_temp,[0,2,1]),f2_atten)[:,:,0]
        input_f3 = tf.einsum('aij,ajk->aik',tf.transpose(f3_temp,[0,2,1]),f3_atten)[:,:,0]

        # Formatted so the output is the same under Python 2 and Python 3.
        print('f2_input: %s' % (input_f2,))
        print('f3_input: %s' % (input_f3,))

        input_ctx = tf.concat([input_f2,input_f3],1)

        if self.args.dropout:  # dropout is applied to the pooled features
            input_f1 = tf.nn.dropout(input_f1,self.keep_prob)
            input_ctx = tf.nn.dropout(input_ctx,self.keep_prob)

        prediction_l1_ment = self.layers['fullyConnect_ment'](input_f1,activation_fn=None)
        prediction_ment = tf.matmul(prediction_l1_ment,self.hier)
        print('ment: %s' % (prediction_ment,))

        prediction_ctx = self.layers['fullyConnect_ctx'](input_ctx,activation_fn=None)
        print('ctx: %s' % (prediction_ctx,))

        prediction = tf.nn.sigmoid(prediction_ment + prediction_ctx)

        loss = tf.reduce_mean(layers_lib.classification_loss('figer',self.dense_outputdata,prediction))
        return prediction,loss
def concat_attention(a, b, a_lengths, b_lengths, max_seq_len, hidden_units=150, scope='concat-attention', reuse=False):
    """
    For sequences a and b of lengths a_lengths and b_lengths, computes an attention matrix attn,
    where attn(i, j) = dot(v, tanh(W*[a_i; b_j])).  v is a learnable vector and W is a learnable
    matrix.  The rows of attn are softmax normalized.

    Args:
        a: Input sequence a.  Tensor of shape [batch_size, max_seq_len, input_size].
        b: Input sequence b.  Tensor of shape [batch_size, max_seq_len, input_size].
        a_lengths: Lengths of sequences in a.  Tensor of shape [batch_size].
        b_lengths: Lengths of sequences in b.  Tensor of shape [batch_size].
        max_seq_len: Length of padded sequences a and b.  Integer.
        hidden_units: Number of hidden units.  Integer.
        scope: Variable scope name.
        reuse: Forwarded to tf.variable_scope.

    Returns:
        Attention matrix.  Tensor of shape [batch_size, max_seq_len, max_seq_len].

    """
    with tf.variable_scope(scope, reuse=reuse):
        # Singleton axes are inserted before concatenating the two sequences
        # along the feature axis.
        a_rows = tf.expand_dims(a, 2)
        b_cols = tf.expand_dims(b, 1)
        pairs = tf.concat([a_rows, b_cols], axis=3)
        projection = tf.get_variable(
            name='matmul_weights',
            initializer=tf.contrib.layers.variance_scaling_initializer(),
            shape=[shape(pairs, -1), hidden_units],
        )
        pair_hidden = tf.einsum('ijkl,lm->ijkm', pairs, projection)
        score_vector = tf.get_variable(
            name='dot_weights',
            initializer=tf.ones_initializer(),
            shape=[hidden_units],
        )
        scores = tf.einsum('ijkl,l->ijk', tf.nn.tanh(pair_hidden), score_vector)
        # Subtract the row maximum before exponentiating.
        row_max = tf.reduce_max(scores, axis=2)
        scores = scores - tf.expand_dims(row_max, 2)
        weights = tf.exp(scores)
        weights = mask_attention_weights(weights, a_lengths, b_lengths, max_seq_len)
        row_total = tf.reduce_sum(weights, axis=2) + 1e-10
        return weights / tf.expand_dims(row_total, 2)
def __call__(self, x, stop_params_gradient=False, is_eval=True, ensemble_idxs=None, pre_expanded=None, reduce_mode="none"):
    """Evaluate a subset of the ensemble's networks on `x`.

    Args:
        x: Input tensor whose last dimension is `self.in_size`. When
            `pre_expanded` is true it is reshaped to
            [-1, ensemble_sample_n, in_size], i.e. it already carries one slice
            per ensemble member; otherwise it is tiled across members.
        stop_params_gradient: If true, weights and biases are wrapped in
            `tf.stop_gradient`.
        is_eval: Chooses between `self.eval_sample_count` and
            `self.train_sample_count` when members are sampled here.
        ensemble_idxs: Optional indices of the members to use. When None, a
            random subset is drawn via a shuffled range.
        pre_expanded: Defaults to `ensemble_idxs is not None`.
        reduce_mode: "none" keeps the ensemble axis, "random" selects one
            member at random per leading index, "mean" averages over the
            ensemble axis.

    Returns:
        The network output reshaped according to the input's leading
        dimensions, `self.out_shape`, and `reduce_mode`.

    Raises:
        Exception: If `reduce_mode` is not one of the three supported values.
    """
    if pre_expanded is None:
        pre_expanded = ensemble_idxs is not None
    if ensemble_idxs is None:
        # Sample members without replacement: shuffle all indices, keep a prefix.
        ensemble_idxs = tf.random_shuffle(tf.range(self.ensemble_size))
        ensemble_sample_n = self.eval_sample_count if is_eval else self.train_sample_count
        ensemble_idxs = ensemble_idxs[:ensemble_sample_n]
    else:
        # Here the member count is a tensor, not a Python int.
        ensemble_sample_n = tf.shape(ensemble_idxs)[0]

    # Select the chosen members' parameters; biases get a leading broadcast axis.
    weights = [tf.gather(w, ensemble_idxs, axis=0) for w in self.weights]
    biases = [tf.expand_dims(tf.gather(b, ensemble_idxs, axis=0),0) for b in self.biases]

    original_shape = tf.shape(x)
    if pre_expanded:
        h = tf.reshape(x, [-1, ensemble_sample_n, self.in_size])
    else:
        h = tf.tile(tf.reshape(x, [-1, 1, self.in_size]), [1, ensemble_sample_n, 1])

    for layer_i in range(self.layers):
        # ReLU on every layer except the last, which uses the configured nonlinearity.
        nonlinearity = tf.nn.relu if layer_i + 1 < self.layers else self.final_nonlinearity
        # 'bri,rij->brj': each ensemble member r applies its own weight matrix.
        if stop_params_gradient:
            h = nonlinearity(tf.einsum('bri,rij->brj', h, tf.stop_gradient(weights[layer_i])) + tf.stop_gradient(biases[layer_i]))
        else:
            h = nonlinearity(tf.einsum('bri,rij->brj', h, weights[layer_i]) + biases[layer_i])

    # Restore the input's leading dimensions (plus an ensemble axis when the
    # input was tiled here).
    if pre_expanded:
        if len(self.out_shape) > 0:
            h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant(self.out_shape)], -1))
        else:
            h = tf.reshape(h, original_shape[:-1])
    else:
        # NOTE(review): tf.constant([ensemble_sample_n]) presumably needs a
        # Python int here, which holds when members were sampled above --
        # confirm for callers passing ensemble_idxs with pre_expanded=False.
        if len(self.out_shape) > 0:
            h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant([ensemble_sample_n]), tf.constant(self.out_shape)], -1))
        else:
            h = tf.reshape(h, tf.concat([original_shape[:-1], tf.constant([ensemble_sample_n])], -1))

    if reduce_mode == "none":
        pass
    elif reduce_mode == "random":
        # Multiply by a random one-hot over the ensemble axis and sum, which
        # keeps exactly one member per index of the first dimension.
        if len(self.out_shape) > 0:
            h = tf.reduce_sum(
                h * tf.reshape(tf.one_hot(tf.random_uniform([tf.shape(h)[0]], 0, ensemble_sample_n, dtype=tf.int64), ensemble_sample_n), tf.concat([tf.shape(h)[:1], tf.ones_like(tf.shape(h)[1:-2]), tf.constant([ensemble_sample_n]), tf.constant([1])], 0)),
                -2)
        else:
            h = tf.reduce_sum(
                h * tf.reshape(tf.one_hot(tf.random_uniform([tf.shape(h)[0]], 0, ensemble_sample_n, dtype=tf.int64), ensemble_sample_n), tf.concat([tf.shape(h)[:1], tf.ones_like(tf.shape(h)[1:-1]), tf.constant([ensemble_sample_n])], 0)),
                -1)
    elif reduce_mode == "mean":
        if len(self.out_shape) > 0:
            h = tf.reduce_mean(h, -2)
        else:
            h = tf.reduce_mean(h, -1)
    else:
        raise Exception("use a valid reduce mode: none, random, or mean")

    return h
def test_invalid(self):
    """Every equation in `self.invalid_cases` must make tf.einsum raise
    AssertionError; the error is printed and no tensor may be returned."""
    for equation in self.invalid_cases:
        outcome = None
        operands = [tf.placeholder(tf.float32, shape=(3, 4)) for _ in range(2)]
        try:
            outcome = tf.einsum(equation, *operands)
        except AssertionError as error:
            # Expected path: show why the equation was rejected.
            print(error)
        assert outcome is None, \
            "An exception should have been thrown."
def fit(self, x=None, y=None):
    """Compute the posterior over coefficients and store it on the instance.

    Sets `self.coeffs_precision_tril_op` (lower-triangular operator built from
    the Cholesky factor of the posterior precision) and `self.coeffs_mean`.
    Returns None.
    """
    # p(coeffs | x, y) = Normal(coeffs |
    #   mean = (1/noise_variance) (1/noise_variance x^T x + I)^{-1} x^T y,
    #   covariance = (1/noise_variance x^T x + I)^{-1})
    # TODO(trandustin): We newly fit the data at each call. Extend to do
    # Bayesian updating.
    scaled_gram = tf.matmul(x, x, transpose_a=True) / self.noise_variance
    # Adding 1 to the diagonal gives the precision (1/noise_variance x^T x + I).
    precision = tf.matrix_set_diag(
        scaled_gram, tf.matrix_diag_part(scaled_gram) + 1.)
    precision_tril = tf.linalg.cholesky(precision)
    tril_op = tf.linalg.LinearOperatorLowerTriangular(precision_tril)
    self.coeffs_precision_tril_op = tril_op

    # Solve precision @ m = x^T y with two triangular solves (L, then L^T).
    xty = tf.einsum('nm,n->m', x, y)
    forward = tril_op.solvevec(xty)
    self.coeffs_mean = tril_op.solvevec(forward, adjoint=True) / self.noise_variance
    # TODO(trandustin): To be fully Keras-compatible, return History object.
    return
def __init__(self, sess, config, name, is_train):
    """Build the model graph, summaries and saver, then initialise or restore variables.

    Args:
        sess: TensorFlow session used for initialisation, restore and the
            summary writer's graph.
        config: Configuration object; this method reads `batch_size`,
            `im_size`, `ClusterNo`, `lr` and `logs_dir` from it.
        name: Stored on the instance as `self.name`.
        is_train: When truthy, an Adam optimiser and a training op are created.
    """
    self.sess = sess
    self.name = name
    self.is_train = is_train

    # Input image placeholder with three channels.
    self.X_hsd = tf.placeholder(tf.float32, shape=[config.batch_size, config.im_size, config.im_size, 3], name="original_color_image")

    # Split off the first channel; the remaining two (h_s) are not used below.
    self.D, h_s = tf.split(self.X_hsd,[1,2], axis=3)

    # The E-step network maps the first channel to self.Gama.
    self.E_Step = CNN("E_Step", config, is_train=self.is_train)
    self.Gama = self.E_Step(self.D)
    self.loss, self.Mu, self.Std = GMM_M_Step(self.X_hsd, self.Gama, config.ClusterNo, name='GMM_Statistics')

    if self.is_train:
        # Only the E-step network's parameters are optimised.
        self.optim = tf.train.AdamOptimizer(config.lr)
        self.train = self.optim.minimize(self.loss, var_list=self.E_Step.Param)

    # Index of the largest self.Gama entry along axis 3, as float, for visualisation.
    ClsLbl = tf.arg_max(self.Gama, 3)
    ClsLbl = tf.cast(ClsLbl, tf.float32)

    # NOTE(review): the table has six rows, so config.ClusterNo is presumably
    # expected to be at most 6 -- confirm.
    ColorTable = [[255,0,0],[0,255,0],[0,0,255],[255,255,0], [0,255,255], [255,0,255]]
    colors = tf.cast(tf.constant(ColorTable), tf.float32)
    # Start from the label replicated over three channels, then replace each
    # label value k with colour k.
    Msk = tf.tile(tf.expand_dims(ClsLbl, axis=3),[1,1,1,3])
    for k in range(0, config.ClusterNo):
        # Constant image filled with colour k (ones times the colour row).
        ClrTmpl = tf.einsum('anmd,df->anmf', tf.expand_dims(tf.ones_like(ClsLbl), axis=3), tf.reshape(colors[k,...],[1,3]))
        Msk = tf.where(tf.equal(Msk,k), ClrTmpl, Msk)

    self.X_rgb = utils.HSD2RGB(self.X_hsd)
    tf.summary.image("1.Input_image", self.X_rgb*255.0, max_outputs=2)
    tf.summary.image("2.Gamma_image", Msk, max_outputs=2)
    tf.summary.image("3.Density_image", self.D*255.0, max_outputs=2)
    tf.summary.scalar("loss", self.loss)

    self.summary_op = tf.summary.merge_all()
    self.saver = tf.train.Saver()
    self.summary_writer = tf.summary.FileWriter(config.logs_dir, self.sess.graph)

    # Initialise all variables first; a checkpoint found in logs_dir then
    # overwrites them.
    self.sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(config.logs_dir)
    if ckpt and ckpt.model_checkpoint_path:
        self.saver.restore(self.sess, ckpt.model_checkpoint_path)
        print("Model restored...")
def call(self, inputs):
    """Return the predictive Normal over mean(ynew) for each row of `inputs`.

    Uses the prior when neither `coeffs_mean` nor `coeffs_precision_tril_op`
    has been set, and the fitted posterior otherwise.
    """
    fitted = (self.coeffs_mean is not None
              or self.coeffs_precision_tril_op is not None)
    if fitted:
        # p(mean(ynew) | xnew, x, y) = Normal(ynew |
        #   mean = xnew (1/noise_variance) (1/noise_variance x^T x + I)^{-1}x^T y,
        #   variance = xnew (1/noise_variance x^T x + I)^{-1} xnew^T)
        tril_op = self.coeffs_precision_tril_op
        predictive_mean = tf.einsum('nm,m->n', inputs, self.coeffs_mean)
        # Two triangular solves apply the inverse precision to xnew^T.
        forward = tril_op.solve(inputs, adjoint_arg=True)
        precision_inv_xt = tril_op.solve(forward, adjoint=True)
        predictive_covariance = tf.matmul(inputs, precision_inv_xt)
        predictive_variance = tf.diag_part(predictive_covariance)
    else:
        # p(mean(ynew) | xnew) = Normal(ynew | mean = 0, variance = xnew xnew^T)
        predictive_mean = 0.
        predictive_variance = tf.reduce_sum(tf.square(inputs), -1)
    return ed.Normal(loc=predictive_mean,
                     scale=tf.sqrt(predictive_variance))
def n_dimensional_weightmul(L, W, L_shape, Lout_shape, first_dim_of_l_is_batch=True):
    """ Batched contraction of L with W over all non-batch axes of L.

    For rank-1 L_shape and Lout_shape this is tf.matmul(L, W); for larger
    shapes an einsum equation is generated.
    L_shape and Lout_shape exclude the batch dimension (0).
    Raises NotImplementedError when first_dim_of_l_is_batch is false."""
    if not first_dim_of_l_is_batch:
        raise NotImplementedError

    in_rank = len(L_shape)
    out_rank = len(Lout_shape)
    if in_rank == 1 and out_rank == 1:
        return tf.matmul(L, W)

    # L    : ?xN1xN2xN3x...
    # Lout : ?xM1xM2xM3x...
    # W    : N1xN2x...xM1xM2x...
    # Einstein notation, with 'b' reserved for the batch dimension:
    #   Lout_blmn... = L_bijk... * W_ijk...lmn...
    pool = list('ijklmnopqrst')

    def take(count):
        # Consume the next `count` unused subscript letters from the pool.
        taken = []
        for _ in range(count):
            taken.append(pool.pop(0))
        return ''.join(taken)

    in_subs = take(in_rank)
    out_subs = take(out_rank)
    equation = 'b' + in_subs + ',' + in_subs + out_subs + '->' + 'b' + out_subs
    return tf.einsum(equation, L, W)
def _compute_covariances(self, emission_weights, emission_variances):
    """Compute all covariance matrices.

    Computes the covariance matrix for the latent variables, the observations,
    and the covariance between the latents and observations, and stores them
    as `self.sigma_z`, `self.sigma_x` and `self.sigma_zx`. Also builds
    `self.obs_dist`, a zero-mean multivariate normal with covariance `sigma_x`.

    Args:
      emission_weights: A Tensor of shape [num_timesteps] containing
        the emission distribution weights at each timestep.
      emission_variances: A Tensor of shape [num_timesteps] containing
        the emission distribution variances at each timestep.
    """
    steps = self.num_timesteps

    # Marginal variance of each latent, via the forward recursion
    #   var[t] = var[t-1] * w[t-1]^2 + transition_variance[t].
    latent_vars = [self.transition_variances.read(0)]
    for t in range(1, steps):
        carried = latent_vars[t-1] * tf.square(self.transition_weights.read(t-1))
        latent_vars.append(carried + self.transition_variances.read(t))

    def _latent_cov(i, j):
        # Covariance between latents i and j: the earlier latent's variance
        # times the product of the transition weights between them.
        if i == j:
            return latent_vars[i]
        lo, hi = (i, j) if i < j else (j, i)
        path_weight = tf.reduce_prod(
            self.transition_weights.gather(tf.range(lo, hi)))
        return latent_vars[lo] * path_weight

    # Latent covariance matrix, assembled row by row.
    self.sigma_z = tf.stack([
        tf.stack([_latent_cov(i, j) for j in range(steps)])
        for i in range(steps)
    ])

    # Observation covariance matrix.
    weight_outer = tf.einsum("i,j->ij", emission_weights, emission_weights)
    self.sigma_x = weight_outer * self.sigma_z + tf.diag(emission_variances)

    # Latent - observation covariance matrix.
    # The first axis indexes latents, the second axis indexes observations.
    self.sigma_zx = emission_weights[tf.newaxis, :] * self.sigma_z

    self.obs_dist = tfd.MultivariateNormalFullCovariance(
        loc=tf.zeros([steps], dtype=tf.float32),
        covariance_matrix=self.sigma_x)
def log_blend(inputs, weights):
    """Blends state in the log space.

    Args:
      inputs: A set of scalar states, one for each particle in each particle
        filter. Should be [num_samples, batch_size].
      weights: A set of weights used to blend the state. Each set of weights
        should be of dimension [num_samples] (one weight for each previous
        particle). There should be one set of weights for each new particle in
        each particle filter. Thus the shape should be
        [num_samples, batch_size, num_samples] where the first axis indexes
        new particle and the last axis indexes old particles.
    Returns:
      blended: The blended states, a tensor of shape [num_samples, batch_size].
    """
    column_max = tf.reduce_max(inputs, axis=0, keepdims=True)
    # Shift added back after the log: non-finite maxima are replaced by zero
    # and no gradient flows through it.
    finite_max = tf.where(
        tf.is_finite(column_max), column_max, tf.zeros_like(column_max))
    shift = tf.stop_gradient(finite_max)
    # NOTE: the exponent subtracts the raw maximum, not the sanitised `shift`.
    # This asymmetry is kept exactly as found.
    scaled = tf.exp(inputs - column_max)
    mixed = tf.einsum("ijk,kj->ij", weights, scaled)
    return tf.log(mixed) + shift
def run_test(self, axes):
    """Check tf.einsum against np.einsum for the equation `axes` on random inputs.

    Every subscript letter gets a random size in [4, 12); the maximum absolute
    difference between both results is printed and must be below 1e-8.
    """
    # One randint draw per alphabetic character, in order of appearance;
    # a repeated letter keeps the last size drawn for it.
    axis_sizes = {}
    for char in axes:
        if char.isalpha():
            axis_sizes[char] = np.random.randint(4, 12)

    operand_specs = axes.partition('->')[0].split(',')
    operand_values = [
        np.random.random([axis_sizes[letter] for letter in spec])
        for spec in operand_specs
    ]

    operand_tensors = [tf.constant(value) for value in operand_values]
    result_tensor = tf.einsum(axes, *operand_tensors)
    with self.test_session():
        actual = result_tensor.eval()

    expected = np.einsum(axes, *operand_values)
    err = np.abs(expected - actual).max()
    print(axes, err)
    assert err < 1e-8
def inference(x, q, n_batch, vocab_size=None, embedding_dim=None, story_maxlen=None, question_maxlen=None):
    """Build the answer-prediction graph for a story `x` and a question `q`.

    The story is embedded twice (tables A and C) and the question once
    (table B). The story/question match is softmax-normalised, added to the C
    embedding, transposed and concatenated with the question embedding. The
    result is read by an LSTM over `question_maxlen` steps. The last LSTM
    output is projected to the vocabulary and softmax-normalised.

    Returns the softmax over `vocab_size` entries computed from the final
    LSTM output.
    """
    def normal_variable(dims, stddev=0.08):
        # Variable initialised from a truncated normal distribution.
        return tf.Variable(tf.truncated_normal(dims, stddev=stddev))

    story_table = normal_variable([vocab_size, embedding_dim])
    question_table = normal_variable([vocab_size, embedding_dim])
    output_table = normal_variable([vocab_size, question_maxlen])

    story_memory = tf.nn.embedding_lookup(story_table, x)
    question_embedded = tf.nn.embedding_lookup(question_table, q)
    story_output = tf.nn.embedding_lookup(output_table, x)

    # Dot product between every story position and every question position.
    match = tf.nn.softmax(tf.einsum('ijk,ilk->ijl', story_memory, question_embedded))
    response = tf.add(match, story_output)
    response = tf.transpose(response, perm=[0, 2, 1])
    lstm_input = tf.concat([response, question_embedded], axis=-1)

    cell = tf.contrib.rnn.BasicLSTMCell(embedding_dim//2, forget_bias=1.0)
    state = cell.zero_state(n_batch, tf.float32)

    step_outputs = []
    with tf.variable_scope('LSTM'):
        for step in range(question_maxlen):
            # The cell's variables are created on the first step and shared afterwards.
            if step > 0:
                tf.get_variable_scope().reuse_variables()
            (step_output, state) = cell(lstm_input[:, step, :], state)
            step_outputs.append(step_output)

    final_output = step_outputs[-1]
    projection = normal_variable([embedding_dim//2, vocab_size], stddev=0.01)
    return tf.nn.softmax(tf.matmul(final_output, projection))
def _mean(self):
    """Per-step mean of the observations: the observation means of the hidden
    states, weighted by the marginal hidden-state probabilities.

    Returns a tensor reshaped to batch_shape + [num_steps] + observation_event_shape.
    """
    with tf.control_dependencies(self._runtime_assertions):
        batch_shape = self.batch_shape_tensor()
        event_shape = self._observation_distribution.event_shape_tensor()
        batch_size = tf.reduce_prod(batch_shape)
        event_size = tf.reduce_prod(event_shape)

        # hidden_probs :: num_steps batch_shape num_states
        hidden_probs = self._marginal_hidden_probs()
        # flat_probs :: num_steps batch_size num_states
        flat_probs = tf.reshape(
            hidden_probs, [self._num_steps, batch_size, self._num_states])

        # Broadcast the per-state means to
        #   batch_shape num_states observation_event_shape
        per_state_shape = tf.concat(
            [batch_shape, [self._num_states], event_shape], axis=0)
        state_means = tf.broadcast_to(
            self._observation_distribution.mean(), per_state_shape)
        # flat_means :: batch_size num_states observation_event_size
        flat_means = tf.reshape(
            state_means, [batch_size, self._num_states, event_size])

        # Probability-weighted sum over states.
        # flat_mean :: batch_size num_steps observation_event_size
        flat_mean = tf.einsum("ijk,jkl->jil", flat_probs, flat_means)

        # result :: batch_shape num_steps observation_event_shape
        result_shape = tf.concat(
            [batch_shape, [self._num_steps], event_shape], axis=0)
        return tf.reshape(flat_mean, result_shape)
def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs, params, tie_projs, initializer=None, proj_initializer=None, div_val=1, perms=None, proj_same_dim=True, scope='adaptive_softmax', **kwargs):
    """Adaptive-softmax negative log-likelihood, with clusters routed by `perms`.

    With no cutoffs this is a plain softmax cross-entropy averaged over all
    positions. Otherwise the vocabulary is split at `cutoffs` into a head
    (cluster 0, extended by one logit per tail cluster) and tail clusters.
    For every tail cluster, hidden states, targets and head losses are
    gathered by contracting with `perms[i]`, and the loss is the sum of head
    and tail cross-entropies.

    Args:
        hidden: Hidden states; rank 3 ("ibd") or rank 2 ("id") as handled by `_logit`.
        target: Integer targets.
        n_token: Vocabulary size.
        d_embed: Embedding size of cluster 0; cluster i uses d_embed // div_val**i.
        d_proj: Projection size.
        cutoffs: List of cluster boundaries; empty means no clustering.
        params: Pair (W, projs) of shared output weights and projections.
        tie_projs: Per-cluster flags selecting tied projections from `params`.
        initializer: Not used in this function.
        proj_initializer: Initializer for untied 'proj' variables.
        div_val: Embedding-size divisor between successive clusters.
        perms: Per-cluster tensors; `perms[0]` is multiplied elementwise with
            the head loss, `perms[i>0]` are contracted as "ibk".
        proj_same_dim: Affects whether an untied projection is created.
        scope: Variable scope name.
        **kwargs: `head_target` is read and used as labels for the head softmax.

    Returns:
        Scalar negative log-likelihood.
    """
    def _logit(x, W, b, proj):
        # Optional projection followed by the output layer; the einsum
        # equation is chosen by the rank of x.
        y = x
        if x.shape.ndims == 3:
            if proj is not None:
                y = tf.einsum('ibd,ed->ibe', y, proj)
            return tf.einsum('ibd,nd->ibn', y, W) + b
        else:
            if proj is not None:
                y = tf.einsum('id,ed->ie', y, proj)
            return tf.einsum('id,nd->in', y, W) + b

    params_W, params_projs = params[0], params[1]

    with tf.variable_scope(scope):
        if len(cutoffs) == 0:
            # No clustering: ordinary softmax over the full vocabulary.
            softmax_b = tf.get_variable('bias', [n_token],
                                        initializer=tf.zeros_initializer())
            output = _logit(hidden, params_W, softmax_b, params_projs)
            nll = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
                                                                 logits=output)
            nll = tf.reduce_mean(nll)
        else:
            total_loss, total_cnt = 0, 0
            cutoff_ends = [0] + cutoffs + [n_token]
            for i in range(len(cutoff_ends) - 1):
                with tf.variable_scope('cutoff_{}'.format(i)):
                    l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]

                    cur_d_embed = d_embed // (div_val ** i)

                    # With div_val == 1 all clusters share one weight matrix
                    # and take a slice of it; otherwise each has its own.
                    if div_val == 1:
                        cur_W = params_W[l_idx: r_idx]
                    else:
                        cur_W = params_W[i]
                    cur_b = tf.get_variable('b', [r_idx - l_idx],
                                            initializer=tf.zeros_initializer())
                    if tie_projs[i]:
                        if div_val == 1:
                            cur_proj = params_projs
                        else:
                            cur_proj = params_projs[i]
                    else:
                        if (div_val == 1 or not proj_same_dim) and d_proj == cur_d_embed:
                            cur_proj = None
                        else:
                            cur_proj = tf.get_variable('proj', [cur_d_embed, d_proj],
                                                       initializer=proj_initializer)

                    if i == 0:
                        # Head: cluster-0 tokens plus one extra logit per tail cluster.
                        cluster_W = tf.get_variable('cluster_W', [len(cutoffs), d_embed],
                                                    initializer=tf.zeros_initializer())
                        cluster_b = tf.get_variable('cluster_b', [len(cutoffs)],
                                                    initializer=tf.zeros_initializer())
                        cur_W = tf.concat([cur_W, cluster_W], 0)
                        cur_b = tf.concat([cur_b, cluster_b], 0)

                        head_logit = _logit(hidden, cur_W, cur_b, cur_proj)

                        head_target = kwargs.get("head_target")
                        # head_nll is reused by the tail branch in later iterations.
                        head_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
                            labels=head_target,
                            logits=head_logit)

                        masked_loss = head_nll * perms[i]
                        total_loss += tf.reduce_sum(masked_loss)
                        total_cnt += tf.reduce_sum(perms[i])

                        # head_logprob = tf.nn.log_softmax(head_logit)

                        # final_logprob = head_logprob * perms[i][:, :, None]
                        # final_target = tf.one_hot(target, tf.shape(head_logprob)[2])
                        # total_loss -= tf.einsum('ibn,ibn->', final_logprob, final_target)
                        # total_cnt += tf.reduce_sum(perms[i])
                    else:
                        # Tail: contract with perms[i] over the "ib" axes to
                        # gather head loss, hidden states and cluster-relative
                        # targets into "k"-indexed tensors.
                        cur_head_nll = tf.einsum('ib,ibk->k', head_nll, perms[i])

                        cur_hidden = tf.einsum('ibd,ibk->kd', hidden, perms[i])
                        tail_logit = _logit(cur_hidden, cur_W, cur_b, cur_proj)

                        tail_target = tf.einsum('ib,ibk->k', tf.to_float(target - l_idx),
                                                perms[i])
                        tail_nll = tf.nn.sparse_softmax_cross_entropy_with_logits(
                            labels=tf.to_int32(tail_target),
                            logits=tail_logit)

                        sum_nll = cur_head_nll + tail_nll
                        # mask[k] is the sum of perms[i] over the "ib" axes for slot k.
                        mask = tf.reduce_sum(perms[i], [0, 1])

                        masked_loss = sum_nll * mask
                        total_loss += tf.reduce_sum(masked_loss)
                        total_cnt += tf.reduce_sum(mask)

            nll = total_loss / total_cnt

    return nll
def outer(self, tensor_in_1, tensor_in_2):
    """Return the outer product of two rank-1 tensors.

    Boolean operands are cast to ``tf.float32`` before the product is
    taken; operands of any other dtype are passed through unchanged.

    Args:
        tensor_in_1: tensor indexed by ``i`` in the ``'i,j->ij'`` product.
        tensor_in_2: tensor indexed by ``j`` in the ``'i,j->ij'`` product.

    Returns:
        The tensor ``out[i, j] = tensor_in_1[i] * tensor_in_2[j]``.
    """
    if tensor_in_1.dtype == tf.bool:
        tensor_in_1 = tf.cast(tensor_in_1, tf.float32)
    # Each operand is cast independently.  The previous code assigned the
    # cast of ``tensor_in_2`` to ``tensor_in_1``, which discarded the first
    # operand and left the second one boolean.
    if tensor_in_2.dtype == tf.bool:
        tensor_in_2 = tf.cast(tensor_in_2, tf.float32)
    return tf.einsum('i,j->ij', tensor_in_1, tensor_in_2)
def insert_state(state, system, state_is_pure, mode=None, batched=False):
    """Append a new mode (at slot 'mode') to system and initialize it in 'state'.

    If 'mode' is not specified or is greater than the largest current mode
    number, the new mode is added to the end. If an integer within
    [0,...,N-1] is given for 'mode' (where N is the number of modes in
    'system'), the new state is put in the corresponding mode, and all
    following modes are shifted to the right by one.

    Args:
        state: the single-mode state to insert; rank ``1 + batch_offset``
            (ket) or ``2 + batch_offset`` (density matrix).
        system: the existing multi-mode system; a rank-0 object means the
            system has no modes yet.
        state_is_pure: whether ``system`` is stored as a pure state (one
            index per mode) or as a mixed state (two indices per mode).
        mode: slot for the new mode, or ``None`` to append.
        batched: whether the leading axis of the inputs is a batch axis.

    Returns:
        The combined system.  When ``system`` has no modes this is ``state``
        itself, or ``mixed(state)`` for a ket when ``state_is_pure`` is
        false.

    Raises:
        ValueError: if ``system`` has no modes and ``state`` has neither of
            the two supported ranks.
    """
    # pylint: disable=too-many-branches
    num_indices = len(system.shape)
    batch_offset = 1 if batched else 0
    if state_is_pure:
        num_modes = num_indices - batch_offset
    else:
        num_modes = (num_indices - batch_offset) // 2

    if mode is None or mode >= num_modes:
        mode = num_modes

    if len(system.shape) == 0:  # pylint: disable=len-as-condition
        # no modes in system
        state_dim = len(state.shape) - batch_offset
        if state_dim == 1:
            return state if state_is_pure else mixed(state)
        if state_dim == 2:
            return state
        # The accepted ranks *include* the batch axis, so the offset is
        # added (the previous message subtracted it and reported dim=0/1
        # for batched input).
        raise ValueError(
            "'state' must have dim={} or dim={}".format(1 + batch_offset, 2 + batch_offset)
        )

    # modes in system
    if len(state.shape) == batch_offset + 1 and state_is_pure:
        # everything is pure
        # basic form:
        # 'ab...ln...yz,m->ab...lmn...yz' ('m' indices belong to bra of mode being inserted)
        mode_size = 1
    else:
        # everything is mixed
        # basic form:
        # 'abcd...klop...wxyz,mn->abcd...klmnop...wxyz'
        # ('mn' indices belong to bra/ket of mode being inserted)
        mode_size = 2

    batch_index = indices[:batch_offset]
    left_part = indices[batch_offset : batch_offset + mode * mode_size]
    middle_part = indices[
        batch_offset + mode * mode_size : batch_offset + (mode + 1) * mode_size
    ]
    right_part = indices[
        batch_offset + (mode + 1) * mode_size : batch_offset + (num_modes + 1) * mode_size
    ]
    eqn_lhs = batch_index + left_part + right_part + "," + batch_index + middle_part
    eqn_rhs = batch_index + left_part + middle_part + right_part
    eqn = eqn_lhs + "->" + eqn_rhs
    revised_modes = tf.einsum(eqn, system, state)
    return revised_modes
def dam_model(input_x, input_x_mask, input_y, input_y_mask, word_emb, keep_rate, conf, x_len=None, y_len=None):
    """Build the matching graph between a multi-turn context and a response.

    The response and every context turn are embedded, optionally given a
    positional encoding, and passed through ``conf['stack_num']`` stacked
    ``layers.block`` self-attention layers (weights shared between response
    and turns via ``reuse=True``).  For each turn, cross-attention
    representations in both directions are computed at every stack level,
    stacked together with the self-attention representations, and turned
    into a similarity tensor.  The per-turn similarities are stacked and
    aggregated by ``layers.CNN_3d``.

    Args:
        input_x: context token ids; unstacked along axis 1 into turns.
        input_x_mask: not used by this function.
        input_y: response token ids.
        input_y_mask: not used by this function.
        word_emb: embedding table passed to ``tf.nn.embedding_lookup``.
        keep_rate: not used by this function.
        conf: dict read for the keys ``'is_positional'`` and ``'stack_num'``.
        x_len: per-turn lengths; unstacked along axis 1, so despite the
            ``None`` default a real tensor is required.
        y_len: response lengths forwarded to ``layers.block``.

    Returns:
        The output of ``layers.CNN_3d`` on the stacked similarity tensor.
    """
    # Response side: embedding, optional positional encoding, self-attention stack.
    Hr = tf.nn.embedding_lookup(word_emb, input_y)

    if conf['is_positional'] and conf['stack_num'] > 0:
        with tf.variable_scope('positional'):
            Hr = op.positional_encoding_vector(Hr, max_timescale=10)
    Hr_stack = [Hr]

    for index in range(conf['stack_num']):
        with tf.variable_scope('self_stack_cr_' + str(index)):
            Hr = layers.block(Hr, Hr, Hr, Q_lengths=y_len, K_lengths=y_len)
            Hr_stack.append(Hr)

    #context part
    #a list of length max_turn_num, every element is a tensor with shape [batch, max_turn_len]
    list_turn_t = tf.unstack(input_x, axis=1)
    list_turn_length = tf.unstack(x_len, axis=1)

    sim_turns = []
    #for every turn_t calculate matching vector
    for turn_t, t_turn_length in zip(list_turn_t, list_turn_length):
        Hu = tf.nn.embedding_lookup(word_emb, turn_t) #[batch, max_turn_len, emb_size]

        # Same scopes as the response side, opened with reuse=True so the
        # turn encoder shares the response encoder's variables.
        if conf['is_positional'] and conf['stack_num'] > 0:
            with tf.variable_scope('positional', reuse=True):
                Hu = op.positional_encoding_vector(Hu, max_timescale=10)
        Hu_stack = [Hu]

        for index in range(conf['stack_num']):
            with tf.variable_scope('self_stack_cr_' + str(index), reuse=True):
                Hu = layers.block(Hu, Hu, Hu, Q_lengths=t_turn_length, K_lengths=t_turn_length)
                Hu_stack.append(Hu)

        r_a_t_stack = []
        t_a_r_stack = []
        for index in range(conf['stack_num'] + 1):

            # The cross-attention variables are created on the first turn;
            # on later turns creation raises ValueError, after which the
            # scope is switched to reuse and the call is repeated.
            with tf.variable_scope('t_attend_r_cr_' + str(index)):
                try:
                    t_a_r = layers.block(Hu_stack[index], Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=y_len)
                except ValueError:
                    tf.get_variable_scope().reuse_variables()
                    t_a_r = layers.block(Hu_stack[index], Hr_stack[index], Hr_stack[index], Q_lengths=t_turn_length, K_lengths=y_len)

            with tf.variable_scope('r_attend_t_cr_' + str(index)):
                try:
                    r_a_t = layers.block(Hr_stack[index], Hu_stack[index], Hu_stack[index], Q_lengths=y_len, K_lengths=t_turn_length)
                except ValueError:
                    tf.get_variable_scope().reuse_variables()
                    r_a_t = layers.block(Hr_stack[index], Hu_stack[index], Hu_stack[index], Q_lengths=y_len, K_lengths=t_turn_length)

            t_a_r_stack.append(t_a_r)
            r_a_t_stack.append(r_a_t)

        # Cross-attention levels (stack_num + 1) followed by the
        # self-attention levels (stack_num + 1).
        t_a_r_stack.extend(Hu_stack)
        r_a_t_stack.extend(Hr_stack)

        t_a_r = tf.stack(t_a_r_stack, axis=-1)
        r_a_t = tf.stack(r_a_t_stack, axis=-1)

        #calculate similarity matrix
        with tf.variable_scope('similarity'):
            # sim shape [batch, max_turn_len, max_turn_len, 2*(stack_num+1)]
            # (the last axis holds the stacked levels built above)
            # divide sqrt(200) to prevent gradient explosion
            # NOTE(review): 200 is presumably the embedding size -- confirm
            # against the configuration before changing word_emb's width.
            sim = tf.einsum('biks,bjks->bijs', t_a_r, r_a_t) / tf.sqrt(200.0)

        sim_turns.append(sim)

    #cnn and aggregation
    sim = tf.stack(sim_turns, axis=1)
    print('sim shape: %s' % sim.shape)
    with tf.variable_scope('cnn_aggregation'):
        final_info = layers.CNN_3d(sim, 32, 16)
        #for douban
        #final_info = layers.CNN_3d(sim, 16, 16)

    return final_info
def two_mode_gate(matrix, mode1, mode2, in_modes, pure=True, batched=False):
    """Apply a two-mode gate to ``in_modes`` through a single einsum.

    basic form:
    'abcd,efg...b...d...xyz->efg...a...c...xyz' (pure state)
    'abcd,ij...be...dg...xyz,efgh->ij...af...ch...xyz' (mixed state)

    Args:
        matrix: the gate; carries a leading batch axis when ``batched``.
        mode1: first mode the gate acts on.
        mode2: second mode the gate acts on.
        in_modes: the state the gate is applied to.
        pure: whether ``in_modes`` has one index per mode (pure) or two
            (mixed).
        batched: whether the inputs carry a leading batch axis.

    Returns:
        The transformed state produced by ``tf.einsum``.

    Raises:
        ValueError: if ``in_modes`` has no modes, or the mode numbers are
            negative, out of range, or equal.
        NotImplementedError: if ``in_modes`` has more modes than there are
            index letters available.
    """
    # pylint: disable=too-many-branches,too-many-statements
    batch_offset = 1 if batched else 0
    batch_index = indices[:batch_offset]
    left_gate_str = indices[batch_offset : batch_offset + 4]  # |a><b| |c><d|
    num_indices = len(in_modes.shape)

    if pure:
        mode_size = 1
        num_modes = num_indices - batch_offset
    else:
        mode_size = 2
        right_gate_str = indices[batch_offset + 4 : batch_offset + 8]  # |e><f| |g><h|
        num_modes = (num_indices - batch_offset) // 2
    max_len = (len(indices) - 4) // mode_size - batch_offset

    if num_modes == 0:
        raise ValueError("'in_modes' must have at least one mode")
    if num_modes > max_len:
        raise NotImplementedError(
            "The max number of supported modes for this operation is currently {}".format(max_len)
        )

    min_mode, max_mode = min(mode1, mode2), max(mode1, mode2)
    if min_mode < 0 or max_mode >= num_modes or mode1 == mode2:
        raise ValueError("One or more mode numbers are incompatible")

    spectator_start = batch_offset + 4 * mode_size
    other_modes_indices = indices[
        spectator_start : spectator_start + mode_size * (num_modes - 2)
    ]

    # build equation
    # Positions inside the 4-letter gate string: the letter contracted with
    # the state ("in") and the letter left on the output ("out"), for the
    # lower- and the higher-numbered mode respectively.
    if mode1 == min_mode:
        min_in, min_out, max_in, max_out = 1, 0, 3, 2
    else:
        min_in, min_out, max_in, max_out = 3, 2, 1, 0

    lhs_min_mode_indices = left_gate_str[min_in]
    lhs_max_mode_indices = left_gate_str[max_in]
    rhs_min_mode_indices = left_gate_str[min_out]
    rhs_max_mode_indices = left_gate_str[max_out]

    if not pure:
        # For the right-hand gate the roles are mirrored: the letter at the
        # "out" position is contracted and the "in" position survives.
        lhs_min_mode_indices += right_gate_str[min_out]
        lhs_max_mode_indices += right_gate_str[max_out]
        rhs_min_mode_indices += right_gate_str[min_in]
        rhs_max_mode_indices += right_gate_str[max_in]

    # Spectator letters before, between and after the two acted-on modes.
    leading = other_modes_indices[: min_mode * mode_size]
    between = other_modes_indices[min_mode * mode_size : (max_mode - 1) * mode_size]
    trailing = other_modes_indices[(max_mode - 1) * mode_size :]

    eqn_lhs = (
        batch_index + left_gate_str + ","
        + batch_index + leading + lhs_min_mode_indices
        + between + lhs_max_mode_indices + trailing
    )
    if not pure:
        eqn_lhs += "," + batch_index + right_gate_str
    eqn_rhs = (
        batch_index + leading + rhs_min_mode_indices
        + between + rhs_max_mode_indices + trailing
    )
    eqn = eqn_lhs + "->" + eqn_rhs

    einsum_inputs = [matrix, in_modes]
    if not pure:
        # Third operand: the gate with each index pair swapped, conjugated.
        transpose_list = [0, 2, 1, 4, 3] if batched else [1, 0, 3, 2]
        einsum_inputs.append(tf.math.conj(tf.transpose(matrix, transpose_list)))
    return tf.einsum(eqn, *einsum_inputs)
def combine_single_modes(modes_list, batched=False):
    """Group together a list of single modes (each having dim=1 or dim=2)
    into a composite mode system.

    If every mode is a pure state (dim=1) the result is a pure state with
    one index per mode; otherwise the result is a mixed state with two
    indices per mode, and each pure mode contributes itself and its complex
    conjugate to the einsum.

    Args:
        modes_list: list of at least two single-mode tensors.
        batched: whether every mode carries a leading batch axis.

    Returns:
        The combined multi-mode tensor produced by ``tf.einsum``.

    Raises:
        ValueError: if fewer than two modes are given, or a mode has a dim
            other than 1 or 2 (not counting the batch axis).
        NotImplementedError: if there are more modes than index letters.
    """
    batch_offset = 1 if batched else 0
    num_modes = len(modes_list)
    if num_modes <= 1:
        raise ValueError("'modes_list' must have at least two modes")

    dims = np.array([len(mode.shape) - batch_offset for mode in modes_list])
    if min(dims) < 1 or max(dims) > 2:
        raise ValueError("Each mode in 'modes_list' can only have dim=1 or dim=2")

    if np.all(dims == 1):
        # All modes are represented as pure states.
        # Can return combined state also as pure state.
        # basic form:
        # 'a,b,c,...,x,y,z->abc...xyz'
        max_num = max_num_indices - batch_offset
        if num_modes > max_num:
            raise NotImplementedError(
                "The max number of supported modes for this operation with pure states is currently {}".format(
                    max_num
                )
            )
        batch_index = indices[:batch_offset]
        out_str = indices[batch_offset : batch_offset + num_modes]
        modes_str = ",".join([batch_index + idx for idx in out_str])
        eqn = "{}->{}".format(modes_str, batch_index + out_str)
        einsum_inputs = modes_list
    else:
        # Some modes are mixed.
        # Return combined state as mixed.
        # basic form:
        # e.g., if first mode is pure and second is mixed...
        # 'a,b,cd,...->abcd...'
        # where (a,b) will belong to the first mode (bra & ket)
        # and cd will belong to the second mode (density matrix)
        max_num = (max_num_indices - batch_offset) // 2
        batch_index = indices[:batch_offset]
        if num_modes > max_num:
            raise NotImplementedError(
                "The max number of supported modes for this operation with mixed states is currently {}".format(
                    max_num
                )
            )
        mode_idxs = [
            indices[slice(batch_offset + idx, batch_offset + idx + 2)]
            for idx in range(0, 2 * num_modes, 2)
        ]  # each mode gets a pair of consecutive indices
        eqn_rhs = batch_index + "".join(mode_idxs)
        # A pure mode enters twice (ket and conjugate), and each of those
        # operands has its own batch axis, so the batch letter must prefix
        # both terms.  Unbatched, ``batch_index`` is empty and this reduces
        # to ",".join(m).
        eqn_idxs = [
            batch_index + m
            if dims[idx] == 2
            else ",".join(batch_index + letter for letter in m)
            for idx, m in enumerate(mode_idxs)
        ]
        eqn_lhs = ",".join(eqn_idxs)
        eqn = eqn_lhs + "->" + eqn_rhs
        einsum_inputs = []
        for idx, mode in enumerate(modes_list):
            if dims[idx] == 1:
                new_inputs = [mode, tf.math.conj(mode)]
            elif dims[idx] == 2:
                new_inputs = [mode]
            einsum_inputs += new_inputs

    combined_modes = tf.einsum(eqn, *einsum_inputs)
    return combined_modes
def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,
                       n_head, d_head, dropout, dropatt, is_training,
                       kernel_initializer, scope='rel_attn'):
    """Multi-head attention with relative positional scores.

    Queries come from ``w``; keys and values come from ``w`` preceded by
    ``mems`` when a memory of rank > 1 is supplied.  The attention score is
    the sum of a content term (query + ``r_w_bias`` against keys) and a
    position term (query + ``r_r_bias`` against the projected ``r``, passed
    through ``rel_shift``), scaled by ``1 / sqrt(d_head)``.  Masked
    positions receive a score of ``-1e30`` before the softmax over the key
    axis.

    Returns:
        Layer-normalised ``attention_output + w``.
    """
    scale = 1 / (d_head**0.5)
    with tf.variable_scope(scope):
        qlen = tf.shape(w)[0]
        rlen = tf.shape(r)[0]
        bsz = tf.shape(w)[1]

        if mems is not None and mems.shape.ndims > 1:
            context = tf.concat([mems, w], 0)
        else:
            context = w

        qkv = tf.layers.dense(context, 3 * n_head * d_head, use_bias=False,
                              kernel_initializer=kernel_initializer, name='qkv')
        rel_key = tf.layers.dense(r, n_head * d_head, use_bias=False,
                                  kernel_initializer=kernel_initializer, name='r')

        query, key, value = tf.split(qkv, 3, -1)
        query = query[-qlen:]  # only the current segment issues queries
        klen = tf.shape(key)[0]

        query = tf.reshape(query, [qlen, bsz, n_head, d_head])
        key = tf.reshape(key, [klen, bsz, n_head, d_head])
        value = tf.reshape(value, [klen, bsz, n_head, d_head])
        rel_key = tf.reshape(rel_key, [rlen, n_head, d_head])

        # qlen, bsz, n_head, d_head , klen, bsz, n_head, d_head -> qlen,klen,bsz,n_head
        content_score = tf.einsum('ibnd,jbnd->ijbn', query + r_w_bias, key)
        position_score = rel_shift(
            tf.einsum('ibnd,jnd->ijbn', query + r_r_bias, rel_key))

        attn_score = (content_score + position_score) * scale
        mask = attn_mask[:, :, None, None]
        attn_score = attn_score * (1 - mask) - 1e30 * mask

        attn_prob = tf.nn.softmax(attn_score, 1)
        attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training)

        attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, value)
        vec_shape = tf.shape(attn_vec)
        attn_vec = tf.reshape(attn_vec,
                              [vec_shape[0], vec_shape[1], n_head * d_head])

        attn_out = tf.layers.dense(attn_vec, d_model, use_bias=False,
                                   kernel_initializer=kernel_initializer, name='o')
        attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)

        return tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis=-1)
def fidelity_coherent(self, alpha_list, **kwargs):
    r"""
    Compute the fidelity of the state with the coherent states specified by alpha_list.
    May be numerical or symbolic.

    Args:
        alpha_list (Sequence[complex]): list of coherence parameter values, one for each mode;
            an object without ``__len__`` is wrapped in a one-element list.
        **kwargs: forwarded unchanged to ``self._run``, which decides whether the result is
            evaluated numerically or returned as a Tensor.

    Returns:
        float/Tensor: the numerical value, or an unevaluated Tensor object, for the fidelity
        :math:`\bra{\vec{\alpha}}\rho\ket{\vec{\alpha}}`.

    Raises:
        ValueError: if the number of alpha values differs from the number of modes, or exceeds
            the number of modes supported by the available index letters.
    """

    def coherent_amplitudes(alpha, dim):
        """Fock-basis amplitudes of the coherent state ``alpha``, truncated at ``dim``."""
        prefactor_of = np.exp
        return [
            prefactor_of(-0.5 * np.abs(alpha)**2) * (alpha)**n / np.sqrt(factorial(n))
            for n in range(dim)
        ]

    with self.graph.as_default():
        if not hasattr(alpha_list, "__len__"):
            alpha_list = [alpha_list]

        if len(alpha_list) != self.num_modes:
            raise ValueError(
                "The number of alpha values must match the number of modes."
            )

        max_indices = (len(indices) - 1) // 2
        if len(alpha_list) > max_indices:
            raise ValueError(
                "Length of `alpha_list` exceeds supported number of modes."
            )

        state = self.data
        if not self.batched:
            state = tf.expand_dims(state, 0)  # introduce fake batch dimension

        # one amplitude vector per mode: [num_modes, cutoff_dim]
        multi_cohs_list = [
            coherent_amplitudes(alpha, self.cutoff_dim) for alpha in alpha_list
        ]
        mode_letters = indices[:self._modes]
        product_eqn = ",".join(mode_letters) + "->" + mode_letters
        # tensor product of specified coherent states
        multi_cohs_vec = np.einsum(product_eqn, *multi_cohs_list)
        # flattened tensor product; shape is: [1, cutoff_dim ** num_modes]
        flat_multi_cohs = np.reshape(
            multi_cohs_vec, [1, self.cutoff_dim**self.num_modes]
        )

        if self.is_pure:
            flat_state = tf.reshape(state, [-1, self.cutoff_dim**self.num_modes])
            overlap = tf.reduce_sum(flat_multi_cohs.conj() * flat_state, axis=1)
            fidelity = tf.abs(overlap)**2
        else:
            batch_index = indices[0]
            free_indices = indices[1:]
            bra_indices = free_indices[:self.num_modes]
            ket_indices = free_indices[self.num_modes:2 * self.num_modes]
            # density-matrix indices interleave bra and ket letters per mode
            rho_indices = "".join(
                bra + ket for bra, ket in zip(bra_indices, ket_indices)
            )
            contraction_eqn = (
                bra_indices + "," + batch_index + rho_indices + ","
                + ket_indices + "->" + batch_index
            )
            fidelity = tf.einsum(
                contraction_eqn,
                tf.convert_to_tensor(np.conj(multi_cohs_vec), dtype=def_type),
                state,
                tf.convert_to_tensor(multi_cohs_vec, def_type),
            )

        if not self.batched:
            fidelity = tf.squeeze(fidelity, 0)  # drop fake batch dimension

        fidelity = tf.identity(fidelity, name="fidelity_coherent")
        return self._run(fidelity, **kwargs)
def buildGamStep0(self, ListWeightUZ, ListBiasUZ, ListWeightGam, ListBiasGam, Gam0_initializer):
    """Build the graph that fits the time-0 variable ``Gam0``.

    A forward path, its antithetic path (same noise with opposite sign) and
    the un-moved initial point are simulated over ``len(ListWeightGam)``
    steps of size ``self.TStepGam``.  At each intermediate step the frozen
    U/Z and Gam networks are evaluated at all three points, a "driver" is
    formed from them, and the antithetic combination of the drivers is
    multiplied by a weight built from the accumulated noise and subtracted
    from ``GamTraj``.  The terminal contribution ``self.model.D2gTf`` is
    added at the end, and ``Gam0`` is trained by Adam to minimise the mean
    squared difference to ``GamTraj``.

    Args:
        ListWeightUZ, ListBiasUZ: stored weights/biases for the U/Z network,
            indexed from the end with stride ``self.nbStepGamStab``.
        ListWeightGam, ListBiasGam: stored weights/biases for the Gam
            network, indexed from the end.
        Gam0_initializer: initializer for the ``Gam0`` variable.

    Returns:
        dict with the keys ``"LRate"`` and ``"RandG"`` (placeholders),
        ``"Gam0"`` (variable), ``"Loss"`` and ``"train"`` (ops).
    """
    dic = {}
    dic["LRate"] = tf.compat.v1.placeholder(tf.float32, shape=[], name="learning_rate")
    # Gaussian increments: [sample, dimension, time step]
    dic["RandG"] = tf.compat.v1.placeholder(
        dtype=tf.float32, shape=[None, self.d, self.nbStepGam], name='randG')
    dic["Gam0"] = tf.compat.v1.get_variable("Gam0", [self.d, self.d], tf.float32, Gam0_initializer)
    sample_size = tf.shape(dic["RandG"])[0]
    sig = self.model.sigScal
    mu = self.model.muScal
    sqrtDt = np.sqrt(self.TStepGam)
    # Every sample starts from xInit; XPrev is never advanced below.
    XPrev = tf.tile(
        tf.expand_dims(tf.convert_to_tensor(self.xInit, dtype=tf.float32), axis=0),
        [sample_size, 1])
    XNext = XPrev
    XNextAnti = XNext
    GamTraj = tf.zeros([sample_size, self.d, self.d])
    WAccul = tf.zeros([sample_size, self.d])  # accumulated scaled noise
    TAccul = 0.  # accumulated time
    for i in range(len(ListWeightGam) - 1):
        iStepLoc = i + 1
        # NOTE(review): tLoc is assigned but never read in this function.
        tLoc = (i + 1) * self.TStepGam
        WAccul = WAccul + sqrtDt * dic["RandG"][:, :, i]
        TAccul = TAccul + self.TStepGam
        # Forward step and its antithetic counterpart (noise sign flipped).
        XNext = XNext + mu * self.TStepGam + sig * sqrtDt * dic[
            "RandG"][:, :, i]
        XNextAnti = XNextAnti + mu * self.TStepGam - sig * sqrtDt * dic[
            "RandG"][:, :, i]
        # Negative index: stored networks are addressed from the end.
        iPosBSDE = (-i - 1) * self.nbStepGamStab
        print("len( ListWeightGam)", len(ListWeightGam), " ListWeightUZ ",
              len(ListWeightUZ), " IPO", iPosBSDE)
        # Centre and scale the state before feeding the networks.
        normX = (XNext - self.xInit - mu * self.TStepGam * iStepLoc) / (
            sig * np.sqrt(self.TStepGam * iStepLoc))
        U, Z = self.networkUZ.createNetworkNotTrainable(
            normX, iStepLoc, ListWeightUZ[iPosBSDE], ListBiasUZ[iPosBSDE])
        Gam = self.networkGam.createNetworkNotTrainable(
            normX, iStepLoc, ListWeightGam[-i - 1], ListBiasGam[-i - 1])
        # driver = dt * (0.5 * sum_j sig_j^2 * Gam_jj - fDW(t, X, U, Z, Gam))
        driver = self.TStepGam * (0.5 * tf.einsum(
            'j,ij->i', tf.constant(sig * sig, dtype=tf.float32),
            tf.matrix_diag_part(Gam)) - self.model.fDW(
                iStepLoc * self.TStepGam, XNext, U, Z, Gam))
        # Same evaluation on the antithetic path; the second argument is
        # offset by nbStepUDU relative to the forward-path calls.
        normXAnti = (XNextAnti - self.xInit - mu * self.TStepGam *
                     iStepLoc) / (sig * np.sqrt(self.TStepGam * iStepLoc))
        UAnti, ZAnti = self.networkUZ.createNetworkNotTrainable(
            normXAnti, self.nbStepUDU + iStepLoc, ListWeightUZ[iPosBSDE],
            ListBiasUZ[iPosBSDE])
        GamAnti = self.networkGam.createNetworkNotTrainable(
            normXAnti, self.nbStepUDU + iStepLoc, ListWeightGam[-i - 1],
            ListBiasGam[-i - 1])
        driverAnti = self.TStepGam * (0.5 * tf.einsum(
            'j,ij->i', tf.constant(sig * sig, dtype=tf.float32),
            tf.matrix_diag_part(GamAnti)) - self.model.fDW(
                iStepLoc * self.TStepGam, XNextAnti, UAnti, ZAnti, GamAnti))
        # Same evaluation at the initial point (offset 2 * nbStepUDU).
        normXPrev = (XPrev - self.xInit - mu * self.TStepGam *
                     iStepLoc) / (sig * np.sqrt(self.TStepGam * iStepLoc))
        UPrev, ZPrev = self.networkUZ.createNetworkNotTrainable(
            normXPrev, 2 * self.nbStepUDU + iStepLoc, ListWeightUZ[iPosBSDE],
            ListBiasUZ[iPosBSDE])
        GamPrev = self.networkGam.createNetworkNotTrainable(
            normXPrev, 2 * self.nbStepUDU + iStepLoc, ListWeightGam[-i - 1],
            ListBiasGam[-i - 1])
        driverPrev = self.TStepGam * (0.5 * tf.einsum(
            'j,ij->i', tf.constant(sig * sig, dtype=tf.float32),
            tf.matrix_diag_part(GamPrev)) - self.model.fDW(
                iStepLoc * self.TStepGam, XPrev, UPrev, ZPrev, GamPrev))
        # weight[l, i, j] = (W_i * W_j - T) / (sig_i * sig_j * T^2)
        # NOTE(review): TAccul is subtracted from every (i, j) entry, not
        # only from the diagonal -- confirm this is intended for d > 1.
        weight = (tf.einsum(
            'lij,j->lij',
            tf.einsum('i,lij->lij', tf.constant(1 / sig, dtype=tf.float32),
                      tf.einsum("li,lj->lij", WAccul, WAccul) - TAccul),
            tf.constant(1 / sig, dtype=tf.float32))) / (TAccul * TAccul)
        # Antithetic combination of the three drivers, per sample.
        GamTraj = GamTraj - tf.einsum(
            "l,lij->lij", 0.5 * (driver + driverAnti - 2 * driverPrev),
            weight)
    # Last step to the terminal time, using the final noise slice.
    XNext = XNext + mu * self.TStepGam + sig * sqrtDt * dic[
        "RandG"][:, :, len(ListWeightGam) - 1]
    XNextAnti = XNextAnti + mu * self.TStepGam - sig * sqrtDt * dic[
        "RandG"][:, :, len(ListWeightGam) - 1]
    # Terminal contribution, averaged over the path and its antithetic twin.
    GamTraj = GamTraj + 0.5 * (self.model.D2gTf(XNext) +
                               self.model.D2gTf(XNextAnti))
    dic["Loss"] = tf.reduce_mean(tf.pow(dic["Gam0"] - GamTraj, 2))
    dic["train"] = tf.compat.v1.train.AdamOptimizer(
        learning_rate=dic["LRate"]).minimize(dic["Loss"])
    return dic
def sig(self, t, x):
    """Return ``self.sigScal`` repeated once per row of ``x``.

    ``t`` is not used.  The result has one row per entry along axis 0 of
    ``x`` and one column per entry of ``self.sigScal``.
    """
    sigma = tf.constant(self.sigScal, dtype=tf.float32)
    per_sample_ones = tf.ones(shape=tf.shape(x)[0], dtype=tf.float32)
    # out[i, j] = sigma[j] * 1
    return tf.einsum('j,i->ij', sigma, per_sample_ones)
def fDW(self, t, x, u, Du, D2u):
    """Evaluate ``-R/2 * |Du|^2 / D2u - mu . Du`` per sample.

    ``t``, ``x`` and ``u`` are not used.  ``D2u`` is reshaped to a vector
    with one entry per sample, so it must hold exactly one value per row.
    """
    # |Du|^2 for every sample
    grad_norm_sq = tf.einsum('ij,ij->i', Du, Du)
    inv_second_deriv = tf.reshape(
        tf.math.reciprocal(D2u), [tf.shape(D2u)[0]])
    quadratic_term = tf.einsum('i,i->i', grad_norm_sq, inv_second_deriv)
    drift_term = tf.einsum(
        'j,ij->i', tf.constant(self.muScal, dtype=tf.float32), Du)
    return -self.R / 2 * quadratic_term - drift_term
def __init__(
        self,
        num_symbols,
        num_embed_units,
        num_units,
        num_layers,
        num_labels,
        embed,
        learning_rate,
        max_gradient_norm=5.0,
        param_da=150,
        param_r=10,
):
    """Build a bidirectional-LSTM text classifier with self-attention.

    Tokens are mapped to ids through a string hash table, embedded, encoded
    by a multi-layer bidirectional LSTM, pooled by ``param_r`` attention
    hops and projected to ``num_labels`` logits.  The loss is the summed
    cross-entropy plus ``0.001 * ||A^T A - I||``.

    Args:
        num_symbols: vocabulary size (used when ``embed`` is ``None``).
        num_embed_units: embedding width (used when ``embed`` is ``None``).
        num_units: LSTM width per direction.
        num_layers: number of stacked LSTM layers per direction.
        num_labels: number of output classes.
        embed: pre-trained embedding matrix, or ``None`` for a fresh
            variable with the default initializer.
        learning_rate: initial learning rate, stored in a non-trainable
            variable.
        max_gradient_norm: global-norm clipping threshold.
        param_da: hidden width of the attention MLP.
        param_r: number of attention hops.
    """
    self.texts = tf.placeholder(tf.string, (None, None), "texts")  # shape: [batch, length]
    self.texts_length = tf.placeholder(tf.int32, None, "texts_length")  # shape: [batch]
    self.labels = tf.placeholder(tf.int32, None, "labels")  # shape: [batch]

    # vocab table (string to index)
    self.symbol2index = MutableHashTable(
        key_dtype=tf.string,
        value_dtype=tf.int64,
        default_value=UNK_ID,
        shared_name="in_table",
        name="in_table",
        checkpoint=True,
    )

    batch_size = tf.shape(self.texts)[0]

    # training state
    self.learning_rate = tf.Variable(float(learning_rate), trainable=False, dtype=tf.float32)
    self.global_step = tf.Variable(0, trainable=False)

    self.index_input = self.symbol2index.lookup(self.texts)  # shape: [batch, length]

    # embedding table (index to vector)
    if embed is None:
        # initialize the embedding randomly
        self.embed = tf.get_variable("embed", [num_symbols, num_embed_units], tf.float32)
    else:
        # initialize the embedding by pre-trained word vectors
        self.embed = tf.get_variable("embed", dtype=tf.float32, initializer=embed)

    self.embed_input = tf.nn.embedding_lookup(
        self.embed, self.index_input)  # shape: [batch, length, num_embed_units]

    def LSTM():
        return BasicLSTMCell(num_units)

    # Each direction gets its own cell objects.  Previously one list of
    # cells was wrapped twice, so both directions were driven by the same
    # cell instances.
    cell_fw = MultiRNNCell([LSTM() for _ in range(num_layers)])
    cell_bw = MultiRNNCell([LSTM() for _ in range(num_layers)])

    outputs, states = tf.nn.bidirectional_dynamic_rnn(cell_fw,
                                                      cell_bw,
                                                      self.embed_input,
                                                      self.texts_length,
                                                      dtype=tf.float32,
                                                      scope="rnn")
    H = tf.concat(outputs, 2)  # shape: (batch, length, 2*num_units)

    with tf.variable_scope("logits"):
        Ws1 = tf.get_variable("Ws1", [2 * num_units, param_da])
        Ws2 = tf.get_variable("Ws2", [param_da, param_r])
        temp = tf.tanh(tf.einsum("aij,jr->air", H, Ws1))  # (batch, length, param_da)
        # Attention weights, shape (batch, length, param_r).  The softmax
        # must normalise over the sequence axis (1): with the default last
        # axis each position's weights sum to 1 across hops, and M below
        # collapses to a plain sum of H regardless of Ws1 / Ws2.
        # NOTE(review): positions beyond texts_length are not masked out
        # of the softmax -- consider masking if padding is significant.
        A = tf.nn.softmax(tf.einsum("aij,jr->air", temp, Ws2), 1)
        # (batch, param_r, 2*num_units) summed over hops -> (batch, 2*num_units)
        M = tf.reduce_sum(tf.einsum("aij,aik->ajk", A, H), axis=1)
        logits = tf.layers.dense(
            M, num_labels, activation=None,
            name="projection")  # shape: (batch, num_labels)

    # penalisation term: ||A^T A - I|| over the whole batch
    identity = tf.reshape(
        tf.tile(tf.diag(tf.ones([param_r])), [batch_size, 1]),
        [batch_size, param_r, param_r])
    temp = tf.matmul(A, A, transpose_a=True)  # (batch, param_r, param_r)
    self.penalized_term = tf.norm(temp - identity)

    self.loss = (
        tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.labels, logits=logits),
                      name="loss") + 0.001 * self.penalized_term)
    predict_labels = tf.argmax(logits, 1, "predict_labels")
    # number of correct predictions in the batch (a count, not a ratio)
    self.accuracy = tf.reduce_sum(tf.cast(
        tf.equal(self.labels, tf.cast(predict_labels, tf.int32)), tf.int32),
                                  name="accuracy")

    self.params = tf.trainable_variables()

    # gradients, clipped by global norm, applied with momentum
    opt = tf.train.MomentumOptimizer(self.learning_rate, 0.9)
    gradients = tf.gradients(self.loss, self.params)
    clipped_gradients, self.gradient_norm = tf.clip_by_global_norm(
        gradients, max_gradient_norm)
    self.update = opt.apply_gradients(zip(clipped_gradients, self.params),
                                      global_step=self.global_step)

    self.saver = tf.train.Saver(write_version=tf.train.SaverDef.V2,
                                max_to_keep=5,
                                pad_step_number=True)
def call(self, inputs, **kwargs):
    """Implements call() for the layer.

    ``inputs`` is unpacked into the sequence output, the position mask,
    the CLS index and the ground-truth start positions.  In training mode
    (``kwargs['training']`` truthy) end logits are conditioned on the
    ground-truth start; otherwise they are conditioned on each of the
    ``self.start_n_top`` most likely starts.

    Returns:
        Training: ``(start_log_probs, end_log_probs, cls_logits)``.
        Otherwise: ``(start_top_log_probs, start_top_index,
        end_top_log_probs, end_top_index, cls_logits)``.
    """
    training = kwargs.get("training", False)

    def _mask_out(logits, mask):
        """Push masked positions to a very large negative logit."""
        return logits * (1 - mask) - 1e30 * mask

    unpacked = tf_utils.unpack_inputs(inputs)
    sequence_output, p_mask, cls_index, start_positions = (
        unpacked[0], unpacked[1], unpacked[2], unpacked[3])

    _, seq_len, _ = sequence_output.shape.as_list()
    # work in [seq_len, batch, hidden] layout from here on
    sequence_output = tf.transpose(sequence_output, [1, 0, 2])

    start_logits = self.start_logits_proj_layer(sequence_output)
    start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
    start_logits_masked = _mask_out(start_logits, p_mask)
    start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

    if training:
        # during training, compute the end logits based on the
        # ground truth of the start position
        gold_start = tf.reshape(start_positions, [-1])
        start_one_hot = tf.one_hot(gold_start, depth=seq_len, axis=-1,
                                   dtype=tf.float32)
        start_features = tf.einsum('lbh,bl->bh', sequence_output, start_one_hot)
        start_features = tf.tile(start_features[None], [seq_len, 1, 1])

        end_hidden = self.end_logits_proj_layer0(
            tf.concat([sequence_output, start_features], axis=-1))
        end_hidden = self.end_logits_layer_norm(end_hidden)
        end_logits = self.end_logits_proj_layer1(end_hidden)
        end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
        end_log_probs = tf.nn.log_softmax(_mask_out(end_logits, p_mask), -1)
    else:
        start_top_log_probs, start_top_index = tf.nn.top_k(
            start_log_probs, k=self.start_n_top)
        start_one_hot = tf.one_hot(start_top_index, depth=seq_len, axis=-1,
                                   dtype=tf.float32)
        start_features = tf.einsum('lbh,bkl->bkh', sequence_output, start_one_hot)

        # pair every position with every candidate start
        end_input = tf.tile(sequence_output[:, :, None],
                            [1, 1, self.start_n_top, 1])
        start_features = tf.tile(start_features[None], [seq_len, 1, 1, 1])
        end_input = tf.concat([end_input, start_features], axis=-1)

        end_hidden = self.end_logits_proj_layer0(end_input)
        # layer norm is applied on a rank-3 view, then the candidate axis
        # is restored
        end_hidden = tf.reshape(end_hidden, [seq_len, -1, self.hidden_size])
        end_hidden = self.end_logits_layer_norm(end_hidden)
        end_hidden = tf.reshape(
            end_hidden, [seq_len, -1, self.start_n_top, self.hidden_size])

        end_logits = self.end_logits_proj_layer1(end_hidden)
        end_logits = tf.reshape(end_logits, [seq_len, -1, self.start_n_top])
        end_logits = tf.transpose(end_logits, [1, 2, 0])
        end_logits_masked = end_logits * (
            1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
        end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)

        end_top_log_probs, end_top_index = tf.nn.top_k(end_log_probs,
                                                       k=self.end_n_top)
        flat_top = [-1, self.start_n_top * self.end_n_top]
        end_top_log_probs = tf.reshape(end_top_log_probs, flat_top)
        end_top_index = tf.reshape(end_top_index, flat_top)

    # an additional layer to predict answerability
    # get the representation of CLS
    cls_one_hot = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
    cls_feature = tf.einsum('lbh,bl->bh', sequence_output, cls_one_hot)

    # get the representation of START
    start_p = tf.nn.softmax(start_logits_masked, axis=-1, name='softmax_start')
    start_feature = tf.einsum('lbh,bl->bh', sequence_output, start_p)

    ans_feature = tf.concat([start_feature, cls_feature], -1)
    ans_feature = self.answer_class_proj_layer0(ans_feature)
    ans_feature = self.ans_feature_dropout(ans_feature, training=training)
    cls_logits = tf.squeeze(self.answer_class_proj_layer1(ans_feature), -1)

    if training:
        return (start_log_probs, end_log_probs, cls_logits)
    return (start_top_log_probs, start_top_index, end_top_log_probs,
            end_top_index, cls_logits)
def build_model(self):
    """Create placeholders, sampled embeddings, losses and the train op.

    The model takes in:
        walk: key, label, neg
        node: key, label, neg
    Walks are dealt with directly while nodes will need sampling.

    The link loss is the negative mean of
    ``log(sigmoid(key . label)) + sum(log(1 - sigmoid(key . neg)))`` on
    L2-normalised outputs; the walk loss is scaled by
    ``self.walk_loss_lambda`` and added to it.
    """
    self.neighs_and_types = self.Dataset.types_and_nodes
    print(np.max(self.neighs_and_types[:, :, 0]))
    print(np.max(self.neighs_and_types[:, :, 1]))

    def int_vector():
        """A 1-D int32 placeholder of unknown length."""
        return tf.placeholder(tf.int32, [None])

    # node inputs
    self.batch_keys = int_vector()
    self.batch_labels = int_vector()
    self.batch_negs = int_vector()
    self.batch_input = int_vector()
    self.input_size = tf.placeholder(tf.int32)

    # walk inputs
    self.key_walks = int_vector()
    self.label_walks = int_vector()
    self.neg_walks = int_vector()

    # neighbourhood sampling
    n_neigh = self.num_neighbor
    self.nodes_keys, self.paths_keys = self.sample(
        self.batch_keys, n_neigh, self.batch_size)
    self.nodes_labels, self.paths_labels = self.sample(
        self.batch_labels, n_neigh, self.batch_size)
    self.nodes_negs, self.paths_negs = self.sample(
        self.batch_negs, n_neigh, self.neg_size)
    self.nodes_inputs, self.paths_inputs = self.sample(
        self.batch_input, n_neigh, self.input_size)

    self.walk_embeddings = tf.get_variable(
        "walk_embeddings",
        [self.num_anonym_walk_types, self.walk_dim],
        tf.float64,
        initializer=tf.contrib.layers.xavier_initializer())
    self.walk_loss = self.compute_walk_loss()

    # aggregation
    self.output_keys = self.aggregate(
        self.nodes_keys, self.paths_keys, self.batch_size)
    self.output_labels = self.aggregate(
        self.nodes_labels, self.paths_labels, self.batch_size)
    self.output_negs = self.aggregate(
        self.nodes_negs, self.paths_negs, self.neg_size)
    self.output = self.aggregate(
        self.nodes_inputs, self.paths_inputs, self.input_size)

    # unit-length embeddings
    self.output_keys = tf.nn.l2_normalize(self.output_keys, 1)
    self.output_labels = tf.nn.l2_normalize(self.output_labels, 1)
    self.output_negs = tf.nn.l2_normalize(self.output_negs, 1)
    self.output = tf.nn.l2_normalize(self.output, 1)

    # affinities: key/label pairwise, key against every negative
    pos_aff = tf.reduce_sum(
        tf.multiply(self.output_keys, self.output_labels), axis=1)
    neg_aff = tf.einsum("ij,kj->ik", self.output_keys, self.output_negs)

    pos_term = tf.log(tf.sigmoid(pos_aff) + 1e-6)
    neg_term = tf.reduce_sum(tf.log(1 - tf.sigmoid(neg_aff) + 1e-6), axis=1)
    self.likelihood = pos_term + neg_term
    self.link_loss = -tf.reduce_mean(self.likelihood)

    self.walk_loss = self.walk_loss * self.walk_loss_lambda
    self.loss = self.link_loss + self.walk_loss

    optimizer_name = self.optimizer
    if optimizer_name == "Adam":
        self.optim = tf.train.AdamOptimizer(self.learning_rate)
    elif optimizer_name == "SGD":
        self.optim = tf.train.GradientDescentOptimizer(self.learning_rate)
    elif optimizer_name == "Momentum":
        self.optim = tf.train.MomentumOptimizer(
            learning_rate=self.learning_rate, momentum=0.9)

    # No gradient clipping is applied.
    self.opt_op = self.optim.minimize(self.loss)
def _logit(x, W, b, proj):
    """Score rank-3 ``x`` against ``W`` with bias ``b``.

    When ``proj`` is given, ``x`` is first projected with it along its last
    axis.
    """
    projected = x if proj is None else tf.einsum('ibd,ed->ibe', x, proj)
    return tf.einsum('ibd,nd->ibn', projected, W) + b
def model_fn(features, labels, mode, params):
    """Build the model function f(x) for a ``tf.estimator.Estimator``.

    The prediction is ``sigmoid(global_bias + linear_part + deep_part)``.
    The deep part is an MLP whose input depends on ``FLAGS.model_type``:
    ``'FNN'`` (flattened embeddings), ``'Inner'`` (plus pairwise inner
    products) or ``'Outer'`` (plus pairwise outer products; marked as not
    ready by the original author).

    Args:
        features: dict with ``'feat_ids'`` and ``'feat_vals'``.
        labels: target tensor; not touched in PREDICT mode.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: dict with ``field_size``, ``feature_size``,
            ``embedding_size``, ``l2_reg``, ``learning_rate``, and the
            comma-separated strings ``deep_layers`` and ``dropout``.

    Returns:
        ``tf.estimator.EstimatorSpec`` for the requested mode.

    Raises:
        ValueError: if ``FLAGS.model_type`` is unknown, or (in TRAIN mode)
            ``FLAGS.optimizer`` is unknown.
    """
    #------hyperparameters----
    field_size = params["field_size"]
    feature_size = params["feature_size"]
    embedding_size = params["embedding_size"]
    l2_reg = params["l2_reg"]
    learning_rate = params["learning_rate"]
    # Materialised as lists: both are indexed and measured with len() below,
    # which a Python 3 ``map`` object does not support.
    layers = [int(width) for width in params["deep_layers"].split(',')]
    dropout = [float(keep) for keep in params["dropout"].split(',')]
    # Integer division: the value is used as a reshape dimension.
    num_pairs = field_size * (field_size - 1) // 2

    #------build weights------
    Global_Bias = tf.get_variable(name='bias', shape=[1], initializer=tf.constant_initializer(0.0))
    Feat_Bias = tf.get_variable(name='linear', shape=[feature_size], initializer=tf.glorot_normal_initializer())
    Feat_Emb = tf.get_variable(name='emb', shape=[feature_size, embedding_size], initializer=tf.glorot_normal_initializer())

    #------build features-------
    feat_ids = features['feat_ids']  # None * F * 1
    feat_ids = tf.reshape(feat_ids, shape=[-1, field_size])
    feat_vals = features['feat_vals']  # None * F * 1
    feat_vals = tf.reshape(feat_vals, shape=[-1, field_size])

    #------build f(x)------
    with tf.variable_scope("Linear-part"):
        feat_wgts = tf.nn.embedding_lookup(Feat_Bias, feat_ids)  # None * F * 1
        y_linear = tf.reduce_sum(tf.multiply(feat_wgts, feat_vals), 1)

    with tf.variable_scope("Embedding-layer"):
        embeddings = tf.nn.embedding_lookup(Feat_Emb, feat_ids)  # None * F * K
        feat_vals = tf.reshape(feat_vals, shape=[-1, field_size, 1])
        embeddings = tf.multiply(embeddings, feat_vals)  # None * F * K

    with tf.variable_scope("Product-layer"):
        if FLAGS.model_type == 'FNN':
            deep_inputs = tf.reshape(embeddings, shape=[-1, field_size*embedding_size])
        elif FLAGS.model_type == 'Inner':
            # all unordered field pairs (i < j)
            row = [i for i in range(field_size - 1) for _ in range(i + 1, field_size)]
            col = [j for i in range(field_size - 1) for j in range(i + 1, field_size)]
            p = tf.gather(embeddings, row, axis=1)
            q = tf.gather(embeddings, col, axis=1)
            inner = tf.reshape(tf.reduce_sum(p * q, [-1]), [-1, num_pairs])  # None * (F*(F-1)/2)
            deep_inputs = tf.concat([tf.reshape(embeddings, shape=[-1, field_size*embedding_size]), inner], 1)  # None * ( F*K+F*(F-1)/2 )
        elif FLAGS.model_type == 'Outer':  # ERROR: NOT ready yet
            row = [i for i in range(field_size - 1) for _ in range(i + 1, field_size)]
            col = [j for i in range(field_size - 1) for j in range(i + 1, field_size)]
            p = tf.gather(embeddings, row, axis=1)
            q = tf.gather(embeddings, col, axis=1)
            # per-pair outer product: outer[a, p, i, j] = p[a, p, i] * q[a, p, j]
            outer = tf.reshape(tf.einsum('api,apj->apij', p, q), [-1, num_pairs*embedding_size*embedding_size])  # None * (F*(F-1)/2*K*K)
            deep_inputs = tf.concat([tf.reshape(embeddings, shape=[-1, field_size*embedding_size]), outer], 1)  # None * ( F*K+F*(F-1)/2*K*K )
        else:
            raise ValueError(
                "Unsupported model_type: {!r} (expected 'FNN', 'Inner' or 'Outer')".format(FLAGS.model_type))

    with tf.variable_scope("Deep-part"):
        train_phase = mode == tf.estimator.ModeKeys.TRAIN
        for i in range(len(layers)):
            deep_inputs = tf.contrib.layers.fully_connected(inputs=deep_inputs, num_outputs=layers[i],
                weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg), scope='mlp%d' % i)

            if FLAGS.batch_norm:
                # Batch norm is placed after the ReLU; see
                # https://github.com/ducha-aiki/caffenet-benchmark/blob/master/batchnorm.md#bn----before-or-after-relu
                deep_inputs = batch_norm_layer(deep_inputs, train_phase=train_phase, scope_bn='bn_%d' % i)
            if mode == tf.estimator.ModeKeys.TRAIN:
                # Apply dropout after all BN layers; dropout[i] is a keep probability.
                deep_inputs = tf.nn.dropout(deep_inputs, keep_prob=dropout[i])

        y_deep = tf.contrib.layers.fully_connected(inputs=deep_inputs, num_outputs=1, activation_fn=tf.identity,
            weights_regularizer=tf.contrib.layers.l2_regularizer(l2_reg), scope='deep_out')
        y_d = tf.reshape(y_deep, shape=[-1])

    with tf.variable_scope("PNN-out"):
        # Warning: ``labels`` must not be used to shape the bias here,
        # otherwise predict/export fail (train/evaluate work); presumably
        # the Estimator does not pass labels when they are not needed.
        y_bias = Global_Bias * tf.ones_like(y_d, dtype=tf.float32)  # None * 1
        y = y_bias + y_linear + y_d
        pred = tf.sigmoid(y)

    predictions = {"prob": pred}
    export_outputs = {tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: tf.estimator.export.PredictOutput(predictions)}
    # Provide an estimator spec for `ModeKeys.PREDICT`
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                export_outputs=export_outputs)

    #------build loss------
    loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y, labels=labels)) + \
        l2_reg * tf.nn.l2_loss(Feat_Bias) + l2_reg * tf.nn.l2_loss(Feat_Emb)

    # Provide an estimator spec for `ModeKeys.EVAL`
    eval_metric_ops = {
        "auc": tf.metrics.auc(labels, pred)
    }
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                loss=loss,
                eval_metric_ops=eval_metric_ops)

    #------build optimizer------
    if FLAGS.optimizer == 'Adam':
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-8)
    elif FLAGS.optimizer == 'Adagrad':
        optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate, initial_accumulator_value=1e-8)
    elif FLAGS.optimizer == 'Momentum':
        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=0.95)
    elif FLAGS.optimizer == 'ftrl':
        optimizer = tf.train.FtrlOptimizer(learning_rate)
    else:
        raise ValueError(
            "Unsupported optimizer: {!r} (expected 'Adam', 'Adagrad', 'Momentum' or 'ftrl')".format(FLAGS.optimizer))

    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    # Provide an estimator spec for `ModeKeys.TRAIN` modes
    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(
                mode=mode,
                predictions=predictions,
                loss=loss,
                train_op=train_op)
def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
                                  proj_initializer, div_val=1, perms=None,
                                  proj_same_dim=True,
                                  scope='adaptive_embed'):
    """Embed the token ids `x` with an adaptive (per-bin) embedding table.

    With `div_val == 1` a single `[n_token, d_embed]` table is used, followed
    by a projection to `d_proj` when the two widths differ. Otherwise the
    vocabulary is split at `cutoffs` into bins; bin `i` owns a table of width
    `d_embed // div_val ** i` and, unless skipped, a projection to `d_proj`.

    Args:
      x: integer tensor of token ids. The bin-based path reads `tf.shape(x)[0]`
        and `[1]`, so it is treated as rank 2 there (presumably [len, batch]
        -- confirm against callers).
      n_token: vocabulary size.
      d_embed: embedding width of the single table / of the first bin.
      d_proj: width of the returned embeddings.
      cutoffs: sequence of bin boundaries lying between 0 and `n_token`
        (both excluded). Any sequence type is accepted.
      initializer: initializer for the lookup tables.
      proj_initializer: initializer for the projection matrices.
      div_val: per-bin divisor applied to the embedding width.
      perms: If None, first compute W = W1 x W2 (projection for each bin),
        and then compute X x W (embedding lookup). If not None, use bin-based
        embedding lookup with max_bin_size defined by the shape of perms.
      proj_same_dim: when False, a bin whose width already equals `d_proj`
        gets no projection matrix (its entry in `projs` is None).
      scope: name of the enclosing variable scope.

    Returns:
      A pair `(y, ret_params)`. `y` is the embedding multiplied by
      `d_proj ** 0.5`. `ret_params` is `[lookup_table, proj_W]` when
      `div_val == 1` and `[tables, projs]` otherwise.
    """
    emb_scale = d_proj ** 0.5
    with tf.variable_scope(scope):
        if div_val == 1:
            lookup_table = tf.get_variable('lookup_table', [n_token, d_embed],
                                           initializer=initializer)
            y = embedding_lookup(lookup_table, x)
            if d_proj != d_embed:
                proj_W = tf.get_variable('proj_W', [d_embed, d_proj],
                                         initializer=proj_initializer)
                y = tf.einsum('ibe,ed->ibd', y, proj_W)
            else:
                proj_W = None
            ret_params = [lookup_table, proj_W]
        else:
            tables, projs = [], []
            # list() lets callers pass the cutoffs as a tuple as well.
            cutoff_ends = [0] + list(cutoffs) + [n_token]
            x_size = tf.shape(x)
            if perms is None:
                cat_lookup = []
            else:
                cat_lookup = tf.zeros([x_size[0], x_size[1], d_proj])
            for i in range(len(cutoff_ends) - 1):
                with tf.variable_scope('cutoff_{}'.format(i)):
                    l_idx, r_idx = cutoff_ends[i], cutoff_ends[i + 1]
                    cur_d_embed = d_embed // (div_val ** i)
                    lookup_table = tf.get_variable('lookup_table',
                                                   [r_idx - l_idx, cur_d_embed],
                                                   initializer=initializer)
                    if cur_d_embed == d_proj and not proj_same_dim:
                        proj_W = None
                    else:
                        proj_W = tf.get_variable('proj_W', [cur_d_embed, d_proj],
                                                 initializer=proj_initializer)
                    if perms is None:
                        if proj_W is None:
                            # No projection for this bin: its table is already
                            # d_proj wide, so it can be concatenated as is.
                            # (Previously this passed None to tf.einsum.)
                            cat_lookup.append(lookup_table)
                        else:
                            cat_lookup.append(
                                tf.einsum('ie,ed->id', lookup_table, proj_W))
                    else:
                        # speed up the computation of the first bin
                        # also save some memory
                        if i == 0:
                            cur_y = embedding_lookup(lookup_table,
                                                     tf.minimum(x, r_idx - 1))
                            if proj_W is not None:
                                cur_y = tf.einsum('ibe,ed->ibd', cur_y, proj_W)
                            cur_y *= perms[i][:, :, None]
                            cat_lookup += cur_y
                        else:
                            cur_x = tf.einsum('ib,ibk->k',
                                              tf.to_float(x - l_idx), perms[i])
                            cur_x = tf.to_int32(cur_x)
                            cur_y = embedding_lookup(lookup_table, cur_x)
                            if proj_W is not None:
                                cur_y = tf.einsum('ke,ed->kd', cur_y, proj_W)
                            cat_lookup += tf.einsum('kd,ibk->ibd', cur_y, perms[i])
                    tables.append(lookup_table)
                    projs.append(proj_W)
            if perms is None:
                cat_lookup = tf.concat(cat_lookup, 0)
                y = embedding_lookup(cat_lookup, x)
            else:
                y = cat_lookup
            ret_params = [tables, projs]

    y *= emb_scale
    return y, ret_params
def grad(dy):
    """Return the custom gradients for the upstream gradient `dy`.

    Reads `psp`, `BA_out` and `FLAGS` from the enclosing scope. The result is
    the list `[dloss_dpsp, dloss_dw_out, dloss_dba_out]`; the last entry is
    all zeros unless `FLAGS.eprop == 'adaptive'`.
    """
    w_out_grad = tf.einsum('btj,btk->jk', psp, dy)

    if FLAGS.eprop == 'adaptive':
        ba_out_grad = tf.einsum('btj,btk->jk', psp, dy)
    else:
        ba_out_grad = tf.zeros_like(BA_out)

    # The gradient w.r.t. psp is routed through BA_out.
    psp_grad = tf.einsum('bik,jk->bij', dy, BA_out)
    return [psp_grad, w_out_grad, ba_out_grad]
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

    This is an implementation of multi-headed attention based on "Attention
    is all you Need". If `from_tensor` and `to_tensor` are the same, then
    this is self-attention. Each timestep in `from_tensor` attends to the
    corresponding sequence in `to_tensor`, and returns a fixed-width vector.

    This function first projects `from_tensor` into a "query" tensor and
    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
    of tensors of length `num_attention_heads`, where each tensor is of shape
    [batch_size, seq_length, size_per_head].

    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities and returned per head.

    In practice, the multi-headed attention is done with tf.einsum as follows:
      Input_tensor: [BFD]
      Wq, Wk, Wv: [DNH]
      Q:[BFNH] = einsum('BFD,DNH->BFNH', Input_tensor, Wq)
      K:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wk)
      V:[BTNH] = einsum('BTD,DNH->BTNH', Input_tensor, Wv)
      attention_scores:[BNFT] = einsum('BFNH,BTNH->BNFT', Q, K) / sqrt(H)
      attention_probs:[BNFT] = softmax(attention_scores)
      context_layer:[BFNH] = einsum('BNFT,BTNH->BFNH', attention_probs, V)
      Wout:[DNH]
      Output:[BFD] = einsum('BFNH,DNH->BFD', context_layer, Wout)
    (The output projection with Wout is not applied by this function.)

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length]. The values should be 1 or 0. A large
        negative value (-10000) is added to the attention scores for any
        positions in the mask that are 0; positions that are 1 are unchanged.
      num_attention_heads: int. Number of attention heads.
      size_per_head: int. Size of each attention head.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      batch_size: (Optional) int. If the input is 2D, this might be the batch
        size of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `to_tensor`.

    Returns:
      float Tensor of shape [batch_size, from_seq_length, num_attention_heads,
        size_per_head].

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])

    if len(from_shape) != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    # For rank-3 inputs the sizes are carried by the tensors themselves; only
    # rank-2 inputs need the caller to supply them explicitly.
    if len(from_shape) == 2:
        if (batch_size is None or from_seq_length is None or
                to_seq_length is None):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    # `query_layer` = [B, F, N, H]
    query_layer = dense_layer_3d(from_tensor, num_attention_heads,
                                 size_per_head,
                                 create_initializer(initializer_range),
                                 query_act, "query")

    # `key_layer` = [B, T, N, H]
    key_layer = dense_layer_3d(to_tensor, num_attention_heads, size_per_head,
                               create_initializer(initializer_range), key_act,
                               "key")

    # `value_layer` = [B, T, N, H]
    value_layer = dense_layer_3d(to_tensor, num_attention_heads,
                                 size_per_head,
                                 create_initializer(initializer_range),
                                 value_act, "value")

    # Take the dot product between "query" and "key" to get the raw
    # attention scores.
    attention_scores = tf.einsum("BTNH,BFNH->BNFT", key_layer, query_layer)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # Since attention_mask is 1.0 for positions we want to attend and 0.0
        # for masked positions, this operation will create a tensor which is
        # 0.0 for positions we want to attend and -10000.0 for masked
        # positions. The mask is cast to the dtype of the scores (rather than
        # a hard-coded float32) so the addition below also works when the
        # scores are computed in another float type.
        adder = (1.0 - tf.cast(attention_mask, attention_scores.dtype)) * -10000.0

        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_scores += adder

    # Normalize the attention scores to probabilities.
    # `attention_probs` = [B, N, F, T]
    attention_probs = tf.nn.softmax(attention_scores)

    # This is actually dropping out entire tokens to attend to, which might
    # seem a bit unusual, but is taken from the original Transformer paper.
    attention_probs = dropout(attention_probs, attention_probs_dropout_prob)

    # `context_layer` = [B, F, N, H]
    context_layer = tf.einsum("BNFT,BTNH->BFNH", attention_probs, value_layer)

    return context_layer
def call(self, x, y, bias, cache=None):
    """Apply attention mechanism to x and y.

    Standard entities attend only to standard entities, while the trailing
    `self.num_vir_entities` virtual entities attend to every position.

    Args:
      x: a tensor with shape [batch_size, length_x + num_ve, hidden_size]
      y: a tensor with shape [batch_size, length_y + num_ve, hidden_size]
      bias: attention bias that is added to the standard-entity logits.
      cache: (Used during prediction) dictionary with tensors containing
        results of previous attentions. The dictionary must have the items:
            {"k": tensor with shape [batch_size, i, key_channels],
             "v": tensor with shape [batch_size, i, value_channels]}
        where i is the current decoded length. It is updated in place.

    Returns:
      Attention layer output with shape
      [batch_size, length_x + num_ve, hidden_size]
    """
    num_ve = self.num_vir_entities
    length = tf.shape(x)[1]  # input_length + num_ve
    depth = self.hidden_size // self.num_heads
    std_len = length - num_ve  # number of standard (non-virtual) entities

    # Separate learned projections for query, key and value.
    q = self.q_dense_layer(x)
    k = self.k_dense_layer(y)
    v = self.v_dense_layer(y)

    if cache is not None:
        # Prepend the cached keys/values, then store the extended tensors.
        k = tf.concat([cache["k"], k], axis=1)
        v = tf.concat([cache["v"], v], axis=1)
        cache["k"] = k
        cache["v"] = v

    # Split into heads: (batch_size, num_heads, length, depth) ...
    q, k, v = [self.split_heads(t) for t in (q, k, v)]
    # ... then fold batch and head dimensions together: (-1, length, depth).
    flat_shape = (-1, length, depth)
    q, k, v = [tf.reshape(t, flat_shape) for t in (q, k, v)]

    # Scale q to prevent the dot product between q and k from growing too
    # large.
    q = q * depth ** -0.5

    # Dot-product attention. Only standard entities update reps of standard
    # entities; virtual entities receive updates from all entities.
    logits_std = tf.einsum('aib,ajb->aij',
                           q[:, :std_len, :],
                           k[:, :std_len, :])  # (-1, len-num_ve, len-num_ve)
    logits_vir = tf.einsum('aib,ajb->aij',
                           q[:, std_len:, :], k)  # (-1, num_ve, length)

    bias = self.split_collapse(bias)
    logits_std = logits_std + bias

    weights_std = tf.nn.softmax(logits_std, name="weights_qk_std")
    weights_vir = tf.nn.softmax(logits_vir, name="weights_qk_vir")
    if self.train:
        keep_prob = 1.0 - self.attention_dropout
        weights_std = tf.nn.dropout(weights_std, keep_prob)
        weights_vir = tf.nn.dropout(weights_vir, keep_prob)

    ao_std = tf.einsum('aij,ajc->aic',
                       weights_std,
                       v[:, :std_len, :])  # (-1, length-num_ve, depth)
    ao_vir = tf.einsum('aij,ajc->aic', weights_vir, v)  # (-1, num_ve, depth)
    merged = tf.concat([ao_std, ao_vir], axis=-2)  # (-1, length, depth)

    per_head = tf.reshape(merged, (-1, self.num_heads, length, depth))
    # Recombine heads --> [batch_size, length, hidden_size]
    combined = self.combine_heads(per_head)
    # Final linear projection; shape (batch_size, length, hidden_size).
    return self.output_dense_layer(combined)
def buildGamStep(self, iStep, ListWeightUZ, ListBiasUZ, ListWeightGam, ListBiasGam):
    """Build the graph pieces used to fit the Gam network at step `iStep`.

    Creates the placeholders, simulates a forward path (and its antithetic
    twin) with the already-fitted networks, accumulates a regression target
    `GamTraj`, and defines a mean-squared loss with an Adam training op.

    Returns a dict with keys "LRate", "XPrev", "RandG" (placeholders), "Gam"
    (network output), "weightLoc"/"biasLoc", "Loss" and "train".
    """
    dic = {}
    # Placeholders: learning rate, state at step iStep, and the Gaussian
    # increments for the (nbStepGam - iStep) remaining steps.
    dic["LRate"] = tf.compat.v1.placeholder(tf.float32, shape=[], name="learning_rate")
    dic["XPrev"] = tf.compat.v1.placeholder(dtype=tf.float32, shape=[None, self.d], name='XPrev')
    dic["RandG"] = tf.compat.v1.placeholder(
        dtype=tf.float32, shape=[None, self.d, self.nbStepGam - iStep], name='randG')
    sample_size = tf.shape(dic["XPrev"])[0]
    sig = self.model.sigScal
    mu = self.model.muScal
    # Normalisation of the network input.
    # NOTE(review): rescale is 0 when iStep == 0, so normX0 divides by zero
    # in that case -- confirm that this method is never called with iStep 0.
    rescale = sig * np.sqrt(self.TStepGam * iStep)
    normX0 = (dic["XPrev"] - self.xInit - mu * self.TStepGam * iStep) / rescale
    # The network to train: warm-started from the most recent Gam weights
    # when this is not the last step, freshly created otherwise.
    if (iStep < self.nbStepGam):
        dic["Gam"] = self.networkGam.createNetworkWithInitializer(
            normX0, iStep, ListWeightGam[-1], ListBiasGam[-1], rescale)
    else:
        dic["Gam"] = self.networkGam.createNetwork(normX0, iStep, rescale)
    sqrtDt = np.sqrt(self.TStepGam)
    # Forward path and its antithetic counterpart both start at XPrev.
    XNext = dic["XPrev"]
    XNextAnti = dic["XPrev"]
    GamTraj = tf.zeros([sample_size, self.d, self.d])
    # Accumulated Brownian increment and elapsed time since step iStep.
    WAccul = tf.zeros([sample_size, self.d])
    TAccul = 0.
    for i in range(len(ListWeightGam) - 1):
        iStepLoc = iStep + i + 1
        tLoc = (iStep + i + 1) * self.TStepGam
        WAccul = WAccul + sqrtDt * dic["RandG"][:, :, i]
        TAccul = TAccul + self.TStepGam
        # Same noise, opposite sign for the antithetic path.
        XNext = XNext + mu * self.TStepGam + sig * sqrtDt * dic[
            "RandG"][:, :, i]
        XNextAnti = XNextAnti + mu * self.TStepGam - sig * sqrtDt * dic[
            "RandG"][:, :, i]
        # Index (from the end) of the stored UZ weights matching this step.
        iPosBSDE = (-i - 1) * self.nbStepGamStab
        normX = (XNext - self.xInit - mu * self.TStepGam * iStepLoc) / (
            sig * np.sqrt(self.TStepGam * iStepLoc))
        # The second argument differs between the three evaluations below
        # (iStepLoc, nbStepUDU + iStepLoc, 2 * nbStepUDU + iStepLoc);
        # presumably it keeps the frozen copies distinct -- TODO confirm.
        U, Z = self.networkUZ.createNetworkNotTrainable(
            normX, iStepLoc, ListWeightUZ[iPosBSDE], ListBiasUZ[iPosBSDE])
        Gam = self.networkGam.createNetworkNotTrainable(
            normX, iStepLoc, ListWeightGam[-i - 1], ListBiasGam[-i - 1])
        # Driver term on the forward path.
        driver = self.TStepGam * (0.5 * tf.einsum(
            'j,ij->i', tf.constant(sig * sig, dtype=tf.float32),
            tf.matrix_diag_part(Gam)) - self.model.fDW(
                iStepLoc * self.TStepGam, XNext, U, Z, Gam))
        # Same quantities on the antithetic path.
        normXAnti = (XNextAnti - self.xInit -
                     mu * self.TStepGam * iStepLoc) / (sig * np.sqrt(self.TStepGam * iStepLoc))
        UAnti, ZAnti = self.networkUZ.createNetworkNotTrainable(
            normXAnti, self.nbStepUDU + iStepLoc, ListWeightUZ[iPosBSDE],
            ListBiasUZ[iPosBSDE])
        GamAnti = self.networkGam.createNetworkNotTrainable(
            normXAnti, self.nbStepUDU + iStepLoc, ListWeightGam[-i - 1],
            ListBiasGam[-i - 1])
        driverAnti = self.TStepGam * (0.5 * tf.einsum(
            'j,ij->i', tf.constant(sig * sig, dtype=tf.float32),
            tf.matrix_diag_part(GamAnti)) - self.model.fDW(
                iStepLoc * self.TStepGam, XNextAnti, UAnti, ZAnti, GamAnti))
        # Same quantities evaluated at the unmoved starting point XPrev.
        normXPrev = (dic["XPrev"] - self.xInit -
                     mu * self.TStepGam * iStepLoc) / (sig * np.sqrt(self.TStepGam * iStepLoc))
        UPrev, ZPrev = self.networkUZ.createNetworkNotTrainable(
            normXPrev, 2 * self.nbStepUDU + iStepLoc, ListWeightUZ[iPosBSDE],
            ListBiasUZ[iPosBSDE])
        GamPrev = self.networkGam.createNetworkNotTrainable(
            normXPrev, 2 * self.nbStepUDU + iStepLoc, ListWeightGam[-i - 1],
            ListBiasGam[-i - 1])
        driverPrev = self.TStepGam * (0.5 * tf.einsum(
            'j,ij->i', tf.constant(sig * sig, dtype=tf.float32),
            tf.matrix_diag_part(GamPrev)) - self.model.fDW(
                iStepLoc * self.TStepGam, dic["XPrev"], UPrev, ZPrev, GamPrev))
        # Per-sample d x d weight built from the accumulated increment,
        # scaled by 1 / sig on both indices and by 1 / TAccul**2.
        weight = (tf.einsum(
            'lij,j->lij',
            tf.einsum('i,lij->lij', tf.constant(1 / sig, dtype=tf.float32),
                      tf.einsum("li,lj->lij", WAccul, WAccul) - TAccul),
            tf.constant(1 / sig, dtype=tf.float32))) / (TAccul * TAccul)
        # Second-difference combination of the three driver evaluations.
        GamTraj = GamTraj - tf.einsum(
            "l,lij->lij", 0.5 * (driver + driverAnti - 2 * driverPrev), weight)
    # Last simulated step up to the terminal date.
    if (len(ListWeightGam) > 0):
        XNext = XNext + mu * self.TStepGam + sig * sqrtDt * dic[
            "RandG"][:, :, len(ListWeightGam) - 1]
        XNextAnti = XNextAnti + mu * self.TStepGam - sig * sqrtDt * dic[
            "RandG"][:, :, len(ListWeightGam) - 1]
    # Terminal contribution, averaged over the path and its antithetic twin.
    # NOTE(review): this block was reconstructed from a flattened source; the
    # statement is placed outside the `if` above so that the terminal term is
    # also added when ListWeightGam is empty -- confirm against the original.
    GamTraj = GamTraj + 0.5 * (self.model.D2gTf(XNext) +
                               self.model.D2gTf(XNextAnti))
    dic["weightLoc"], dic[
        "biasLoc"] = self.networkGam.getBackWeightAndBias(iStep)
    # Mean-squared error between the network output and the target.
    dic["Loss"] = tf.reduce_mean(tf.pow(dic["Gam"] - GamTraj, 2))
    dic["train"] = tf.compat.v1.train.AdamOptimizer(
        learning_rate=dic["LRate"]).minimize(dic["Loss"])
    return dic
def matmul_joint_coords(transformation_matrices, coords):
    """Multiply every coordinate vector by its batch element's matrix.

    Computes out[B, C, i] = sum_j transformation_matrices[B, i, j] *
    coords[B, C, j], i.e. one matrix per leading index B applied to all C
    vectors sharing that index.
    """
    contraction = 'Bij,BCj->BCi'
    transformed = tf.einsum(contraction, transformation_matrices, coords)
    return transformed
def make_eval_graph(
        self,
        batch_size,
        source_length,
        target_length,
        bos_token_id,
):
    """Make all the placeholders and outputs for greedy decoding.

    The source is encoded with a GRU unrolled for `source_length` steps; the
    decoder is unrolled for `target_length` steps, each step feeding the
    argmax token of the previous step back in (the first step is fed
    `bos_token_id`).

    Returns a dict with the 'inputs' placeholder under 'placeholders' and,
    under 'outputs', the stacked output tokens, the stacked attention
    weights, and the per-step attention weights as a Python list.
    """
    length_tag = 'len{0}'.format(source_length)

    with tf.name_scope('eval_placeholders_' + length_tag):
        inputs = tf.placeholder(
            dtype=tf.int32,
            shape=[batch_size, source_length],
            name='inputs',
        )
        bos_tokens = tf.constant([bos_token_id] * batch_size,
                                 dtype=tf.int32,
                                 shape=[batch_size],
                                 name='bos_tokens')

    with tf.name_scope('eval_encoder_' + length_tag):
        embedded_source = tf.nn.embedding_lookup(
            self.source_embedding_matrix,
            inputs,
            name='embedded_encoder_inputs',
        )
        encoder_state = tf.zeros(
            [batch_size, self.hidden_size],
            name='h_start_encoder',
            dtype=tf.float32,
        )
        encoder_states = []
        for step in range(source_length):
            encoder_state = gru_update(embedded_source[:, step, :],
                                       encoder_state,
                                       self.source_gru_params, step)
            encoder_states.append(encoder_state)

        # (batch_size, num_steps * hidden_size)
        flat_encoder_states = tf.concat(
            encoder_states, axis=1, name='concatenated_states_encoder')
        # (batch_size, num_steps, hidden_size); this is what attention reads.
        encoder_memory = tf.reshape(
            flat_encoder_states,
            [batch_size, source_length, self.hidden_size],
            name='reshaped_states_encoder',
        )
        # Encoder states multiplied by the attention matrix;
        # (batch_size, num_steps, hidden_size).
        attended_states = tf.identity(
            tf.einsum(
                'ij,fgj->fgi',
                self.attention_params['W'],
                encoder_memory,
            ),
            name='attended_states',
        )
        # (batch_size, hidden_size)
        final_states = encoder_states[-1]

    with tf.name_scope('eval_decoder_' + length_tag):
        transposed_target_embeddings = tf.transpose(
            self.target_embedding_matrix,
            [1, 0],
            'transposed_target_embeddings',
        )
        decoder_state = final_states
        previous_tokens = bos_tokens
        attention_weights = []
        output_tokens = []
        for step in range(target_length):
            embedded_token = tf.nn.embedding_lookup(
                self.target_embedding_matrix,
                previous_tokens,
                name='embedded_decoder_inputs{0}'.format(step),
            )
            # Unnormalized scores: (batch_size, source_length)
            raw_scores = tf.identity(
                tf.einsum(
                    'ik,ijk->ij',
                    decoder_state,
                    attended_states,
                ),
                name='attention_weights_unnormalized{0}'.format(step))
            # Normalized weights: (batch_size, source_length)
            step_attention = tf.nn.softmax(
                raw_scores,
                name='attention_weights_normalized{0}'.format(step))
            attention_weights.append(step_attention)
            # Attention-weighted sum of encoder states:
            # (batch_size, hidden_size)
            context_vector = tf.identity(
                tf.einsum(
                    'ij,ijk->ik',
                    step_attention,
                    encoder_memory,
                ),
                name='context_vector{0}'.format(step))
            next_state = self._attn_gru_update(embedded_token, context_vector,
                                               decoder_state, step)
            antiembeddings = tf.nn.xw_plus_b(
                next_state,
                self.softmax_params['W'],
                self.softmax_params['b'],
                name='antiembeddings{0}'.format(step),
            )
            logits = tf.matmul(
                antiembeddings,
                transposed_target_embeddings,
                name='logits{0}'.format(step),
            )
            chosen_token = tf.argmax(
                logits,
                axis=1,
                name='output{0}'.format(step),
            )
            output_tokens.append(chosen_token)
            # Greedy feedback: the chosen token is the next step's input.
            decoder_state = next_state
            previous_tokens = chosen_token

        stacked_attention_weights = tf.stack(attention_weights,
                                             axis=1,
                                             name='attention_weights')
        outputs = tf.stack(output_tokens, axis=1, name='eval_outputs')

    return {
        'placeholders': {
            'inputs': inputs,
        },
        'outputs': {
            'outputs': outputs,
            'attention_weights': stacked_attention_weights,
            'attention_weights_as_array': attention_weights,
        },
    }
def _logit(x, W, b, proj=None):
    """Return `x` (optionally projected by `proj`) contracted with `W`, plus `b`.

    When `proj` is given, the last axis of `x` is first contracted with the
    last axis of `proj`; the result's last axis is then contracted with the
    last axis of `W`.
    """
    hidden = x if proj is None else tf.einsum("ibd,ed->ibe", x, proj)
    scores = tf.einsum("ibd,nd->ibn", hidden, W)
    return scores + b
def make_training_graph(
        self,
        batch_size,
        source_length,
        target_length,
):
    """Make all the placeholders, outputs, and training ops.

    The source is encoded with a GRU unrolled for `source_length` steps. The
    decoder is unrolled for `target_length - 1` steps with teacher forcing
    (step i is fed target token i) and its logits are scored against
    `targets[:, 1:]`. Gradients are clipped by global norm and applied with
    plain gradient descent.

    Returns a dict with the placeholders ('inputs', 'targets',
    'learning_rate', 'max_norm'), the outputs ('loss',
    'num_correct_predictions') and the train ops ('train_op',
    'gradient_global_norm', 'summary').
    """
    with tf.name_scope('placeholders_len{0}'.format(source_length)):
        inputs = tf.placeholder(
            dtype=tf.int32,
            shape=[batch_size, source_length],
            name='inputs',
        )
        targets = tf.placeholder(
            dtype=tf.int32,
            shape=[batch_size, target_length],
            name='targets',
        )
        learning_rate = tf.placeholder(
            dtype=tf.float32,
            shape=[],
            name='learning_rate',
        )
        max_norm = tf.placeholder(
            dtype=tf.float32,
            shape=[],
            name='max_norm',
        )

    with tf.name_scope('encoder_len{0}'.format(source_length)):
        embedded_encoder_inputs = tf.nn.embedding_lookup(
            self.source_embedding_matrix,
            inputs,
            name='embedded_encoder_inputs',
        )
        h_start_encoder = tf.zeros(
            [batch_size, self.hidden_size],
            name='h_start_encoder',
            dtype=tf.float32,
        )
        h_prev_encoder = h_start_encoder
        h_states_encoder = []
        for i in range(source_length):
            h_states_encoder.append(
                gru_update(embedded_encoder_inputs[:, i, :], h_prev_encoder,
                           self.source_gru_params, i))
            h_prev_encoder = h_states_encoder[-1]
        # concatenated_states will have shape
        # (batch_size, num_steps * hidden_size)
        concatenated_states_encoder = tf.concat(
            h_states_encoder, axis=1, name='concatenated_states_encoder')
        # reshaped_states will have shape
        # (batch_size, num_steps, hidden_size)
        reshaped_states_encoder = tf.reshape(
            concatenated_states_encoder,
            [batch_size, source_length, self.hidden_size],
            name='reshaped_states_encoder',
        )
        # attended_states will have shape
        # (batch_size, num_steps, hidden_size)
        attended_states = tf.identity(
            tf.einsum(
                'ij,fgj->fgi',
                self.attention_params['W'],
                reshaped_states_encoder,
            ),
            name='attended_states',
        )
        # final_states will have shape
        # (batch_size, hidden_size)
        final_states = h_states_encoder[-1]
        tf.summary.histogram(
            'concatenated_states_encoder',
            concatenated_states_encoder,
            collections=['summaries_len{0}'.format(source_length)],
        )

    with tf.name_scope('decoder_len{0}'.format(source_length)):
        embedded_decoder_inputs = tf.nn.embedding_lookup(
            self.target_embedding_matrix,
            targets,
            name='embedded_decoder_inputs',
        )
        h_prev_decoder = final_states
        attention_weights = []
        h_states_decoder = []
        for i in range(target_length - 1):
            # attention_weights_unnormalized will have shape
            # (batch_size, source_length)
            attention_weights_unnormalized = tf.identity(
                tf.einsum(
                    'ik,ijk->ij',
                    h_prev_decoder,
                    attended_states,
                ),
                name='attention_weights_unnormalized{0}'.format(i))
            # attention_weights_normalized will have shape
            # (batch_size, source_length)
            attention_weights_normalized = tf.nn.softmax(
                attention_weights_unnormalized,
                name='attention_weights_normalized{0}'.format(i))
            attention_weights.append(attention_weights_normalized)
            # context_vector will have shape
            # (batch_size, hidden_size)
            context_vector = tf.identity(
                tf.einsum(
                    'ij,ijk->ik',
                    attention_weights_normalized,
                    reshaped_states_encoder,
                ),
                name='context_vector{0}'.format(i))
            h_states_decoder.append(
                self._attn_gru_update(embedded_decoder_inputs[:, i, :],
                                      context_vector, h_prev_decoder, i))
            h_prev_decoder = h_states_decoder[-1]
        # concatenated_states will have shape
        # (batch_size, num_steps * hidden_size)
        concatenated_states_decoder = tf.concat(
            h_states_decoder, axis=1, name='concatenated_states_decoder')
        # long_and_skinny_states will have shape
        # (batch_size * num_steps, hidden_size)
        long_and_skinny_states = tf.reshape(
            concatenated_states_decoder,
            [batch_size * (target_length - 1), self.hidden_size],
            name='long_and_skinny_states',
        )
        # Decoder states mapped by the softmax parameters, one row per
        # (batch element, step) pair.
        long_and_skinny_antiembeddings = tf.nn.xw_plus_b(
            long_and_skinny_states,
            self.softmax_params['W'],
            self.softmax_params['b'],
            name='long_and_skinny_antiembeddings',
        )
        transposed_target_embeddings = tf.transpose(
            self.target_embedding_matrix,
            [1, 0],
            'transposed_target_embeddings',
        )
        # long_and_skinny_logits will have shape
        # (batch_size * num_steps, vocab_size)
        long_and_skinny_logits = tf.matmul(
            long_and_skinny_antiembeddings,
            transposed_target_embeddings,
            name='long_and_skinny_logits',
        )
        # logits will have shape
        # (batch_size, num_steps, vocab_size)
        logits = tf.reshape(
            long_and_skinny_logits,
            [batch_size, (target_length - 1), self.target_vocab_size],
            name='logits')
        # concatenated_attention_weights will have shape
        # (batch_size, (target_length - 1) * source_length)
        concatenated_attention_weights = tf.concat(
            attention_weights, axis=1, name='concatenated_attention_weights')
        # Log the decoder states under the decoder summary name (this used
        # to log the encoder states a second time by mistake).
        tf.summary.histogram(
            'concatenated_states_decoder',
            concatenated_states_decoder,
            collections=['summaries_len{0}'.format(source_length)],
        )
        tf.summary.histogram(
            'concatenated_attention_weights',
            concatenated_attention_weights,
            collections=['summaries_len{0}'.format(source_length)],
        )

    with tf.name_scope('summary_len{0}'.format(source_length)):
        # Step i predicts target token i + 1, so drop the start token.
        targets_without_start_token = tf.identity(
            targets[:, 1:], name='targets_without_start_token')
        batch_loss = tf.contrib.seq2seq.sequence_loss(
            logits=logits,
            targets=targets_without_start_token,
            weights=tf.ones_like(targets_without_start_token,
                                 dtype=tf.float32),
            average_across_timesteps=True,
            average_across_batch=True,
            name='batch_loss',
        )
        loss = tf.reduce_sum(
            batch_loss,
            name='loss',
        )
        predictions = tf.cast(
            tf.argmax(
                logits,
                axis=-1,
            ),
            tf.int32,
            name='predictions',
        )
        num_correct_predictions = tf.reduce_sum(
            tf.cast(tf.equal(predictions, targets_without_start_token),
                    tf.int32),
            name='num_correct_predictions',
        )
        tf.summary.scalar(
            'loss',
            loss,
            collections=['summaries_len{0}'.format(source_length)],
        )
        tf.summary.scalar(
            'num_correct_predictions',
            num_correct_predictions,
            collections=['summaries_len{0}'.format(source_length)],
        )

    with tf.name_scope('train_ops_len{0}'.format(source_length)):
        trainable_variables = tf.trainable_variables()
        unclipped_gradients = tf.gradients(loss, trainable_variables)
        gradient_global_norm = tf.global_norm(unclipped_gradients,
                                              name='gradient_global_norm')
        clipped_gradients, _ = tf.clip_by_global_norm(
            unclipped_gradients, max_norm, name='clipped_gradients')
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        train_op = optimizer.apply_gradients(
            zip(clipped_gradients, trainable_variables), )

    merged_summaries = tf.summary.merge_all(
        key='summaries_len{0}'.format(source_length))

    return {
        'placeholders': {
            'inputs': inputs,
            'targets': targets,
            'learning_rate': learning_rate,
            'max_norm': max_norm,
        },
        'outputs': {
            'loss': loss,
            'num_correct_predictions': num_correct_predictions,
        },
        'train_ops': {
            'train_op': train_op,
            'gradient_global_norm': gradient_global_norm,
            'summary': tf.summary.merge([
                self.merged_variable_summaries,
                merged_summaries,
            ]),
        }
    }
def __init__(
        self, review_num_u, review_num_i, review_len_u, review_len_i,
        user_num, item_num, num_classes, user_vocab_size, item_vocab_size,
        n_latent, embedding_id, attention_size, embedding_size,
        filter_sizes, num_filters, l2_reg_lambda=0.0):
    """Build the rating-prediction graph.

    Each user and each item is described by a fixed number of reviews. Every
    review is encoded by a convolution + max-pool text encoder, reviews are
    combined with a learned attention weighting, and the resulting user and
    item features are multiplied element-wise to predict the rating.

    Args:
      review_num_u, review_num_i: number of reviews per user / per item.
      review_len_u, review_len_i: number of tokens per user / item review.
      user_num, item_num: number of users / items (id tables get 2 extra rows).
      num_classes: unused by this constructor.
      user_vocab_size, item_vocab_size: vocabulary sizes of the word tables.
      n_latent: width of the final user/item feature vectors.
      embedding_id: width of the id embeddings. The id embedding is added to
        an `n_latent`-wide vector, so the two presumably have to be equal.
      attention_size: hidden width of the attention network.
      embedding_size: word embedding width.
      filter_sizes: iterable of convolution window heights.
      num_filters: number of filters per window height.
      l2_reg_lambda: weight of the L2 penalty on the attention matrices.
    """
    # ------------------------------------------------------------ inputs
    self.input_u = tf.placeholder(tf.int32, [None, review_num_u, review_len_u], name="input_u")
    self.input_i = tf.placeholder(tf.int32, [None, review_num_i, review_len_i], name="input_i")
    self.input_reuid = tf.placeholder(tf.int32, [None, review_num_u], name='input_reuid')
    # Fixed: this placeholder used to reuse the name 'input_reuid'.
    self.input_reiid = tf.placeholder(tf.int32, [None, review_num_i], name='input_reiid')
    self.input_y = tf.placeholder(tf.float32, [None, 1], name="input_y")
    self.input_uid = tf.placeholder(tf.int32, [None, 1], name="input_uid")
    self.input_iid = tf.placeholder(tf.int32, [None, 1], name="input_iid")
    self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
    self.drop0 = tf.placeholder(tf.float32, name="dropout0")

    # Id embeddings used inside the attention network.
    iidW = tf.Variable(tf.random_uniform([item_num + 2, embedding_id], -0.1, 0.1), name="iidW")
    uidW = tf.Variable(tf.random_uniform([user_num + 2, embedding_id], -0.1, 0.1), name="uidW")

    l2_loss = tf.constant(0.0)

    # --------------------------------------------------- word embeddings
    with tf.name_scope("user_embedding"):
        self.W1 = tf.Variable(
            tf.random_uniform([user_vocab_size, embedding_size], -1.0, 1.0),
            name="W1")
        self.embedded_user = tf.nn.embedding_lookup(self.W1, self.input_u)
        self.embedded_users = tf.expand_dims(self.embedded_user, -1)

    with tf.name_scope("item_embedding"):
        self.W2 = tf.Variable(
            tf.random_uniform([item_vocab_size, embedding_size], -1.0, 1.0),
            name="W2")
        self.embedded_item = tf.nn.embedding_lookup(self.W2, self.input_i)
        self.embedded_items = tf.expand_dims(self.embedded_item, -1)

    # ------------------------------------------- user review text encoder
    pooled_outputs_u = []
    for filter_size in filter_sizes:
        with tf.name_scope("user_conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            # Fold the review axis into the batch axis: one row per review.
            self.embedded_users = tf.reshape(self.embedded_users, [-1, review_len_u, embedding_size, 1])
            conv = tf.nn.conv2d(
                self.embedded_users,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, review_len_u - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs_u.append(pooled)
    num_filters_total = num_filters * len(filter_sizes)
    # NOTE(review): tf.concat(axis, values) is the pre-1.0 TensorFlow argument
    # order; left unchanged because the file's target version is not visible.
    self.h_pool_u = tf.concat(3,pooled_outputs_u)
    self.h_pool_flat_u = tf.reshape(self.h_pool_u, [-1, review_num_u, num_filters_total])

    # ------------------------------------------- item review text encoder
    pooled_outputs_i = []
    for filter_size in filter_sizes:
        with tf.name_scope("item_conv-maxpool-%s" % filter_size):
            # Convolution Layer
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
            self.embedded_items = tf.reshape(self.embedded_items, [-1, review_len_i, embedding_size, 1])
            conv = tf.nn.conv2d(
                self.embedded_items,
                W,
                strides=[1, 1, 1, 1],
                padding="VALID",
                name="conv")
            # Apply nonlinearity
            h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
            # Maxpooling over the outputs
            pooled = tf.nn.max_pool(
                h,
                ksize=[1, review_len_i - filter_size + 1, 1, 1],
                strides=[1, 1, 1, 1],
                padding='VALID',
                name="pool")
            pooled_outputs_i.append(pooled)
    num_filters_total = num_filters * len(filter_sizes)
    self.h_pool_i = tf.concat(3,pooled_outputs_i)
    self.h_pool_flat_i = tf.reshape(self.h_pool_i, [-1, review_num_i, num_filters_total])

    with tf.name_scope("dropout"):
        # Keep probability is fixed at 1.0 here, so these are pass-throughs.
        self.h_drop_u = tf.nn.dropout(self.h_pool_flat_u, 1.0)
        self.h_drop_i = tf.nn.dropout(self.h_pool_flat_i, 1.0)

    # -------------------------------------------- review-level attention
    with tf.name_scope("attention"):
        Wau = tf.Variable(
            tf.random_uniform([num_filters_total, attention_size], -0.1, 0.1), name='Wau')
        Wru = tf.Variable(
            tf.random_uniform([embedding_id, attention_size], -0.1, 0.1), name='Wru')
        Wpu = tf.Variable(
            tf.random_uniform([attention_size, 1], -0.1, 0.1), name='Wpu')
        bau = tf.Variable(tf.constant(0.1, shape=[attention_size]), name="bau")
        bbu = tf.Variable(tf.constant(0.1, shape=[1]), name="bbu")
        self.iid_a = tf.nn.relu(tf.nn.embedding_lookup(iidW, self.input_reuid))
        # Attention score per user review: None * review_num_u * 1
        self.u_j = tf.einsum('ajk,kl->ajl', tf.nn.relu(
            tf.einsum('ajk,kl->ajl', self.h_drop_u, Wau) + tf.einsum('ajk,kl->ajl', self.iid_a, Wru) + bau),
                             Wpu) + bbu
        # Normalised over the review axis: None * review_num_u * 1
        self.u_a = tf.nn.softmax(self.u_j, 1)

        Wai = tf.Variable(
            tf.random_uniform([num_filters_total, attention_size], -0.1, 0.1), name='Wai')
        Wri = tf.Variable(
            tf.random_uniform([embedding_id, attention_size], -0.1, 0.1), name='Wri')
        Wpi = tf.Variable(
            tf.random_uniform([attention_size, 1], -0.1, 0.1), name='Wpi')
        bai = tf.Variable(tf.constant(0.1, shape=[attention_size]), name="bai")
        bbi = tf.Variable(tf.constant(0.1, shape=[1]), name="bbi")
        self.uid_a = tf.nn.relu(tf.nn.embedding_lookup(uidW, self.input_reiid))
        self.i_j =tf.einsum('ajk,kl->ajl', tf.nn.relu(
            tf.einsum('ajk,kl->ajl', self.h_drop_i, Wai) + tf.einsum('ajk,kl->ajl', self.uid_a, Wri) + bai),
                            Wpi)+bbi
        # Normalised over the review axis: None * review_num_i * 1
        self.i_a = tf.nn.softmax(self.i_j,1)

        # Only these four attention matrices are L2-regularised.
        l2_loss += tf.nn.l2_loss(Wau)
        l2_loss += tf.nn.l2_loss(Wru)
        l2_loss += tf.nn.l2_loss(Wri)
        l2_loss += tf.nn.l2_loss(Wai)

    with tf.name_scope("add_reviews"):
        # Attention-weighted sum of the review features.
        self.u_feas = tf.reduce_sum(tf.multiply(self.u_a, self.h_drop_u), 1)
        self.u_feas = tf.nn.dropout(self.u_feas, self.dropout_keep_prob)
        self.i_feas = tf.reduce_sum(tf.multiply(self.i_a, self.h_drop_i), 1)
        self.i_feas = tf.nn.dropout(self.i_feas, self.dropout_keep_prob)

    with tf.name_scope("get_fea"):
        iidmf = tf.Variable(tf.random_uniform([item_num + 2, embedding_id], -0.1, 0.1), name="iidmf")
        uidmf = tf.Variable(tf.random_uniform([user_num + 2, embedding_id], -0.1, 0.1), name="uidmf")

        self.uid = tf.nn.embedding_lookup(uidmf,self.input_uid)
        self.iid = tf.nn.embedding_lookup(iidmf,self.input_iid)
        self.uid = tf.reshape(self.uid,[-1,embedding_id])
        self.iid = tf.reshape(self.iid,[-1,embedding_id])
        Wu = tf.Variable(
            tf.random_uniform([num_filters_total, n_latent], -0.1, 0.1), name='Wu')
        bu = tf.Variable(tf.constant(0.1, shape=[n_latent]), name="bu")
        # Review-based features plus the id embedding.
        self.u_feas = tf.matmul(self.u_feas, Wu)+self.uid + bu

        Wi = tf.Variable(
            tf.random_uniform([num_filters_total, n_latent], -0.1, 0.1), name='Wi')
        bi = tf.Variable(tf.constant(0.1, shape=[n_latent]), name="bi")
        self.i_feas = tf.matmul(self.i_feas, Wi) +self.iid+ bi

    # -------------------------------------------------------- prediction
    with tf.name_scope('ncf'):
        self.FM = tf.multiply(self.u_feas, self.i_feas)
        self.FM = tf.nn.relu(self.FM)
        self.FM=tf.nn.dropout(self.FM,self.dropout_keep_prob)

        Wmul=tf.Variable(
            tf.random_uniform([n_latent, 1], -0.1, 0.1), name='wmul')

        self.mul=tf.matmul(self.FM,Wmul)
        self.score=tf.reduce_sum(self.mul,1,keep_dims=True)

        # Per-user, per-item and global bias terms.
        self.uidW2 = tf.Variable(tf.constant(0.1, shape=[user_num + 2]), name="uidW2")
        self.iidW2 = tf.Variable(tf.constant(0.1, shape=[item_num + 2]), name="iidW2")
        self.u_bias = tf.gather(self.uidW2, self.input_uid)
        self.i_bias = tf.gather(self.iidW2, self.input_iid)
        self.Feature_bias = self.u_bias + self.i_bias

        self.bised = tf.Variable(tf.constant(0.1), name='bias')

        self.predictions = self.score + self.Feature_bias + self.bised

    with tf.name_scope("loss"):
        losses = tf.nn.l2_loss(tf.subtract(self.predictions, self.input_y))

        self.loss = losses + l2_reg_lambda * l2_loss

    with tf.name_scope("accuracy"):
        # `mae` is the mean absolute error; `accuracy` holds the RMSE.
        self.mae = tf.reduce_mean(tf.abs(tf.subtract(self.predictions, self.input_y)))
        self.accuracy =tf.sqrt(tf.reduce_mean(tf.square(tf.subtract(self.predictions, self.input_y))))
# Smoke test: contract two all-ones variables with tf.einsum and print the
# shape of the result.
import tensorflow as tf

a = tf.Variable(tf.ones([2, 20, 1]))
b = tf.Variable(tf.ones([20, 1, 20]))

# "aij,ijk->ajk": i (size 20) is summed out, j (size 1) is shared by both
# operands, so c has shape [2, 1, 20].
c = tf.einsum("aij,ijk->ajk", a, b)

with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    s = sess.run([c, b, a], feed_dict={})
    # Only the first fetched value (c) is reported; the original loop broke
    # out after its first iteration.  The parenthesised call prints the same
    # text under Python 2 and is valid syntax under Python 3.
    print(s[0].shape)
def call(self, inputs, training=False):
    """Run multi-head attention with a relative-position term over one segment.

    Args:
        inputs: sequence unpacked positionally as
            ``(w, r, attn_mask, mems, head_mask, output_attentions)``.
            ``mems``, ``attn_mask`` and ``head_mask`` may each be ``None``.
        training: forwarded to the two dropout layers (``self.dropatt`` and
            ``self.drop``).

    Returns:
        A list. Its first element is ``w + attn_out`` (passed through
        ``self.layer_norm`` unless ``self.pre_lnorm`` is set). The attention
        probabilities are appended when ``output_attentions`` resolves to
        ``True``.
    """
    w, r, attn_mask, mems, head_mask, output_attentions = inputs
    qlen = shape_list(w)[0]
    rlen = shape_list(r)[0]
    bsz = shape_list(w)[1]

    has_mems = mems is not None
    # Keys and values are computed over the memory followed by the current
    # input; without memory they come from the input alone.
    qkv_input = tf.concat([mems, w], 0) if has_mems else w
    if self.pre_lnorm:
        qkv_input = self.layer_norm(qkv_input)
    w_heads = self.qkv_net(qkv_input)
    r_head_k = self.r_net(r)

    w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
    if has_mems:
        # Only the trailing qlen rows (those contributed by w) act as queries.
        w_head_q = w_head_q[-qlen:]

    klen = shape_list(w_head_k)[0]
    head_dims = (self.n_head, self.d_head)

    w_head_q = tf.reshape(w_head_q, (qlen, bsz) + head_dims)  # qlen x bsz x n_head x d_head
    w_head_k = tf.reshape(w_head_k, (klen, bsz) + head_dims)  # klen x bsz x n_head x d_head
    w_head_v = tf.reshape(w_head_v, (klen, bsz) + head_dims)  # klen x bsz x n_head x d_head
    r_head_k = tf.reshape(r_head_k, (rlen,) + head_dims)  # rlen x n_head x d_head

    # compute attention score
    ac_score = tf.einsum("ibnd,jbnd->ijbn", w_head_q + self.r_w_bias, w_head_k)
    bd_score = tf.einsum("ibnd,jnd->ijbn", w_head_q + self.r_r_bias, r_head_k)
    bd_score = self._rel_shift(bd_score)

    # [qlen x klen x bsz x n_head]
    attn_score = (ac_score + bd_score) * self.scale

    # compute attention probability
    if attn_mask is not None:
        mask = attn_mask[:, :, None, None]
        # Where the mask is 1 the score is replaced by -1e30, so the softmax
        # below assigns those positions (effectively) zero weight.
        attn_score = attn_score * (1 - mask) - 1e30 * mask

    # Normalise over axis 1 (the key axis of the "ijbn" layout).
    attn_prob = tf.nn.softmax(attn_score, axis=1)
    attn_prob = self.dropatt(attn_prob, training=training)

    # Mask heads if we want to
    if head_mask is not None:
        attn_prob = attn_prob * head_mask

    # compute attention vector: [qlen x bsz x n_head x d_head]
    attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v)

    # Merge the head axes back into a single feature axis.
    vec_shape = shape_list(attn_vec)
    attn_vec = tf.reshape(attn_vec, (vec_shape[0], vec_shape[1], self.n_head * self.d_head))

    # linear projection
    attn_out = self.drop(self.o_net(attn_vec), training=training)

    # residual connection; layer normalization is applied here only when it
    # was not already applied to the input above.
    residual = w + attn_out
    outputs = [residual] if self.pre_lnorm else [self.layer_norm(residual)]

    if cast_bool_to_primitive(output_attentions) is True:
        outputs.append(attn_prob)

    return outputs
def sh_invar_conv(signal, patches_idx, conv_tensor, kernel, l_max):
    """Weight the output of ``sh_invar_conv_`` with ``kernel``.

    ``signal``, ``patches_idx``, ``conv_tensor`` and ``l_max`` are forwarded
    unchanged to ``sh_invar_conv_``. Its result is then contracted with
    ``kernel`` over the three axes they share.

    Returns:
        The einsum result, laid out as ``b v i``.
    """
    features = sh_invar_conv_(signal, patches_idx, conv_tensor, l_max)
    # kernel is indexed (i, n, r, j) and features (b, v, n, r, j);
    # n, r and j are summed out.
    equation = 'inrj,bvnrj->bvi'
    return tf.einsum(equation, kernel, features)
def single_mode_gate(matrix, mode, in_modes, pure=True, batched=False):
    """Apply a gate matrix to one mode of a state tensor using ``tf.einsum``.

    basic form:
    'ab,cde...b...xyz->cde...a...xyz' (pure state)
    'ab,ef...bc...xyz,cd->ef...ad...xyz' (mixed state)

    Args:
        matrix: the gate. For mixed states its conjugate transpose is also
            applied on the right-hand side. With ``batched`` it is transposed
            with axes ``[0, 2, 1]``, i.e. it carries a leading batch axis.
        mode: zero-based position of the mode the gate acts on.
        in_modes: the state tensor. Only ``len(in_modes.shape)`` is inspected
            here: one axis per mode when ``pure``, two per mode otherwise,
            plus one leading batch axis when ``batched``.
        pure: whether ``in_modes`` is a pure state (one axis per mode).
        batched: whether a leading batch axis is present.

    Returns:
        The result of ``tf.einsum`` for the constructed equation.

    Raises:
        ValueError: if ``in_modes`` has no mode axes, or ``mode`` is outside
            ``[0, num_modes)``.
        NotImplementedError: if the equation would need more distinct labels
            than ``indices`` provides.
    """
    batch_offset = 1 if batched else 0
    batch_index = indices[:batch_offset]
    left_gate_str = indices[batch_offset : batch_offset + 2]  # |a><b|
    num_indices = len(in_modes.shape)
    if pure:
        num_modes = num_indices - batch_offset
        mode_size = 1
    else:
        right_gate_str = indices[batch_offset + 2 : batch_offset + 4]  # |c><d|
        num_modes = (num_indices - batch_offset) // 2
        mode_size = 2

    # The equation uses labels up to position
    # batch_offset + (1 + num_modes) * mode_size (the slice bound below), and
    # that must not exceed len(indices): a slice past the end is silently
    # truncated and yields a malformed equation.  The previous bound counted
    # one label per mode even for mixed states (two labels per mode), so
    # oversized mixed inputs slipped through.  The historical bound is kept
    # as a ceiling so that no call that used to be rejected is now accepted.
    max_len = min(
        len(indices) - 2 * mode_size - batch_offset,
        (len(indices) - batch_offset) // mode_size - 1,
    )

    if num_modes == 0:
        raise ValueError("'in_modes' must have at least one mode")
    if num_modes > max_len:
        raise NotImplementedError(
            "The max number of supported modes for this operation is currently {}".format(max_len)
        )
    if mode < 0 or mode >= num_modes:
        raise ValueError("'mode' argument is not compatible with number of in_modes")

    # Labels for every mode except the target one; the target mode reuses the
    # gate labels so that einsum contracts over it.
    other_modes_indices = indices[
        batch_offset + 2 * mode_size : batch_offset + (1 + num_modes) * mode_size
    ]
    before = other_modes_indices[: mode * mode_size]
    after = other_modes_indices[mode * mode_size :]

    if pure:
        eqn_lhs = "{},{}{}{}{}".format(
            batch_index + left_gate_str,
            batch_index,
            before,
            left_gate_str[1],
            after,
        )
        eqn_rhs = "".join([batch_index, before, left_gate_str[0], after])
    else:
        eqn_lhs = "{},{}{}{}{}{},{}".format(
            batch_index + left_gate_str,
            batch_index,
            before,
            left_gate_str[1],
            right_gate_str[0],
            after,
            batch_index + right_gate_str,
        )
        eqn_rhs = "".join(
            [batch_index, before, left_gate_str[0], right_gate_str[1], after]
        )

    eqn = eqn_lhs + "->" + eqn_rhs
    einsum_inputs = [matrix, in_modes]
    if not pure:
        # Right-hand factor: conjugate transpose of the gate (batch axis, if
        # any, stays in front).
        transposed_axis = [0, 2, 1] if batched else [1, 0]
        einsum_inputs.append(tf.transpose(tf.math.conj(matrix), transposed_axis))
    output = tf.einsum(eqn, *einsum_inputs)
    return output
def inference(x, y, n_batch, is_training, input_digits=None, output_digits=None, n_hidden=None, n_out=None):
    """Build an attention-wrapped LSTM encoder/decoder graph and return its softmax output.

    Args:
        x: encoder input, indexed as ``x[:, t, :]`` for ``t`` in
            ``range(input_digits)``.
        y: decoder input used on the training path, indexed as
            ``y[:, t-1, :]``.  Presumably one-hot over ``n_out`` classes, like
            the fed-back predictions on the other path -- TODO confirm
            against the caller.
        n_batch: batch size passed to ``encoder.zero_state``.
        is_training: the training path is taken only when this ``is True``
            (a Python identity check; any other object, including a tensor,
            selects the feedback path).
        input_digits: number of encoder steps; also the attention length of
            both ``AttentionCellWrapper`` instances.
        output_digits: number of decoder output steps.
        n_hidden: LSTM size and first dimension of the output weight ``V``.
        n_out: number of output classes (second dimension of ``V``).

    Returns:
        A softmax tensor reshaped to ``[-1, output_digits, n_out]`` on the
        feedback path; on the training path the softmax of the projected
        decoder outputs, which were reshaped to
        ``[-1, output_digits, n_hidden]`` before projection.
    """
    def weight_variable(shape):
        initial = tf.truncated_normal(shape, stddev=0.01)
        return tf.Variable(initial)

    def bias_variable(shape):
        initial = tf.zeros(shape, dtype=tf.float32)
        return tf.Variable(initial)

    # Encode
    encoder = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    encoder = rnn.AttentionCellWrapper(encoder, input_digits, state_is_tuple=True)
    state = encoder.zero_state(n_batch, tf.float32)
    encoder_outputs = []
    encoder_states = []

    with tf.variable_scope('Encoder'):
        for t in range(input_digits):
            if t > 0:
                # Share the cell's variables across time steps.
                tf.get_variable_scope().reuse_variables()
            (output, state) = encoder(x[:, t, :], state)
            encoder_outputs.append(output)
            encoder_states.append(state)

    # Decode
    decoder = rnn.BasicLSTMCell(n_hidden, forget_bias=1.0)
    decoder = rnn.AttentionCellWrapper(decoder, input_digits, state_is_tuple=True)
    # The decoder starts from the encoder's final state and final output.
    state = encoder_states[-1]
    decoder_outputs = [encoder_outputs[-1]]

    # Define the output layer's weight and bias up front, since the feedback
    # path needs them inside the decoding loop.
    V = weight_variable([n_hidden, n_out])
    c = bias_variable([n_out])
    outputs = []

    with tf.variable_scope('Decoder'):
        for t in range(1, output_digits):
            if t > 1:
                tf.get_variable_scope().reuse_variables()

            if is_training is True:
                (output, state) = decoder(y[:, t-1, :], state)
            else:
                # Compute the prediction for the previous step and feed it back.
                linear = tf.matmul(decoder_outputs[-1], V) + c
                out = tf.nn.softmax(linear)
                outputs.append(out)
                # The argmax ranges over the n_out classes of the softmax, so
                # the one-hot depth must be n_out.  Using the sequence length
                # (output_digits) encoded every class id >= output_digits as
                # an all-zero vector.
                out = tf.one_hot(tf.argmax(out, -1), depth=n_out)
                (output, state) = decoder(out, state)

            decoder_outputs.append(output)

    if is_training is True:
        output = tf.reshape(tf.concat(decoder_outputs, axis=1),
                            [-1, output_digits, n_hidden])
        # Apply the output layer to every time step at once.
        linear = tf.einsum('ijk,kl->ijl', output, V) + c
        return tf.nn.softmax(linear)
    else:
        # Compute the prediction for the last step.
        linear = tf.matmul(decoder_outputs[-1], V) + c
        out = tf.nn.softmax(linear)
        outputs.append(out)

        output = tf.reshape(tf.concat(outputs, axis=1),
                            [-1, output_digits, n_out])
        return output
def call(self, inputs):
    """Contract ``inputs`` with ``self.kernel`` and hand the result to ``self._finalize``."""
    # For every h: sum over j of inputs[n, h, j] * kernel[h, j, i] -> [n, h, i].
    projected = tf.einsum("nhj,hji->nhi", inputs, self.kernel)
    return self._finalize(projected)