def testEmptyInput(self):
    """Evaluating softmax over an empty `[0, 3]` placeholder must raise."""
    with self.test_session():
        empty_logits = array_ops.placeholder(dtypes.float32, shape=[0, 3])
        element_count = array_ops.size(empty_logits).eval()
        self.assertEqual(0, element_count)
        # reshape would raise if logits is empty
        expected_error = errors_impl.InvalidArgumentError
        with self.assertRaises(expected_error):
            nn_ops.softmax(empty_logits, axis=0).eval()
def testDimTooLarge(self):
    """An out-of-range `axis` supplied at run time raises InvalidArgumentError."""
    with self.test_session():
        # Use placeholder to make sure we get runtime error instead of shape
        # inference error.
        oversized_axis = array_ops.placeholder_with_default(100, shape=[])
        expected_error = errors_impl.InvalidArgumentError
        with self.assertRaises(expected_error):
            result = nn_ops.softmax([1., 2., 3., 4.], axis=oversized_axis)
            result.eval()
def testEmptyInput(self):
    """Evaluating softmax over an empty `[0, 3]` constant must raise."""
    with self.test_session():
        empty_logits = constant_op.constant([[]], shape=[0, 3])
        element_count = array_ops.size(empty_logits).eval()
        self.assertEqual(0, element_count)
        # reshape would raise if logits is empty
        expected_error = errors_impl.InvalidArgumentError
        with self.assertRaises(expected_error):
            nn_ops.softmax(empty_logits, dim=0).eval()
def testSoftmaxAxes(self):
    """Checks that equivalent spellings of the first axis give equal results.

    The input is rank 2, so `axis=-2` and `axis=0` name the same dimension
    and all three softmax results below must agree within `eps`.
    """
    arr = np.linspace(0., 1, 12).reshape(3, 4)
    x_neg_axis = nn_ops.softmax(arr, axis=-2)
    y_pos_axis = nn_ops.softmax(arr, axis=0)
    # `axis` has to lie within [-rank, rank). A value such as 4 is out of
    # range for this rank-2 input, and the result is compared against the
    # axis=0 softmax below, so the valid axis is used here.
    z_gt_axis = nn_ops.softmax(arr, axis=0)
    x_neg_axis_tf = self.evaluate(x_neg_axis)
    y_pos_axis_tf = self.evaluate(y_pos_axis)
    z_gt_axis_tf = self.evaluate(z_gt_axis)
    eps = 1e-3
    self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps)
    self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
def testSoftmax(self):
    """Compares softmax (default axis and explicit axis 1) with `self._softmax`."""
    shape = [5, 10]
    inputs_np = np.random.randn(*shape).astype(np.float32)
    expected = self._softmax(inputs_np)

    inputs_tf = constant_op.constant(inputs_np)
    default_axis_op = nn_ops.softmax(inputs_tf)
    last_axis_op = nn_ops.softmax(inputs_tf, 1)

    default_axis_value = self.evaluate(default_axis_op)
    last_axis_value = self.evaluate(last_axis_op)

    tolerance = 1e-3
    self.assertAllClose(default_axis_value, expected, tolerance)
    self.assertAllClose(last_axis_value, expected, tolerance)
def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
    """Gradient function for SoftmaxCrossEntropyWithLogits.

    Args:
        op: the forward op; `op.inputs[0]` is read as the logits and
            `op.outputs[1]` as the softmax gradient.
        grad_loss: backprop for the cost; it is broadcast-multiplied with
            `op.outputs[1]`.
        grad_grad: backprop for the softmax gradient output.

    Returns:
        A `(grad, None)` pair; there is no gradient for the labels.
    """

    def _feeds_zeros(tensor):
        # Some introspection to check if the gradient is feeding zeros.
        if tensor.op.type in ("ZerosLike", "Zeros"):
            return True
        static_value = tensor_util.constant_value(tensor)
        if static_value is None:
            return False
        return (static_value == 0).all()

    backprop = op.outputs[1]
    grad = _BroadcastMul(grad_loss, backprop)

    if _feeds_zeros(grad_grad):
        return grad, None

    # Second derivative is just softmax derivative w.r.t. logits.
    softmax = nn_ops.softmax(op.inputs[0])
    row = grad_grad[:, None, :]
    column = softmax[:, :, None]
    inner = array_ops.squeeze(math_ops.matmul(row, column), axis=1)
    grad += (grad_grad - inner) * softmax
    return grad, None
def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad):
    """Gradient function for SoftmaxCrossEntropyWithLogits.

    Args:
        op: the forward op; `op.inputs[0]` is read as the logits and
            `op.outputs[1]` as the softmax gradient.
        grad_loss: backprop for the cost; it is broadcast-multiplied with
            `op.outputs[1]`.
        grad_grad: backprop for the softmax gradient output; may be None.

    Returns:
        A pair: the gradient built from `op.outputs[1]` (plus the
        second-order term when `grad_grad` is usable), and `grad_loss`
        broadcast-multiplied with the negated log-softmax of the logits.
    """

    def _feeds_zeros(tensor):
        # Some introspection to check if the gradient is feeding zeros.
        if context.executing_eagerly():
            # TODO(apassos) add an efficient way to detect eager zeros here.
            return False
        if tensor.op.type in ("ZerosLike", "Zeros"):
            return True
        static_value = tensor_util.constant_value(tensor)
        if static_value is None:
            return False
        return (static_value == 0).all()

    backprop = op.outputs[1]
    grad = _BroadcastMul(grad_loss, backprop)
    logits = op.inputs[0]

    needs_second_order = grad_grad is not None and not _feeds_zeros(grad_grad)
    if needs_second_order:
        # Second derivative is just softmax derivative w.r.t. logits.
        softmax = nn_ops.softmax(logits)
        row = array_ops.expand_dims(grad_grad, 1)
        column = array_ops.expand_dims(softmax, 2)
        inner = array_ops.squeeze(math_ops.matmul(row, column), axis=1)
        grad += (grad_grad - inner) * softmax

    negative_log_softmax = -nn_ops.log_softmax(logits)
    return grad, _BroadcastMul(grad_loss, negative_log_softmax)
def testMeanMultivariate(self):
    """Mixture mean equals the category-probability-weighted component means."""
    with self.test_session() as sess:
        for batch_shape in ((), (2,), (2, 3)):
            dist = make_multivariate_mixture(
                batch_shape=batch_shape,
                num_components=2,
                event_shape=(4,),
                use_static_graph=self.use_static_graph)
            mean = dist.mean()
            expected_shape = batch_shape + (4,)
            self.assertEqual(expected_shape, mean.get_shape())

            cat_probs = nn_ops.softmax(dist.cat.logits)
            dist_means = [component.mean() for component in dist.components]

            fetched = sess.run([mean, cat_probs, dist_means])
            mean_value, cat_probs_value, dist_means_value = fetched
            self.assertEqual(expected_shape, mean_value.shape)

            cat_probs_value = _swap_first_last_axes(cat_probs_value)
            # Add a new innermost dimension for broadcasting to mvn vector
            # shape.
            cat_probs_value = [
                np.expand_dims(weight, -1) for weight in cat_probs_value
            ]

            true_mean = sum(
                weight * component_mean
                for weight, component_mean in zip(cat_probs_value,
                                                  dist_means_value))
            self.assertAllClose(true_mean, mean_value)
def attention(query):
    """Put attention masks on hidden using hidden_features and query.

    Args:
        query: a tensor, or a nested structure of tensors that is flattened
            and concatenated along axis 1 before use.

    Returns:
        A list with one attention read per head, each reshaped to
        `[-1, attn_size]`.
    """
    if nest.is_sequence(query):
        # If the query is a tuple, flatten it.
        flat_queries = nest.flatten(query)
        for part in flat_queries:
            # Check that ndims == 2 if specified.
            part_ndims = part.get_shape().ndims
            assert not part_ndims or part_ndims == 2
        query = array_ops.concat(1, flat_queries)

    reads = []  # Results of attention reads will be stored here.
    for head in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % head):
            projected = linear(query, attention_vec_size, True)
            projected = array_ops.reshape(
                projected, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            scores = math_ops.reduce_sum(
                v[head] * math_ops.tanh(hidden_features[head] + projected),
                [2, 3])
            # multiply with source mask, then do softmax
            if src_mask is not None:
                scores = scores * src_mask
            weights = nn_ops.softmax(scores)
            # Now calculate the attention-weighted vector.
            weights_4d = array_ops.reshape(weights, [-1, attn_length, 1, 1])
            context = math_ops.reduce_sum(weights_4d * hidden, [1, 2])
            reads.append(array_ops.reshape(context, [-1, attn_size]))
    return reads
def _forward(self, x):
    """Append one zero along the last dimension of `x` and apply softmax.

    When `self._static_event_ndims == 0` a trailing dimension is added first.
    """
    # Pad the last dim with a zeros vector. We need this because it lets us
    # infer the scale in the inverse function.
    if self._static_event_ndims == 0:
        y = array_ops.expand_dims(x, dim=-1)
    else:
        y = x

    static_ndims = y.get_shape().ndims
    if static_ndims is not None:
        ndims = static_ndims
    else:
        ndims = array_ops.rank(y)

    untouched_dims = array_ops.zeros((ndims - 1, 2), dtype=dtypes.int32)
    paddings = array_ops.concat((untouched_dims, [[0, 1]]), 0)
    y = array_ops.pad(y, paddings=paddings)

    # Set shape hints.
    if x.get_shape().ndims is not None:
        hint = x.get_shape().as_list()
        if self._static_event_ndims == 0:
            hint.append(2)
        elif hint[-1] is not None:
            hint[-1] = hint[-1] + 1
        hint = tensor_shape.TensorShape(hint)
        y.get_shape().assert_is_compatible_with(hint)
        y.set_shape(hint)

    # Since we only support event_ndims in [0, 1] and we do padding, we always
    # reduce over the last dimension, i.e., dim=-1 (which is the default).
    return nn_ops.softmax(y)
def testProbScalarMultivariate(self):
    """Mixture prob equals the category-weighted sum of component probs."""
    with self.test_session() as sess:
        dist = make_multivariate_mixture(
            batch_shape=[],
            num_components=2,
            event_shape=[3],
            use_static_graph=self.use_static_graph)
        samples = [
            np.array(
                [[-1.0, 0.0, 1.0], [0.5, 1.0, -0.3]], dtype=np.float32),
            np.array([-1.0, 0.0, 1.0], dtype=np.float32),
            np.random.randn(2, 2, 3).astype(np.float32),
        ]
        for x in samples:
            p_x = dist.prob(x)
            expected_shape = x.shape[:-1]
            self.assertEqual(expected_shape, p_x.get_shape())

            cat_probs = nn_ops.softmax([dist.cat.logits])[0]
            dist_probs = [component.prob(x) for component in dist.components]

            fetched = sess.run([p_x, cat_probs, dist_probs])
            p_x_value, cat_probs_value, dist_probs_value = fetched
            self.assertEqual(expected_shape, p_x_value.shape)

            weighted = [
                weight * component_prob
                for weight, component_prob in zip(cat_probs_value,
                                                  dist_probs_value)
            ]
            total_prob = sum(weighted)
            self.assertAllClose(total_prob, p_x_value)
def testProbBatchMultivariate(self):
    """Batched mixture prob equals the category-weighted component probs."""
    with self.test_session() as sess:
        dist = make_multivariate_mixture(
            batch_shape=[2, 3],
            num_components=2,
            event_shape=[4],
            use_static_graph=self.use_static_graph)
        samples = [
            np.random.randn(2, 3, 4).astype(np.float32),
            np.random.randn(4, 2, 3, 4).astype(np.float32),
        ]
        for x in samples:
            p_x = dist.prob(x)
            expected_shape = x.shape[:-1]
            self.assertEqual(expected_shape, p_x.get_shape())

            cat_probs = nn_ops.softmax(dist.cat.logits)
            dist_probs = [component.prob(x) for component in dist.components]

            fetched = sess.run([p_x, cat_probs, dist_probs])
            p_x_value, cat_probs_value, dist_probs_value = fetched
            self.assertEqual(expected_shape, p_x_value.shape)

            cat_probs_value = _swap_first_last_axes(cat_probs_value)
            weighted = [
                weight * component_prob
                for weight, component_prob in zip(cat_probs_value,
                                                  dist_probs_value)
            ]
            total_prob = sum(weighted)
            self.assertAllClose(total_prob, p_x_value)
def testSmallNetwork(self):
    """Builds a small conv + fully-connected net and checks its cost report.

    The graph contains a conv2d layer, a dense softmax layer, a cross-entropy
    loss and an Adam minimize step; the generated report must mention each
    of the expected op types.
    """
    image = array_ops.placeholder(dtypes.float32, shape=[1, 28, 28, 1])
    label = array_ops.placeholder(dtypes.float32, shape=[1, 10])
    w = variables.Variable(
        random_ops.truncated_normal([5, 5, 1, 32], stddev=0.1))
    b = variables.Variable(random_ops.truncated_normal([32], stddev=0.1))
    conv = nn_ops.conv2d(image, w, strides=[1, 1, 1, 1], padding="SAME")
    h_conv = nn_ops.relu(conv + b)
    h_conv_flat = array_ops.reshape(h_conv, [1, -1])

    # 25088 = 28 * 28 * 32, the flattened size of the SAME-padded conv output.
    w_fc = variables.Variable(
        random_ops.truncated_normal([25088, 10], stddev=0.1))
    b_fc = variables.Variable(random_ops.truncated_normal([10], stddev=0.1))
    y_conv = nn_ops.softmax(math_ops.matmul(h_conv_flat, w_fc) + b_fc)

    cross_entropy = math_ops.reduce_mean(-math_ops.reduce_sum(
        label * math_ops.log(y_conv), reduction_indices=[1]))
    _ = adam.AdamOptimizer(1e-4).minimize(cross_entropy)

    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
    report = cost_analyzer.GenerateCostReport(mg)

    # assertIn reports the missing member and the container on failure,
    # unlike assertTrue(x in y), which only reports "False is not true".
    self.assertIn(b"MatMul", report)
    self.assertIn(b"ApplyAdam", report)
    self.assertIn(b"Conv2D", report)
    self.assertIn(b"Conv2DBackpropInput", report)
    self.assertIn(b"Conv2DBackpropFilter", report)
    self.assertIn(b"Softmax", report)

    # Also print the report to make it easier to debug
    print("{}".format(report))
def attention(query, use_attention=False):
    """Put attention masks on hidden using hidden_features and query.

    Args:
        query: tensor passed through a linear layer for every head.
        use_attention: when exactly `False`, the softmax weights are replaced
            by `1 / weights` built from `sequence_length` (mean pooling);
            otherwise the weights are a softmax of the scores.

    Returns:
        A pair `(attn_weights, ds)`: the per-head weights and the per-head
        attention reads, each read reshaped to `[-1, attn_size]`.
    """
    attn_weights = []
    reads = []  # Results of attention reads will be stored here.
    for head in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % head):
            projected = rnn_cell._linear(query, attention_vec_size, True)
            projected = array_ops.reshape(
                projected, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            scores = math_ops.reduce_sum(
                v[head] * math_ops.tanh(hidden_features[head] + projected),
                [2, 3])
            if use_attention is False:
                # apply mean pooling
                # NOTE(review): assumes the tiled `sequence_length` lines up
                # with the layout of `scores` after the reshape -- confirm.
                lengths = tf.tile(sequence_length, tf.stack([attn_length]))
                lengths = array_ops.reshape(lengths, tf.shape(scores))
                uniform = array_ops.ones(tf.shape(scores), dtype=dtype)
                a = uniform / math_ops.to_float(lengths)
            else:
                a = nn_ops.softmax(scores)
            attn_weights.append(a)
            # Now calculate the attention-weighted vector.
            weights_4d = array_ops.reshape(a, [-1, attn_length, 1, 1])
            context = math_ops.reduce_sum(weights_4d * hidden, [1, 2])
            reads.append(array_ops.reshape(context, [-1, attn_size]))
    return attn_weights, reads
def _forward(self, x):
    """Append a zeros slice along the last dimension of `x`, then softmax.

    When `self._static_event_ndims == 0` a trailing dimension is added first
    and the appended zeros have the same shape as the expanded input.
    """
    # Pad the event_ndims with a zeros vector. We need this because it lets
    # us infer the scale in the inverse function.
    if self._static_event_ndims == 0:
        y = array_ops.expand_dims(x, dim=-1)
        zeros = array_ops.zeros_like(y)
    else:
        y = x
        zeros_shape = array_ops.concat(0, (array_ops.shape(x)[:-1], [1]))
        zeros = array_ops.zeros(zeros_shape, dtype=y.dtype)
    last_axis = array_ops.rank(y) - 1
    y = array_ops.concat(last_axis, (y, zeros))

    # Set shape hints.
    if x.get_shape().ndims is not None:
        hint = x.get_shape().as_list()
        if self._static_event_ndims == 0:
            hint.append(2)
        elif hint[-1] is not None:
            hint[-1] = hint[-1] + 1
        hint = tensor_shape.TensorShape(hint)
        y.get_shape().assert_is_compatible_with(hint)
        y.set_shape(hint)

    # Since we only support event_ndims in [0, 1] and we do padding, we always
    # reduce over the last dimension, i.e., dim=-1 (which is the default).
    return nn_ops.softmax(y)
def sequence_softmax(inputs, noutput, scope=None, name=None, linear_name=None):
    """Run a softmax layer over all the time steps of an input sequence.

    One weight matrix and one bias vector are created once and shared by
    every time step.

    Args:
        inputs: (length, batch_size, depth) tensor
        noutput: output depth
        scope: optional scope name
        name: optional name for output tensor
        linear_name: name for linear (pre-softmax) output

    Returns:
        A tensor of size (length, batch_size, noutput).
    """
    length, _, ninputs = _shape(inputs)
    steps_in = array_ops.unstack(inputs)
    steps_out = []
    with variable_scope.variable_scope(scope, "SequenceSoftmax", [inputs]):
        initial_w = random_ops.truncated_normal(
            [0 + ninputs, noutput], stddev=0.1)
        initial_b = constant_op.constant(0.1, shape=[noutput])
        w = variables.model_variable("weights", initializer=initial_w)
        b = variables.model_variable("biases", initializer=initial_b)
        for step in xrange(length):
            step_input = steps_in[step]
            with variable_scope.variable_scope(scope, "SequenceSoftmaxStep",
                                               [step_input]):
                # TODO(tmb) consider using slim.fully_connected(...,
                # activation_fn=tf.nn.softmax)
                linear = nn_ops.xw_plus_b(step_input, w, b, name=linear_name)
                steps_out.append(nn_ops.softmax(linear))
        outputs = array_ops.stack(steps_out, name=name)
    return outputs
def testEntropyGradient(self):
    """Categorical entropy and its gradient match the closed-form values."""
    with self.cached_session() as sess:
        logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])
        probs = nn_ops.softmax(logits)
        log_probs = nn_ops.log_softmax(logits)
        weighted_log_probs = probs * log_probs
        true_entropy = -math_ops.reduce_sum(weighted_log_probs, axis=-1)

        dist = categorical.Categorical(probs=probs)
        categorical_entropy = dist.entropy()

        # works
        true_entropy_g = gradients_impl.gradients(true_entropy, [logits])
        categorical_entropy_g = gradients_impl.gradients(
            categorical_entropy, [logits])

        fetches = {
            "true_entropy": true_entropy,
            "categorical_entropy": categorical_entropy,
            "true_entropy_g": true_entropy_g,
            "categorical_entropy_g": categorical_entropy_g,
        }
        res = sess.run(fetches)
    self.assertAllClose(res["true_entropy"], res["categorical_entropy"])
    self.assertAllClose(res["true_entropy_g"], res["categorical_entropy_g"])
def _entropy(self):
    """Entropy as the cross-entropy of the logits with their own softmax.

    The logits are flattened to `[-1, num_classes]` for the computation and
    the result is reshaped back to the batch shape.
    """
    logits = self.logits
    flat_shape = array_ops.pack([-1, self.num_classes])
    logits_2d = array_ops.reshape(logits, flat_shape)
    histogram_2d = nn_ops.softmax(logits_2d)
    cross_entropy = nn_ops.softmax_cross_entropy_with_logits(
        logits_2d, histogram_2d)
    entropy = array_ops.reshape(cross_entropy, self.batch_shape())
    entropy.set_shape(self.get_batch_shape())
    return entropy
def testShapeInference(self):
    """Softmax of a `[3, 2, 4]` nested list reports static shape `[3, 2, 4]`."""
    logits = [
        [[1.0, 1.0, 1.0, 1.0], [1.0, 2.0, 3.0, 4.0]],
        [[2.0, 3.0, 4.0, 5.0], [6.0, 7.0, 8.0, 9.0]],
        [[5.0, 4.0, 3.0, 2.0], [1.0, 2.0, 3.0, 4.0]],
    ]
    softmax_op = nn_ops.softmax(logits)
    inferred_shape = softmax_op.get_shape()
    self.assertEqual([3, 2, 4], inferred_shape)
def body(i, prev_c, prev_h, actions, log_probs):
    """Run one decoding step and return the updated step state.

    Args:
        i: current step index; at step 0 the tiled `device_go_embedding` is
            used as input, otherwise the embedding of the action stored at
            step `i - 1`.
        prev_c: previous `c` state handed to `lstm`.
        prev_h: previous `h` state handed to `lstm`.
        actions: object with `read(index)` / `write(index, value)` that holds
            the action selected at each step.
        log_probs: running total to which this step's sparse softmax
            cross-entropy is added.

    Returns:
        The tuple `(i + 1, next_c, next_h, actions, log_probs)`.

    Raises:
        NotImplementedError: if `mode` is not "sample", "greedy" or "target".
    """
    # pylint: disable=g-long-lambda
    signal = control_flow_ops.cond(
        math_ops.equal(i, 0),
        lambda: array_ops.tile(device_go_embedding,
                               [self.hparams.num_children, 1]),
        lambda: embedding_ops.embedding_lookup(device_embeddings,
                                               actions.read(i - 1))
    )
    # Dropout is applied only when a keep probability is configured.
    if self.hparams.keep_prob is not None:
        signal = nn_ops.dropout(signal, self.hparams.keep_prob)
    next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)

    # Attention: score every group against the new hidden state, normalise
    # the scores with a softmax over the groups, then take the weighted sum
    # of `attn_mem`.
    query = math_ops.matmul(next_h, attn_w_2)
    query = array_ops.reshape(
        query, [self.hparams.num_children, 1, self.hparams.hidden_size])
    query = math_ops.tanh(query + attn_mem)
    query = array_ops.reshape(query, [
        self.hparams.num_children * self.num_groups, self.hparams.hidden_size
    ])
    query = math_ops.matmul(query, attn_v)
    query = array_ops.reshape(query,
                              [self.hparams.num_children, self.num_groups])
    query = nn_ops.softmax(query)
    query = array_ops.reshape(query,
                              [self.hparams.num_children, self.num_groups, 1])
    query = math_ops.reduce_sum(attn_mem * query, axis=1)
    # The attention read is concatenated with the hidden state before the
    # output projection.
    query = array_ops.concat([next_h, query], axis=1)
    logits = math_ops.matmul(query, device_softmax)
    logits /= self.hparams.temperature
    # Optional squashing of the logits into [-tanh_constant, tanh_constant].
    if self.hparams.tanh_constant > 0:
        logits = math_ops.tanh(logits) * self.hparams.tanh_constant
    if self.hparams.logits_std_noise > 0:
        # Noise stddev is scaled by the root-mean-square of the logits
        # (norm divided by sqrt of the element count).
        num_in_logits = math_ops.cast(
            array_ops.size(logits), dtype=dtypes.float32)
        avg_norm = math_ops.divide(
            linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
        logits_noise = random_ops.random_normal(
            array_ops.shape(logits),
            stddev=self.hparams.logits_std_noise * avg_norm)
        # Noise is added only while global_step <= stop_noise_step.
        logits = control_flow_ops.cond(
            self.global_step > self.hparams.stop_noise_step,
            lambda: logits, lambda: logits + logits_noise)

    # Pick the action for this step according to `mode`.
    if mode == "sample":
        next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
    elif mode == "greedy":
        next_y = math_ops.argmax(logits, 1)
    elif mode == "target":
        # Column `i` of `y` supplies the action.
        next_y = array_ops.slice(y, [0, i], [-1, 1])
    else:
        raise NotImplementedError
    next_y = math_ops.to_int32(next_y)
    next_y = array_ops.reshape(next_y, [self.hparams.num_children])
    actions = actions.write(i, next_y)
    log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=next_y)
    return i + 1, next_c, next_h, actions, log_probs
def testGradient(self, x_shape):
    """Gradient-checks softmax on a random float64 input of shape `x_shape`."""
    inputs_np = np.random.randn(*x_shape).astype(np.float64)
    with self.cached_session():
        inputs_tf = constant_op.constant(inputs_np)
        outputs_tf = nn_ops.softmax(inputs_tf)
        gradient_error = gradient_checker.compute_gradient_error(
            inputs_tf, x_shape, outputs_tf, x_shape)
    tolerance = 2e-8
    self.assertLess(gradient_error, tolerance)
def _entropy(self):
    """Entropy as the cross-entropy of the logits with their own softmax.

    Logits that are not already rank 2 are flattened to
    `[-1, num_classes]`; the result is reshaped back to the batch shape.
    """
    if self.logits.get_shape().ndims != 2:
        logits_2d = array_ops.reshape(self.logits, [-1, self.num_classes])
    else:
        logits_2d = self.logits
    histogram_2d = nn_ops.softmax(logits_2d)
    cross_entropy = nn_ops.softmax_cross_entropy_with_logits(
        logits_2d, histogram_2d)
    entropy = array_ops.reshape(cross_entropy, self.batch_shape())
    entropy.set_shape(self.get_batch_shape())
    return entropy
def testSoftmax(self):
    """Compares softmax on a random `[5, 10]` input with `self._softmax`."""
    shape = [5, 10]
    inputs_np = np.random.randn(*shape).astype(np.float32)
    expected = self._softmax(inputs_np)
    with self.test_session():
        inputs_tf = constant_op.constant(inputs_np)
        softmax_op = nn_ops.softmax(inputs_tf)
        actual = softmax_op.eval()
    tolerance = 1e-3
    self.assertAllClose(actual, expected, tolerance)
def entropy(self, name="sample"):
    """Entropy as the cross-entropy of the logits with their own softmax.

    Args:
        name: name of the op scope opened inside the distribution's own
            name scope.

    Returns:
        The entropy reshaped to the batch shape.
    """
    with ops.name_scope(self.name), ops.op_scope([], name):
        logits = self.logits
        flat_shape = array_ops.pack([-1, self.num_classes])
        logits_2d = array_ops.reshape(logits, flat_shape)
        histogram_2d = nn_ops.softmax(logits_2d)
        cross_entropy = nn_ops.softmax_cross_entropy_with_logits(
            logits_2d, histogram_2d)
        result = array_ops.reshape(cross_entropy, self.batch_shape())
        result.set_shape(self.get_batch_shape())
        return result
def _output_with_attention(cell_output, output_size, decoder_hidden,
                           attn_size, projection_attention_f,
                           initializer=None, output_form=OUTPUT_CONCAT):
    """Attend over `decoder_hidden` and project the result to `output_size`.

    Parameters
    ----------
    cell_output
        Cell output combined with the attention context, except when
        `output_form` is `OUTPUT_SINGLE`.
    output_size
        Size handed to the selected output-form helper.
    decoder_hidden
        Tensor attended over; its second static dimension is used as the
        number of timesteps.
    attn_size
        Width the attention context is reshaped to.
    projection_attention_f
        Callable `(decoder_hidden, attn_size) -> scores` fed to the softmax.
    initializer
        Variable initializer; must not be None.
    output_form
        One of `OUTPUT_SPLIT`, `OUTPUT_SINGLE`; anything else selects the
        concatenating form.

    Returns
    -------
    The tanh of the projected output.
    """
    assert initializer is not None

    with vs.variable_scope("AttnOutputProjection", initializer=initializer):
        with vs.variable_scope("output_attention", initializer=initializer):
            scores = projection_attention_f(decoder_hidden, attn_size)
            # beta will be (?, timesteps)
            beta = nn_ops.softmax(scores)

            timesteps = decoder_hidden.get_shape()[1].value
            beta_4d = array_ops.reshape(beta, [-1, timesteps, 1, 1])

            # Weighted sum over the timestep axis gives the context vector.
            context = math_ops.reduce_sum(beta_4d * decoder_hidden, [1, 2])

            # ds is (?, decoder_size)
            ds = tf.reshape(context, [-1, attn_size])
            _ = tf.histogram_summary('attention_context', ds)

        if output_form == OUTPUT_SPLIT:
            projected = _output_form_split(
                cell_output, ds, output_size, initializer=initializer)
        elif output_form == OUTPUT_SINGLE:
            projected = _output_form_single(
                ds, output_size, initializer=initializer)
        else:
            projected = _output_form_concat(
                cell_output, ds, output_size, initializer=initializer)

        output = tf.tanh(projected)

    return output
def _forward(self, x):
    """Pad the back of the last axis of `x` and apply softmax."""
    # Pad the last dim with a zeros vector. We need this because it lets us
    # infer the scale in the inverse function.
    padded = distribution_util.pad(x, axis=-1, back=True)

    # Set shape hints.
    if x.shape.ndims is not None:
        leading_dims = x.shape[:-1]
        hint = leading_dims.concatenate(x.shape[-1] + 1)
        padded.shape.assert_is_compatible_with(hint)
        padded.set_shape(hint)

    return nn_ops.softmax(padded)
def global_attention(decoder_hidden_state, hidden_attn, initializer,
                     window_size=10, content_function=vinyals_kaiser,
                     dtype=tf.float32):
    """Put global attention on the encoder hidden states.

    The content function scores the encoder states (`hidden_attn`) against
    the current decoder state; the softmax of those scores weights the
    encoder states, which are summed into one context vector per sample.

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Tensor representing the current hidden state of the decoder (output
        of the recurrent layers). Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Tensor representing the hidden states of the encoder (output of the
        recurrent layers). It has shape (?, timesteps, 1, decoder_size) so it
        is possible to apply a 1-D convolution to calculate the attention
        score more efficiently.
    initializer : function
        Function to use when initializing variables within the variables
        context.
    window_size : int
        Size of each side of the window to use when applying local attention.
        Not relevant to global attention. Default to 10.
    content_function : function
        Content function to score the decoder hidden states and encoder
        hidden states to extract their weights. Default to 'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of tensors. Default to tf.float32

    Returns
    -------
    ds : 2-D Tensor
        Tensor representing the context vector generated after scoring the
        encoder and decoder hidden states. Has shape (?, decoder_size), i.e.,
        one context vector per batch sample.
    """
    assert content_function is not None

    static_shape = hidden_attn.get_shape()
    attention_vec_size = static_shape[3].value
    attn_length = static_shape[1].value

    with vs.variable_scope("AttentionGlobal", initializer=initializer):
        # apply content function to score the hidden states from the encoder
        scores = content_function(hidden_attn, decoder_hidden_state)
        alpha = nn_ops.softmax(scores)
        _ = tf.histogram_summary('global_alpha_weights', alpha)

        # Now calculate the attention-weighted vector.
        alpha_4d = array_ops.reshape(alpha, [-1, attn_length, 1, 1])
        context = math_ops.reduce_sum(alpha_4d * hidden_attn, [1, 2])
        ds = array_ops.reshape(context, [-1, attention_vec_size])

    return ds
def _entropy(self):
    """Entropy as the cross-entropy of the logits with their own softmax.

    Logits that are not already rank 2 are flattened to `[-1, event_size]`;
    the result is reshaped back to the batch shape.
    """
    if self.logits.get_shape().ndims != 2:
        logits_2d = array_ops.reshape(self.logits, [-1, self.event_size])
    else:
        logits_2d = self.logits
    histogram_2d = nn_ops.softmax(logits_2d)
    cross_entropy = nn_ops.softmax_cross_entropy_with_logits(
        labels=histogram_2d, logits=logits_2d)
    entropy = array_ops.reshape(cross_entropy, self.batch_shape_tensor())
    entropy.set_shape(self.batch_shape)
    return entropy
def classifier_score(images, classifier_fn, num_batches=1):
    """Classifier score for evaluating a conditional generative model.

    This is based on the Inception Score, but for an arbitrary classifier.

    This technique is described in detail in https://arxiv.org/abs/1606.03498.
    In summary, this function calculates

    exp( E[ KL(p(y|x) || p(y)) ] )

    which captures how different the network's classification prediction is
    from the prior distribution over classes.

    Args:
        images: Images to calculate the classifier score for.
        classifier_fn: A function that takes images and produces logits based
            on a classifier.
        num_batches: Number of batches to split `images` in to in order to
            efficiently run them through the classifier network.

    Returns:
        The classifier score. A floating-point scalar of dtype float64: the
        logits are cast to float64 before the score is computed and the
        result is returned at that precision.
    """
    generated_images_list = array_ops.split(
        images, num_or_size_splits=num_batches)

    # Compute the classifier splits using the memory-efficient `map_fn`.
    logits = functional_ops.map_fn(
        fn=classifier_fn,
        elems=array_ops.stack(generated_images_list),
        parallel_iterations=1,
        back_prop=False,
        swap_memory=True,
        name='RunClassifier')
    # Merge the per-batch results back into a single [num_images, classes]
    # tensor.
    logits = array_ops.concat(array_ops.unstack(logits), 0)
    logits.shape.assert_has_rank(2)

    # Use maximum precision for best results.
    if logits.dtype != dtypes.float64:
        logits = math_ops.cast(logits, dtypes.float64)

    p = nn_ops.softmax(logits)
    q = math_ops.reduce_mean(p, axis=0)
    kl = _kl_divergence(p, logits, q)
    kl.shape.assert_has_rank(1)
    log_score = math_ops.reduce_mean(kl)

    # The score is derived from float64 logits, so it is already float64.
    # A further cast to float64 would be a no-op and is therefore omitted.
    return math_ops.exp(log_score)
def testLargeDims(self):
    """Softmax on 129x129 and 256x256 inputs matches `self._npSoftmax`."""
    # Make sure that we properly handle large inputs. See
    # https://github.com/tensorflow/tensorflow/issues/4425 for details
    for size in [129, 256]:
        features = np.random.rand(size, size).astype(np.float32)
        expected = self._npSoftmax(features)

        for use_gpu in [True, False]:
            with self.test_session(use_gpu=use_gpu) as sess:
                logits = array_ops.placeholder(dtypes.float32)
                softmax_op = nn_ops.softmax(logits)
                actual = sess.run(softmax_op, feed_dict={logits: features})
            self.assertAllClose(actual, expected)
def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
    """Decoder function used for inference in dynamic_rnn_decoder.

    The main difference between this decoder function and the `decoder_fn`
    in attention_decoder_fn_train is how `next_cell_input` is computed. In
    the decoder function, the next input is computed by applying an argmax
    over the feature dimension of the decoder output. This is a
    greedy-search approach. (Bahdanau et al., 2014) & (Sutskever et al.,
    2014) use beam-search.

    NOTE(review): the body below selects candidates with `top_k` over
    `beam_size` and records parents/symbols in TensorArrays, so the
    argmax/greedy description above looks stale -- confirm and update.

    Args:
        time: positive integer constant reflecting the current timestep.
        cell_state: state of RNNCell.
        cell_input: input provided by `dynamic_rnn_decoder`.
        cell_output: output of RNNCell.
        context_state: context state provided by `dynamic_rnn_decoder`.

    Returns:
        A tuple (done, next state, next input, emit output, next context
        state) where:

        done: A boolean vector to indicate which sentences have reached an
            `end_of_sequence_id`. This is used by dynamic_rnn_decoder for
            early stopping. When `time > maximum_length`, a boolean vector
            with all elements true is returned.
        next state: `cell_state`; this decoder function does not modify the
            given state.
            NOTE(review): after the first step the code re-gathers every
            state tensor by `_parents` -- confirm this description.
        next input: the embedding of the argmax of `cell_output` is used as
            the next input.
        emit output: if `output_fn is None`, the supplied `cell_output` is
            returned; otherwise it is used to update `cell_output` before
            `next_input` is computed and `cell_output` is returned.
        next context state: `context_state`; this decoder function does not
            modify the given context state. The context state can be
            modified when using, e.g., beam search.
            NOTE(review): the code writes to the TensorArrays held in the
            context state on every step after the first.

    Raises:
        ValueError: if cell_input is not None.
    """
    with ops.name_scope(
            name, "attention_decoder_fn_inference",
            [time, cell_state, cell_input, cell_output, context_state]):
        if cell_input is not None:
            raise ValueError(
                "Expected cell_input to be None, but saw: %s" % cell_input)
        if cell_output is None:
            # invariant that this is time == 0
            next_input_id = array_ops.ones([
                batch_size,
            ], dtype=dtype) * (start_of_sequence_id)
            done = array_ops.zeros([
                batch_size,
            ], dtype=dtypes.bool)
            cell_state = encoder_state
            cell_output = array_ops.zeros([num_decoder_symbols],
                                          dtype=dtypes.float32)
            cell_input = array_ops.gather(embeddings, next_input_id)

            # init attention
            attention = _init_attention(encoder_state)

            # init context state: six TensorArrays, one write per later step.
            log_beam_probs = tensor_array_ops.TensorArray(
                dtype=dtypes.float32,
                tensor_array_name="log_beam_probs",
                size=maximum_length,
                dynamic_size=True,
                infer_shape=False)
            beam_parents = tensor_array_ops.TensorArray(
                dtype=dtypes.int32,
                tensor_array_name="beam_parents",
                size=maximum_length,
                dynamic_size=True,
                infer_shape=False)
            beam_symbols = tensor_array_ops.TensorArray(
                dtype=dtypes.int32,
                tensor_array_name="beam_symbols",
                size=maximum_length,
                dynamic_size=True,
                infer_shape=False)
            result_probs = tensor_array_ops.TensorArray(
                dtype=dtypes.float32,
                tensor_array_name="result_probs",
                size=maximum_length,
                dynamic_size=True,
                infer_shape=False)
            result_parents = tensor_array_ops.TensorArray(
                dtype=dtypes.int32,
                tensor_array_name="result_parents",
                size=maximum_length,
                dynamic_size=True,
                infer_shape=False)
            result_symbols = tensor_array_ops.TensorArray(
                dtype=dtypes.int32,
                tensor_array_name="result_symbols",
                size=maximum_length,
                dynamic_size=True,
                infer_shape=False)
            context_state = (log_beam_probs, beam_parents, beam_symbols,
                             result_probs, result_parents, result_symbols)
        else:
            # construct attention
            attention = attention_construct_fn(cell_output, attention_keys,
                                               attention_values)
            cell_output = attention

            # beam search decoder
            (log_beam_probs, beam_parents, beam_symbols, result_probs,
             result_parents, result_symbols) = context_state

            cell_output = output_fn(cell_output)  # logits
            cell_output = nn_ops.softmax(cell_output)

            # Keep only the columns from index 2 onwards; the first two
            # symbol columns are dropped from the candidates.
            cell_output = array_ops.split(
                cell_output, [2, num_decoder_symbols - 2], 1)[1]

            # Rows at multiples of beam_size, i.e. one row per original
            # batch entry; used only at time == 1.
            tmp_output = array_ops.gather(
                cell_output, math_ops.range(origin_batch) * beam_size)

            # At time == 1 the score is the log-probability alone; at later
            # steps the log-probability written at the previous step
            # (index time - 2) is added. 1e-20 guards against log(0).
            probs = control_flow_ops.cond(
                math_ops.equal(time, ops.convert_to_tensor(1, dtype)),
                lambda: math_ops.log(tmp_output + ops.convert_to_tensor(
                    1e-20, dtypes.float32)),
                lambda: math_ops.log(cell_output + ops.convert_to_tensor(
                    1e-20, dtypes.float32)) + array_ops.reshape(
                        log_beam_probs.read(time - 2), [-1, 1]))

            # One row of candidate scores per original batch entry.
            probs = array_ops.reshape(probs, [origin_batch, -1])
            best_probs, indices = nn_ops.top_k(probs, beam_size * 2)

            # Flatten the per-row indices and add a per-row offset of
            # row * (num_decoder_symbols - 2) * beam_size.
            indices_flatten = array_ops.reshape(indices, [
                -1
            ]) + array_ops.reshape(
                array_ops.concat([
                    array_ops.reshape(
                        math_ops.range(origin_batch) *
                        ((num_decoder_symbols - 2) * beam_size), [-1, 1])
                ] * (beam_size * 2), 1), [origin_batch * beam_size * 2])
            best_probs_flatten = array_ops.reshape(best_probs, [-1])

            # Decompose each flat index into a symbol (shifted back by the 2
            # dropped columns) and a parent index.
            symbols = indices_flatten % (num_decoder_symbols - 2)
            symbols = symbols + 2
            parents = indices_flatten // (num_decoder_symbols - 2)

            # Add 1e5 to every candidate whose symbol differs from
            # end_of_sequence_id (the bool cast is True for a non-zero
            # difference), so the second top_k prefers non-EOS candidates.
            probs_wo_eos = best_probs + 1e5 * math_ops.cast(
                math_ops.cast(
                    (indices % (num_decoder_symbols - 2) + 2) -
                    end_of_sequence_id, dtypes.bool), dtypes.float32)
            best_probs_wo_eos, indices_wo_eos = nn_ops.top_k(
                probs_wo_eos, beam_size)

            # Offset by row * (beam_size * 2) to index the flattened
            # candidate tensors.
            indices_wo_eos = array_ops.reshape(
                indices_wo_eos, [-1]) + array_ops.reshape(
                    array_ops.concat([
                        array_ops.reshape(
                            math_ops.range(origin_batch) * (beam_size * 2),
                            [-1, 1])
                    ] * beam_size, 1), [origin_batch * beam_size])

            _probs = array_ops.gather(best_probs_flatten, indices_wo_eos)
            _symbols = array_ops.gather(symbols, indices_wo_eos)
            _parents = array_ops.gather(parents, indices_wo_eos)

            # The beam_* arrays record the selected candidates; the result_*
            # arrays record all beam_size * 2 candidates per batch entry.
            log_beam_probs = log_beam_probs.write(time - 1, _probs)
            beam_symbols = beam_symbols.write(time - 1, _symbols)
            beam_parents = beam_parents.write(time - 1, _parents)
            result_probs = result_probs.write(time - 1, best_probs_flatten)
            result_symbols = result_symbols.write(time - 1, symbols)
            result_parents = result_parents.write(time - 1, parents)

            next_input_id = array_ops.reshape(_symbols, [batch_size])

            state_size = int(cell_state[0].get_shape().with_rank(2)[1])
            attn_size = int(attention.get_shape().with_rank(2)[1])

            # Reorder every state tensor and the attention by parent index.
            state = []
            for j in cell_state:
                state.append(
                    array_ops.reshape(
                        array_ops.gather(j, _parents), [-1, state_size]))
            cell_state = tuple(state)
            attention = array_ops.reshape(
                array_ops.gather(attention, _parents), [-1, attn_size])

            # NOTE: this value is overwritten by the cond below.
            done = math_ops.equal(next_input_id, end_of_sequence_id)
            cell_input = array_ops.gather(embeddings, next_input_id)

        # combine cell_input and attention
        next_input = array_ops.concat([cell_input, attention], 1)

        # if time > maxlen, return all true vector; otherwise all false.
        done = control_flow_ops.cond(
            math_ops.greater(time, maximum_length),
            lambda: array_ops.ones([
                batch_size,
            ], dtype=dtypes.bool),
            lambda: array_ops.zeros([
                batch_size,
            ], dtype=dtypes.bool))
        return (done, cell_state, next_input, cell_output,
                (log_beam_probs, beam_parents, beam_symbols, result_probs,
                 result_parents, result_symbols))
def attention(decoder_state, temporal_e, coverage=None):
    """Calculate the context vector and attention distribution from the decoder state.

    Args:
        decoder_state: state of the decoder
        temporal_e: store previous attentions for temporal attention
            mechanism
        coverage: Optional. Previous timestep's coverage vector, shape
            (batch_size, max_enc_steps, 1, 1).

    Returns:
        context_vector: weighted sum of _enc_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, max_enc_steps, 1, 1)
        masked_e: store the attention score for temporal attention mechanism.
    """
    with variable_scope.variable_scope("Attention"):
        # Pass the decoder state through a linear layer (this is
        # W_s s_t + b_attn in the paper)
        decoder_features = linear(
            decoder_state, attention_vec_size,
            True)  # shape (batch_size, attention_vec_size)
        decoder_features = tf.expand_dims(
            tf.expand_dims(decoder_features, 1),
            1)  # reshape to (batch_size, 1, 1, attention_vec_size)

        # We can't have coverage with matrix attention
        if (not _hps.matrix_attention and use_coverage and
                coverage is not None):  # non-first step of coverage
            # Multiply coverage vector by w_c to get coverage_features.
            coverage_features = nn_ops.conv2d(
                coverage, w_c, [1, 1, 1, 1], "SAME"
            )  # c has shape (batch_size, max_enc_steps, 1, attention_vec_size)

            # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
            e_not_masked = math_ops.reduce_sum(
                v * math_ops.tanh(encoder_features + decoder_features +
                                  coverage_features),
                [2, 3])  # shape (batch_size,max_enc_steps)

            # Softmax, zero out padded positions, then re-normalize.
            masked_e = nn_ops.softmax(
                e_not_masked
            ) * enc_padding_mask  # (batch_size, max_enc_steps)
            masked_sums = tf.reduce_sum(masked_e, axis=1)  # shape (batch_size)
            masked_e = masked_e / tf.reshape(masked_sums, [-1, 1])

            if _hps.use_temporal_attention:
                # `temporal_e` may not expose a static first dimension (or
                # may not be a tensor at all); treat any failure to read it
                # as "no previous attention". Only ordinary exceptions are
                # caught so KeyboardInterrupt/SystemExit still propagate.
                try:
                    len_temporal_e = temporal_e.get_shape()[0].value
                except Exception:
                    len_temporal_e = 0
                if len_temporal_e == 0:
                    attn_dist = masked_e
                else:
                    masked_sums = tf.reduce_sum(
                        temporal_e, axis=0
                    ) + 1e-10  # if it's zero due to masking we set it to a small value
                    attn_dist = masked_e / masked_sums  # (batch_size, max_enc_steps)
            else:
                attn_dist = masked_e
            masked_attn_sums = tf.reduce_sum(attn_dist, axis=1)
            attn_dist = attn_dist / tf.reshape(masked_attn_sums,
                                               [-1, 1])  # re-normalize

            # Update coverage vector
            coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])
        else:
            if _hps.matrix_attention:
                # Calculate h_d * W_attn * h_i, equation 2 in
                # https://arxiv.org/pdf/1705.04304.pdf
                _dec_attn = tf.unstack(
                    tf.matmul(
                        tf.squeeze(decoder_features, axis=[1, 2]), w_attn),
                    axis=0)  # batch_size * (attention_vec_size)
                _enc_states_lst = tf.unstack(
                    tf.squeeze(_enc_states, axis=2), axis=0
                )  # batch_size * (max_enc_steps, attention_vec_size)

                e_not_masked = tf.squeeze(
                    tf.stack([
                        tf.matmul(
                            tf.reshape(_dec, [1, -1]), tf.transpose(_enc))
                        for _dec, _enc in zip(_dec_attn, _enc_states_lst)
                    ]),
                    axis=1)  # (batch_size, max_enc_steps)
                masked_e = tf.exp(
                    e_not_masked *
                    enc_padding_mask)  # (batch_size, max_enc_steps)
            else:
                # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                e_not_masked = math_ops.reduce_sum(
                    v * math_ops.tanh(encoder_features + decoder_features),
                    [2, 3])  # calculate e, (batch_size, max_enc_steps)
                masked_e = nn_ops.softmax(
                    e_not_masked
                ) * enc_padding_mask  # (batch_size, max_enc_steps)
                masked_sums = tf.reduce_sum(
                    masked_e, axis=1)  # shape (batch_size)
                masked_e = masked_e / tf.reshape(masked_sums, [-1, 1])

            if _hps.use_temporal_attention:
                # See the matching block above: any ordinary failure to read
                # the static length means "no previous attention".
                try:
                    len_temporal_e = temporal_e.get_shape()[0].value
                except Exception:
                    len_temporal_e = 0
                if len_temporal_e == 0:
                    attn_dist = masked_e
                else:
                    masked_sums = tf.reduce_sum(
                        temporal_e, axis=0
                    ) + 1e-10  # if it's zero due to masking we set it to a small value
                    attn_dist = masked_e / masked_sums  # (batch_size, max_enc_steps)
            else:
                attn_dist = masked_e

            # Calculate attention distribution
            masked_attn_sums = tf.reduce_sum(attn_dist, axis=1)
            attn_dist = attn_dist / tf.reshape(masked_attn_sums,
                                               [-1, 1])  # re-normalize

            if use_coverage:  # first step of training
                coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                          2)  # initialize coverage

        # Calculate the context vector from attn_dist and _enc_states
        context_vector = math_ops.reduce_sum(
            array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
            _enc_states, [1, 2])  # shape (batch_size, attn_size).
        context_vector = array_ops.reshape(context_vector, [-1, attn_size])

    return context_vector, attn_dist, coverage, masked_e
def masked_attention(e):
    """Softmax ``e``, zero the padded positions, then re-normalize each row.

    ``enc_padding_mask`` is read from the enclosing scope.

    Args:
        e: attention scores; the softmax is taken over the last axis.

    Returns:
        ``softmax(e) * enc_padding_mask`` divided row-wise by its sum over
        axis 1.
    """
    # Softmax first, then knock out the padded positions.
    masked_weights = nn_ops.softmax(e) * enc_padding_mask
    # Per-row totals reshaped into a column so the division broadcasts.
    row_totals = tf.reshape(tf.reduce_sum(masked_weights, axis=1), [-1, 1])
    return masked_weights / row_totals
def attention(query):
    """Put attention masks on hidden using hidden_features and query.

    For every head this predicts an alignment position ``p_t`` from the
    query, computes softmax attention weights over all ``attn_length``
    positions, scales them by ``exp(-2 * (p_t - pos)^2 / attn_local_D^2)``
    and returns the resulting weighted sum of the attention states.

    Names taken from the enclosing scope: ``num_heads``, ``attn_size``,
    ``attn_length``, ``attention_states``, ``attn_local_D``, ``dtype`` and
    ``linear``.

    Args:
        query: a tensor, or a nested structure of tensors which is flattened
            and concatenated along axis 1.

    Returns:
        A list with one tensor per head, each reshaped to [-1, attn_size].
    """
    ds = []  # Results of attention reads will be stored here.
    if nest.is_sequence(query):  # If the query is a tuple, flatten it.
        query_list = nest.flatten(query)
        for q in query_list:  # Check that ndims == 2 if specified.
            ndims = q.get_shape().ndims
            if ndims:
                assert ndims == 2
        query = array_ops.concat(query_list, 1)
    for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a, dtype=dtype):
            attention_vec_size = attn_size  # Size of query vectors for attention.
            # Predicted position: attn_length * sigmoid(v_p . tanh(linear(query))).
            # `linear` is presumably a learned affine projection -- confirm.
            v_p = variable_scope.get_variable("AttnV_p%d" % a,
                                              [attention_vec_size])
            qiu = linear(query, attention_vec_size, True)
            qiu = array_ops.reshape(qiu, [-1, 1, 1, attention_vec_size])
            # Summing over axes 2 and 3 leaves one score per row: (batch, 1).
            tan_v = math_ops.reduce_sum(v_p * math_ops.tanh(qiu), [2, 3])
            pt_sig = math_ops.sigmoid(tan_v)
            p = attn_length * pt_sig
            # Round-trip through int32 to truncate p to a whole position.
            p_t = math_ops.cast(p, dtype=dtypes.int32)
            p_t = math_ops.cast(p_t, dtype=dtypes.float32)
            # To calculate W1 * hi we use a 1-by-1 convolution, need to reshape before.
            hidden = array_ops.reshape(attention_states,
                                       [-1, attn_length, 1, attn_size])
            k = variable_scope.get_variable(
                "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable("AttnV_%d" % a,
                                            [attention_vec_size])
        # NOTE(review): the flattened source does not show whether this scope
        # was nested inside "Attention_%d" or a sibling of it; it is laid out
        # as a sibling here. Only variable/op names depend on the choice --
        # confirm against existing checkpoints.
        with variable_scope.variable_scope("Attention_l_%d" % a, dtype=dtype):
            # w2 * ht: second projection of the query, in its own scope so
            # its variables do not clash with the first `linear` call.
            y = linear(query, attention_vec_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v * math_ops.tanh(hidden_features + y), [2, 3])
            ai = nn_ops.softmax(s)
            ai = tf.reshape(ai, [-1, attn_length, 1])
            # Broadcast p_t from (batch, 1) to one copy per position so it
            # can be compared against every position index.
            extent = tf.ones([1, attn_length], dtype=dtypes.float32)
            p_t = p_t * extent
            p_t = tf.reshape(p_t, [-1, attn_length, 1])
            # Position indices 0 .. attn_length - 1 as a float column.
            pos = [i for i in xrange(attn_length)]
            pos = tf.reshape(pos, [attn_length, 1])
            pos = math_ops.cast(pos, dtype=dtypes.float32)
            # Gaussian-shaped factor exp(-2 * (p_t - pos)^2 / D^2).
            value = math_ops.square(p_t - pos) * 2 / (attn_local_D *
                                                      attn_local_D)
            pre = math_ops.exp(math_ops.negative(value))
            # The product is not re-normalized after this scaling.
            ai = ai * pre
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(ai, [-1, attn_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [-1, attn_size]))
    return ds
def attention(query):
    """Put attention masks on hidden using hidden_features and query.

    Windowed variant: for every head a position ``p_t`` is predicted from
    the query, a slice of the attention states around ``p_t`` is cut out
    per batch row (zero-padded at the sequence edges), and softmax attention
    scaled by ``exp(-2 * (p_t - pos)^2 / D^2)`` is applied to that slice only.

    Names taken from the enclosing scope: ``num_heads``, ``attn_size``,
    ``attn_length``, ``attn_local_D``, ``attn_fixed_length``, ``batch_size``,
    ``attention_states``, ``dtype`` and ``linear``.

    Args:
        query: a tensor, or a nested structure of tensors which is flattened
            and concatenated along axis 1.

    Returns:
        A list with one tensor per head, each reshaped to
        [batch_size, attn_size].
    """
    ds = []  # Results of attention reads will be stored here.
    if nest.is_sequence(query):  # If the query is a tuple, flatten it.
        query_list = nest.flatten(query)
        for q in query_list:  # Check that ndims == 2 if specified.
            ndims = q.get_shape().ndims
            if ndims:
                assert ndims == 2
        query = array_ops.concat(query_list, 1)
    for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a, dtype=dtype):
            attention_vec_size = attn_size  # Size of query vectors for attention.
            # Predicted position: attn_length * sigmoid(v_p . tanh(linear(query))).
            # `linear` is presumably a learned affine projection -- confirm.
            v_p = variable_scope.get_variable("AttnV_p%d" % a,
                                              [attention_vec_size])
            qiu = linear(query, attention_vec_size, True)
            qiu = array_ops.reshape(
                qiu, [batch_size, 1, 1, attention_vec_size])
            tan_v = math_ops.reduce_sum(v_p * math_ops.tanh(qiu), [2, 3])
            pt_sig = math_ops.sigmoid(tan_v)
            p = attn_length * pt_sig
            # Round-trip through int32 to truncate p to a whole position.
            p_t = math_ops.cast(p, dtype=dtypes.int32)
            p_t = math_ops.cast(p_t, dtype=dtypes.float32)
            # set a window
            p_t = array_ops.reshape(p_t, [
                batch_size,
            ])
            attention_states_windows = []
            D = attn_local_D
            # One window per batch row. The slice is [p_t - D, p_t + D + 1)
            # clipped to [0, attn_length); whatever was clipped is replaced
            # by zero rows so every window has the same length.
            for i in range(attention_states.shape[0]):
                x = tf.constant(D, dtype=dtypes.float32)
                y = math_ops.cast(p_t[i], dtype=dtypes.float32)
                z = tf.constant(attn_length, dtype=dtypes.float32)

                # Each branch returns (slice bound, number of zero rows).
                def f1():
                    # Window starts before position 0: begin at 0, pad the front.
                    return tf.constant(
                        0, dtype=dtypes.int32), math_ops.cast(
                            D - p_t[i], dtype=dtypes.int32)

                def f2():
                    # Window start lies inside the sequence: no front padding.
                    return math_ops.cast(
                        p_t[i] - D, dtype=dtypes.int32), tf.constant(
                            0, dtype=dtypes.int32)

                def f3():
                    # Window runs past the end: stop at attn_length, pad the back.
                    return tf.constant(
                        attn_length, dtype=dtypes.int32), math_ops.cast(
                            p_t[i] + D + 1 - attn_length, dtype=dtypes.int32)

                def f4():
                    # Window end lies inside the sequence: no back padding.
                    return math_ops.cast(
                        p_t[i] + D + 1, dtype=dtypes.int32), tf.constant(
                            0, dtype=dtypes.int32)

                begin, pre_num = tf.cond(tf.less(x, y), f2, f1)
                end, last_num = tf.cond(tf.less(y + D + 1, z), f4, f3)
                # NOTE(review): this `d` is not read before `d` is reassigned
                # further down.
                d = tf.constant(attn_fixed_length, dtype=dtypes.int32)
                # Zero rows for the clipped front / back of the window; their
                # width matches the states because attention_vec_size is
                # attn_size here.
                pre_tmp = tf.zeros([pre_num, attention_vec_size],
                                   dtype=dtypes.float32)
                last_tmp = tf.zeros([last_num, attention_vec_size],
                                    dtype=dtypes.float32)
                attention_states_window = math_ops.cast(
                    attention_states[i][begin:end], dtype=dtypes.float32)
                attention_states_window = tf.concat(
                    [pre_tmp, attention_states_window], 0)
                attention_states_window = tf.concat(
                    [attention_states_window, last_tmp], 0)
                attention_states_window = tf.expand_dims(
                    attention_states_window, 0)
                attention_states_windows.append(
                    attention_states_window)
            attention_states_windows = tf.concat(
                attention_states_windows, 0)
            # presumably attn_fixed_length == 2 * D + 1, the padded window
            # length built above -- TODO confirm.
            attention_states_windows = array_ops.reshape(
                attention_states_windows,
                [batch_size, attn_fixed_length, attention_vec_size])
            # To calculate W1 * hi we use a 1-by-1 convolution, need to reshape before.
            hidden = array_ops.reshape(
                attention_states_windows,
                [batch_size, attn_fixed_length, 1, attn_size])
            k = variable_scope.get_variable(
                "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable("AttnV_%d" % a,
                                            [attention_vec_size])
        # NOTE(review): the flattened source does not show whether this scope
        # was nested inside "Attention_%d" or a sibling of it; it is laid out
        # as a sibling here. Only variable/op names depend on the choice --
        # confirm against existing checkpoints.
        with variable_scope.variable_scope("Attention_l_%d" % a, dtype=dtype):
            # w2 * ht: second projection of the query, in its own scope so
            # its variables do not clash with the first `linear` call.
            y = linear(query, attention_vec_size, True)
            y = array_ops.reshape(
                y, [batch_size, 1, 1, attention_vec_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(
                v * math_ops.tanh(hidden_features + y), [2, 3])
            ai = nn_ops.softmax(s)
            ai = tf.reshape(ai, [batch_size, attn_fixed_length, 1])
            # Broadcast p_t to one copy per window slot.
            p_t = array_ops.reshape(p_t, [batch_size, 1])
            extent = tf.ones([1, attn_fixed_length], dtype=dtypes.float32)
            p_t = p_t * extent
            p_t = tf.reshape(p_t, [batch_size, attn_fixed_length, 1])
            # NOTE(review): `pos` counts window slots 0 .. attn_fixed_length-1
            # while `p_t` was computed as a position in the full sequence and
            # the window was cut around it -- confirm whether the Gaussian is
            # meant to use window-relative offsets instead.
            pos = [i for i in xrange(attn_fixed_length)]
            pos = tf.reshape(pos, [attn_fixed_length, 1])
            pos = math_ops.cast(pos, dtype=dtypes.float32)
            # Gaussian-shaped factor exp(-2 * (p_t - pos)^2 / D^2).
            value = math_ops.square(p_t - pos) * 2 / (D * D)
            pre = math_ops.exp(math_ops.negative(value))
            # The product is not re-normalized after this scaling.
            ai = ai * pre
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(
                    ai, [batch_size, attn_fixed_length, 1, 1]) * hidden,
                [1, 2])
            ds.append(array_ops.reshape(d, [batch_size, attn_size]))
    return ds
def masked_attention(e, enc_padding_mask):
    """Masked softmax over ``e`` with a small additive floor.

    Args:
        e: attention scores; the softmax is taken over the last axis.
        enc_padding_mask: multiplied element-wise into the softmax output to
            zero the padded positions.

    Returns:
        ``softmax(e) * enc_padding_mask + 1e-10`` divided row-wise by its sum
        over axis 1. The 1e-10 floor keeps a fully masked row from dividing
        by zero.
    """
    floored = nn_ops.softmax(e) * enc_padding_mask + 1e-10
    # Column of per-row totals so the division broadcasts across each row.
    row_totals = tf.reshape(tf.reduce_sum(floored, axis=1), [-1, 1])
    return floored / row_totals
def inference_graph(self, data, data_spec=None):
    """Return the inference op for a batch: softmax of the base inference.

    Args:
        data: batch handed unchanged to ``self._base_inference``.
        data_spec: optional spec forwarded to ``self._base_inference``.

    Returns:
        ``nn_ops.softmax`` applied to the base inference output.
    """
    base_output = self._base_inference(data, data_spec=data_spec)
    return nn_ops.softmax(base_output)
def f(x):
    """Return the gradient of ``softmax(x)`` with respect to ``x``.

    ``x`` must be float32; this is asserted up front.
    """
    assert x.dtype == dtypes.float32
    with backprop.GradientTape() as grad_tape:
        # Watch x explicitly so the tape records ops on it.
        grad_tape.watch(x)
        probs = nn_ops.softmax(x)
    return grad_tape.gradient(probs, x)
def testDimTooLarge(self):
    """Softmax over an out-of-range ``dim`` raises InvalidArgumentError."""
    logits = [1., 2., 3., 4.]
    with self.test_session():
        with self.assertRaises(errors_impl.InvalidArgumentError):
            nn_ops.softmax(logits, dim=100).eval()
def testInvalidAxis(self):
    """Softmax with ``axis=2`` on a rank-2 tensor raises InvalidArgumentError.

    Test case for GitHub issue 22793.
    """
    with self.cached_session():
        rank2_logits = array_ops.ones(shape=[2, 3])
        with self.assertRaises(errors_impl.InvalidArgumentError):
            nn_ops.softmax(rank2_logits, axis=2).eval()
def head_pieces(head, mem_size, num_shifts=3, axis=1):
    """Split raw controller output into activated read/write head parameters.

    The controller output is cut into a read part and a write part, each part
    is cut into its pieces, and the activation belonging to each piece is
    applied: softmax for the shift vector, softplus + 1 capped at 21 for the
    sharpening term (gamma), softplus for beta, sigmoid for the interpolation
    gate and for the add and erase vectors. The key is passed through
    untouched.

    Arguments:
      head - Tensor of the raw output of the controller network.
      mem_size - Tuple of integers stating the size of the memory (NxM).
          Only M, the width of one slot, is used here.
      num_shifts - Integer that is used to determine the magnitude and
          direction of possible shifts for the read and write heads.
      axis - The axis of 'head' where splitting should occur. This is used
          for instances when 'head' is a rank 3 or rank 2 tensor. The
          default value is 1.

    Returns:
      (write_head, read_head) where
        write_head = (key, shift, gamma, beta, g, add, erase)
        read_head  = (key, shift, gamma, beta, g)
    """
    # The slot count is not needed for splitting; only the slot width is.
    _, num_bits = mem_size
    # All-zero shift bias. A bias of 2.5 on the centre shift is switched off
    # for regression testing against the NumPy implementation.
    shift_bias = np.zeros(num_shifts)

    def _activate_shared(key, shift, gamma, beta, gate):
        """Apply the activations common to read and write heads."""
        shift = nn_ops.softmax(shift + shift_bias)
        gamma = gen_math_ops.minimum(nn_ops.softplus(gamma) + 1, 21.)
        beta = nn_ops.softplus(beta)
        gate = math_ops.sigmoid(gate)
        return key, shift, gamma, beta, gate

    # Number of elements in the read/write heads, respectively.
    read_width = num_bits + num_shifts + 3
    write_width = 3 * num_bits + num_shifts + 3
    read_head_raw, write_head_raw = array_ops.split(
        head, [read_width, write_width], axis=axis)

    # The write head carries the read-head pieces plus add and erase vectors.
    write_pieces = array_ops.split(
        write_head_raw,
        [num_bits, num_shifts, 1, 1, 1, num_bits, num_bits],
        axis=axis)
    read_pieces = array_ops.split(
        read_head_raw, [num_bits, num_shifts, 1, 1, 1], axis=axis)

    key_w, shift_w, gamma_w, beta_w, g_w, add_w, erase_w = write_pieces
    shared_w = _activate_shared(key_w, shift_w, gamma_w, beta_w, g_w)
    add_w = math_ops.sigmoid(add_w)
    erase_w = math_ops.sigmoid(erase_w)
    write_head = shared_w + (add_w, erase_w)

    key_r, shift_r, gamma_r, beta_r, g_r = read_pieces
    read_head = _activate_shared(key_r, shift_r, gamma_r, beta_r, g_r)
    return write_head, read_head
def _attention(output_sequence, output_target, mode):
    """
    Get context vector based on attention and weighted output target by
    context vector derived by attention mechanism.

    :param output_sequence: tensor shaped (batch, attention window, hidden unit num);
        the window and hidden sizes must be statically known.
    :param output_target: tensor shaped (batch, hidden unit num)
    :param mode: None or `k` for basic attention, `kv` for key-value
        attention, `kvp` for key-value-predict attention
    :return: tuple ``(output, r, n_hidden)``: the new output, the context
        vector derived by the attention mechanism, and the per-part hidden
        size (hidden, hidden/2 for `kv`, hidden/3 for `kvp`)
    :raises ValueError: if `mode` is unknown, or the hidden size cannot be
        split evenly for the requested mode
    """
    n_window, n_hidden = output_sequence.shape.as_list()[1:]
    if mode is None or mode == "k":  # basic attention
        os_k = os_v = output_sequence
        ot_k = ot_p = output_target
    elif mode == "kv":  # key-value attention
        # Check divisibility before splitting: the split itself rejects an
        # uneven size, which used to hide this clearer message.
        if n_hidden % 2 != 0:
            raise ValueError("for `kv` mode, `n_hidden` should be even.")
        os_k, os_v = array_ops.split(value=output_sequence,
                                     num_or_size_splits=2,
                                     axis=2)
        ot_k, ot_v = array_ops.split(value=output_target,
                                     num_or_size_splits=2,
                                     axis=1)
        # Without a separate prediction part the value part doubles as it.
        ot_p = ot_v
        n_hidden //= 2
    elif mode == "kvp":  # key-value-prediction attention
        if n_hidden % 3 != 0:
            raise ValueError(
                "for `kvp` mode, `n_hidden` should be able to be divided by 3."
            )
        # The prediction part of the sequence and the value part of the
        # target are not used below.
        os_k, os_v, _ = array_ops.split(value=output_sequence,
                                        num_or_size_splits=3,
                                        axis=2)
        ot_k, _, ot_p = array_ops.split(value=output_target,
                                        num_or_size_splits=3,
                                        axis=1)
        n_hidden //= 3
    else:
        raise ValueError("unknown mode")

    with vs.variable_scope("context_vector"):
        scores = []  # unnormalized attention score per window step
        w_h = vs.get_variable("w_h",
                              shape=[n_hidden,
                                     n_hidden])  # weight for target
        w_y = vs.get_variable("w_y",
                              shape=[n_hidden,
                                     n_hidden])  # weight for sequence
        w = vs.get_variable("w", shape=[n_hidden, 1])  # weight for attention
        logit_h = math_ops.matmul(ot_k, w_h)  # (batch, hidden)
        for n_w in range(n_window):
            logit_y = math_ops.matmul(os_k[:, n_w, :], w_y)  # (batch, hidden)
            logit = logit_h + logit_y
            m = math_ops.tanh(logit)  # M of attention mechanism
            scores.append(math_ops.matmul(m, w))  # (batch, 1)
        # Normalize across the window. Stacking to (batch, window, 1) and
        # taking the default last-axis softmax normalized over a size-1 axis
        # and made every weight 1.0, so join the scores to (batch, window)
        # first and restore the trailing axis afterwards.
        a = nn_ops.softmax(array_ops.concat(scores, axis=1))  # (batch, window)
        a = array_ops.expand_dims(a, axis=2)  # (batch, window, 1)
        r = math_ops.reduce_sum(os_v * a,
                                axis=1)  # context vector (batch, hidden)

    with vs.variable_scope(
            "weighted_output"):  # derive attention weighted output
        w_h = vs.get_variable("w_h",
                              shape=[n_hidden, n_hidden
                                     ])  # weight for original target
        w_r = vs.get_variable("w_r",
                              shape=[n_hidden, n_hidden
                                     ])  # weight for context vector
        logit = math_ops.matmul(ot_p, w_h) + math_ops.matmul(r, w_r)
        # new output (batch, hidden or hidden/2 (kv) or hidden/3 (kvp))
        output = math_ops.tanh(logit)
    return output, r, n_hidden
def create_model(self):
    """Build the TF1 graph of the three-way multiple-choice reader.

    The graph encodes the query, the document (conditioned on the query)
    and each of the three alternatives with stacked bidirectional RNNs,
    scores every alternative against the attended document, and attaches
    the training/evaluation tensors to ``self``.

    Reads ``self.args`` (``rnn_type``, ``activation``, ``num_layers``,
    ``hidden_size``, ``use_char_embedding``, ``char_hidden_size``,
    ``char_embedding_dim``), ``self.dataset`` (maximum lengths and
    ``char2id_size``) and ``self.embedding_matrix``.

    Placeholders are not stored on ``self``; they are created with the names
    ``keep_prob``, ``answer``, ``query``, ``document``, ``alternative`` and,
    with character embeddings enabled, ``query_char`` / ``document_char``.

    Sets:
        self.correct_prediction: number of correct answers in the batch.
        self.loss: mean sparse softmax cross-entropy.
        self.accuracy: ``correct_prediction`` divided by the batch size.
        self.prediction: argmax over the three alternatives.
        self.merged_summary: ``tf.summary.merge_all()``.

    Raises:
        NotImplementedError: for an unknown ``rnn_type`` or ``activation``.
    """
    keep_prob = tf.placeholder(name='keep_prob', dtype=tf.float32)
    answer = tf.placeholder(dtype=tf.int64, shape=[None], name="answer")
    query = tf.placeholder(dtype=tf.int64,
                           shape=[None, self.dataset.query_max_len],
                           name="query")
    document = tf.placeholder(dtype=tf.int64,
                              shape=[None, self.dataset.doc_max_len],
                              name="document")
    alterative = tf.placeholder(dtype=tf.int64,
                                shape=[None, 3, self.dataset.alt_max_len],
                                name="alternative")
    if self.args.use_char_embedding:
        q_input_char = tf.placeholder(dtype=tf.int32,
                                      shape=[
                                          None, self.dataset.query_max_len,
                                          self.dataset.q_char_len
                                      ],
                                      name='query_char')
        d_input_char = tf.placeholder(dtype=tf.int32,
                                      shape=[
                                          None, self.dataset.doc_max_len,
                                          self.dataset.d_char_len
                                      ],
                                      name='document_char')
        # Lengths are the counts of non-zero ids (id 0 acts as padding).
        doc_char_length = tf.reduce_sum(tf.sign(tf.abs(d_input_char)),
                                        axis=-1)
        query_char_length = tf.reduce_sum(tf.sign(tf.abs(q_input_char)),
                                          axis=-1)
        # NOTE(review): the two masks below are not read anywhere later in
        # this method; they are kept so the graph stays unchanged.
        doc_char_mask = tf.sequence_mask(doc_char_length,
                                         maxlen=self.dataset.d_char_len,
                                         dtype=tf.float32)
        query_char_mask = tf.sequence_mask(query_char_length,
                                           maxlen=self.dataset.q_char_len,
                                           dtype=tf.float32)

    doc_length = tf.reduce_sum(tf.sign(tf.abs(document)), axis=-1)
    query_length = tf.reduce_sum(tf.sign(tf.abs(query)), axis=-1)
    alt_length = tf.reduce_sum(tf.sign(tf.abs(alterative)), axis=-1)
    # NOTE(review): alt_mask is likewise unused below.
    alt_mask = tf.sequence_mask(alt_length,
                                maxlen=self.dataset.alt_max_len,
                                dtype=tf.float32)

    # Frozen word embeddings initialised from the supplied matrix.
    init_embedding = tf.constant(self.embedding_matrix,
                                 dtype=tf.float32,
                                 name="embedding_init")
    embedding_matrix = tf.get_variable("embedding_matrix",
                                       initializer=init_embedding,
                                       dtype=tf.float32,
                                       trainable=False)

    if self.args.rnn_type.lower() == 'modified':
        CELL = ModifiedRNNCell
    elif self.args.rnn_type.lower() == 'lstm':
        CELL = LSTMCell
    elif self.args.rnn_type.lower() == 'gru':
        CELL = GRUCell
    elif self.args.rnn_type.lower() == 'vanilla':
        CELL = VanillaRNNCell
    elif self.args.rnn_type.lower() == 'indrnn':
        CELL = IndRNNCell
    else:
        raise NotImplementedError(
            "No rnn_type named : %s implemented. Check." %
            self.args.rnn_type)

    if self.args.activation == 'sigmoid':
        activation = math_ops.sigmoid
    elif self.args.activation == 'relu':
        activation = nn_ops.relu
    elif self.args.activation == 'tanh':
        activation = math_ops.tanh
    elif self.args.activation == 'log':
        activation = math_ops.log
    elif self.args.activation == 'sin':
        activation = math_ops.sin
    elif self.args.activation == 'none':
        activation = lambda yy: yy
    else:
        # Report the offending activation (this used to print rnn_type).
        raise NotImplementedError(
            "No activation named : %s implemented. Check." %
            self.args.activation)

    if self.args.use_char_embedding:
        char_embedding = tf.get_variable(name='char_embdding_matrix',
                                         shape=[
                                             self.dataset.char2id_size,
                                             self.args.char_embedding_dim
                                         ],
                                         dtype=tf.float32,
                                         trainable=True)
        q_char_embed = tf.nn.embedding_lookup(char_embedding, q_input_char)
        d_char_embed = tf.nn.embedding_lookup(char_embedding, d_input_char)
        # Max over the last (embedding) axis, then dropout.
        q_char_embed = tf.nn.dropout(tf.reduce_max(q_char_embed, -1),
                                     keep_prob=keep_prob)
        d_char_embed = tf.nn.dropout(tf.reduce_max(d_char_embed, -1),
                                     keep_prob=keep_prob)
        # Document and query share one pair of character RNN cells
        # (AUTO_REUSE plus the identical 'char_rnn' scope).
        with tf.variable_scope('char_rnn', reuse=tf.AUTO_REUSE) as scp:
            char_rnn_f = MultiRNNCell(cells=[
                DropoutWrapper(CELL(num_units=self.args.char_hidden_size,
                                    activation=activation),
                               output_keep_prob=keep_prob)
            ])
            char_rnn_b = MultiRNNCell(cells=[
                DropoutWrapper(CELL(num_units=self.args.char_hidden_size,
                                    activation=activation),
                               output_keep_prob=keep_prob)
            ])
            d_char_embed_out, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=char_rnn_f,
                cell_bw=char_rnn_b,
                inputs=d_char_embed,
                sequence_length=tf.reduce_sum(
                    tf.sign(tf.abs(doc_char_length)), -1),
                initial_state_bw=None,
                dtype="float32",
                parallel_iterations=None,
                swap_memory=True,
                time_major=False,
                scope='char_rnn')
            q_char_embed_out, _ = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=char_rnn_f,
                cell_bw=char_rnn_b,
                inputs=q_char_embed,
                sequence_length=tf.reduce_sum(
                    tf.sign(tf.abs(query_char_length)), -1),
                initial_state_bw=None,
                dtype="float32",
                parallel_iterations=None,
                swap_memory=True,
                time_major=False,
                scope='char_rnn')
        # (A character CNN -- conv2d followed by max_pool -- was tried here
        # in place of the RNN and is no longer used.)
        # Join the forward and backward outputs along the feature axis.
        d_char_out = tf.concat(d_char_embed_out, -1)
        q_char_out = tf.concat(q_char_embed_out, -1)

    with tf.variable_scope("query_encoder") as scp:
        query_embed = tf.nn.embedding_lookup(embedding_matrix,
                                             query,
                                             max_norm=1.)
        if self.args.use_char_embedding:
            query_embed = tf.concat([query_embed, q_char_out], -1)
        query_inputs = tf.nn.relu(query_embed)
        query_last_states_concat = list()
        query_outputs_concat = list()
        for i in range(self.args.num_layers):
            query_inputs = tf.nn.relu(query_inputs)
            cell_fw = MultiRNNCell([
                CELL(num_units=self.args.hidden_size,
                     activation=activation,
                     name='rnn_fw_%d' % i)
            ])
            # NOTE(review): the backward cell is also named 'rnn_fw_%d'.
            # Renaming it would change variable names in saved checkpoints,
            # so it is left untouched here.
            cell_bw = MultiRNNCell([
                CELL(num_units=self.args.hidden_size,
                     activation=activation,
                     name='rnn_fw_%d' % i)
            ])
            query_outputs, query_last_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=query_inputs,
                sequence_length=query_length,
                initial_state_fw=None,
                initial_state_bw=None,
                dtype=tf.float32,
                parallel_iterations=None,
                swap_memory=True,
                time_major=False,
                scope=None)
            query_output_con = tf.concat(query_outputs, -1)
            query_last_states_concat.extend(query_last_states)
            query_outputs_concat.extend(query_outputs)
            # Every layer sees the embeddings again next to the previous
            # layer's output.
            query_inputs = tf.concat([query_embed, query_output_con], -1)
        query_outputs = tf.concat(query_outputs_concat, axis=-1)
        query_last_states = tf.concat(query_last_states_concat, axis=-1)
        query_last_states = tf.reshape(
            query_last_states,
            shape=[
                -1,
                query_last_states.get_shape()[0] *
                query_last_states.get_shape()[2]
            ])
        query_outputs_dropped = tf.nn.dropout(query_outputs,
                                              keep_prob=keep_prob)
        query_last_states_dropped = query_last_states
        # The query is summarised by max-pooling the outputs over time.
        query_outputs_max = math_ops.reduce_max(query_outputs_dropped,
                                                axis=-2)
        query_encoded = query_outputs_max
        query_encoded = tf.nn.dropout(query_encoded, keep_prob=keep_prob)

    with tf.variable_scope('doc_encoder') as scp:
        doc_embed = tf.nn.embedding_lookup(embedding_matrix,
                                           document,
                                           max_norm=1.)
        if self.args.use_char_embedding:
            doc_embed = tf.concat([doc_embed, d_char_out], -1)
        # Repeat the query encoding at every document position.
        qry_encoded_dupli = tf.tile(
            tf.expand_dims(query_encoded, 1),
            multiples=[1, self.dataset.doc_max_len, 1])
        doc_inputs = tf.nn.dropout(tf.concat(
            [doc_embed, qry_encoded_dupli], -1),
                                   keep_prob=keep_prob)
        doc_outputs_concat = list()
        doc_last_states_concat = list()
        for i in range(self.args.num_layers):
            doc_inputs = nn_ops.relu(doc_inputs)
            cell_fw = MultiRNNCell([
                CELL(num_units=self.args.hidden_size,
                     activation=activation,
                     name='rnn_fw_%d' % i)
            ])
            cell_bw = MultiRNNCell([
                CELL(num_units=self.args.hidden_size,
                     activation=activation,
                     name='rnn_fw_%d' % i)
            ])
            doc_outputs, doc_last_states = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=cell_fw,
                cell_bw=cell_bw,
                inputs=doc_inputs,
                sequence_length=doc_length,
                initial_state_fw=None,
                initial_state_bw=None,
                dtype=tf.float32,
                parallel_iterations=None,
                swap_memory=True,
                time_major=False,
                scope=None)
            doc_output_con = tf.concat(doc_outputs, -1)
            # Attend each document position with the query's final fw/bw
            # state of the same layer (entries 2*i and 2*i + 1).
            att = tf.nn.softmax(
                tf.squeeze(
                    special_math_ops.einsum(
                        'bij,bjk->bik', tf.concat(doc_outputs, -1),
                        tf.expand_dims(
                            tf.concat([
                                query_last_states_concat[2 * i][0],
                                query_last_states_concat[2 * i + 1][0]
                            ], -1), -1)), -1), -1)
            doc_output_con = tf.concat(doc_outputs, -1) * tf.expand_dims(
                att, -1)
            doc_outputs_concat.extend(doc_outputs)
            doc_last_states_concat.extend(doc_last_states)
            doc_inputs = tf.concat([doc_embed, doc_output_con], -1)
        # (An ELMo-style learned weighting of the layer outputs was tried
        # here and is no longer used.)
        doc_outputs = tf.concat(doc_outputs_concat, axis=-1)
        doc_last_states = tf.concat(doc_last_states_concat, axis=-1)
        doc_last_states = tf.reshape(doc_last_states,
                                     shape=[
                                         -1,
                                         doc_last_states.get_shape()[0] *
                                         doc_last_states.get_shape()[2]
                                     ])
        doc_last_states_dropped = tf.nn.dropout(doc_last_states,
                                                keep_prob=keep_prob)
        doc_encoded = doc_outputs

    with tf.variable_scope("attention") as scp:
        # Bilinear attention of every document position against the query.
        bi_att_w = tf.get_variable('bi_att_w',
                                   shape=[
                                       doc_encoded.get_shape()[-1],
                                       query_encoded.get_shape()[-1]
                                   ])
        doc_out_query_last_att = nn_ops.softmax(tf.squeeze(
            math_ops.matmul(
                special_math_ops.einsum('bij,jk->bik', doc_encoded,
                                        bi_att_w),
                tf.expand_dims(query_encoded, axis=-1)), -1),
                                                axis=-1)
        # Plain ("vanilla") attention is used; an attention-over-attention
        # variant was tried here and is no longer used.
        att = doc_out_query_last_att
        doc_atted = doc_encoded * tf.expand_dims(att, -1)  # B * D * 2H
        doc_atted_max = math_ops.reduce_max(doc_atted, axis=-2)

    with tf.variable_scope("alt_encoder", reuse=tf.AUTO_REUSE) as scp:
        alter_embed = embedding_ops.embedding_lookup(embedding_matrix,
                                                     alterative,
                                                     max_norm=1.)
        # (A pooled-embedding + affine encoding of the alternatives was
        # tried here and is no longer used.)
        num_layers = self.args.num_layers
        alt_last_states_concat = list()
        alt_outputs_concat = list()
        # The three alternatives go through the same RNN stack; AUTO_REUSE
        # makes them share its weights.
        for j in range(3):
            alt_last_states_concat_tmp = list()
            alt_outputs_concat_tmp = list()
            alter_input = alter_embed[:, j]
            for i in range(num_layers):
                alter_input = tf.nn.relu(alter_input)
                cell_fw = MultiRNNCell([
                    CELL(num_units=self.args.hidden_size,
                         activation=activation,
                         name='rnn_fw_%d' % i)
                ])
                cell_bw = MultiRNNCell([
                    CELL(num_units=self.args.hidden_size,
                         activation=activation,
                         name='rnn_fw_%d' % i)
                ])
                alter_outputs, alter_last_states = tf.nn.bidirectional_dynamic_rnn(
                    cell_fw=cell_fw,
                    cell_bw=cell_bw,
                    inputs=alter_input,
                    sequence_length=alt_length[:, j],
                    initial_state_fw=None,
                    initial_state_bw=None,
                    dtype=tf.float32,
                    parallel_iterations=None,
                    swap_memory=True,
                    time_major=False,
                    scope=None)
                alt_last_states_concat_tmp.extend(alter_last_states)
                alt_outputs_concat_tmp.extend(alter_outputs)
                alter_input = tf.concat(
                    [alter_embed[:, j],
                     tf.concat(alter_outputs, -1)], -1)
            alt_last_states_concat.append(
                tf.concat(alt_last_states_concat_tmp, -1))
            alt_outputs_concat.append(tf.concat(alt_outputs_concat_tmp, -1))
        # Final states of the three alternatives, transposed so that the
        # alternative index ends up on the last axis.
        alter_encoded = tf.transpose(
            tf.concat(alt_last_states_concat, 0), perm=[1, 2, 0])

    with tf.variable_scope("classify") as scp:
        # Score = sum of three max-pooled document views, each multiplied
        # with the alternative encodings: attended outputs, raw outputs and
        # projected embeddings.
        result = tf.squeeze(
            math_ops.matmul(tf.expand_dims(doc_atted_max, -2),
                            alter_encoded), -2)
        result = result + tf.squeeze(
            math_ops.matmul(
                tf.expand_dims(tf.reduce_max(doc_encoded, 1), 1),
                alter_encoded), 1)
        embed_w = tf.get_variable('embed_w',
                                  shape=[
                                      doc_embed.get_shape()[-1],
                                      alter_encoded.get_shape()[-2]
                                  ])
        # Squeeze only the axis inserted by expand_dims. Without an explicit
        # axis a batch of size 1 would be squeezed away too and the static
        # rank of `result` would be lost.
        result = result + tf.squeeze(
            tf.matmul(
                tf.expand_dims(
                    tf.reduce_max(
                        tf.einsum('bij,jk->bik', doc_embed, embed_w), 1),
                    1), alter_encoded), 1)
        # (Sum-pooled and last-hidden-state scoring variants were tried here
        # and are no longer used.)

        self.correct_prediction = tf.reduce_sum(
            tf.cast(tf.equal(tf.argmax(result, -1), answer), tf.int32))
        self.loss = tf.reduce_mean(
            nn_ops.sparse_softmax_cross_entropy_with_logits(logits=result,
                                                            labels=answer))
        self.accuracy = self.correct_prediction / tf.shape(document)[0]
        self.prediction = tf.argmax(result, -1)
        self.merged_summary = tf.summary.merge_all()
def predict_proba(self, data, data_spec=None):
    """Return class probabilities for a batch of data.

    Applies a softmax named ``"probabilities"`` to the output of
    ``self.inference_graph``.

    Args:
        data: batch handed unchanged to ``self.inference_graph``.
        data_spec: optional spec forwarded to ``self.inference_graph``.
    """
    return nn_ops.softmax(
        self.inference_graph(data, data_spec=data_spec),
        name="probabilities")
def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
    """Process the output of one time step and prepare the next step's input.

    Inference-time decoder step. At the first call (``cell_output is None``)
    the state, attention and output buffer are initialised. At later calls
    the attention is rebuilt; if it comes with alignments, each batch row
    picks either the most likely generated word or the most likely entity
    word, whichever has the higher probability, and the chosen id is written
    to ``context_state``.

    Names such as ``batch_size``, ``embeddings``, ``imem``, ``output_fn``,
    ``selector_fn``, ``attention_construct_fn``, ``encoder_state``,
    ``start_of_sequence_id``, ``end_of_sequence_id`` and ``maximum_length``
    come from the enclosing scope.

    Returns:
        Tuple ``(done, cell_state, next_input, cell_output, context_state)``.

    Raises:
        ValueError: if ``cell_input`` is not None.
    """
    with ops.name_scope(
            name, "attention_decoder_fn_inference",
        [time, cell_state, cell_input, cell_output, context_state]):
        # There is no external input at inference time.
        if cell_input is not None:
            raise ValueError(
                "Expected cell_input to be None, but saw: %s" % cell_input)
        # time == 0
        if cell_output is None:
            # Input for the next step: [batch_size] of start_of_sequence_id.
            next_input_id = array_ops.ones([
                batch_size,
            ], dtype=dtype) * (start_of_sequence_id
                               )
            # Whether decoding has finished: [batch_size] of False.
            done = array_ops.zeros([
                batch_size,
            ], dtype=dtypes.bool)
            # Initialise the decoder state from the encoder.
            cell_state = encoder_state
            # Decoder output "before" time step 0: [num_decoder_symbols].
            cell_output = array_ops.zeros(
                [num_decoder_symbols],
                dtype=dtypes.float32)
            # Turn the next input ids into embeddings:
            # [batch_size, num_embed_units].
            word_input = array_ops.gather(
                embeddings, next_input_id)
            # The triple embedding (imem[1]) that used to be concatenated
            # onto the decoder input is disabled; the word embedding alone
            # is used.
            cell_input = word_input
            # Initialise the attention.
            attention = _init_attention(encoder_state)
            if imem is not None:  # entity and word embeddings were passed in
                # Buffer of the ids emitted at each step.
                context_state = tensor_array_ops.TensorArray(
                    dtype=dtypes.int32,
                    tensor_array_name="output_ids_ta",
                    size=maximum_length,
                    dynamic_size=True,
                    infer_shape=False)
        # time >= 1
        else:
            # Build the attention.
            attention = attention_construct_fn(cell_output, attention_keys,
                                               attention_values)
            if type(attention) is tuple:  # alignments were returned as well
                attention, alignment = attention
                cell_output = attention
                # [batch_size, triple_num*triple_len] or
                # [batch_size, decoder_len]
                alignment = tf.reshape(
                    alignment, [batch_size, -1]
                )
                # Selector: probability of choosing an entity word.
                selector = selector_fn(cell_output)
                # [batch_size, num_decoder_symbols] predictions before softmax.
                logit = output_fn(
                    cell_output
                )
                # [batch_size, num_decoder_symbols] probability of each
                # generated word.
                word_prob = nn_ops.softmax(logit) * (
                    1 - selector
                )
                # Probability of each entity word, same shape as alignment.
                entity_prob = alignment * selector
                # mask: [batch_size, 1], 1.0 where this step uses a generated
                # word and 0.0 where it uses an entity word.
                # 1. tf.reduce_max(word_prob, 1): [batch_size] best
                #    generated-word probability
                # 2. tf.reduce_max(entity_prob, 1): [batch_size] best
                #    entity-word probability
                # 3. greater: [batch_size] whether 1 beats 2
                # 4. cast: bool to float
                # 5. reshape: [batch_size, 1]
                mask = array_ops.reshape(
                    math_ops.cast(math_ops.greater(
                        tf.reduce_max(word_prob, 1),
                        tf.reduce_max(entity_prob, 1)),
                                  dtype=dtypes.float32), [-1, 1])
                # word_input: [batch_size, num_embed_units] embedding fed at
                # the next step.
                # 1. cast(argmax(word_prob, 1)): [batch_size] index of the
                #    best generated word
                # 2. gather: [batch_size, num_embed_units] its embedding
                # 3. mask * gather: kept only where a generated word is used
                # 4. reshape(range(batch_size)): [batch_size, 1] row indices
                # 5. reshape(cast(argmax(entity_prob, 1))): [batch_size, 1]
                #    index of the best entity word
                # 6. concat: [batch_size, 2] (row, entity index) pairs
                # 7. imem[0]: [batch_size, triple_num*triple_len,
                #    num_embed_units]
                # 8. gather_nd: [batch_size, num_embed_units] entity embedding
                # 9. (1 - mask) * gather_nd: kept only where an entity word
                #    is used
                # 10. the sum of 3 and 9 selects one embedding per row
                word_input = mask * array_ops.gather(embeddings, math_ops.cast(math_ops.argmax(word_prob, 1), dtype=dtype)) + \
                    (1-mask)*array_ops.gather_nd(imem[0], array_ops.concat([array_ops.reshape(math_ops.range(batch_size, dtype=dtype), [-1, 1]),
                                                                            array_ops.reshape(math_ops.cast(math_ops.argmax(entity_prob, 1), dtype=dtype), [-1, 1])], axis=1))
                # The lookup of the chosen triple in imem[1] and its
                # concatenation onto the word embedding are disabled.
                cell_input = word_input
                # [batch_size] mask of rows that use a generated word.
                mask = array_ops.reshape(math_ops.cast(mask, dtype=dtype),
                                         [-1])
                # Id emitted at this step: the generated-word index where a
                # generated word is used, otherwise the entity index negated.
                #   argmax(word_prob, 1): [batch_size] generated-word index
                #   mask - 1: 0 for a generated word, -1 for an entity word
                #   argmax(entity_prob, 1): [batch_size] entity-word index
                input_id = mask * math_ops.cast(math_ops.argmax(word_prob, 1), dtype=dtype) + \
                    (mask - 1) * math_ops.cast(math_ops.argmax(entity_prob, 1), dtype=dtype)
                # Record input_id in the TensorArray.
                context_state = context_state.write(time - 1, input_id)
                # A row is finished once it emits end_of_sequence_id.
                done = array_ops.reshape(
                    math_ops.equal(input_id, end_of_sequence_id), [-1])
                # [batch_size, num_decoder_symbols] predictions before softmax.
                cell_output = logit
            else:  # no alignments were returned
                cell_output = attention
                # [batch_size, num_decoder_symbols] predictions before softmax.
                cell_output = output_fn(
                    cell_output
                )
                # [batch_size] index of the most likely generated word.
                next_input_id = math_ops.cast(math_ops.argmax(
                    cell_output, 1),
                                              dtype=dtype)
                # A row is finished once it emits end_of_sequence_id.
                done = math_ops.equal(next_input_id, end_of_sequence_id)
                # Cell input for the next step:
                # [batch_size, num_embed_units].
                cell_input = array_ops.gather(
                    embeddings, next_input_id)
        # Next step's input: the cell input with the attention appended.
        next_input = array_ops.concat([cell_input, attention], 1)
        # Once time > maximum_length report every row as done, otherwise
        # keep the per-row flags computed above.
        done = control_flow_ops.cond(
            math_ops.greater(time, maximum_length),
            lambda: array_ops.ones([
                batch_size,
            ], dtype=dtypes.bool), lambda: done)
        return (done, cell_state, next_input, cell_output, context_state)
def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
    """Beam-search inference step for an attention decoder.

    At the first call (`cell_output is None`) this builds the start-of-sequence
    input, the initial attention and six TensorArrays that record the search.
    On later calls it scores the current outputs, keeps `beam_size` candidates
    per original batch entry, records them and re-orders state/attention to
    follow the chosen parent beams.

    Relies on names from the enclosing scope (`batch_size`, `origin_batch`,
    `beam_size`, `num_decoder_symbols`, `embeddings`, `output_fn`, ...).

    Args:
        time: current step; step 1 is the first scoring step.
        cell_state: RNN state; indexed and iterated as a sequence of tensors.
        cell_input: must be None, otherwise ValueError is raised.
        cell_output: previous cell output, or None on the first call.
        context_state: 6-tuple of TensorArrays (ignored on the first call).

    Returns:
        (done, cell_state, next_input, cell_output, context_state) where
        context_state is (log_beam_probs, beam_parents, beam_symbols,
        result_probs, result_parents, result_symbols).

    Raises:
        ValueError: if `cell_input` is not None.
    """
    with ops.name_scope(
            name, "attention_decoder_fn_inference",
            [time, cell_state, cell_input, cell_output, context_state]):
        if cell_input is not None:
            raise ValueError("Expected cell_input to be None, but saw: %s" %
                             cell_input)
        if cell_output is None:
            # invariant that this is time == 0
            next_input_id = array_ops.ones(
                [batch_size,], dtype=dtype) * (start_of_sequence_id)
            done = array_ops.zeros([batch_size,], dtype=dtypes.bool)
            cell_state = encoder_state
            cell_output = array_ops.zeros(
                [num_decoder_symbols], dtype=dtypes.float32)
            cell_input = array_ops.gather(embeddings, next_input_id)

            # init attention
            attention = _init_attention(encoder_state)
            # init context state: one TensorArray per quantity, written once
            # per step at index time-1.
            log_beam_probs = tensor_array_ops.TensorArray(
                dtype=dtypes.float32, tensor_array_name="log_beam_probs",
                size=maximum_length, dynamic_size=True, infer_shape=False)
            beam_parents = tensor_array_ops.TensorArray(
                dtype=dtypes.int32, tensor_array_name="beam_parents",
                size=maximum_length, dynamic_size=True, infer_shape=False)
            beam_symbols = tensor_array_ops.TensorArray(
                dtype=dtypes.int32, tensor_array_name="beam_symbols",
                size=maximum_length, dynamic_size=True, infer_shape=False)
            result_probs = tensor_array_ops.TensorArray(
                dtype=dtypes.float32, tensor_array_name="result_probs",
                size=maximum_length, dynamic_size=True, infer_shape=False)
            result_parents = tensor_array_ops.TensorArray(
                dtype=dtypes.int32, tensor_array_name="result_parents",
                size=maximum_length, dynamic_size=True, infer_shape=False)
            result_symbols = tensor_array_ops.TensorArray(
                dtype=dtypes.int32, tensor_array_name="result_symbols",
                size=maximum_length, dynamic_size=True, infer_shape=False)
            context_state = (log_beam_probs, beam_parents, beam_symbols,
                             result_probs, result_parents, result_symbols)
        else:
            # construct attention
            attention = attention_construct_fn(cell_output, attention_keys,
                                               attention_values)
            cell_output = attention

            # beam search decoder
            (log_beam_probs, beam_parents, beam_symbols, result_probs,
             result_parents, result_symbols) = context_state

            cell_output = output_fn(cell_output)  # logits
            cell_output = nn_ops.softmax(cell_output)
            # Drop the first two symbol columns; ids are shifted back by +2
            # below when `symbols` is computed.
            cell_output = array_ops.split(
                cell_output, [2, num_decoder_symbols-2], 1)[1]
            # Rows 0, beam_size, 2*beam_size, ...: one row per original batch
            # entry, used only at time == 1.
            tmp_output = array_ops.gather(
                cell_output, math_ops.range(origin_batch)*beam_size)

            # Log-probabilities of extending each beam. At time == 1 only one
            # row per batch entry is scored; afterwards the previous step's
            # beam log-probs are added row-wise.
            probs = control_flow_ops.cond(
                math_ops.equal(time, ops.convert_to_tensor(1, dtype)),
                lambda: math_ops.log(
                    tmp_output+ops.convert_to_tensor(1e-20, dtypes.float32)),
                lambda: math_ops.log(
                    cell_output+ops.convert_to_tensor(1e-20, dtypes.float32)) +
                array_ops.reshape(log_beam_probs.read(time-2), [-1, 1]))

            # One row per original batch entry, all candidates side by side.
            probs = array_ops.reshape(probs, [origin_batch, -1])
            best_probs, indices = nn_ops.top_k(probs, beam_size * 2)

            # Turn per-row candidate indices into indices over the flattened
            # [origin_batch * beam_size * (num_decoder_symbols-2)] layout by
            # adding a per-row offset.
            indices_flatten = array_ops.reshape(indices, [-1]) + array_ops.reshape(
                array_ops.concat(
                    [array_ops.reshape(
                        math_ops.range(origin_batch)*((num_decoder_symbols-2)*beam_size),
                        [-1, 1])]*(beam_size*2), 1),
                [origin_batch*beam_size*2])
            best_probs_flatten = array_ops.reshape(best_probs, [-1])

            # Decompose the flat index into symbol id (+2 undoes the split
            # above) and parent row.
            symbols = indices_flatten % (num_decoder_symbols - 2)
            symbols = symbols + 2
            parents = indices_flatten // (num_decoder_symbols - 2)

            # Add 1e5 to every candidate whose symbol differs from
            # end_of_sequence_id so that top_k ranks non-EOS candidates first.
            probs_wo_eos = best_probs + 1e5*math_ops.cast(
                math_ops.cast(
                    (indices%(num_decoder_symbols-2)+2)-end_of_sequence_id,
                    dtypes.bool),
                dtypes.float32)

            # best_probs_wo_eos carries the 1e5 bias and is not used further;
            # the unbiased values are gathered from best_probs_flatten.
            best_probs_wo_eos, indices_wo_eos = nn_ops.top_k(
                probs_wo_eos, beam_size)
            # Offset into the flattened [origin_batch * beam_size * 2] list.
            indices_wo_eos = array_ops.reshape(indices_wo_eos, [-1]) + array_ops.reshape(
                array_ops.concat(
                    [array_ops.reshape(
                        math_ops.range(origin_batch)*(beam_size*2),
                        [-1, 1])]*beam_size, 1),
                [origin_batch*beam_size])

            _probs = array_ops.gather(best_probs_flatten, indices_wo_eos)
            _symbols = array_ops.gather(symbols, indices_wo_eos)
            _parents = array_ops.gather(parents, indices_wo_eos)

            # Beams that continue decoding.
            log_beam_probs = log_beam_probs.write(time-1, _probs)
            beam_symbols = beam_symbols.write(time-1, _symbols)
            beam_parents = beam_parents.write(time-1, _parents)
            # All 2*beam_size candidates of this step, including EOS ones.
            result_probs = result_probs.write(time-1, best_probs_flatten)
            result_symbols = result_symbols.write(time-1, symbols)
            result_parents = result_parents.write(time-1, parents)

            next_input_id = array_ops.reshape(_symbols, [batch_size])

            # Re-order state and attention rows to follow the chosen parents.
            state_size = int(cell_state[0].get_shape().with_rank(2)[1])
            attn_size = int(attention.get_shape().with_rank(2)[1])
            state = []
            for j in cell_state:
                state.append(array_ops.reshape(
                    array_ops.gather(j, _parents), [-1, state_size]))
            cell_state = tuple(state)
            attention = array_ops.reshape(
                array_ops.gather(attention, _parents), [-1, attn_size])

            # NOTE(review): this value is overwritten by the cond below, so
            # EOS never ends decoding here — presumably intentional for beam
            # search; confirm.
            done = math_ops.equal(next_input_id, end_of_sequence_id)
            cell_input = array_ops.gather(embeddings, next_input_id)

        # combine cell_input and attention
        next_input = array_ops.concat([cell_input, attention], 1)

        # if time > maxlen, return all true vector; otherwise all false
        done = control_flow_ops.cond(
            math_ops.greater(time, maximum_length),
            lambda: array_ops.ones([batch_size,], dtype=dtypes.bool),
            lambda: array_ops.zeros([batch_size,], dtype=dtypes.bool))
        return (done, cell_state, next_input, cell_output,
                (log_beam_probs, beam_parents, beam_symbols, result_probs,
                 result_parents, result_symbols))
def attention_decoder(decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      batch_size,
                      state_size,
                      decoder_inputs_positions=None,
                      decoder_inputs_maps=None,
                      output_size=None,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None):
    """RNN decoder with attention and an optional map-environment input.

    Args:
        decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
            Embedded inputs.
        initial_state: 2D Tensor [batch_size x cell.state_size].
        attention_states: 3D Tensor [batch_size x attn_length x attn_size].
        cell: rnn_cell.RNNCell defining the cell function and size.
        batch_size: Python int; needed explicitly because the environment
            state is updated one sample at a time.
        state_size: size of the environment state vector.
        decoder_inputs_positions: a list of 2D Tensors of shape
            [batch_size, 3] giving the position of each example in a map.
            Default None (no environment tracking).
        decoder_inputs_maps: a 1D Tensor of length batch_size selecting the
            map of each example. Default None (no environment tracking).
        output_size: size of the output vectors; if None, cell.output_size.
        loop_function: if not None, applied to the i-th output to generate
            the (i+1)-th input; decoder_inputs is then ignored except for the
            first element. Signature: loop_function(prev, i) = next.
        dtype: accepted for interface compatibility; not used in the body.
        scope: VariableScope for the subgraph; default "attention_decoder".

    Returns:
        A tuple (outputs, states, attentions, environments, positions):
        outputs: list (one per decoder input) of 2D Tensors
            [batch_size x output_size]; the linear projection of
            tanh(W1*cell_output + W2*context [+ W3*env]). No softmax is
            applied here.
        states: list of cell states, starting with initial_state, so it has
            len(decoder_inputs) + 1 entries.
        attentions: list of attention weights, one per step.
        environments: list of environment tensors (None entries when
            environment tracking is off), one per step.
        positions: list of position tensors (None entries when tracking is
            off), one per step.

    Raises:
        ValueError: if there are no inputs, or if dimensions 1 and 2 of
            attention_states are not statically known.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    # Both attn_length (dim 1) and attn_size (dim 2) are read below, so both
    # must be defined; the previous [1:2] slice only checked dim 1.
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    def _is_given(value):
        """True if an optional argument was supplied (Tensor-safe check)."""
        # Tensors cannot be used as Python booleans, so compare with None and
        # only use len() on plain Python sequences.
        if value is None:
            return False
        if isinstance(value, (list, tuple)):
            return len(value) > 0
        return True

    track_env = (_is_given(decoder_inputs_positions) and
                 _is_given(decoder_inputs_maps))

    with vs.variable_scope(scope or "attention_decoder"):
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        # Stacked maps, indexed as [map, pos0, pos1, pos2, feature].
        mapIdx = array_ops.pack(
            [map3.map_grid, map3.map_jelly, map3.map_one])
        attention_vec_size = attn_size  # size of query

        states = [initial_state]
        # current position and environment
        position, env = None, None

        # reshape for later computation
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        def attention(query):
            """Attend over `hidden` with `query`; returns (h_attn, weights)."""
            # NOTE(review): AttnW1/AttnW2/Environment are assumed to nest
            # under "Attention"; confirm against existing checkpoints.
            with vs.variable_scope("Attention"):
                # Attention mask is a softmax of h_in^T * decoder_hidden.
                # Replicate the query so it can be multiplied element-wise
                # with every attention state.
                dec_hid = array_ops.tile(query, [1, attn_length])
                dec_hid = array_ops.reshape(
                    dec_hid, [-1, attn_length, attention_vec_size])
                attn_weight = nn_ops.softmax(
                    math_ops.reduce_sum(attention_states * dec_hid, [2]))
                # Attention-weighted context vector.
                cc = math_ops.reduce_sum(
                    array_ops.reshape(attn_weight, [-1, attn_length, 1, 1]) *
                    hidden, [1, 2])
                with vs.variable_scope("AttnW1"):
                    term1 = rnn_cell.linear(query, attn_size, False)
                with vs.variable_scope("AttnW2"):
                    term2 = rnn_cell.linear(cc, attn_size, False)
                # `env` is read from the enclosing scope at call time. It is
                # a Tensor when set, so it must be compared with None.
                if env is not None:
                    with vs.variable_scope("Environment"):
                        term3 = rnn_cell.linear(
                            math_ops.to_float(env), attn_size, False)
                    h_attn = math_ops.tanh(term1 + term2 + term3)
                else:
                    h_attn = math_ops.tanh(term1 + term2)
            return h_attn, attn_weight

        def _lookup_env(map_no, pos):
            """Environment vector [state_size] of one sample at `pos`."""
            cell_slice = array_ops.slice(
                mapIdx,
                array_ops.pack([map_no, pos[0], pos[1], pos[2], 0]),
                [1, 1, 1, 1, state_size])
            return array_ops.reshape(cell_slice, [state_size])

        def _select_by_heading(ppos, case0, case1, case2, case_other):
            """Nested cond on ppos[2]: 0 -> case0, 1 -> case1, 2 -> case2."""
            heading = ppos[2]
            return control_flow_ops.cond(
                math_ops.equal(heading, 0), case0,
                lambda: control_flow_ops.cond(
                    math_ops.equal(heading, 1), case1,
                    lambda: control_flow_ops.cond(
                        math_ops.equal(heading, 2), case2, case_other)))

        def _turn(ppos, new_headings):
            """Keep ppos[0:2]; map heading 0/1/2/other to new_headings."""
            h0, h1, h2, h_other = new_headings
            return _select_by_heading(
                ppos,
                lambda: array_ops.pack([ppos[0], ppos[1], h0]),
                lambda: array_ops.pack([ppos[0], ppos[1], h1]),
                lambda: array_ops.pack([ppos[0], ppos[1], h2]),
                lambda: array_ops.pack([ppos[0], ppos[1], h_other]))

        def f_move(ppos):
            """Move forward one step along the current heading."""
            return _select_by_heading(
                ppos,
                lambda: array_ops.pack([ppos[0], ppos[1] - 1, ppos[2]]),
                lambda: array_ops.pack([ppos[0] + 1, ppos[1], ppos[2]]),
                lambda: array_ops.pack([ppos[0], ppos[1] + 1, ppos[2]]),
                lambda: array_ops.pack([ppos[0] - 1, ppos[1], ppos[2]]))

        def f_right(ppos):
            """Turn right: heading 0->1, 1->2, 2->3, other->0."""
            return _turn(ppos, (1, 2, 3, 0))

        def f_left(ppos):
            """Turn left: heading 0->3, 1->0, 2->1, other->2."""
            return _turn(ppos, (3, 0, 1, 2))

        def f_back(ppos):
            """Turn back: heading 0->2, 1->3, 2->0, other->1."""
            return _turn(ppos, (2, 3, 0, 1))

        def _apply_step(sstep, ppos):
            """Position after action `sstep`; unknown actions keep `ppos`."""

            def _case(action_id, action_fn, otherwise):
                return control_flow_ops.cond(
                    math_ops.equal(sstep, action_id),
                    lambda: action_fn(ppos), otherwise)

            return control_flow_ops.cond(
                math_ops.equal(sstep, data_utils.noAct_ID),
                lambda: ppos,
                lambda: _case(
                    data_utils.moveAct_ID, f_move,
                    lambda: _case(
                        data_utils.turnRight_ID, f_right,
                        lambda: _case(
                            data_utils.turnLeft_ID, f_left,
                            lambda: _case(
                                data_utils.turnBack_ID, f_back,
                                lambda: ppos)))))

        def updateEnv(_position, _step, _mapNo):
            """Update position and environment state for every sample.

            Args:
                _position: a 2D Tensor of shape [batch_size, 3].
                _step: per-sample action ids (noAct / move / turn right /
                    turn left / turn back, as defined in data_utils), or
                    None to take no step.
                _mapNo: a 1D int32 Tensor of length batch_size.

            Returns:
                (new_pos, new_env): new_pos is a [batch_size, 3] Tensor with
                the position after the step; new_env is a
                [batch_size, state_size] Tensor with the environment state
                at that position.

            Raises:
                ValueError: if _mapNo or _position is None.
            """
            if _mapNo is None:
                raise ValueError(" Invalid argument mapNo in updateEnv! ")
            if _position is None:
                raise ValueError(" Invalid argument position in updateEnv! ")

            if _step is None:
                # No step: only look up the environment at each position.
                new_env = array_ops.pack([
                    _lookup_env(_mapNo[j], _position[j, :])
                    for j in range(batch_size)
                ])
                return _position, new_env

            new_pos = []
            new_env = []
            for j in range(batch_size):
                current = _position[j, :]
                moved = _apply_step(_step[j], current)
                # Reject moves whose first two coordinates leave [0, 24];
                # the sample then stays where it was.
                out_of_map = math_ops.logical_or(
                    math_ops.greater(moved[0], 24),
                    math_ops.logical_or(
                        math_ops.greater(moved[1], 24),
                        math_ops.logical_or(
                            math_ops.less(moved[0], 0),
                            math_ops.less(moved[1], 0))))
                # Bind loop values as defaults so the lambdas do not depend
                # on later iterations.
                accepted = control_flow_ops.cond(
                    out_of_map,
                    lambda current=current: current,
                    lambda moved=moved: moved)
                new_pos.append(accepted)
                new_env.append(_lookup_env(_mapNo[j], accepted))
            return array_ops.pack(new_pos), array_ops.pack(new_env)

        outputs = []
        attentions = []
        environments = []
        positions = []
        prev = None

        if track_env and batch_size:
            position = decoder_inputs_positions[0]  # [batch_size, 3]
            _, env = updateEnv(position, None, decoder_inputs_maps)

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with vs.variable_scope("loop_function", reuse=True):
                    inp = array_ops.stop_gradient(loop_function(prev, i))

            # Run the RNN.
            cur_output, new_state = cell(inp, states[-1])
            cur_output = array_ops.reshape(cur_output,
                                           [batch_size, attn_size])
            states.append(new_state)

            # Run the attention mechanism.
            h_attn, attn_weight = attention(cur_output)
            attentions.append(attn_weight)

            with vs.variable_scope("AttnOutputProjection"):
                output = rnn_cell.linear(h_attn, output_size, False)

            if loop_function is not None:
                # We do not propagate gradients over the loop function.
                prev = array_ops.stop_gradient(output)

            if track_env and position is not None:
                # update position and environment
                if loop_function is not None:
                    # Follow the action the model itself predicted.
                    step = math_ops.argmax(nn_ops.softmax(prev), 1)
                    position, env = updateEnv(position, step,
                                              decoder_inputs_maps)
                elif i < len(decoder_inputs_positions) - 1:
                    # Teacher forcing: use the provided next position.
                    position = decoder_inputs_positions[i + 1]
                    _, env = updateEnv(position, None, decoder_inputs_maps)

            outputs.append(output)
            environments.append(env)
            positions.append(position)

    return outputs, states, attentions, environments, positions
def local_attention(decoder_hidden_state,
                    hidden_attn,
                    initializer,
                    window_size=10,
                    content_function=vinyals_kaiser,
                    dtype=tf.float32):
    """Put local attention on the encoder states for one decoder step.

    Parameters
    ----------
    decoder_hidden_state : 2-D Tensor
        Current hidden state of the decoder (output of the recurrent
        layers). Shape is (?, decoder_size).
    hidden_attn : 4-D Tensor
        Hidden states of the encoder, shaped (?, timesteps, 1, decoder_size)
        so that the content function can score them with a 1-D convolution.
        Dimensions 1 and 3 must be statically known.
    initializer : function
        Initializer used for variables created in this function.
    window_size : int
        Size of each side of the attention window. Default 10.
    content_function : function
        Called as content_function(hidden_attn, decoder_hidden_state) to
        score encoder states against the decoder state. Default
        'vinyals_kaiser'.
    dtype : tensorflow dtype
        Type of the position-index tensors. Default tf.float32.

    Returns
    -------
    ds : 2-D Tensor
        Context vector, shape (?, decoder_size): one vector per batch
        sample.
    """
    assert content_function is not None

    # Force float arithmetic: on Python 2 `window_size / 2` truncates and
    # leaves an int denominator for tf.truediv below.
    sigma = window_size / 2.0
    # NOTE(review): Luong et al. divide by 2 * sigma^2; this code uses
    # sigma^2 — confirm which is intended.
    denominator = tf.convert_to_tensor(sigma ** 2, dtype=dtype)

    attention_vec_size = hidden_attn.get_shape()[3].value
    attn_length = hidden_attn.get_shape()[1].value

    # NOTE(review): only the linear layer is assumed to live in
    # "WindowPrediction"; confirm variable names against checkpoints.
    with vs.variable_scope("AttentionLocal", initializer=initializer):
        # Score the encoder hidden states against the decoder state.
        s = content_function(hidden_attn, decoder_hidden_state)

        with vs.variable_scope("WindowPrediction", initializer=initializer):
            ht = cells.linear([decoder_hidden_state], attention_vec_size,
                              True)

        # Parameters vp of the window-position predictor.
        vp = vs.get_variable("AttnVp_%d" % 0, [attention_vec_size],
                             initializer=initializer)

        # pt = S * sigmoid(vp . tanh(Wp * ht)): predicted window centre.
        tanh = math_ops.tanh(ht)
        S = attn_length
        # NOTE(review): reducing over axes [2, 3] presumes `ht` is 4-D —
        # confirm what cells.linear returns.
        pt = math_ops.reduce_sum((vp * tanh), [2, 3])
        pt = math_ops.sigmoid(pt) * S
        # keep only the integer part
        pt = tf.floor(pt)

        _ = tf.histogram_summary('local_window_predictions', pt)

        # Row of encoder positions [[0, 1, ..., attn_length - 1]]. It
        # broadcasts against the per-sample window bounds below. The previous
        # `indices * batch_size` (batch_size being a Tensor) multiplied the
        # index values instead of repeating the row per sample.
        idx = tf.reshape(
            math_ops.cast(math_ops.range(attn_length), dtype),
            [1, attn_length])

        # Window boundaries; +1 because floor already yields the first
        # position.
        low = pt - window_size + 1
        high = pt + window_size

        # 1.0 where a position is outside the window on either side.
        mlow = tf.to_float(idx < low)
        mhigh = tf.to_float(idx > high)
        m = mlow + mhigh

        # Invert: 1.0 inside the window, 0.0 outside.
        mask = tf.to_float(tf.equal(m, 0.0))

        # NOTE(review): multiplying scores by the mask sets outside scores to
        # 0 before the softmax, so they still receive probability mass —
        # confirm whether an additive large-negative mask was intended.
        alpha = s * mask
        masked_soft = nn_ops.softmax(alpha)

        _ = tf.histogram_summary('local_alpha_weights', alpha)

        # Gaussian weighting centred on pt: exp(-(idx - pt)^2 / sigma^2).
        numerator = -tf.pow((idx - pt), tf.convert_to_tensor(2, dtype=dtype))
        div = tf.truediv(numerator, denominator)
        e = math_ops.exp(div)

        at = masked_soft * e

        # Attention-weighted context vector.
        d = math_ops.reduce_sum(
            array_ops.reshape(at, [-1, attn_length, 1, 1]) * hidden_attn,
            [1, 2])
        ds = array_ops.reshape(d, [-1, attention_vec_size])

        _ = tf.histogram_summary('local_attention_context', ds)

    return ds
def _add_seq2seq(self):
    """Add the whole sequence-to-sequence model to the graph.

    Builds, in order: the ResNet image encoder for the side (picture)
    inputs, the shared word embedding, the text encoder and side RNN
    encoder, the text/side interaction gate, the decoder and output
    projection, the text losses (train/eval only), the side-attention
    branch with its picture loss, and the top-k outputs used in decode
    mode.

    Sets many attributes on `self`, among them `_enc_states`,
    `_dec_in_state`, `_last_state`, `attn_dists`, `p_gens`, `coverage`,
    `attn_side`, `_loss`, `_loss_pic`, `_all_loss` and, in decode mode,
    `_topk_ids` / `_topk_log_probs`.

    NOTE(review): the variable-scope nesting below was reconstructed from
    flattened source; confirm variable names against existing checkpoints.
    """
    hps = self._hps
    vsize = self._vocab.size()  # size of the vocabulary

    # Image encoder: every side picture is a 32x64 RGB image.
    self.reshaped_pix = tf.reshape(self._side_batch, [-1, 32, 64, 3])
    with slim.arg_scope(resnet_arg_scope()):
        _, end_points = resnet_v1_152(self.reshaped_pix,
                                      is_training=FLAGS.mode == 'train')
    pic_encoded = end_points['global_pool']

    with tf.variable_scope('seq2seq'):
        # Some initializers
        self.rand_unif_init = tf.random_uniform_initializer(
            -hps.rand_unif_init_mag, hps.rand_unif_init_mag, seed=123)
        self.trunc_norm_init = tf.truncated_normal_initializer(
            stddev=hps.trunc_norm_init_std)

        # Embedding matrix (shared by the encoder and decoder inputs)
        with tf.variable_scope('embedding'):
            embedding = tf.get_variable('embedding', [vsize, hps.emb_dim],
                                        dtype=tf.float32,
                                        initializer=self.trunc_norm_init)
            # (batch_size, max_enc_steps, emb_size)
            emb_enc_inputs = tf.nn.embedding_lookup(embedding,
                                                    self._enc_batch)
            # list of max_dec_steps tensors (batch_size, emb_size)
            emb_dec_inputs = [
                tf.nn.embedding_lookup(embedding, x)
                for x in tf.unstack(self._dec_batch, axis=1)
            ]
            pic_encoded = tf.reshape(
                tf.squeeze(pic_encoded),
                [FLAGS.batch_size, FLAGS.max_side_steps, -1])
            # NOTE(review): projected to emb_dim * 2 but reshaped below with
            # hidden_dim * 2 — this presumes emb_dim == hidden_dim; confirm.
            emb_side_inputs = tf.layers.dense(pic_encoded, FLAGS.emb_dim * 2)

        # Add the encoder.
        enc_outputs, fw_st, bw_st = self._add_encoder(emb_enc_inputs,
                                                      self._enc_lens)

        # Group the side inputs in runs of 5 and encode each run:
        # (batch_size * max_side_steps / 5, 5, hidden_dim * 2)
        new_emb_side_inputs = tf.reshape(emb_side_inputs, [
            FLAGS.batch_size * int(FLAGS.max_side_steps / 5), 5,
            FLAGS.hidden_dim * 2
        ])
        side_states = self._add_side_rnn_encoder(
            new_emb_side_inputs,
            5 * tf.ones((new_emb_side_inputs.get_shape()[0]),
                        dtype=tf.int32))
        self._side_inputs = tf.reshape(
            side_states, [FLAGS.batch_size, -1, FLAGS.hidden_dim * 2])
        self._enc_states = enc_outputs

        # The encoder is bidirectional and the decoder unidirectional, so
        # the final encoder state is reduced to the decoder's state size.
        self._dec_in_state = self._reduce_states(fw_st, bw_st)
        self._last_state = tf.concat(self._dec_in_state, -1)

        with tf.variable_scope('interaction'):
            change_side_states = tf.transpose(self._side_inputs, [0, 2, 1])
            self._change_side_states = change_side_states
            # batch_size * enc_len * side_len
            attn_matrix = tf.matmul(self._enc_states, change_side_states)
            self._video_aware_enc_states = tf.matmul(attn_matrix,
                                                     self._side_inputs)
            self._news_aware_side_states = tf.matmul(
                tf.transpose(attn_matrix, [0, 2, 1]), self._enc_states)
            gate = tf.layers.dense(self._last_state, 1,
                                   activation=tf.nn.sigmoid)
            gate = tf.expand_dims(
                tf.tile(gate, [1, FLAGS.hidden_dim * 2]), 1)
            # tf.ones_like keeps the constant in the gate's dtype and shape,
            # matching the fusion gate further down.
            ones = tf.ones_like(gate)
            self._enc_states = gate * self._enc_states + (
                ones - gate) * self._video_aware_enc_states

        # Add the decoder.
        with tf.variable_scope('decoder'):
            (decoder_outputs, self._dec_out_state, self.attn_dists,
             self.p_gens, self.coverage) = self._add_decoder(emb_dec_inputs)

        # Output projection to obtain the vocabulary distribution
        with tf.variable_scope('output_projection'):
            w = tf.get_variable('w', [hps.hidden_dim, vsize],
                                dtype=tf.float32,
                                initializer=self.trunc_norm_init)
            # Unused below; kept so the set of graph ops is unchanged.
            w_t = tf.transpose(w)
            v = tf.get_variable('v', [vsize], dtype=tf.float32,
                                initializer=self.trunc_norm_init)
            # Pre-softmax vocabulary scores, one entry per decoder step.
            vocab_scores = []
            for i, output in enumerate(decoder_outputs):
                if i > 0:
                    tf.get_variable_scope().reuse_variables()
                vocab_scores.append(tf.nn.xw_plus_b(output, w, v))

            # Vocabulary distributions: list of max_dec_steps arrays
            # (batch_size, vsize), in vocabulary-file order.
            vocab_dists = [tf.nn.softmax(s) for s in vocab_scores]

        # For the pointer-generator model, combine copy and vocabulary
        # distributions; otherwise use the vocabulary distribution as is.
        if FLAGS.pointer_gen:
            final_dists = self._calc_final_dist(vocab_dists, self.attn_dists)
        else:
            final_dists = vocab_dists

        if hps.mode in ['train', 'eval']:
            with tf.variable_scope('loss'):
                if FLAGS.pointer_gen:
                    # Pick the probability of each gold target word with
                    # tf.gather_nd, one decoder step at a time.
                    loss_per_step = []
                    batch_nums = tf.range(0, limit=hps.batch_size)
                    for dec_step, dist in enumerate(final_dists):
                        # indices of the target words, shape (batch_size)
                        targets = self._target_batch[:, dec_step]
                        # shape (batch_size, 2)
                        indices = tf.stack((batch_nums, targets), axis=1)
                        # prob of the correct word, shape (batch_size)
                        gold_probs = tf.gather_nd(dist, indices)
                        # 1e-10 guards against log(0)
                        losses = -tf.log(gold_probs + 1e-10)
                        loss_per_step.append(losses)

                    # Apply dec_padding_mask and average
                    self._loss = _mask_and_avg(loss_per_step,
                                               self._dec_padding_mask)
                else:  # baseline model; sequence_loss applies softmax
                    self._loss = tf.contrib.seq2seq.sequence_loss(
                        tf.stack(vocab_scores, axis=1), self._target_batch,
                        self._dec_padding_mask)

                tf.summary.scalar('loss', self._loss)

                # Coverage loss from the attention distributions
                if hps.coverage:
                    with tf.variable_scope('coverage_loss'):
                        self._coverage_loss = _coverage_loss(
                            self.attn_dists, self._dec_padding_mask)
                        tf.summary.scalar('coverage_loss',
                                          self._coverage_loss)
                    self._total_loss = (
                        self._loss + hps.cov_loss_wt * self._coverage_loss)
                    tf.summary.scalar('total_loss', self._total_loss)

        with tf.variable_scope('side'):
            emb_side_inputs = tf.nn.l2_normalize(emb_side_inputs, dim=-1)
            # self-attention over the side inputs
            side_outputs, sfw_st, sbw_st = self._add_side_encoder(
                self._side_inputs, self._side_lens)
            conditional_vec = tf.expand_dims(self._last_state, 1)
            conditional_weight = tf.layers.dense(
                tf.multiply(conditional_vec, side_outputs), 1)
            self._cond_side_states = tf.multiply(side_outputs,
                                                 conditional_weight)

            s_gate = tf.layers.dense(self._last_state, 1,
                                     activation=tf.nn.sigmoid)
            s_gate = tf.expand_dims(s_gate, 1)
            # s_gate is a symbolic Tensor, so the matching ones tensor must
            # be built with tf.ones_like (np.ones_like cannot read it).
            s_ones = tf.ones_like(s_gate)
            self._side_states = s_gate * self._news_aware_side_states + (
                s_ones - s_gate) * self._cond_side_states

            fusion_gate = tf.layers.dense(self._last_state, 1,
                                          activation=tf.nn.sigmoid)
            fusion_gate = tf.expand_dims(
                tf.tile(fusion_gate, [1, FLAGS.hidden_dim * 2]), 1)
            fusion_ones = tf.ones_like(fusion_gate)
            # Repeat every side state 5 times so it lines up with the
            # per-picture embeddings again.
            side_states = tf.nn.l2_normalize(
                tf.reshape(
                    tf.tile(tf.expand_dims(self._side_states, 1),
                            [1, 5, 1, 1]),
                    [FLAGS.batch_size, -1, FLAGS.hidden_dim * 2]),
                dim=-1)
            fusion_side = fusion_gate * emb_side_inputs + (
                fusion_ones - fusion_gate) * side_states
            attn_side = tf.squeeze(
                tf.layers.dense(
                    fusion_side, 1,
                    kernel_initializer=tf.contrib.layers.xavier_initializer()))
            attn_side = nn_ops.softmax(attn_side)
            self.attn_side = attn_side

        with tf.variable_scope('pic_loss'):
            # Note that attn_side is already softmax-normalised here.
            self._loss_pic = pairwise_hinge_loss(
                logits=attn_side,
                labels=tf.one_hot(self._dec_pic_target,
                                  FLAGS.max_side_steps))

        if hps.mode in ['train', 'eval']:
            self._all_loss = self._loss + self._loss_pic

    if hps.mode in ("decode", "auto_decode"):
        # Decode (beam search) mode runs one decoder step at a time, so
        # final_dists is a singleton list of (batch_size, extended_vsize).
        assert len(final_dists) == 1
        final_dists = final_dists[0]
        # Take the k largest probs; batch_size == beam_size in decode mode.
        topk_probs, self._topk_ids = tf.nn.top_k(final_dists,
                                                 hps.batch_size * 2)
        self._topk_log_probs = tf.log(topk_probs)
def attention_score_fn(query, keys, values):
    """Compute attention scores and the attention-weighted sum of `values`.

    Args:
        query: [batch_size, num_units], the output of the previous step.
        keys: either a tensor [batch_size, encoder_len, num_units], or a
            tuple (graph_keys, triples_keys) with
            graph_keys [batch_size, triple_num, num_units] (static graph) and
            triples_keys [encoder_batch_size, triple_num, triple_len,
            num_units] (triples).
        values: same layout as `keys`, holding the values.

    Returns:
        With tuple inputs: (context_vector, context_graph_triples,
        final_alignments). Otherwise (context_vector, alignments) when
        `output_alignments` is set in the enclosing scope, else only
        context_vector.

    Raises:
        ValueError: if `attention_option` is neither "bahdanau" nor "luong".
    """
    # Tuple inputs carry a second level (the triples) next to the graph.
    triple_keys = triple_values = None
    if type(keys) is tuple:
        keys, triple_keys = keys
        values, triple_values = values

    # First level: keys are [batch_size, attention_length, num_units] both
    # for encoder outputs and for the static graph, so one code path scores
    # either. Triples are scored separately below.
    if attention_option == "bahdanau":
        # Linear transform of the query, then [batch_size, 1, num_units].
        query = array_ops.reshape(
            math_ops.matmul(query, query_w), [-1, 1, num_units])
        # reduce_sum(score_v * tanh(keys + query), [2])
        raw_scores = _attn_add_fun(score_v, keys, query)
    elif attention_option == "luong":
        query = array_ops.reshape(query, [-1, 1, num_units])
        # reduce_sum(keys * query, [2])
        raw_scores = _attn_mul_fun(keys, query)
    else:
        raise ValueError("Unknown attention option %s!" % attention_option)

    # Normalised scores, [batch_size, attention_length].
    # TODO(thangluong): not normalize over padding positions.
    alignments = nn_ops.softmax(raw_scores)

    # Weighted sum over the attended positions, [batch_size, num_units].
    alignments_3d = array_ops.expand_dims(alignments, 2)
    context_vector = math_ops.reduce_sum(alignments_3d * values, [1])
    context_vector.set_shape([None, num_units])

    if triple_values is None:
        if output_alignments:
            return context_vector, alignments
        return context_vector

    # Second level: multiplicative scores for every entry of every triple,
    # [batch_size, triple_num, triple_len].
    query_4d = array_ops.reshape(query, [-1, 1, 1, num_units])
    triple_scores = math_ops.reduce_sum(triple_keys * query_4d, [3])
    triple_alignments = nn_ops.softmax(triple_scores)

    # Weighted sum inside each triple, [batch_size, triple_num, num_units].
    weighted_triples = array_ops.expand_dims(triple_alignments, 3) * triple_values
    context_triples = math_ops.reduce_sum(weighted_triples, [2])

    # Weighted sum over the triples using the graph-level attention,
    # [batch_size, num_units].
    context_graph_triples = math_ops.reduce_sum(
        alignments_3d * context_triples, [1])
    context_graph_triples.set_shape([None, num_units])

    # Graph-level attention times triple-level attention,
    # [batch_size, triple_num, triple_len].
    final_alignments = alignments_3d * triple_alignments
    return context_vector, context_graph_triples, final_alignments
def decoder_fn(time, cell_state, cell_input, cell_output, context_state):
    """Inference-time decoder step that greedily picks a word or an entity.

    Args:
        time: current decoding step; compared against `maximum_length`.
        cell_state: current RNN cell state (replaced by `encoder_state` on
            the first call).
        cell_input: must be None.
        cell_output: None on the first call, otherwise the previous cell
            output.
        context_state: TensorArray of emitted ids; created on the first call
            when `imem` is not None.

    Returns:
        The tuple (done, cell_state, next_input, cell_output, context_state).

    Raises:
        ValueError: if `cell_input` is not None.
    """
    with ops.name_scope(
            name, "attention_decoder_fn_inference",
            [time, cell_state, cell_input, cell_output, context_state]):
        if cell_input is not None:
            raise ValueError("Expected cell_input to be None, but saw: %s" % cell_input)
        if cell_output is None:
            # invariant that this is time == 0
            # Every sequence starts from the start-of-sequence id and is not
            # finished yet.
            next_input_id = array_ops.ones(
                [batch_size,], dtype=dtype) * (start_of_sequence_id)
            done = array_ops.zeros([batch_size,], dtype=dtypes.bool)
            cell_state = encoder_state
            cell_output = array_ops.zeros(
                [num_decoder_symbols], dtype=dtypes.float32)
            word_input = array_ops.gather(embeddings, next_input_id)
            # All-zero (row, col) indices: the first triple input is read
            # from position [0, 0] of imem[1] for every batch element.
            naf_triple_id = array_ops.zeros([batch_size, 2], dtype=dtype)
            triple_input = array_ops.gather_nd(imem[1], naf_triple_id)
            cell_input = array_ops.concat([word_input, triple_input], axis=1)
            # init attention
            attention = _init_attention(encoder_state)
            # NOTE(review): imem[1] is already indexed above, so imem being
            # None would have failed before reaching this check.
            if imem is not None:
                context_state = tensor_array_ops.TensorArray(dtype=dtypes.int32, tensor_array_name="output_ids_ta", size=maximum_length, dynamic_size=True, infer_shape=False)
        else:
            # construct attention
            attention = attention_construct_fn(cell_output, attention_keys, attention_values)
            if type(attention) is tuple:
                # Tuple result: (attention vector, alignment over entities).
                attention, alignment = attention
                cell_output = attention
                alignment = tf.reshape(alignment, [batch_size, -1])
                selector = selector_fn(cell_output)
                logit = output_fn(cell_output)
                # The selector splits the probability mass between vocabulary
                # words and entities.
                word_prob = nn_ops.softmax(logit) * (1 - selector)
                entity_prob = alignment * selector
                # mask is 1.0 where the best word beats the best entity,
                # else 0.0; shape [-1, 1].
                mask = array_ops.reshape(math_ops.cast(math_ops.greater(tf.reduce_max(word_prob, 1), tf.reduce_max(entity_prob, 1)), dtype=dtypes.float32), [-1,1])
                # Next word input: embedding of the argmax word where mask
                # is 1, otherwise the imem[0] row of the argmax entity.
                word_input = mask * array_ops.gather(embeddings, math_ops.cast(math_ops.argmax(word_prob, 1), dtype=dtype)) + (1 - mask) * array_ops.gather_nd(imem[0], array_ops.concat([array_ops.reshape(math_ops.range(batch_size, dtype=dtype), [-1,1]), array_ops.reshape(math_ops.cast(math_ops.argmax(entity_prob, 1), dtype=dtype), [-1,1])], axis=1))
                # (batch index, triple index) pairs; the triple index is
                # forced to 0 where a word was chosen.
                indices = array_ops.concat([array_ops.reshape(math_ops.range(batch_size, dtype=dtype), [-1,1]), math_ops.cast(1-mask, dtype=dtype) * tf.reshape(math_ops.cast(math_ops.argmax(alignment, 1), dtype=dtype), [-1, 1])], axis=1)
                triple_input = array_ops.gather_nd(imem[1], indices)
                cell_input = array_ops.concat([word_input, triple_input], axis=1)
                mask = array_ops.reshape(math_ops.cast(mask, dtype=dtype), [-1])
                # Emitted id: the word id where mask == 1; where mask == 0
                # the factor (mask - 1) is -1, so the entity index is stored
                # negated.
                input_id = mask * math_ops.cast(math_ops.argmax(word_prob, 1), dtype=dtype) + (mask - 1) * math_ops.cast(math_ops.argmax(entity_prob, 1), dtype=dtype)
                context_state = context_state.write(time-1, input_id)
                done = array_ops.reshape(math_ops.equal(input_id, end_of_sequence_id), [-1])
                cell_output = logit
            else:
                cell_output = attention

                # argmax decoder
                cell_output = output_fn(cell_output)  # logits
                next_input_id = math_ops.cast(
                    math_ops.argmax(cell_output, 1), dtype=dtype)
                done = math_ops.equal(next_input_id, end_of_sequence_id)
                cell_input = array_ops.gather(embeddings, next_input_id)

        # combine cell_input and attention
        next_input = array_ops.concat([cell_input, attention], 1)

        # if time > maxlen, return all true vector
        done = control_flow_ops.cond(
            math_ops.greater(time, maximum_length),
            lambda: array_ops.ones([batch_size,], dtype=dtypes.bool),
            lambda: done)
        return (done, cell_state, next_input, cell_output, context_state)
def attention(decoder_state, coverage=None, num_words_section=None, step=None):
    """Calculate the context vector and attention distribution from the decoder state.

    Args:
      decoder_state: state of the decoder
      coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).
      num_words_section: number of words in each section (only needed for hierarchical attention)
        [batch_size, num_sections] -- assumes number of sections in the batch is equal (TODO: check sanity).
        Not referenced anywhere in this function body.
      step: index of the current decoder step (needed for section attention).
        Not referenced anywhere in this function body.

    Returns:
      context_vector: weighted sum of encoder_states
      attn_dist: attention distribution
      coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
    """
    with variable_scope.variable_scope("Attention"):
        # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
        # (W_s s_t) + b_att is decoder_features; s_t = decoder_state
        decoder_features = linear(decoder_state, attention_vec_size, True)  # shape (batch_size, attention_vec_size)
        decoder_features = tf.expand_dims(tf.expand_dims(decoder_features, 1), 1)  # reshape to (batch_size, 1, 1, attention_vec_size)

        def masked_attention(e, enc_padding_mask):
            # When a section-level padding mask exists in the enclosing
            # scope it replaces the mask passed in by the caller.
            # NOTE(review): the original indentation was lost; the cast is
            # assumed to belong to this `if` body -- confirm.
            if enc_section_padding_mask is not None:
                enc_padding_mask = tf.reshape(enc_section_padding_mask, [batch_size, -1])
                enc_padding_mask = tf.cast(enc_padding_mask, tf.float32)
            """Take softmax of e then apply enc_padding_mask and re-normalize"""
            attn_dist = nn_ops.softmax(e)  # take softmax. shape (batch_size, attn_length)
            attn_dist *= enc_padding_mask  # apply mask
            masked_sums = tf.reduce_sum(attn_dist, axis=1)  # shape (batch_size)
            return attn_dist / tf.reshape(masked_sums, [-1, 1])  # re-normalize

        if use_coverage and coverage is not None:  # non-first step of coverage
            if not hier:
                # TODO: add coverage on sections
                # Multiply coverage vector by w_c to get coverage_features.
                coverage_features = nn_ops.conv2d(coverage, w_c, [1, 1, 1, 1], "SAME")  # c has shape (batch_size, seq_len, 1, attention_vec_size)
                # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features + coverage_features), [2, 3])  # shape (batch_size,seq_len)
                # Take softmax of e to get the attention distribution
                attn_dist = masked_attention(e, enc_padding_mask)
                # Update coverage vector
                coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])  # shape=(batch_size, seq_len,1,1)
            else:
                # Hierarchical variant: the section-level features are added
                # to the same score before the tanh.
                with tf.variable_scope("attention_words_sections"):
                    coverage_features = nn_ops.conv2d(coverage, w_c, [1, 1, 1, 1], "SAME")  # c has shape (batch_size, seq_len, 1, attention_vec_size)
                    e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features + encoder_section_features + coverage_features), [2, 3])  # shape (batch_size,seq_len)
                    attn_dist = masked_attention(e, enc_padding_mask)
                    coverage += array_ops.reshape(attn_dist, [batch_size, -1, 1, 1])  # shape=(batch_size, seq_len,1,1)
        else:
            # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
            if hier:
                with tf.variable_scope("attention_words_sections"):
                    e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features + encoder_section_features), [2, 3])  # [batch_size x seq_len]
                    # Without a padding mask a plain softmax is used.
                    if enc_padding_mask is not None:
                        attn_dist = masked_attention(e, enc_padding_mask)
                    else:
                        attn_dist = nn_ops.softmax(e)  # shape (batch_size, seq_len)
            else:
                e = math_ops.reduce_sum(v * math_ops.tanh(encoder_features + decoder_features), [2, 3])  # calculate e
                # Take softmax of e to get the attention distribution
                if enc_padding_mask is not None:
                    attn_dist = masked_attention(e, enc_padding_mask)
                else:
                    attn_dist = nn_ops.softmax(e)  # shape (batch_size, seq_len)

            if use_coverage:  # first step of training
                coverage = tf.expand_dims(tf.expand_dims(attn_dist,2),2)  # initialize coverage

        # Calculate the context vector from attn_dist and encoder_states
        # encoder_states = [batch, seq_len, 1, encoder_output_size], attn_dist = [batch, seq_len, 1, 1]
        context_vector = math_ops.reduce_sum(array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) * encoder_states, [1, 2])  # shape (batch_size, enc_output_size).
        context_vector = array_ops.reshape(context_vector, [-1, enc_output_size])

    return context_vector, attn_dist, coverage
def testShapeInference(self):
    """Softmax of a nested 3x2x4 list literal reports the static shape [3, 2, 4]."""
    logits = [
        [[1., 1., 1., 1.], [1., 2., 3., 4.]],
        [[2., 3., 4., 5.], [6., 7., 8., 9.]],
        [[5., 4., 3., 2.], [1., 2., 3., 4.]],
    ]
    softmax_op = nn_ops.softmax(logits)
    self.assertEqual([3, 2, 4], softmax_op.get_shape())
def _test_softmax(data):
    """Run one softmax comparison through `compare_tflite_with_tvm`.

    A placeholder with the shape and dtype of `data` is created in a fresh
    graph, softmax is applied to it, and the input/output tensors are handed
    to the comparison helper together with `data`.
    """
    graph = tf.Graph()
    with graph.as_default():
        placeholder = array_ops.placeholder(shape=data.shape, dtype=data.dtype)
        softmax_out = nn_ops.softmax(placeholder)
        compare_tflite_with_tvm(
            data, 'Placeholder:0', [placeholder], [softmax_out])
def loop_fn(loop_time, cell_output, cell_state, loop_state):
    """Step function (raw_rnn style) for the attention / pointer decoder.

    On the first call (`cell_output is None`) it initialises the coverage,
    the context vector and all history TensorArrays. On later calls it runs
    attention, computes the output projection, `p_gen`, the vocabulary and
    final distributions, and records them in the histories carried through
    `loop_state`.

    Args:
        loop_time: current time step.
        cell_output: output of the cell at the previous step, or None at
            time 0.
        cell_state: state of the cell at the previous step; the last
            element is read as the last layer's state.
        loop_state: 10-tuple of histories plus `coverage` and the previous
            `cell_input`, as built at the end of this function.

    Returns:
        (elements_finished, next_cell_input, next_cell_state, emit_output,
        next_loop_state)
    """
    if cell_output is None:  # time == 0
        final_dist = None
        emit_output = final_dist  # == None for time == 0
        next_cell_state = initial_state  # encoder last states
        coverage = (array_ops.zeros([batch_size, attn_size])
                    if prev_coverage is None else prev_coverage)

        # context vector will initially be zeros
        # Ensure the second shape of attention vectors is set.
        context_vector = array_ops.zeros([batch_size, attn_size])
        context_vector.set_shape([None, attn_size])

        if initial_state_attention:
            with variable_scope.variable_scope(
                    scope.Attention, reuse=tf.AUTO_REUSE):
                # true in decode mode
                # Re-calculate the context vector from the previous
                # step so that we can pass it through a linear layer
                # with this step's input to get a modified version of
                # the input in decode mode, this is what updates the
                # coverage vector
                context_vector, _, coverage = _compute_attention(
                    cell_output=next_cell_state[-1].h,
                    coverage=coverage)

        # all TensorArrays for recording sequences
        outputs_history = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=0, dynamic_size=True)
        alignments_history = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=0, dynamic_size=True)
        p_gens_history = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=0, dynamic_size=True)
        coverages_history = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=0, dynamic_size=True)
        sampled_tokens_history = tensor_array_ops.TensorArray(
            dtype=tf.int32, size=0, dynamic_size=True)

        # mostly used in debugging
        logits_history = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=0, dynamic_size=True)
        vocab_dists_history = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=0, dynamic_size=True)
        final_dists_history = tensor_array_ops.TensorArray(
            dtype=tf.float32, size=0, dynamic_size=True)

    else:
        # normal workflow:
        #   decoder_inputs = input_kernel(inputs; context)
        #   cell_output, states = cell(decoder_inputs, states)
        #   context, att_dist, coverage = attention(states, coverage)
        #   p_gen = pgen_kernel(...)
        #   cell_outputs = output_kernel(cell_output, context)
        # since raw-rnn encapsulates cell call
        # we do this:
        #   context, att_dist, coverage = attention(states, coverage)
        #   p_gen = pgen_kernel(...)
        #   cell_outputs = output_kernel(cell_output, context)
        #   next_inputs = input_kernel(inputs; context) --> changed

        # no change
        next_cell_state = cell_state
        # get the cell state of last layer's cell
        last_layer_state = cell_state[-1]
        # cell_input is cell inputs
        (sampled_tokens_history,
         outputs_history,
         alignments_history,
         p_gens_history,
         coverages_history,
         logits_history,
         vocab_dists_history,
         final_dists_history,
         coverage, cell_input) = loop_state

        # Run the attention mechanism.
        # NOTE(review): the original indentation was lost; the output
        # projection is assumed to run inside the Attention scope (as
        # input_kernel does further down) -- confirm against checkpoints.
        with variable_scope.variable_scope(
                scope.Attention, reuse=tf.AUTO_REUSE):
            # reuse=initial_state_attention or i > 0
            # or scope.Attention.reuse):
            context_vector, attn_dist, coverage = _compute_attention(
                cell_output=cell_output, coverage=coverage)

            # Concatenate the cell_output (= decoder state)
            # and the context vector, and pass them through
            # a linear layer. This is V[s_t, h*_t] + b in the paper
            attention_output = output_kernel(
                array_ops.concat([cell_output, context_vector], -1))

        # update attention and cell_outputs
        outputs_history = outputs_history.write(
            loop_time - 1, attention_output)
        alignments_history = alignments_history.write(
            loop_time - 1, attn_dist)
        coverages_history = coverages_history.write(
            loop_time - 1, coverage)

        # Calculate p_gen
        # NOTE(review): p_gen is only bound when pointer_gen is truthy, yet
        # it is passed to _calc_final_dist unconditionally below.
        if pointer_gen:
            with variable_scope.variable_scope(scope.Pointer):
                p_gen = pgen_kernel(array_ops.concat([
                    context_vector,
                    last_layer_state.c,
                    last_layer_state.h,
                    cell_input], -1))
            # update p_gens_history distributions
            p_gens_history = p_gens_history.write(
                loop_time - 1, p_gen)

        # distribution
        logits = logits_kernel(attention_output)
        vocab_dist = nn_ops.softmax(logits)
        final_dist = _calc_final_dist(
            vocab_dist=vocab_dist,
            attn_dist=attn_dist,
            p_gen=p_gen,
            batch_size=batch_size,
            vocab_size=vocab_size,
            num_source_OOVs=num_source_OOVs,
            enc_batch_extended_vocab=enc_batch_extended_vocab)

        # raw_rnn requires `emit_output` to have same
        # shape with cell.output_size
        # thus we have to output attention_output
        # but not the final_distribution
        emit_output = attention_output

        # save these for debugging
        logits_history = logits_history.write(
            loop_time - 1, logits)
        vocab_dists_history = vocab_dists_history.write(
            loop_time - 1, vocab_dist)
        final_dists_history = final_dists_history.write(
            loop_time - 1, final_dist)

    # generic
    elements_finished = (loop_time >= sequence_length)
    finished = math_ops.reduce_all(elements_finished)

    if reinforce and not initial_state_attention:
        # see Google's code
        # elements_finished = tf.logical_or(
        #     tf.equal(chosen_outputs, misc.BF_EOS_INT),
        #     loop_time >= global_config.timestep_limit)
        # they have this logical_or to stop
        # generation when sampled STOP
        # I am ignoring this for now, but probably
        # look back on this later?
        # also, Google used prev_elements_finished
        # but I used elements_finished, is that correct?
        if cell_output is None:  # time == 0
            # when time == 0, use start_tokens
            tf.logging.info("Running RLModel")
            chosen_outputs = start_tokens
        else:
            def _multinomial_sample(probs):
                # tf.multinomial only samples from
                # logits (unnormalized probability)
                # here we only have normalized probability
                # thus we use distributions.Categorical
                dist = categorical.Categorical(probs=probs)
                # use argmax during debugging
                if not debug_mode:
                    sampled_tokens = dist.sample()
                else:
                    sampled_tokens = dist.mode()

                # since final_dist = vocab_dist + copy_dist
                # sampled_tokens can have index out-of vocab_dist
                # in this case we cast them into UNK.
                # Valid in-vocabulary ids are 0 .. vocab_size - 1, so the
                # id equal to vocab_size is already out of vocabulary and
                # must be clamped too (hence >=, not >); otherwise it would
                # reach the embedding gather below.
                UNKs = array_ops.ones_like(sampled_tokens) * UNK_token
                sampled_tokens = array_ops.where(
                    math_ops.greater_equal(sampled_tokens, vocab_size),
                    UNKs, sampled_tokens,
                    name="sampled_tokens")
                return sampled_tokens

            # otherwise, do the sampling in sequence_length
            chosen_outputs = tf.to_int32(array_ops.where(
                elements_finished,
                array_ops.zeros([batch_size], dtype=tf.int32),
                _multinomial_sample(final_dist)))

            sampled_tokens_history = sampled_tokens_history.write(
                loop_time - 1, chosen_outputs)

        next_input = array_ops.gather(embeddings, chosen_outputs)
    else:
        # Teacher-forced input; zeros once every sequence has finished so
        # that inputs_ta is not read past its end.
        next_input = control_flow_ops.cond(
            finished,
            lambda: array_ops.zeros(
                [batch_size, input_size], dtype=tf.float32),
            lambda: inputs_ta.read(loop_time))

    with variable_scope.variable_scope(scope.Attention):
        # next inputs = input_kernel(inp; context)
        next_cell_input = input_kernel(
            array_ops.concat([next_input, context_vector], -1))

    next_loop_state = (
        sampled_tokens_history,
        outputs_history,
        alignments_history,
        p_gens_history,
        coverages_history,
        logits_history,
        vocab_dists_history,
        final_dists_history,
        coverage, next_cell_input)

    return (elements_finished,
            next_cell_input,
            next_cell_state,
            emit_output,
            next_loop_state)
def XentLossGrad(logits, labels, dloss):
    """Deliberately altered gradient for the cross-entropy loss.

    Returns `exp(dloss * (softmax(logits) - labels))` for the logits and
    zeros for the labels.
    """
    per_example_dloss = array_ops.reshape(dloss, [-1, 1])
    residual = nn_ops.softmax(logits) - labels
    dlogits = per_example_dloss * residual
    dlabels = array_ops.zeros_like(labels)
    # Takes exp(dlogits) to differentiate it from the "correct" gradient.
    return math_ops.exp(dlogits), dlabels
def MnistForward(w, b, x):
    """Return softmax(x @ w + b)."""
    logits = math_ops.matmul(x, w) + b
    return nn_ops.softmax(logits)
def intra_decoder_attention(decoder_state, outputs):
    """Calculate the context vector and attention distribution from the decoder state.

    Args:
      decoder_state: state of the decoder
      outputs: list of decoder states for implementing intra-decoder mechanism,
        len(decoder_states) * (batch_size, hidden_dim)

    Returns:
      context_decoder_vector: weighted sum of _dec_states
      decoder_attn_dist: intra-decoder attention distribution

      If building the attention graph raises, a pair of zero tensors of
      shapes [batch_size, decoder_attn_size] and [batch_size, 0] is returned
      instead.
    """
    attention_dec_vec_size = attn_dec_size = decoder_state.c.get_shape()[1].value  # hidden_dim
    # Best-effort read of the number of previous decoder states; any failure
    # is treated as "no previous states". Only `Exception` is caught so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        len_dec_states = outputs.get_shape()[0].value
    except Exception:
        len_dec_states = 0

    _decoder_states = tf.expand_dims(
        tf.reshape(outputs, [batch_size, -1, attn_dec_size]),
        axis=2)  # now is shape (batch_size,len(decoder_states), 1, attn_size)
    _prev_decoder_features = nn_ops.conv2d(
        _decoder_states, W_h_d, [1, 1, 1, 1],
        "SAME")  # shape (batch_size,len(decoder_states),1,attention_vec_size)

    with variable_scope.variable_scope("DecoderAttention"):
        # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
        try:
            decoder_features = linear(
                decoder_state, attention_dec_vec_size,
                True)  # shape (batch_size, attention_vec_size)
            decoder_features = tf.expand_dims(
                tf.expand_dims(decoder_features, 1),
                1)  # reshape to (batch_size, 1, 1, attention_dec_vec_size)

            if _hps.matrix_attention:
                # Calculate h_d * W_attn * h_d, equation 6 in https://arxiv.org/pdf/1705.04304.pdf
                _dec_attn = tf.matmul(
                    tf.squeeze(decoder_features),
                    w_dec_attn)  # (batch_size, decoder_attn_size)
                _dec_states_lst = tf.unstack(
                    tf.reshape(_prev_decoder_features,
                               [batch_size, -1, decoder_attn_size])
                )  # batch_size * (len(decoder_states), decoder_attn_size)
                e_not_masked = tf.reshape(
                    tf.stack([
                        tf.matmul(_dec_attn, tf.transpose(k))
                        for k in _dec_states_lst
                    ]),
                    [batch_size, -1])  # (batch_size, len(decoder_states))
                masked_e = tf.exp(
                    e_not_masked * dec_padding_mask[:, :len_dec_states]
                )  # (batch_size, len(decoder_states))
            else:
                # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                e_not_masked = math_ops.reduce_sum(
                    v_d * math_ops.tanh(_prev_decoder_features + decoder_features),
                    [2, 3])  # calculate e, (batch_size,len(decoder_states))
                masked_e = nn_ops.softmax(
                    e_not_masked
                ) * dec_padding_mask[:, :len_dec_states]  # (batch_size,len(decoder_states))

            if len_dec_states <= 1:
                masked_e = array_ops.ones(
                    [batch_size, 1])  # first step is filled with equal values
            masked_sums = tf.reshape(
                tf.reduce_sum(masked_e, axis=1),
                [-1, 1])  # (batch_size,1)
            # NOTE(review): the original comment says a zero sum "is set to
            # a small value", but no such guard exists in this code.
            decoder_attn_dist = masked_e / masked_sums  # (batch_size,len(decoder_states))
            context_decoder_vector = math_ops.reduce_sum(
                array_ops.reshape(decoder_attn_dist, [batch_size, -1, 1, 1]) *
                _decoder_states, [1, 2])  # (batch_size, attn_size)
            context_decoder_vector = array_ops.reshape(
                context_decoder_vector,
                [-1, attn_dec_size])  # (batch_size, attn_size)
        except Exception:
            # Deliberate best-effort fallback kept from the original: any
            # graph-construction error yields an empty attention result.
            return array_ops.zeros(
                [batch_size, decoder_attn_size]), array_ops.zeros([batch_size, 0])

    return context_decoder_vector, decoder_attn_dist
def XentLoss(logits, labels):
    """Return sum(labels * log(softmax(logits))) over axis 1 (no negation applied)."""
    log_probs = math_ops.log(nn_ops.softmax(logits))
    return math_ops.reduce_sum(labels * log_probs, 1)
def _output_with_attention(cell_output, output_size, decoder_hidden,
                           attn_size, projection_attention_f,
                           initializer=None, output_form=OUTPUT_CONCAT):
    """Attend over `decoder_hidden` and combine the result into one output.

    Parameters
    ----------
    cell_output
        Passed to the split / concat output helpers; unused for
        OUTPUT_SINGLE.
    output_size
        Size argument forwarded to the output helper.
    decoder_hidden
        Tensor whose static dimension 1 gives the number of timesteps.
    attn_size
        Width the attention context is reshaped to.
    projection_attention_f
        Callable producing the attention scores from
        `(decoder_hidden, attn_size)`.
    initializer
        Variable initializer; must not be None.
    output_form
        One of OUTPUT_SPLIT, OUTPUT_SINGLE; anything else uses the concat
        form.

    Returns
    -------
    The tanh of the selected output projection.
    """
    assert initializer is not None

    # NOTE(review): the original indentation was lost. The output helpers
    # and the final tanh are assumed to sit under "AttnOutputProjection" but
    # outside "output_attention", with tanh applied to every output form --
    # confirm against the variable names of an existing checkpoint.
    with vs.variable_scope("AttnOutputProjection", initializer=initializer):
        with vs.variable_scope("output_attention", initializer=initializer):
            attn_scores = projection_attention_f(decoder_hidden, attn_size)
            # attn_weights will be (?, timesteps)
            attn_weights = nn_ops.softmax(attn_scores)

            num_steps = decoder_hidden.get_shape()[1].value
            # weights_4d and decoder_hidden will be (?, timesteps, 1, 1)
            weights_4d = array_ops.reshape(
                attn_weights, [-1, num_steps, 1, 1])
            # pooled is (?, decoder_size)
            pooled = math_ops.reduce_sum(weights_4d * decoder_hidden, [1, 2])
            # context is (?, decoder_size)
            context = tf.reshape(pooled, [-1, attn_size])
            _ = tf.histogram_summary('attention_context', context)

        # output = cells.linear([cell_output] + [ds], output_size, True)
        if output_form == OUTPUT_SPLIT:
            projected = _output_form_split(
                cell_output, context, output_size, initializer=initializer)
        elif output_form == OUTPUT_SINGLE:
            projected = _output_form_single(
                context, output_size, initializer=initializer)
        else:
            projected = _output_form_concat(
                cell_output, context, output_size, initializer=initializer)

        output = tf.tanh(projected)

    return output