def _compute_log_probs(self, logits):
    """ Computes log probabilities.

    Here, `num_samples` == `beam_size` * `batch_size`.

    Args:
        logits: The logits Tensor with shape [num_samples, vocab_size],
          or a list of logits Tensors.

    Returns:
        The log probability Tensor with shape [num_samples, vocab_size].
    """
    logits = nest.flatten(logits)
    if len(logits) == 1:
        probs = advanced_log_softmax(logits[0])  # log probabilities (non-positive values)
    else:
        assert len(logits) == len(self._ensemble_weights), (
            "ensemble weights must have the same length as logits")
        dim_vocab = logits[0].get_shape().as_list()[-1]
        # each element: [1, batch_size * beam_size * vocab_target]
        probs = nest.map_structure(
            lambda x: tf.expand_dims(
                tf.reshape(advanced_softmax(x), shape=[-1]), axis=0),
            logits)
        # [num_models, batch_size * beam_size * vocab_target]
        probs = tf.concat(probs, axis=0)
        # [1, num_models]
        weights = tf.expand_dims(
            tf.convert_to_tensor(self._ensemble_weights, dtype=tf.float32),
            axis=0)
        # weighted average of the per-model probabilities, then back to log space
        probs = tf.matmul(weights, probs)
        probs = tf.log(tf.reshape(probs, [-1, dim_vocab]))
    return probs
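# NOTE (illustrative sketch, not part of the model code): the ensemble branch
# above is simply a weighted average of per-model softmax distributions, taken
# back to log space afterwards. A minimal NumPy rendering of that arithmetic,
# with made-up model names, shapes, and weights:
import numpy as np

def _np_softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

# two toy models, num_samples=2, vocab_size=4 (shapes chosen for illustration)
logits_model_a = np.random.randn(2, 4)
logits_model_b = np.random.randn(2, 4)
ensemble_weights = np.array([0.6, 0.4])   # one weight per model

probs = (ensemble_weights[0] * _np_softmax(logits_model_a)
         + ensemble_weights[1] * _np_softmax(logits_model_b))
log_probs = np.log(probs)                 # [num_samples, vocab_size]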
def att_fn(self, query, keys, bias=None):
    """ Computes attention scores.

    Args:
        query: Attention query tensor with shape
          [batch_size, channels_query].
        keys: Attention keys tensor with shape
          [batch_size, num_of_keys, channels_key].
        bias: The bias tensor for attention keys.

    Returns:
        A Tensor, [batch_size, num_of_keys].
    """
    v_att = tf.get_variable(
        "v_att", shape=[self.params["num_units"]], dtype=tf.float32)
    logits = tf.reduce_sum(
        v_att * tf.tanh(keys + tf.expand_dims(query, 1)), [2])
    if bias is not None:
        logits += bias
    attention_scores = advanced_softmax(logits)
    attention_scores = dropout_wrapper(
        attention_scores, self.params["dropout_attention_keep_prob"])
    return attention_scores
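# NOTE (illustrative sketch, not part of the model code): the additive
# (Bahdanau-style) score above reduces [batch, num_keys, num_units] to
# [batch, num_keys] before the softmax. The same computation in NumPy, with
# made-up shapes and assuming `query` was already projected to num_units:
import numpy as np

batch, num_keys, num_units = 2, 5, 8
query = np.random.randn(batch, num_units)              # [batch, units]
keys = np.random.randn(batch, num_keys, num_units)     # [batch, keys, units]
v_att = np.random.randn(num_units)                     # [units]

# v . tanh(keys + query), summed over the channel dimension
logits = np.sum(v_att * np.tanh(keys + query[:, None, :]), axis=2)
# softmax over the key dimension
scores = np.exp(logits - logits.max(axis=1, keepdims=True))
scores /= scores.sum(axis=1, keepdims=True)             # [batch, num_keys]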
def dot_product_attention(q, k, bias=None, dropout_keep_prob=1.0):
    """ Computes attention weight according to query and key.

    Args:
        q: A query Tensor with shape [..., length_q, depth].
        k: A keys Tensor with shape [..., length_k, depth].
        bias: A bias Tensor broadcastable to [..., length_q, length_k],
          typically [..., 1, length_k].
        dropout_keep_prob: A float scalar.

    Returns:
        The attention scores Tensor with shape [..., length_q, length_k].
    """
    with tf.variable_scope("dot_product_attention", values=[q, k]):
        logits = tf.matmul(q, k, transpose_b=True)
        if bias is not None:
            logits += bias
        weights = advanced_softmax(logits)
        # dropout the attention links for each of the heads
        weights = dropout_wrapper(weights, keep_prob=dropout_keep_prob)
        return weights
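# NOTE (illustrative sketch, not part of the library code): the weights above
# are softmax(q . k^T + bias) over the key axis, where the bias is usually a
# large negative number on padded key positions. NumPy rendering with made-up
# shapes (the mask-to-bias construction is an assumption, not taken from this
# function):
import numpy as np

batch, length_q, length_k, depth = 2, 3, 5, 4
q = np.random.randn(batch, length_q, depth)
k = np.random.randn(batch, length_k, depth)

key_mask = np.array([[1, 1, 1, 1, 0],
                     [1, 1, 1, 0, 0]], dtype=np.float64)    # [batch, length_k]
bias = (1.0 - key_mask)[:, None, :] * -1e9                  # [batch, 1, length_k]

logits = q @ np.swapaxes(k, -1, -2) + bias                  # [batch, length_q, length_k]
logits -= logits.max(axis=-1, keepdims=True)
weights = np.exp(logits)
weights /= weights.sum(axis=-1, keepdims=True)              # rows sum to 1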
def _dot_product_attention(self, q, k, bias):
    """ Computes attention weight according to query and key.

    Args:
        q: A query Tensor with shape
          [batch_size, num_heads, length_q, depth / num_heads].
        k: A keys Tensor with shape
          [batch_size, num_heads, length_k, depth / num_heads].
        bias: A bias Tensor broadcastable to
          [batch_size, num_heads, length_q, length_k],
          typically [batch_size, 1, 1, length_k].

    Returns:
        The attention scores Tensor with shape
        [batch_size, num_heads, length_q, length_k].
    """
    with tf.variable_scope("dot_product_attention", values=[q, k]):
        logits = tf.matmul(q, k, transpose_b=True)
        if bias is not None:
            logits += bias
        weights = algebra_ops.advanced_softmax(logits)
        # dropout the attention links for each of the heads
        weights = dropout_wrapper(
            weights, keep_prob=self._dropout_attention_keep_prob)
        return weights
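# NOTE (illustrative sketch, not part of the class): the multi-head variant is
# the same batched matmul with an extra heads axis; the bias broadcasts across
# heads and query positions. Shape walk-through in NumPy with made-up sizes:
import numpy as np

batch, num_heads, length_q, length_k, depth_per_head = 2, 4, 3, 5, 8
q = np.random.randn(batch, num_heads, length_q, depth_per_head)
k = np.random.randn(batch, num_heads, length_k, depth_per_head)
bias = np.zeros((batch, 1, 1, length_k))    # broadcasts over heads and queries

logits = q @ np.swapaxes(k, -1, -2) + bias  # [batch, heads, length_q, length_k]
logits -= logits.max(axis=-1, keepdims=True)
weights = np.exp(logits)
weights /= weights.sum(axis=-1, keepdims=True)   # softmax over keys, per head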