def testConv1d(self):
  x = np.random.rand(5, 7, 11)
  with self.test_session() as session:
    y = common_layers.conv1d(tf.constant(x, dtype=tf.float32), 13, 1)
    session.run(tf.global_variables_initializer())
    res = session.run(y)
  self.assertEqual(res.shape, (5, 7, 13))
def parameter_attention(x,
                        total_key_depth,
                        total_value_depth,
                        output_depth,
                        memory_rows,
                        num_heads,
                        dropout_rate,
                        name=None):
  """Attention over parameters.

  We use the same multi-headed attention as in the other layers, but the
  memory keys and values are model parameters. There is no linear
  transformation on the keys or values.

  We are also a bit more careful about memory usage, since the number of
  memory positions may be very large.

  Args:
    x: a Tensor with shape [batch, length_q, channels]
    total_key_depth: an integer
    total_value_depth: an integer
    output_depth: an integer
    memory_rows: an integer
    num_heads: an integer dividing total_key_depth and total_value_depth
    dropout_rate: a floating point number
    name: an optional string

  Returns:
    A Tensor.
  """
  with tf.variable_scope(name, default_name="parameter_attention",
                         values=[x]):
    head_size_k = total_key_depth // num_heads
    head_size_v = total_value_depth // num_heads
    var_shape_k = [num_heads, memory_rows, head_size_k]
    var_shape_v = [num_heads, memory_rows, head_size_v]
    k = tf.get_variable(
        "k",
        var_shape_k,
        initializer=tf.random_normal_initializer(
            0, output_depth**-0.5)) * (num_heads**0.5)
    v = tf.get_variable(
        "v",
        var_shape_v,
        initializer=tf.random_normal_initializer(
            0, output_depth**-0.5)) * (output_depth**0.5)
    batch_size = tf.shape(x)[0]
    length = tf.shape(x)[1]
    q = common_layers.conv1d(x, total_key_depth, 1, name="q_transform")
    if dropout_rate:
      # This is a cheaper form of attention dropout where we use the same
      # dropout decisions across batch elements and query positions,
      # but different decisions across heads and memory positions.
      v = tf.nn.dropout(v, 1.0 - dropout_rate,
                        noise_shape=[num_heads, memory_rows, 1])
    # query is [batch, length, hidden_size]
    # reshape and transpose it to [heads, batch * length, head_size]
    q = tf.reshape(q, [batch_size, length, num_heads, head_size_k])
    q = tf.transpose(q, [2, 0, 1, 3])
    q = tf.reshape(q, [num_heads, batch_size * length, head_size_k])
    weights = tf.matmul(q, k, transpose_b=True)
    weights = tf.nn.softmax(weights)
    y = tf.matmul(weights, v)
    y = tf.reshape(y, [num_heads, batch_size, length, head_size_v])
    y = tf.transpose(y, [1, 2, 0, 3])
    y = tf.reshape(y, [batch_size, length, total_value_depth])
    y.set_shape([None, None, total_value_depth])
    y = common_layers.conv1d(y, output_depth, 1, name="output_transform")
    return y
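# A minimal usage sketch for parameter_attention, written in the same style as
# testConv1d above. It assumes the function is importable from a module named
# common_attention (that module name is an assumption, not given here). Shapes
# follow the docstring: input [batch, length_q, channels], output
# [batch, length_q, output_depth].
def testParameterAttention(self):
  x = np.random.rand(5, 7, 11)  # [batch=5, length_q=7, channels=11]
  with self.test_session() as session:
    y = common_attention.parameter_attention(
        tf.constant(x, dtype=tf.float32),
        total_key_depth=16,    # divisible by num_heads
        total_value_depth=16,  # divisible by num_heads
        output_depth=13,
        memory_rows=10,        # number of learned memory positions
        num_heads=4,
        dropout_rate=0.0)      # 0.0 skips the memory-value dropout branch
    session.run(tf.global_variables_initializer())
    res = session.run(y)
  self.assertEqual(res.shape, (5, 7, 13))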
def ffn_self_attention_layer(x,
                             filter_depth,
                             output_depth,
                             num_parts,
                             dropout_rate,
                             share_kv=False,
                             name=None):
  """Self-attention feedforward layer.

  We use self-attention to do feedforward computations. We apply this function
  positionwise: for each position, we linearly transform the input to depth
  filter_depth and break up the result depth-wise into num_parts contiguous
  parts. The parts self-attend, then we concatenate the results depth-wise and
  linearly transform to a depth of output_depth. The goal is to get
  multiplicative interactions between components of a representation.

  Args:
    x: a Tensor with shape [batch, length, channels]
    filter_depth: an integer
    output_depth: an integer
    num_parts: an integer dividing filter_depth
    dropout_rate: a floating point number
    share_kv: Share the key value transform
    name: an optional string

  Returns:
    A Tensor.
  """
  with tf.variable_scope(name, default_name="feedforward_self_attention",
                         values=[x]):
    x_shape = tf.shape(x)
    part_depth = filter_depth // num_parts
    if not share_kv:
      combined = common_layers.conv1d(
          x, filter_depth * 3, 1, name="qkv_transform")
      combined = tf.expand_dims(combined, axis=2)
      q, k, v = tf.split(combined, 3, axis=3)
    else:
      q = tf.expand_dims(common_layers.conv1d(
          x, filter_depth, 1, name="q_transform"), axis=2)
      kv_combined = tf.expand_dims(common_layers.conv1d(
          tf.concat([x, x], axis=1), filter_depth, 1, name="kv_transform"),
                                   axis=2)
      k, v = tf.split(kv_combined, [x_shape[1], x_shape[1]], axis=1)

    batch_q = tf.reshape(q, [-1, 1, num_parts, part_depth])
    batch_k = tf.reshape(k, [-1, 1, num_parts, part_depth])
    batch_v = tf.reshape(v, [-1, 1, num_parts, part_depth])

    batch_q *= part_depth**-0.5
    # non-masked bias
    bias = None
    x = dot_product_attention(batch_q, batch_k, batch_v, bias, dropout_rate)
    x = tf.reshape(x, [x_shape[0], x_shape[1], filter_depth])
    x = common_layers.conv1d(x, output_depth, 1, name="output_transform")
    return x
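# A minimal usage sketch for ffn_self_attention_layer, assuming the same
# hypothetical common_attention module as above. filter_depth must be
# divisible by num_parts; the output keeps the input batch and length
# dimensions and gets depth output_depth.
def testFfnSelfAttentionLayer(self):
  x = np.random.rand(5, 7, 11)  # [batch=5, length=7, channels=11]
  with self.test_session() as session:
    y = common_attention.ffn_self_attention_layer(
        tf.constant(x, dtype=tf.float32),
        filter_depth=16,   # split depth-wise into num_parts parts of depth 4
        output_depth=13,
        num_parts=4,
        dropout_rate=0.0)
    session.run(tf.global_variables_initializer())
    res = session.run(y)
  self.assertEqual(res.shape, (5, 7, 13))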
def multihead_attention(query_antecedent,
                        memory_antecedent,
                        bias,
                        total_key_depth,
                        total_value_depth,
                        output_depth,
                        num_heads,
                        dropout_rate,
                        summaries=False,
                        image_shapes=None,
                        name=None):
  """Multihead scaled-dot-product attention with input/output transformations.

  Args:
    query_antecedent: a Tensor with shape [batch, length_q, channels]
    memory_antecedent: a Tensor with shape [batch, length_m, channels]
    bias: bias Tensor (see attention_bias())
    total_key_depth: an integer
    total_value_depth: an integer
    output_depth: an integer
    num_heads: an integer dividing total_key_depth and total_value_depth
    dropout_rate: a floating point number
    summaries: a boolean
    image_shapes: optional tuple of integer scalars.
      see comments for attention_image_summary()
    name: an optional string

  Returns:
    A Tensor.
  """
  with tf.variable_scope(
      name,
      default_name="multihead_attention",
      values=[query_antecedent, memory_antecedent]):
    if memory_antecedent is None:
      # self attention
      combined = common_layers.conv1d(
          query_antecedent,
          total_key_depth * 2 + total_value_depth,
          1,
          name="qkv_transform")
      q, k, v = tf.split(
          combined, [total_key_depth, total_key_depth, total_value_depth],
          axis=2)
    else:
      q = common_layers.conv1d(
          query_antecedent, total_key_depth, 1, name="q_transform")
      combined = common_layers.conv1d(
          memory_antecedent,
          total_key_depth + total_value_depth,
          1,
          name="kv_transform")
      k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2)
    q = split_heads(q, num_heads)
    k = split_heads(k, num_heads)
    v = split_heads(v, num_heads)
    key_depth_per_head = total_key_depth // num_heads
    q *= key_depth_per_head**-0.5
    x = dot_product_attention(
        q, k, v, bias, dropout_rate, summaries, image_shapes)
    x = combine_heads(x)
    x = common_layers.conv1d(x, output_depth, 1, name="output_transform")
    return x
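# A minimal self-attention usage sketch for this version of
# multihead_attention (the one taking a `summaries` flag). Passing
# memory_antecedent=None selects the combined qkv_transform branch, and
# bias=None means no masking is applied. The common_attention module name is
# again an assumption.
def testMultiheadSelfAttention(self):
  x = np.random.rand(5, 7, 11)  # [batch=5, length_q=7, channels=11]
  with self.test_session() as session:
    y = common_attention.multihead_attention(
        tf.constant(x, dtype=tf.float32),
        memory_antecedent=None,  # self-attention: q, k, v all come from x
        bias=None,
        total_key_depth=16,
        total_value_depth=16,
        output_depth=13,
        num_heads=4,
        dropout_rate=0.0,
        summaries=False)
    session.run(tf.global_variables_initializer())
    res = session.run(y)
  self.assertEqual(res.shape, (5, 7, 13))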
def multihead_attention(query_antecedent,
                        memory_antecedent,
                        bias,
                        total_key_depth,
                        total_value_depth,
                        output_depth,
                        num_heads,
                        dropout_rate,
                        image_shapes=None,
                        attention_type="dot_product",
                        block_length=128,
                        name=None):
  """Multihead scaled-dot-product attention with input/output transformations.

  Args:
    query_antecedent: a Tensor with shape [batch, length_q, channels]
    memory_antecedent: a Tensor with shape [batch, length_m, channels]
    bias: bias Tensor (see attention_bias())
    total_key_depth: an integer
    total_value_depth: an integer
    output_depth: an integer
    num_heads: an integer dividing total_key_depth and total_value_depth
    dropout_rate: a floating point number
    image_shapes: optional tuple of integer scalars.
      see comments for attention_image_summary()
    attention_type: a string, either "dot_product" or "local_mask_right"
    block_length: an integer - relevant for "local_mask_right"
    name: an optional string

  Returns:
    A Tensor.

  Raises:
    ValueError: if the key depth or value depth are not divisible by the
      number of attention heads.
  """
  if total_key_depth % num_heads != 0:
    raise ValueError("Key depth (%d) must be divisible by the number of "
                     "attention heads (%d)." % (total_key_depth, num_heads))
  if total_value_depth % num_heads != 0:
    raise ValueError("Value depth (%d) must be divisible by the number of "
                     "attention heads (%d)." % (total_value_depth, num_heads))
  with tf.variable_scope(
      name,
      default_name="multihead_attention",
      values=[query_antecedent, memory_antecedent]):
    if memory_antecedent is None:
      # self attention
      combined = common_layers.conv1d(
          query_antecedent,
          total_key_depth * 2 + total_value_depth,
          1,
          name="qkv_transform")
      q, k, v = tf.split(
          combined, [total_key_depth, total_key_depth, total_value_depth],
          axis=2)
    else:
      q = common_layers.conv1d(
          query_antecedent, total_key_depth, 1, name="q_transform")
      combined = common_layers.conv1d(
          memory_antecedent,
          total_key_depth + total_value_depth,
          1,
          name="kv_transform")
      k, v = tf.split(combined, [total_key_depth, total_value_depth], axis=2)
    q = split_heads(q, num_heads)
    k = split_heads(k, num_heads)
    v = split_heads(v, num_heads)
    key_depth_per_head = total_key_depth // num_heads
    q *= key_depth_per_head**-0.5
    if attention_type == "dot_product":
      x = dot_product_attention(q, k, v, bias, dropout_rate, image_shapes)
    else:
      assert attention_type == "local_mask_right"
      x = masked_local_attention_1d(q, k, v, block_length=block_length)
    x = combine_heads(x)
    x = common_layers.conv1d(x, output_depth, 1, name="output_transform")
    return x
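# A sketch of the second multihead_attention signature, exercising the
# "local_mask_right" branch: rather than full dot-product attention, each
# position attends only to nearby earlier positions, with block_length
# controlling the block size. The length here is chosen divisible by
# block_length to keep the shapes simple; the common_attention module name is
# an assumption.
def testMultiheadLocalMaskRight(self):
  x = np.random.rand(5, 128, 11)  # [batch=5, length=128, channels=11]
  with self.test_session() as session:
    y = common_attention.multihead_attention(
        tf.constant(x, dtype=tf.float32),
        memory_antecedent=None,  # local masked attention is self-attention
        bias=None,               # masking is handled inside the local branch
        total_key_depth=16,
        total_value_depth=16,
        output_depth=13,
        num_heads=4,
        dropout_rate=0.0,
        attention_type="local_mask_right",
        block_length=32)
    session.run(tf.global_variables_initializer())
    res = session.run(y)
  self.assertEqual(res.shape, (5, 128, 13))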