import tensorflow as tf


def fast_dot_product_attention(q, k, v, bias, dropout_rate=None, name=None):
    """Fast dot-product attention for the special case where the query
    length is 1 (e.g. one step of incremental decoding).

    :param q: A tensor with shape [batch, heads, 1, depth_k]
    :param k: A tensor with shape [batch, heads, length_kv, depth_k]
    :param v: A tensor with shape [batch, heads, length_kv, depth_v]
    :param bias: An optional tensor broadcastable to the logits shape
    :param dropout_rate: An optional float in [0, 1)
    :param name: An optional string
    :returns: A tensor with shape [batch, heads, 1, depth_v]
    """
    with tf.variable_scope(name, default_name="dot_product_attention",
                           values=[q, k, v]):
        # A broadcasted multiply-and-sum replaces the usual matmul for the
        # single query: [batch, heads, 1, length_kv]
        logits = tf.expand_dims(tf.reduce_sum(q * k, axis=3), axis=2)

        if bias is not None:
            logits += bias

        weights = tf.nn.softmax(logits, name="attention_weights")

        if dropout_rate is not None and dropout_rate > 0.0:
            weights = tf.nn.dropout(weights, 1.0 - dropout_rate)

        # Reshape weights to [batch, heads, length_kv, 1] so they broadcast
        # against v in the weighted sum below
        weights_shape = infer_shape(weights)
        new_shape = weights_shape[:-2]
        new_shape.append(weights_shape[-1])
        new_shape.append(1)
        weights = tf.reshape(weights, new_shape)

        # [batch, heads, 1, depth_v]
        return tf.expand_dims(tf.reduce_sum(weights * v, axis=2), axis=2)
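# The helpers in this section all rely on an infer_shape utility defined
# elsewhere in the repository. A minimal sketch of the assumed behavior:
# return a Python list holding the static dimension where it is known at
# graph-construction time and the dynamic tf.shape entry otherwise.
def infer_shape(x):
    x = tf.convert_to_tensor(x)

    # Fall back to a fully dynamic shape when the rank itself is unknown
    if x.shape.dims is None:
        return tf.shape(x)

    static_shape = x.shape.as_list()
    dynamic_shape = tf.shape(x)

    return [static_shape[i] if static_shape[i] is not None
            else dynamic_shape[i] for i in range(len(static_shape))]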
def restore_variables(checkpoint):
    """Builds an op that restores trainable variables from `checkpoint`.

    Variables found in the checkpoint are assigned their saved values;
    trainable variables missing from it are reset to zeros.
    """
    if tf.train.latest_checkpoint(checkpoint) is None:
        return tf.no_op(name="restore_op")

    # Load checkpoint values into a name -> tensor dictionary
    tf.logging.info("Loading %s" % checkpoint)
    var_list = tf.train.list_variables(checkpoint)
    reader = tf.train.load_checkpoint(checkpoint)
    values = {}

    for (name, shape) in var_list:
        tensor = reader.get_tensor(name)
        name = name.split(":")[0]
        values[name] = tensor

    ops = []

    for var in tf.trainable_variables():
        name = var.name.split(":")[0]

        if name in values:
            ops.append(tf.assign(var, values[name]))
        else:
            # Not found in the checkpoint: zero-initialize instead
            ops.append(tf.assign(var, tf.zeros(infer_shape(var))))

    return tf.group(*ops, name="restore_op")
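# Hypothetical usage sketch (the directory name and session setup are
# illustrative, not part of the original code): the restore op is built
# once and run after variable initialization.
def _demo_restore(checkpoint_dir="checkpoints"):
    restore_op = restore_variables(checkpoint_dir)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(restore_op)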
def combine_heads(x, name=None):
    """Combine heads.

    :param x: A tensor with shape [batch, heads, length, channels]
    :param name: An optional string
    :returns: A tensor with shape [batch, length, heads * channels]
    """
    with tf.name_scope(name, default_name="combine_heads", values=[x]):
        # [batch, length, heads, channels]
        x = tf.transpose(x, [0, 2, 1, 3])
        x_shape = infer_shape(x)
        a, b = x_shape[-2:]
        # Merge heads and channels into a single dimension
        return tf.reshape(x, x_shape[:-2] + [a * b])
def linear(input_data, output_size, bias=True, dtype=None, scope=None):
    """Computes output = input_data * W + b.

    :param input_data: A tensor with shape [..., input_size]
    :param output_size: An integer
    :param bias: A boolean, whether to add a bias term
    :param dtype: An optional instance of tf.DType
    :param scope: An optional string
    :returns: A tensor with shape [..., output_size]
    """
    with tf.variable_scope(scope, default_name="linear"):
        input_shape = infer_shape(input_data)
        input_size = input_shape[-1]
        output_shape = tf.concat([input_shape[:-1], [output_size]], axis=0)

        W = tf.get_variable("W", shape=[input_size, output_size], dtype=dtype)
        # Flatten all leading dimensions into one batch dimension for matmul
        output = tf.matmul(tf.reshape(input_data, [-1, input_size]), W)

        if bias:
            b = tf.get_variable("b", shape=[output_size], dtype=dtype)
            output = output + b

        return tf.reshape(output, output_shape)
def split_heads(x, num_heads, name=None):
    """Split heads.

    :param x: A tensor with shape [batch, length, channels]
    :param num_heads: An integer
    :param name: An optional string
    :returns: A tensor with shape [batch, heads, length, channels / heads]
    """
    with tf.name_scope(name, default_name="split_heads", values=[x]):
        x_shape = infer_shape(x)
        m = x_shape[-1]

        # channels must divide evenly among heads (checkable only when static)
        if isinstance(m, int) and isinstance(num_heads, int):
            assert m % num_heads == 0

        return tf.transpose(
            tf.reshape(x, x_shape[:-1] + [num_heads, m // num_heads]),
            [0, 2, 1, 3])
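# Sanity-check sketch: combine_heads inverts split_heads, so the pair
# round-trips a [batch, length, channels] tensor. The shapes and head
# count here are illustrative assumptions, not taken from the original code.
def _demo_split_combine():
    x = tf.random_normal([2, 10, 512])   # [batch, length, channels]
    h = split_heads(x, 8)                # [2, 8, 10, 64]
    return combine_heads(h)              # back to [2, 10, 512]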
def layer_norm(input_data, epsilon=1e-6, dtype=None, scope=None):
    """Applies layer normalization over the last dimension of input_data."""
    with tf.variable_scope(scope, default_name="layer_norm"):
        input_size = infer_shape(input_data)[-1]

        scale = tf.get_variable("scale", shape=[input_size], dtype=dtype,
                                initializer=tf.ones_initializer())
        bias = tf.get_variable("bias", shape=[input_size], dtype=dtype,
                               initializer=tf.zeros_initializer())

        # Normalize over the feature (last) dimension
        mean = tf.reduce_mean(input_data, axis=-1, keepdims=True)
        variance = tf.reduce_mean(tf.square(input_data - mean),
                                  axis=-1, keepdims=True)
        input_norm = (input_data - mean) * tf.rsqrt(variance + epsilon)

        return input_norm * scale + bias
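# End-to-end sketch wiring the helpers above into one incremental-decoding
# attention step. All names, shapes, and hyperparameters here are
# illustrative assumptions, not part of the original code.
def _demo_attention_step():
    batch, length_kv, hidden, heads = 2, 10, 512, 8

    query = tf.random_normal([batch, 1, hidden])           # one decode step
    memory = tf.random_normal([batch, length_kv, hidden])  # attended states

    q = split_heads(linear(query, hidden, scope="q_transform"), heads)
    k = split_heads(linear(memory, hidden, scope="k_transform"), heads)
    v = split_heads(linear(memory, hidden, scope="v_transform"), heads)

    # [batch, heads, 1, hidden / heads]
    context = fast_dot_product_attention(q, k, v, bias=None)

    # [batch, 1, hidden]
    return layer_norm(combine_heads(context), scope="output_norm")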