Code example #1
def dense_relu_dense(inputs,
                     filter_size,  # 2048 in the default transformer
                     output_size,  # equals hidden_size, 512 in the default transformer
                     output_activation=None,
                     dropout=0.0,
                     dropout_broadcast_dims=None,
                     layer_collection=None,
                     name=None):
  """Hidden layer with RELU activation followed by linear projection."""
  # layer_name is appended with "conv1" or "conv2" in this method only for
  # historical reasons. These are in fact dense layers.
  layer_name = "%s_{}" % name if name else "{}"
  h = blocked_dense(
      inputs,
      filter_size,
      use_bias=True,
      activation=tf.nn.relu,
      layer_collection=layer_collection,
      name=layer_name.format("conv1"),
      input_output_size_ratio=float(output_size)/filter_size)  # 0.25 (512/2048) with the default sizes

  if dropout != 0.0:
    h = dropout_with_broadcast_dims(
        h, 1.0 - dropout, broadcast_dims=dropout_broadcast_dims)
  o = blocked_dense(
      h,
      output_size,
      activation=output_activation,
      use_bias=True,
      layer_collection=layer_collection,
      name=layer_name.format("conv2"),
      input_output_size_ratio=float(filter_size)/output_size)  # 4.0 (2048/512) with the default sizes
  return o
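Every example on this page routes its dropout through common_layers.dropout_with_broadcast_dims (called unqualified here, presumably because this snippet comes from inside common_layers itself). For reference, a minimal sketch of what such a helper amounts to, assuming the tensor2tensor-style signature (x, keep_prob, broadcast_dims=None) seen above: it forwards to tf.nn.dropout with a noise_shape that is 1 along the broadcast dimensions, so one dropout mask is shared along those axes and memory is saved. This is an illustrative sketch, not the library's exact code, and is written TF1-style to match the examples.

import tensorflow.compat.v1 as tf

def dropout_with_broadcast_dims_sketch(x, keep_prob, broadcast_dims=None, **kwargs):
  """Dropout that shares its random mask along `broadcast_dims` (illustrative)."""
  if broadcast_dims:
    shape = tf.shape(x)
    ndims = len(x.get_shape())
    # Allow negative axes such as -1 for the last dimension.
    broadcast_dims = [dim + ndims if dim < 0 else dim for dim in broadcast_dims]
    # A 1 in noise_shape means one shared keep/drop decision along that axis.
    kwargs["noise_shape"] = [
        1 if i in broadcast_dims else shape[i] for i in range(ndims)]
  return tf.nn.dropout(x, keep_prob, **kwargs)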
Code example #2
File: autoencoders.py, Project: qixiuai/tensor2tensor
 def dropout(self, x):
   is_training = self.hparams.mode == tf.estimator.ModeKeys.TRAIN
   hparams = self.hparams
   if hparams.dropout <= 0.0 or not is_training:
     return x
   warm_step = hparams.bottleneck_warmup_steps * 2**hparams.num_hidden_layers
   dropout = common_layers.inverse_lin_decay(warm_step // 2) * hparams.dropout
   return common_layers.dropout_with_broadcast_dims(
       x, 1.0 - dropout, broadcast_dims=[-1])
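The dropout rate in this autoencoder method is not constant: inverse_lin_decay ramps from close to 0 up to 1.0 over the given number of steps, so the effective rate only approaches hparams.dropout after a warm-up that grows with the number of hidden layers. A rough sketch of such a linearly warmed-up rate (my own illustration, not the library function; it assumes a TF1-style global step is available):

import tensorflow.compat.v1 as tf

def linear_warmup_dropout_rate(max_dropout, warmup_steps, min_value=0.01):
  """Dropout rate that ramps linearly from ~0 to `max_dropout` over `warmup_steps`."""
  step = tf.to_float(tf.train.get_or_create_global_step())
  progress = tf.minimum(step / float(warmup_steps), 1.0)
  # Keep a small floor so early steps still apply a tiny amount of dropout.
  ramp = min_value + (1.0 - min_value) * progress
  return ramp * max_dropout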
Code example #3
def mlp(feature, hparams, name="mlp"):
    """Multi layer perceptron with dropout and relu activation."""
    with tf.variable_scope(name, "mlp", values=[feature]):
        num_mlp_layers = hparams.num_mlp_layers
        mlp_dim = hparams.mlp_dim
        for i in range(num_mlp_layers):
            feature = common_layers.dense(feature,
                                          mlp_dim,
                                          activation=tf.nn.relu)
            feature = common_layers.dropout_with_broadcast_dims(
                feature, keep_prob=1 - hparams.dropout, name="layer_%i" % (i))
        return feature
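Note that this mlp calls dropout_with_broadcast_dims without broadcast_dims, so it behaves like ordinary dropout on each layer's output. A possible way to invoke it, in TF1 graph mode like the rest of this page, assuming mlp and common_layers are already imported; the hparams object below is a stand-in carrying only the three fields the function reads:

from types import SimpleNamespace
import tensorflow.compat.v1 as tf

# Hypothetical hparams holder; mlp() only reads these three fields.
hparams = SimpleNamespace(num_mlp_layers=2, mlp_dim=512, dropout=0.1)
features = tf.random_normal([32, 128])   # [batch, feature_dim]
outputs = mlp(features, hparams)         # [batch, mlp_dim]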
Code example #4
    def body(self, features):
        hparams = self._hparams
        shape = common_layers.shape_list(features["targets"])
        # Run the basic autoencoder part first.
        basic_result, losses = super(AutoencoderAutoregressive,
                                     self).body(features)
        # Prepare inputs for autoregressive modes.
        targets_keep_prob = 1.0 - hparams.autoregressive_dropout
        targets_dropout = common_layers.dropout_with_broadcast_dims(
            features["targets"], targets_keep_prob, broadcast_dims=[-1])
        targets1d = tf.reshape(targets_dropout, [shape[0], -1, shape[3]])
        targets_shifted = common_layers.shift_right_3d(targets1d)
        basic1d = tf.reshape(basic_result, [shape[0], -1, shape[3]])
        concat1d = tf.concat([basic1d, targets_shifted], axis=-1)
        # The forget_base hparam sets purely-autoregressive mode, no autoencoder.
        if hparams.autoregressive_forget_base:
            concat1d = tf.reshape(features["targets"],
                                  [shape[0], -1, shape[3]])
            concat1d = common_layers.shift_right_3d(concat1d)
        # The autoregressive part depends on the mode.
        if hparams.autoregressive_mode == "none":
            assert not hparams.autoregressive_forget_base
            return basic_result, losses
        if hparams.autoregressive_mode == "conv3":
            res = common_layers.conv1d(concat1d,
                                       shape[3],
                                       3,
                                       padding="LEFT",
                                       activation=common_layers.belu,
                                       name="autoregressive_conv3")
            return tf.reshape(res, shape), losses
        if hparams.autoregressive_mode == "conv5":
            res = common_layers.conv1d(concat1d,
                                       shape[3],
                                       5,
                                       padding="LEFT",
                                       activation=common_layers.belu,
                                       name="autoregressive_conv5")
            return tf.reshape(res, shape), losses
        if hparams.autoregressive_mode == "sru":
            res = common_layers.conv1d(concat1d,
                                       shape[3],
                                       3,
                                       padding="LEFT",
                                       activation=common_layers.belu,
                                       name="autoregressive_sru_conv3")
            res = common_layers.sru(res)
            return tf.reshape(res, shape), losses

        raise ValueError("Unsupported autoregressive mode: %s" %
                         hparams.autoregressive_mode)
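Here broadcast_dims=[-1] makes the dropout decision shared across the last (channel) dimension, so each target position is either kept or zeroed as a whole vector rather than channel by channel. The same effect can be seen with tf.nn.dropout's noise_shape directly, which is what the broadcast helper reduces to (illustrative TF1-style snippet):

import tensorflow.compat.v1 as tf

x = tf.ones([2, 5, 8])                       # [batch, length, channels]
# A 1 in the channel axis of noise_shape gives one keep/drop decision per
# position, broadcast over all 8 channels (what broadcast_dims=[-1] reduces to).
dropped = tf.nn.dropout(x, keep_prob=0.5, noise_shape=[2, 5, 1])
# Each position ends up either all zeros or all 2.0 (= 1 / keep_prob).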
Code example #5
def dense_relu_dense(inputs,
                     filter_size,
                     output_size,
                     output_activation=None,
                     dropout=0.0,
                     dropout_broadcast_dims=None,
                     sparsity_technique=None,
                     threshold=3.0,
                     clip_alpha=None,
                     training=True,
                     name=None,
                     initial_sparsity=None):
  """Hidden layer with RELU activation followed by linear projection."""
  layer_fn = common_layers.dense
  if sparsity_technique:
    layer_fn = functools.partial(
        common_sparse.dense,
        sparsity_technique=sparsity_technique,
        threshold=threshold,
        training=training,
        clip_alpha=clip_alpha,
        initial_sparsity=initial_sparsity)

  layer_name = "%s_{}" % name if name else "{}"
  h = layer_fn(
      inputs,
      filter_size,
      use_bias=True,
      activation=tf.nn.relu,
      name=layer_name.format("conv1"))

  if dropout != 0.0:
    h = common_layers.dropout_with_broadcast_dims(
        h, 1.0 - dropout, broadcast_dims=dropout_broadcast_dims)
  o = layer_fn(
      h,
      output_size,
      activation=output_activation,
      use_bias=True,
      name=layer_name.format("conv2"))
  return o
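The only difference from the plain version is how the layer function is chosen: functools.partial pre-binds the sparsity-specific arguments of common_sparse.dense so the rest of the code can keep calling layer_fn with the ordinary dense signature. A small self-contained sketch of that currying pattern (the stand-in functions and the "variational_dropout" value below are illustrative, not the project's real layers):

import functools

def dense_stub(x, units, activation=None, use_bias=True, name=None):
  # Stand-in for common_layers.dense; just reports how it was called.
  return ("dense", name, units, activation, use_bias)

def sparse_dense_stub(x, units, sparsity_technique=None, threshold=3.0,
                      training=True, clip_alpha=None, initial_sparsity=None,
                      **kwargs):
  # Stand-in for common_sparse.dense; the extra arguments are pre-bound below.
  return ("sparse", sparsity_technique, threshold) + dense_stub(x, units, **kwargs)

layer_fn = dense_stub
sparsity_technique = "variational_dropout"   # illustrative value
if sparsity_technique:
  layer_fn = functools.partial(
      sparse_dense_stub,
      sparsity_technique=sparsity_technique,
      threshold=3.0,
      training=True)

# The body of dense_relu_dense can now call layer_fn exactly like plain dense:
h = layer_fn(None, 2048, activation="relu", use_bias=True, name="ffn_conv1")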
Code example #6
def quaternion_dense_relu_dense(inputs,
                                filter_size,
                                output_size,
                                output_activation=None,
                                dropout=0.0,
                                dropout_broadcast_dims=None,
                                layer_collection=None,
                                name=None):
  """Quaternion hidden layer with RELU activation followed by linear projection."""
  # layer_name is appended with "conv1" or "conv2" in this method only for
  # historical reasons. These are in fact dense layers.
  layer_name = "%s_{}" % name if name else "{}"
  h = quarternion_ffn_3d(inputs, filter_size,
                         name=layer_name.format('qconv1'),
                         activation=tf.nn.relu)

  if dropout != 0.0:
    h = common_layers.dropout_with_broadcast_dims(
        h, 1.0 - dropout, broadcast_dims=dropout_broadcast_dims)
  o = quarternion_ffn_3d(h, output_size,
                         name=layer_name.format('qconv2'),
                         activation=output_activation)
  return o
Code example #7
def graph_attention(q,
                    k,
                    v,
                    bias,
                    dropout_rate=0.0,
                    image_shapes=None,
                    name=None,
                    make_image_summary=True,
                    save_weights_to=None,
                    dropout_broadcast_dims=None,
                    adjacency_matrix=None,
                    num_edge_types=5):
    """graph attention.

  Args:
    q: a Tensor with shape [batch, heads, length_q, depth_k]
    k: a Tensor with shape [batch, heads, length_kv, depth_k]
    v: a Tensor with shape [batch, heads, length_kv, depth_v]
    bias: bias Tensor (see attention_bias())
    dropout_rate: a floating point number
    image_shapes: optional tuple of integer scalars.
      see comments for attention_image_summary()
    name: an optional string
    make_image_summary: True if you want an image summary.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    dropout_broadcast_dims:  an optional list of integers less than 4
      specifying in which dimensions to broadcast the dropout decisions.
      saves memory.
    adjacency_matrix: optional matrix of [batch, length, length] ids indicating
      edge type
    num_edge_types: an int indicating number of edge types
  Returns:
    A Tensor of shape [batch, length, depth(q)]
  """
    with tf.variable_scope(name,
                           default_name="dot_product_attention",
                           values=[q, k, v]) as scope:
        # [batch, num_heads, query_length, memory_length]
        logits = tf.matmul(q, k, transpose_b=True)
        if adjacency_matrix is not None:
            key_head_depth = common_layers.shape_list(q)[-1]
            adjacency_vectors = make_edge_vectors(adjacency_matrix,
                                                  num_edge_types,
                                                  key_head_depth,
                                                  name=name)
            # transposing q to be [batch, length_q, heads, depth_k]
            # to allow for matmul with [batch, length_q, length_q, depth_k]
            q_t = tf.transpose(q, [0, 2, 1, 3])
            adj_logits = tf.matmul(q_t, adjacency_vectors, transpose_b=True)
            logits += tf.transpose(adj_logits, [0, 2, 1, 3])
            # [batch, depth, num_nodes, num_nodes]
        if bias is not None:
            logits += bias
        weights = tf.nn.softmax(logits, name="attention_weights")
        if save_weights_to is not None:
            save_weights_to[scope.name] = weights
        # dropping out the attention links for each of the heads
        weights = common_layers.dropout_with_broadcast_dims(
            weights, 1.0 - dropout_rate, broadcast_dims=dropout_broadcast_dims)
        if common_layers.should_generate_summaries() and make_image_summary:
            common_attention.attention_image_summary(weights, image_shapes)
        return tf.matmul(weights, v)
Code example #8
def dot_product_area_attention(q,
                               k,
                               v,
                               bias,
                               dropout_rate=0.0,
                               image_shapes=None,
                               name=None,
                               attention_image_summary=None,
                               save_weights_to=None,
                               dropout_broadcast_dims=None,
                               max_area_width=1,
                               max_area_height=1,
                               memory_height=1,
                               area_key_mode="mean",
                               area_value_mode="sum",
                               top_k_areas=0,
                               area_temperature=1.0,
                               training=True):
    """Dot-product area attention.

  Args:
    q: Tensor with shape [..., length_q, depth_k].
    k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must
      match with q.
    v: Tensor with shape [..., length_kv, depth_v] Leading dimensions must
      match with q.
    bias: bias Tensor (see attention_bias())
    dropout_rate: a float.
    image_shapes: optional tuple of integer scalars.
      see comments for attention_image_summary()
    name: an optional string
    attention_image_summary: the callback for making image summary of attention.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    dropout_broadcast_dims: an optional list of integers less than rank of q.
      Specifies in which dimensions to broadcast the dropout decisions.
    max_area_width: the max width allowed for an area.
    max_area_height: the max height allowed for an area.
    memory_height: the height of the memory.
    area_key_mode: the mode for computing area keys, which can be "mean",
      "concat", "sum", "sample_concat", and "sample_sum".
    area_value_mode: the mode for computing area values, which can be either
      "mean", or "sum".
    top_k_areas: if positive, keep only this many of the largest area weights
      for attention and renormalize them.
    area_temperature: the temperature for attention softmax.
    training: indicating if it is in the training mode.
  Returns:
    Tensor with shape [..., length_q, depth_v].
  """

    tf.logging.info(
        "dot_product_area_attention: "
        "area_h=%d, area_w=%d, mem_h=%d, "
        "area_key_mode=%s, area_value_mode=%s, "
        "area_temperature=%f", max_area_height, max_area_width, memory_height,
        area_key_mode, area_value_mode, area_temperature)
    with tf.variable_scope(name,
                           default_name="dot_product_area_attention",
                           values=[q, k, v]) as scope:
        mem_shape = common_layers.shape_list(k)
        batch_size = mem_shape[0]
        head_size = mem_shape[1]
        length = mem_shape[2]
        depth = mem_shape[3]
        k_area = compute_area_key(tf.reshape(k, [-1, length, depth]),
                                  max_area_width=max_area_width,
                                  max_area_height=max_area_height,
                                  height=memory_height,
                                  mode=area_key_mode,
                                  training=training)
        if area_value_mode == "mean":
            v_area, _, _, _, _ = compute_area_features(
                tf.reshape(v, [-1, length, depth]),
                max_area_width=max_area_width,
                max_area_height=max_area_height,
                height=memory_height)
        elif area_value_mode == "max":
            v_area, _, _ = basic_pool(tf.reshape(v, [-1, length, depth]),
                                      max_area_width=max_area_width,
                                      max_area_height=max_area_height,
                                      height=memory_height,
                                      fn=tf.reduce_max)
        elif area_value_mode == "sum":
            _, _, v_area, _, _ = compute_area_features(
                tf.reshape(v, [-1, length, depth]),
                max_area_width=max_area_width,
                max_area_height=max_area_height,
                height=memory_height)
        else:
            raise ValueError("Unsupported area value mode=%s" %
                             area_value_mode)
        k = tf.reshape(k_area, [batch_size, head_size, -1, depth])
        v = tf.reshape(v_area, [batch_size, head_size, -1, depth])
        logits = tf.matmul(q, k,
                           transpose_b=True)  # [..., length_q, length_kv]
        if bias is not None:
            bias = common_layers.cast_like(bias, logits)
            with tf.name_scope("compute_area_att_bias", values=[bias]):
                bias_shape = common_layers.shape_list(bias)
                mem_length = bias_shape[-1]
                bias_values = tf.reshape(tf.to_float(tf.less(bias, -1)),
                                         [-1, mem_length, 1])
                _, _, padding_sum, _, _ = compute_area_features(
                    bias_values,
                    max_area_width=max_area_width,
                    max_area_height=max_area_height,
                    height=memory_height)
                bias = tf.where(tf.cast(tf.to_int32(padding_sum), tf.bool),
                                tf.fill(tf.shape(padding_sum), -np.inf),
                                tf.zeros_like(padding_sum, dtype=tf.float32))
                bias = tf.reshape(
                    bias, [bias_shape[0], bias_shape[1], bias_shape[2], -1])
            logits += bias
        logits = logits / area_temperature
        weights = tf.nn.softmax(logits, name="attention_weights")
        if top_k_areas > 0:
            tf.logging.info("area_attention top_k_areas=%d", top_k_areas)
            top_k = tf.minimum(
                common_layers.shape_list(weights)[-1], top_k_areas)
            top_weights, _ = tf.nn.top_k(weights, k=top_k)
            min_values = tf.reduce_min(top_weights, -1, keepdims=True)
            weights = tf.where(tf.greater_equal(weights, min_values), weights,
                               tf.zeros_like(weights))
            weights = tf.div(weights, tf.reduce_sum(weights, -1,
                                                    keepdims=True))
        if save_weights_to is not None:
            save_weights_to[scope.name] = weights
            save_weights_to[scope.name + "/logits"] = logits
        # Drop out attention links for each head.
        weights = common_layers.dropout_with_broadcast_dims(
            weights, 1.0 - dropout_rate, broadcast_dims=dropout_broadcast_dims)
        if (common_layers.should_generate_summaries()
                and attention_image_summary):
            attention_image_summary(weights, image_shapes)
        return tf.matmul(weights, v)
Code example #9
File: mtsa.py, Project: taoshen58/mtsa
def dot_product_attention_mtsa(
    q,
    k,
    v,
    bias,
    dropout_rate=0.0,
    image_shapes=None,
    name=None,
    make_image_summary=True,
    save_weights_to=None,
    dropout_broadcast_dims=None,
    use_k_mtsa=True,
    afn_extra='none',
    afn_dot='exp',
    afn_multi='exp',
    bias_start=0.,
    bi_direction=False,
):
    """Dot-product attention.

  Args:
    q: Tensor with shape [..., length_q, depth_k].
    k: Tensor with shape [..., length_kv, depth_k]. Leading dimensions must
      match with q.
    v: Tensor with shape [..., length_kv, depth_v] Leading dimensions must
      match with q.
    bias: bias Tensor (see attention_bias())
    dropout_rate: a float.
    image_shapes: optional tuple of integer scalars.
      see comments for attention_image_summary()
    name: an optional string
    make_image_summary: True if you want an image summary.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    dropout_broadcast_dims: an optional list of integers less than rank of q.
      Specifies in which dimensions to broadcast the dropout decisions.
    use_k_mtsa: if True, compute the multi-dim (source2token) logits from k
      rather than from v.
    afn_extra: name of an optional extra activation applied to the multi-dim
      logits through a second dense layer ('none' disables it).
    afn_dot: name of the activation applied to the token2token dot-product
      logits.
    afn_multi: name of the activation applied to the multi-dim logits; a
      'scaled*' name additionally scales them by 1/sqrt(depth_v).
    bias_start: initial bias value for the multi-head dense layers.
    bi_direction: if True, half of the heads are masked to attend only to
      earlier positions and the other half only to later positions.

  Returns:
    Tensor with shape [..., length_q, depth_v].
  """
    print("!!!!!dot_product_attention_mtsa!!!!!")
    with tf.variable_scope(name,
                           default_name="dot_product_attention",
                           values=[q, k, v]) as scope:
        # get dim
        dim_q = q.get_shape().as_list()[-1]
        dim_k = k.get_shape().as_list()[-1]
        dim_v = v.get_shape().as_list()[-1]
        # prepare
        multi_logits_scale_factor = (
            1. / math.sqrt(dim_v) if afn_multi.startswith('scaled') else 1.)
        afn_extra, afn_dot, afn_multi = (afn_name2fn(afn_extra),
                                         afn_name2fn(afn_dot),
                                         afn_name2fn(afn_multi))
        # if bias is not None:
        #   inp_mask_1d = tf.to_float(tf.equal(bias, 0.))  # bs,1,1,vl
        #   inp_mask_1d = tf.transpose(inp_mask_1d, [0, 1, 3, 2])   # bs,1,vl,1
        # else:
        #   inp_mask_1d = None

        # token2token self attention
        dot_logits = tf.matmul(q, k, transpose_b=True)  # bs,hd,ql,vl
        if bias is not None:
            bias = common_layers.cast_like(bias, dot_logits)  # 1/bs,1,ql/1,vl
            dot_logits += bias
        e_dot_logits = afn_dot(dot_logits)  # bs,hd,ql,vl
        if bi_direction:
            head_num = v.get_shape().as_list()[1]
            ql, vl = tf.shape(q)[-2], tf.shape(v)[-2]
            assert head_num is not None
            assert head_num % 2 == 0
            ones_mat = tf.ones([ql, vl], tf.float32)
            mul_mask_fw = tf.matrix_band_part(ones_mat, -1,
                                              0)  #  Lower triangular part.
            mul_mask_bw = tf.matrix_band_part(ones_mat, 0,
                                              -1)  #  Upper triangular part.
            mul_mask_fw_tile = tf.tile(tf.expand_dims(mul_mask_fw, 0),
                                       [head_num // 2, 1, 1])
            mul_mask_bw_tile = tf.tile(tf.expand_dims(mul_mask_bw, 0),
                                       [head_num // 2, 1, 1])
            mul_mask = tf.expand_dims(tf.concat(
                [mul_mask_fw_tile, mul_mask_bw_tile], axis=0),
                                      axis=0)
            e_dot_logits *= mul_mask

        # source2token self-attention
        multi_logits = multi_head_dense_layer(
            k if use_k_mtsa else v, dim_v, True,
            bias_start if afn_extra is None else 0., 'multi_logits1')
        if afn_extra is not None:  # use one extra layer for multi-dim
            multi_logits = multi_head_dense_layer(afn_extra(multi_logits),
                                                  dim_v, True, bias_start,
                                                  'multi_logits2')
        e_multi_logits = afn_multi(multi_logits *
                                   multi_logits_scale_factor)  # bs,hd,vl,vd
        # if inp_mask_1d is not None:  # use mask for exp_logits
        #   e_multi_logits *= inp_mask_1d

        # mtsa
        accum_z_deno = tf.matmul(e_dot_logits, e_multi_logits)  # bs,hd,ql,vd
        accum_z_deno = tf.where(  # in case of NaN and Inf
            tf.greater(accum_z_deno, tf.zeros_like(accum_z_deno)),
            accum_z_deno, tf.ones_like(accum_z_deno))

        # attention dropout
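        # Each factor gets keep probability sqrt(1 - dropout_rate); since the two
        # factors are multiplied together below, their product is roughly kept
        # with probability (1 - dropout_rate), as in ordinary attention dropout.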
        e_dot_logits = common_layers.dropout_with_broadcast_dims(
            e_dot_logits,
            math.sqrt(1. - dropout_rate),
            broadcast_dims=dropout_broadcast_dims)
        e_multi_logits = common_layers.dropout_with_broadcast_dims(
            e_multi_logits,
            math.sqrt(1. - dropout_rate),
            broadcast_dims=dropout_broadcast_dims)
        rep_mul_score = v * e_multi_logits  # bs,hd,vl,vd
        accum_rep_mul_score = tf.matmul(e_dot_logits,
                                        rep_mul_score)  # bs,hd,ql,vd
        # calculate the final attention results
        attn_res = accum_rep_mul_score / accum_z_deno
        # if inp_mask_1d is not None:  # use mask for output
        #   attn_res *= inp_mask_1d

        # ============ for vis =======
        weights = e_dot_logits / (tf.reduce_sum(
            e_dot_logits, axis=-1, keepdims=True, name="attention_weights") +
                                  0.00001)
        if save_weights_to is not None:
            save_weights_to[scope.name] = weights
            save_weights_to[scope.name + "/logits"] = dot_logits
        if common_layers.should_generate_summaries() and make_image_summary:
            common_attention.attention_image_summary(weights, image_shapes)
        return attn_res
Code example #10
def graph_attention(q,
                    k,
                    v,
                    bias,
                    dropout_rate=0.0,
                    image_shapes=None,
                    name=None,
                    make_image_summary=True,
                    save_weights_to=None,
                    dropout_broadcast_dims=None,
                    adjacency_matrix=None,
                    num_edge_types=5):
  """graph attention.

  Args:
    q: a Tensor with shape [batch, heads, length_q, depth_k]
    k: a Tensor with shape [batch, heads, length_kv, depth_k]
    v: a Tensor with shape [batch, heads, length_kv, depth_v]
    bias: bias Tensor (see attention_bias())
    dropout_rate: a floating point number
    image_shapes: optional tuple of integer scalars.
      see comments for attention_image_summary()
    name: an optional string
    make_image_summary: True if you want an image summary.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    dropout_broadcast_dims:  an optional list of integers less than 4
      specifying in which dimensions to broadcast the dropout decisions.
      saves memory.
    adjacency_matrix: optional matrix of [batch, length, length] ids indicating
      edge type
    num_edge_types: an int indicating number of edge types
  Returns:
    A Tensor of shape [batch, length, depth(q)]
  """
  with tf.variable_scope(
      name, default_name="dot_product_attention", values=[q, k, v]) as scope:
    # [batch, num_heads, query_length, memory_length]
    logits = tf.matmul(q, k, transpose_b=True)
    if adjacency_matrix is not None:
      key_head_depth = common_layers.shape_list(q)[-1]
      adjacency_vectors = make_edge_vectors(
          adjacency_matrix,
          num_edge_types,
          key_head_depth,
          name=name)
      # transposing q to be [batch, length_q, heads, depth_k]
      # to allow for matmul with [batch, length_q, length_q, depth_k]
      q_t = tf.transpose(q, [0, 2, 1, 3])
      adj_logits = tf.matmul(q_t, adjacency_vectors, transpose_b=True)
      logits += tf.transpose(adj_logits, [0, 2, 1, 3])
      # [batch, depth, num_nodes, num_nodes]
    if bias is not None:
      logits += bias
    weights = tf.nn.softmax(logits, name="attention_weights")
    if save_weights_to is not None:
      save_weights_to[scope.name] = weights
    # dropping out the attention links for each of the heads
    weights = common_layers.dropout_with_broadcast_dims(
        weights, 1.0 - dropout_rate, broadcast_dims=dropout_broadcast_dims)
    if common_layers.should_generate_summaries() and make_image_summary:
      common_attention.attention_image_summary(weights, image_shapes)
    return tf.matmul(weights, v)