Example #1
  def __init__(self,
               attention_type,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               num_rand_blocks=3,
               block_size=64,
               use_bias=True,
               seed=None,
               name=None):
    """Constructor of an encoder layer of a transformer in Pegasus style.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability applied to the
        layer outputs.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      block_size: (optional) int. Size of each block in the sequence.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      seed: (optional) int. Random seed for generating the random attention
        mask.
      name: The name scope of this layer.
    """
    super(PrenormEncoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layer
    attention_head_size = hidden_size // num_attention_heads
    self.attn_layer = attention.MultiHeadedAttentionLayer(
        attention_type, num_attention_heads, num_rand_blocks,
        attention_head_size, initializer_range, block_size, block_size,
        attention_probs_dropout_prob, use_bias, seed, name="self")

    # Dense layers
    self.projection_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layer
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
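
For reference, a minimal instantiation sketch (hypothetical, illustrative values only; it assumes the class is exposed from an `encoder` module as in the BigBird reference layout):

# Hypothetical usage sketch: one Pegasus-style prenorm encoder layer with
# BigBird block-sparse attention. All values below are illustrative.
from bigbird.core import encoder

layer = encoder.PrenormEncoderLayer(
    attention_type="block_sparse",
    hidden_size=768,
    intermediate_size=3072,
    num_attention_heads=12,
    num_rand_blocks=3,
    block_size=64,
    use_bias=True,
    seed=0,  # fixes the random attention pattern for this layer
    name="encoder_layer_0")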
Example #2
  def __init__(self,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               use_bias=True,
               name=None):
    """Constructor of a decoder layer of a transformer in Pegasus style.

    Args:
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability applied to the
        layer outputs.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      name: The name scope of this layer.
    """
    super(PrenormDecoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layers
    attention_head_size = hidden_size // num_attention_heads
    self.self_attn_layer = attention.MultiHeadedAttentionLayer(
        "original_full", use_bias=use_bias, name="self",
        num_attention_heads=num_attention_heads,
        size_per_head=attention_head_size,
        initializer_range=initializer_range,
        attention_probs_dropout_prob=attention_probs_dropout_prob)
    self.cross_attn_layer = attention.MultiHeadedAttentionLayer(
        "original_full", use_bias=use_bias, name="encdec",
        num_attention_heads=num_attention_heads,
        size_per_head=attention_head_size,
        initializer_range=initializer_range,
        attention_probs_dropout_prob=attention_probs_dropout_prob)

    # Dense layers
    self.self_proj_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.cross_proj_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layers
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
    self.third_layer_norm = utils.NormLayer()
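
A similar hedged sketch for the decoder layer (assuming it is exposed from a `decoder` module; values are illustrative). The decoder layer always uses full attention internally, so no sparse-attention arguments are needed:

# Hypothetical usage sketch: one Pegasus-style prenorm decoder layer with
# full self-attention and full cross-attention.
from bigbird.core import decoder

dec_layer = decoder.PrenormDecoderLayer(
    hidden_size=768,
    intermediate_size=3072,
    attention_probs_dropout_prob=0.0,
    hidden_dropout_prob=0.1,
    num_attention_heads=12,
    use_bias=True,
    name="decoder_layer_0")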
Example #3
  def __init__(self,
               attention_type,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               num_rand_blocks=3,
               block_size=64,
               use_bias=True,
               seed=None,
               name=None):
    """Constructor of an encoder layer of a transformer in BERT style.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability applied to the
        layer outputs.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      block_size: (optional) int. Size of each block in the sequence.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      seed: (optional) int. Random seed for generating the random attention
        mask.
      name: The name scope of this layer.
    """
    super(PostnormEncoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layer definition
    attention_head_size = hidden_size // num_attention_heads  # per-head size for the 12-way multi-head attention
    self.attn_layer = attention.MultiHeadedAttentionLayer(
        attention_type, num_attention_heads, num_rand_blocks,  # block_sparse, 12, 3
        attention_head_size, initializer_range, block_size, block_size,  # 64, 0.01, 16, 16
        attention_probs_dropout_prob, use_bias, seed, name="self")  # 0.01, True, seed grows with the encoder layer index (0~11)

    # Dense layers: refine the attention output by 1) projecting, 2) expanding,
    # and 3) contracting the features
    # 1) layer that projects the multi-head attention output
    self.projection_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,  # 12, 64
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    # 2) expansion layer (hidden_size -> intermediate_size)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    # 3) contraction layer (intermediate_size -> hidden_size); extracts the final features
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layer
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
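
The annotations above hint at how a stack of these layers is configured: each layer gets its own seed so the random attention blocks differ between layers. A hedged sketch of that pattern (module path and values are assumptions based on the comments, not taken from this snippet):

# Hypothetical sketch: stack 12 BERT-style postnorm encoder layers, giving
# each layer a distinct seed (0..11) for its random attention mask.
from bigbird.core import encoder

encoder_layers = [
    encoder.PostnormEncoderLayer(
        attention_type="block_sparse",
        hidden_size=768,
        num_attention_heads=12,
        num_rand_blocks=3,
        block_size=16,            # the annotated run used block_size=16
        initializer_range=0.01,   # and initializer_range=0.01
        seed=layer_idx,           # seed grows with the encoder layer index
        name="layer_%d" % layer_idx)
    for layer_idx in range(12)
]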