def __init__(self,
             attention_type,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             num_rand_blocks=3,
             block_size=64,
             use_bias=True,
             seed=None,
             name=None):
  """Constructor of an encoder layer of a transformer in Pegasus style.

  Args:
    attention_type: Type of attention, needs to be one of ['original_full',
      'simulated_sparse', 'block_sparse'].
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for intermediate layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the fully
      connected layers.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    num_rand_blocks: (optional) int. Number of random chunks per row.
    block_size: (optional) int. Size of a block in the sequence.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    seed: (optional) int. Random seed for generating the random mask.
    name: The name scope of this layer.
  """
  super(PrenormEncoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Attention layer
  attention_head_size = hidden_size // num_attention_heads
  self.attn_layer = attention.MultiHeadedAttentionLayer(
      attention_type, num_attention_heads, num_rand_blocks,
      attention_head_size, initializer_range, block_size, block_size,
      attention_probs_dropout_prob, use_bias, seed, name="self")

  # Dense layers
  self.projection_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
def __init__(self,
             hidden_size,
             vocab_size,
             embeder,
             initializer=None,
             activation_fn=None,
             name="cls/predictions"):
  super(MaskedLMLayer, self).__init__(name=name)
  self.hidden_size = hidden_size
  self.vocab_size = vocab_size
  self.embeder = embeder

  # We apply one more non-linear transformation before the output layer.
  # This matrix is not used after pre-training.
  self.extra_layer = utils.Dense2dLayer(
      hidden_size, hidden_size, initializer, activation_fn, "transform")
  self.norm_layer = utils.NormLayer(hidden_size, name="transform")

  # The output weights are the same as the input embeddings, but there is
  # an output-only bias for each token.
  self.output_bias = tf.compat.v1.get_variable(
      name + "/output_bias",
      shape=[vocab_size],
      initializer=tf.zeros_initializer())
def __init__(self, params): name = "encoder" super(EncoderStack, self).__init__(name=name) self.params = params if params["norm_type"] == "prenorm": # layer norm type을 설졍 encoder_class = PrenormEncoderLayer elif params["norm_type"] == "postnorm": # 기본 postnorm encoder 사 encoder_class = PostnormEncoderLayer else: raise NotImplementedError( "Norm type {} is not implemented".format(params["norm_type"])) # Encoder layers self.encoder_layers = [ encoder_class( # pylint: disable=g-complex-comprehension self.params["attention_type"], # block_sparse attention type 설정 self.params["hidden_size"], # 768 self.params["intermediate_size"], # intermediate_size utils.get_activation(self.params["hidden_act"]), # gelu activation function self.params["attention_probs_dropout_prob"], # 0.1 self.params["hidden_dropout_prob"], # 0.1 self.params["initializer_range"], # 0.02 self.params["num_attention_heads"], # num_attention_heads self.params["num_rand_blocks"], # rand block : 3 self.params["block_size"], # 16 self.params["use_bias"], # True seed=layer_idx, name="layer_%d" % layer_idx) for layer_idx in range(self.params["num_hidden_layers"]) # 개 encoder 12개를 list에 담음 ] # Normalization layer self.layer_norm = utils.NormLayer()
def __init__(self, params): name = "encoder" super(EncoderStack, self).__init__(name=name) self.params = params if params["norm_type"] == "prenorm": encoder_class = PrenormEncoderLayer elif params["norm_type"] == "postnorm": encoder_class = PostnormEncoderLayer else: raise NotImplementedError("Norm type {} is not implemented".format( params["norm_type"])) # Encoder layers self.encoder_layers = [ encoder_class( # pylint: disable=g-complex-comprehension self.params["attention_type"], self.params["hidden_size"], self.params["intermediate_size"], utils.get_activation(self.params["hidden_act"]), self.params["attention_probs_dropout_prob"], self.params["hidden_dropout_prob"], self.params["initializer_range"], self.params["num_attention_heads"], self.params["num_rand_blocks"], self.params["block_size"], self.params["use_bias"], seed=layer_idx, name="layer_%d" % layer_idx) for layer_idx in range(self.params["num_hidden_layers"]) ] # Normalization layer self.layer_norm = utils.NormLayer()
def __init__(self, params): if params["couple_encoder_decoder"]: name = "encoder" super(DecoderStack, self).__init__(name=name) else: name = "decoder" super(DecoderStack, self).__init__(name=name) self.params = params if params["norm_type"] == "prenorm": decoder_class = PrenormDecoderLayer elif params["norm_type"] == "postnorm": decoder_class = PostnormDecoderLayer else: raise NotImplementedError("Norm type {} is not implemented".format( params["norm_type"])) if params["use_gradient_checkpointing"]: decoder_class = add_gradient_recomputation(decoder_class) if self.params.get("num_decoder_layers", None) is not None: num_hidden_layers = self.params["num_decoder_layers"] else: num_hidden_layers = self.params["num_hidden_layers"] with tf.compat.v1.variable_scope(name): # Decoder layers self.decoder_layers = [ decoder_class( # pylint: disable=g-complex-comprehension self.params["hidden_size"], self.params["intermediate_size"], utils.get_activation(self.params["hidden_act"]), self.params["attention_probs_dropout_prob"], self.params["hidden_dropout_prob"], self.params["initializer_range"], self.params["num_attention_heads"], self.params["use_bias"], name="layer_%d" % layer_idx) for layer_idx in range(num_hidden_layers) ] # Normalization layer self.layer_norm = utils.NormLayer(self.params["hidden_size"])
def __init__(self, params): if params["couple_encoder_decoder"]: # encoder 같다 name = "encoder" with tf.compat.v1.variable_scope( name, reuse=tf.compat.v1.AUTO_REUSE) as scope: super(DecoderStack, self).__init__(name=name, _scope=scope) else: name = "decoder" super(DecoderStack, self).__init__(name=name) self.params = params if params["norm_type"] == "prenorm": # decoder prenorm 선택 decoder_class = PrenormDecoderLayer elif params["norm_type"] == "postnorm": decoder_class = PostnormDecoderLayer else: raise NotImplementedError( "Norm type {} is not implemented".format(params["norm_type"])) if self.params.get("num_decoder_layers", None) is not None: # decoder number layer 를 설정했다 num_hidden_layers = self.params["num_decoder_layers"] else: num_hidden_layers = self.params["num_hidden_layers"]# 하지 않았다면 기존 number layer 사 # Decoder layers self.decoder_layers = [ decoder_class( # pylint: disable=g-complex-comprehension self.params["hidden_size"], self.params["intermediate_size"], utils.get_activation(self.params["hidden_act"]), self.params["attention_probs_dropout_prob"], self.params["hidden_dropout_prob"], self.params["initializer_range"], self.params["num_attention_heads"], self.params["use_bias"], name="layer_%d" % layer_idx) for layer_idx in range(num_hidden_layers) ] # Normalization layer self.layer_norm = utils.NormLayer()
def __init__(self,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             use_bias=True,
             name=None):
  """Constructor of a decoder layer of a transformer in Pegasus style.

  Args:
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for intermediate layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the fully
      connected layers.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    name: The name scope of this layer.
  """
  super(PrenormDecoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Attention layers
  attention_head_size = hidden_size // num_attention_heads
  self.self_attn_layer = attention.MultiHeadedAttentionLayer(
      "original_full", use_bias=use_bias, name="self",
      num_attention_heads=num_attention_heads,
      size_per_head=attention_head_size,
      initializer_range=initializer_range,
      attention_probs_dropout_prob=attention_probs_dropout_prob)
  self.cross_attn_layer = attention.MultiHeadedAttentionLayer(
      "original_full", use_bias=use_bias, name="encdec",
      num_attention_heads=num_attention_heads,
      size_per_head=attention_head_size,
      initializer_range=initializer_range,
      attention_probs_dropout_prob=attention_probs_dropout_prob)

  # Dense layers
  self.self_proj_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.cross_proj_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
  self.third_layer_norm = utils.NormLayer()
def __init__(self,
             attention_type,
             hidden_size=768,
             intermediate_size=3072,
             intermediate_act_fn=utils.gelu,
             attention_probs_dropout_prob=0.0,
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             num_attention_heads=12,
             num_rand_blocks=3,
             block_size=64,
             use_bias=True,
             seed=None,
             name=None):
  """Constructor of an encoder layer of a transformer in BERT style.

  Args:
    attention_type: Type of attention, needs to be one of ['original_full',
      'simulated_sparse', 'block_sparse'].
    hidden_size: (optional) int. Size of hidden dimension.
    intermediate_size: (optional) int. Size of intermediate dimension.
    intermediate_act_fn: (optional) Activation function for intermediate layer.
    attention_probs_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    hidden_dropout_prob: (optional) float. Dropout probability of the fully
      connected layers.
    initializer_range: (optional) float. Range of the weight initializer.
    num_attention_heads: (optional) int. Number of attention heads.
    num_rand_blocks: (optional) int. Number of random chunks per row.
    block_size: (optional) int. Size of a block in the sequence.
    use_bias: (optional) bool. Whether key/query/value uses a bias vector.
    seed: (optional) int. Random seed for generating the random mask.
    name: The name scope of this layer.
  """
  super(PostnormEncoderLayer, self).__init__(name=name)
  self.hidden_dropout_prob = hidden_dropout_prob

  # Attention layer
  attention_head_size = hidden_size // num_attention_heads  # per-head size for the 12 attention heads
  self.attn_layer = attention.MultiHeadedAttentionLayer(
      attention_type, num_attention_heads, num_rand_blocks,  # block_sparse, 12, 3
      attention_head_size, initializer_range, block_size, block_size,  # 64, 0.02, 16, 16
      attention_probs_dropout_prob, use_bias, seed, name="self")  # 0.1, True, seed grows with the encoder layer index (0-11)

  # Dense layers: refine the attention output by 1) projecting, 2) expanding,
  # and 3) contracting the features.
  # 1) layer that projects the multi-head attention output
  self.projection_layer = utils.Dense3dProjLayer(
      num_attention_heads, attention_head_size,  # 12, 64
      utils.create_initializer(initializer_range), None, "dense", use_bias)
  # 2) expansion layer (hidden_size -> intermediate_size)
  self.expand_layer = utils.Dense2dLayer(
      intermediate_size, utils.create_initializer(initializer_range),
      intermediate_act_fn, "dense")
  # 3) contraction layer (intermediate_size -> hidden_size)
  self.contract_layer = utils.Dense2dLayer(
      hidden_size, utils.create_initializer(initializer_range),
      None, "dense")

  # Normalization layers
  self.first_layer_norm = utils.NormLayer()
  self.second_layer_norm = utils.NormLayer()
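The constructor above only defines the sublayers; the "1) project, 2) expand, 3) contract" comment describes how they are chained in the forward pass. As a generic, runnable illustration of the standard BERT-style post-norm ordering with stock Keras layers (a sketch of the pattern, not the repo's PostnormEncoderLayer.call):

import tensorflow as tf

# Generic post-norm block built from standard Keras layers; it mirrors the
# project -> expand -> contract structure above but is only an illustration.
hidden_size, intermediate_size, num_heads = 768, 3072, 12

attn = tf.keras.layers.MultiHeadAttention(num_heads=num_heads,
                                           key_dim=hidden_size // num_heads)
expand = tf.keras.layers.Dense(intermediate_size, activation="gelu")
contract = tf.keras.layers.Dense(hidden_size)
norm1 = tf.keras.layers.LayerNormalization()
norm2 = tf.keras.layers.LayerNormalization()
dropout = tf.keras.layers.Dropout(0.1)

x = tf.random.normal([2, 128, hidden_size])  # (batch, seq_len, hidden)
attn_out = dropout(attn(x, x))               # self-attention + output projection
x = norm1(x + attn_out)                      # post-norm: LayerNorm after the residual
ffn_out = dropout(contract(expand(x)))       # expand -> contract feed-forward
x = norm2(x + ffn_out)                       # post-norm again after the FFN residual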
def __init__(self,
             hidden_size,
             vocab_size,
             embeder,
             input_tensor,
             initializer=None,
             activation_fn=None,
             name="cls/predictions",
             label_ids=None,
             label_weights=None,
             masked_lm_positions=None):
  super(MaskedLMLayer, self).__init__(name=name)
  self.hidden_size = hidden_size  # hidden size, 768
  self.vocab_size = vocab_size    # vocabulary size, 50358
  self.embeder = embeder          # embedding layer

  # We apply one more non-linear transformation before the output layer.
  # This matrix is not used after pre-training.
  self.extra_layer = utils.Dense2dLayer(  # extra non-linear transform (gelu activation)
      hidden_size, initializer, activation_fn, "transform")
  self.norm_layer = utils.NormLayer("transform")  # normalization layer

  # The output weights are the same as the input embeddings, but there is
  # an output-only bias for each token.
  self.output_bias = tf.compat.v1.get_variable(
      name + "/output_bias",
      shape=[vocab_size],  # vocab size 50358
      initializer=tf.zeros_initializer())

  if masked_lm_positions is not None:
    # Gather the hidden states at the masked LM positions.
    input_tensor = tf.gather(input_tensor, masked_lm_positions, batch_dims=1)

  # We apply one more non-linear transformation before the output layer.
  # This matrix is not used after pre-training.
  with tf.compat.v1.variable_scope("transform") as sc:
    # Linear transform + gelu activation, e.g. shape (4, 75, 768).
    input_tensor = self.extra_layer(input_tensor, scope=sc)
    input_tensor = self.norm_layer(input_tensor, scope=sc)  # apply layer normalization

  # The output weights are the same as the input embeddings, but there is
  # an output-only bias for each token.
  # Reuse the embedding weights as output weights: (4, 75, 768) -> (4, 75, 50358).
  logits = self.embeder.linear(input_tensor)
  logits = tf.nn.bias_add(logits, self.output_bias)  # add the output bias
  self.log_probs = tf.nn.log_softmax(logits, axis=-1)  # apply log softmax

  if label_ids is not None:
    # One-hot encode the labels over the vocabulary.
    one_hot_labels = tf.one_hot(
        label_ids, depth=self.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(
        self.log_probs * one_hot_labels, axis=-1)  # per-example negative log-likelihood
    numerator = tf.reduce_sum(
        label_weights * per_example_loss)  # count only the real (non-padding) predictions
    denominator = tf.reduce_sum(label_weights) + 1e-5  # sum of the label weights
    self.loss = numerator / denominator  # weighted average loss
  else:
    self.loss = tf.constant(0.0)
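To make the weighted masked-LM loss at the end concrete, here is a minimal self-contained sketch of the same computation on toy tensors (shapes and values are invented for illustration):

import tensorflow as tf

# Toy shapes: batch=2, max_predictions=3, vocab=5 (illustrative only).
logits = tf.random.normal([2, 3, 5])
label_ids = tf.constant([[1, 4, 0], [2, 0, 0]])            # trailing 0s are padding predictions
label_weights = tf.constant([[1., 1., 0.], [1., 0., 0.]])  # 1.0 for real predictions, 0.0 for padding

log_probs = tf.nn.log_softmax(logits, axis=-1)
one_hot_labels = tf.one_hot(label_ids, depth=5, dtype=tf.float32)

# Negative log-likelihood per prediction, then a weighted mean that ignores padding.
per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=-1)
loss = tf.reduce_sum(label_weights * per_example_loss) / (tf.reduce_sum(label_weights) + 1e-5)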