Example #1
    def construct(self, input_ids, token_type_ids, input_mask):
        """Bidirectional Encoder Representations from Transformers."""
        # embedding
        embedding_tables = self.bert_embedding_lookup.embedding_table
        word_embeddings = self.bert_embedding_lookup(input_ids)
        embedding_output = self.bert_embedding_postprocessor(token_type_ids,
                                                             word_embeddings)

        # attention mask [batch_size, seq_length, seq_length]
        attention_mask = self._create_attention_mask_from_input_mask(input_mask)

        # bert encoder
        encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output),
                                           attention_mask)

        sequence_output = self.cast(encoder_output[self.last_idx], self.dtype)

        # pooler
        batch_size = P.Shape()(input_ids)[0]
        sequence_slice = self.slice(sequence_output,
                                    (0, 0, 0),
                                    (batch_size, 1, self.hidden_size),
                                    (1, 1, 1))
        first_token = self.squeeze_1(sequence_slice)
        pooled_output = self.dense(first_token)
        pooled_output = self.cast(pooled_output, self.dtype)

        return sequence_output, pooled_output, embedding_tables
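
The pooling step at the end of this construct can be hard to read from raw operators. Below is a minimal, self-contained sketch of the same first-token pooling written against the public MindSpore API; the toy shapes, the standalone layers, and the tanh activation are illustrative assumptions, not the model's actual configuration.

    import numpy as np
    import mindspore as ms
    import mindspore.nn as nn
    import mindspore.ops as ops

    # Toy shapes for illustration only.
    batch_size, seq_length, hidden_size = 2, 8, 16
    sequence_output = ms.Tensor(
        np.random.randn(batch_size, seq_length, hidden_size), ms.float32)

    # Take the first ([CLS]) token: (batch, 1, hidden) -> (batch, hidden).
    sequence_slice = ops.StridedSlice()(sequence_output,
                                        (0, 0, 0),
                                        (batch_size, 1, hidden_size),
                                        (1, 1, 1))
    first_token = ops.Squeeze(1)(sequence_slice)

    # BERT's pooler is a dense layer over the [CLS] representation.
    dense = nn.Dense(hidden_size, hidden_size, activation='tanh')
    pooled_output = dense(first_token)
    print(pooled_output.shape)  # (2, 16)
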
Example #2
    def construct(self, input_ids, input_mask, token_type_id):
        # Run the BERT backbone; only the per-token sequence output is needed.
        sequence_output, _, _ = self.bert(input_ids, token_type_id, input_mask)
        batch_size, seq_length, hidden_size = P.Shape()(sequence_output)
        # Flatten to (batch*seq, hidden) so one dense layer scores every token.
        sequence = P.Reshape()(sequence_output, (-1, hidden_size))
        logits = self.dense1(sequence)
        logits = P.Cast()(logits, self.dtype)
        # Restore (batch, seq, num_labels) and normalize to log-probabilities.
        logits = P.Reshape()(logits, (batch_size, seq_length, self.num_labels))
        logits = self.log_softmax(logits)
        return logits
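
The reshape-dense-reshape pattern above is the standard way to apply one classifier head to every token. Here is a minimal runnable sketch of the same pattern, with toy shapes and a freshly built dense layer standing in for self.dense1 (assumptions, not the original configuration):

    import numpy as np
    import mindspore as ms
    import mindspore.nn as nn
    import mindspore.ops as ops

    batch_size, seq_length, hidden_size, num_labels = 2, 8, 16, 5
    sequence_output = ms.Tensor(
        np.random.randn(batch_size, seq_length, hidden_size), ms.float32)

    dense1 = nn.Dense(hidden_size, num_labels)
    log_softmax = nn.LogSoftmax(axis=-1)

    # Flatten (batch, seq, hidden) -> (batch*seq, hidden) so a single dense
    # layer scores every token, then restore the sequence dimension.
    flat = ops.Reshape()(sequence_output, (-1, hidden_size))
    logits = ops.Reshape()(dense1(flat), (batch_size, seq_length, num_labels))
    log_probs = log_softmax(logits)
    print(log_probs.shape)  # (2, 8, 5)
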
Example #3
    def __init__(self,
                 params,
                 learning_rate=1e-3,
                 beta1=0.9,
                 beta2=0.999,
                 eps=1e-6,
                 weight_decay=0.0):
        super(AdamWeightDecayForBert, self).__init__(learning_rate, params,
                                                     weight_decay)
        _check_param_value(beta1, beta2, eps, self.cls_name)
        self.beta1 = ts.array([beta1], dtype=ts.float32)
        self.beta2 = ts.array([beta2], dtype=ts.float32)
        self.eps = ts.array([eps], dtype=ts.float32)
        # Per-parameter first- and second-moment buffers, zero-initialized.
        self.moments1 = self.parameters.clone(prefix="adam_m", init='zeros')
        self.moments2 = self.parameters.clone(prefix="adam_v", init='zeros')
        self.hyper_map = P.HyperMap()
        self.op_select = P.Select()
        self.op_cast = P.Cast()
        self.op_reshape = P.Reshape()
        self.op_shape = P.Shape()
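
For reference, the adam_m and adam_v buffers created here hold the running moments of the decoupled-weight-decay Adam update. The NumPy sketch below is a summary of that standard rule (written without bias correction, following the BERT-style formulation), not code taken from the snippet:

    import numpy as np

    def adamw_step(param, grad, m, v, lr=1e-3, beta1=0.9, beta2=0.999,
                   eps=1e-6, weight_decay=0.0):
        """One Adam step with decoupled weight decay, no bias correction."""
        m = beta1 * m + (1.0 - beta1) * grad          # first moment (adam_m)
        v = beta2 * v + (1.0 - beta2) * grad * grad   # second moment (adam_v)
        update = m / (np.sqrt(v) + eps)
        if weight_decay > 0.0:
            # Decay acts on the weights directly, not through the moments.
            update = update + weight_decay * param
        return param - lr * update, m, v

    w, m, v = np.ones(3), np.zeros(3), np.zeros(3)
    g = np.array([0.1, -0.2, 0.3])
    w, m, v = adamw_step(w, g, m, v, weight_decay=0.01)
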
Example #4
    def __init__(self,
                 length,
                 depth,
                 max_relative_position,
                 initializer_range,
                 use_one_hot_embeddings=False):
        super(RelaPosEmbeddingsGenerator, self).__init__()
        self.depth = depth
        # 2 * max_relative_position + 1 distinct clipped relative distances.
        self.vocab_size = max_relative_position * 2 + 1
        self.use_one_hot_embeddings = use_one_hot_embeddings

        # Learnable table holding one embedding row per relative distance.
        self.embeddings_table = Parameter(
            initializer(TruncatedNormal(initializer_range),
                        [self.vocab_size, self.depth]))

        self.relative_positions_matrix = RelaPosMatrixGenerator(length=length,
                                                                max_relative_position=max_relative_position)
        self.reshape = P.Reshape()
        self.one_hot = layers.OneHot(depth=self.vocab_size)
        self.shape = P.Shape()
        self.gather = P.Gather()  # index_select
        self.matmul = P.BatchMatMul()
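
To see how these pieces fit together, the sketch below gathers rows of a small embedding table using a clipped relative-distance matrix. The NumPy construction of that matrix is an assumption about what RelaPosMatrixGenerator produces; only the Gather call mirrors the snippet directly.

    import numpy as np
    import mindspore as ms
    import mindspore.ops as ops

    length, max_relative_position, depth = 4, 2, 8
    vocab_size = max_relative_position * 2 + 1  # 5 distinct clipped distances

    # Clipped relative-distance matrix, shifted into [0, vocab_size) so it
    # can index the table.
    pos = np.arange(length)
    distance = np.clip(pos[None, :] - pos[:, None],
                       -max_relative_position, max_relative_position)
    rel_index = ms.Tensor(distance + max_relative_position, ms.int32)

    table = ms.Tensor(np.random.randn(vocab_size, depth), ms.float32)
    embeddings = ops.Gather()(table, rel_index, 0)
    print(embeddings.shape)  # (4, 4, 8)
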
Example #5
    def construct(self, *args):
        weights = self.weights
        # Forward pass to compute the loss.
        loss = self.network(*args)
        # Initial gradient (sensitivity) tensor: same dtype and shape as the
        # loss, filled with the scaling value self.sens.
        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
        grads = self.grad(self.network, weights)(*args, sens)
        # depend ties the optimizer update to the returned loss so the update
        # is executed before the loss is handed back.
        return P.depend(loss, self.optimizer(grads))
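
This construct is the classic MindSpore train-one-step pattern, which the framework also ships ready-made as nn.TrainOneStepCell. A minimal sketch using that public API with a toy network (the Dense/MSELoss stand-ins are illustrative assumptions, not the BERT training cell):

    import numpy as np
    import mindspore as ms
    import mindspore.nn as nn

    # Tiny stand-in network; real BERT training swaps in its own loss cell.
    net = nn.Dense(4, 1)
    net_with_loss = nn.WithLossCell(net, nn.MSELoss())
    optimizer = nn.Adam(net.trainable_params(), learning_rate=1e-3)

    # nn.TrainOneStepCell packages the same steps as the construct above:
    # fill a sens tensor shaped like the loss, differentiate the network with
    # respect to its weights, apply the optimizer, and return the loss with a
    # dependency on the parameter update.
    train_step = nn.TrainOneStepCell(net_with_loss, optimizer, sens=1.0)

    x = ms.Tensor(np.random.randn(8, 4), ms.float32)
    y = ms.Tensor(np.random.randn(8, 1), ms.float32)
    loss = train_step(x, y)
    print(float(loss.asnumpy()))
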