def construct(self, input_ids, input_mask, token_type_id):
     """Encode the inputs and score every position against the label set.

     The inputs are forwarded to ``self.bert`` in the order
     ``(input_ids, token_type_id, input_mask)``; only the first of its three
     outputs is used.

     Returns:
         The result of ``self.log_softmax`` applied to logits reshaped to
         ``(batch_size, seq_length, self.num_labels)``.
     """
     encoded, _, _ = self.bert(input_ids, token_type_id, input_mask)
     batch_size, seq_length, hidden_size = P.Shape()(encoded)
     # Merge the leading axes so self.dense1 receives a 2-D tensor.
     flat_encoded = P.Reshape()(encoded, (-1, hidden_size))
     projected = P.Cast()(self.dense1(flat_encoded), self.dtype)
     # Restore the (batch, sequence, label) layout before self.log_softmax.
     per_position = P.Reshape()(projected, (batch_size, seq_length, self.num_labels))
     return self.log_softmax(per_position)
Example #2
0
    def __init__(self, config):
        """Initialise CreateAttentionMaskFromInputMask.

        Args:
            config: Configuration object; only ``config.seq_length`` is read
                here.
        """
        super().__init__()
        self.input_mask = None

        # Operators are instantiated once and kept on the instance.
        self.cast = P.Cast()
        self.reshape = P.Reshape()
        # Target shape with a singleton middle axis: (-1, 1, seq_length).
        seq_length = config.seq_length
        self.shape = (-1, 1, seq_length)
Example #3
0
 def __init__(self, is_training=True):
     """Initialise CrossEntropyCalculation.

     Args:
         is_training: Flag stored unchanged on the instance.
     """
     super().__init__()

     # One-hot operator together with its float32 fill values (1.0 / 0.0).
     self.onehot = P.OneHot()
     self.on_value = Tensor(1.0, ts.float32)
     self.off_value = Tensor(0.0, ts.float32)

     # Reduction operators.
     self.reduce_sum = P.ReduceSum()
     self.reduce_mean = P.ReduceMean()

     self.reshape = P.Reshape()
     # Single-element shape/axis tuple holding -1.
     self.last_idx = (-1, )
     self.neg = P.Neg()
     self.cast = P.Cast()

     self.is_training = is_training
Example #4
0
    def __init__(self, length, max_relative_position):
        """Initialise RelaPosMatrixGenerator.

        Args:
            length: Stored as ``_length``; also used to derive
                ``range_length = -length + 1``.
            max_relative_position: Stored as the upper bound; its negation
                is stored as the lower bound.
        """
        super().__init__()

        # Scalar settings.
        self._length = length
        self._max_relative_position = max_relative_position
        self._min_relative_position = -max_relative_position
        self.range_length = -length + 1

        # Operators; note that ``range_mat`` holds a Reshape operator.
        self.tile = P.Tile()
        self.range_mat = P.Reshape()
        self.sub = P.Sub()
        self.expanddims = P.ExpandDims()
        self.cast = P.Cast()
 def __init__(self, config, is_training, num_labels=11, use_crf=False, dropout_prob=0.0,
              use_one_hot_embeddings=False):
     """Initialise BertNERModel: an encoder followed by a dense projection.

     Args:
         config: Model configuration. Reads ``initializer_range``, ``dtype``,
             ``hidden_size``, ``compute_type`` and ``seq_length``. When
             ``is_training`` is false the object is modified in place so that
             both dropout probabilities are 0.0.
         is_training: Whether the model is built for training; forwarded to
             the encoder.
         num_labels: Output width of the dense projection.
         use_crf: Flag stored unchanged on the instance.
         dropout_prob: ``1 - dropout_prob`` is passed to ``layers.Dropout``.
         use_one_hot_embeddings: Forwarded to the encoder.
     """
     super(BertNERModel, self).__init__()
     if not is_training:
         # Turn off both dropouts outside training. The attention dropout
         # attribute is ``attention_probs_dropout_prob``; the previous
         # ``hidden_probs_dropout_prob`` spelling set an attribute that
         # nothing reads, leaving attention dropout untouched.
         config.hidden_dropout_prob = 0.0
         config.attention_probs_dropout_prob = 0.0
     self.bert = Bert(config, is_training, use_one_hot_embeddings)
     self.cast = P.Cast()
     self.weight_init = TruncatedNormal(config.initializer_range)
     self.log_softmax = P.LogSoftmax(axis=-1)
     self.dtype = config.dtype
     self.num_labels = num_labels
     # Projection from hidden_size to num_labels, run in config.compute_type.
     self.dense_1 = layers.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init,
                             has_bias=True).to_float(config.compute_type)
     # NOTE(review): the argument is presumably a keep probability - confirm
     # against the Dropout API of the installed framework version.
     self.dropout = layers.Dropout(1 - dropout_prob)
     self.reshape = P.Reshape()
     self.shape = (-1, config.hidden_size)
     self.use_crf = use_crf
     self.origin_shape = (-1, config.seq_length, self.num_labels)
Example #6
0
 def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999,
              eps=1e-6, weight_decay=0.0):
     """Initialise AdamWeightDecayForBert.

     Args:
         params: Forwarded to the parent constructor.
         learning_rate: Forwarded to the parent constructor.
         beta1: Checked by ``_check_param_value``, then stored as a
             one-element float32 array.
         beta2: Handled like ``beta1``.
         eps: Handled like ``beta1``.
         weight_decay: Forwarded to the parent constructor.
     """
     super().__init__(learning_rate, params, weight_decay)
     _check_param_value(beta1, beta2, eps, self.cls_name)

     def _as_float32(value):
         # Wrap a scalar in a one-element float32 array.
         return ts.array([value], dtype=ts.float32)

     self.beta1 = _as_float32(beta1)
     self.beta2 = _as_float32(beta2)
     self.eps = _as_float32(eps)

     # Two zero-initialised clones of the parameter collection, one per
     # name prefix.
     self.moments1 = self.parameters.clone(prefix="adam_m", init='zeros')
     self.moments2 = self.parameters.clone(prefix="adam_v", init='zeros')

     self.hyper_map = P.HyperMap()
     self.op_select = P.Select()
     self.op_cast = P.Cast()
     self.op_reshape = P.Reshape()
     self.op_shape = P.Shape()
Example #7
0
 def __init__(self, vocab_size, embedding_size, embedding_shape,
              use_one_hot_embeddings=False, initializer_range=0.02):
     """Initialise EmbeddingLookup.

     Args:
         vocab_size: Number of rows in the embedding table.
         embedding_size: Number of columns in the embedding table.
         embedding_shape: Iterable converted to a tuple and stored as
             ``shape``.
         use_one_hot_embeddings: Flag stored unchanged on the instance.
         initializer_range: Argument passed to ``TruncatedNormal`` when
             initialising the table.
     """
     super().__init__()
     self.vocab_size = vocab_size
     self.use_one_hot_embeddings = use_one_hot_embeddings

     # Table of shape [vocab_size, embedding_size].
     table_init = initializer(TruncatedNormal(initializer_range),
                              [vocab_size, embedding_size])
     self.embedding_table = Parameter(table_init)

     self.expand = P.ExpandDims()
     self.shape_flat = (-1,)
     self.gather = P.Gather()

     # One-hot operator with float32 fill values, plus a MatMul operator.
     self.one_hot = P.OneHot()
     self.on_value = Tensor(1.0, ts.float32)
     self.off_value = Tensor(0.0, ts.float32)
     self.array_mul = P.MatMul()

     self.reshape = P.Reshape()
     self.shape = tuple(embedding_shape)
Example #8
0
    def __init__(self, seq_length, hidden_size,
                 num_attention_heads=12,
                 attention_probs_dropout_prob=0.1,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 hidden_dropout_prob=0.1,
                 use_relative_positions=False,
                 compute_type=ts.float32):
        """Initialise BertSelfAttention: an attention block plus output block.

        Args:
            seq_length: Used as both source and target sequence length of
                the attention block.
            hidden_size: Used as both tensor widths of the attention block
                and as the channel count of the output block.
            num_attention_heads: Number of heads; must divide ``hidden_size``.
            attention_probs_dropout_prob: Forwarded to the attention block.
            use_one_hot_embeddings: Forwarded to the attention block.
            initializer_range: Forwarded to both sub-blocks.
            hidden_dropout_prob: Forwarded to the output block.
            use_relative_positions: Forwarded to the attention block.
            compute_type: Forwarded to both sub-blocks.

        Raises:
            ValueError: If ``hidden_size`` is not a multiple of
                ``num_attention_heads``.
        """
        super().__init__()

        remainder = hidden_size % num_attention_heads
        if remainder != 0:
            raise ValueError("The hidden size (%d) is not a multiple of the number "
                             "of attention heads (%d)" % (hidden_size, num_attention_heads))

        head_width = int(hidden_size / num_attention_heads)
        self.size_per_head = head_width

        # The attention block always uses a mask and returns a 2-D tensor.
        self.attention = BertAttention(
            from_tensor_width=hidden_size,
            to_tensor_width=hidden_size,
            from_seq_length=seq_length,
            to_seq_length=seq_length,
            num_attention_heads=num_attention_heads,
            size_per_head=head_width,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=initializer_range,
            use_relative_positions=use_relative_positions,
            has_attention_mask=True,
            do_return_2d_tensor=True,
            compute_type=compute_type)

        self.output = BertOutput(
            in_channels=hidden_size,
            out_channels=hidden_size,
            initializer_range=initializer_range,
            dropout_prob=hidden_dropout_prob,
            compute_type=compute_type)

        self.reshape = P.Reshape()
        self.shape = (-1, hidden_size)
Example #9
0
    def __init__(self, length, depth, max_relative_position, initializer_range,
                 use_one_hot_embeddings=False):
        """Initialise RelaPosEmbeddingsGenerator.

        Args:
            length: Forwarded to ``RelaPosMatrixGenerator``.
            depth: Number of columns in the embeddings table.
            max_relative_position: Forwarded to ``RelaPosMatrixGenerator``;
                the table has ``max_relative_position * 2 + 1`` rows.
            initializer_range: Argument passed to ``TruncatedNormal`` when
                initialising the table.
            use_one_hot_embeddings: Flag stored unchanged on the instance.
        """
        super().__init__()

        table_rows = max_relative_position * 2 + 1
        self.depth = depth
        self.vocab_size = table_rows
        self.use_one_hot_embeddings = use_one_hot_embeddings

        # Table of shape [vocab_size, depth].
        table_init = initializer(TruncatedNormal(initializer_range),
                                 [table_rows, depth])
        self.embeddings_table = Parameter(table_init)

        self.relative_positions_matrix = RelaPosMatrixGenerator(
            length=length,
            max_relative_position=max_relative_position)

        self.reshape = P.Reshape()
        self.one_hot = layers.OneHot(depth=table_rows)
        self.shape = P.Shape()
        self.gather = P.Gather()  # index_select
        self.matmul = P.BatchMatMul()
Example #10
0
    def __init__(self,
                 embedding_size,
                 embedding_shape,
                 use_relative_positions=False,
                 use_token_type=False,
                 token_type_vocab_size=16,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 max_position_embeddings=512,
                 dropout_prob=0.1):
        """Initialise EmbeddingPostprocessor.

        Args:
            embedding_size: Column count of both embedding tables and the
                normalised shape passed to ``layers.LayerNorm``.
            embedding_shape: Iterable converted to a tuple and stored as
                ``shape``.
            use_relative_positions: Flag stored unchanged on the instance.
            use_token_type: Flag stored unchanged on the instance.
            token_type_vocab_size: Row count of ``embedding_table``.
            use_one_hot_embeddings: Flag stored unchanged on the instance.
            initializer_range: Argument passed to ``TruncatedNormal`` for
                both tables.
            max_position_embeddings: Row count of
                ``full_position_embeddings``.
            dropout_prob: ``1 - dropout_prob`` is passed to
                ``layers.Dropout``.
        """
        super(EmbeddingPostprocessor, self).__init__()
        self.use_token_type = use_token_type
        self.token_type_vocab_size = token_type_vocab_size
        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.max_position_embeddings = max_position_embeddings
        # Table of shape [token_type_vocab_size, embedding_size].
        self.embedding_table = Parameter(initializer
                                         (TruncatedNormal(initializer_range),
                                          [token_type_vocab_size,
                                           embedding_size]),
                                         name='embedding_table')

        self.shape_flat = (-1,)
        # NOTE(review): OneHot is built without an explicit depth here -
        # confirm how the forward pass supplies it.
        self.one_hot = layers.OneHot()
        self.on_value = Tensor(1.0, ts.float32)
        # The off value must be exactly 0.0: a non-zero fill would blend
        # every table row into a one-hot x table product. This was 0.1.
        self.off_value = Tensor(0.0, ts.float32)
        self.array_mul = P.MatMul()
        self.reshape = P.Reshape()
        self.shape = tuple(embedding_shape)
        self.layernorm = layers.LayerNorm((embedding_size,))
        # NOTE(review): the argument is presumably a keep probability -
        # confirm against the Dropout API of the installed framework version.
        self.dropout = layers.Dropout(1 - dropout_prob)
        self.gather = P.Gather()
        self.use_relative_positions = use_relative_positions
        self.slice = P.StridedSlice()
        # Table of shape [max_position_embeddings, embedding_size].
        self.full_position_embeddings = Parameter(initializer
                                                  (TruncatedNormal(initializer_range),
                                                   [max_position_embeddings,
                                                    embedding_size]),
                                                  name='full_position_embeddings')
Example #11
0
    def __init__(self, hidden_size, seq_length, num_hidden_layers,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 attention_probs_dropout_prob=0.1,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 hidden_dropout_prob=0.1,
                 use_relative_positions=False,
                 hidden_act="gelu",
                 compute_type=ts.float32,
                 return_all_encoders=False):
        """Initialise BertTransformer as a stack of identical encoder layers.

        Args:
            hidden_size: Forwarded to every encoder layer; also the last
                axis of ``shape`` and ``out_shape``.
            seq_length: Forwarded to every encoder layer; also the middle
                axis of ``out_shape``.
            num_hidden_layers: Number of ``BertEncoderLayer`` instances built.
            num_attention_heads: Forwarded to every encoder layer.
            intermediate_size: Forwarded to every encoder layer.
            attention_probs_dropout_prob: Forwarded to every encoder layer.
            use_one_hot_embeddings: Forwarded to every encoder layer.
            initializer_range: Forwarded to every encoder layer.
            hidden_dropout_prob: Forwarded to every encoder layer.
            use_relative_positions: Forwarded to every encoder layer.
            hidden_act: Forwarded to every encoder layer.
            compute_type: Forwarded to every encoder layer.
            return_all_encoders: Flag stored unchanged on the instance.
        """
        super().__init__()
        self.return_all_encoders = return_all_encoders

        # Every layer is constructed from the same keyword arguments.
        encoder_kwargs = dict(
            hidden_size=hidden_size,
            seq_length=seq_length,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=initializer_range,
            hidden_dropout_prob=hidden_dropout_prob,
            use_relative_positions=use_relative_positions,
            hidden_act=hidden_act,
            compute_type=compute_type,
        )
        encoder_stack = [BertEncoderLayer(**encoder_kwargs)
                         for _ in range(num_hidden_layers)]
        self.layers = layers.LayerList(encoder_stack)

        self.reshape = P.Reshape()
        self.shape = (-1, hidden_size)
        self.out_shape = (-1, seq_length, hidden_size)
Example #12
0
    def __init__(self, network, optimizer, scale_update_layer=None):
        """Initialise BertFinetuneLayer around a network and its optimizer.

        Args:
            network: Wrapped network; ``set_grad()`` is called on it.
            optimizer: Optimizer whose ``parameters`` become ``weights``. A
                ``global_step`` parameter is attached to it here.
            scale_update_layer: Optional loss-scale manager. When truthy, its
                ``get_loss_scale()`` value seeds the ``loss_scale`` parameter.
        """
        super().__init__(auto_prefix=False)

        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        step_init = initializer(0., [1])
        self.optimizer.global_step = Parameter(step_init, name='global_step')

        self.grad = P.GradOperation(get_by_list=True, sens_param=True)
        self.allreduce = P.AllReduce()
        self.grad_reducer = None
        self.cast = P.Cast()

        # Float-status operators depend on the configured device target.
        self.gpu_target = context.get_context("device_target") == "GPU"
        if self.gpu_target:
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_before_grad = P.NPUClearFloatStatus()

        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.Depend()
        self.base = Tensor(1, ts.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = P.HyperMap()

        # loss_scale remains None unless a manager was supplied.
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_layer
        if scale_update_layer:
            initial_scale = Tensor(scale_update_layer.get_loss_scale(),
                                   dtype=ts.float32)
            self.loss_scale = Parameter(initial_scale, name="loss_scale")
Example #13
0
    def __init__(self, from_tensor_width, to_tensor_width,
                 from_seq_length, to_seq_length,
                 num_attention_heads=1,
                 size_per_head=512,
                 query_act=None,
                 key_act=None,
                 value_act=None,
                 has_attention_mask=False,
                 attention_probs_dropout_prob=0.0,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 do_return_2d_tensor=False,
                 use_relative_positions=False,
                 compute_type=ts.float32):
        """Initialise BertAttention: projections, operators and shapes.

        Args:
            from_tensor_width: Input width of the query projection.
            to_tensor_width: Input width of the key and value projections.
            from_seq_length: Sequence length used in ``shape_from`` and in
                the 3-D form of ``shape_return``.
            to_seq_length: Sequence length used in ``shape_to`` and passed
                to the relative-position generator.
            num_attention_heads: Number of heads.
            size_per_head: Width of each head.
            query_act: Activation passed to the query projection.
            key_act: Activation passed to the key projection.
            value_act: Activation passed to the value projection.
            has_attention_mask: When true, mask-related operators are also
                created.
            attention_probs_dropout_prob: ``1 - value`` is passed to
                ``layers.Dropout``.
            use_one_hot_embeddings: Forwarded to the relative-position
                generator.
            initializer_range: Argument passed to ``TruncatedNormal``; also
                forwarded to the relative-position generator.
            do_return_2d_tensor: Selects the 2-D or 3-D ``shape_return``.
            use_relative_positions: When true, a
                ``RelaPosEmbeddingsGenerator`` is created.
            compute_type: Type the projections are converted to and the
                destination type of ``cast_compute_type``.
        """
        super().__init__()

        self.from_seq_length = from_seq_length
        self.to_seq_length = to_seq_length
        self.num_attention_heads = num_attention_heads
        self.size_per_head = size_per_head
        self.has_attention_mask = has_attention_mask
        self.use_relative_positions = use_relative_positions

        # Score scaling factor: 1 / sqrt(size_per_head).
        self.scores_mul = 1.0 / math.sqrt(float(size_per_head))
        self.reshape = P.Reshape()
        self.shape_from_2d = (-1, from_tensor_width)
        self.shape_to_2d = (-1, to_tensor_width)

        weight = TruncatedNormal(initializer_range)
        units = num_attention_heads * size_per_head

        def _projection(in_width, act):
            # Dense layer to ``units`` outputs, converted to compute_type.
            dense = layers.Dense(in_width, units, activation=act,
                                 weight_init=weight)
            return dense.to_float(compute_type)

        self.query_layer = _projection(from_tensor_width, query_act)
        self.key_layer = _projection(to_tensor_width, key_act)
        self.value_layer = _projection(to_tensor_width, value_act)

        # 4-D shapes: (-1, sequence length, heads, width per head).
        self.shape_from = (-1, from_seq_length, num_attention_heads, size_per_head)
        self.shape_to = (-1, to_seq_length, num_attention_heads, size_per_head)

        self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
        self.multiply = P.Mul()
        self.transpose = P.Transpose()
        # Axis permutations kept as plain tuples.
        self.trans_shape = (0, 2, 1, 3)
        self.trans_shape_relative = (2, 0, 1, 3)
        self.trans_shape_position = (1, 2, 0, 3)
        self.multiply_data = -10000.0
        self.matmul = P.BatchMatMul()

        self.softmax = layers.Softmax()
        self.dropout = layers.Dropout(1 - attention_probs_dropout_prob)

        if has_attention_mask:
            # Operators created only when a mask is in use.
            self.expand_dims = P.ExpandDims()
            self.sub = P.Sub()
            self.add = P.Add()
            self.cast = P.Cast()
            self.get_dtype = P.DType()

        # The output is flattened to 2-D or keeps its sequence axis.
        self.shape_return = ((-1, units) if do_return_2d_tensor
                             else (-1, from_seq_length, units))

        self.cast_compute_type = SaturateCast(dst_type=compute_type)
        if use_relative_positions:
            self._generate_relative_positions_embeddings = RelaPosEmbeddingsGenerator(
                length=to_seq_length,
                depth=size_per_head,
                max_relative_position=16,
                initializer_range=initializer_range,
                use_one_hot_embeddings=use_one_hot_embeddings)