Example #1
    def __init__(self, config):
        super(CreateAttentionMaskFromInputMask, self).__init__()
        self.input_mask = None

        self.cast = P.Cast()
        self.reshape = P.Reshape()
        self.shape = (-1, 1, config.seq_length)
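A construct method consistent with the attributes above only needs to reshape the mask to a broadcastable shape and cast it to float; this is a sketch based on those attributes, not the verbatim source (the ts alias follows the other examples):

    def construct(self, input_mask):
        # broadcastable mask of shape (-1, 1, seq_length), cast to float for the attention math
        attention_mask = self.cast(self.reshape(input_mask, self.shape), ts.float32)
        return attention_mask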
Example #2
    def construct(self, input_ids, input_mask, token_type_id):
        sequence_output, _, _ = self.bert(input_ids, token_type_id, input_mask)
        batch_size, seq_length, hidden_size = P.Shape()(sequence_output)
        sequence = P.Reshape()(sequence_output, (-1, hidden_size))
        logits = self.dense_1(sequence)
        logits = P.Cast()(logits, self.dtype)
        logits = P.Reshape()(logits, (batch_size, seq_length, self.num_labels))
        logits = self.log_softmax(logits)
        return logits
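The reshape-to-2D pattern here is deliberate: flattening to (batch_size * seq_length, hidden_size) lets a single Dense layer score every token position at once, after which the logits are reshaped back to (batch_size, seq_length, num_labels) for the per-token log-softmax.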
Example #3
    def __init__(self, src_type=ts.float32, dst_type=ts.float32):
        super(SaturateCast, self).__init__()
        np_type = ts.dtype_to_nptype(dst_type)

        self.tensor_min_type = float(np.finfo(np_type).min)
        self.tensor_max_type = float(np.finfo(np_type).max)

        self.min_op = P.Minimum()
        self.max_op = P.Maximum()
        self.cast = P.Cast()
        self.dst_type = dst_type
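The operators collected here clamp a tensor into the destination type's representable range before casting, so values cannot overflow to inf. A construct sketch under that reading (assumed, not the verbatim source):

    def construct(self, x):
        # clamp to [finfo.min, finfo.max] of the destination dtype, then cast
        out = self.max_op(x, self.tensor_min_type)
        out = self.min_op(out, self.tensor_max_type)
        return self.cast(out, self.dst_type)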
Example #4
    def __init__(self, is_training=True):
        super(CrossEntropyCalculation, self).__init__()
        self.onehot = P.OneHot()
        self.on_value = Tensor(1.0, ts.float32)
        self.off_value = Tensor(0.0, ts.float32)
        self.reduce_sum = P.ReduceSum()
        self.reduce_mean = P.ReduceMean()
        self.reshape = P.Reshape()
        self.last_idx = (-1,)
        self.neg = P.Neg()
        self.cast = P.Cast()
        self.is_training = is_training
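These pieces assemble into a standard one-hot cross entropy over log-probabilities. A hedged sketch of the matching construct (argument names are assumptions):

    def construct(self, logits, label_ids, num_labels):
        if self.is_training:
            label_ids = self.reshape(label_ids, self.last_idx)
            one_hot_labels = self.onehot(label_ids, num_labels, self.on_value, self.off_value)
            # cross entropy: -sum(one_hot * log_probs) per example, then mean over the batch
            per_example_loss = self.neg(self.reduce_sum(one_hot_labels * logits, self.last_idx))
            loss = self.reduce_mean(per_example_loss, self.last_idx)
            return self.cast(loss, ts.float32)
        # at eval time the logits are passed through unchanged
        return logits * 1.0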
Example #5
    def __init__(self, length, max_relative_position):
        super(RelaPosMatrixGenerator, self).__init__()
        self._length = length
        self._max_relative_position = max_relative_position
        self._min_relative_position = -max_relative_position
        self.range_length = -length + 1

        self.tile = P.Tile()
        self.range_mat = P.Reshape()
        self.sub = P.Sub()
        self.expanddims = P.ExpandDims()
        self.cast = P.Cast()
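What this generator computes is easiest to see outside the graph: pairwise token distances clipped to the maximum relative position, then shifted to be non-negative so they can index an embedding table. A hypothetical standalone numpy equivalent:

import numpy as np

def relative_position_matrix(length, max_relative_position):
    rng = np.arange(length)
    distance = rng[None, :] - rng[:, None]  # (length, length) matrix of j - i
    clipped = np.clip(distance, -max_relative_position, max_relative_position)
    return clipped + max_relative_position  # indices in [0, 2 * max_relative_position]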
Example #6
    def __init__(self, learning_rate, end_learning_rate, warmup_steps,
                 decay_steps, power):
        super(BertLearningRate, self).__init__()
        self.warmup_flag = False
        if warmup_steps > 0:
            self.warmup_flag = True
            self.warmup_lr = WarmUpLR(learning_rate, warmup_steps)
        self.decay_lr = PolynomialDecayLR(learning_rate, end_learning_rate,
                                          decay_steps, power)
        self.warmup_steps = ts.array([warmup_steps], dtype=ts.float32)

        self.greater = P.Greater()
        self.one = ts.array([1.0], dtype=ts.float32)
        self.cast = P.Cast()
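The Greater/Cast pair implements a branch-free switch between the warmup and decay schedules. A sketch of the corresponding construct, consistent with the attributes above:

    def construct(self, global_step):
        decay_lr = self.decay_lr(global_step)
        if self.warmup_flag:
            # is_warmup is 1.0 while global_step < warmup_steps, else 0.0
            is_warmup = self.cast(self.greater(self.warmup_steps,
                                               self.cast(global_step, ts.float32)), ts.float32)
            warmup_lr = self.warmup_lr(global_step)
            lr = (self.one - is_warmup) * decay_lr + is_warmup * warmup_lr
        else:
            lr = decay_lr
        return lr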
Example #7
    def __init__(self,
                 in_channels,
                 out_channels,
                 initializer_range=0.02,
                 dropout_prob=0.1,
                 compute_type=ts.float32):
        super(BertOutput, self).__init__()
        self.dense = layers.Dense(in_channels, out_channels,
                                  weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
        self.dropout = layers.Dropout(1 - dropout_prob)
        self.dropout_prob = dropout_prob
        self.add = P.Add()
        self.layernorm = layers.LayerNorm((out_channels,)).to_float(compute_type)
        self.cast = P.Cast()
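BertOutput is the classic projection + dropout + residual + LayerNorm sub-block. A construct sketch consistent with the attributes above (parameter names assumed):

    def construct(self, hidden_status, input_tensor):
        output = self.dense(hidden_status)
        output = self.dropout(output)
        output = self.add(input_tensor, output)  # residual connection
        return self.layernorm(output)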
Example #8
    def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0, use_one_hot_embeddings=False,
                 assessment_method=""):
        super(BertCLSModel, self).__init__()
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0
        self.bert = Bert(config, is_training, use_one_hot_embeddings)
        self.cast = P.Cast()
        self.weight_init = TruncatedNormal(config.initializer_range)
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.dtype = config.dtype
        self.num_labels = num_labels
        self.dense_1 = layers.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init,
                                    has_bias=True).to_float(config.compute_type)
        self.dropout = layers.Dropout(1 - dropout_prob)
        self.assessment_method = assessment_method
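For classification only the pooled [CLS] output of the backbone is needed. A hedged construct sketch built from these attributes:

    def construct(self, input_ids, input_mask, token_type_id):
        _, pooled_output, _ = self.bert(input_ids, token_type_id, input_mask)
        cls = self.dropout(self.cast(pooled_output, self.dtype))
        logits = self.cast(self.dense_1(cls), self.dtype)
        return self.log_softmax(logits)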
Example #9
    def __init__(self, config, is_training, num_labels=11, use_crf=False, dropout_prob=0.0,
                 use_one_hot_embeddings=False):
        super(BertNERModel, self).__init__()
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0
        self.bert = Bert(config, is_training, use_one_hot_embeddings)
        self.cast = P.Cast()
        self.weight_init = TruncatedNormal(config.initializer_range)
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.dtype = config.dtype
        self.num_labels = num_labels
        self.dense_1 = layers.Dense(config.hidden_size, self.num_labels, weight_init=self.weight_init,
                                    has_bias=True).to_float(config.compute_type)
        self.dropout = layers.Dropout(1 - dropout_prob)
        self.reshape = P.Reshape()
        self.shape = (-1, config.hidden_size)
        self.use_crf = use_crf
        self.origin_shape = (-1, config.seq_length, self.num_labels)
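The NER head scores every token, so it consumes the sequence output rather than the pooled output; the construct in Example #2 above matches these attributes, and with the CRF option included it would look roughly like this sketch:

    def construct(self, input_ids, input_mask, token_type_id):
        sequence_output, _, _ = self.bert(input_ids, token_type_id, input_mask)
        seq = self.reshape(self.dropout(sequence_output), self.shape)
        logits = self.cast(self.dense_1(seq), self.dtype)
        if self.use_crf:
            # CRF consumes raw per-token scores in (batch, seq_length, num_labels)
            return self.reshape(logits, self.origin_shape)
        return self.log_softmax(logits)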
Example #10
    def __init__(self,
                 params,
                 learning_rate=1e-3,
                 beta1=0.9,
                 beta2=0.999,
                 eps=1e-6,
                 weight_decay=0.0):
        super(AdamWeightDecayForBert, self).__init__(learning_rate, params,
                                                     weight_decay)
        _check_param_value(beta1, beta2, eps, self.cls_name)
        self.beta1 = ts.array([beta1], dtype=ts.float32)
        self.beta2 = ts.array([beta2], dtype=ts.float32)
        self.eps = ts.array([eps], dtype=ts.float32)
        self.moments1 = self.parameters.clone(prefix="adam_m", init='zeros')
        self.moments2 = self.parameters.clone(prefix="adam_v", init='zeros')
        self.hyper_map = P.HyperMap()
        self.op_select = P.Select()
        self.op_cast = P.Cast()
        self.op_reshape = P.Reshape()
        self.op_shape = P.Shape()
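The moments cloned above drive the usual decoupled-weight-decay Adam update, which the optimizer applies per parameter through hyper_map. A hypothetical numpy sketch of one step for a single parameter (an illustration, not the graph code itself):

import numpy as np

def adam_weight_decay_step(param, grad, m, v, lr, beta1, beta2, eps, weight_decay):
    m = beta1 * m + (1.0 - beta1) * grad          # first moment, cf. adam_m
    v = beta2 * v + (1.0 - beta2) * grad * grad   # second moment, cf. adam_v
    update = m / (np.sqrt(v) + eps)
    update = update + weight_decay * param        # decoupled weight decay
    return param - lr * update, m, v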
Example #11
    def __init__(self, network, optimizer, scale_update_layer=None):
        super(BertFinetuneLayer, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.optimizer.global_step = Parameter(initializer(0., [1]), name='global_step')
        self.grad = P.GradOperation(get_by_list=True, sens_param=True)
        self.allreduce = P.AllReduce()
        self.grad_reducer = None
        self.cast = P.Cast()
        self.gpu_target = False
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.Depend()
        self.base = Tensor(1, ts.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = P.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_layer
        if scale_update_layer:
            self.loss_scale = Parameter(Tensor(scale_update_layer.get_loss_scale(), dtype=ts.float32),
                                        name="loss_scale")
Example #12
    def __init__(self, network, optimizer, scale_update_layer=None):
        super(BertSquadLayer, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = P.GradOperation(get_by_list=True, sens_param=True)
        self.allreduce = P.AllReduce()
        self.grad_reducer = None
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.Depend()
        self.base = Tensor(1, ts.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = P.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_layer
        if scale_update_layer:
            self.loss_scale = Parameter(Tensor(scale_update_layer.get_loss_scale(), dtype=ts.float32),
                                        name="loss_scale")
Example #13
    def __init__(self,
                 config,
                 is_training,
                 use_one_hot_embeddings=False):
        super(Bert, self).__init__()
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        self.seq_length = config.seq_length
        self.hidden_size = config.hidden_size
        self.num_hidden_layers = config.num_hidden_layers
        self.embedding_size = config.hidden_size
        self.token_type_ids = None

        self.last_idx = self.num_hidden_layers - 1
        output_embedding_shape = [-1, self.seq_length, self.embedding_size]

        self.bert_embedding_lookup = layers.Embedding(
            vocab_size=config.vocab_size,
            embedding_size=self.embedding_size,
            use_one_hot=use_one_hot_embeddings,
            embedding_table=TruncatedNormal(config.initializer_range))

        self.bert_embedding_postprocessor = EmbeddingPostprocessor(
            embedding_size=self.embedding_size,
            embedding_shape=output_embedding_shape,
            use_relative_positions=config.use_relative_positions,
            use_token_type=True,
            token_type_vocab_size=config.type_vocab_size,
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=0.02,
            max_position_embeddings=config.max_position_embeddings,
            dropout_prob=config.hidden_dropout_prob)

        self.bert_encoder = BertTransformer(
            hidden_size=self.hidden_size,
            seq_length=self.seq_length,
            num_attention_heads=config.num_attention_heads,
            num_hidden_layers=self.num_hidden_layers,
            intermediate_size=config.intermediate_size,
            attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            use_one_hot_embeddings=use_one_hot_embeddings,
            initializer_range=config.initializer_range,
            hidden_dropout_prob=config.hidden_dropout_prob,
            use_relative_positions=config.use_relative_positions,
            hidden_act=config.hidden_act,
            compute_type=config.compute_type,
            return_all_encoders=True)

        self.cast = P.Cast()
        self.dtype = config.dtype
        self.cast_compute_type = SaturateCast(dst_type=config.compute_type)
        self.slice = P.StridedSlice()

        self.squeeze_1 = P.Squeeze(axis=1)
        self.dense = layers.Dense(self.hidden_size, self.hidden_size,
                                  activation="tanh",
                                  weight_init=TruncatedNormal(config.initializer_range)).to_float(config.compute_type)
        self._create_attention_mask_from_input_mask = CreateAttentionMaskFromInputMask(config)
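End to end, the backbone runs embedding lookup, embedding postprocessing, the encoder, and a tanh pooler over the first token, returning the triple unpacked in Examples #2 and #8. A hedged sketch of the construct wired from these attributes:

    def construct(self, input_ids, token_type_ids, input_mask):
        word_embeddings = self.bert_embedding_lookup(input_ids)
        embedding_output = self.bert_embedding_postprocessor(token_type_ids, word_embeddings)
        attention_mask = self._create_attention_mask_from_input_mask(input_mask)
        encoder_output = self.bert_encoder(self.cast_compute_type(embedding_output), attention_mask)
        sequence_output = self.cast(encoder_output[self.last_idx], self.dtype)
        # pooler: slice out the first token and project it through the tanh Dense layer
        batch_size = P.Shape()(input_ids)[0]
        first_token = self.squeeze_1(self.slice(sequence_output, (0, 0, 0),
                                                (batch_size, 1, self.hidden_size), (1, 1, 1)))
        pooled_output = self.cast(self.dense(first_token), self.dtype)
        return sequence_output, pooled_output, self.bert_embedding_lookup.embedding_table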
Example #14
    def __init__(self,
                 from_tensor_width,
                 to_tensor_width,
                 from_seq_length,
                 to_seq_length,
                 num_attention_heads=1,
                 size_per_head=512,
                 query_act=None,
                 key_act=None,
                 value_act=None,
                 has_attention_mask=False,
                 attention_probs_dropout_prob=0.0,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 do_return_2d_tensor=False,
                 use_relative_positions=False,
                 compute_type=ts.float32):

        super(BertAttention, self).__init__()
        self.from_seq_length = from_seq_length
        self.to_seq_length = to_seq_length
        self.num_attention_heads = num_attention_heads
        self.size_per_head = size_per_head
        self.has_attention_mask = has_attention_mask
        self.use_relative_positions = use_relative_positions

        self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head))
        self.reshape = P.Reshape()
        self.shape_from_2d = (-1, from_tensor_width)
        self.shape_to_2d = (-1, to_tensor_width)
        weight = TruncatedNormal(initializer_range)
        units = num_attention_heads * size_per_head
        self.query_layer = layers.Dense(from_tensor_width,
                                        units,
                                        activation=query_act,
                                        weight_init=weight).to_float(compute_type)
        self.key_layer = layers.Dense(to_tensor_width,
                                      units,
                                      activation=key_act,
                                      weight_init=weight).to_float(compute_type)
        self.value_layer = layers.Dense(to_tensor_width,
                                        units,
                                        activation=value_act,
                                        weight_init=weight).to_float(compute_type)

        self.shape_from = (-1, from_seq_length, num_attention_heads, size_per_head)
        self.shape_to = (-1, to_seq_length, num_attention_heads, size_per_head)

        self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
        self.multiply = P.Mul()
        self.transpose = P.Transpose()
        self.trans_shape = (0, 2, 1, 3)
        self.trans_shape_relative = (2, 0, 1, 3)
        self.trans_shape_position = (1, 2, 0, 3)
        self.multiply_data = -10000.0
        self.matmul = P.BatchMatMul()

        self.softmax = layers.Softmax()
        self.dropout = layers.Dropout(1 - attention_probs_dropout_prob)

        if self.has_attention_mask:
            self.expand_dims = P.ExpandDims()
            self.sub = P.Sub()
            self.add = P.Add()
            self.cast = P.Cast()
            self.get_dtype = P.DType()
        if do_return_2d_tensor:
            self.shape_return = (-1, num_attention_heads * size_per_head)
        else:
            self.shape_return = (-1, from_seq_length, num_attention_heads * size_per_head)

        self.cast_compute_type = SaturateCast(dst_type=compute_type)
        if self.use_relative_positions:
            self._generate_relative_positions_embeddings = \
                RelaPosEmbeddingsGenerator(length=to_seq_length,
                                           depth=size_per_head,
                                           max_relative_position=16,
                                           initializer_range=initializer_range,
                                           use_one_hot_embeddings=use_one_hot_embeddings)
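All of the pieces above implement standard multi-head scaled dot-product attention. A simplified construct sketch, omitting the relative-position branch and condensing the mask arithmetic (the full version also casts the mask term through get_dtype before mixing it in):

    def construct(self, from_tensor, to_tensor, attention_mask):
        # project to Q/K/V and split into heads: (batch, heads, seq, size_per_head)
        query = self.transpose(self.reshape(self.query_layer(
            self.reshape(from_tensor, self.shape_from_2d)), self.shape_from), self.trans_shape)
        key = self.transpose(self.reshape(self.key_layer(
            self.reshape(to_tensor, self.shape_to_2d)), self.shape_to), self.trans_shape)
        value = self.transpose(self.reshape(self.value_layer(
            self.reshape(to_tensor, self.shape_to_2d)), self.shape_to), self.trans_shape)

        # scaled dot-product scores: QK^T / sqrt(size_per_head)
        attention_scores = self.multiply(self.scores_mul, self.matmul_trans_b(query, key))

        if self.has_attention_mask:
            # masked positions receive a -10000.0 bias so they vanish after the softmax
            adder = self.multiply(self.sub(1.0, self.expand_dims(attention_mask, 1)),
                                  self.multiply_data)
            attention_scores = self.add(self.cast(adder, self.get_dtype(attention_scores)),
                                        attention_scores)

        attention_probs = self.dropout(self.softmax(attention_scores))
        context = self.transpose(self.matmul(attention_probs, value), self.trans_shape)
        return self.reshape(context, self.shape_return)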