Example 1
 def __init__(self):
     """init function"""
     super(Rerank_Downstream, self).__init__()
     self.dense_0 = nn.Dense(in_channels=4096,
                             out_channels=8192,
                             has_bias=True)
     self.relu_1 = nn.ReLU()
     self.reducemean_2 = P.ReduceMean(keep_dims=True)
     self.sub_3 = P.Sub()
     self.sub_4 = P.Sub()
     self.pow_5 = P.Pow()
     self.pow_5_input_weight = 2.0
     self.reducemean_6 = P.ReduceMean(keep_dims=True)
     self.add_7 = P.Add()
     self.add_7_bias = 9.999999960041972e-13
     self.sqrt_8 = P.Sqrt()
     self.div_9 = P.Div()
     self.mul_10 = P.Mul()
     self.mul_10_w = Parameter(Tensor(
         np.random.uniform(0, 1, (8192, )).astype(np.float32)),
                               name=None)
     self.add_11 = P.Add()
     self.add_11_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (8192, )).astype(np.float32)),
                                  name=None)
     self.dense_12 = nn.Dense(in_channels=8192,
                              out_channels=2,
                              has_bias=True)
Example 2
    def __init__(self, batch_size, query_linear_bias, key_linear_bias,
                 value_linear_bias):
        """init function"""
        super(MultiHeadAttn, self).__init__()
        self.batch_size = batch_size
        self.matmul = nn.MatMul()
        self.add = P.Add()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.div = P.Div()
        self.softmax = nn.Softmax(axis=3)

        self.query_linear_weight = Parameter(Tensor(
            np.random.uniform(0, 1, (4096, 4096)).astype(np.float32)),
                                             name=None)
        self.query_linear_bias = query_linear_bias

        self.key_linear_weight = Parameter(Tensor(
            np.random.uniform(0, 1, (4096, 4096)).astype(np.float32)),
                                           name=None)
        self.key_linear_bias = key_linear_bias

        self.value_linear_weight = Parameter(Tensor(
            np.random.uniform(0, 1, (4096, 4096)).astype(np.float32)),
                                             name=None)
        self.value_linear_bias = value_linear_bias

        self.reshape_shape = tuple([batch_size, 512, 64, 64])

        self.w = Parameter(Tensor(
            np.random.uniform(0, 1, (64, 64, 4096)).astype(np.float32)),
                           name=None)
        self.b = Parameter(Tensor(
            np.random.uniform(0, 1, (4096, )).astype(np.float32)),
                           name=None)
Example 3
    def construct(self, data, label):
        """
        construct a compute flow.
        """
        weights = self.weights
        record_datas = self._split(data)
        record_labels = self._split(label)
        loss = self.network(record_datas[0], record_labels[0])
        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
        record_grad = self.grad(self.network, weights)(record_datas[0], record_labels[0], sens)
        record_grad = self._clip_by_global_norm(record_grad, GRADIENT_CLIP_TYPE, self._l2_norm)
        grads = record_grad
        total_loss = loss
        for i in range(1, self._micro_batches):
            loss = self.network(record_datas[i], record_labels[i])
            sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
            record_grad = self.grad(self.network, weights)(record_datas[i], record_labels[i], sens)
            record_grad = self._clip_by_global_norm(record_grad, GRADIENT_CLIP_TYPE, self._l2_norm)
            grads = self._tuple_add(grads, record_grad)
            total_loss = P.TensorAdd()(total_loss, loss)
        loss = P.Div()(total_loss, self._micro_float)

        if self._mech is not None:
            grad_noise = self._hyper_map(self._mech, grads)
            grads = self._tuple_add(grads, grad_noise)
            grads = self._hyper_map(F.partial(_grad_scale, self._micro_float), grads)

        if self.reducer_flag:
            # apply grad reducer on grads
            grads = self.grad_reducer(grads)
        return F.depend(loss, self.optimizer(grads))
Example 4
    def __init__(self,
                 norm_bound=1.5,
                 initial_noise_multiplier=5.0,
                 alpha=6e-4,
                 decay_policy='Step'):
        super(AdaGaussianRandom, self).__init__()
        initial_noise_multiplier = check_value_positive(
            'initial_noise_multiplier', initial_noise_multiplier)
        initial_noise_multiplier = Tensor(
            np.array(initial_noise_multiplier, np.float32))
        self._initial_noise_multiplier = Parameter(
            initial_noise_multiplier, name='initial_noise_multiplier')
        self._noise_multiplier = Parameter(initial_noise_multiplier,
                                           name='noise_multiplier')
        norm_bound = check_value_positive('norm_bound', norm_bound)
        self._norm_bound = Tensor(np.array(norm_bound, np.float32))

        alpha = check_param_type('alpha', alpha, float)
        self._alpha = Tensor(np.array(alpha, np.float32))

        self._decay_policy = check_param_type('decay_policy', decay_policy,
                                              str)
        self._mean = 0.0
        self._sub = P.Sub()
        self._mul = P.Mul()
        self._add = P.TensorAdd()
        self._div = P.Div()
        self._stddev = self._update_stddev()
        self._dtype = mstype.float32
Example 5
    def __init__(self, dim, n_heads):
        super().__init__()

        # h
        self.n_heads = n_heads

        # v = V / h
        self.size_per_head = dim // n_heads
        scores_mul = 1.0 / np.sqrt(float(self.size_per_head))
        self.scores_mul = ms.Tensor(scores_mul, ms.float32)

        self.exones = P.Ones()((1, 1, n_heads, 1, 1), ms.int32)

        # shape = (h, v)
        self.reshape_tail = (self.n_heads, self.size_per_head)

        self.output = Dense(dim, dim, has_bias=False)

        self.mul = P.Mul()
        self.div = P.Div()
        self.softmax = P.Softmax()
        self.bmm = P.BatchMatMul()
        self.bmmt = P.BatchMatMul(transpose_b=True)
        self.squeeze = P.Squeeze(-2)
        self.reducesum = P.ReduceSum(keep_dims=True)

        self.transpose = P.Transpose()
        self.trans_shape = (0, 1, 3, 2, 4)
Example 6
 def __init__(self, sparse=False, stra_list=None):
     super(SoftmaxCrossEntropyExpand, self).__init__()
     if stra_list is None:
         stra_list = []
     if len(stra_list) < 11:
         stra_list = [None] * 11
     self.exp = P.Exp()
     self.reduce_sum = P.ReduceSum(keep_dims=True).set_strategy(
         strategy=stra_list[1])
     self.onehot = P.OneHot().set_strategy(strategy=stra_list[2])
     self.on_value = Tensor(1.0, mstype.float32)
     self.off_value = Tensor(0.0, mstype.float32)
     self.div = P.Div().set_strategy(strategy=stra_list[3])
     self.log = P.Log().set_strategy(strategy=stra_list[4])
     self.sum_cross_entropy = P.ReduceSum(keep_dims=False).set_strategy(
         strategy=stra_list[5])
     self.mul = P.Mul().set_strategy(strategy=stra_list[6])
     self.mul2 = P.Mul().set_strategy(strategy=stra_list[7])
     self.cast = P.Cast()
     self.reduce_mean = P.ReduceMean(keep_dims=False).set_strategy(
         strategy=stra_list[8])
     self.sparse = sparse
     self.reduce_max = P.ReduceMax(keep_dims=True).set_strategy(
         strategy=stra_list[9])
     self.sub = P.Sub().set_strategy(strategy=stra_list[10])
Example 7
    def __init__(self, rgb_range, rgb_mean, rgb_std=(1.0, 1.0, 1.0), sign=-1):
        """Construct the class MeanShift.

        :param rgb_range: range of tensor, usually 1.0 or 255.0
        :param rgb_mean: mean of rgb value
        :param rgb_std: std of rgb value
        :param sign: -1 for subtract, 1 for add
        """
        super(MeanShift, self).__init__()
        self.conv2d = nn.Conv2d(3,
                                3,
                                kernel_size=1,
                                stride=1,
                                padding=0,
                                has_bias=True,
                                group=1,
                                dilation=1,
                                pad_mode="pad")
        self.conv2d.update_parameters_name("conv2d_" + uuid.uuid1().hex[:8] +
                                           ".")
        std = Tensor(rgb_std, mindspore.float32)
        self.conv2d.weight = Tensor(
            np.eye(3).reshape(3, 3, 1, 1).astype(np.float32))
        self.reshape = P.Reshape()
        self.div = P.Div()
        self.conv2d.weight = self.div(self.conv2d.weight,
                                      self.reshape(std, (3, 1, 1, 1)))
        self.conv2d.bias = sign * rgb_range * Tensor(rgb_mean,
                                                     mindspore.float32)
        self.conv2d.bias = self.div(self.conv2d.bias, std)
        self.requires_grad = False
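
For context, a hypothetical instantiation (the channel statistics below are invented for illustration; sign=-1, the default, subtracts the means and sign=1 adds them back):

    # remove channel means before the network, restore them after
    sub_mean = MeanShift(rgb_range=255.0, rgb_mean=(0.45, 0.44, 0.40))
    add_mean = MeanShift(rgb_range=255.0, rgb_mean=(0.45, 0.44, 0.40), sign=1)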
Example 8
 def __init__(self):
     super(MultiHeadAttn, self).__init__()
     self.matmul_0 = nn.MatMul()
     self.matmul_0.to_float(mstype.float16)
     self.matmul_0_w = Parameter(Tensor(
         np.random.uniform(0, 1, (768, 768)).astype(np.float32)),
                                 name=None)
     self.matmul_1 = nn.MatMul()
     self.matmul_1.to_float(mstype.float16)
     self.matmul_1_w = Parameter(Tensor(
         np.random.uniform(0, 1, (768, 768)).astype(np.float32)),
                                 name=None)
     self.matmul_2 = nn.MatMul()
     self.matmul_2.to_float(mstype.float16)
     self.matmul_2_w = Parameter(Tensor(
         np.random.uniform(0, 1, (768, 768)).astype(np.float32)),
                                 name=None)
     self.add_3 = P.Add()
     self.add_3_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                 name=None)
     self.add_4 = P.Add()
     self.add_4_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                 name=None)
     self.add_5 = P.Add()
     self.add_5_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                 name=None)
     self.reshape_6 = P.Reshape()
     self.reshape_6_shape = tuple([BATCH_SIZE, 448, 12, 64])
     self.reshape_7 = P.Reshape()
     self.reshape_7_shape = tuple([BATCH_SIZE, 448, 12, 64])
     self.reshape_8 = P.Reshape()
     self.reshape_8_shape = tuple([BATCH_SIZE, 448, 12, 64])
     self.transpose_9 = P.Transpose()
     self.transpose_10 = P.Transpose()
     self.transpose_11 = P.Transpose()
     self.matmul_12 = nn.MatMul()
     self.matmul_12.to_float(mstype.float16)
     self.div_13 = P.Div()
     self.div_13_w = 8.0
     self.add_14 = P.Add()
     self.softmax_15 = nn.Softmax(axis=3)
     self.matmul_16 = nn.MatMul()
     self.matmul_16.to_float(mstype.float16)
     self.transpose_17 = P.Transpose()
     self.reshape_18 = P.Reshape()
     self.reshape_18_shape = tuple([BATCH_SIZE, 448, 768])
     self.matmul_19 = nn.MatMul()
     self.matmul_19.to_float(mstype.float16)
     self.matmul_19_w = Parameter(Tensor(
         np.random.uniform(0, 1, (768, 768)).astype(np.float32)),
                                  name=None)
     self.add_20 = P.Add()
     self.add_20_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                  name=None)
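
The numbered ops trace a standard scaled dot-product attention with 12 heads of size 64. One plausible construct (assumed here, not part of the source; the transpose permutations and the attention_mask input are guesses consistent with the declared shapes):

 def construct(self, x, attention_mask):
     """sketch: multi-head self-attention over 768-dim hidden states"""
     q = self.add_3(self.matmul_0(x, self.matmul_0_w), self.add_3_bias)
     k = self.add_4(self.matmul_1(x, self.matmul_1_w), self.add_4_bias)
     v = self.add_5(self.matmul_2(x, self.matmul_2_w), self.add_5_bias)
     # split into heads: (BATCH_SIZE, 448, 12, 64) -> head-major layout
     q = self.transpose_9(self.reshape_6(q, self.reshape_6_shape), (0, 2, 1, 3))
     k = self.transpose_10(self.reshape_7(k, self.reshape_7_shape), (0, 2, 3, 1))
     v = self.transpose_11(self.reshape_8(v, self.reshape_8_shape), (0, 2, 1, 3))
     # div_13_w == 8.0 == sqrt(64), the per-head scaling factor
     scores = self.add_14(self.div_13(self.matmul_12(q, k), self.div_13_w),
                          attention_mask)
     probs = self.softmax_15(scores)
     context = self.transpose_17(self.matmul_16(probs, v), (0, 2, 1, 3))
     context = self.reshape_18(context, self.reshape_18_shape)
     return self.add_20(self.matmul_19(context, self.matmul_19_w),
                        self.add_20_bias)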
Example 9
 def __init__(self):
     super(GeLU, self).__init__()
     self.div = P.Div()
     self.div_w = 1.4142135381698608
     self.erf = P.Erf()
     self.add = P.Add()
     self.add_bias = 1.0
     self.mul = P.Mul()
     self.mul_w = 0.5
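
These four ops implement GeLU(x) = 0.5 * x * (1 + erf(x / sqrt(2))); div_w is sqrt(2), add_bias the 1, mul_w the 0.5. A minimal construct sketch (assumed, not shown in the source):

 def construct(self, x):
     y = self.erf(self.div(x, self.div_w))  # erf(x / sqrt(2))
     y = self.add(y, self.add_bias)         # 1 + erf(...)
     return self.mul(self.mul(x, y), self.mul_w)  # 0.5 * x * (1 + erf(...))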
Example 10
 def __init__(self):
     super().__init__()
     self.mul = P.Mul()
     self.add = P.Add()
     self.sub = P.Sub()
     self.div = P.Div()
     self.assign = P.Assign()
     self.param_a = Parameter(Tensor(5, mstype.int32), name='a')
     self.param_b = Parameter(Tensor(2, mstype.int32), name='b')
     self.param_c = Parameter(Tensor(20, mstype.int32), name='c')
Example 11
 def __init__(self):
     super(GeLU, self).__init__()
     self.div_0 = P.Div()
     self.div_0_w = 1.4142135381698608
     self.erf_1 = P.Erf()
     self.add_2 = P.Add()
     self.add_2_bias = 1.0
     self.mul_3 = P.Mul()
     self.mul_4 = P.Mul()
     self.mul_4_w = 0.5
Example 12
 def __init__(self, layer_norm_weight, layer_norm_bias):
     """init function"""
     super(LayerNorm, self).__init__()
     self.reducemean = P.ReduceMean(keep_dims=True)
     self.sub = P.Sub()
     self.pow = P.Pow()
     self.add = P.Add()
     self.sqrt = P.Sqrt()
     self.div = P.Div()
     self.mul = P.Mul()
     self.layer_norm_weight = layer_norm_weight
     self.layer_norm_bias = layer_norm_bias
Example 13
 def __init__(self):
     super().__init__()
     self.relu = nn.ReLU()
     self.mul = P.Mul()
     self.add = P.Add()
     self.sub = P.Sub()
     self.div = P.Div()
     self.assign = P.Assign()
     param_a = np.full((1, ), 5, dtype=np.float32)
     self.param_a = Parameter(Tensor(param_a), name='a')
     param_b = np.full((1, ), 2, dtype=np.float32)
     self.param_b = Parameter(Tensor(param_b), name='b')
Example 14
    def __init__(self, decay_policy, decay_rate, cur_noise_multiplier, init_noise_multiplier):
        super(_MechanismsParamsUpdater, self).__init__()
        self._decay_policy = decay_policy
        self._decay_rate = decay_rate
        self._cur_noise_multiplier = cur_noise_multiplier
        self._init_noise_multiplier = init_noise_multiplier

        self._div = P.Div()
        self._add = P.TensorAdd()
        self._assign = P.Assign()
        self._sub = P.Sub()
        self._one = Tensor(1, mstype.float32)
        self._mul = P.Mul()
        self._exp = P.Exp()
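
These ops match the usual noise-multiplier decay schedules. A sketch of a construct consistent with them (assumed here; the 'Time' and 'Step' formulas are illustrative, and self._exp would serve an exponential policy not covered below):

    def construct(self):
        """sketch: decay the current noise multiplier in place"""
        if self._decay_policy == 'Step':
            # multiplicative decay: cur *= (1 - decay_rate)
            temp = self._sub(self._one, self._decay_rate)
            next_multiplier = self._mul(temp, self._cur_noise_multiplier)
        else:
            # 'Time' decay: cur = init / (init / cur + decay_rate)
            temp = self._div(self._init_noise_multiplier,
                             self._cur_noise_multiplier)
            next_multiplier = self._div(self._init_noise_multiplier,
                                        self._add(temp, self._decay_rate))
        return self._assign(self._cur_noise_multiplier, next_multiplier)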
Example 15
    def __init__(self,
                 batch_size,
                 labels,
                 rnn_hidden_size,
                 nb_layers,
                 audio_conf,
                 rnn_type='LSTM',
                 bidirectional=True,
                 device_target='GPU'):
        super(DeepSpeechModel, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = rnn_hidden_size
        self.hidden_layers = nb_layers
        self.rnn_type = rnn_type
        self.audio_conf = audio_conf
        self.labels = labels
        self.bidirectional = bidirectional
        self.reshape_op = P.Reshape()
        self.shape_op = P.Shape()
        self.transpose_op = P.Transpose()
        self.add = P.Add()
        self.div = P.Div()

        sample_rate = self.audio_conf.sample_rate
        window_size = self.audio_conf.window_size
        num_classes = len(self.labels)

        self.conv = MaskConv()
        # this computes self.pre and the total stride of the conv stack
        self.pre, self.stride = self.get_conv_num()

        # Based on the convolutions above and the spectrogram size, using the conv output formula (W - F + 2P) / S + 1
        rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
        rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
        rnn_input_size *= 32
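        # worked check with assumed values sample_rate=16000 and
        # window_size=0.02 (20 ms windows), neither given in this snippet:
        #   (16000 * 0.02) / 2 + 1     -> 161   frequency bins
        #   (161 + 2*20 - 41) / 2 + 1  -> 81    after conv1 (F=41, P=20, S=2)
        #   (81 + 2*10 - 21) / 2 + 1   -> 41    after conv2 (F=21, P=10, S=2)
        #   41 * 32                    -> 1312  = rnn_input_size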

        self.RNN = BatchRNN(batch_size=self.batch_size,
                            input_size=rnn_input_size,
                            num_layers=nb_layers,
                            hidden_size=rnn_hidden_size,
                            bidirectional=bidirectional,
                            batch_norm=False,
                            rnn_type=self.rnn_type,
                            device_target=device_target)
        fully_connected = nn.Dense(rnn_hidden_size,
                                   num_classes,
                                   has_bias=False)
        self.fc = SequenceWise(fully_connected)
Example 16
 def __init__(self, mul_7_w_shape, add_8_bias_shape):
     """init function"""
     super(LayerNorm, self).__init__()
     self.reducemean_0 = P.ReduceMean(keep_dims=True)
     self.sub_1 = P.Sub()
     self.pow_2 = P.Pow()
     self.pow_2_input_weight = 2.0
     self.reducemean_3 = P.ReduceMean(keep_dims=True)
     self.add_4 = P.Add()
     self.add_4_bias = 9.999999960041972e-13
     self.sqrt_5 = P.Sqrt()
     self.div_6 = P.Div()
     self.mul_7 = P.Mul()
     self.mul_7_w = Parameter(Tensor(np.random.uniform(0, 1, mul_7_w_shape).astype(np.float32)), name=None)
     self.add_8 = P.Add()
     self.add_8_bias = Parameter(Tensor(np.random.uniform(0, 1, add_8_bias_shape).astype(np.float32)), name=None)
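
A construct sketch matching the op numbering (assumed; this is the standard layer-norm dataflow the declared ops imply, with add_4_bias as the ~1e-12 epsilon):

 def construct(self, x):
     mean = self.reducemean_0(x, -1)
     centered = self.sub_1(x, mean)
     variance = self.reducemean_3(
         self.pow_2(centered, self.pow_2_input_weight), -1)
     std = self.sqrt_5(self.add_4(variance, self.add_4_bias))
     normed = self.div_6(centered, std)
     return self.add_8(self.mul_7(normed, self.mul_7_w), self.add_8_bias)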
Example 17
 def __init__(self, passthrough_w_0, passthrough_w_1):
     """init function"""
     super(LayerNorm, self).__init__()
     self.reducemean_0 = P.ReduceMean(keep_dims=True)
     self.sub_1 = P.Sub()
     self.pow_2 = P.Pow()
     self.pow_2_input_weight = 2.0
     self.reducemean_3 = P.ReduceMean(keep_dims=True)
     self.add_4 = P.Add()
     self.add_4_bias = 9.999999960041972e-13
     self.sqrt_5 = P.Sqrt()
     self.div_6 = P.Div()
     self.mul_7 = P.Mul()
     self.mul_7_w = passthrough_w_0
     self.add_8 = P.Add()
     self.add_8_bias = passthrough_w_1
Example 18
 def __init__(self, batch_size, passthrough_w_0, passthrough_w_1,
              passthrough_w_2):
     """init function"""
     super(MultiHeadAttn, self).__init__()
     self.batch_size = batch_size
     self.matmul_0 = nn.MatMul()
     self.matmul_0_w = Parameter(Tensor(
         np.random.uniform(0, 1, (4096, 4096)).astype(np.float32)),
                                 name=None)
     self.matmul_1 = nn.MatMul()
     self.matmul_1_w = Parameter(Tensor(
         np.random.uniform(0, 1, (4096, 4096)).astype(np.float32)),
                                 name=None)
     self.matmul_2 = nn.MatMul()
     self.matmul_2_w = Parameter(Tensor(
         np.random.uniform(0, 1, (4096, 4096)).astype(np.float32)),
                                 name=None)
     self.add_3 = P.Add()
     self.add_3_bias = passthrough_w_0
     self.add_4 = P.Add()
     self.add_4_bias = passthrough_w_1
     self.add_5 = P.Add()
     self.add_5_bias = passthrough_w_2
     self.reshape_6 = P.Reshape()
     self.reshape_6_shape = tuple([batch_size, 512, 64, 64])
     self.reshape_7 = P.Reshape()
     self.reshape_7_shape = tuple([batch_size, 512, 64, 64])
     self.reshape_8 = P.Reshape()
     self.reshape_8_shape = tuple([batch_size, 512, 64, 64])
     self.transpose_9 = P.Transpose()
     self.transpose_10 = P.Transpose()
     self.transpose_11 = P.Transpose()
     self.matmul_12 = nn.MatMul()
     self.div_13 = P.Div()
     self.div_13_w = 8.0
     self.add_14 = P.Add()
     self.softmax_15 = nn.Softmax(axis=3)
     self.matmul_16 = nn.MatMul()
     self.transpose_17 = P.Transpose()
     self.matmul_18 = P.MatMul()
     self.matmul_18_weight = Parameter(Tensor(
         np.random.uniform(0, 1, (64, 64, 4096)).astype(np.float32)),
                                       name=None)
     self.add_19 = P.Add()
     self.add_19_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (4096, )).astype(np.float32)),
                                  name=None)
Example 19
 def __init__(self, sparse=False):
     super(SoftmaxCrossEntropyExpand, self).__init__()
     self.exp = P.Exp()
     self.reduce_sum = P.ReduceSum(keep_dims=True)
     self.onehot = P.OneHot()
     self.on_value = Tensor(1.0, mstype.float32)
     self.off_value = Tensor(0.0, mstype.float32)
     self.div = P.Div()
     self.log = P.Log()
     self.sum_cross_entropy = P.ReduceSum(keep_dims=False)
     self.mul = P.Mul()
     self.mul2 = P.Mul()
     self.cast = P.Cast()
     self.reduce_mean = P.ReduceMean(keep_dims=False)
     self.sparse = sparse
     self.reduce_max = P.ReduceMax(keep_dims=True)
     self.sub = P.Sub()
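
The declared ops compose into a numerically stable softmax cross-entropy. A construct sketch (assumed here; F is taken to be mindspore.ops.functional, as in the other snippets):

 def construct(self, logit, label):
     # subtract the per-row max before exp for numerical stability
     logit_max = self.reduce_max(logit, -1)
     exp = self.exp(self.sub(logit, logit_max))
     softmax_result = self.div(exp, self.reduce_sum(exp, -1))
     if self.sparse:
         label = self.onehot(label, F.shape(logit)[1],
                             self.on_value, self.off_value)
     loss = self.sum_cross_entropy(
         self.mul(self.log(softmax_result), label), -1)
     loss = self.mul2(F.scalar_to_array(-1.0), loss)
     return self.reduce_mean(loss, -1)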
Example 20
 def __init__(self):
     super(LayerNorm, self).__init__()
     self.reducemean = P.ReduceMean(keep_dims=True)
     self.sub = P.Sub()
     self.cast = P.Cast()
     self.cast_to = mstype.float32
     self.pow = P.Pow()
     self.pow_weight = 2.0
     self.add = P.Add()
     self.add_bias_0 = 9.999999960041972e-13
     self.sqrt = P.Sqrt()
     self.div = P.Div()
     self.mul = P.Mul()
     self.mul_weight = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                 name=None)
     self.add_bias_1 = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                 name=None)
Example 21
    def __init__(self,
                 num_bits=2,
                 compute_type=mstype.float32,
                 clip_value=1.0,
                 per_channel=False):
        self.num_bits = num_bits
        self.compute_type = compute_type
        self.clip_value = clip_value
        self.per_channel = per_channel

        self.clamp = C.clip_by_value
        self.abs = P.Abs()
        self.sum = P.ReduceSum()
        self.nelement = F.size
        self.div = P.Div()
        self.cast = P.Cast()
        self.max = P.ReduceMax()
        self.min = P.ReduceMin()
        self.floor = P.Floor()
Example 22
    def __init__(self,
                 num_bits=8,
                 compute_type=mstype.float32,
                 clip_value=1.0,
                 per_channel=False):
        super(QuantizeWeightCell, self).__init__()
        self.num_bits = num_bits
        self.compute_type = compute_type
        self.clip_value = clip_value
        self.per_channel = per_channel

        self.clamp = C.clip_by_value
        self.abs = P.Abs()
        self.sum = P.ReduceSum()
        self.nelement = F.size
        self.div = P.Div()
        self.cast = P.Cast()
        self.max = P.ReduceMax()
        self.min = P.ReduceMin()
        self.round = P.Round()
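
A hypothetical construct for a symmetric per-tensor weight quantizer built from these ops (a sketch under stated assumptions, not the cell's actual implementation; self.min, self.sum and self.cast would come into play for per-channel or asymmetric variants):

    def construct(self, weight):
        """sketch: clamp, scale to the integer grid, round, dequantize"""
        weight = self.clamp(weight, -self.clip_value, self.clip_value)
        max_abs = self.max(self.abs(weight))
        # num_bits=8 gives a [-127, 127] grid
        scale = self.div(max_abs, 2 ** (self.num_bits - 1) - 1)
        quantized = self.round(self.div(weight, scale))
        return quantized * scale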
Example 23
 def __init__(self):
     super(LayerNorm, self).__init__()
     self.reducemean_0 = P.ReduceMean(keep_dims=True)
     self.sub_1 = P.Sub()
     self.cast_2 = P.Cast()
     self.cast_2_to = mstype.float32
     self.pow_3 = P.Pow()
     self.pow_3_input_weight = 2.0
     self.reducemean_4 = P.ReduceMean(keep_dims=True)
     self.add_5 = P.Add()
     self.add_5_bias = 9.999999960041972e-13
     self.sqrt_6 = P.Sqrt()
     self.div_7 = P.Div()
     self.mul_8 = P.Mul()
     self.mul_8_w = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                              name=None)
     self.add_9 = P.Add()
     self.add_9_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                 name=None)
Example 24
 def __init__(self,
              bert_layer_norm_weight_shape,
              bert_layer_norm_bias_shape,
              eps=1e-12):
     """init function"""
     super(BertLayerNorm, self).__init__()
     self.reducemean = P.ReduceMean(keep_dims=True)
     self.sub = P.Sub()
     self.pow = P.Pow()
     self.add = P.Add()
     self.sqrt = P.Sqrt()
     self.div = P.Div()
     self.mul = P.Mul()
     self.variance_epsilon = eps
     self.bert_layer_norm_weight = Parameter(Tensor(
         np.random.uniform(0, 1, bert_layer_norm_weight_shape).astype(
             np.float32)),
                                             name=None)
     self.bert_layer_norm_bias = Parameter(Tensor(
         np.random.uniform(0, 1,
                           bert_layer_norm_bias_shape).astype(np.float32)),
                                           name=None)
Example 25
 def __init__(self, seq_len):
     super(MultiHeadAttn, self).__init__()
     self.matmul = nn.MatMul()
     self.matmul.to_float(mstype.float16)
     self.query = Parameter(Tensor(
         np.random.uniform(0, 1, (768, 768)).astype(np.float32)),
                            name=None)
     self.key = Parameter(Tensor(
         np.random.uniform(0, 1, (768, 768)).astype(np.float32)),
                          name=None)
     self.value = Parameter(Tensor(
         np.random.uniform(0, 1, (768, 768)).astype(np.float32)),
                            name=None)
     self.add = P.Add()
     self.query_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                 name=None)
     self.key_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                               name=None)
     self.value_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                 name=None)
     self.reshape = P.Reshape()
     self.to_shape_0 = tuple([BATCH_SIZE, seq_len, 12, 64])
     self.transpose = P.Transpose()
     self.div = P.Div()
     self.div_w = 8.0
     self.softmax = nn.Softmax(axis=3)
     self.to_shape_1 = tuple([BATCH_SIZE, seq_len, 768])
     self.context_weight = Parameter(Tensor(
         np.random.uniform(0, 1, (768, 768)).astype(np.float32)),
                                     name=None)
     self.context_bias = Parameter(Tensor(
         np.random.uniform(0, 1, (768, )).astype(np.float32)),
                                   name=None)
Example 26
 def __init__(self,
              temperature=0.07,
              contrast_mode='all',
              base_temperature=0.07):
     super(SupConLoss, self).__init__()
     self.temperature = temperature
     self.contrast_mode = contrast_mode
     self.base_temperature = base_temperature
     self.normalize = P.L2Normalize(axis=2)
     self.eye = P.Eye()
     self.unbind = P.Unstack(axis=1)
     self.cat = P.Concat(axis=0)
     self.matmul = P.MatMul()
     self.div = P.Div()
     self.transpose = P.Transpose()
     self.maxes = P.ArgMaxWithValue(axis=1, keep_dims=True)
     self.tile = P.Tile()
     self.scatter = P.ScatterNd()
     self.oneslike = P.OnesLike()
     self.exp = P.Exp()
     self.sum = P.ReduceSum(keep_dims=True)
     self.log = P.Log()
     self.reshape = P.Reshape()
     self.mean = P.ReduceMean()
Example 27
    def __init__(self,
                 norm_bound=1.0,
                 initial_noise_multiplier=1.5,
                 noise_decay_rate=6e-4,
                 decay_policy='Time',
                 seed=0):
        super(AdaGaussianRandom, self).__init__()
        norm_bound = check_value_positive('norm_bound', norm_bound)
        initial_noise_multiplier = check_value_positive(
            'initial_noise_multiplier', initial_noise_multiplier)
        self._norm_bound = Tensor(norm_bound, mstype.float32)

        initial_noise_multiplier = Tensor(initial_noise_multiplier,
                                          mstype.float32)
        self._initial_noise_multiplier = Parameter(
            initial_noise_multiplier, name='initial_noise_multiplier')
        self._noise_multiplier = Parameter(initial_noise_multiplier,
                                           name='noise_multiplier')
        self._mean = Tensor(0, mstype.float32)
        noise_decay_rate = check_param_type('noise_decay_rate',
                                            noise_decay_rate, float)
        check_param_in_range('noise_decay_rate', noise_decay_rate, 0.0, 1.0)
        self._noise_decay_rate = Tensor(noise_decay_rate, mstype.float32)
        if decay_policy not in ['Time', 'Step']:
            raise NameError(
                "The decay_policy must be in ['Time', 'Step'], but "
                "get {}".format(decay_policy))
        self._decay_policy = decay_policy
        self._sub = P.Sub()
        self._mul = P.Mul()
        self._add = P.TensorAdd()
        self._div = P.Div()
        self._dtype = mstype.float32
        self._normal = P.Normal(seed=seed)
        self._assign = P.Assign()
        self._one = Tensor(1, self._dtype)
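
The init prepares an adaptive Gaussian mechanism; a sketch of the noise-generation step (assumed here; the decay ops above update the multiplier between training steps):

    def construct(self, gradients):
        """sketch: draw Gaussian noise scaled by norm_bound * multiplier"""
        shape = P.Shape()(gradients)
        stddev = self._mul(self._norm_bound, self._noise_multiplier)
        return self._normal(shape, self._mean, stddev)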
Example 28
    def construct(self, data, label, sens=None):
        """
        construct a compute flow.
        """
        init = False
        if not self.gpu_target:
            # init overflow buffer
            init = self.alloc_status()
            # clear overflow buffer
            self.clear_status(init)

        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens

        # DP clip
        weights = self.weights
        record_datas = self._split(data)
        record_labels = self._split(label)
        # first index
        loss = self.network(record_datas[0], record_labels[0])
        scaling_sens_filled = C.ones_like(loss) * F.cast(
            scaling_sens, F.dtype(loss))
        record_grad = self.grad(self.network,
                                weights)(record_datas[0], record_labels[0],
                                         scaling_sens_filled)

        beta = self._zero
        square_sum = self._zero
        for grad in record_grad:
            square_sum = self._add(square_sum,
                                   self._reduce_sum(self._square_all(grad)))
        norm_grad = self._sqrt(square_sum)
        beta = self._add(
            beta,
            self._cast(self._less(norm_grad, self._norm_bound),
                       mstype.float32))
        record_grad = self._clip_by_global_norm(record_grad,
                                                GRADIENT_CLIP_TYPE,
                                                self._norm_bound)
        grads = record_grad
        total_loss = loss
        for i in range(1, self._micro_batches):
            loss = self.network(record_datas[i], record_labels[i])
            scaling_sens_filled = C.ones_like(loss) * F.cast(
                scaling_sens, F.dtype(loss))
            record_grad = self.grad(self.network,
                                    weights)(record_datas[i], record_labels[i],
                                             scaling_sens_filled)

            square_sum = self._zero
            for grad in record_grad:
                square_sum = self._add(
                    square_sum, self._reduce_sum(self._square_all(grad)))
            norm_grad = self._sqrt(square_sum)
            beta = self._add(
                beta,
                self._cast(self._less(norm_grad, self._norm_bound),
                           mstype.float32))

            record_grad = self._clip_by_global_norm(record_grad,
                                                    GRADIENT_CLIP_TYPE,
                                                    self._norm_bound)
            grads = self._tuple_add(grads, record_grad)
            total_loss = P.TensorAdd()(total_loss, loss)
        loss = P.Div()(total_loss, self._micro_float)
        beta = self._div(beta, self._micro_batches)

        if self._noise_mech is not None:
            grad_noise_tuple = ()
            for grad_item in grads:
                grad_noise = self._mech(grad_item)
                grad_noise_tuple = grad_noise_tuple + (grad_noise, )
            grads = self._tuple_add(grads, grad_noise_tuple)
            grads = self._hyper_map(F.partial(_grad_scale, self._micro_float),
                                    grads)
            # update mech parameters

            if self._noise_mech_param_updater is not None:
                multiplier = self._noise_mech_param_updater()
                loss = F.depend(loss, multiplier)

        grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        # get the overflow buffer
        if not self.gpu_target:
            self.get_status(init)
            # sum overflow buffer elements, 0:not overflow , >0:overflow
            flag_sum = self.reduce_sum(init, (0, ))
        else:
            flag_sum = self.hyper_map(F.partial(_grad_overflow), grads)
            flag_sum = self.addn(flag_sum)
            # convert flag_sum to scalar
            flag_sum = self.reshape(flag_sum, (()))
        if self.is_distributed:
            # sum overflow flag over devices
            flag_reduce = self.allreduce(flag_sum)
            cond = self.less_equal(self.base, flag_reduce)
        else:
            cond = self.less_equal(self.base, flag_sum)
        overflow = cond
        if sens is None:
            overflow = self.loss_scaling_manager(self.loss_scale, cond)
        # if there is no overflow, do optimize
        if overflow:
            opt = False
        else:
            opt = self.optimizer(grads)
        ret = (loss, cond, scaling_sens)

        if self._clip_mech is not None:
            next_norm_bound = self._clip_mech(beta, self._norm_bound)
            self._assign(self._norm_bound, next_norm_bound)
        return F.depend(ret, opt)
Example 29
    def __init__(self,
                 network,
                 optimizer,
                 norm_bound=1.0,
                 sens=1.0,
                 micro_batches=None,
                 noise_mech=None,
                 clip_mech=None):
        super(_TrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.sens = sens
        self.reducer_flag = False
        self.grad_reducer = None
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)

        # dp params
        if micro_batches is None:
            msg = 'micro_batches must be given in differential privacy training, but got value: {}'.format(
                micro_batches)
            LOGGER.error(TAG, msg)
            raise ValueError(msg)
        self._micro_batches = micro_batches
        self._norm_bound = norm_bound
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._noise_mech = noise_mech
        self._clip_mech = clip_mech
        self._tuple_add = _TupleAdd()
        self._add = P.TensorAdd()
        self._norm = nn.Norm()
        self._hyper_map = C.HyperMap()
        self._zero = Tensor(0, mstype.float32)
        self._assign = P.Assign()
        self._div = P.Div()
        self._sqrt = P.Sqrt()
        self._reduce_sum = P.ReduceSum()
        self._square_all = P.Square()
        self._less = P.Less()
        self._cast = P.Cast()

        self._micro_float = Tensor(micro_batches, mstype.float32)

        self._noise_mech_param_updater = None
        if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
            self._noise_mech_param_updater = _MechanismsParamsUpdater(
                decay_policy=self._noise_mech._decay_policy,
                decay_rate=self._noise_mech._noise_decay_rate,
                cur_noise_multiplier=self._noise_mech._noise_multiplier,
                init_noise_multiplier=self._noise_mech.
                _initial_noise_multiplier)
Example 30
    def __init__(self,
                 network,
                 optimizer,
                 scale_update_cell=None,
                 micro_batches=None,
                 norm_bound=1.0,
                 noise_mech=None,
                 clip_mech=None):
        super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.hyper_map = C.HyperMap()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = NPUAllocFloatStatus()
            self.get_status = NPUGetFloatStatus()
            self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = LessEqual()
        self.depend_parameter_use = ControlDepend(depend_mode=1)
        self.allreduce = P.AllReduce()
        self.parallel_mode = _get_parallel_mode()
        self.grad_reducer = F.identity
        self.reducer_flag = self.parallel_mode in [
            ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)

        # dp params
        self._micro_batches = micro_batches
        self._norm_bound = norm_bound
        self._split = P.Split(0, self._micro_batches)
        self._clip_by_global_norm = _ClipGradients()
        self._noise_mech = noise_mech
        self._clip_mech = clip_mech
        self._add = P.TensorAdd()
        self._norm = nn.Norm()
        self._tuple_add = _TupleAdd()
        self._hyper_map = C.HyperMap()
        self._micro_float = Tensor(micro_batches, mstype.float32)
        self._zero = Tensor(0, mstype.float32)
        self._assign = P.Assign()
        self._div = P.Div()
        self._sqrt = P.Sqrt()
        self._reduce_sum = P.ReduceSum()
        self._square_all = P.Square()
        self._less = P.Less()
        self._cast = P.Cast()

        self._noise_mech_param_updater = None
        if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
            self._noise_mech_param_updater = _MechanismsParamsUpdater(
                decay_policy=self._noise_mech._decay_policy,
                decay_rate=self._noise_mech._noise_decay_rate,
                cur_noise_multiplier=self._noise_mech._noise_multiplier,
                init_noise_multiplier=self._noise_mech.
                _initial_noise_multiplier)