def __init__(self, decay_policy, decay_rate, cur_noise_multiplier, init_noise_multiplier):
    """Keep the decay settings and create the operators used by the updater.

    Args:
        decay_policy: Stored unchanged as ``self._decay_policy``.
        decay_rate: Stored unchanged as ``self._decay_rate``.
        cur_noise_multiplier: Stored unchanged as ``self._cur_noise_multiplier``.
        init_noise_multiplier: Stored unchanged as ``self._init_noise_multiplier``.
    """
    super(_MechanismsParamsUpdater, self).__init__()

    # Caller-supplied state, assigned in the same relative order as before.
    self._decay_policy = decay_policy
    self._decay_rate = decay_rate
    self._cur_noise_multiplier = cur_noise_multiplier
    self._init_noise_multiplier = init_noise_multiplier

    # Scalar constant 1 as a float32 tensor.
    self._one = Tensor(1, mstype.float32)

    # Element-wise arithmetic primitives.
    self._add = P.Add()
    self._sub = P.Sub()
    self._mul = P.Mul()
    self._div = P.Div()
    self._exp = P.Exp()

    # In-place assignment primitive.
    self._assign = P.Assign()
def __init__(self, attention_mask_shape, has_attention_mask=False, dtype=mstype.float32):
    """Create the constants and operators for the attention-mask cell.

    Args:
        attention_mask_shape: Shape of the all-ones mask tensor built here.
        has_attention_mask: When true, the mask-related operators are created.
        dtype: Data type of the ``multiply_data`` constant.
    """
    super(BertAttentionMaskBackward, self).__init__()
    self.has_attention_mask = has_attention_mask

    # Single-element constant holding -1000.0 in the requested dtype.
    self.multiply_data = Tensor([-1000.0, ], dtype=dtype)
    self.multiply = P.Mul()

    # The mask itself is always float32, independent of ``dtype``.
    ones_mask = np.ones(shape=attention_mask_shape).astype(np.float32)
    self.attention_mask = Tensor(ones_mask)

    if not self.has_attention_mask:
        return

    # Operators that are only created when a mask is in use.
    self.expand_dims = P.ExpandDims()
    self.sub = P.Sub()
    self.add = P.TensorAdd()
    self.cast = P.Cast()
    self.get_dtype = P.DType()
def __init__(self):
    """Create the activation cells, arithmetic operators and three parameters.

    The parameters ``param_a``, ``param_b`` and ``param_c`` are one-element
    float32 tensors filled with 5, 2 and 16 and named 'a', 'b' and 'c'.
    """
    super().__init__()

    # Activation cells.
    self.relu = nn.ReLU()
    self.softmax = nn.Softmax()

    # Arithmetic and assignment primitives.
    self.mul = P.Mul()
    self.add = P.Add()
    self.sub = P.Sub()
    self.div = P.Div()
    self.assign = P.Assign()

    # Parameters are registered in the order a, b, c.
    for short_name, fill_value in (('a', 5), ('b', 2), ('c', 16)):
        data = np.full((1, ), fill_value, dtype=np.float32)
        setattr(self, 'param_' + short_name, Parameter(Tensor(data), name=short_name))
def __init__(self):
    """Instantiate every primitive this cell stores as an attribute."""
    super(NpuFloatNet, self).__init__()

    # NPU float-status primitives.
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_status = P.NPUClearFloatStatus()

    # Arithmetic and reduction primitives.
    self.mul = P.Mul()
    self.sub = P.Sub()
    self.neg = P.Neg()
    self.reduce_sum = P.ReduceSum(keep_dims=True)

    # Comparison and selection primitives.
    self.less = P.Less()
    self.select = P.Select()

    # Shape, type and fill helpers.
    self.fill = P.Fill()
    self.shape_op = P.Shape()
    self.cast = P.Cast()
    self.dtype = P.DType()
def __init__(self, mul_7_w_shape, add_8_bias_shape):
    """Create the operators and randomly initialised parameters.

    Args:
        mul_7_w_shape: Shape of the ``mul_7_w`` parameter.
        add_8_bias_shape: Shape of the ``add_8_bias`` parameter.
    """
    super(LayerNorm, self).__init__()

    self.reducemean_0 = P.ReduceMean(keep_dims=True)
    self.sub_1 = P.Sub()

    self.pow_2 = P.Pow()
    self.pow_2_input_weight = 2.0

    self.reducemean_3 = P.ReduceMean(keep_dims=True)

    self.add_4 = P.Add()
    self.add_4_bias = 9.999999960041972e-13

    self.sqrt_5 = P.Sqrt()
    self.div_6 = P.Div()

    # The weight is drawn before the bias so the RNG is consumed in the same order.
    self.mul_7 = P.Mul()
    weight_values = np.random.uniform(0, 1, mul_7_w_shape).astype(np.float32)
    self.mul_7_w = Parameter(Tensor(weight_values), name=None)

    self.add_8 = P.Add()
    bias_values = np.random.uniform(0, 1, add_8_bias_shape).astype(np.float32)
    self.add_8_bias = Parameter(Tensor(bias_values), name=None)
def construct(self, a, b, x):
    """Combine ``a``, ``b`` and ``x`` through three data-dependent branches.

    Each branch instantiates its primitive inline. The result is
    ``product + shifted + x`` where ``product = scaled * shifted``.
    """
    # Branch 1: add or subtract depending on a < b.
    if a < b:
        merged = P.TensorAdd()(a, b)
    else:
        merged = P.Sub()(a, b)

    # Branch 2: multiply or divide by b depending on merged == x.
    if merged == x:
        scaled = P.Mul()(merged, b)
    else:
        scaled = P.RealDiv()(merged, b)

    # Branch 3: add b or x depending on b == x.
    if b == x:
        shifted = P.TensorAdd()(scaled, b)
    else:
        shifted = P.TensorAdd()(scaled, x)

    product = scaled * shifted
    return product + shifted + x
def __init__(self, axis=-1, depth=1, on_value=1.0, off_value=0.0, strategy=None):
    """Create the one-hot, transpose and subtract operators with strategies.

    Args:
        axis: Accepted but not used in this constructor.
        depth: Stored as ``self.depth``.
        on_value: Wrapped in a float32 tensor as ``self.on_value``.
        off_value: Wrapped in a float32 tensor as ``self.off_value``.
        strategy: Strategy for the one-hot operator; its first element, wrapped
            in a one-element tuple, is used for the transpose operator.
    """
    super(Onehot, self).__init__()

    # With no strategy given, the transpose operator gets None as well.
    trans_stra = (strategy[0], ) if strategy else None

    self.onehot = P.OneHot().set_strategy(strategy=strategy)
    self.depth = depth
    self.on_value = Tensor(on_value, ms.float32)
    self.off_value = Tensor(off_value, ms.float32)
    self.transpose = P.Transpose().set_strategy(strategy=trans_stra)

    sub_strategy = ((1, 1), (1, 1))
    self.sub = P.Sub().set_strategy(strategy=sub_strategy)
def __init__(self, passthrough_w_0, passthrough_w_1):
    """Create the operators and store the externally supplied weights.

    Args:
        passthrough_w_0: Stored unchanged as ``self.mul_7_w``.
        passthrough_w_1: Stored unchanged as ``self.add_8_bias``.
    """
    super(LayerNorm, self).__init__()

    self.reducemean_0 = P.ReduceMean(keep_dims=True)
    self.sub_1 = P.Sub()

    self.pow_2 = P.Pow()
    self.pow_2_input_weight = 2.0

    self.reducemean_3 = P.ReduceMean(keep_dims=True)

    self.add_4 = P.Add()
    self.add_4_bias = 9.999999960041972e-13

    self.sqrt_5 = P.Sqrt()
    self.div_6 = P.Div()

    # The scale and shift values come from the caller instead of being created here.
    self.mul_7 = P.Mul()
    self.mul_7_w = passthrough_w_0

    self.add_8 = P.Add()
    self.add_8_bias = passthrough_w_1
def __init__(self, sparse=False):
    """Create the operators and constants for the expanded loss cell.

    Args:
        sparse: Stored unchanged as ``self.sparse``.
    """
    super(SoftmaxCrossEntropyExpand, self).__init__()
    self.sparse = sparse

    # One-hot operator and its float32 on/off values.
    self.onehot = P.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.0, mstype.float32)

    # Reductions.
    self.reduce_max = P.ReduceMax(keep_dims=True)
    self.reduce_sum = P.ReduceSum(keep_dims=True)
    self.sum_cross_entropy = P.ReduceSum(keep_dims=False)
    self.reduce_mean = P.ReduceMean(keep_dims=False)

    # Element-wise operators.
    self.exp = P.Exp()
    self.log = P.Log()
    self.sub = P.Sub()
    self.div = P.Div()
    self.mul = P.Mul()
    self.mul2 = P.Mul()
    self.cast = P.Cast()
def test_nobroadcast_fp16():
    """Compare binary element-wise GPU operators with NumPy on float16 inputs.

    Both operands are (10, 20) arrays, so no broadcasting takes place.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')

    np.random.seed(42)
    x1_np = np.random.rand(10, 20).astype(np.float16)
    x2_np = np.random.rand(10, 20).astype(np.float16)

    # (primitive class, NumPy reference) pairs, checked in this order.
    cases = (
        (P.Minimum, np.minimum),
        (P.Maximum, np.maximum),
        (P.Greater, lambda lhs, rhs: lhs > rhs),
        (P.Less, lambda lhs, rhs: lhs < rhs),
        (P.Pow, np.power),
        (P.RealDiv, lambda lhs, rhs: lhs / rhs),
        (P.Mul, lambda lhs, rhs: lhs * rhs),
        (P.Sub, lambda lhs, rhs: lhs - rhs),
        (P.DivNoNan, lambda lhs, rhs: lhs / rhs),
    )
    for primitive_cls, reference in cases:
        output_ms = primitive_cls()(Tensor(x1_np), Tensor(x2_np))
        output_np = reference(x1_np, x2_np)
        assert np.allclose(output_ms.asnumpy(), output_np)

    # With an all-zero divisor, DivNoNan is expected to return zeros.
    x2_np_zero = np.zeros_like(x2_np)
    output_ms = P.DivNoNan()(Tensor(x1_np), Tensor(x2_np_zero))
    assert np.allclose(output_ms.asnumpy(), x2_np_zero)
def test_nobroadcast():
    """Compare binary element-wise GPU operators with NumPy on same-shape inputs.

    Most operators are checked on float32 data; Greater and Less are checked
    on int32 data as well.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')

    np.random.seed(42)
    x1_np = np.random.rand(10, 20).astype(np.float32)
    x2_np = np.random.rand(10, 20).astype(np.float32)
    x1_np_int32 = np.random.randint(0, 100, (10, 20)).astype(np.int32)
    x2_np_int32 = np.random.randint(0, 100, (10, 20)).astype(np.int32)

    float_pair = (x1_np, x2_np)
    int_pair = (x1_np_int32, x2_np_int32)

    def greater(lhs, rhs):
        return lhs > rhs

    def less(lhs, rhs):
        return lhs < rhs

    # (primitive class, NumPy reference, operands), checked in this order.
    cases = (
        (P.Minimum, np.minimum, float_pair),
        (P.Maximum, np.maximum, float_pair),
        (P.Greater, greater, float_pair),
        (P.Greater, greater, int_pair),
        (P.Less, less, float_pair),
        (P.Less, less, int_pair),
        (P.Pow, np.power, float_pair),
        (P.RealDiv, lambda lhs, rhs: lhs / rhs, float_pair),
        (P.Mul, lambda lhs, rhs: lhs * rhs, float_pair),
        (P.Sub, lambda lhs, rhs: lhs - rhs, float_pair),
    )
    for primitive_cls, reference, (lhs_np, rhs_np) in cases:
        output_ms = primitive_cls()(Tensor(lhs_np), Tensor(rhs_np))
        output_np = reference(lhs_np, rhs_np)
        assert np.allclose(output_ms.asnumpy(), output_np)
def __init__(self, channel=1, w=0.25):
    """Build the slope parameter and the operators of the PReLU cell.

    Args:
        channel: Length of the slope parameter.
        w: Initial slope. A ``float`` or ``np.float32`` is expanded to a
            float32 vector of length ``channel``; a list is wrapped in a
            Tensor; a Tensor is used as given.

    Raises:
        TypeError: If ``w`` is still not a Tensor after the conversions above.
    """
    super(PReLU, self).__init__()

    # Normalise every accepted form of ``w`` to a Tensor.
    if isinstance(w, (np.float32, float)):
        w = Tensor(np.full((channel, ), w, dtype=np.float32))
    elif isinstance(w, list):
        w = Tensor(w)

    if not isinstance(w, Tensor):
        raise TypeError("w only support np.float32, float or Tensor type.")

    self.w = Parameter(initializer(w, [channel, ]), name='a')

    self.prelu = P.PReLU()
    self.relu = P.ReLU().set_strategy(((1, ), ))

    # Sub and AssignSub use the same two-input strategy.
    binary_strategy = ((1, ), (1, ))
    self.sub = P.Sub().set_strategy(binary_strategy)
    self.assign_sub = P.AssignSub().set_strategy(binary_strategy)
def test_SubGrad():
    """ test_SubGrad """
    input_x = Tensor(np.array([[2, 2]]))
    input_y = Tensor(np.array([[2, 2], [2, 2]]))
    sub = P.Sub()

    def fn(x, y):
        return sub(x, y)

    # The forward result only provides the shape of the sensitivity tensor.
    forward_out = fn(input_x, input_y)
    gfn = grad_all_with_sens(fn)
    sens = Tensor(np.ones_like(forward_out.asnumpy()))
    gout = gfn(input_x, input_y, sens)

    # dout is reduce-summed to the shape of x, so each entry of dx is 2.
    expect_dx = np.ones([1, 2]).astype(np.int32) * 2
    # The gradient with respect to y is -1 everywhere.
    expect_dy = np.ones([2, 2]).astype(np.int32) * (-1)

    assert np.array_equal(gout[0].asnumpy(), expect_dx)
    assert np.array_equal(gout[1].asnumpy(), expect_dy)
def __init__(self):
    """Create the operators and two randomly initialised (768,) parameters."""
    super(LayerNorm, self).__init__()

    self.reducemean = P.ReduceMean(keep_dims=True)
    self.sub = P.Sub()

    self.cast = P.Cast()
    self.cast_to = mstype.float32

    self.pow = P.Pow()
    self.pow_weight = 2.0

    self.add = P.Add()
    self.add_bias_0 = 9.999999960041972e-13

    self.sqrt = P.Sqrt()
    self.div = P.Div()
    self.mul = P.Mul()

    # The weight is drawn before the bias so the RNG is consumed in the same order.
    weight_values = np.random.uniform(0, 1, (768, )).astype(np.float32)
    self.mul_weight = Parameter(Tensor(weight_values), name=None)

    bias_values = np.random.uniform(0, 1, (768, )).astype(np.float32)
    self.add_bias_1 = Parameter(Tensor(bias_values), name=None)
def construct(self, a, b, x):
    """Combine ``a``, ``b`` and ``x`` through three constant-condition branches.

    Every ``if`` compares two integer literals; the conditions are kept
    verbatim. The result is ``lhs * rhs + rhs + x``.
    """
    tensor_add = P.TensorAdd()
    tensor_sub = P.Sub()
    tensor_mul = P.Mul()
    tensor_div = P.RealDiv()

    if 2 < 12:
        lhs = tensor_add(a, b)
    else:
        lhs = tensor_sub(a, b)

    if 2 > 1:
        lhs = tensor_mul(lhs, b)
    else:
        lhs = tensor_div(lhs, b)

    if 2 == 1:
        rhs = tensor_add(lhs, b)
    else:
        rhs = tensor_add(lhs, x)

    lhs = lhs * rhs
    return lhs + rhs + x
def construct(self, a, b, x):
    """Combine ``a``, ``b`` and ``x`` through mixed tensor and constant branches.

    The first and third conditions compare the inputs; the second compares
    two integer literals and is kept verbatim. The result is
    ``lhs * rhs + rhs + x``.
    """
    tensor_add = P.Add()
    tensor_sub = P.Sub()
    tensor_mul = P.Mul()
    tensor_div = P.RealDiv()

    # Data-dependent branch.
    if a < b:
        lhs = tensor_add(a, b)
    else:
        lhs = tensor_sub(a, b)

    # Constant-condition branch.
    if 2 > 1:
        lhs = tensor_mul(lhs, b)
    else:
        lhs = tensor_div(lhs, b)

    # Data-dependent branch.
    if b == x:
        rhs = tensor_add(lhs, b)
    else:
        rhs = tensor_add(lhs, x)

    lhs = lhs * rhs
    return lhs + rhs + x
def __init__(self, sparse=False, stra_list=None):
    """Create the loss operators and attach a shard strategy to each.

    Args:
        sparse: Stored unchanged as ``self.sparse``.
        stra_list: Strategies indexed 1..10, one per sharded operator
            (index 0 is not read). ``None`` or any list shorter than 11
            entries is replaced wholesale by eleven ``None`` values.
    """
    super(SoftmaxCrossEntropyExpand, self).__init__()

    # A short list is not padded: all supplied strategies are discarded.
    if stra_list is None or len(stra_list) < 11:
        stra_list = [None] * 11

    self.sparse = sparse

    # Operators created without a shard strategy.
    self.exp = P.Exp()
    self.cast = P.Cast()

    # Float32 on/off values for the one-hot operator.
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.0, mstype.float32)

    # Sharded operators, listed by strategy index.
    self.reduce_sum = P.ReduceSum(keep_dims=True).shard(strategy=stra_list[1])
    self.onehot = P.OneHot().shard(strategy=stra_list[2])
    self.div = P.Div().shard(strategy=stra_list[3])
    self.log = P.Log().shard(strategy=stra_list[4])
    self.sum_cross_entropy = P.ReduceSum(keep_dims=False).shard(strategy=stra_list[5])
    self.mul = P.Mul().shard(strategy=stra_list[6])
    self.mul2 = P.Mul().shard(strategy=stra_list[7])
    self.reduce_mean = P.ReduceMean(keep_dims=False).shard(strategy=stra_list[8])
    self.reduce_max = P.ReduceMax(keep_dims=True).shard(strategy=stra_list[9])
    self.sub = P.Sub().shard(strategy=stra_list[10])
def __init__(self):
    """Create the numbered operators and two random (768,) parameters."""
    super(LayerNorm, self).__init__()

    self.reducemean_0 = P.ReduceMean(keep_dims=True)
    self.sub_1 = P.Sub()

    self.cast_2 = P.Cast()
    self.cast_2_to = mstype.float32

    self.pow_3 = P.Pow()
    self.pow_3_input_weight = 2.0

    self.reducemean_4 = P.ReduceMean(keep_dims=True)

    self.add_5 = P.Add()
    self.add_5_bias = 9.999999960041972e-13

    self.sqrt_6 = P.Sqrt()
    self.div_7 = P.Div()

    # The weight is drawn before the bias so the RNG is consumed in the same order.
    self.mul_8 = P.Mul()
    weight_values = np.random.uniform(0, 1, (768, )).astype(np.float32)
    self.mul_8_w = Parameter(Tensor(weight_values), name=None)

    self.add_9 = P.Add()
    bias_values = np.random.uniform(0, 1, (768, )).astype(np.float32)
    self.add_9_bias = Parameter(Tensor(bias_values), name=None)
def _attn(self, query, key, value, attention_mask):
    """
    Get the weighted score along the seq_length

    Inputs:
        query: the query matrix
        key: the key matrix
        value: the value matrix
        attention_mask: the attention mask matrix with shape (batch_size,
        1, seq_length, seq_length)
    Returns:
        weighted_values: Tensor, the weighted sum scores
    """
    # Without post-scaling, query and key are each divided by the coefficient first.
    if not self.scale:
        query = query / F.cast(self.coeff, F.dtype(query))
        key = key / F.cast(self.coeff, F.dtype(key))

    score = self.batch_matmul(query, key)
    if self.scale:
        score = score / P.Cast()(self.scale_factor, P.DType()(score))

    # The mask arithmetic runs in float32; the original dtype is restored afterwards.
    ori_dtype = P.DType()(score)
    score = P.Cast()(score, mstype.float32)

    # (1 - mask) * multiply_data is added to the scores.
    ones = P.Cast()(F.tuple_to_array((1.0, )), P.DType()(score))
    mask = P.Cast()(attention_mask, P.DType()(score))
    inverted_mask = P.Sub()(ones, mask)
    adder = P.Mul()(inverted_mask, self.multiply_data)
    attention_scores = P.Cast()(adder + score, ori_dtype)

    # Softmax is applied to a 2-D view, then the original shape is restored.
    shape = F.shape(attention_scores)
    flat_scores = F.reshape(attention_scores, (-1, shape[-1]))
    flat_probs = nn.Softmax()(flat_scores)
    attention_probs = self.prob_dropout(F.reshape(flat_probs, shape))

    return self.batch_matmul(attention_probs, value)
def __init__(self, bert_layer_norm_weight_shape, bert_layer_norm_bias_shape, eps=1e-12):
    """Create the operators and randomly initialised weight and bias.

    Args:
        bert_layer_norm_weight_shape: Shape of the weight parameter.
        bert_layer_norm_bias_shape: Shape of the bias parameter.
        eps: Stored as ``self.variance_epsilon``.
    """
    super(BertLayerNorm, self).__init__()
    self.variance_epsilon = eps

    # Operators.
    self.reducemean = P.ReduceMean(keep_dims=True)
    self.sub = P.Sub()
    self.pow = P.Pow()
    self.add = P.Add()
    self.sqrt = P.Sqrt()
    self.div = P.Div()
    self.mul = P.Mul()

    # The weight is drawn before the bias so the RNG is consumed in the same order.
    weight_values = np.random.uniform(0, 1, bert_layer_norm_weight_shape).astype(np.float32)
    self.bert_layer_norm_weight = Parameter(Tensor(weight_values), name=None)

    bias_values = np.random.uniform(0, 1, bert_layer_norm_bias_shape).astype(np.float32)
    self.bert_layer_norm_bias = Parameter(Tensor(bias_values), name=None)
def __init__(self):
    """Create the operators, random parameters and sub-cells of the model.

    Parameters and sub-cells are created in the same order as before, so
    NumPy's global RNG is consumed identically.
    """
    super(ModelTwoHop, self).__init__()

    def random_parameter(shape):
        # Uniform [0, 1) float32 data wrapped as an unnamed Parameter.
        values = np.random.uniform(0, 1, shape).astype(np.float32)
        return Parameter(Tensor(values), name=None)

    self.expanddims_0 = P.ExpandDims()
    self.expanddims_0_axis = 1
    self.expanddims_3 = P.ExpandDims()
    self.expanddims_3_axis = 2

    self.cast_5 = P.Cast()
    self.cast_5_to = mstype.float32

    self.sub_7 = P.Sub()
    self.sub_7_bias = 1.0
    self.mul_9 = P.Mul()
    self.mul_9_w = -10000.0

    # Two gather tables of shapes (30522, 768) and (2, 768).
    self.gather_1_input_weight = random_parameter((30522, 768))
    self.gather_1_axis = 0
    self.gather_1 = P.Gather()

    self.gather_2_input_weight = random_parameter((2, 768))
    self.gather_2_axis = 0
    self.gather_2 = P.Gather()

    self.add_4 = P.Add()
    self.add_6 = P.Add()
    self.add_6_bias = random_parameter((1, 448, 768))

    # Sub-cells.
    self.layernorm1_0 = LayerNorm()
    self.module50_0 = Encoder1_4()
    self.module50_1 = Encoder1_4()
    self.module50_2 = Encoder1_4()

    # Final gather uses the constant index 0 along axis 1.
    self.gather_643_input_weight = Tensor(np.array(0))
    self.gather_643_axis = 1
    self.gather_643 = P.Gather()

    self.dense_644 = nn.Dense(in_channels=768, out_channels=768, has_bias=True)
    self.tanh_645 = nn.Tanh()
def test_broadcast_fp16():
    """Compare broadcasting binary GPU operators with NumPy on float16 inputs.

    The operands have shapes (3, 1, 5, 1) and (1, 4, 1, 6), so every
    operator has to broadcast both inputs.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target='GPU')

    # Seed the RNG, as the sibling no-broadcast tests do, so that a failing
    # run can be reproduced with the same inputs.
    np.random.seed(42)
    x1_np = np.random.rand(3, 1, 5, 1).astype(np.float16)
    x2_np = np.random.rand(1, 4, 1, 6).astype(np.float16)

    # (primitive class, NumPy reference) pairs, checked in this order.
    cases = (
        (P.Minimum, np.minimum),
        (P.Maximum, np.maximum),
        (P.Greater, lambda lhs, rhs: lhs > rhs),
        (P.Less, lambda lhs, rhs: lhs < rhs),
        (P.Pow, np.power),
        (P.RealDiv, lambda lhs, rhs: lhs / rhs),
        (P.Mul, lambda lhs, rhs: lhs * rhs),
        (P.Sub, lambda lhs, rhs: lhs - rhs),
    )
    for primitive_cls, reference in cases:
        output_ms = primitive_cls()(Tensor(x1_np), Tensor(x2_np))
        output_np = reference(x1_np, x2_np)
        assert np.allclose(output_ms.asnumpy(), output_np)
def __init__(self, norm_bound=1.0, initial_noise_multiplier=1.5,
             noise_decay_rate=6e-4, decay_policy='Time', seed=0):
    """Validate the arguments and create the tensors, parameters and operators.

    Args:
        norm_bound: Checked with ``check_value_positive`` and stored as a
            float32 tensor.
        initial_noise_multiplier: Checked with ``check_value_positive``; used
            to build both noise-multiplier parameters.
        noise_decay_rate: Checked to be a float and passed to
            ``check_param_in_range`` with bounds 0.0 and 1.0.
        decay_policy: Must be 'Time' or 'Step'.
        seed: Passed to the ``P.Normal`` primitive.

    Raises:
        NameError: If ``decay_policy`` is neither 'Time' nor 'Step'.
    """
    super(AdaGaussianRandom, self).__init__()

    # Validation and assignment happen in the same order as before, so the
    # first invalid argument reported is unchanged.
    norm_bound = check_value_positive('norm_bound', norm_bound)
    initial_noise_multiplier = check_value_positive('initial_noise_multiplier',
                                                    initial_noise_multiplier)
    self._norm_bound = Tensor(norm_bound, mstype.float32)

    # Both parameters are built from the same tensor object.
    multiplier_tensor = Tensor(initial_noise_multiplier, mstype.float32)
    self._initial_noise_multiplier = Parameter(multiplier_tensor,
                                               name='initial_noise_multiplier')
    self._noise_multiplier = Parameter(multiplier_tensor, name='noise_multiplier')
    self._mean = Tensor(0, mstype.float32)

    noise_decay_rate = check_param_type('noise_decay_rate', noise_decay_rate, float)
    check_param_in_range('noise_decay_rate', noise_decay_rate, 0.0, 1.0)
    self._noise_decay_rate = Tensor(noise_decay_rate, mstype.float32)

    if decay_policy not in ['Time', 'Step']:
        message = "The decay_policy must be in ['Time', 'Step'], but get {}".format(decay_policy)
        raise NameError(message)
    self._decay_policy = decay_policy

    # Operators and constants.
    self._sub = P.Sub()
    self._mul = P.Mul()
    self._add = P.TensorAdd()
    self._div = P.Div()
    self._dtype = mstype.float32
    self._normal = P.Normal(seed=seed)
    self._assign = P.Assign()
    self._one = Tensor(1, self._dtype)
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
from mindspore.ops import Primitive
from mindspore.ops import operations as P

# Module-level operator instances.
mul = P.Mul()
reduce_sum = P.ReduceSum(keep_dims=True)
sub = P.Sub()

# Primitives created by name.
confusion_softmax_grad = Primitive('ConfusionSoftmaxGrad')
make_tuple = Primitive('make_tuple')
tuple_getitem = Primitive('tuple_getitem')

axis = 2


class FnDict:
    """Registry that stores functions under their ``__name__``."""

    def __init__(self):
        """Start with an empty registry."""
        self.fnDict = dict()

    def __call__(self, fn):
        """Register ``fn`` under ``fn.__name__``.

        Nothing is returned, so the result of calling an instance is ``None``.
        """
        key = fn.__name__
        self.fnDict[key] = fn

    def __getitem__(self, name):
        """Return the function registered as ``name`` (KeyError if absent)."""
        return self.fnDict[name]
def __init__(self,
             num_features,
             eps=1e-5,
             momentum=0.9,
             affine=True,
             gamma_init='ones',
             beta_init='zeros',
             moving_mean_init='zeros',
             moving_var_init='ones',
             use_batch_statistics=True,
             device_num_each_group=1):
    """Validate the arguments and create parameters, groups and operators.

    Args:
        num_features: Length of every parameter; must be at least 1.
        eps: Epsilon passed to the batch-norm primitives.
        momentum: Must lie in [0, 1]; ``1.0 - momentum`` is what gets stored.
        affine: Used as ``requires_grad`` for ``gamma`` and ``beta``.
        gamma_init: Initializer for ``gamma``.
        beta_init: Initializer for ``beta``.
        moving_mean_init: Initializer for ``moving_mean``.
        moving_var_init: Initializer for ``moving_variance``.
        use_batch_statistics: Stored unchanged.
        device_num_each_group: Checked with ``check_int_positive``; any value
            other than 1 triggers communication-group setup.

    Raises:
        ValueError: If ``num_features`` < 1 or ``momentum`` is outside [0, 1].
    """
    super(_BatchNorm, self).__init__()
    if num_features < 1:
        raise ValueError("num_features must be at least 1")
    if momentum < 0 or momentum > 1:
        raise ValueError("momentum should be a number in range [0, 1], but got {}".format(momentum))

    self.use_batch_statistics = use_batch_statistics
    self.num_features = num_features
    self.eps = eps

    # Running statistics never receive gradients; gamma/beta follow ``affine``.
    self.moving_mean = Parameter(initializer(moving_mean_init, num_features),
                                 name="mean", requires_grad=False)
    self.moving_variance = Parameter(initializer(moving_var_init, num_features),
                                     name="variance", requires_grad=False)
    self.gamma = Parameter(initializer(gamma_init, num_features),
                           name="gamma", requires_grad=affine)
    self.beta = Parameter(initializer(beta_init, num_features),
                          name="beta", requires_grad=affine)

    self.group = check_int_positive(device_num_each_group)
    self.is_global = False
    if self.group != 1:
        self.rank_id = get_rank()
        self.rank_size = get_group_size()
        self.device_list = list(range(self.rank_size))
        self.rank_list = self.list_group(self.device_list, self.group)
        self.rank_list_idx = len(self.rank_list)
        # Create a communication group for every rank list containing this rank.
        for idx, ranks in enumerate(self.rank_list):
            if self.rank_id not in ranks:
                continue
            group_name = 'group' + str(idx)
            self.is_global = True
            management.create_group(group_name, ranks)
            self.all_reduce = P.AllReduce(P.ReduceOp.SUM, group_name).add_prim_attr('fusion', 1)

    self.shape = P.Shape()
    self.reduce_mean = P.ReduceMean(keep_dims=True)
    self.square = P.Square()
    self.sqrt = P.Sqrt()
    self.cast = P.Cast()
    self.dtype = P.DType()
    self.reshape = P.Reshape()
    self.is_ascend = context.get_context("device_target") == "Ascend"

    # With GE enabled the momentum is stored as a tensor, otherwise as a float.
    if context.get_context("enable_ge"):
        self.is_ge_backend = True
        self.momentum = Tensor(1.0 - momentum, mstype.float32)
    else:
        self.is_ge_backend = False
        self.momentum = 1.0 - momentum

    if self.is_ge_backend or self.is_ascend:
        self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps)
    else:
        self.bn_train = P.FusedBatchNorm(mode=1, epsilon=self.eps, momentum=self.momentum)
    self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps)

    data_parallel_strategy = ((1, ), (1, ))
    data_parallel_strategy_one = ((1, ), ())
    self.sub_mean = P.Sub().set_strategy(data_parallel_strategy)
    self.sub_var = P.Sub().set_strategy(data_parallel_strategy)
    self.mul_mean = P.Mul().set_strategy(data_parallel_strategy_one)
    self.mul_var = P.Mul().set_strategy(data_parallel_strategy_one)
    self.assign_sub_mean = P.AssignSub().set_strategy(data_parallel_strategy)
    self.assign_sub_var = P.AssignSub().set_strategy(data_parallel_strategy)
def __init__(self, compute_type=mstype.float32):
    """Store the compute type and create the three operators.

    Args:
        compute_type: Stored unchanged as ``self.compute_type``.
    """
    super(Mod, self).__init__()
    self.compute_type = compute_type

    # Floor division, multiplication and subtraction primitives.
    self.floor_div = P.FloorDiv()
    self.multiply = P.Mul()
    self.sub = P.Sub()
def __init__(self,
             batch_size,
             seq_length,
             vocab_size,
             decoder,
             beam_width=4,
             length_penalty_weight=1.0,
             max_decode_length=128,
             sos_id=1,
             eos_id=2,
             compute_type=mstype.float32):
    """Create the constants, operators and initial states for beam search.

    Args:
        batch_size: First dimension of every state tensor built here.
        seq_length: Stored unchanged.
        vocab_size: Stored, and also kept as an int32 tensor.
        decoder: Stored unchanged as ``self.decoder``.
        beam_width: Second dimension of every state tensor built here.
        length_penalty_weight: Stored, and passed to ``LengthPenalty``.
        max_decode_length: Stored unchanged.
        sos_id: Fill value of ``start_ids`` and ``init_seq``.
        eos_id: Fill value of ``eos_ids``.
        compute_type: Accepted but not used in this constructor.
    """
    super(BeamSearchDecoder, self).__init__(auto_prefix=False)
    self.seq_length = seq_length
    self.batch_size = batch_size
    self.vocab_size = vocab_size
    self.beam_width = beam_width
    self.length_penalty_weight = length_penalty_weight
    self.max_decode_length = max_decode_length
    self.decoder = decoder

    self.add = P.TensorAdd()
    self.expand = P.ExpandDims()
    self.reshape = P.Reshape()
    self.shape_flat = (-1, )
    self.shape = P.Shape()

    # Constant (batch_size, beam_width) tensors of zeros and of -INF.
    self.zero_tensor = Tensor(np.zeros([batch_size, beam_width]), mstype.float32)
    self.ninf_tensor = Tensor(np.full([batch_size, beam_width], -INF), mstype.float32)

    self.select = P.Select()
    self.flat_shape = (batch_size, beam_width * vocab_size)
    self.topk = P.TopK(sorted=True)
    self.floor_div = P.FloorDiv()
    self.vocab_size_tensor = Tensor(self.vocab_size, mstype.int32)
    self.real_div = P.RealDiv()
    self.mod = Mod()
    self.equal = P.Equal()
    self.eos_ids = Tensor(np.full([batch_size, beam_width], eos_id), mstype.int32)

    # beam_ids[b, k] == k and batch_ids[b, k] == b.
    beam_ids = np.tile(np.arange(beam_width).reshape((1, beam_width)), [batch_size, 1])
    self.beam_ids = Tensor(beam_ids, mstype.int32)
    batch_ids = np.arange(batch_size * beam_width).reshape((batch_size, beam_width)) // beam_width
    self.batch_ids = Tensor(batch_ids, mstype.int32)

    self.concat = P.Concat(axis=-1)
    self.gather_nd = P.GatherNd()
    self.greater_equal = P.GreaterEqual()
    self.sub = P.Sub()
    self.cast = P.Cast()
    self.zeroslike = P.ZerosLike()

    # init inputs and states
    self.start_ids = Tensor(np.full([batch_size * beam_width, 1], sos_id), mstype.int32)
    self.init_seq = Tensor(np.full([batch_size, beam_width, 1], sos_id), mstype.int32)

    # In each row the first beam starts at score 0 and the rest at -INF.
    init_scores = np.tile(np.array([[0.] + [-INF] * (beam_width - 1)]), [batch_size, 1])
    self.init_scores = Tensor(init_scores, mstype.float32)

    # np.bool_ replaces the np.bool alias, which was deprecated in NumPy 1.20
    # and removed in 1.24, where it raises AttributeError.
    self.init_finished = Tensor(np.zeros([batch_size, beam_width], dtype=np.bool_))
    self.init_length = Tensor(np.zeros([batch_size, beam_width], dtype=np.int32))

    self.length_penalty = LengthPenalty(weight=length_penalty_weight)
    self.one = Tensor(1, mstype.int32)
def __init__(self):
    """Create the single subtraction primitive used by this cell."""
    super(SubNet, self).__init__()
    # Element-wise subtraction operator.
    self.sub = P.Sub()
def __init__(self,
             batch_size,
             from_tensor_width,
             to_tensor_width,
             from_seq_length,
             to_seq_length,
             num_attention_heads=1,
             size_per_head=512,
             query_act=None,
             key_act=None,
             value_act=None,
             has_attention_mask=False,
             attention_probs_dropout_prob=0.0,
             use_one_hot_embeddings=False,
             initializer_range=0.02,
             do_return_2d_tensor=False,
             use_relative_positions=False,
             compute_type=mstype.float32):
    """Create the projection layers, shapes and operators of the attention cell.

    Args:
        batch_size: Used in the reshape targets and the return shape.
        from_tensor_width: Input width of the query projection.
        to_tensor_width: Input width of the key and value projections.
        from_seq_length: Sequence length on the query side.
        to_seq_length: Sequence length on the key/value side.
        num_attention_heads: Number of heads.
        size_per_head: Width of each head.
        query_act: Activation of the query projection.
        key_act: Activation of the key projection.
        value_act: Activation of the value projection.
        has_attention_mask: When true, the mask-related operators are created.
        attention_probs_dropout_prob: ``1 -`` this value is passed to ``nn.Dropout``.
        use_one_hot_embeddings: Forwarded to the relative-position generator.
        initializer_range: Argument of ``TruncatedNormal`` for the weights.
        do_return_2d_tensor: Selects a 2-D or 3-D ``shape_return``.
        use_relative_positions: When true, the relative-position generator is built.
        compute_type: Passed to ``to_float`` and to ``SaturateCast``.
    """
    super(BertAttention, self).__init__()
    self.batch_size = batch_size
    self.from_seq_length = from_seq_length
    self.to_seq_length = to_seq_length
    self.num_attention_heads = num_attention_heads
    self.size_per_head = size_per_head
    self.has_attention_mask = has_attention_mask
    self.use_relative_positions = use_relative_positions

    # Scores are multiplied by 1 / sqrt(size_per_head).
    self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head))
    self.reshape = P.Reshape()
    self.shape_from_2d = (-1, from_tensor_width)
    self.shape_to_2d = (-1, to_tensor_width)

    # Query, key and value projections share the initializer and output width.
    weight = TruncatedNormal(initializer_range)
    units = num_attention_heads * size_per_head
    self.query_layer = nn.Dense(from_tensor_width, units,
                                activation=query_act,
                                weight_init=weight).to_float(compute_type)
    self.key_layer = nn.Dense(to_tensor_width, units,
                              activation=key_act,
                              weight_init=weight).to_float(compute_type)
    self.value_layer = nn.Dense(to_tensor_width, units,
                                activation=value_act,
                                weight_init=weight).to_float(compute_type)

    self.shape_from = (batch_size, from_seq_length, num_attention_heads, size_per_head)
    self.shape_to = (batch_size, to_seq_length, num_attention_heads, size_per_head)

    self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
    self.multiply = P.Mul()
    self.transpose = P.Transpose()
    self.trans_shape = (0, 2, 1, 3)
    self.trans_shape_relative = (2, 0, 1, 3)
    self.trans_shape_position = (1, 2, 0, 3)
    self.multiply_data = -10000.0
    self.batch_num = batch_size * num_attention_heads
    self.matmul = P.BatchMatMul()

    self.softmax = nn.Softmax()
    self.dropout = nn.Dropout(1 - attention_probs_dropout_prob)

    # Mask-handling operators exist only when a mask is used.
    if self.has_attention_mask:
        self.expand_dims = P.ExpandDims()
        self.sub = P.Sub()
        self.add = P.TensorAdd()
        self.cast = P.Cast()
        self.get_dtype = P.DType()

    hidden_width = num_attention_heads * size_per_head
    if do_return_2d_tensor:
        self.shape_return = (batch_size * from_seq_length, hidden_width)
    else:
        self.shape_return = (batch_size, from_seq_length, hidden_width)

    self.cast_compute_type = SaturateCast(dst_type=compute_type)

    if self.use_relative_positions:
        self._generate_relative_positions_embeddings = RelaPosEmbeddingsGenerator(
            length=to_seq_length,
            depth=size_per_head,
            max_relative_position=16,
            initializer_range=initializer_range,
            use_one_hot_embeddings=use_one_hot_embeddings)
def __init__(self,
             num_features,
             eps=1e-5,
             momentum=0.9,
             affine=True,
             gamma_init='ones',
             beta_init='zeros',
             moving_mean_init='zeros',
             moving_var_init='ones',
             use_batch_statistics=None,
             device_num_each_group=1,
             input_dims='2d',
             data_format='NCHW'):
    """Validate the arguments and create parameters, groups and operators.

    Args:
        num_features: Length of every parameter; must be at least 1.
        eps: Epsilon passed to the batch-norm primitives.
        momentum: Must lie in [0, 1]; ``1.0 - momentum`` is what gets stored.
        affine: Used as ``requires_grad`` for ``gamma`` and ``beta``.
        gamma_init: Initializer for ``gamma``.
        beta_init: Initializer for ``beta``.
        moving_mean_init: Initializer for ``moving_mean``.
        moving_var_init: Initializer for ``moving_variance``.
        use_batch_statistics: Stored unchanged.
        device_num_each_group: Checked with ``validator.check_positive_int``;
            any value other than 1 triggers communication-group setup.
        input_dims: Stored unchanged.
        data_format: 'NCHW' or 'NHWC'; 'NHWC' is accepted only on GPU.

    Raises:
        ValueError: If ``num_features`` < 1, ``momentum`` is outside [0, 1],
            or 'NHWC' is requested on a non-GPU target.
    """
    super(_BatchNorm, self).__init__()
    if num_features < 1:
        raise ValueError("num_features must be at least 1")
    if momentum < 0 or momentum > 1:
        raise ValueError("momentum should be a number in range [0, 1], but got {}".format(momentum))

    self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.cls_name)
    if context.get_context("device_target") != "GPU" and self.format == "NHWC":
        raise ValueError("NHWC format only support in GPU target.")

    self.use_batch_statistics = use_batch_statistics
    self.num_features = num_features
    self.eps = eps
    self.input_dims = input_dims

    # Running statistics never receive gradients; gamma/beta follow ``affine``.
    self.moving_mean = Parameter(initializer(moving_mean_init, num_features),
                                 name="mean", requires_grad=False)
    self.moving_variance = Parameter(initializer(moving_var_init, num_features),
                                     name="variance", requires_grad=False)
    self.gamma = Parameter(initializer(gamma_init, num_features),
                           name="gamma", requires_grad=affine)
    self.beta = Parameter(initializer(beta_init, num_features),
                          name="beta", requires_grad=affine)

    self.group = validator.check_positive_int(device_num_each_group)
    self.is_global = False
    if self.group != 1:
        self.rank_id = get_rank()
        self.rank_size = get_group_size()
        self.device_list = list(range(self.rank_size))
        self.rank_list = self.list_group(self.device_list, self.group)
        self.rank_list_idx = len(self.rank_list)
        # Create a communication group for every rank list containing this rank.
        for idx, ranks in enumerate(self.rank_list):
            if self.rank_id not in ranks:
                continue
            group_name = 'group' + str(idx)
            self.is_global = True
            management.create_group(group_name, ranks)
            self.all_reduce = P.AllReduce(P.ReduceOp.SUM, group_name).add_prim_attr('fusion', 1)

    self.shape = P.Shape()
    self.reduce_mean = P.ReduceMean(keep_dims=True)
    self.square = P.Square()
    self.sqrt = P.Sqrt()
    self.cast = P.Cast()
    self.dtype = P.DType()
    self.reshape = P.Reshape()

    # Execution-environment flags.
    self.is_ascend = context.get_context("device_target") == "Ascend"
    self.is_gpu = context.get_context("device_target") == "GPU"
    self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
    self.momentum = 1.0 - momentum
    self.is_ge_backend = bool(context.get_context("enable_ge"))

    # Pick the training primitive that matches the environment.
    if self.is_graph_mode and (self.is_ge_backend or self.is_ascend):
        self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps)
    elif self.is_gpu:
        self.bn_train = P.FusedBatchNormEx(mode=1,
                                           epsilon=self.eps,
                                           momentum=self.momentum,
                                           data_format=self.format)
    else:
        self.bn_train = P.FusedBatchNorm(mode=1, epsilon=self.eps, momentum=self.momentum)
    self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps, data_format=self.format)

    self.enable_global_sync = self.is_global and (self.is_ge_backend or
                                                  (self.is_graph_mode and self.is_ascend))
    self.enable_default_train = self.is_graph_mode and not self.is_global and \
                                (self.is_ge_backend or self.is_ascend)

    data_parallel_strategy = ((1,), (1,))
    data_parallel_strategy_one = ((1,), ())
    self.sub_mean = P.Sub().shard(data_parallel_strategy)
    self.sub_var = P.Sub().shard(data_parallel_strategy)
    self.mul_mean = P.Mul().shard(data_parallel_strategy_one)
    self.mul_var = P.Mul().shard(data_parallel_strategy_one)
    self.assign_sub_mean = P.AssignSub().shard(data_parallel_strategy)
    self.assign_sub_var = P.AssignSub().shard(data_parallel_strategy)