def __init__(self): super().__init__() self.op = P.Mul()
def __init__(self, vocab_size, embedding_size, field_size, param_init='normal', target='CPU', slice_mode='batch_slice', feature_num_list=None, max_norm=None, sparse=True, operator='SUM'): super(MultiFieldEmbeddingLookup, self).__init__(vocab_size, embedding_size, param_init, target, slice_mode, feature_num_list, max_norm, sparse) self.field_size = validator.check_value_type('field_size', field_size, [int], self.cls_name) self.operator = operator self.mul = P.Mul() self.inf_mask_mul = P.Mul() self.bias_add = P.TensorAdd() self.inf_add = P.TensorAdd() self.merge_op = None self.count_op = P.UnsortedSegmentSum() self.abs = P.Abs() self.equal = P.Equal() self.add = P.TensorAdd() self.cast = P.Cast() self.div_no_nan = P.DivNoNan() self.expand = P.ExpandDims() self.max_mask_mul = P.Mul() self.max_no_equal = P.NotEqual() if operator == MultiFieldEmbeddingLookup.OPERATOR_SUM: self.merge_op = P.UnsortedSegmentSum() elif operator == MultiFieldEmbeddingLookup.OPERATOR_MAX: self.merge_op = P.UnsortedSegmentMax() elif operator == MultiFieldEmbeddingLookup.OPERATOR_MEAN: self.merge_op = P.UnsortedSegmentSum() else: raise ValueError( "The operator supports ['SUM', 'MAX', 'MEAN'], but found: " + str(operator)) parallel_mode = _get_parallel_mode() is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) if slice_mode in ["table_row_slice", "batch_slice" ] and is_auto_parallel: self.merge_op.shard( ((get_group_size(), 1, 1), (get_group_size(), 1))) self.expand.shard(((get_group_size(), ), )) self.bias_add.shard(((1, 1), (1, 1))) self.mul.shard( ((get_group_size(), 1, 1), (get_group_size(), 1, 1))) self.count_op.shard(((get_group_size(), 1), (get_group_size(), 1))) self.add.shard(((get_group_size(), ), (get_group_size(), ))) self.div_no_nan.shard( ((get_group_size(), 1), (get_group_size(), 1))) self.max_mask_mul.shard( ((get_group_size(), 1), (get_group_size(), 1))) self.max_no_equal.shard(((1, ), ())) if operator == MultiFieldEmbeddingLookup.OPERATOR_MAX: self.equal.shard(((get_group_size(), 1, 1), ())) self.inf_mask_mul.shard(((get_group_size(), 1, 1), ())) self.merge_op.shard( ((get_group_size(), 1), (get_group_size(), ))) self.count_op.shard( ((get_group_size(), ), (get_group_size(), ))) self.inf_add.shard( ((get_group_size(), 1, 1), (get_group_size(), 1, 1))) elif slice_mode == "table_column_slice" and is_auto_parallel: self.merge_op.shard(((1, 1, get_group_size()), (1, 1))) self.div_no_nan.shard(((1, get_group_size()), (1, 1))) self.bias_add.shard(((1, 1), (1, 1))) self.mul.shard(((1, 1, 1), (1, 1, get_group_size()))) self.count_op.shard(((1, 1), (1, 1))) self.add.shard(((1, ), (1, ))) self.max_mask_mul.shard(((1, get_group_size()), (1, 1))) self.expand.shard(((1, ), )) self.max_no_equal.shard(((1, ), ())) if operator == MultiFieldEmbeddingLookup.OPERATOR_MAX: self.equal.shard(((1, 1, 1), ())) self.inf_mask_mul.shard(((1, 1, 1), ())) self.merge_op.shard(((1, get_group_size()), (1, ))) self.count_op.shard(((1, ), (1, ))) self.inf_add.shard(((1, 1, get_group_size()), (1, 1, 1))) else: if is_auto_parallel: raise ValueError( "slice_mode should be ['table_row_slice', 'batch_slice' and \ 'table_column_slice'], but get " + str(slice_mode)) # Min value for fp32 self.negative_inf_value = -3.402823466E+38
# you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ from mindspore.ops import Primitive from mindspore.ops import operations as P Add = P.TensorAdd() Mul = P.Mul() Sqrt = P.Sqrt() Square = P.Square() make_tuple = Primitive('make_tuple') tuple_getitem = Primitive('tuple_getitem') LambNextRight = Primitive('LambNextRight') class FnDict: def __init__(self): self.fnDict = {} def __call__(self, fn): self.fnDict[fn.__name__] = fn def __getitem__(self, name):
def __init__(self, mul_weight, strategy1=None, strategy2=None): super().__init__() self.mul = P.Mul().shard(strategy1) self.neg = P.Neg().shard(strategy2) self.mul_weight = Parameter(mul_weight, "w1")
def __init__(self, in_channel, out_channel, stride=1, use_se=False, se_block=False): super(ResidualBlock, self).__init__() self.stride = stride self.use_se = use_se self.se_block = se_block channel = out_channel // self.expansion self.conv1 = _conv1x1(in_channel, channel, stride=1, use_se=self.use_se) self.bn1 = _bn(channel) if self.use_se and self.stride != 1: self.e2 = nn.SequentialCell([ _conv3x3(channel, channel, stride=1, use_se=True), _bn(channel), nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='same') ]) else: self.conv2 = _conv3x3(channel, channel, stride=stride, use_se=self.use_se) self.bn2 = _bn(channel) self.conv3 = _conv1x1(channel, out_channel, stride=1, use_se=self.use_se) self.bn3 = _bn_last(out_channel) if self.se_block: self.se_global_pool = P.ReduceMean(keep_dims=False) self.se_dense_0 = _fc(out_channel, int(out_channel / 4), use_se=self.use_se) self.se_dense_1 = _fc(int(out_channel / 4), out_channel, use_se=self.use_se) self.se_sigmoid = nn.Sigmoid() self.se_mul = P.Mul() self.relu = nn.ReLU() self.down_sample = False if stride != 1 or in_channel != out_channel: self.down_sample = True self.down_sample_layer = None if self.down_sample: if self.use_se: if stride == 1: self.down_sample_layer = nn.SequentialCell([ _conv1x1(in_channel, out_channel, stride, use_se=self.use_se), _bn(out_channel) ]) else: self.down_sample_layer = nn.SequentialCell([ nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='same'), _conv1x1(in_channel, out_channel, 1, use_se=self.use_se), _bn(out_channel) ]) else: self.down_sample_layer = nn.SequentialCell([ _conv1x1(in_channel, out_channel, stride, use_se=self.use_se), _bn(out_channel) ]) self.add = F.tensor_add
def __init__(self, strategy1, strategy2, strategy3): super().__init__() self.mul1 = P.Mul().shard(strategy1) self.reduce_sum = P.ReduceSum(keep_dims=False).shard(strategy2) self.mul2 = P.Mul().shard(strategy3)
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, param, m, v, ps_parameter, cache_enable): """Apply sparse adam optimizer to the weight parameter when the gradient is sparse.""" success = True indices = gradient.indices values = gradient.values if ps_parameter and not cache_enable: op_shape = P.Shape() shapes = (op_shape(param), op_shape(m), op_shape(v), op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1), op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices)) success = F.depend( success, pull( push((beta1_power, beta2_power, lr, beta1, beta2, eps, values, indices), shapes), param)) return success if not target: success = F.depend( success, sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1, beta2, eps, values, indices)) else: op_mul = P.Mul() op_square = P.Square() op_sqrt = P.Sqrt() scatter_add = P.ScatterAdd(use_locking) assign_m = F.assign(m, op_mul(beta1, m)) assign_v = F.assign(v, op_mul(beta2, v)) grad_indices = gradient.indices grad_value = gradient.values next_m = scatter_add( m, grad_indices, op_mul(F.tuple_to_array((1.0, )) - beta1, grad_value)) next_v = scatter_add( v, grad_indices, op_mul(F.tuple_to_array((1.0, )) - beta2, op_square(grad_value))) if use_nesterov: m_temp = next_m * _scaler_ten assign_m_nesterov = F.assign(m, op_mul(beta1, next_m)) div_value = scatter_add( m, op_mul(grad_indices, _scaler_one), op_mul(F.tuple_to_array((1.0, )) - beta1, grad_value)) param_update = div_value / (op_sqrt(next_v) + eps) m_recover = F.assign(m, m_temp / _scaler_ten) F.control_depend(m_temp, assign_m_nesterov) F.control_depend(assign_m_nesterov, div_value) F.control_depend(param_update, m_recover) else: param_update = next_m / (op_sqrt(next_v) + eps) lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power) next_param = param - lr_t * param_update F.control_depend(assign_m, next_m) F.control_depend(assign_v, next_v) success = F.depend(success, F.assign(param, next_param)) success = F.depend(success, F.assign(m, next_m)) success = F.depend(success, F.assign(v, next_v)) return success
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, data_format='NCHW', has_bias=False, weight_init='normal', damping=0.03, loss_scale=1, frequency=278, bias_init='zeros'): self.thor = True ksizes = (1, kernel_size, kernel_size, 1) self.hw = kernel_size * kernel_size strides = (1, stride, stride, 1) kernel_size = twice(kernel_size) super(Conv2d_Thor, self).__init__( in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, data_format, has_bias, weight_init, bias_init, ) self.conv2d = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, mode=1, pad_mode=self.pad_mode, pad=self.padding, stride=self.stride, dilation=self.dilation, group=self.group) self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.transpose02314 = P.CusTranspose02314() self.matrix_A_dim = self.in_channels * self.kernel_size[ 0] * self.kernel_size[1] self.matrix_G_dim = self.out_channels self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape( self.matrix_A_dim, self.in_channels, True) self.matrix_G_device_shape, self.matrix_G_device_dim = caculate_device_shape( self.matrix_G_dim, self.in_channels, False) self.matrix_A_device_temp_shape = (self.matrix_A_device_shape[0], self.matrix_A_device_shape[2], self.matrix_A_device_shape[1], self.matrix_A_device_shape[3]) self.matrix_G_device_temp_shape = (self.matrix_G_device_shape[0], self.matrix_G_device_shape[2], self.matrix_G_device_shape[1], self.matrix_G_device_shape[3]) self.matrix_A_inv = Parameter(Tensor( np.reshape( np.identity(self.matrix_A_device_dim).astype(np.float16), self.matrix_A_device_shape)), name='matrix_A_inv', requires_grad=False) self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.reshape( np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)), name="matrix_G_inv", requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) self.fake_G = Tensor( np.reshape( np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)) self.fake_G_inv_max = Tensor(np.zeros([ 1, ]).astype(np.float32)) self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) self.vector_matmul = P.CusBatchMatMul() self.diag_block_dim = 128 self.channels_slice_flag = False if self.in_channels % C0 != 0: self.channels_slice_flag = True self.padA_flag = False if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \ and self.matrix_A_dim > self.diag_block_dim: self.padA_flag = True pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim self.padA = P.Pad(((0, pad_dim), (0, pad_dim))) self.device_shape_pad_flag = False if self.matrix_A_dim != self.matrix_A_device_dim: self.device_shape_pad_flag = True self.device_shape_pad = P.Pad(((0, 0), (0, C0 - self.in_channels), (0, 0), (0, C0 - self.in_channels))) self.slice = P.Slice() self.gather = P.GatherV2() self.freq = Tensor(frequency, mstype.int32) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.axis = 0 dampingA_dim = self.matrix_A_dim if (self.matrix_A_dim % self.diag_block_dim ) != 0 and self.matrix_A_dim > self.diag_block_dim: dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim dampingG_dim = self.matrix_G_dim if (self.matrix_G_dim % self.diag_block_dim ) != 0 and self.matrix_G_dim > self.diag_block_dim: dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32) self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32) self.fused_abs_max1 = P.CusFusedAbsMax1( [self.matrix_A_dim, self.matrix_A_dim]) self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient)
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', damping=0.03, loss_scale=1, frequency=278, has_bias=True, activation=None): super(Dense_Thor, self).__init__() self.in_channels = check_int_positive(in_channels) self.out_channels = check_int_positive(out_channels) self.has_bias = check_bool(has_bias) self.thor = True if isinstance(weight_init, Tensor): if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ weight_init.shape()[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.dim() != 1 or bias_init.shape( )[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") self.matmul = P.MatMul(transpose_b=True) self.bias_add = P.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None self.matrix_A_inv = Parameter(Tensor( np.zeros([128, 128, 16, 16]).astype(np.float16)), name='matrix_A_inv', requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.zeros([63, 63, 16, 16]).astype(np.float16)), name="matrix_G_inv", requires_grad=False) self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)) self.matmul = P.MatMul(transpose_b=True) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.vector_matmul = P.CusBatchMatMul() self.pad = P.Pad(((0, 24), (0, 24))) self.pad1 = P.Pad(((0, 8), (0, 8))) self.slice = P.Slice() self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) self.fused_abs_max1 = P.CusFusedAbsMax1([1000, 1000]) self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.dampingA = Tensor(np.identity(2048), mstype.float32) self.dampingG = Tensor(np.identity(1024), mstype.float32) self.add = P.TensorAdd() self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient)
def __init__(self, config): super(WideDeepModel, self).__init__() self.batch_size = config.batch_size parallel_mode = _get_parallel_mode() if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL): self.batch_size = self.batch_size * get_group_size() self.field_size = config.field_size self.vocab_size = config.vocab_size self.emb_dim = config.emb_dim self.deep_layer_dims_list = config.deep_layer_dim self.deep_layer_act = config.deep_layer_act self.init_args = config.init_args self.weight_init, self.bias_init = config.weight_bias_init self.weight_bias_init = config.weight_bias_init self.emb_init = config.emb_init self.drop_out = config.dropout_flag self.keep_prob = config.keep_prob self.deep_input_dims = self.field_size * self.emb_dim self.layer_dims = self.deep_layer_dims_list + [1] self.all_dim_list = [self.deep_input_dims] + self.layer_dims init_acts = [('Wide_w', [self.vocab_size, 1], self.emb_init), ('V_l2', [self.vocab_size, self.emb_dim], self.emb_init), ('Wide_b', [1], self.emb_init)] var_map = init_var_dict(self.init_args, init_acts) self.wide_w = var_map["Wide_w"] self.wide_b = var_map["Wide_b"] self.embedding_table = var_map["V_l2"] self.dense_layer_1 = DenseLayer(self.all_dim_list[0], self.all_dim_list[1], self.weight_bias_init, self.deep_layer_act, convert_dtype=True) self.dense_layer_2 = DenseLayer(self.all_dim_list[1], self.all_dim_list[2], self.weight_bias_init, self.deep_layer_act, convert_dtype=True) self.dense_layer_3 = DenseLayer(self.all_dim_list[2], self.all_dim_list[3], self.weight_bias_init, self.deep_layer_act, convert_dtype=True) self.dense_layer_4 = DenseLayer(self.all_dim_list[3], self.all_dim_list[4], self.weight_bias_init, self.deep_layer_act, convert_dtype=True) self.dense_layer_5 = DenseLayer(self.all_dim_list[4], self.all_dim_list[5], self.weight_bias_init, self.deep_layer_act, convert_dtype=True) self.gather_v2 = P.GatherV2().shard(((1, 8), (1, 1))) self.gather_v2_1 = P.GatherV2() self.mul = P.Mul() self.reduce_sum = P.ReduceSum(keep_dims=False) self.reshape = P.Reshape() self.square = P.Square() self.shape = P.Shape() self.tile = P.Tile() self.concat = P.Concat(axis=1) self.cast = P.Cast()
def __init__(self, strategy0, strategy1): super().__init__() self.fc_nobias = P.MatMul(transpose_b=True).set_strategy(strategy0) self.reduce_sum = P.ReduceSum( keep_dims=False).set_strategy(strategy1) self.mul = P.Mul().set_strategy(strategy=((), ()))
def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m, v, gradient, decay_flag, optim_filter): """ Update parameters. Args: beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0. lr (Tensor): Learning rate. weight_decay (Number): Weight decay. Should be equal to or greater than 0. global_step (Tensor): Global step. param (Tensor): Parameters. m (Tensor): m value of parameters. v (Tensor): v value of parameters. gradient (Tensor): Gradient of parameters. decay_flag (bool): Specifies whether param update with weight decay. optim_filter(bool): Applies parameter update or not. Returns: Tensor, the new value of v after updating. """ if optim_filter: op_mul = P.Mul() op_sqrt = P.Sqrt() op_rsqrt = P.Rsqrt() op_square = P.Square() op_cast = P.Cast() op_reshape = P.Reshape() op_shape = P.Shape() op_pow = P.Pow() op_norm = layer.Norm() op_select = P.Select() op_greater = P.Greater() op_fill = P.Fill() op_dtype = P.DType() param_fp32 = op_cast(param, mstype.float32) m_fp32 = op_cast(m, mstype.float32) v_fp32 = op_cast(v, mstype.float32) gradient_fp32 = op_cast(gradient, mstype.float32) next_m = op_mul(beta1, m_fp32) + op_mul( op_cast(num_one, mstype.float32) - beta1, gradient_fp32) next_v = op_mul(beta2, v_fp32) + op_mul( op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32)) next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow( beta1, op_cast(global_step + num_one, mstype.float32))) next_vv = next_v / (op_cast(num_one, mstype.float32) - op_pow( beta2, op_cast(global_step + num_one, mstype.float32))) w_norm = op_norm(param_fp32) g_norm = op_norm(gradient_fp32) g_norm_hat = op_norm( op_mul(next_mm, op_rsqrt(next_vv + eps)) + weight_decay * param_fp32) zeros = F.zeros_like(w_norm) ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0) trust_ratio = op_select( op_greater(w_norm, zeros), op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones), ones) tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0) trust_ratio = C.clip_by_value(trust_ratio, zeros, tens) update = next_mm / (op_sqrt(next_vv) + eps) if decay_flag: update = update + op_mul(weight_decay, param_fp32) update_with_lr = op_mul(op_mul(trust_ratio, lr), update) next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32)) next_param = F.depend(next_param, F.assign(param, next_param)) next_param = F.depend(next_param, F.assign(m, next_m)) next_param = F.depend(next_param, F.assign(v, next_v)) return next_param return gradient
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0, loss_scale=1.0, batch_size=32.0, decay_filter=lambda x: x.name not in []): super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale) if isinstance(momentum, float) and momentum < 0.0: raise ValueError( "momentum should be at least 0.0, but got momentum {}".format( momentum)) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") self.params = self.parameters self.moments = self.params.clone(prefix="moments", init='zeros') self.hyper_map = C.HyperMap() self.opt = P.ApplyMomentum() self.matrix_A = ParameterTuple(matrix_A) self.matrix_G = ParameterTuple(matrix_G) self.A_inv_max = ParameterTuple(A_inv_max) self.G_inv_max = ParameterTuple(G_inv_max) self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast() self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft() self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight() self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul() self.transpose = P.Transpose() self.shape = P.Shape() self.reshape = P.Reshape() self.mul = P.Mul() self.weight_idx = [] for i in range(len(self.params)): if "conv" in self.params[i].name or "end_point" in self.params[ i].name: self.weight_idx.append(i) self.weight_idx.append(len(self.params)) self.feature_map = [ 1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 ] mean = _get_mirror_mean() degree = _get_device_num() self.grad_reducer_Amax = DistributedGradReducerThor( self.parameters, 2, mean, degree) self.grad_reducer_Gmax = DistributedGradReducerThor( self.parameters, 5, mean, degree) self.grad_reducer_A = DistributedGradReducerThor( self.parameters, 3, mean, degree) self.grad_reducer_G = DistributedGradReducerThor( self.parameters, 4, mean, degree) self.matrix_A_inv = () self.matrix_G_inv = () self.matrix_max_inv = () for i in range(54): self.matrix_max_inv = self.matrix_max_inv + (Parameter( initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False), ) self.log = P.Log() self.exp = P.Exp() self.sqrt = P.Sqrt() self.matrix_max_inv = ParameterTuple(self.matrix_max_inv) self.assign = P.Assign() self.cast = P.Cast() self.thor = True self.weight_decay = weight_decay * loss_scale self.decay_flags = tuple(decay_filter(x) for x in self.parameters) self.conv_index = [ 0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 17, 18, 19, 22, 23, 24, 25, 28, 29, 30, 33, 34, 35, 38, 39, 40, 43, 44, 45, 46, 49, 50, 51, 54, 55, 56, 59, 60, 61, 64, 65, 66, 69, 70, 71, 74, 75, 76, 77, 80, 81, 82, 85 ] self.batch_size = batch_size self.bn_index = [ 3, 7, 10, 13, 17, 20, 23, 26, 30, 33, 36, 39, 42, 45, 49, 52 ] self.bn_gradient_index = [ -1, -1, -1, 4, -1, -1, -1, 10, -1, -1, 15, -1, -1, 20, -1, -1, -1, 26, -1, -1, 31, -1, -1, 36, -1, -1, 41, -1, -1, -1, 47, -1, -1, 52, -1, -1, 57, -1, -1, 62, -1, -1, 67, -1, -1, 72, -1, -1, -1, 78, -1, -1, 83 ]
def __init__(self, mul_size): super().__init__() self.mul_weight = Tensor(np.full(mul_size, 0.6, dtype=np.float32)) self.mul = P.Mul()
def __init__(self): super().__init__() self.mul1 = P.Mul() self.reduce_mean = P.ReduceMean(keep_dims=False) self.reduce_sum = P.ReduceSum(keep_dims=False).add_prim_attr( "cross_batch", True)
def __init__(self, alpha=0.2): super(LeakyReLU, self).__init__() self.greater_equal = P.GreaterEqual() self.mul = P.Mul() self.alpha = alpha
def __init__(self, strategy1, strategy2, strategy3): super().__init__() self.mul = P.Mul().shard(strategy1) self.reduce_max = P.ReduceMax(keep_dims=False).shard(strategy2) self.add = P.TensorAdd().shard(strategy3)
def __init__(self, strategy1, strategy2): super().__init__() self.matmul = P.MatMul().shard(strategy1) self.mul = P.Mul().shard(strategy2)
def __init__(self): super().__init__() self.add = P.TensorAdd() self.sub = P.Sub() self.mul = P.Mul() self.div = P.RealDiv()
def __init__(self, num_features, eps=1e-5, momentum=0.9, affine=True, gamma_init='ones', beta_init='zeros', moving_mean_init='zeros', moving_var_init='ones', use_batch_statistics=None, device_num_each_group=1): super(_BatchNorm, self).__init__() if num_features < 1: raise ValueError("num_features must be at least 1") if momentum < 0 or momentum > 1: raise ValueError( "momentum should be a number in range [0, 1], but got {}". format(momentum)) self.use_batch_statistics = use_batch_statistics self.num_features = num_features self.eps = eps self.moving_mean = Parameter(initializer(moving_mean_init, num_features), name="mean", requires_grad=False) self.moving_variance = Parameter(initializer(moving_var_init, num_features), name="variance", requires_grad=False) self.gamma = Parameter(initializer(gamma_init, num_features), name="gamma", requires_grad=affine) self.beta = Parameter(initializer(beta_init, num_features), name="beta", requires_grad=affine) self.group = check_int_positive(device_num_each_group) self.is_global = False if self.group != 1: self.rank_id = get_rank() self.rank_size = get_group_size() self.device_list = [i for i in range(0, self.rank_size)] self.rank_list = self.list_group(self.device_list, self.group) self.rank_list_idx = len(self.rank_list) for i in range(self.rank_list_idx): if self.rank_id in self.rank_list[i] and self.group != 1: self.is_global = True management.create_group('group' + str(i), self.rank_list[i]) self.all_reduce = P.AllReduce( P.ReduceOp.SUM, 'group' + str(i)).add_prim_attr('fusion', 1) self.shape = P.Shape() self.reduce_mean = P.ReduceMean(keep_dims=True) self.square = P.Square() self.sqrt = P.Sqrt() self.cast = P.Cast() self.dtype = P.DType() self.reshape = P.Reshape() self.is_ascend = context.get_context("device_target") == "Ascend" self.momentum = 1.0 - momentum if context.get_context("enable_ge"): self.is_ge_backend = True else: self.is_ge_backend = False if self.is_ge_backend or self.is_ascend: self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps) else: self.bn_train = P.FusedBatchNorm(mode=1, epsilon=self.eps, momentum=self.momentum) self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps) data_parallel_strategy = ((1, ), (1, )) data_parallel_strategy_one = ((1, ), ()) self.sub_mean = P.Sub().set_strategy(data_parallel_strategy) self.sub_var = P.Sub().set_strategy(data_parallel_strategy) self.mul_mean = P.Mul().set_strategy(data_parallel_strategy_one) self.mul_var = P.Mul().set_strategy(data_parallel_strategy_one) self.assign_sub_mean = P.AssignSub().set_strategy( data_parallel_strategy) self.assign_sub_var = P.AssignSub().set_strategy( data_parallel_strategy)
def __init__(self, strategy1, strategy2, strategy3): super().__init__() self.mul = P.Mul().shard(strategy1) self.mul2 = P.Mul().shard(strategy2) self.cast = P.Cast().shard(strategy3) self.cast2 = P.Cast().shard(strategy3)
def __init__(self, from_tensor_width, to_tensor_width, from_seq_length, to_seq_length, num_attention_heads=1, size_per_head=512, query_act=None, key_act=None, value_act=None, has_attention_mask=False, attention_probs_dropout_prob=0.0, use_one_hot_embeddings=False, initializer_range=0.02, do_return_2d_tensor=False, use_relative_positions=False, compute_type=mstype.float32): super(BertAttention, self).__init__() self.from_seq_length = from_seq_length self.to_seq_length = to_seq_length self.num_attention_heads = num_attention_heads self.size_per_head = size_per_head self.has_attention_mask = has_attention_mask self.use_relative_positions = use_relative_positions self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head)) self.reshape = P.Reshape() self.shape_from_2d = (-1, from_tensor_width) self.shape_to_2d = (-1, to_tensor_width) weight = TruncatedNormal(initializer_range) units = num_attention_heads * size_per_head self.query_layer = nn.Dense(from_tensor_width, units, activation=query_act, weight_init=weight).to_float(compute_type) self.key_layer = nn.Dense(to_tensor_width, units, activation=key_act, weight_init=weight).to_float(compute_type) self.value_layer = nn.Dense(to_tensor_width, units, activation=value_act, weight_init=weight).to_float(compute_type) self.shape_from = (-1, from_seq_length, num_attention_heads, size_per_head) self.shape_to = (-1, to_seq_length, num_attention_heads, size_per_head) self.matmul_trans_b = P.BatchMatMul(transpose_b=True) self.multiply = P.Mul() self.transpose = P.Transpose() self.trans_shape = (0, 2, 1, 3) self.trans_shape_relative = (2, 0, 1, 3) self.trans_shape_position = (1, 2, 0, 3) self.multiply_data = -10000.0 self.matmul = P.BatchMatMul() self.softmax = nn.Softmax() self.dropout = nn.Dropout(1 - attention_probs_dropout_prob) if self.has_attention_mask: self.expand_dims = P.ExpandDims() self.sub = P.Sub() self.add = P.TensorAdd() self.cast = P.Cast() self.get_dtype = P.DType() if do_return_2d_tensor: self.shape_return = (-1, num_attention_heads * size_per_head) else: self.shape_return = (-1, from_seq_length, num_attention_heads * size_per_head) self.cast_compute_type = SaturateCast(dst_type=compute_type) if self.use_relative_positions: self._generate_relative_positions_embeddings = \ RelaPosEmbeddingsGenerator(length=to_seq_length, depth=size_per_head, max_relative_position=16, initializer_range=initializer_range, use_one_hot_embeddings=use_one_hot_embeddings)
def __init__(self, strategy): super().__init__() self.reshape = P.Reshape() self.mul = P.Mul().set_strategy(strategy) self.relu = P.ReLU()
def __init__(self): super().__init__() self.norm1 = P.L2Normalize() self.norm2 = P.L2Normalize() self.mul1 = P.Mul() self.mul2 = P.Mul()
def __init__(self): super(Net, self).__init__() self.mul = P.Mul() self.relu = P.ReLU() self.wd = Parameter(Tensor(np.ones([8, 8, 8, 8]).astype(np.float32)), name="wide") self.wt = Parameter(Tensor(np.ones([8, 8, 8, 8]).astype(np.float32)), name="l")
def __init__(self, strategy1, strategy2, strategy3): super().__init__() self.mul1 = P.Mul().shard(strategy1) self.arg_max_with_value = P.ArgMaxWithValue(keep_dims=False, axis=-1).shard(strategy2) self.mul2 = P.Mul().shard(strategy3)
def __init__(self): super(MultiOutNet, self).__init__() self.add = P.Add() self.mul = P.Mul() self.sum = P.ReduceSum()
def __init__(self, strategy1, strategy2, strategy3): super().__init__() self.mul1 = P.Mul().shard(strategy1) self.arg_min_with_value = P.ArgMinWithValue(keep_dims=True, axis=-1).shard(strategy2) self.relu = P.ReLU().shard(strategy3)
def __init__(self): super().__init__() self.mul1 = P.Mul() self.prelu = P.PReLU()
def __init__(self, alpha=0.2): super(LeakyReLU, self).__init__() validator.check_value_type('alpha', alpha, [float, int], self.cls_name) self.greater_equal = P.GreaterEqual() self.mul = P.Mul() self.alpha = alpha