def __init__(self, config): super(WideDeepModel, self).__init__() self.batch_size = config.batch_size host_device_mix = bool(config.host_device_mix) parameter_server = bool(config.parameter_server) parallel_mode = _get_parallel_mode() is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) if is_auto_parallel: self.batch_size = self.batch_size * get_group_size() self.field_size = config.field_size self.vocab_size = config.vocab_size self.emb_dim = config.emb_dim self.deep_layer_dims_list = config.deep_layer_dim self.deep_layer_act = config.deep_layer_act self.init_args = config.init_args self.weight_init, self.bias_init = config.weight_bias_init self.weight_bias_init = config.weight_bias_init self.emb_init = config.emb_init self.drop_out = config.dropout_flag self.keep_prob = config.keep_prob self.deep_input_dims = self.field_size * self.emb_dim self.layer_dims = self.deep_layer_dims_list + [1] self.all_dim_list = [self.deep_input_dims] + self.layer_dims init_acts = [('Wide_w', [self.vocab_size, 1], self.emb_init), ('V_l2', [self.vocab_size, self.emb_dim], self.emb_init), ('Wide_b', [1], self.emb_init)] var_map = init_var_dict(self.init_args, init_acts) self.wide_w = var_map["Wide_w"] self.wide_b = var_map["Wide_b"] self.embedding_table = var_map["V_l2"] if parameter_server: self.wide_w.set_param_ps() self.embedding_table.set_param_ps() self.dense_layer_1 = DenseLayer(self.all_dim_list[0], self.all_dim_list[1], self.weight_bias_init, self.deep_layer_act, convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_2 = DenseLayer(self.all_dim_list[1], self.all_dim_list[2], self.weight_bias_init, self.deep_layer_act, convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_3 = DenseLayer(self.all_dim_list[2], self.all_dim_list[3], self.weight_bias_init, self.deep_layer_act, convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_4 = DenseLayer(self.all_dim_list[3], self.all_dim_list[4], self.weight_bias_init, self.deep_layer_act, convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_5 = DenseLayer(self.all_dim_list[4], self.all_dim_list[5], self.weight_bias_init, self.deep_layer_act, use_activation=False, convert_dtype=True, drop_out=config.dropout_flag) self.wide_mul = P.Mul() self.deep_mul = P.Mul() self.reduce_sum = P.ReduceSum(keep_dims=False) self.reshape = P.Reshape() self.deep_reshape = P.Reshape() self.square = P.Square() self.shape = P.Shape() self.tile = P.Tile() self.concat = P.Concat(axis=1) self.cast = P.Cast() if is_auto_parallel and host_device_mix: self.dense_layer_1.dropout.dropout_do_mask.set_strategy( ((1, get_group_size()), )) self.dense_layer_1.matmul.set_strategy( ((1, get_group_size()), (get_group_size(), 1))) self.deep_embeddinglookup = nn.EmbeddingLookup() self.deep_embeddinglookup.embeddinglookup.set_strategy( ((1, get_group_size()), (1, 1))) self.wide_embeddinglookup = nn.EmbeddingLookup() self.wide_embeddinglookup.embeddinglookup.set_strategy( ((get_group_size(), 1), (1, 1))) self.deep_mul.set_strategy(((1, 1, get_group_size()), (1, 1, 1))) self.deep_reshape.add_prim_attr("skip_redistribution", True) self.reduce_sum.add_prim_attr("cross_batch", True) elif parameter_server: self.deep_embeddinglookup = nn.EmbeddingLookup() self.wide_embeddinglookup = nn.EmbeddingLookup() else: self.deep_embeddinglookup = nn.EmbeddingLookup(target='DEVICE') self.wide_embeddinglookup = nn.EmbeddingLookup(target='DEVICE')
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', damping=0.03, loss_scale=1, frequency=278, batch_size=32, has_bias=True, activation=None): super(Dense_Thor_GPU, self).__init__() self.in_channels = Validator.check_positive_int(in_channels) self.out_channels = Validator.check_positive_int(out_channels) self.has_bias = Validator.check_bool(has_bias) self.thor = True if isinstance(weight_init, Tensor): if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \ weight_init.shape[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter( initializer(weight_init, [out_channels, in_channels])) if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels])) self.matmul = P.MatMul(transpose_b=True) self.bias_add = P.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None split_dim = 128 matrix_A_shape, matrix_G_shape = caculate_matmul_shape( self.in_channels, self.out_channels, split_dim) self.matrix_A_inv = Parameter(Tensor( np.zeros(matrix_A_shape).astype(np.float32)), requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.zeros(matrix_G_shape).astype(np.float32)), requires_grad=False) self.broadcast_to = P.BroadcastTo(matrix_A_shape) self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.mul = P.Mul() self.cube_matmul = P.MatMul(transpose_a=True) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.batch_size = Tensor(batch_size, mstype.float16) self.getG = P.InsertGradientOf(self.save_gradient) self.damping = Parameter(Tensor(damping), requires_grad=False) self.dampingA = Tensor(np.identity(in_channels), mstype.float32) self.dampingG = Tensor(np.identity(out_channels), mstype.float32) self.cast = P.Cast() self.gather = P.GatherV2() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.add = P.TensorAdd() self.sqrt = P.Sqrt() self.cholesky = P.CholeskyTrsm(split_dim=split_dim) self.vector_matmul = P.BatchMatMul(transpose_a=True)
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', damping=0.03, loss_scale=1, frequency=278, batch_size=32, has_bias=True, activation=None): super(Dense_Thor, self).__init__() self.in_channels = Validator.check_positive_int(in_channels) self.out_channels = Validator.check_positive_int(out_channels) self.has_bias = Validator.check_bool(has_bias) self.thor = True self.batch_size = batch_size if isinstance(weight_init, Tensor): if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \ weight_init.shape[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter( initializer(weight_init, [out_channels, in_channels])) if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.dim() != 1 or bias_init.shape[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels])) self.matmul = P.MatMul(transpose_b=True) self.bias_add = P.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None self.matrix_A_inv = Parameter(Tensor( np.zeros([128, 128, 16, 16]).astype(np.float16)), requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.zeros([63, 63, 16, 16]).astype(np.float16)), requires_grad=False) self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)) self.matmul = P.MatMul(transpose_b=True) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.vector_matmul = P.CusBatchMatMul() self.pad = P.Pad(((0, 23), (0, 23))) self.pad1 = P.Pad(((0, 7), (0, 7))) self.slice = P.Slice() self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False) self.fused_abs_max1 = P.CusFusedAbsMax1([1001, 1001]) self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.dampingA = Tensor(np.identity(2048), mstype.float32) self.dampingG = Tensor(np.identity(1024), mstype.float32) self.add = P.TensorAdd() self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient)
def construct(self, grid, prediction, pred_xy, pred_wh, y_true, gt_box, input_shape): # prediction : origin output from yolo # pred_xy: (sigmoid(xy)+grid)/grid_size # pred_wh: (exp(wh)*anchors)/input_shape # y_true : after normalize # gt_box: [batch, maxboxes, xyhw] after normalize object_mask = y_true[:, :, :, :, 4:5] class_probs = y_true[:, :, :, :, 5:] true_boxes = y_true[:, :, :, :, :4] grid_shape = P.Shape()(prediction)[1:3] grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32) pred_boxes = self.concat((pred_xy, pred_wh)) true_xy = y_true[:, :, :, :, :2] * grid_shape - grid true_wh = y_true[:, :, :, :, 2:4] true_wh = P.Select()(P.Equal()(true_wh, 0.0), P.Fill()(P.DType()(true_wh), P.Shape()(true_wh), 1.0), true_wh) true_wh = P.Log()(true_wh / self.anchors * input_shape) # 2-w*h for large picture, use small scale, since small obj need more precise box_loss_scale = 2 - y_true[:, :, :, :, 2:3] * y_true[:, :, :, :, 3:4] gt_shape = P.Shape()(gt_box) gt_box = P.Reshape()(gt_box, (gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2])) # add one more dimension for broadcast iou = self.iou(P.ExpandDims()(pred_boxes, -2), gt_box) # gt_box is x,y,h,w after normalize # [batch, grid[0], grid[1], num_anchor, num_gt] best_iou = self.reduce_max(iou, -1) # [batch, grid[0], grid[1], num_anchor] # ignore_mask IOU too small ignore_mask = best_iou < self.ignore_threshold ignore_mask = P.Cast()(ignore_mask, ms.float32) ignore_mask = P.ExpandDims()(ignore_mask, -1) # ignore_mask backpro will cause a lot maximunGrad and minimumGrad time consume. # so we turn off its gradient ignore_mask = F.stop_gradient(ignore_mask) confidence_loss = self.confidenceLoss(object_mask, prediction[:, :, :, :, 4:5], ignore_mask) class_loss = self.classLoss(object_mask, prediction[:, :, :, :, 5:], class_probs) object_mask_me = P.Reshape()(object_mask, (-1, 1)) # [8, 72, 72, 3, 1] box_loss_scale_me = P.Reshape()(box_loss_scale, (-1, 1)) pred_boxes_me = xywh2x1y1x2y2(pred_boxes) pred_boxes_me = P.Reshape()(pred_boxes_me, (-1, 4)) true_boxes_me = xywh2x1y1x2y2(true_boxes) true_boxes_me = P.Reshape()(true_boxes_me, (-1, 4)) ciou = self.giou(pred_boxes_me, true_boxes_me) ciou_loss = object_mask_me * box_loss_scale_me * (1 - ciou) ciou_loss_me = self.reduce_sum(ciou_loss, ()) loss = ciou_loss_me + confidence_loss + class_loss batch_size = P.Shape()(prediction)[0] return loss / batch_size
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step, param, m, v, gradient, decay_flag): """ Update parameters. Args: beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0). beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0). eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0. lr (Tensor): Learning rate. weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0. global_step (Tensor): Global step. param (Tensor): Parameters. m (Tensor): m value of parameters. v (Tensor): v value of parameters. gradient (Tensor): Gradient of parameters. decay_flag (bool): Specifies whether param update with weight decay. Returns: Tensor, the new value of v after updating. """ op_mul = P.Mul() op_sqrt = P.Sqrt() op_rsqrt = P.Rsqrt() op_square = P.Square() op_cast = P.Cast() op_reshape = P.Reshape() op_shape = P.Shape() op_pow = P.Pow() op_norm = layer.Norm() op_select = P.Select() op_greater = P.Greater() op_fill = P.Fill() op_dtype = P.DType() param_fp32 = op_cast(param, mstype.float32) m_fp32 = op_cast(m, mstype.float32) v_fp32 = op_cast(v, mstype.float32) gradient_fp32 = op_cast(gradient, mstype.float32) next_m = op_mul(beta1, m_fp32) + op_mul( op_cast(num_one, mstype.float32) - beta1, gradient_fp32) next_v = op_mul(beta2, v_fp32) + op_mul( op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32)) next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow( beta1, op_cast(global_step + num_one, mstype.float32))) next_vv = next_v / (op_cast(num_one, mstype.float32) - op_pow( beta2, op_cast(global_step + num_one, mstype.float32))) w_norm = op_norm(param_fp32) g_norm = op_norm(gradient_fp32) g_norm_hat = op_norm( op_mul(next_mm, op_rsqrt(next_vv + eps)) + weight_decay_tensor * param_fp32) zeros = F.zeros_like(w_norm) ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0) trust_ratio = op_select( op_greater(w_norm, zeros), op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones), ones) tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0) trust_ratio = C.clip_by_value(trust_ratio, zeros, tens) update = next_mm / (op_sqrt(next_vv) + eps) if decay_flag: update = update + op_mul(weight_decay_tensor, param_fp32) update_with_lr = op_mul(op_mul(trust_ratio, lr), update) next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32)) next_v = F.depend(next_v, F.assign(param, next_param)) next_v = F.depend(next_v, F.assign(m, next_m)) next_v = F.depend(next_v, F.assign(v, next_v)) return next_v
def __init__(self, num_features): super().__init__() self.reshape = P.Reshape() self.shape = P.Shape() self.bn2d = nn.BatchNorm2d(num_features, data_format="NCHW")
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, has_bias=False, weight_init='normal', bias_init='zeros'): kernel_size = twice(kernel_size) stride = twice(stride) dilation = twice(dilation) Validator.check_value_type('padding', padding, (int, tuple), self.cls_name) if isinstance(padding, tuple): Validator.check_equal_int(len(padding), 4, 'padding size', self.cls_name) # out_channels and in_channels swap. # cause Conv2DBackpropInput's out_channel refers to Conv2D's out_channel, # then Conv2dTranspose's out_channel refers to Conv2DBackpropInput's in_channel. super(Conv2dTranspose, self).__init__( in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, has_bias, weight_init, bias_init, transposed=True) self.in_channels = in_channels self.out_channels = out_channels self.shape = P.Shape() if pad_mode not in ('valid', 'same', 'pad'): raise ValueError('Attr \'pad_mode\' of \'Conv2dTranspose\' Op passed ' + str(pad_mode) + ', should be one of values in \'valid\', \'same\', \'pad\'.') self.is_valid = self.pad_mode == 'valid' self.is_same = self.pad_mode == 'same' self.is_pad = self.pad_mode == 'pad' if Validator.check_bool(has_bias): self.bias = Parameter(initializer(bias_init, [out_channels]), name='bias') # cause Conv2DBackpropInput's out_channel refers to Conv2D's out_channel. self.conv2d_transpose = P.Conv2DBackpropInput(out_channel=in_channels, kernel_size=kernel_size, mode=1, pad_mode=pad_mode, pad=padding, stride=stride, dilation=dilation, group=group) self.bias_add = P.BiasAdd() if isinstance(self.padding, int): self.padding_top, self.padding_bottom, self.padding_left, self.padding_right = (self.padding,) * 4 else: self.padding_top, self.padding_bottom, self.padding_left, self.padding_right = self.padding
'block': P.Transpose(), 'desc_const': [(0, 2, 1)], 'desc_inputs': [[1, 2, 3]], 'desc_bprop': [[1, 3, 2]]}), ('Transpose_dim4', { 'block': P.Transpose(), 'desc_const': [(0, 1, 2, 3)], 'desc_inputs': [[1, 2, 3, 4]], 'desc_bprop': [[1, 2, 4, 3]]}), ('AddN', { 'block': NetForTupleInput(P.AddN()), 'desc_inputs': [[2, 3, 3, 5], [2, 3, 3, 5]], 'desc_bprop': [[2, 3, 3, 5]], 'skip': ['backward']}), ('Shape', { 'block': P.Shape(), 'desc_inputs': [[3, 3, 2, 2]], 'skip': ['backward']}), ('Reshape', { 'block': P.Reshape(), 'desc_const': [(64,)], 'desc_inputs': [[64, 1]], 'desc_bprop': [[64]]}), ('Cast', { 'block': P.Cast(), 'desc_const': [mstype.int32], 'desc_inputs': [[2, 3, 4, 5]], 'desc_bprop': [Tensor(np.ones((2, 3, 3, 5)).astype(np.int32))]}), ('ExpandDims', { 'block': P.ExpandDims(), 'desc_const': [0],
def get_axis(x): shape_op = P.Shape() shape = shape_op(x) length = F.tuple_len(shape) perm = F.make_range(0, length) return perm
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0, loss_scale=1.0, decay_filter=lambda x: x.name not in []): super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale) if isinstance(momentum, float) and momentum < 0.0: raise ValueError( "momentum should be at least 0.0, but got momentum {}".format( momentum)) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") self.params = self.parameters self.moments = self.params.clone(prefix="moments", init='zeros') self.hyper_map = C.HyperMap() self.opt = P.ApplyMomentum() self.matrix_A = ParameterTuple(matrix_A) self.matrix_G = ParameterTuple(matrix_G) self.A_inv_max = ParameterTuple(A_inv_max) self.G_inv_max = ParameterTuple(G_inv_max) self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast() self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft() self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight() self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul() self.transpose = P.Transpose() self.shape = P.Shape() self.reshape = P.Reshape() self.mul = P.Mul() self.weight_idx = [] for i in range(len(self.params)): if "conv" in self.params[i].name or "end_point" in self.params[ i].name: self.weight_idx.append(i) self.weight_idx.append(len(self.params)) self.feature_map = [ 1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 ] mean = _get_gradients_mean() degree = _get_device_num() parameter_length = len(self.feature_map) self.grad_reducer_Amax = DistributedGradReducerThor( parameter_length, ((27, ), 2), mean, degree) self.grad_reducer_Gmax = DistributedGradReducerThor( parameter_length, ((27, ), 4), mean, degree) self.grad_reducer_A = DistributedGradReducerThor( parameter_length, ((27, ), 6), mean, degree) self.grad_reducer_G = DistributedGradReducerThor( parameter_length, ((27, ), 8), mean, degree) self.matrix_A_inv = () self.matrix_G_inv = () self.matrix_max_inv = () for i in range(54): self.matrix_max_inv = self.matrix_max_inv + (Parameter( initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False), ) self.log = P.Log() self.exp = P.Exp() self.sqrt = P.Sqrt() self.matrix_max_inv = ParameterTuple(self.matrix_max_inv) self.assign = P.Assign() self.cast = P.Cast() self.thor = True self.weight_decay = weight_decay * loss_scale self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0, loss_scale=1.0, use_nesterov=False, decay_filter=lambda x: x.name not in []): super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale) Validator.check_value_type("momentum", momentum, [float], self.cls_name) if isinstance(momentum, float) and momentum < 0.0: raise ValueError( "momentum should be at least 0.0, but got momentum {}".format( momentum)) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") self.params = self.parameters self.use_nesterov = Validator.check_bool(use_nesterov) self.moments = self.params.clone(prefix="moments", init='zeros') self.hyper_map = C.HyperMap() self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov) self.feature_map = [ 1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 ] self.feature_map_new = [x**0.5 for x in self.feature_map] self.transpose = P.Transpose() self.shape = P.Shape() self.reshape = P.Reshape() self.matmul = P.MatMul() self.matrix_A = ParameterTuple(matrix_A) self.matrix_G = ParameterTuple(matrix_G) self.A_inv_max = ParameterTuple(A_inv_max) self.G_inv_max = ParameterTuple(G_inv_max) self.assign = P.Assign() self.mul = P.Mul() mean = _get_gradients_mean() degree = _get_device_num() parameter_length = len(self.feature_map) self.grad_reducer_thorA = DistributedGradReducerThor( parameter_length, ((parameter_length, ), 0), mean, degree) self.grad_reducer_thorG = DistributedGradReducerThor( parameter_length, ((parameter_length, ), 0), mean, degree) self.weight_decay = weight_decay self.decay_flags = tuple(decay_filter(x) for x in self.parameters) self.update_gradient = P.UpdateThorGradient(split_dim=128)
from mindspore.common.parameter import Parameter, ParameterTuple from mindspore.common import dtype as mstype from mindspore._checkparam import Validator as validator from mindspore._checkparam import Rel from mindspore.nn import Optimizer from mindspore.nn import TrainOneStepCell, WithLossCell from mindspore.nn.optim import Momentum from mindspore.train import Model from ....dataset_mock import MindData context.set_context(mode=context.GRAPH_MODE, enable_sparse=True) reduce_sum = P.ReduceSum() unsorted_segment_sum = P.UnsortedSegmentSum() transpose = P.Transpose() shape_op = P.Shape() reshape = P.Reshape() size_op = P.Size() invert_permutation = P.InvertPermutation() logical_and = P.LogicalAnd() def get_axis(x): shape = shape_op(x) length = F.tuple_len(shape) perm = F.make_range(0, length) return perm class MSELoss(nn.Cell): def __init__(self):
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient, param, m, v, ps_parameter, cache_enable): """Apply sparse adam optimizer to the weight parameter when the gradient is sparse.""" success = True indices = gradient.indices values = gradient.values if ps_parameter and not cache_enable: op_shape = P.Shape() shapes = (op_shape(param), op_shape(m), op_shape(v), op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1), op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices)) success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, values, indices), shapes), param)) return success if not target: success = F.depend(success, sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1, beta2, eps, values, indices)) else: op_mul = P.Mul() op_square = P.Square() op_sqrt = P.Sqrt() scatter_add = P.ScatterAdd(use_locking) assign_m = F.assign(m, op_mul(beta1, m)) assign_v = F.assign(v, op_mul(beta2, v)) grad_indices = gradient.indices grad_value = gradient.values next_m = scatter_add(m, grad_indices, op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value)) next_v = scatter_add(v, grad_indices, op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value))) if use_nesterov: m_temp = next_m * _scaler_ten assign_m_nesterov = F.assign(m, op_mul(beta1, next_m)) div_value = scatter_add(m, op_mul(grad_indices, _scaler_one), op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value)) param_update = div_value / (op_sqrt(next_v) + eps) m_recover = F.assign(m, m_temp / _scaler_ten) F.control_depend(m_temp, assign_m_nesterov) F.control_depend(assign_m_nesterov, div_value) F.control_depend(param_update, m_recover) else: param_update = next_m / (op_sqrt(next_v) + eps) lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power) next_param = param - lr_t * param_update F.control_depend(assign_m, next_m) F.control_depend(assign_v, next_v) success = F.depend(success, F.assign(param, next_param)) success = F.depend(success, F.assign(m, next_m)) success = F.depend(success, F.assign(v, next_v)) return success
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0, loss_scale=1.0, num_hidden_layers=24, batch_size=12, damping=0.03, frequency=10, decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()): super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale) if isinstance(momentum, float) and momentum < 0.0: raise ValueError( "momentum should be at least 0.0, but got momentum {}".format( momentum)) self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum") self.params = self.parameters self.moments = self.params.clone(prefix="moments", init='zeros') self.hyper_map = C.HyperMap() self.opt = P.ApplyMomentum() self.matrix_A = ParameterTuple(matrix_A) self.matrix_G = ParameterTuple(matrix_G) self.A_inv_max = ParameterTuple(A_inv_max) self.G_inv_max = ParameterTuple(G_inv_max) self.matmul = P.MatMul() self.transpose = P.Transpose() self.shape = P.Shape() self.reshape = P.Reshape() self.mul = P.Mul() self.gather = P.GatherV2() self.matrix_A_inv = () self.matrix_G_inv = () self.matrix_max_inv = () self.num_hidden_layers = num_hidden_layers fc_layer_num = num_hidden_layers * 6 + 5 for i in range(fc_layer_num): self.matrix_max_inv = self.matrix_max_inv + (Parameter( initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False), ) self.log = P.Log() self.exp = P.Exp() self.sqrt = P.Sqrt() self.matrix_max_inv = ParameterTuple(self.matrix_max_inv) self.assign = P.Assign() self.cast = P.Cast() self.thor = True self.weight_decay = weight_decay * loss_scale self.decay_flags = tuple(decay_filter(x) for x in self.parameters) self.expand = P.ExpandDims() self.square = P.Square() self.inv = P.Inv() self.batch_size = batch_size self.damping = damping self.freq = Tensor(frequency, mstype.int32) self.one = Tensor(1, mstype.int32) self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) mean = _get_mirror_mean() degree = _get_device_num() self.grad_reducer_g = DistributedGradReducerThor1( self.parameters, 3, mean, degree)
def __init__(self, config): super(AutoDisModel, self).__init__() self.batch_size = config.batch_size self.field_size = config.data_field_size self.vocab_size = config.data_vocab_size self.emb_dim = config.data_emb_dim self.deep_layer_dims_list, self.deep_layer_act = config.deep_layer_args self.init_args = config.init_args self.weight_bias_init = config.weight_bias_init self.keep_prob = config.keep_prob self.hash_size = config.hash_size self.split_index = config.split_index self.temperature = config.temperature init_acts = [('W_l2', [self.vocab_size, 1], 'normal', ms_type), ('V_l2', [self.vocab_size, self.emb_dim], 'normal', ms_type), ('b', [1], 'normal', ms_type), ('logits', [self.split_index, self.hash_size], 'random', ms_type), ('autodis_embedding', [self.split_index, self.hash_size, self.emb_dim], 'random', ms_type_16)] var_map = init_var_dict(self.init_args, init_acts) self.fm_w = var_map["W_l2"] self.fm_b = var_map["b"] self.embedding_table = var_map["V_l2"] self.logits = var_map["logits"] self.autodis_embedding = var_map["autodis_embedding"] # Deep Layers self.deep_input_dims = self.field_size * self.emb_dim + 1 self.all_dim_list = [self.deep_input_dims ] + self.deep_layer_dims_list + [1] self.dense_layer_1 = DenseLayer(self.all_dim_list[0], self.all_dim_list[1], self.weight_bias_init, self.deep_layer_act, self.keep_prob) self.dense_layer_2 = DenseLayer(self.all_dim_list[1], self.all_dim_list[2], self.weight_bias_init, self.deep_layer_act, self.keep_prob) self.dense_layer_3 = DenseLayer(self.all_dim_list[2], self.all_dim_list[3], self.weight_bias_init, self.deep_layer_act, self.keep_prob) self.dense_layer_4 = DenseLayer(self.all_dim_list[3], self.all_dim_list[4], self.weight_bias_init, self.deep_layer_act, self.keep_prob) # FM, linear Layers self.Gatherv2 = P.GatherV2() self.Mul = P.Mul() self.ReduceSum = P.ReduceSum(keep_dims=False) self.Reshape = P.Reshape() self.Square = P.Square() self.Shape = P.Shape() self.Tile = P.Tile() self.Concat = P.Concat(axis=1) self.Cast = P.Cast() # AutoDis self.Slice = P.Slice() self.BatchMatMul = P.BatchMatMul() self.ExpandDims = P.ExpandDims() self.Transpose = P.Transpose() self.SoftMax = P.Softmax()
def _IgammacContinuedFraction(ax, x, a, enabled): """Helper function for computing Igammac using a continued fraction.""" abs_x = P.Abs() logicaland = P.LogicalAnd() greater = P.Greater() less = P.Less() notequal = P.NotEqual() fill = P.Fill() shape = P.Shape() dtype = P.DType() select = P.Select() # If more data types are supported, this epsilon need to be selected. epsilon = eps_fp32 def cond(vals): enabled = vals[0] c = vals[5] return logicaland(less(c, 2000), enabled) def body(vals): enabled = vals[0] ans = vals[1] t = vals[2] y = vals[3] z = vals[4] c = vals[5] pkm1 = vals[6] qkm1 = vals[7] pkm2 = vals[8] qkm2 = vals[9] dpkm2_da = vals[10] dqkm2_da = vals[11] dpkm1_da = vals[12] dqkm1_da = vals[13] dans_da = vals[14] c = c + 1 y = y + 1 z = z + 2 yc = y * c pk = pkm1 * z - pkm2 * yc qk = qkm1 * z - qkm2 * yc qk_is_nonzero = notequal(qk, 0) r = pk / qk t = select(qk_is_nonzero, abs_x((ans - r) / r), fill(dtype(t), shape(t), 1)) ans = select(qk_is_nonzero, r, ans) dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c dans_da_new = select(qk_is_nonzero, (dpk_da - ans * dqk_da) / qk, dans_da) grad_conditional = select(qk_is_nonzero, abs_x(dans_da_new - dans_da), fill(dtype(dans_da), shape(dans_da), 1)) pkm2 = pkm1 pkm1 = pk qkm2 = qkm1 qkm1 = qk dpkm2_da = dpkm1_da dqkm2_da = dqkm1_da dpkm1_da = dpk_da dqkm1_da = dqk_da rescale = greater(abs_x(pk), 1 / epsilon) pkm2 = select(rescale, pkm2 * epsilon, pkm2) pkm1 = select(rescale, pkm1 * epsilon, pkm1) qkm2 = select(rescale, qkm2 * epsilon, qkm2) qkm1 = select(rescale, qkm1 * epsilon, qkm1) dpkm2_da = select(rescale, dpkm2_da * epsilon, dpkm2_da) dqkm2_da = select(rescale, dqkm2_da * epsilon, dqkm2_da) dpkm1_da = select(rescale, dpkm1_da * epsilon, dpkm1_da) dqkm1_da = select(rescale, dqkm1_da * epsilon, dqkm1_da) conditional = logicaland(enabled, greater(grad_conditional, epsilon)) return (conditional, select(enabled, ans, vals[1]), select(enabled, t, vals[2]), select(enabled, y, vals[3]), select(enabled, z, vals[4]), c, select(enabled, pkm1, vals[6]), select(enabled, qkm1, vals[7]), select(enabled, pkm2, vals[8]), select(enabled, qkm2, vals[9]), select(enabled, dpkm2_da, vals[10]), select(enabled, dqkm2_da, vals[11]), select(enabled, dpkm1_da, vals[12]), select(enabled, dqkm1_da, vals[13]), select(enabled, dans_da_new, vals[14])) y = 1 - a z = x + y + 1 c = fill(dtype(x), shape(x), 0) pkm2 = fill(dtype(x), shape(x), 1) qkm2 = x pkm1 = x + 1 qkm1 = z * x ans = pkm1 / qkm1 t = fill(dtype(x), shape(x), 1) dpkm2_da = fill(dtype(x), shape(x), 0) dqkm2_da = fill(dtype(x), shape(x), 0) dpkm1_da = fill(dtype(x), shape(x), 0) dqkm1_da = -x dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1 vals = (enabled, ans, t, y, z, c, pkm1, qkm1, pkm2, qkm2, dpkm2_da, dqkm2_da, dpkm1_da, dqkm1_da, dans_da) vals = _while_helper_func(cond, body, vals) ans = vals[1] return ans * ax
def __init__(self, config): super(WideDeepModel, self).__init__() self.batch_size = config.batch_size host_device_mix = bool(config.host_device_mix) parameter_server = bool(config.parameter_server) parallel_mode = context.get_auto_parallel_context("parallel_mode") is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) if is_auto_parallel: self.batch_size = self.batch_size * get_group_size() is_field_slice = config.field_slice sparse = config.sparse self.field_size = config.field_size self.vocab_size = config.vocab_size self.vocab_cache_size = config.vocab_cache_size self.emb_dim = config.emb_dim self.deep_layer_dims_list = config.deep_layer_dim self.deep_layer_act = config.deep_layer_act self.init_args = config.init_args self.weight_init, self.bias_init = config.weight_bias_init self.weight_bias_init = config.weight_bias_init self.emb_init = config.emb_init self.drop_out = config.dropout_flag self.keep_prob = config.keep_prob self.deep_input_dims = self.field_size * self.emb_dim self.layer_dims = self.deep_layer_dims_list + [1] self.all_dim_list = [self.deep_input_dims] + self.layer_dims init_acts = [('Wide_b', [1], self.emb_init)] var_map = init_var_dict(self.init_args, init_acts) self.wide_b = var_map["Wide_b"] self.dense_layer_1 = DenseLayer(self.all_dim_list[0], self.all_dim_list[1], self.weight_bias_init, self.deep_layer_act, convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_2 = DenseLayer(self.all_dim_list[1], self.all_dim_list[2], self.weight_bias_init, self.deep_layer_act, convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_3 = DenseLayer(self.all_dim_list[2], self.all_dim_list[3], self.weight_bias_init, self.deep_layer_act, convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_4 = DenseLayer(self.all_dim_list[3], self.all_dim_list[4], self.weight_bias_init, self.deep_layer_act, convert_dtype=True, drop_out=config.dropout_flag) self.dense_layer_5 = DenseLayer(self.all_dim_list[4], self.all_dim_list[5], self.weight_bias_init, self.deep_layer_act, use_activation=False, convert_dtype=True, drop_out=config.dropout_flag) self.wide_mul = P.Mul() self.deep_mul = P.Mul() self.reduce_sum = P.ReduceSum(keep_dims=False) self.reshape = P.Reshape() self.deep_reshape = P.Reshape() self.square = P.Square() self.shape = P.Shape() self.tile = P.Tile() self.concat = P.Concat(axis=1) self.cast = P.Cast() self.unique = P.Unique().shard(((1, ), )) self.wide_gatherv2 = P.GatherV2() self.deep_gatherv2 = P.GatherV2() if is_auto_parallel and sparse and not is_field_slice: target = 'DEVICE' if host_device_mix: target = 'CPU' self.wide_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, 1, target=target, slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE) if config.deep_table_slice_mode == "column_slice": self.deep_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, self.emb_dim, target=target, slice_mode=nn.EmbeddingLookup.TABLE_COLUMN_SLICE) self.dense_layer_1.dropout.dropout.shard( ((1, get_group_size()), )) self.dense_layer_1.dropout.dropout_do_mask.shard( ((1, get_group_size()), )) self.dense_layer_1.matmul.shard( ((1, get_group_size()), (get_group_size(), 1))) self.dense_layer_1.matmul.add_prim_attr( "field_size", self.field_size) self.deep_mul.shard(((1, 1, get_group_size()), (1, 1, 1))) self.deep_reshape.add_prim_attr("skip_redistribution", True) else: self.deep_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, self.emb_dim, target=target, slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE) self.reduce_sum.add_prim_attr("cross_batch", True) self.embedding_table = self.deep_embeddinglookup.embedding_table elif is_auto_parallel and host_device_mix and is_field_slice and config.full_batch and config.manual_shape: manual_shapes = tuple((s[0] for s in config.manual_shape)) self.deep_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, self.emb_dim, slice_mode=nn.EmbeddingLookup.FIELD_SLICE, manual_shapes=manual_shapes) self.wide_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, 1, slice_mode=nn.EmbeddingLookup.FIELD_SLICE, manual_shapes=manual_shapes) self.deep_mul.shard( ((1, get_group_size(), 1), (1, get_group_size(), 1))) self.wide_mul.shard( ((1, get_group_size(), 1), (1, get_group_size(), 1))) self.reduce_sum.shard(((1, get_group_size(), 1), )) self.dense_layer_1.dropout.dropout.shard(((1, get_group_size()), )) self.dense_layer_1.dropout.dropout_do_mask.shard( ((1, get_group_size()), )) self.dense_layer_1.matmul.shard( ((1, get_group_size()), (get_group_size(), 1))) self.embedding_table = self.deep_embeddinglookup.embedding_table elif parameter_server: cache_enable = self.vocab_cache_size > 0 target = 'DEVICE' if cache_enable else 'CPU' if not cache_enable: sparse = True if is_auto_parallel and config.full_batch and cache_enable: self.deep_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, self.emb_dim, target=target, slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE, sparse=sparse, vocab_cache_size=self.vocab_cache_size) self.wide_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, 1, target=target, slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE, sparse=sparse, vocab_cache_size=self.vocab_cache_size) else: self.deep_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, self.emb_dim, target=target, sparse=sparse, vocab_cache_size=self.vocab_cache_size) self.wide_embeddinglookup = nn.EmbeddingLookup( self.vocab_size, 1, target=target, sparse=sparse, vocab_cache_size=self.vocab_cache_size) self.embedding_table = self.deep_embeddinglookup.embedding_table self.deep_embeddinglookup.embedding_table.set_param_ps() self.wide_embeddinglookup.embedding_table.set_param_ps() else: self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, self.emb_dim, target='DEVICE', sparse=sparse) self.wide_embeddinglookup = nn.EmbeddingLookup(self.vocab_size, 1, target='DEVICE', sparse=sparse) self.embedding_table = self.deep_embeddinglookup.embedding_table
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', damping=0.03, loss_scale=1, frequency=100, has_bias=False, activation=None, batch_size=12): super(Dense_Thor, self).__init__() self.in_channels = Validator.check_positive_int(in_channels) self.out_channels = Validator.check_positive_int(out_channels) self.has_bias = Validator.check_bool(has_bias) self.thor = True if isinstance(weight_init, Tensor): if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ weight_init.shape()[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.dim() != 1 or bias_init.shape( )[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") self.matmul = P.MatMul(transpose_b=True) self.bias_add = P.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None self.matrix_A_inv = Parameter(Tensor( np.zeros([in_channels, in_channels]).astype(np.float16)), name='matrix_A_inv', requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.zeros([out_channels, out_channels]).astype(np.float16)), name="matrix_G_inv", requires_grad=False) self.fake_G = Tensor( np.zeros([out_channels, out_channels]).astype(np.float16)) self.matmul = P.MatMul(transpose_b=True) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = damping self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.vector_matmul = P.CusBatchMatMul() self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.abs = P.Abs() self.reduce_max = P.ReduceMax(keep_dims=False) self.log = P.Log() self.exp = P.Exp() self.dampingA = Tensor(np.identity(in_channels), mstype.float32) self.dampingG = Tensor(np.identity(out_channels), mstype.float32) self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient) self.batch_size = batch_size
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, has_bias=False, weight_init='normal', bias_init='zeros'): Validator.check_value_type("kernel_size", kernel_size, [int], self.cls_name) Validator.check_value_type("stride", stride, [int], self.cls_name) Validator.check_value_type("padding", padding, [int], self.cls_name) Validator.check_value_type("dilation", dilation, [int], self.cls_name) Validator.check_int(kernel_size, 1, Rel.GE, 'kernel_size', self.cls_name) Validator.check_int(stride, 1, Rel.GE, 'stride', self.cls_name) Validator.check_non_negative_int(padding, 'padding', self.cls_name) Validator.check_int(dilation, 1, Rel.GE, 'dilation', self.cls_name) kernel_size = (1, kernel_size) stride = (1, stride) dilation = (1, dilation) get_shape = P.Shape() get_dtype = P.DType() if isinstance(weight_init, Tensor): weight_init_shape = get_shape(weight_init) Validator.check_equal_int(len(weight_init_shape), 3, 'weight_init_shape', self.cls_name) weight_init_dtype = get_dtype(weight_init) weight_init_value = weight_init.asnumpy() weight_init_value = np.expand_dims(weight_init_value, 2) weight_init = Tensor(weight_init_value, weight_init_dtype) super(Conv1d, self).__init__( in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, has_bias, weight_init, bias_init) self.padding = (0, 0, padding, padding) self.conv2d = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, mode=1, pad_mode=self.pad_mode, pad=self.padding, stride=self.stride, dilation=self.dilation, group=self.group) self.bias_add = P.BiasAdd() if pad_mode not in ('valid', 'same', 'pad'): raise ValueError('Attr \'pad_mode\' of \'Conv1d\' Op passed ' + str(pad_mode) + ', should be one of values in \'valid\', \'same\', \'pad\'.') self.expand_dims = P.ExpandDims() self.squeeze = P.Squeeze(2) self.shape = P.Shape()
def __init__( self, vocab_size, embedding_size, embedding_shape, use_one_hot_embeddings=False, initializer_range=0.02, name='embedding_table', batch_size=12, damping=0.03, loss_scale=1, frequency=100, ): super(Embedding_Thor, self).__init__() self.vocab_size = vocab_size self.use_one_hot_embeddings = use_one_hot_embeddings self.embedding_table = Parameter(initializer( TruncatedNormal(initializer_range), [vocab_size, embedding_size]), name=name) self.thor = True self.expand = P.ExpandDims() self.shape_flat = (-1, ) self.gather = P.GatherV2() self.one_hot = P.OneHot() self.on_value = Tensor(1.0, mstype.float32) self.off_value = Tensor(0.0, mstype.float32) self.array_mul = P.MatMul() self.reshape = P.Reshape() self.em_shape = tuple(embedding_shape) self.shape = P.Shape() self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.matrix_A_inv = Parameter(Tensor( np.zeros([vocab_size]).astype(np.float16)), name='matrix_A_inv', requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.zeros([embedding_size, embedding_size]).astype(np.float16)), name="matrix_G_inv", requires_grad=False) self.fake_G = Tensor( np.zeros([embedding_size, embedding_size]).astype(np.float16)) self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32)) self.dampingG = Tensor(np.identity(embedding_size), mstype.float32) self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.damping = damping self.gather = P.GatherV2() self.sqrt = P.Sqrt() self.mul = P.Mul() self.cast = P.Cast() self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.vector_matmul = P.CusBatchMatMul() self.cholesky = P.CusCholeskyTrsm() self.matrix_combine = P.CusMatrixCombine() self.reduce_sum = P.ReduceSum(keep_dims=False) self.inv = P.Inv() self.getG = P.InsertGradientOf(self.save_gradient) self.batch_size = batch_size
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, has_bias=False, weight_init='normal', bias_init='zeros'): Validator.check_value_type("kernel_size", kernel_size, [int], self.cls_name) Validator.check_value_type("stride", stride, [int], self.cls_name) Validator.check_value_type("padding", padding, [int], self.cls_name) Validator.check_value_type("dilation", dilation, [int], self.cls_name) Validator.check_int(kernel_size, 1, Rel.GE, 'kernel_size', self.cls_name) Validator.check_int(stride, 1, Rel.GE, 'stride', self.cls_name) Validator.check_non_negative_int(padding, 'padding', self.cls_name) Validator.check_int(dilation, 1, Rel.GE, 'dilation', self.cls_name) kernel_size = (1, kernel_size) stride = (1, stride) dilation = (1, dilation) get_shape = P.Shape() get_dtype = P.DType() if isinstance(weight_init, Tensor): weight_init_shape = get_shape(weight_init) Validator.check_equal_int(len(weight_init_shape), 3, 'weight_init_shape', self.cls_name) weight_init_dtype = get_dtype(weight_init) weight_init_value = weight_init.asnumpy() weight_init_value = np.expand_dims(weight_init_value, 2) weight_init = Tensor(weight_init_value, weight_init_dtype) # out_channels and in_channels swap. # cause Conv2DBackpropInput's out_channel refers to Conv2D's out_channel, # then Conv1dTranspose's out_channel refers to Conv2DBackpropInput's in_channel. super(Conv1dTranspose, self).__init__( in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, has_bias, weight_init, bias_init, transposed=True) self.padding = (0, 0, padding, padding) self.in_channels = in_channels self.out_channels = out_channels self.shape = P.Shape() if pad_mode not in ('valid', 'same', 'pad'): raise ValueError('Attr \'pad_mode\' of \'Conv1dTranspose\' Op passed ' + str(pad_mode) + ', should be one of values in \'valid\', \'same\', \'pad\'.') self.is_valid = self.pad_mode == 'valid' self.is_same = self.pad_mode == 'same' self.is_pad = self.pad_mode == 'pad' if Validator.check_bool(has_bias): self.bias = Parameter(initializer(bias_init, [out_channels]), name='bias') # cause Conv2DBackpropInput's out_channel refers to Conv2D's out_channel. self.conv2d_transpose = P.Conv2DBackpropInput(out_channel=in_channels, kernel_size=kernel_size, mode=1, pad_mode=pad_mode, pad=self.padding, stride=stride, dilation=dilation, group=group) self.bias_add = P.BiasAdd() self.expand_dims = P.ExpandDims() self.squeeze = P.Squeeze(2)
def __init__(self, vocab_size, embedding_size, param_init='normal', target='CPU', slice_mode='batch_slice', manual_shapes=None, max_norm=None, sparse=True, vocab_cache_size=0): super(EmbeddingLookup, self).__init__() validator.check_value_type('sparse', sparse, [bool], self.cls_name) self.target = target if target not in ('CPU', 'DEVICE'): raise ValueError('Attr \'target\' of \'EmbeddingLookup\' Op passed ' + str(target) + ', should be one of values in \'CPU\', \'DEVICE\'.') if not sparse and target == 'CPU': raise ValueError('When target is CPU, embedding_lookup must be sparse.') enable_ps = context.get_ps_context("enable_ps") if not enable_ps and vocab_cache_size > 0: logger.warning("The configuration of 'vocab_cache_size' is valid only in parameter server trainning mode, " "current mode is not parameter server trainning mode, so it will be ignored.") vocab_cache_size = 0 if sparse: self.gatherv2 = P.SparseGatherV2() else: self.gatherv2 = P.GatherV2() self.embeddinglookup = P.EmbeddingLookup().add_prim_attr('primitive_target', 'CPU') self.vocab_size = validator.check_positive_int(vocab_size, 'vocab_size') self.vocab_cache_size = validator.check_non_negative_int(vocab_cache_size, 'vocab_cache_size') self.embedding_size = validator.check_positive_int(embedding_size, 'embedding_size') parallel_mode = _get_parallel_mode() is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL) self.cache_enable = self.vocab_cache_size > 0 if self.cache_enable: if is_auto_parallel: self.vocab_cache_size = self.vocab_cache_size * get_group_size() self.vocab_size = self.vocab_cache_size self.embedding_table = Parameter(initializer(param_init, [self.vocab_size, self.embedding_size]), name='embedding_table') if self.cache_enable: self.embedding_table.cache_enable = True _set_cache_enable(True) if _is_role_worker(): _insert_hash_table_size(self.embedding_table.name, vocab_cache_size, embedding_size, vocab_size) self.forward_unique = False self.gather_revert = P.GatherV2() self.unique = P.Unique().shard(((1,),)) self.reshape = P.Reshape() self.shape = P.Shape() indices_shape_size = 2 if slice_mode == "field_slice" and is_auto_parallel: if not manual_shapes: raise ValueError("in slice field mode, the manual_shapes should not be none") if not isinstance(manual_shapes, tuple): raise TypeError("manual_shapes type must be tuple(int) cannot be {}!".format(type(manual_shapes))) for dim in manual_shapes: validator.check_positive_int(dim, 'manual shape dim', self.cls_name) self.gatherv2.add_prim_attr("manual_split", manual_shapes) self.embeddinglookup.add_prim_attr("manual_split", manual_shapes) self.gatherv2.shard(((get_group_size(), 1), (1, get_group_size()))) self.embeddinglookup.shard(((get_group_size(), 1), (1, get_group_size()))) elif slice_mode == "table_row_slice" and is_auto_parallel: if target == 'DEVICE' and not self.cache_enable: indices_shape_size = 1 self.gather_revert.shard(((1, 1), (get_group_size(),))) self.forward_unique = True indices_strategy = (1,)*indices_shape_size self.gatherv2.shard(((get_group_size(), 1), indices_strategy)) self.embeddinglookup.shard(((get_group_size(), 1), indices_strategy)) elif slice_mode == "table_column_slice" and is_auto_parallel: if target == 'DEVICE': indices_shape_size = 1 self.gather_revert.shard(((1, get_group_size()), (1,))) self.forward_unique = True indices_strategy = (1,)*indices_shape_size self.gatherv2.shard(((1, get_group_size()), indices_strategy)) self.embeddinglookup.shard(((1, get_group_size()), indices_strategy)) elif slice_mode == "batch_slice" and is_auto_parallel: indices_strategy = [get_group_size()] indices_strategy.extend([1]*(indices_shape_size - 1)) indices_strategy = tuple(indices_strategy) self.gatherv2.shard(((1, 1), indices_strategy)) self.embeddinglookup.shard(((1, 1), indices_strategy)) else: if is_auto_parallel: raise ValueError("slice_mode should support mode in nn.EmbeddingLookup, but get " + str(slice_mode)) self.embedding_table.unique = self.forward_unique self.max_norm = max_norm if self.max_norm is not None: self.max_norm = validator.check_positive_float(self.max_norm, 'max_norm', self.cls_name) self.max_norm = Tensor(self.max_norm, dtype=mstype.float32)
def __init__(self, q_tensor_width, k_tensor_width, v_tensor_width, hidden_width, out_tensor_width, num_attention_heads=1, query_act=None, key_act=None, value_act=None, out_act=None, has_attention_mask=True, attention_probs_dropout_prob=0.0, use_one_hot_embeddings=False, initializer_range=0.02, do_return_2d_tensor=False, compute_type=mstype.float16, same_dim=True): super(MultiheadAttention, self).__init__() self.num_attention_heads = num_attention_heads self.size_per_head = int(hidden_width / num_attention_heads) self.has_attention_mask = has_attention_mask self.use_one_hot_embeddings = use_one_hot_embeddings self.initializer_range = initializer_range self.do_return_2d_tensor = do_return_2d_tensor self.same_dim = same_dim self.scores_mul = Tensor( [1.0 / math.sqrt(float(self.size_per_head))], dtype=compute_type) self.reshape = P.Reshape() self.shape_q_2d = (-1, q_tensor_width) self.shape_k_2d = (-1, k_tensor_width) self.shape_v_2d = (-1, v_tensor_width) self.hidden_width = int(hidden_width) if self.same_dim: self.in_proj_layer = Parameter(Tensor(np.random.rand(hidden_width * 3, q_tensor_width), dtype=mstype.float32), name="weight") else: self.query_layer = nn.Dense(q_tensor_width, hidden_width, activation=query_act, has_bias=False).to_float(compute_type) self.key_layer = nn.Dense(k_tensor_width, hidden_width, activation=key_act, has_bias=False).to_float(compute_type) self.value_layer = nn.Dense(q_tensor_width, hidden_width, activation=value_act, has_bias=False).to_float(compute_type) self.out_proj = nn.Dense(hidden_width, out_tensor_width, activation=out_act, has_bias=False).to_float(compute_type) self.matmul_trans_b = P.BatchMatMul(transpose_b=True) self.multiply = P.Mul() self.transpose = P.Transpose() self.trans_shape = (0, 2, 1, 3) self.trans_shape_relative = (2, 0, 1, 3) self.trans_shape_position = (1, 2, 0, 3) self.multiply_data = Tensor([-10000.0,], dtype=compute_type) self.matmul = P.BatchMatMul() self.softmax = nn.Softmax() self.dropout = nn.Dropout(1. - attention_probs_dropout_prob) self.use_dropout = attention_probs_dropout_prob > 0 if self.has_attention_mask: self.expand_dims = P.ExpandDims() self.sub = P.Sub() self.add = P.TensorAdd() self.cast = P.Cast() self.get_dtype = P.DType() self.softmax_cast = P.Cast() self.matmul_dense = P.MatMul(transpose_b=True) self.split = P.Split(0, 3) self.equal = P.Equal() self.shape = P.Shape()
def construct(self): weights = self.weights loss = self.network() sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens) grads = self.grad(self.network, weights)(sens) return F.depend(loss, self.optimizer(grads))
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, data_format='NCHW', has_bias=False, weight_init='normal', damping=0.03, loss_scale=1, frequency=278, batch_size=32, bias_init='zeros'): self.thor = True self.hw = kernel_size * kernel_size kernel_size = twice(kernel_size) super(Conv2d_Thor_GPU, self).__init__( in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, data_format, has_bias, weight_init, bias_init, ) self.conv2d = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, mode=1, pad_mode=self.pad_mode, pad=self.padding, stride=self.stride, dilation=self.dilation, group=self.group) self.matrix_A_dim = self.in_channels * self.kernel_size[ 0] * self.kernel_size[1] self.matrix_G_dim = self.out_channels split_dim = 128 matrix_A_shape, matrix_G_shape = caculate_matmul_shape( self.matrix_A_dim, self.matrix_G_dim, split_dim) self.matrix_A_inv = Parameter(np.zeros(matrix_A_shape).astype( np.float32), requires_grad=False) self.matrix_G_inv = Parameter(np.zeros(matrix_G_shape).astype( np.float32), requires_grad=False) self.broadcast_to = P.BroadcastTo(matrix_A_shape) self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) self.img2col = P.Im2Col(kernel_size=kernel_size, stride=stride, pad_mode="same") self.matmul = P.MatMul(transpose_b=True) self.shape = P.Shape() self.reshape = P.Reshape() self.mul = P.Mul() self.getG = P.InsertGradientOf(self.save_gradient) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.batch_size = Tensor(batch_size, mstype.float16) self.transpose = P.Transpose() self.cast = P.Cast() self.gather = P.GatherV2() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.sqrt = P.Sqrt() self.reduce_mean = P.ReduceMean(keep_dims=False) self.damping = Parameter(Tensor(damping), requires_grad=False) self.dampingA = Tensor(np.identity(self.matrix_A_dim), mstype.float32) self.dampingG = Tensor(np.identity(self.matrix_G_dim), mstype.float32) self.cholesky = P.CholeskyTrsm(split_dim=split_dim) self.vector_matmul = P.BatchMatMul(transpose_a=True)
def __init__(self, config): super(DeepFMModel, self).__init__() self.batch_size = config.batch_size self.field_size = config.data_field_size self.vocab_size = config.data_vocab_size self.emb_dim = config.data_emb_dim self.deep_layer_dims_list, self.deep_layer_act = config.deep_layer_args self.init_args = config.init_args self.weight_bias_init = config.weight_bias_init self.keep_prob = config.keep_prob init_acts = [('W_l2', [self.vocab_size, 1], 'normal'), ('V_l2', [self.vocab_size, self.emb_dim], 'normal')] var_map = init_var_dict(self.init_args, init_acts) self.fm_w = var_map["W_l2"] self.embedding_table = var_map["V_l2"] " Deep Layers " self.deep_input_dims = self.field_size * self.emb_dim self.all_dim_list = [self.deep_input_dims ] + self.deep_layer_dims_list + [1] self.dense_layer_1 = DenseLayer(self.all_dim_list[0], self.all_dim_list[1], self.weight_bias_init, self.deep_layer_act, self.keep_prob, convert_dtype=True) self.dense_layer_2 = DenseLayer(self.all_dim_list[1], self.all_dim_list[2], self.weight_bias_init, self.deep_layer_act, self.keep_prob, convert_dtype=True) self.dense_layer_3 = DenseLayer(self.all_dim_list[2], self.all_dim_list[3], self.weight_bias_init, self.deep_layer_act, self.keep_prob, convert_dtype=True) self.dense_layer_4 = DenseLayer(self.all_dim_list[3], self.all_dim_list[4], self.weight_bias_init, self.deep_layer_act, self.keep_prob, convert_dtype=True) self.dense_layer_5 = DenseLayer(self.all_dim_list[4], self.all_dim_list[5], self.weight_bias_init, self.deep_layer_act, self.keep_prob, convert_dtype=True, use_act=False) " FM, linear Layers " self.Gatherv2 = P.GatherV2() self.Mul = P.Mul() self.ReduceSum = P.ReduceSum(keep_dims=False) self.Reshape = P.Reshape() self.Square = P.Square() self.Shape = P.Shape() self.Tile = P.Tile() self.Concat = P.Concat(axis=1) self.Cast = P.Cast()
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, data_format='NCHW', has_bias=False, weight_init='normal', damping=0.03, loss_scale=1, frequency=278, batch_size=32, bias_init='zeros'): self.thor = True ksizes = (1, kernel_size, kernel_size, 1) self.hw = kernel_size * kernel_size strides = (1, stride, stride, 1) kernel_size = twice(kernel_size) super(Conv2d_Thor, self).__init__( in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, data_format, has_bias, weight_init, bias_init, ) self.conv2d = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, mode=1, pad_mode=self.pad_mode, pad=self.padding, stride=self.stride, dilation=self.dilation, group=self.group) self.batch_size = batch_size self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.transpose02314 = P.CusTranspose02314() self.matrix_A_dim = self.in_channels * self.kernel_size[ 0] * self.kernel_size[1] self.matrix_G_dim = self.out_channels self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape( self.matrix_A_dim, self.in_channels, True) self.matrix_G_device_shape, self.matrix_G_device_dim = caculate_device_shape( self.matrix_G_dim, self.in_channels, False) self.matrix_A_device_temp_shape = (self.matrix_A_device_shape[0], self.matrix_A_device_shape[2], self.matrix_A_device_shape[1], self.matrix_A_device_shape[3]) self.matrix_G_device_temp_shape = (self.matrix_G_device_shape[0], self.matrix_G_device_shape[2], self.matrix_G_device_shape[1], self.matrix_G_device_shape[3]) self.matrix_A_inv = Parameter(Tensor( np.reshape( np.identity(self.matrix_A_device_dim).astype(np.float16), self.matrix_A_device_shape)), requires_grad=False) self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.reshape( np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)), requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), requires_grad=False) self.fake_G = Tensor( np.reshape( np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)) self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) self.vector_matmul = P.CusBatchMatMul() self.diag_block_dim = 128 self.channels_slice_flag = False if self.in_channels % C0 != 0: self.channels_slice_flag = True self.padA_flag = False if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \ and self.matrix_A_dim > self.diag_block_dim: self.padA_flag = True pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim self.padA = P.Pad(((0, pad_dim), (0, pad_dim))) self.device_shape_pad_flag = False if self.matrix_A_dim != self.matrix_A_device_dim: self.device_shape_pad_flag = True self.device_shape_pad = P.Pad(((0, 0), (0, C0 - self.in_channels), (0, 0), (0, C0 - self.in_channels))) self.slice = P.Slice() self.gather = P.GatherV2() self.freq = Tensor(frequency, mstype.int32) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.axis = 0 dampingA_dim = self.matrix_A_dim if (self.matrix_A_dim % self.diag_block_dim ) != 0 and self.matrix_A_dim > self.diag_block_dim: dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim dampingG_dim = self.matrix_G_dim if (self.matrix_G_dim % self.diag_block_dim ) != 0 and self.matrix_G_dim > self.diag_block_dim: dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32) self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32) self.fused_abs_max1 = P.CusFusedAbsMax1( [self.matrix_A_dim, self.matrix_A_dim]) self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient)
def __init__(self, config): super(WideDeepModel, self).__init__() emb_128_size = 650000 emb64_single_size = 17300 emb64_multi_size = 20900 indicator_size = 16 deep_dim_list = [1024, 1024, 1024, 1024, 1024] # deep_dropout=0.0 wide_reg_coef = [0.0, 0.0] deep_reg_coef = [0.0, 0.0] wide_lr = 0.2 deep_lr = 1.0 self.input_emb_dim = config.input_emb_dim self.batch_size = config.batch_size self.deep_layer_act = config.deep_layers_act self.init_args = config.init_args self.weight_init, self.bias_init = config.weight_bias_init self.weight_bias_init = config.weight_bias_init self.emb_init = config.emb_init self.keep_prob = config.keep_prob self.layer_dims = deep_dim_list + [1] self.all_dim_list = [self.input_emb_dim] + self.layer_dims self.continue_field_size = 32 self.emb_128_size = emb_128_size self.emb64_single_size = emb64_single_size self.emb64_multi_size = emb64_multi_size self.indicator_size = indicator_size self.wide_l1_coef, self.wide_l2_coef = wide_reg_coef self.deep_l1_coef, self.deep_l2_coef = deep_reg_coef self.wide_lr = wide_lr self.deep_lr = deep_lr init_acts_embedding_metrix = [ ('emb128_embedding', [self.emb_128_size, 128], self.emb_init), ('emb64_single', [self.emb64_single_size, 64], self.emb_init), ('emb64_multi', [self.emb64_multi_size, 64], self.emb_init), ('emb64_indicator', [self.indicator_size, 64], self.emb_init) ] var_map = init_var_dict(self.init_args, init_acts_embedding_metrix) self.emb128_embedding = var_map["emb128_embedding"] self.emb64_single = var_map["emb64_single"] self.emb64_multi = var_map["emb64_multi"] self.emb64_indicator = var_map["emb64_indicator"] init_acts_wide_weight = [ ('wide_continue_w', [self.continue_field_size], self.emb_init), ('wide_emb128_w', [self.emb_128_size], self.emb_init), ('wide_emb64_single_w', [self.emb64_single_size], self.emb_init), ('wide_emb64_multi_w', [self.emb64_multi_size], self.emb_init), ('wide_indicator_w', [self.indicator_size], self.emb_init), ('wide_bias', [1], self.emb_init) ] var_map = init_var_dict(self.init_args, init_acts_wide_weight) self.wide_continue_w = var_map["wide_continue_w"] self.wide_emb128_w = var_map["wide_emb128_w"] self.wide_emb64_single_w = var_map["wide_emb64_single_w"] self.wide_emb64_multi_w = var_map["wide_emb64_multi_w"] self.wide_indicator_w = var_map["wide_indicator_w"] self.wide_bias = var_map["wide_bias"] self.dense_layer_1 = DenseLayer(self.all_dim_list[0], self.all_dim_list[1], self.weight_bias_init, self.deep_layer_act, drop_out=config.dropout_flag, convert_dtype=True) self.dense_layer_2 = DenseLayer(self.all_dim_list[1], self.all_dim_list[2], self.weight_bias_init, self.deep_layer_act, drop_out=config.dropout_flag, convert_dtype=True) self.dense_layer_3 = DenseLayer(self.all_dim_list[2], self.all_dim_list[3], self.weight_bias_init, self.deep_layer_act, drop_out=config.dropout_flag, convert_dtype=True) self.dense_layer_4 = DenseLayer(self.all_dim_list[3], self.all_dim_list[4], self.weight_bias_init, self.deep_layer_act, drop_out=config.dropout_flag, convert_dtype=True) self.dense_layer_5 = DenseLayer(self.all_dim_list[4], self.all_dim_list[5], self.weight_bias_init, self.deep_layer_act, drop_out=config.dropout_flag, convert_dtype=True) self.deep_predict = DenseLayer(self.all_dim_list[5], self.all_dim_list[6], self.weight_bias_init, self.deep_layer_act, drop_out=config.dropout_flag, convert_dtype=True, use_activation=False) self.gather_v2 = P.GatherV2() self.mul = P.Mul() self.reduce_sum_false = P.ReduceSum(keep_dims=False) self.reduce_sum_true = P.ReduceSum(keep_dims=True) self.reshape = P.Reshape() self.square = P.Square() self.shape = P.Shape() self.tile = P.Tile() self.concat = P.Concat(axis=1) self.cast = P.Cast() self.reduceMean_false = P.ReduceMean(keep_dims=False) self.Concat = P.Concat(axis=1) self.BiasAdd = P.BiasAdd() self.expand_dims = P.ExpandDims() self.flatten = Flatten()
def __init__(self, num_features, eps=1e-5, momentum=0.9, affine=True, gamma_init='ones', beta_init='zeros', moving_mean_init='zeros', moving_var_init='ones', use_batch_statistics=None, device_num_each_group=1, input_dims='2d', data_format='NCHW'): super(_BatchNorm, self).__init__() validator.check_value_type('num_features', num_features, [int], self.cls_name) if num_features < 1: raise ValueError("num_features must be at least 1") if momentum < 0 or momentum > 1: raise ValueError( "momentum should be a number in range [0, 1], but got {}". format(momentum)) self.format = validator.check_string(data_format, ['NCHW', 'NHWC'], 'format', self.cls_name) if context.get_context( "device_target") != "GPU" and self.format == "NHWC": raise ValueError("NHWC format only support in GPU target.") self.use_batch_statistics = use_batch_statistics self.num_features = num_features self.eps = eps self.input_dims = input_dims self.moving_mean = Parameter(initializer(moving_mean_init, num_features), name="mean", requires_grad=False) self.moving_variance = Parameter(initializer(moving_var_init, num_features), name="variance", requires_grad=False) self.gamma = Parameter(initializer(gamma_init, num_features), name="gamma", requires_grad=affine) self.beta = Parameter(initializer(beta_init, num_features), name="beta", requires_grad=affine) self.group = validator.check_positive_int(device_num_each_group) self.is_global = False if self.group != 1: self.rank_id = get_rank() self.rank_size = get_group_size() self.device_list = [i for i in range(0, self.rank_size)] self.rank_list = self.list_group(self.device_list, self.group) self.rank_list_idx = len(self.rank_list) for i in range(self.rank_list_idx): if self.rank_id in self.rank_list[i] and self.group != 1: self.is_global = True management.create_group('group' + str(i), self.rank_list[i]) self.all_reduce = P.AllReduce( P.ReduceOp.SUM, 'group' + str(i)).add_prim_attr('fusion', 1) self.shape = P.Shape() self.reduce_mean = P.ReduceMean(keep_dims=True) self.square = P.Square() self.sqrt = P.Sqrt() self.cast = P.Cast() self.dtype = P.DType() self.reshape = P.Reshape() self._target = context.get_context("device_target") self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE self.momentum = 1.0 - momentum if context.get_context("enable_ge"): self.is_ge_backend = True else: self.is_ge_backend = False if self._target == "Ascend": self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps, momentum=self.momentum) if self._target == "GPU": self.bn_train = P.FusedBatchNormEx(mode=1, epsilon=self.eps, momentum=self.momentum, data_format=self.format) if self._target == "CPU": self.bn_train = P.FusedBatchNorm(mode=1, epsilon=self.eps, momentum=self.momentum) self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps, data_format=self.format) self.enable_global_sync = self.is_global and (self.is_ge_backend or\ (self.is_graph_mode and self._target == "Ascend")) data_parallel_strategy = ((1, ), (1, )) data_parallel_strategy_one = ((1, ), ()) self.sub_mean = P.Sub().shard(data_parallel_strategy) self.sub_var = P.Sub().shard(data_parallel_strategy) self.mul_mean = P.Mul().shard(data_parallel_strategy_one) self.mul_var = P.Mul().shard(data_parallel_strategy_one) self.assign_sub_mean = P.AssignSub().shard(data_parallel_strategy) self.assign_sub_var = P.AssignSub().shard(data_parallel_strategy)
def __init__(self, num_classes, is_training=True, stem_filters=32, penultimate_filters=1056, filters_multiplier=2): super(NASNetAMobile, self).__init__() self.is_training = is_training self.stem_filters = stem_filters self.penultimate_filters = penultimate_filters self.filters_multiplier = filters_multiplier filters = self.penultimate_filters//24 # 24 is default value for the architecture self.conv0 = nn.SequentialCell([ nn.Conv2d(in_channels=3, out_channels=self.stem_filters, kernel_size=3, stride=2, pad_mode='pad', padding=0, has_bias=False), nn.BatchNorm2d(num_features=self.stem_filters, eps=0.001, momentum=0.9, affine=True) ]) self.cell_stem_0 = CellStem0( self.stem_filters, num_filters=filters//(filters_multiplier**2) ) self.cell_stem_1 = CellStem1( self.stem_filters, num_filters=filters//filters_multiplier ) self.cell_0 = FirstCell( in_channels_left=filters, out_channels_left=filters//2, # 1, 0.5 in_channels_right=2*filters, out_channels_right=filters ) # 2, 1 self.cell_1 = NormalCell( in_channels_left=2*filters, out_channels_left=filters, # 2, 1 in_channels_right=6*filters, out_channels_right=filters ) # 6, 1 self.cell_2 = NormalCell( in_channels_left=6*filters, out_channels_left=filters, # 6, 1 in_channels_right=6*filters, out_channels_right=filters ) # 6, 1 self.cell_3 = NormalCell( in_channels_left=6*filters, out_channels_left=filters, # 6, 1 in_channels_right=6*filters, out_channels_right=filters ) # 6, 1 self.reduction_cell_0 = ReductionCell0( in_channels_left=6*filters, out_channels_left=2*filters, # 6, 2 in_channels_right=6*filters, out_channels_right=2*filters ) # 6, 2 self.cell_6 = FirstCell( in_channels_left=6*filters, out_channels_left=filters, # 6, 1 in_channels_right=8*filters, out_channels_right=2*filters ) # 8, 2 self.cell_7 = NormalCell( in_channels_left=8*filters, out_channels_left=2*filters, # 8, 2 in_channels_right=12*filters, out_channels_right=2*filters ) # 12, 2 self.cell_8 = NormalCell( in_channels_left=12*filters, out_channels_left=2*filters, # 12, 2 in_channels_right=12*filters, out_channels_right=2*filters ) # 12, 2 self.cell_9 = NormalCell( in_channels_left=12*filters, out_channels_left=2*filters, # 12, 2 in_channels_right=12*filters, out_channels_right=2*filters ) # 12, 2 if is_training: self.aux_logits = AuxLogits(in_channels=12*filters, out_channels=num_classes) self.reduction_cell_1 = ReductionCell1( in_channels_left=12*filters, out_channels_left=4*filters, # 12, 4 in_channels_right=12*filters, out_channels_right=4*filters ) # 12, 4 self.cell_12 = FirstCell( in_channels_left=12*filters, out_channels_left=2*filters, # 12, 2 in_channels_right=16*filters, out_channels_right=4*filters ) # 16, 4 self.cell_13 = NormalCell( in_channels_left=16*filters, out_channels_left=4*filters, # 16, 4 in_channels_right=24*filters, out_channels_right=4*filters ) # 24, 4 self.cell_14 = NormalCell( in_channels_left=24*filters, out_channels_left=4*filters, # 24, 4 in_channels_right=24*filters, out_channels_right=4*filters ) # 24, 4 self.cell_15 = NormalCell( in_channels_left=24*filters, out_channels_left=4*filters, # 24, 4 in_channels_right=24*filters, out_channels_right=4*filters ) # 24, 4 self.relu = nn.ReLU() self.dropout = nn.Dropout(keep_prob=0.5) self.classifier = nn.Dense(in_channels=24*filters, out_channels=num_classes) self.shape = P.Shape() self.reshape = P.Reshape() self.avg_pool = nn.AvgPool2d(kernel_size=7, stride=1) self._initialize_weights()