def __init__(self, optimizer, epsilon=1e-05, coefficient=0.001, use_clip=False,
             lars_filter=lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name):
    super(LARS, self).__init__(0.0, [Parameter(Tensor(0.0), name="fake_param")])
    _check_param_value(optimizer, epsilon, coefficient, use_clip, self.cls_name)
    self.opt = optimizer
    self.lars = P.LARSUpdate(epsilon, coefficient, use_clip)
    self.cast = P.Cast()
    self.parameters = optimizer.parameters
    if use_clip is True:
        self.learning_rate = optimizer.learning_rate
        self.dynamic_lr = optimizer.dynamic_lr
        self.gather = optimizer.gather
        self.assignadd = optimizer.assignadd
        self.global_step = optimizer.global_step
    else:
        self.learning_rate = Parameter(Tensor(0.0, dtype=mstype.float32), name="fake_lr")
    self.reciprocal_scale = optimizer.reciprocal_scale
    optimizer.reciprocal_scale = 1.0
    self.is_group = optimizer.is_group
    if self.is_group:
        self.weight_decay = tuple(map(lambda x: x / optimizer.loss_scale, optimizer.weight_decay))
    else:
        self.weight_decay = optimizer.weight_decay / optimizer.loss_scale
    optimizer.exec_weight_decay = False
    optimizer.weight_decay = 0.0
    self.decay_flags = optimizer.decay_flags
    self.lars_flag = tuple(lars_filter(x) for x in self.parameters)
    self.hyper_map = C.HyperMap()
def __init__(self, optimizer, epsilon=1e-05, coefficient=0.001, use_clip=False,
             lars_filter=lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name):
    super(LARS, self).__init__(0.0, [Parameter(Tensor(0.0), name="fake_param")])
    _check_param_value(optimizer, epsilon, coefficient, use_clip, self.cls_name)
    self.opt = optimizer
    self.parameters = optimizer.parameters
    self.use_clip = use_clip
    self.lars_flag = tuple(lars_filter(x) for x in self.parameters)
    self.is_group = optimizer.is_group
    self.learning_rate = Parameter(Tensor(0.0, dtype=mstype.float32), name="fake_lr")
    self.decay_flags = optimizer.decay_flags
    self.reciprocal_scale = optimizer.reciprocal_scale
    self.hyper_map = C.HyperMap()
    self.lars = P.LARSUpdate(epsilon, coefficient, use_clip)
    self.cast = P.Cast()

    if use_clip:
        self.is_group_lr = optimizer.is_group_lr
        self.dynamic_lr = optimizer.dynamic_lr
        self.origin_learning_rate = optimizer.learning_rate
        self.global_step = optimizer.global_step
        if self.is_group_lr and self.dynamic_lr:
            raise ValueError('Grouped dynamic learning rate is currently not supported '
                             'for the inputs optimizer of lars.')

    if self.is_group:
        self.weight_decay = tuple(map(lambda x: x / optimizer.loss_scale, optimizer.weight_decay))
        optimizer.weight_decay = tuple(map(lambda x: 0.0, optimizer.weight_decay))
    else:
        self.weight_decay = optimizer.weight_decay / optimizer.loss_scale
        optimizer.weight_decay = 0.0

    optimizer.decay_flags = tuple(map(lambda x: False, self.decay_flags))
    optimizer.reciprocal_scale = 1.0
    optimizer.exec_weight_decay = False
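# Illustrative usage sketch (not part of the source): wrapping a standard
# Momentum optimizer with LARS, mirroring the public mindspore.nn.LARS API;
# the network, loss and hyperparameters are placeholder assumptions.
import mindspore.nn as nn

def example_lars_usage():
    net = nn.Dense(16, 10)
    base_opt = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    opt = nn.LARS(base_opt, epsilon=1e-05, coefficient=0.001)
    loss = nn.SoftmaxCrossEntropyWithLogits()
    return nn.TrainOneStepCell(nn.WithLossCell(net, loss), opt)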
def test_bprop_first_only():
    grads = bprop(Net(),
                  Tensor(np.ones([2, 3]).astype(np.float32)),
                  Tensor(np.ones([3, 2]).astype(np.float32)),
                  grads_wrt_outputs=(Tensor(np.ones([2, 3]).astype(np.float32)),
                                     Tensor(np.ones([2, 2]).astype(np.float32))))
    print(grads)


def test_bprop_wrt_inputs_and_params():
    net = Net()
    grads = bprop(net,
                  Tensor(np.ones([2, 3]).astype(np.float32)),
                  Tensor(np.ones([3, 2]).astype(np.float32)),
                  grads_wrt_outputs=(Tensor(np.ones([2, 3]).astype(np.float32)),
                                     Tensor(np.ones([2, 2]).astype(np.float32))),
                  wrt=['inputs', 'params'],
                  params=net.trainable_params())
    print(grads)


def test_bprop_wrt_params_no_sens():
    net = Net()
    grads = bprop(net,
                  Tensor(np.ones([2, 3]).astype(np.float32)),
                  Tensor(np.ones([3, 2]).astype(np.float32)),
                  wrt=['params'],
                  params=net.trainable_params())
    print(grads)


def test_bprop_sens():
    grads = bprop(Net(),
                  Tensor(np.ones([2, 3]).astype(np.float32)),
                  Tensor(np.ones([3, 2]).astype(np.float32)),
                  grads_wrt_outputs=(Tensor(np.ones([2, 3]).astype(np.float32)),
                                     Tensor(np.ones([2, 2]).astype(np.float32))),
                  wrt=['inputs'])
    print(grads)
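# A minimal sketch (reconstructed assumption, not the original) of the
# two-output `Net` the bprop tests above exercise: it consumes a (2, 3) and a
# (3, 2) tensor and returns a (2, 3)- and a (2, 2)-shaped output, matching the
# sensitivities passed via `grads_wrt_outputs`.
import numpy as np
import mindspore.nn as nn
import mindspore.ops.operations as P
from mindspore import Tensor, Parameter

class ExampleBpropNet(nn.Cell):
    def __init__(self):
        super(ExampleBpropNet, self).__init__()
        self.matmul = P.MatMul()
        self.w = Parameter(Tensor(np.ones([2, 3]).astype(np.float32)), name='w')

    def construct(self, x, y):
        # (2, 3) elementwise product, and a (2, 3) @ (3, 2) -> (2, 2) matmul.
        return x * self.w, self.matmul(x, y)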
def test_create_primitive_object_on_construct_use_kwargs():
    """ test_create_primitive_object_on_construct_use_kwargs """
    log.debug("begin test_create_primitive_object_on_construct_use_kwargs")
    context.set_context(mode=context.GRAPH_MODE)
    x = Tensor(np.array([[0, 1], [2, 1]]).astype(np.float32))
    y = Tensor(np.array([[0, 1], [2, 1]]).astype(np.float32))
    net = NetD()
    net(x, y)
    log.debug("finished test_create_primitive_object_on_construct_use_kwargs")


def test_create_primitive_object_on_construct():
    """ test_create_primitive_object_on_construct """
    log.debug("begin test_create_object_on_construct")
    x = Tensor(np.array([[1, 2, 3], [1, 2, 3]], np.float32))
    y = Tensor(np.array([[2, 3, 4], [1, 1, 2]], np.float32))
    net = Net1()
    net.construct(x, y)
    log.debug("finished test_create_object_on_construct")
def __init__(self, params, accum=0.1, learning_rate=0.001, l1=0.0, l2=0.0,
             use_locking=False, loss_scale=1.0, weight_decay=0.0):
    super(ProximalAdagrad, self).__init__(learning_rate, params, weight_decay, loss_scale)
    _check_param_value(accum, l1, l2, use_locking, self.cls_name)
    self.accum = self.parameters.clone(prefix="accum", init=accum)
    self.l1 = Tensor(l1, mstype.float32)
    self.l2 = Tensor(l2, mstype.float32)
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyProximalAdagrad(use_locking=use_locking)
    self.sparse_opt = P.FusedSparseProximalAdagrad(use_locking=use_locking)


def __init__(self, params, accum=0.1, learning_rate=0.001, l1=0.0, l2=0.0,
             use_locking=False, loss_scale=1.0, weight_decay=0.0):
    super(ProximalAdagrad, self).__init__(learning_rate, params, weight_decay, loss_scale)
    if self.is_group:
        raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
    _check_param_value(accum, l1, l2, use_locking, self.cls_name)
    self.accum = self.parameters.clone(prefix="accum", init=accum)
    self.l1 = Tensor(l1, mstype.float32)
    self.l2 = Tensor(l2, mstype.float32)
    self.weight_decay = weight_decay
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyProximalAdagrad(use_locking=use_locking)
    self.sparse_opt = inner.SparseApplyProximalAdagradNoReturn(use_locking=use_locking)
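# Hedged usage sketch; the dense layer and hyperparameters are illustrative.
# Note the second version above rejects grouped parameters outright.
import mindspore.nn as nn

def example_proximal_adagrad_usage():
    net = nn.Dense(16, 10)
    return ProximalAdagrad(net.trainable_params(), accum=0.1,
                           learning_rate=0.001, l1=1e-4, l2=1e-4)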
def __init__(self, rgb_range, rgb_mean=(0.4488, 0.4371, 0.4040),
             rgb_std=(1.0, 1.0, 1.0), sign=-1):
    super(MeanShift, self).__init__(3, 3, kernel_size=1)
    self.reshape = P.Reshape()
    self.eye = P.Eye()
    std = Tensor(rgb_std, mstype.float32)
    self.weight.set_data(
        self.reshape(self.eye(3, 3, mstype.float32), (3, 3, 1, 1)) / self.reshape(std, (3, 1, 1, 1)))
    self.weight.requires_grad = False
    self.bias = Parameter(
        sign * rgb_range * Tensor(rgb_mean, mstype.float32) / std,
        name='bias', requires_grad=False)
    self.has_bias = True
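# Usage sketch (EDSR-style assumption): MeanShift is a frozen 1x1 conv, so
# sign=-1 subtracts the scaled RGB mean on the way in and sign=1 adds it back
# on the way out; rgb_range=255 is illustrative.
def example_mean_shift_usage():
    sub_mean = MeanShift(255, sign=-1)   # normalize network input
    add_mean = MeanShift(255, sign=1)    # denormalize network output
    return sub_mean, add_mean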
def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5,
             l1=0.0, l2=0.0, use_locking=False, loss_scale=1.0, weight_decay=0.0):
    super(FTRL, self).__init__(learning_rate, params)
    _check_param(initial_accum, learning_rate, lr_power, l1, l2, use_locking,
                 loss_scale, weight_decay, self.cls_name)
    self.moments = self.parameters.clone(prefix="moments", init=initial_accum)
    self.linear = self.parameters.clone(prefix="linear", init='zeros')
    self.l1 = l1
    self.l2 = l2
    self.lr_power = lr_power
    self.reciprocal_scale = 1.0 / loss_scale
    self.weight_decay = weight_decay
    self.decay_tf = tuple(True for _ in self.parameters)
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyFtrl(use_locking=use_locking)
    self.one = Tensor(1, mstype.int32)
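# Usage sketch with illustrative FTRL hyperparameters; note that in this
# version `decay_tf` is all-True, so weight decay applies to every parameter.
def example_ftrl_usage(net):
    return FTRL(net.trainable_params(), initial_accum=0.1, learning_rate=0.001,
                lr_power=-0.5, l1=0.0, l2=0.0, weight_decay=1e-4)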
def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0,
             decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
    super(Momentum, self).__init__(learning_rate, params)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    if isinstance(learning_rate, Iterable) or \
            (isinstance(learning_rate, Tensor) and learning_rate.dim() == 1):
        self.dynamic_lr = True
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mstype.int32), name="global_step")
        self.axis = 0
    else:
        self.dynamic_lr = False
        self.gather = None
        self.assignadd = None
        self.global_step = None
        self.axis = None
    self.momentum = Parameter(momentum, name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.decay_tf = tuple(decay_filter(x) for x in self.parameters)
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.weight_decay = weight_decay * loss_scale
    self.reciprocal_scale = 1.0 / loss_scale
    self.one = Tensor(1, mstype.int32)
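# Usage sketch for this (older) Momentum signature, which applies weight decay
# via `decay_filter` instead of parameter groups; all values are illustrative.
def example_momentum_usage(net):
    return Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9,
                    weight_decay=1e-4, loss_scale=1.0)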
def __init__(self, optimizer, epsilon=1e-05, hyperpara=0.001, weight_decay=0.0, use_clip=False,
             decay_filter=lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name,
             lars_filter=lambda x: 'LayerNorm' not in x.name and 'bias' not in x.name,
             loss_scale=1.0):
    super(LARS, self).__init__(0.0, [Parameter(Tensor(0.0), name="trivial")])
    self.opt = optimizer
    self.parameters = optimizer.parameters
    self.learning_rate = optimizer.learning_rate
    self.lars = P.LARSUpdate(epsilon, hyperpara, use_clip)
    self.reciprocal_scale = 1.0 / loss_scale
    self.weight_decay = weight_decay * loss_scale
    self.cast = P.Cast()
    self.decay_flag = tuple(decay_filter(x) for x in self.parameters)
    self.lars_flag = tuple(lars_filter(x) for x in self.parameters)
    self.hyper_map = C.HyperMap()
    self.dynamic_lr = False
    self.gather = None
    self.global_step = None
    self.axis = None
    if isinstance(self.learning_rate.default_input, Iterable) or \
            (isinstance(self.learning_rate.default_input, Tensor) and
             self.learning_rate.default_input.dim() == 1):
        self.dynamic_lr = True
        self.assignadd = P.AssignAdd()
        self.gather = P.GatherV2()
        self.global_step = Parameter(initializer(0, [1], mstype.int32), name="lars_global_step")
        self.axis = 0
def __init__(self, opt, train_parameter_groups=None, train_strategy=None):
    super(FreezeOpt, self).__init__()
    if not isinstance(opt, Optimizer):
        raise TypeError(f"The first arg 'opt' must be an Optimizer instance, but got {type(opt)}")
    if train_strategy is not None and train_parameter_groups is None:
        raise ValueError("When 'train_strategy' is specified, the value of 'train_parameter_groups' "
                         "must also be specified")
    opt_class = type(opt)
    opt_init_args = opt.init_args
    self.opts = []
    if train_parameter_groups is None:
        groups_num = 10
        step = 6
        parameters = opt.parameters
        para_groups = (parameters[(i * step):] for i in range(groups_num))
        self.opts = [opt_class(params=params, **opt_init_args) for params in para_groups]
    else:
        if not isinstance(train_parameter_groups, (tuple, list)):
            raise TypeError("The specified 'train_parameter_groups' should be a tuple or list")
        for params in train_parameter_groups:
            if not isinstance(params, (tuple, list)):
                raise TypeError("Each element of 'train_parameter_groups' should be a tuple or list "
                                "of Parameter")
            for para in params:
                if not isinstance(para, Parameter):
                    raise TypeError("The element of each group should be a Parameter")
            # generate one-to-one opt corresponding to the parameter group
            self.opts.append(opt_class(params=params, **opt_init_args))
    if isinstance(train_strategy, (tuple, list)):
        for ele in train_strategy:
            if not isinstance(ele, int):
                raise ValueError("The element in train_strategy should be an int")
        self.train_strategy = Tensor(train_strategy, mstype.int32)
    elif isinstance(train_strategy, Tensor):
        if train_strategy.ndim != 1 or train_strategy.dtype != mstype.int32:
            raise ValueError("When train_strategy is a Tensor, the dimension should be 1 and "
                             "the dtype should be int32")
        self.train_strategy = train_strategy
    elif train_strategy is None:
        self.train_strategy = None
    else:
        raise TypeError("The specified 'train_strategy' should be None, tuple, list or Tensor")
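# Illustrative sketch, assuming the wrapped optimizer exposes its constructor
# arguments as `init_args` (which FreezeOpt requires) and that `train_strategy`
# indexes into the per-group optimizers; the two-way split is a placeholder.
def example_freeze_opt_usage(net, base_opt):
    params = list(net.trainable_params())
    groups = [params[:len(params) // 2], params[len(params) // 2:]]
    return FreezeOpt(base_opt, train_parameter_groups=groups,
                     train_strategy=(0, 1, 1))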
def test_create_primitive_object_on_construct_use_args_and_kwargs():
    """ test_create_primitive_object_on_construct_use_args_and_kwargs """
    log.debug("begin test_create_primitive_object_on_construct_use_args_and_kwargs")
    context.set_context(mode=context.GRAPH_MODE)
    inputs = Tensor(np.ones([1, 16, 16, 16]).astype(np.float32))
    net = NetE()
    net(inputs)
    log.debug("finished test_create_primitive_object_on_construct_use_args_and_kwargs")
def __init__(self, max_position_embeddings, embedding_dim, seq_length):
    super(LearnedPositionalEncoding, self).__init__()
    self.pe = nn.Embedding(max_position_embeddings, embedding_dim)
    self.seq_length = seq_length
    self.position_ids = Tensor(np.arange(self.seq_length).astype(np.int32))
    self.reshape = P.Reshape()
    self.position_ids = self.reshape(self.position_ids, (1, self.seq_length))
def __init__(self, params, learning_rate=0.1, decay=0.9, momentum=0.0, epsilon=1e-10,
             use_locking=False, centered=False, loss_scale=1.0, weight_decay=0.0,
             decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
    super(RMSProp, self).__init__(learning_rate, params)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    if decay < 0.0:
        raise ValueError("decay should be at least 0.0, but got decay {}".format(decay))
    self.decay = decay
    self.epsilon = epsilon
    validator.check_type("use_locking", use_locking, [bool])
    validator.check_type("centered", centered, [bool])
    self.centered = centered
    if centered:
        self.opt = P.ApplyCenteredRMSProp(use_locking)
        self.mg = self.parameters.clone(prefix="mean_grad", init='zeros')
    else:
        self.opt = P.ApplyRMSProp(use_locking)
    self.dynamic_lr = False
    if not isinstance(learning_rate, float):
        self.dynamic_lr = True
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mstype.int32), name="global_step")
        self.axis = 0
        self.one = Tensor(1, mstype.int32)
    self.momentum = momentum
    self.ms = self.parameters.clone(prefix="mean_square", init='zeros')
    self.moment = self.parameters.clone(prefix="moment", init='zeros')
    self.hyper_map = C.HyperMap()
    self.decay_tf = tuple(decay_filter(x) for x in self.parameters)
    self.reciprocal_scale = 1.0 / loss_scale
    self.weight_decay = weight_decay * loss_scale
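# Usage sketch (hyperparameters illustrative): centered=True selects
# ApplyCenteredRMSProp, which additionally tracks the running mean gradient.
def example_rmsprop_usage(net):
    return RMSProp(net.trainable_params(), learning_rate=0.01, decay=0.9,
                   momentum=0.9, epsilon=1e-10, centered=True)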
def test_create_cell_object_on_construct():
    """ test_create_cell_object_on_construct """
    log.debug("begin test_create_object_on_construct")
    context.set_context(mode=context.GRAPH_MODE)
    np1 = np.random.randn(2, 3, 4, 5).astype(np.float32)
    input_me = Tensor(np1)
    net = Net()
    output = net(input_me)
    out_me1 = output.asnumpy()
    print(np1)
    print(out_me1)
    log.debug("finished test_create_object_on_construct")
def compute(self, x):
    """Overlap-add the unfolded blocks back onto a strided output canvas."""
    NumBlock_x = self.NumBlock_x
    NumBlock_y = self.NumBlock_y
    large_x = self.fold(x)
    large_x = large_x.asnumpy()
    N, C, _, _ = large_x.shape
    # Top-left corner of each block inside the densely folded input.
    leftup_idx_x = []
    leftup_idx_y = []
    for i in range(NumBlock_x):
        leftup_idx_x.append(i * self.kernel_size[0])
    for i in range(NumBlock_y):
        leftup_idx_y.append(i * self.kernel_size[1])
    fold_x = np.zeros((N, C, (NumBlock_x - 1) * self.stride + self.kernel_size[0],
                       (NumBlock_y - 1) * self.stride + self.kernel_size[1]), dtype=np.float32)
    for i in range(NumBlock_x):
        for j in range(NumBlock_y):
            fold_i = i * self.stride
            fold_j = j * self.stride
            org_i = leftup_idx_x[i]
            org_j = leftup_idx_y[j]
            fills = large_x[:, :, org_i:org_i + self.kernel_size[0],
                            org_j:org_j + self.kernel_size[1]]
            # Zero-pad `fills` out to the full canvas shape, then accumulate,
            # so overlapping pixels from adjacent blocks are summed.
            t2 = fold_x[:, :, :fold_i, fold_j:fold_j + self.kernel_size[1]]
            zeros2 = np.zeros(t2.shape)
            concat1 = np.concatenate((zeros2, fills), axis=2)
            t3 = fold_x[:, :, fold_i + self.kernel_size[0]:, fold_j:fold_j + self.kernel_size[1]]
            zeros3 = np.zeros(t3.shape)
            concat2 = np.concatenate((concat1, zeros3), axis=2)
            t1 = fold_x[:, :, :, :fold_j]
            zeros1 = np.zeros(t1.shape)
            concat3 = np.concatenate((zeros1, concat2), axis=3)
            t4 = fold_x[:, :, :, fold_j + self.kernel_size[1]:]
            zeros4 = np.zeros(t4.shape)
            concat4 = np.concatenate((concat3, zeros4), axis=3)
            fold_x += concat4
    y = Tensor(fold_x, mstype.float16)
    return y
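# Worked example (illustrative) of the overlap-add above: with
# kernel_size=(2, 2), stride=1 and NumBlock_x = NumBlock_y = 2, the output
# canvas is ((2 - 1) * 1 + 2, (2 - 1) * 1 + 2) = (3, 3), and pixels where
# adjacent blocks overlap are summed rather than overwritten.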
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', has_bias=True):
    super(CommonHeadLastFN, self).__init__()
    weight_shape = [out_channels, in_channels]
    self.weight = Parameter(initializer(weight_init, weight_shape), requires_grad=True, name='weight')
    self.x_norm = P.L2Normalize(axis=1)
    self.w_norm = P.L2Normalize(axis=1)
    self.fc = P.MatMul(transpose_a=False, transpose_b=True)
    self.multiplier = Parameter(Tensor(np.ones([1]), mstype.float32), requires_grad=True, name='multiplier')
    self.has_bias = has_bias
    if self.has_bias:
        bias_shape = [out_channels]
        self.bias_add = P.BiasAdd()
        self.bias = Parameter(initializer(bias_init, bias_shape), requires_grad=True, name='bias')
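# What this head computes, as a sketch: a cosine-similarity classifier.
# The forward pass is roughly multiplier * L2norm(x) @ L2norm(W)^T (+ bias),
# so logits are scaled cosines. The 2048 -> 1000 sizes are illustrative.
def example_cosine_head():
    return CommonHeadLastFN(in_channels=2048, out_channels=1000)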
def compute(self, x):
    """Rearrange strided patches onto a dense grid, then unfold."""
    x = x.asnumpy()
    N, C, H, W = x.shape
    # Top-left corner of every kernel-sized patch in the strided input.
    leftup_idx_x = []
    leftup_idx_y = []
    nh = (H - self.kernel_size) // self.stride + 1
    nw = (W - self.kernel_size) // self.stride + 1
    for i in range(nh):
        leftup_idx_x.append(i * self.stride)
    for i in range(nw):
        leftup_idx_y.append(i * self.stride)
    NumBlock_x = len(leftup_idx_x)
    NumBlock_y = len(leftup_idx_y)
    # Dense canvas where each patch occupies its own kernel-sized cell.
    unf_x = np.zeros((N, C, NumBlock_x * self.kernel_size, NumBlock_y * self.kernel_size),
                     dtype=np.float32)
    N, C, H, W = unf_x.shape
    for i in range(NumBlock_x):
        for j in range(NumBlock_y):
            unf_i = i * self.kernel_size
            unf_j = j * self.kernel_size
            org_i = leftup_idx_x[i]
            org_j = leftup_idx_y[j]
            fills = x[:, :, org_i:org_i + self.kernel_size, org_j:org_j + self.kernel_size]
            # Zero-pad `fills` to the full canvas shape, then accumulate.
            zeros2 = np.zeros(unf_x[:, :, :unf_i, unf_j:unf_j + self.kernel_size].shape)
            concat1 = np.concatenate((zeros2, fills), axis=2)
            zeros3 = np.zeros(unf_x[:, :, unf_i + self.kernel_size:, unf_j:unf_j + self.kernel_size].shape)
            concat2 = np.concatenate((concat1, zeros3), axis=2)
            zeros1 = np.zeros(unf_x[:, :, :, :unf_j].shape)
            concat3 = np.concatenate((zeros1, concat2), axis=3)
            zeros4 = np.zeros(unf_x[:, :, :, unf_j + self.kernel_size:].shape)
            concat4 = np.concatenate((concat3, zeros4), axis=3)
            unf_x += concat4
    unf_x = Tensor(unf_x, mstype.float16)
    y = self.unfold(unf_x)
    return y
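# Worked shape example (illustrative) for the stride-based unfold above,
# assuming kernel_size=2 and stride=2 on a (1, 1, 4, 4) input:
#   nh = (4 - 2) // 2 + 1 = 2, nw = 2
#   unf_x shape = (N, C, nh * kernel_size, nw * kernel_size) = (1, 1, 4, 4)
# i.e. the four non-overlapping 2x2 patches are copied onto a dense grid
# before `self.unfold` extracts them.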
def infer_value(self, x, dev_mat, tensor_map):
    from mindspore.parallel._tensor import _load_tensor
    validator.check_value_type("dev_mat", dev_mat, [tuple], self.name)
    validator.check_value_type("tensor_map", tensor_map, [tuple], self.name)
    return Tensor(_load_tensor(x, dev_mat, tensor_map))
def __init__(self):
    super(NetE, self).__init__()
    self.w = Parameter(Tensor(np.ones([16, 16, 3, 3]).astype(np.float32)), name='w')


def infer_value(self, x_value):
    return Tensor(np.arange(self.start, self.limit, self.delta), dtype=x_value.dtype)


def __init__(self, shape=None, mean=0.0, stddev=1.0, seed=0):
    super(Net, self).__init__()
    self._mean = Tensor(mean, mstype.float32)
    self._stddev = Tensor(stddev, mstype.float32)
    self._normal = P.Normal(seed=seed)
    self._shape = shape
def __init__(self,
             q_tensor_width,
             k_tensor_width,
             v_tensor_width,
             hidden_width,
             out_tensor_width,
             num_attention_heads=1,
             query_act=None,
             key_act=None,
             value_act=None,
             out_act=None,
             has_attention_mask=True,
             attention_probs_dropout_prob=0.0,
             use_one_hot_embeddings=False,
             initializer_range=0.02,
             do_return_2d_tensor=False,
             compute_type=mstype.float16,
             same_dim=True):
    super(MultiheadAttention, self).__init__()
    self.num_attention_heads = num_attention_heads
    self.size_per_head = int(hidden_width / num_attention_heads)
    self.has_attention_mask = has_attention_mask
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.initializer_range = initializer_range
    self.do_return_2d_tensor = do_return_2d_tensor
    self.same_dim = same_dim

    self.scores_mul = Tensor([1.0 / math.sqrt(float(self.size_per_head))], dtype=compute_type)
    self.reshape = P.Reshape()
    self.shape_q_2d = (-1, q_tensor_width)
    self.shape_k_2d = (-1, k_tensor_width)
    self.shape_v_2d = (-1, v_tensor_width)
    self.hidden_width = int(hidden_width)
    if self.same_dim:
        self.in_proj_layer = Parameter(Tensor(np.random.rand(hidden_width * 3, q_tensor_width),
                                              dtype=mstype.float32), name="weight")
    else:
        self.query_layer = nn.Dense(q_tensor_width, hidden_width,
                                    activation=query_act, has_bias=False).to_float(compute_type)
        self.key_layer = nn.Dense(k_tensor_width, hidden_width,
                                  activation=key_act, has_bias=False).to_float(compute_type)
        self.value_layer = nn.Dense(q_tensor_width, hidden_width,
                                    activation=value_act, has_bias=False).to_float(compute_type)
    self.out_proj = nn.Dense(hidden_width, out_tensor_width,
                             activation=out_act, has_bias=False).to_float(compute_type)

    self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
    self.multiply = P.Mul()
    self.transpose = P.Transpose()
    self.trans_shape = (0, 2, 1, 3)
    self.trans_shape_relative = (2, 0, 1, 3)
    self.trans_shape_position = (1, 2, 0, 3)
    self.multiply_data = Tensor([-10000.0,], dtype=compute_type)
    self.matmul = P.BatchMatMul()
    self.softmax = nn.Softmax()
    self.dropout = nn.Dropout(1. - attention_probs_dropout_prob)
    self.use_dropout = attention_probs_dropout_prob > 0
    if self.has_attention_mask:
        self.expand_dims = P.ExpandDims()
        self.sub = P.Sub()
        self.add = P.TensorAdd()
        self.cast = P.Cast()
        self.get_dtype = P.DType()
    self.softmax_cast = P.Cast()
    self.matmul_dense = P.MatMul(transpose_b=True)
    self.split = P.Split(0, 3)
    self.equal = P.Equal()
    self.shape = P.Shape()
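# Hypothetical construction sketch; every width and the head count below are
# illustrative assumptions. With same_dim=True the fused `in_proj_layer` path
# above is taken and split three ways for Q/K/V.
def example_multihead_attention():
    return MultiheadAttention(q_tensor_width=256, k_tensor_width=256,
                              v_tensor_width=256, hidden_width=256,
                              out_tensor_width=256, num_attention_heads=8,
                              attention_probs_dropout_prob=0.1)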
def __init__(self):
    super(Net, self).__init__()
    self.matmul = P.MatMul()
    self.z = Parameter(Tensor(np.array([1.0], np.float32)), name='z')