def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0,
             decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
    """Set up the state used by the momentum update.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
            An ``Iterable`` or a 1-D ``Tensor`` enables the ``dynamic_lr`` state
            (gather/assign-add operators and a ``global_step`` parameter).
        momentum: Momentum value wrapped in a ``Parameter``; a plain number
            must not be negative.
        weight_decay: Weight-decay factor; stored multiplied by ``loss_scale``.
        loss_scale: Loss-scale factor; its reciprocal is stored as
            ``reciprocal_scale``.
        decay_filter: Predicate called once per parameter; the results are
            stored as the tuple ``decay_tf``.

    Raises:
        ValueError: If ``momentum`` is a negative ``int`` or ``float``.
    """
    super(Momentum, self).__init__(learning_rate, params)
    # Integers are validated as well: with a float-only test a value such as
    # ``-1`` bypassed the range check and became a negative momentum.
    if isinstance(momentum, (int, float)) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    if isinstance(learning_rate, Iterable) or \
            (isinstance(learning_rate, Tensor) and learning_rate.dim() == 1):
        # Per-step learning-rate state.
        self.dynamic_lr = True
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mstype.int32), name="global_step")
        self.axis = 0
    else:
        # Keep the same attributes defined so later code can test them.
        self.dynamic_lr = False
        self.gather = None
        self.assignadd = None
        self.global_step = None
        self.axis = None
    self.momentum = Parameter(momentum, name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.decay_tf = tuple(decay_filter(x) for x in self.parameters)
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.weight_decay = weight_decay * loss_scale
    self.reciprocal_scale = 1.0 / loss_scale
    self.one = Tensor(1, mstype.int32)
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
             weight_decay=0.0, loss_scale=1.0, decay_filter=lambda x: x.name not in []):
    """Set up the THOR optimizer state.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value, stored as a float32 ``Parameter``; a plain
            number must not be negative.
        matrix_A: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_A``.
        matrix_G: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_G``.
        A_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.A_inv_max``.
        G_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.G_inv_max``.
        weight_decay: Weight-decay factor; forwarded to the base class and
            stored multiplied by ``loss_scale``.
        loss_scale: Loss-scale factor; forwarded to the base class.
        decay_filter: Predicate called once per parameter; the results are
            stored as the tuple ``decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a negative ``int`` or ``float``.
    """
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    # Integers are validated as well: with a float-only test a value such as
    # ``-1`` bypassed the range check and became a negative momentum.
    if isinstance(momentum, (int, float)) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
    self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
    self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
    self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()

    # Positions of parameters whose name contains "conv" or "end_point",
    # followed by the total parameter count as a closing entry.
    weight_idx = [idx for idx, param in enumerate(self.params)
                  if "conv" in param.name or "end_point" in param.name]
    weight_idx.append(len(self.params))
    self.weight_idx = weight_idx

    # 54 scale factors: 1 + 11 + 13 + 19 + 9 reciprocals and a trailing 1.0.
    # NOTE(review): the denominators look like per-layer feature-map areas - confirm.
    self.feature_map = ([1.0 / 12544]
                        + [1.0 / 3136] * 11
                        + [1.0 / 784] * 13
                        + [1.0 / 196] * 19
                        + [1.0 / 49] * 9
                        + [1.0])
    mean = _get_gradients_mean()
    degree = _get_device_num()
    parameter_length = len(self.feature_map)
    self.grad_reducer_Amax = DistributedGradReducerThor(parameter_length, ((27,), 2), mean, degree)
    self.grad_reducer_Gmax = DistributedGradReducerThor(parameter_length, ((27,), 4), mean, degree)
    self.grad_reducer_A = DistributedGradReducerThor(parameter_length, ((27,), 6), mean, degree)
    self.grad_reducer_G = DistributedGradReducerThor(parameter_length, ((27,), 8), mean, degree)
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    # Build the 54 "matrix_max<i>" parameters in one pass instead of growing
    # a tuple by repeated concatenation.
    matrix_max_inv = tuple(
        Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False)
        for i in range(54))
    self.log = P.Log()
    self.exp = P.Exp()
    self.sqrt = P.Sqrt()
    self.matrix_max_inv = ParameterTuple(matrix_max_inv)
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, weight_decay=0.0,
             loss_scale=1.0, num_hidden_layers=24, batch_size=12, damping=0.03,
             decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
    """Set up the THOR optimizer state.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value, stored as a float32 ``Parameter``; a plain
            number must not be negative.
        matrix_A: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_A``.
        matrix_G: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_G``.
        weight_decay: Weight-decay factor; forwarded to the base class and
            stored multiplied by ``loss_scale``.
        loss_scale: Loss-scale factor; forwarded to the base class.
        num_hidden_layers: Stored unchanged as ``self.num_hidden_layers``.
        batch_size: Stored unchanged as ``self.batch_size``.
        damping: Stored unchanged as ``self.damping``.
        decay_filter: Predicate called once per parameter; the results are
            stored as the tuple ``decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a negative ``int`` or ``float``.
    """
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    # Integers are validated as well: with a float-only test a value such as
    # ``-1`` bypassed the range check and became a negative momentum.
    if isinstance(momentum, (int, float)) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)

    # Operator instances kept as attributes.
    self.matmul = P.MatMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.gather = P.GatherV2()

    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.num_hidden_layers = num_hidden_layers
    self.sqrt = P.Sqrt()
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    self.expand = P.ExpandDims()
    self.square = P.Square()
    self.inv = P.Inv()
    self.batch_size = batch_size
    self.damping = damping
    self.one = Tensor(1, mstype.int32)
    # int32 counter of shape [1], initialised to 0 and excluded from gradients.
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
def __init__(self, var, accum):
    """Create the operators, parameters and constants used by the net.

    Args:
        var: Initial value wrapped in the ``Parameter`` named "variable".
        accum: Initial value wrapped in the ``Parameter`` named "accumulate".
    """
    super(MomentumFusionNet, self).__init__()

    # Operator instances.
    self.op = P.ApplyMomentum()
    self.add = P.AddN()
    self.mul = P.Mul()

    # State held as parameters.
    self.var = Parameter(var, name="variable")
    self.accum = Parameter(accum, name="accumulate")

    # Fixed hyper-parameters.
    self.lr, self.weight_decay, self.moment = 0.1, 0.002, 0.98
def __init__(self, weights):
    """Create the optimizer's parameters and operators.

    Args:
        weights: Object stored as ``self.weights``; its ``clone`` method is
            called to create the zero-initialised ``moments``.
    """
    super(OptimizerByMomentum, self).__init__()

    # Scalar hyper-parameters held as parameters.
    self.learning_rate = Parameter(0.1, name="learning_rate")
    self.momentum = Parameter(0.05, name="momentum")
    self.iter = Parameter(0, name="iter")

    # Weights and their zero-initialised companions.
    self.weights = weights
    zero_moments = weights.clone(prefix="moments", init='zeros')
    self.moments = zero_moments

    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
def __init__(self):
    """Create the ApplyMomentum operator and its five 'normal'-initialised parameters."""
    super(Net, self).__init__()
    self.apply_momentum = P.ApplyMomentum(gradient_scale=1024.0)

    tensor_dims = (2, 3, 3, 4)
    scalar_dims = (1,)

    def normal(dims):
        # A fresh list per call, as each parameter originally received its own shape list.
        return initializer('normal', list(dims))

    self.variable = Parameter(normal(tensor_dims), name='variable')
    self.accumulation = Parameter(normal(tensor_dims), name='accumulation')
    self.learning_rate = Parameter(normal(scalar_dims), name='learning_rate')
    self.gradient = Parameter(normal(tensor_dims), name='gradient')
    self.momentum = Parameter(normal(scalar_dims), name='momentum')
def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0):
    """Set up the state used by the momentum update.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value, stored as a float32 ``Parameter``; a plain
            number must not be negative.
        weight_decay: Weight-decay factor; forwarded to the base class.
        loss_scale: Loss-scale factor; forwarded to the base class.

    Raises:
        ValueError: If ``momentum`` is a negative ``int`` or ``float``.
    """
    super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale)
    # Integers are validated as well: with a float-only test a value such as
    # ``-1`` bypassed the range check and became a negative momentum.
    if isinstance(momentum, (int, float)) and momentum < 0.0:
        raise ValueError(
            "momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    # One zero-initialised clone per parameter, named with the "moments" prefix.
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
             weight_decay=0.0, loss_scale=1.0, use_nesterov=False,
             decay_filter=lambda x: x.name not in []):
    """Set up the THOR_GPU optimizer state.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value; type-checked against ``[float]`` and stored
            as a float32 ``Parameter``.
        matrix_A: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_A``.
        matrix_G: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_G``.
        A_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.A_inv_max``.
        G_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.G_inv_max``.
        weight_decay: Forwarded to the base class and stored unchanged.
        loss_scale: Loss-scale factor; forwarded to the base class.
        use_nesterov: Passed through ``Validator.check_bool`` and on to
            ``P.ApplyMomentum``.
        decay_filter: Predicate called once per parameter; the results are
            stored as the tuple ``decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a negative float.
    """
    super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
    Validator.check_value_type("momentum", momentum, [float], self.cls_name)
    negative_momentum = isinstance(momentum, float) and momentum < 0.0
    if negative_momentum:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.use_nesterov = Validator.check_bool(use_nesterov)
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

    # 54 scale factors: 1 + 11 + 13 + 19 + 9 reciprocals and a trailing 1.0.
    feature_map = ([1.0 / 12544]
                   + [1.0 / 3136] * 11
                   + [1.0 / 784] * 13
                   + [1.0 / 196] * 19
                   + [1.0 / 49] * 9
                   + [1.0])
    self.feature_map = feature_map
    # Element-wise square roots of the factors above.
    self.feature_map_new = [factor ** 0.5 for factor in feature_map]

    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.matmul = P.MatMul()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.assign = P.Assign()
    self.mul = P.Mul()

    mean = _get_gradients_mean()
    degree = _get_device_num()
    parameter_length = len(feature_map)
    # Both reducers are constructed with the same arguments.
    self.grad_reducer_thorA = DistributedGradReducerThor(
        parameter_length, ((parameter_length,), 0), mean, degree)
    self.grad_reducer_thorG = DistributedGradReducerThor(
        parameter_length, ((parameter_length,), 0), mean, degree)
    self.weight_decay = weight_decay
    self.decay_flags = tuple(decay_filter(param) for param in self.parameters)
    self.update_gradient = P.UpdateThorGradient(split_dim=128)
def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0,
             decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
    """Set up the state used by the momentum update.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value wrapped in a ``Parameter``; a plain number
            must not be negative.
        weight_decay: Weight-decay factor; forwarded to the base class.
        loss_scale: Loss-scale factor; forwarded to the base class.
        decay_filter: Predicate on a parameter; forwarded to the base class.

    Raises:
        ValueError: If ``momentum`` is a negative ``int`` or ``float``.
    """
    super(Momentum, self).__init__(learning_rate, params, weight_decay, loss_scale, decay_filter)
    # Integers are validated as well: with a float-only test a value such as
    # ``-1`` bypassed the range check and became a negative momentum.
    if isinstance(momentum, (int, float)) and momentum < 0.0:
        raise ValueError(
            "momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(momentum, name="momentum")
    self.params = self.parameters
    # One zero-initialised clone per parameter, named with the "moments" prefix.
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0,
             use_nesterov=False):
    """Set up the state used by the momentum update and the summary operator.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value, stored as a float32 ``Parameter``; a plain
            number must not be negative.
        weight_decay: Weight-decay factor; forwarded to the base class.
        loss_scale: Loss-scale factor; forwarded to the base class.
        use_nesterov: Passed through ``check_bool`` and on to
            ``P.ApplyMomentum``.

    Raises:
        ValueError: If ``momentum`` is a negative ``int`` or ``float``.
    """
    super(MyMomentum, self).__init__(learning_rate, params, weight_decay, loss_scale)
    # Integers are validated as well: with a float-only test a value such as
    # ``-1`` bypassed the range check and became a negative momentum.
    if isinstance(momentum, (int, float)) and momentum < 0.0:
        raise ValueError(
            "momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.use_nesterov = check_bool(use_nesterov)
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)
    self.scalar_summary = P.ScalarSummary()
    # Parameter names, in the same order as ``self.parameters``.
    self.weight_names = [param.name for param in self.parameters]
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
             weight_decay=0.0, loss_scale=1.0, use_nesterov=False,
             decay_filter=lambda x: x.name not in []):
    """Set up the SKFAC_GPU optimizer state.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value; type-checked against ``[float]`` and stored
            as a float32 ``Parameter``.
        matrix_A: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_A``.
        matrix_G: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_G``.
        A_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.A_inv_max``.
        G_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.G_inv_max``.
        weight_decay: Forwarded to the base class and stored unchanged.
        loss_scale: Loss-scale factor; forwarded to the base class.
        use_nesterov: Passed through ``Validator.check_bool`` and on to
            ``P.ApplyMomentum``.
        decay_filter: Predicate called once per parameter; the results are
            stored as the tuple ``decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a negative float.
    """
    super(SKFAC_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
    Validator.check_value_type("momentum", momentum, [float], self.cls_name)
    negative_momentum = isinstance(momentum, float) and momentum < 0.0
    if negative_momentum:
        raise ValueError(
            "momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.use_nesterov = Validator.check_bool(use_nesterov)
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

    # Operator instances kept as attributes.
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.matmul = P.MatMul()

    # Caller-supplied matrices and maxima, wrapped as parameter tuples.
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)

    self.assign = P.Assign()
    self.mul = P.Mul()
    self.weight_decay = weight_decay
    self.decay_flags = tuple(decay_filter(param) for param in self.parameters)
'block': G.FusedBatchNormGrad(), 'desc_inputs': [[128, 64, 32, 64], [128, 64, 32, 64], [64], [64], [64]], 'desc_bprop': [[128, 64, 32, 64], [64], [64], [64], [64]], 'skip': ['backward']}), ('BatchNorm', { 'block': P.BatchNorm(), 'desc_inputs': [[128, 64, 32, 32], [64], [64], [64], [64]], 'desc_bprop': [[128, 64, 32, 32], [64], [64], [64], [64]], 'skip': []}), ('BatchNormGrad', { 'block': G.BatchNormGrad(), 'desc_inputs': [[128, 64, 32, 32], [128, 64, 32, 32], [64], [64], [64], [64]], 'desc_bprop': [[128, 64, 32, 32], [64], [64], [64], [64]], 'skip': ['backward']}), ('ApplyMomentum', { 'block': P.ApplyMomentum(), 'desc_inputs': [[128, 32, 32, 64], [128, 32, 32, 64], [32, 32, 64], [32, 32, 64], [32, 32, 64]], 'desc_bprop': [[128, 32, 32, 64]], 'skip': ['backward']}), ('TopK', { 'block': P.TopK(), 'desc_const': [5], 'desc_inputs': [[20, 20, 10]], 'desc_bprop': [[20, 20, 5]], 'skip': ['backward']}), ('GatherV2_0', { 'block': P.GatherV2(), 'desc_const': [0], 'desc_inputs': [[3, 1, 2], Tensor(np.array([0, 1]).astype(np.int32))], 'desc_bprop': [[2, 1, 2]]}),
'block': UnfoldNetSame(), 'desc_inputs': [Tensor(np.ones([1, 1, 3, 3], np.float32))], 'desc_bprop': [Tensor(np.ones([1, 4, 3, 3], np.float32))], 'skip': ['backward'] }), ('UnfoldGrad', { 'block': GradWrapUnfold(UnfoldNetValid()), 'desc_inputs': [Tensor(np.ones([1, 1, 3, 3], np.float32))], 'desc_bprop': [Tensor(np.ones([1, 4, 2, 2], np.float32))], 'skip': ['backward'] }), ] test_cases_for_verify_exception = [ ('ApplyMomentum_Error', { 'block': (P.ApplyMomentum(), { 'exception': TypeError }), 'desc_inputs': [[2], [128, 32, 32, 64], [128, 32, 32, 64], [128, 32, 32, 64], [128, 32, 32, 64]], 'desc_bprop': [[128, 32, 32, 64]], 'skip': ['backward'] }), ('Conv2d_ValueError_1', { 'block': (lambda _: P.Conv2D(3, 4, mode=-2.0), { 'exception': TypeError }), 'desc_inputs': [0], }), ('Conv2d_ValueError_2', { 'block': (lambda _: P.Conv2D(3, 4, mode=-2), {
# See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ from mindspore.ops import Primitive from mindspore.ops import operations as P from mindspore.ops import _constants as Constants depend = P.Depend() all_reduce = P.AllReduce() broadcast = P.Broadcast(1) tensor_move = Primitive('TensorMove') make_tuple = Primitive('MakeTuple') tuple_getitem = Primitive(Constants.kTupleGetItem) assign_add = P.AssignAdd() apply_momentun = P.ApplyMomentum() relu = P.ReLU() class FnDict: def __init__(self): self.fnDict = {} def __call__(self, fn): self.fnDict[fn.__name__] = fn def __getitem__(self, name): return self.fnDict[name] def test_insert_tensor_move_for_hccl_op_cond1(tag):
}), ('PReLUNet', { 'block': PReLUNet(), 'desc_inputs': [Tensor(np.ones([1, 3, 4, 4], np.float32))], }), ('PReLUGradNet', { 'block': PReLUGradNet(), 'desc_inputs': [Tensor(np.ones([1, 3, 4, 4], np.float32)), Tensor(np.ones([1, 3, 4, 4], np.float32)), Tensor(np.ones(3, np.float32))], }), ] test_cases_for_verify_exception = [ ('ApplyMomentum_Error', { 'block': (P.ApplyMomentum(), {'exception': TypeError}), 'desc_inputs': [[2], [128, 32, 32, 64], [128, 32, 32, 64], [128, 32, 32, 64], [128, 32, 32, 64]], 'desc_bprop': [[128, 32, 32, 64]], 'skip': ['backward'] }), ('Conv2d_ValueError_1', { 'block': (lambda _: P.Conv2D(3, 4, mode=-2.0), {'exception': TypeError}), 'desc_inputs': [0], }), ('Conv2d_ValueError_2', { 'block': (lambda _: P.Conv2D(3, 4, mode=-2), {'exception': ValueError}), 'desc_inputs': [0], }), ('MaxPoolWithArgmax_ValueError_1', { 'block': (lambda _: P.MaxPoolWithArgmax(padding='sane'), {'exception': ValueError}), 'desc_inputs': [0],
def __init__(self, var, accum):
    """Create the ApplyMomentum operator and the two parameters it works on.

    Args:
        var: Initial value wrapped in the ``Parameter`` named 'var'.
        accum: Initial value wrapped in the ``Parameter`` named 'accum'.
    """
    super(ApplyMomentumNet, self).__init__()
    # The operator is constructed with a gradient scale of 1024.0.
    momentum_op = P.ApplyMomentum(gradient_scale=1024.0)
    self.apply_momentum = momentum_op
    self.var = Parameter(var, name='var')
    self.accum = Parameter(accum, name='accum')
test_cases = [ ('SoftMaxGrad', { 'block': SoftMaxGrad(VirtualNetWithLoss(P.Softmax())), 'desc_inputs': [[128, 32, 32, 64]], 'desc_bprop': [[128, 32, 32, 64]], }), ('DropoutGrad', { 'block': DropoutGrad(VirtualNetWithLoss(nn.Dropout())), 'desc_inputs': [[128, 32, 32, 64]], 'desc_bprop': [[128, 32, 32, 64]], }), ('ApplyMomentum', { 'block': P.ApplyMomentum(), 'desc_inputs': [[2], [128, 32, 32, 64], [128, 32, 32, 64], [128, 32, 32, 64], [128, 32, 32, 64]], 'desc_bprop': [[128, 32, 32, 64]], 'skip': ['backward'] }), ('ScalarSummary', { 'block': ScalarSummaryNet(), 'desc_inputs': [2.2], }), ('FusedBatchNormGrad', { 'block': FusedBatchNormGrad( nn.BatchNorm2d(num_features=512, eps=1e-5, momentum=0.1)), 'desc_inputs': [[64, 512, 7, 7], [64, 512, 7, 7]], 'desc_bprop': [[64, 512, 7, 7]],
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
             weight_decay=0.0, loss_scale=1.0, batch_size=32.0,
             decay_filter=lambda x: x.name not in []):
    """Set up the THOR optimizer state.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value, stored as a float32 ``Parameter``; a plain
            number must not be negative.
        matrix_A: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_A``.
        matrix_G: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_G``.
        A_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.A_inv_max``.
        G_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.G_inv_max``.
        weight_decay: Weight-decay factor; forwarded to the base class and
            stored multiplied by ``loss_scale``.
        loss_scale: Loss-scale factor; forwarded to the base class.
        batch_size: Stored unchanged as ``self.batch_size``.
        decay_filter: Predicate called once per parameter; the results are
            stored as the tuple ``decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a negative ``int`` or ``float``.
    """
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    # Integers are validated as well: with a float-only test a value such as
    # ``-1`` bypassed the range check and became a negative momentum.
    if isinstance(momentum, (int, float)) and momentum < 0.0:
        raise ValueError(
            "momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
    self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
    self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
    self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()

    # Positions of parameters whose name contains "conv" or "end_point",
    # followed by the total parameter count as a closing entry.
    weight_idx = [idx for idx, param in enumerate(self.params)
                  if "conv" in param.name or "end_point" in param.name]
    weight_idx.append(len(self.params))
    self.weight_idx = weight_idx

    # 54 scale factors: 1 + 11 + 13 + 19 + 9 reciprocals and a trailing 1.0.
    # NOTE(review): the denominators look like per-layer feature-map areas - confirm.
    self.feature_map = ([1.0 / 12544]
                        + [1.0 / 3136] * 11
                        + [1.0 / 784] * 13
                        + [1.0 / 196] * 19
                        + [1.0 / 49] * 9
                        + [1.0])
    mean = _get_mirror_mean()
    degree = _get_device_num()
    self.grad_reducer_Amax = DistributedGradReducerThor(self.parameters, 2, mean, degree)
    self.grad_reducer_Gmax = DistributedGradReducerThor(self.parameters, 5, mean, degree)
    self.grad_reducer_A = DistributedGradReducerThor(self.parameters, 3, mean, degree)
    self.grad_reducer_G = DistributedGradReducerThor(self.parameters, 4, mean, degree)
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    # Build the 54 "matrix_max<i>" parameters in one pass instead of growing
    # a tuple by repeated concatenation.
    matrix_max_inv = tuple(
        Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False)
        for i in range(54))
    self.log = P.Log()
    self.exp = P.Exp()
    self.sqrt = P.Sqrt()
    self.matrix_max_inv = ParameterTuple(matrix_max_inv)
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    # Hard-coded index tables.
    # NOTE(review): presumably tied to one specific network layout - confirm against the model.
    self.conv_index = [
        0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 17, 18, 19, 22, 23, 24, 25, 28,
        29, 30, 33, 34, 35, 38, 39, 40, 43, 44, 45, 46, 49, 50, 51, 54, 55,
        56, 59, 60, 61, 64, 65, 66, 69, 70, 71, 74, 75, 76, 77, 80, 81, 82, 85
    ]
    self.batch_size = batch_size
    self.bn_index = [
        3, 7, 10, 13, 17, 20, 23, 26, 30, 33, 36, 39, 42, 45, 49, 52
    ]
    self.bn_gradient_index = [
        -1, -1, -1, 4, -1, -1, -1, 10, -1, -1, 15, -1, -1, 20, -1, -1, -1,
        26, -1, -1, 31, -1, -1, 36, -1, -1, 41, -1, -1, -1, 47, -1, -1, 52,
        -1, -1, 57, -1, -1, 62, -1, -1, 67, -1, -1, 72, -1, -1, -1, 78, -1,
        -1, 83
    ]
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import mindspore.common.dtype as mstype
from mindspore.common import monad
from mindspore.common.tensor import Tensor
from mindspore.ops import Primitive
from mindspore.ops import operations as P
from mindspore.ops import _constants as Constants
from mindspore.ops import functional as F

Mul = P.Mul()
ApplyMomentum = P.ApplyMomentum()
FusedMulApplyMomentum = Primitive('FusedMulApplyMomentum')
tuple_getitem = Primitive(Constants.kTupleGetItem)
make_tuple = Primitive('make_tuple')
constant = Tensor(1.0, mstype.float32)


class FnDict:
    """Registry that stores functions under their ``__name__``."""

    def __init__(self):
        # Maps function name -> function object.
        self.fnDict = dict()

    def __call__(self, fn):
        # Registers ``fn`` and returns nothing, so when an instance is used as
        # a decorator the decorated name ends up bound to None.
        key = fn.__name__
        self.fnDict[key] = fn

    def __getitem__(self, name):
        # Raises KeyError for a name that was never registered.
        registry = self.fnDict
        return registry[name]
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
             weight_decay=0.0, loss_scale=1.0, num_hidden_layers=24, batch_size=12,
             damping=0.03, frequency=10,
             decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
    """Set up the THOR optimizer state.

    Args:
        params: Parameters to optimize; forwarded to the base-class initializer.
        learning_rate: Learning rate; forwarded to the base-class initializer.
        momentum: Momentum value, stored as a float32 ``Parameter``; a plain
            number must not be negative.
        matrix_A: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_A``.
        matrix_G: Iterable wrapped in a ``ParameterTuple`` as ``self.matrix_G``.
        A_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.A_inv_max``.
        G_inv_max: Iterable wrapped in a ``ParameterTuple`` as ``self.G_inv_max``.
        weight_decay: Weight-decay factor; forwarded to the base class and
            stored multiplied by ``loss_scale``.
        loss_scale: Loss-scale factor; forwarded to the base class.
        num_hidden_layers: Stored unchanged; also sets the number of
            "matrix_max<i>" parameters to ``num_hidden_layers * 6 + 5``.
        batch_size: Stored unchanged as ``self.batch_size``.
        damping: Stored unchanged as ``self.damping``.
        frequency: Stored as the int32 tensor ``self.freq``.
        decay_filter: Predicate called once per parameter; the results are
            stored as the tuple ``decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a negative ``int`` or ``float``.
    """
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    # Integers are validated as well: with a float-only test a value such as
    # ``-1`` bypassed the range check and became a negative momentum.
    if isinstance(momentum, (int, float)) and momentum < 0.0:
        raise ValueError(
            "momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.matmul = P.MatMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.gather = P.GatherV2()
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.num_hidden_layers = num_hidden_layers
    fc_layer_num = num_hidden_layers * 6 + 5
    # Build the "matrix_max<i>" parameters in one pass instead of growing a
    # tuple by repeated concatenation.
    matrix_max_inv = tuple(
        Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False)
        for i in range(fc_layer_num))
    self.log = P.Log()
    self.exp = P.Exp()
    self.sqrt = P.Sqrt()
    self.matrix_max_inv = ParameterTuple(matrix_max_inv)
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    self.expand = P.ExpandDims()
    self.square = P.Square()
    self.inv = P.Inv()
    self.batch_size = batch_size
    self.damping = damping
    self.freq = Tensor(frequency, mstype.int32)
    self.one = Tensor(1, mstype.int32)
    # int32 counter of shape [1], initialised to 0 and excluded from gradients.
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
    mean = _get_mirror_mean()
    degree = _get_device_num()
    self.grad_reducer_g = DistributedGradReducerThor1(self.parameters, 3, mean, degree)