Example #1
 def __init__(self, params, learning_rate, momentum, weight_decay=0.0, loss_scale=1.0,
              decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
     super(Momentum, self).__init__(learning_rate, params)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
     if isinstance(learning_rate, Iterable) or \
             (isinstance(learning_rate, Tensor) and learning_rate.dim() == 1):
         self.dynamic_lr = True
         self.gather = P.GatherV2()
         self.assignadd = P.AssignAdd()
         self.global_step = Parameter(initializer(0, [1], mstype.int32), name="global_step")
         self.axis = 0
     else:
         self.dynamic_lr = False
         self.gather = None
         self.assignadd = None
         self.global_step = None
         self.axis = None
     self.momentum = Parameter(momentum, name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.decay_tf = tuple(decay_filter(x) for x in self.parameters)
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.weight_decay = weight_decay * loss_scale
     self.reciprocal_scale = 1.0 / loss_scale
     self.one = Tensor(1, mstype.int32)
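A minimal sketch of the construct() that pairs with this __init__, assuming module-level MultitypeFuncGraph helpers apply_decay, grad_scale, and momentum_opt plus the functional namespace F (none of which appear in this excerpt); it illustrates how self.opt, self.gather, and self.global_step are consumed, not the verbatim source.
 def construct(self, gradients):
     params = self.params
     moments = self.moments
     # fold weight decay into the gradients of the parameters selected by decay_tf
     if self.weight_decay > 0:
         gradients = self.hyper_map(F.partial(apply_decay, self.weight_decay),
                                    self.decay_tf, params, gradients)
     # undo the loss scale applied during the forward/backward pass
     if self.reciprocal_scale != 1.0:
         gradients = self.hyper_map(F.partial(grad_scale, self.reciprocal_scale),
                                    gradients)
     # dynamic lr: pick this step's rate and advance the step counter
     if self.dynamic_lr:
         lr = self.gather(self.learning_rate, self.global_step, self.axis)
         self.assignadd(self.global_step, self.one)
     else:
         lr = self.learning_rate
     # apply P.ApplyMomentum element-wise over (grad, param, moment) triples
     return self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum),
                           gradients, params, moments)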
Example #2
    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
                 loss_scale=1.0,
                 decay_filter=lambda x: x.name not in []):
        super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32))
        self.params = self.parameters
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
        self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
        self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
        self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.weight_idx = []
        for i in range(len(self.params)):
            if "conv" in self.params[i].name or "end_point" in self.params[i].name:
                self.weight_idx.append(i)
        self.weight_idx.append(len(self.params))
        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        mean = _get_gradients_mean()
        degree = _get_device_num()
        parameter_length = len(self.feature_map)
        self.grad_reducer_Amax = DistributedGradReducerThor(parameter_length, ((27,), 2), mean, degree)
        self.grad_reducer_Gmax = DistributedGradReducerThor(parameter_length, ((27,), 4), mean, degree)
        self.grad_reducer_A = DistributedGradReducerThor(parameter_length, ((27,), 6), mean, degree)
        self.grad_reducer_G = DistributedGradReducerThor(parameter_length, ((27,), 8), mean, degree)
        self.matrix_A_inv = ()
        self.matrix_G_inv = ()
        self.matrix_max_inv = ()

        for i in range(54):
            self.matrix_max_inv = self.matrix_max_inv + (
                Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),)
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
        self.assign = P.Assign()
        self.cast = P.Cast()
        self.thor = True
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
Example #3
 def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, weight_decay=0.0,
              loss_scale=1.0, num_hidden_layers=24, batch_size=12, damping=0.03,
              decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
     super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.matmul = P.MatMul()
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.mul = P.Mul()
     self.gather = P.GatherV2()
     self.matrix_A_inv = ()
     self.matrix_G_inv = ()
     self.num_hidden_layers = num_hidden_layers
     self.sqrt = P.Sqrt()
     self.assign = P.Assign()
     self.cast = P.Cast()
     self.thor = True
     self.weight_decay = weight_decay * loss_scale
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
     self.expand = P.ExpandDims()
     self.square = P.Square()
     self.inv = P.Inv()
     self.batch_size = batch_size
     self.damping = damping
     self.one = Tensor(1, mstype.int32)
     self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
Example #4
 def __init__(self, var, accum):
     super(MomentumFusionNet, self).__init__()
     self.op = P.ApplyMomentum()
     self.add = P.AddN()
     self.mul = P.Mul()
     self.var = Parameter(var, name="variable")
     self.accum = Parameter(accum, name="accumulate")
     self.lr = 0.1
     self.weight_decay = 0.002
     self.moment = 0.98
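A minimal sketch of a matching construct(), assuming the intended fusion pattern is Mul + AddN feeding ApplyMomentum: fold L2 weight decay into the gradient, then run the fused in-place update. The input order (variable, accumulation, learning_rate, gradient, momentum) is the one ApplyMomentum documents.
 def construct(self, grad):
     # grad' = grad + weight_decay * var; AddN takes a tuple of addends
     decayed = self.add((grad, self.mul(self.var, self.weight_decay)))
     return self.op(self.var, self.accum, self.lr, decayed, self.moment)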
Example #5
    def __init__(self, weights):
        super(OptimizerByMomentum, self).__init__()
        self.learning_rate = Parameter(0.1, name="learning_rate")
        self.momentum = Parameter(0.05, name="momentum")
        self.iter = Parameter(0, name="iter")

        self.weights = weights
        self.moments = weights.clone(prefix="moments", init='zeros')

        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
Example #6
 def __init__(self):
     super(Net, self).__init__()
     self.apply_momentum = P.ApplyMomentum(gradient_scale=1024.0)
     self.variable = Parameter(initializer(
                             'normal', [2, 3, 3, 4]), name='variable')
     self.accumulation = Parameter(initializer(
                             'normal', [2, 3, 3, 4]), name='accumulation')
     self.learning_rate = Parameter(initializer(
                             'normal', [1, ]), name='learning_rate')
     self.gradient = Parameter(initializer(
                             'normal', [2, 3, 3, 4]), name='gradient')
     self.momentum = Parameter(initializer(
                             'normal', [1, ]), name='momentum')
Example #7
 def __init__(self,
              params,
              learning_rate,
              momentum,
              weight_decay=0.0,
              loss_scale=1.0):
     super(Momentum, self).__init__(learning_rate, params, weight_decay,
                                    loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32),
                               name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
Example #8
    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
                 weight_decay=0.0, loss_scale=1.0, use_nesterov=False, decay_filter=lambda x: x.name not in []):
        super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
        Validator.check_value_type("momentum", momentum, [float], self.cls_name)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32))
        self.params = self.parameters
        self.use_nesterov = Validator.check_bool(use_nesterov)
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        self.feature_map_new = [x ** 0.5 for x in self.feature_map]
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.matmul = P.MatMul()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.assign = P.Assign()
        self.mul = P.Mul()

        mean = _get_gradients_mean()
        degree = _get_device_num()

        parameter_length = len(self.feature_map)
        self.grad_reducer_thorA = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
        self.grad_reducer_thorG = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
        self.weight_decay = weight_decay
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.update_gradient = P.UpdateThorGradient(split_dim=128)
Example #9
 def __init__(self,
              params,
              learning_rate,
              momentum,
              weight_decay=0.0,
              loss_scale=1.0,
              decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in
              x.name):
     super(Momentum, self).__init__(learning_rate, params, weight_decay,
                                    loss_scale, decay_filter)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(momentum, name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
Example #10
 def __init__(self,
              params,
              learning_rate,
              momentum,
              weight_decay=0.0,
              loss_scale=1.0,
              use_nesterov=False):
     super(MyMomentum, self).__init__(learning_rate, params, weight_decay,
                                      loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32),
                               name="momentum")
     self.params = self.parameters
     self.use_nesterov = check_bool(use_nesterov)
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)
     self.scalar_summary = P.ScalarSummary()
     self.weight_names = [param.name for param in self.parameters]
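A minimal sketch of a matching construct(), assuming the Optimizer base class's get_lr() and the same hypothetical momentum_opt dispatcher as in the Example #1 sketch; ScalarSummary records a named scalar (here the effective learning rate) for the summary collector.
 def construct(self, gradients):
     lr = self.get_lr()
     self.scalar_summary("learning_rate", lr)  # tag name is an assumption
     return self.hyper_map(F.partial(momentum_opt, self.opt, lr, self.momentum),
                           gradients, self.params, self.moments)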
Example #11
 def __init__(self,
              params,
              learning_rate,
              momentum,
              matrix_A,
              matrix_G,
              A_inv_max,
              G_inv_max,
              weight_decay=0.0,
              loss_scale=1.0,
              use_nesterov=False,
              decay_filter=lambda x: x.name not in []):
     super(SKFAC_GPU, self).__init__(learning_rate, params, weight_decay,
                                     loss_scale)
     Validator.check_value_type("momentum", momentum, [float],
                                self.cls_name)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32))
     self.params = self.parameters
     self.use_nesterov = Validator.check_bool(use_nesterov)
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.matmul = P.MatMul()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.A_inv_max = ParameterTuple(A_inv_max)
     self.G_inv_max = ParameterTuple(G_inv_max)
     self.assign = P.Assign()
     self.mul = P.Mul()
     self.weight_decay = weight_decay
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
Example #12
 ('FusedBatchNormGrad', {
     'block': G.FusedBatchNormGrad(),
     'desc_inputs': [[128, 64, 32, 64], [128, 64, 32, 64], [64], [64], [64]],
     'desc_bprop': [[128, 64, 32, 64], [64], [64], [64], [64]],
     'skip': ['backward']}),
 ('BatchNorm', {
     'block': P.BatchNorm(),
     'desc_inputs': [[128, 64, 32, 32], [64], [64], [64], [64]],
     'desc_bprop': [[128, 64, 32, 32], [64], [64], [64], [64]],
     'skip': []}),
 ('BatchNormGrad', {
     'block': G.BatchNormGrad(),
     'desc_inputs': [[128, 64, 32, 32], [128, 64, 32, 32], [64], [64], [64], [64]],
     'desc_bprop': [[128, 64, 32, 32], [64], [64], [64], [64]],
     'skip': ['backward']}),
 ('ApplyMomentum', {
     'block': P.ApplyMomentum(),
     'desc_inputs': [[128, 32, 32, 64], [128, 32, 32, 64],
                     [32, 32, 64], [32, 32, 64], [32, 32, 64]],
     'desc_bprop': [[128, 32, 32, 64]],
     'skip': ['backward']}),
 ('TopK', {
     'block': P.TopK(),
     'desc_const': [5],
     'desc_inputs': [[20, 20, 10]],
     'desc_bprop': [[20, 20, 5]],
     'skip': ['backward']}),
 ('GatherV2_0', {
     'block': P.GatherV2(),
     'desc_const': [0],
     'desc_inputs': [[3, 1, 2], Tensor(np.array([0, 1]).astype(np.int32))],
     'desc_bprop': [[2, 1, 2]]}),
Example #13
    ('UnfoldNetSame', {
        'block': UnfoldNetSame(),
        'desc_inputs': [Tensor(np.ones([1, 1, 3, 3], np.float32))],
        'desc_bprop': [Tensor(np.ones([1, 4, 3, 3], np.float32))],
        'skip': ['backward']
    }),
    ('UnfoldGrad', {
        'block': GradWrapUnfold(UnfoldNetValid()),
        'desc_inputs': [Tensor(np.ones([1, 1, 3, 3], np.float32))],
        'desc_bprop': [Tensor(np.ones([1, 4, 2, 2], np.float32))],
        'skip': ['backward']
    }),
]

test_cases_for_verify_exception = [
    ('ApplyMomentum_Error', {
        'block': (P.ApplyMomentum(), {
            'exception': TypeError
        }),
        'desc_inputs': [[2], [128, 32, 32, 64], [128, 32, 32, 64],
                        [128, 32, 32, 64], [128, 32, 32, 64]],
        'desc_bprop': [[128, 32, 32, 64]],
        'skip': ['backward']
    }),
    ('Conv2d_ValueError_1', {
        'block': (lambda _: P.Conv2D(3, 4, mode=-2.0), {
            'exception': TypeError
        }),
        'desc_inputs': [0],
    }),
    ('Conv2d_ValueError_2', {
        'block': (lambda _: P.Conv2D(3, 4, mode=-2), {
Example #14
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

from mindspore.ops import Primitive
from mindspore.ops import operations as P
from mindspore.ops import _constants as Constants

depend = P.Depend()
all_reduce = P.AllReduce()
broadcast = P.Broadcast(1)
tensor_move = Primitive('TensorMove')
make_tuple = Primitive('MakeTuple')
tuple_getitem = Primitive(Constants.kTupleGetItem)
assign_add = P.AssignAdd()
apply_momentum = P.ApplyMomentum()
relu = P.ReLU()


class FnDict:
    def __init__(self):
        self.fnDict = {}

    def __call__(self, fn):
        self.fnDict[fn.__name__] = fn

    def __getitem__(self, name):
        return self.fnDict[name]


def test_insert_tensor_move_for_hccl_op_cond1(tag):
Example #15
    }),
    ('PReLUNet', {
        'block': PReLUNet(),
        'desc_inputs': [Tensor(np.ones([1, 3, 4, 4], np.float32))],
    }),
    ('PReLUGradNet', {
        'block': PReLUGradNet(),
        'desc_inputs': [Tensor(np.ones([1, 3, 4, 4], np.float32)),
                        Tensor(np.ones([1, 3, 4, 4], np.float32)),
                        Tensor(np.ones(3, np.float32))],
    }),
]

test_cases_for_verify_exception = [
    ('ApplyMomentum_Error', {
        'block': (P.ApplyMomentum(), {'exception': TypeError}),
        'desc_inputs': [[2], [128, 32, 32, 64], [128, 32, 32, 64], [128, 32, 32, 64], [128, 32, 32, 64]],
        'desc_bprop': [[128, 32, 32, 64]],
        'skip': ['backward']
    }),
    ('Conv2d_ValueError_1', {
        'block': (lambda _: P.Conv2D(3, 4, mode=-2.0), {'exception': TypeError}),
        'desc_inputs': [0],
    }),
    ('Conv2d_ValueError_2', {
        'block': (lambda _: P.Conv2D(3, 4, mode=-2), {'exception': ValueError}),
        'desc_inputs': [0],
    }),
    ('MaxPoolWithArgmax_ValueError_1', {
        'block': (lambda _: P.MaxPoolWithArgmax(padding='sane'), {'exception': ValueError}),
        'desc_inputs': [0],
Example #16
 def __init__(self, var, accum):
     super(ApplyMomentumNet, self).__init__()
     self.apply_momentum = P.ApplyMomentum(gradient_scale=1024.0)
     self.var = Parameter(var, name='var')
     self.accum = Parameter(accum, name='accum')
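A minimal sketch of the matching construct() plus a hypothetical call, assuming the scalar hyper-parameters are fed in as float32 Tensors:
 def construct(self, lr, grad, moment):
     return self.apply_momentum(self.var, self.accum, lr, grad, moment)

 # hypothetical usage:
 # net = ApplyMomentumNet(Tensor(np.ones([2, 2]), mstype.float32),
 #                        Tensor(np.zeros([2, 2]), mstype.float32))
 # out = net(Tensor(0.1, mstype.float32),
 #           Tensor(np.ones([2, 2]), mstype.float32),
 #           Tensor(0.9, mstype.float32))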
Example #17

test_cases = [
    ('SoftMaxGrad', {
        'block': SoftMaxGrad(VirtualNetWithLoss(P.Softmax())),
        'desc_inputs': [[128, 32, 32, 64]],
        'desc_bprop': [[128, 32, 32, 64]],
    }),
    ('DropoutGrad', {
        'block': DropoutGrad(VirtualNetWithLoss(nn.Dropout())),
        'desc_inputs': [[128, 32, 32, 64]],
        'desc_bprop': [[128, 32, 32, 64]],
    }),
    ('ApplyMomentum', {
        'block':
        P.ApplyMomentum(),
        'desc_inputs': [[2], [128, 32, 32, 64], [128, 32, 32, 64],
                        [128, 32, 32, 64], [128, 32, 32, 64]],
        'desc_bprop': [[128, 32, 32, 64]],
        'skip': ['backward']
    }),
    ('ScalarSummary', {
        'block': ScalarSummaryNet(),
        'desc_inputs': [2.2],
    }),
    ('FusedBatchNormGrad', {
        'block':
        FusedBatchNormGrad(
            nn.BatchNorm2d(num_features=512, eps=1e-5, momentum=0.1)),
        'desc_inputs': [[64, 512, 7, 7], [64, 512, 7, 7]],
        'desc_bprop': [[64, 512, 7, 7]],
Example #18
    def __init__(self,
                 params,
                 learning_rate,
                 momentum,
                 matrix_A,
                 matrix_G,
                 A_inv_max,
                 G_inv_max,
                 weight_decay=0.0,
                 loss_scale=1.0,
                 batch_size=32.0,
                 decay_filter=lambda x: x.name not in []):
        super(THOR, self).__init__(learning_rate, params, weight_decay,
                                   loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError(
                "momentum should be at least 0.0, but got momentum {}".format(
                    momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32),
                                  name="momentum")
        self.params = self.parameters
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
        self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
        self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
        self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.weight_idx = []
        for i in range(len(self.params)):
            if "conv" in self.params[i].name or "end_point" in self.params[
                    i].name:
                self.weight_idx.append(i)
        self.weight_idx.append(len(self.params))
        self.feature_map = [
            1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
            1.0 / 3136, 1.0 / 3136, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 49, 1.0 / 49,
            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
            1.0 / 49, 1.0
        ]
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer_Amax = DistributedGradReducerThor(
            self.parameters, 2, mean, degree)
        self.grad_reducer_Gmax = DistributedGradReducerThor(
            self.parameters, 5, mean, degree)
        self.grad_reducer_A = DistributedGradReducerThor(
            self.parameters, 3, mean, degree)
        self.grad_reducer_G = DistributedGradReducerThor(
            self.parameters, 4, mean, degree)
        self.matrix_A_inv = ()
        self.matrix_G_inv = ()
        self.matrix_max_inv = ()

        for i in range(54):
            self.matrix_max_inv = self.matrix_max_inv + (Parameter(
                initializer(1, [1], mstype.float32),
                name="matrix_max" + str(i),
                requires_grad=False), )
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
        self.assign = P.Assign()
        self.cast = P.Cast()
        self.thor = True
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)

        self.conv_index = [
            0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 17, 18, 19, 22, 23, 24, 25, 28,
            29, 30, 33, 34, 35, 38, 39, 40, 43, 44, 45, 46, 49, 50, 51, 54, 55,
            56, 59, 60, 61, 64, 65, 66, 69, 70, 71, 74, 75, 76, 77, 80, 81, 82,
            85
        ]
        self.batch_size = batch_size
        self.bn_index = [
            3, 7, 10, 13, 17, 20, 23, 26, 30, 33, 36, 39, 42, 45, 49, 52
        ]
        self.bn_gradient_index = [
            -1, -1, -1, 4, -1, -1, -1, 10, -1, -1, 15, -1, -1, 20, -1, -1, -1,
            26, -1, -1, 31, -1, -1, 36, -1, -1, 41, -1, -1, -1, 47, -1, -1, 52,
            -1, -1, 57, -1, -1, 62, -1, -1, 67, -1, -1, 72, -1, -1, -1, 78, -1,
            -1, 83
        ]
Example #19
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import mindspore.common.dtype as mstype
from mindspore.common import monad
from mindspore.common.tensor import Tensor
from mindspore.ops import Primitive
from mindspore.ops import operations as P
from mindspore.ops import _constants as Constants
from mindspore.ops import functional as F

Mul = P.Mul()
ApplyMomentum = P.ApplyMomentum()
FusedMulApplyMomentum = Primitive('FusedMulApplyMomentum')
tuple_getitem = Primitive(Constants.kTupleGetItem)
make_tuple = Primitive('make_tuple')
constant = Tensor(1.0, mstype.float32)


class FnDict:
    def __init__(self):
        self.fnDict = {}

    def __call__(self, fn):
        self.fnDict[fn.__name__] = fn

    def __getitem__(self, name):
        return self.fnDict[name]
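The excerpt cuts off before the registered test graphs; what follows is a plausible sketch (names and signatures are assumptions, not the verbatim file) of the before/after pair such a fusion test would hand to FnDict, where the pass rewrites Mul(constant, grad) feeding ApplyMomentum into a single FusedMulApplyMomentum:
def test_momentum_mul_fusion(tag):
    fns = FnDict()

    @fns
    def before(var, accum, lr, grad, moment):
        # unfused form: scale the gradient, then apply the momentum update
        grad = Mul(constant, grad)
        return ApplyMomentum(var, accum, lr, grad, moment)

    @fns
    def after(var, accum, lr, grad, moment):
        # fused form produced by the pass; constant becomes the sixth input
        res = FusedMulApplyMomentum(var, accum, lr, grad, moment, constant)
        return make_tuple(res)

    return fns[tag]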
Example #20
 def __init__(self,
              params,
              learning_rate,
              momentum,
              matrix_A,
              matrix_G,
              A_inv_max,
              G_inv_max,
              weight_decay=0.0,
              loss_scale=1.0,
              num_hidden_layers=24,
              batch_size=12,
              damping=0.03,
              frequency=10,
              decay_filter=lambda x: 'layernorm' not in x.name.lower() and
              'bias' not in x.name.lower()):
     super(THOR, self).__init__(learning_rate, params, weight_decay,
                                loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32),
                               name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.A_inv_max = ParameterTuple(A_inv_max)
     self.G_inv_max = ParameterTuple(G_inv_max)
     self.matmul = P.MatMul()
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.mul = P.Mul()
     self.gather = P.GatherV2()
     self.matrix_A_inv = ()
     self.matrix_G_inv = ()
     self.matrix_max_inv = ()
     self.num_hidden_layers = num_hidden_layers
     fc_layer_num = num_hidden_layers * 6 + 5
     for i in range(fc_layer_num):
         self.matrix_max_inv = self.matrix_max_inv + (Parameter(
             initializer(1, [1], mstype.float32),
             name="matrix_max" + str(i),
             requires_grad=False), )
     self.log = P.Log()
     self.exp = P.Exp()
     self.sqrt = P.Sqrt()
     self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
     self.assign = P.Assign()
     self.cast = P.Cast()
     self.thor = True
     self.weight_decay = weight_decay * loss_scale
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
     self.expand = P.ExpandDims()
     self.square = P.Square()
     self.inv = P.Inv()
     self.batch_size = batch_size
     self.damping = damping
     self.freq = Tensor(frequency, mstype.int32)
     self.one = Tensor(1, mstype.int32)
     self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                               name="cov_step",
                               requires_grad=False)
     mean = _get_mirror_mean()
     degree = _get_device_num()
     self.grad_reducer_g = DistributedGradReducerThor1(
         self.parameters, 3, mean, degree)