def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, weight_decay=0.0,
             loss_scale=1.0, num_hidden_layers=24, batch_size=12, damping=0.03,
             decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
     """Initialize the THOR optimizer state.

     Args:
         params: Parameters to optimize; forwarded to the base-class
             initializer.
         learning_rate: Learning rate; forwarded to the base-class
             initializer.
         momentum: Momentum value, stored as a float32 ``Parameter`` named
             "momentum". A plain Python number must not be negative.
         matrix_A: Iterable wrapped into ``ParameterTuple`` as
             ``self.matrix_A``.
         matrix_G: Iterable wrapped into ``ParameterTuple`` as
             ``self.matrix_G``.
         weight_decay: Forwarded to the base class; ``self.weight_decay`` is
             then set to ``weight_decay * loss_scale``.
         loss_scale: Forwarded to the base class and used to scale
             ``self.weight_decay``.
         num_hidden_layers: Stored unchanged on the instance.
         batch_size: Stored unchanged on the instance.
         damping: Stored unchanged on the instance.
         decay_filter: Predicate called once per entry of
             ``self.parameters`` to build ``self.decay_flags``. The default
             is true for parameters whose lower-cased name contains neither
             'layernorm' nor 'bias'.

     Raises:
         ValueError: If ``momentum`` is a Python int or float below 0.0.
     """
     super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
     # Check ints as well as floats: a negative int previously slipped through
     # because only float instances were validated. Other types (for example
     # tensors) are still passed through unchecked.
     if isinstance(momentum, (int, float)) and momentum < 0.0:
         raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
     self.params = self.parameters
     # One zero-initialized "moments" entry is cloned per optimized parameter.
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.matmul = P.MatMul()
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.mul = P.Mul()
     self.gather = P.GatherV2()
     # Start out empty; nothing in this constructor fills them.
     self.matrix_A_inv = ()
     self.matrix_G_inv = ()
     self.num_hidden_layers = num_hidden_layers
     self.sqrt = P.Sqrt()
     self.assign = P.Assign()
     self.cast = P.Cast()
     # Marker flag; whatever reads it lives outside this constructor.
     self.thor = True
     # Overrides the attribute with the loss-scaled decay value.
     self.weight_decay = weight_decay * loss_scale
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
     self.expand = P.ExpandDims()
     self.square = P.Square()
     self.inv = P.Inv()
     self.batch_size = batch_size
     self.damping = damping
     self.one = Tensor(1, mstype.int32)
     # Non-trainable int32 parameter of shape [1], initialized to 0.
     self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
Beispiel #2
0
    def __init__(
        self,
        vocab_size,
        embedding_size,
        embedding_shape,
        use_one_hot_embeddings=False,
        initializer_range=0.02,
        batch_size=12,
        damping=0.03,
        loss_scale=1,
        frequency=100,
    ):
        """Create the embedding table, THOR statistics buffers and operators.

        Args:
            vocab_size: Number of rows in the embedding table; also the
                length of ``matrix_A_inv`` and ``dampingA``.
            embedding_size: Number of columns in the embedding table; also
                the side length of the square ``matrix_G_inv``, ``fake_G``
                and ``dampingG`` buffers.
            embedding_shape: Iterable stored as a tuple in ``self.em_shape``.
            use_one_hot_embeddings: Stored unchanged on the instance.
            initializer_range: Argument given to ``TruncatedNormal`` when
                initializing the embedding table.
            batch_size: Stored unchanged on the instance.
            damping: Stored unchanged on the instance.
            loss_scale: Its reciprocal is stored as a float16 tensor in
                ``self.loss_scale``.
            frequency: Stored as an int32 tensor in ``self.freq``.
        """
        super(Embedding_Thor, self).__init__()
        self.vocab_size = vocab_size
        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.embedding_table = Parameter(
            initializer(TruncatedNormal(initializer_range),
                        [vocab_size, embedding_size]))
        # Marker flag; whatever reads it lives outside this constructor.
        self.thor = True
        self.expand = P.ExpandDims()
        self.shape_flat = (-1, )
        # Created once here; the original assigned an identical second
        # P.Gather() to the same attribute further down.
        self.gather = P.Gather()
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.array_mul = P.MatMul()
        self.reshape = P.Reshape()
        self.em_shape = tuple(embedding_shape)
        self.shape = P.Shape()
        # Note: this holds 1 / loss_scale, not loss_scale itself.
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)

        # Non-trainable float16 buffers, zero-initialized.
        self.matrix_A_inv = Parameter(Tensor(
            np.zeros([vocab_size]).astype(np.float16)),
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.zeros([embedding_size, embedding_size]).astype(np.float16)),
                                      requires_grad=False)
        self.fake_G = Tensor(
            np.zeros([embedding_size, embedding_size]).astype(np.float16))
        # All-ones vector and identity matrix, both float32.
        self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32))
        self.dampingG = Tensor(np.identity(embedding_size), mstype.float32)
        # Non-trainable int32 parameter of shape [1], initialized to 0.
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  requires_grad=False)
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.damping = damping
        self.sqrt = P.Sqrt()
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.vector_matmul = P.CusBatchMatMul()
        self.cholesky = P.CusCholeskyTrsm()
        self.matrix_combine = P.CusMatrixCombine()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.inv = P.Inv()
        # self.save_gradient is defined outside this block.
        self.getG = P.InsertGradientOf(self.save_gradient)
        self.batch_size = batch_size
Beispiel #3
0
    def __init__(self):
        """Create the operator instances held by ComputeDescriptor.

        Takes no arguments. Every broadcast target shape is fixed here and
        built from the same two extents, 192 and 138.
        """
        super(ComputeDescriptor, self).__init__()

        # The two extents shared by all BroadcastTo target shapes below.
        extent_a = 192
        extent_b = 138

        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cast = P.Cast()
        self.rsum = P.ReduceSum()

        # Broadcast operators, one per fixed target shape.
        flat_shape = (1, extent_a * extent_b)
        self.broadcastto = P.BroadcastTo(flat_shape)
        self.broadcastto1 = P.BroadcastTo((1, extent_a, extent_b, 3))
        self.broadcastto2 = P.BroadcastTo((1, extent_a, extent_b, 3, 3))
        self.broadcastto3 = P.BroadcastTo((1, extent_a, extent_b, 4))
        self.broadcastto4 = P.BroadcastTo((1, extent_a, extent_b, 4, 3))

        self.expdims = P.ExpandDims()
        self.concat = P.Concat(axis=3)
        self.gather = P.GatherV2()
        self.mul = P.Mul()
        self.slice = P.Slice()
        self.square = P.Square()
        self.inv = P.Inv()
        self.sqrt = P.Sqrt()
        self.ones = P.OnesLike()
        self.eye = P.Eye()
Beispiel #4
0
 def __init__(self, strategy1, strategy2):
     """Create two MatMul operators and one Inv operator with strategies set.

     Args:
         strategy1: Passed to ``set_strategy`` on both MatMul operators.
         strategy2: Passed to ``set_strategy`` on the Inv operator.
     """
     super().__init__()

     # Each attribute keeps whatever set_strategy returns for its operator.
     first_matmul = P.MatMul()
     self.matmul = first_matmul.set_strategy(strategy1)

     inverse_op = P.Inv()
     self.inv = inverse_op.set_strategy(strategy2)

     second_matmul = P.MatMul()
     self.matmul2 = second_matmul.set_strategy(strategy1)
 def __init__(self,
              params,
              learning_rate,
              momentum,
              matrix_A,
              matrix_G,
              A_inv_max,
              G_inv_max,
              weight_decay=0.0,
              loss_scale=1.0,
              num_hidden_layers=24,
              batch_size=12,
              damping=0.03,
              frequency=10,
              decay_filter=lambda x: 'layernorm' not in x.name.lower() and
              'bias' not in x.name.lower()):
     """Initialize the THOR optimizer state, including the gradient reducer.

     Args:
         params: Parameters to optimize; forwarded to the base-class
             initializer.
         learning_rate: Learning rate; forwarded to the base-class
             initializer.
         momentum: Momentum value, stored as a float32 ``Parameter`` named
             "momentum". A plain Python number must not be negative.
         matrix_A: Iterable wrapped into ``ParameterTuple`` as
             ``self.matrix_A``.
         matrix_G: Iterable wrapped into ``ParameterTuple`` as
             ``self.matrix_G``.
         A_inv_max: Iterable wrapped into ``ParameterTuple`` as
             ``self.A_inv_max``.
         G_inv_max: Iterable wrapped into ``ParameterTuple`` as
             ``self.G_inv_max``.
         weight_decay: Forwarded to the base class; ``self.weight_decay`` is
             then set to ``weight_decay * loss_scale``.
         loss_scale: Forwarded to the base class and used to scale
             ``self.weight_decay``.
         num_hidden_layers: Stored on the instance and used to size
             ``self.matrix_max_inv`` (``num_hidden_layers * 6 + 5`` entries).
         batch_size: Stored unchanged on the instance.
         damping: Stored unchanged on the instance.
         frequency: Stored as an int32 tensor in ``self.freq``.
         decay_filter: Predicate called once per entry of
             ``self.parameters`` to build ``self.decay_flags``. The default
             is true for parameters whose lower-cased name contains neither
             'layernorm' nor 'bias'.

     Raises:
         ValueError: If ``momentum`` is a Python int or float below 0.0.
     """
     super(THOR, self).__init__(learning_rate, params, weight_decay,
                                loss_scale)
     # Check ints as well as floats: a negative int previously slipped through
     # because only float instances were validated. Other types (for example
     # tensors) are still passed through unchecked.
     if isinstance(momentum, (int, float)) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32),
                               name="momentum")
     self.params = self.parameters
     # One zero-initialized "moments" entry is cloned per optimized parameter.
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.A_inv_max = ParameterTuple(A_inv_max)
     self.G_inv_max = ParameterTuple(G_inv_max)
     self.matmul = P.MatMul()
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.mul = P.Mul()
     self.gather = P.GatherV2()
     # Start out empty; nothing in this constructor fills them.
     self.matrix_A_inv = ()
     self.matrix_G_inv = ()
     self.num_hidden_layers = num_hidden_layers
     # NOTE(review): the name suggests this counts fully connected layers;
     # the derivation of "* 6 + 5" is not visible here - confirm.
     fc_layer_num = num_hidden_layers * 6 + 5
     # Build all entries in one pass instead of re-concatenating a growing
     # tuple on the instance for every layer.
     max_inv_params = tuple(
         Parameter(initializer(1, [1], mstype.float32),
                   name="matrix_max" + str(i),
                   requires_grad=False)
         for i in range(fc_layer_num))
     self.log = P.Log()
     self.exp = P.Exp()
     self.sqrt = P.Sqrt()
     self.matrix_max_inv = ParameterTuple(max_inv_params)
     self.assign = P.Assign()
     self.cast = P.Cast()
     # Marker flag; whatever reads it lives outside this constructor.
     self.thor = True
     # Overrides the attribute with the loss-scaled decay value.
     self.weight_decay = weight_decay * loss_scale
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
     self.expand = P.ExpandDims()
     self.square = P.Square()
     self.inv = P.Inv()
     self.batch_size = batch_size
     self.damping = damping
     self.freq = Tensor(frequency, mstype.int32)
     self.one = Tensor(1, mstype.int32)
     # Non-trainable int32 parameter of shape [1], initialized to 0.
     self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                               name="cov_step",
                               requires_grad=False)
     # Both helpers are defined outside this block; their results are
     # handed straight to the gradient reducer.
     mean = _get_mirror_mean()
     degree = _get_device_num()
     self.grad_reducer_g = DistributedGradReducerThor1(
         self.parameters, 3, mean, degree)