def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G,
             weight_decay=0.0, loss_scale=1.0, num_hidden_layers=24,
             batch_size=12, damping=0.03,
             decay_filter=lambda x: 'layernorm' not in x.name.lower()
             and 'bias' not in x.name.lower()):
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, "
                         "but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    # Per-layer second-order factors: matrix_A holds the input-activation
    # covariances, matrix_G the output-gradient covariances.
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.matmul = P.MatMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.gather = P.GatherV2()
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.num_hidden_layers = num_hidden_layers
    self.sqrt = P.Sqrt()
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    # Fold the loss scale into weight decay; decay_flags marks which
    # parameters receive decay (LayerNorm and bias are excluded by default).
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    self.expand = P.ExpandDims()
    self.square = P.Square()
    self.inv = P.Inv()
    self.batch_size = batch_size
    self.damping = damping
    self.one = Tensor(1, mstype.int32)
    # Counter tracking how many covariance-update steps have run.
    self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                              name="cov_step", requires_grad=False)
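# A quick, self-contained check of the default decay_filter above (plain
# Python; SimpleNamespace stands in for real Parameter objects, and the
# parameter names are made up for illustration): LayerNorm and bias
# parameters are excluded from weight decay.
from types import SimpleNamespace

decay_filter = lambda x: ('layernorm' not in x.name.lower()
                          and 'bias' not in x.name.lower())
params = [SimpleNamespace(name=n) for n in (
    "bert.encoder.layer0.dense.weight",
    "bert.encoder.layer0.dense.bias",
    "bert.encoder.layer0.layernorm.gamma")]
print([decay_filter(p) for p in params])  # [True, False, False]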
def __init__(self,
             vocab_size,
             embedding_size,
             embedding_shape,
             use_one_hot_embeddings=False,
             initializer_range=0.02,
             batch_size=12,
             damping=0.03,
             loss_scale=1,
             frequency=100):
    super(Embedding_Thor, self).__init__()
    self.vocab_size = vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.embedding_table = Parameter(
        initializer(TruncatedNormal(initializer_range),
                    [vocab_size, embedding_size]))
    self.thor = True
    self.expand = P.ExpandDims()
    self.shape_flat = (-1,)
    self.gather = P.Gather()
    self.one_hot = P.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.0, mstype.float32)
    self.array_mul = P.MatMul()
    self.reshape = P.Reshape()
    self.em_shape = tuple(embedding_shape)
    self.shape = P.Shape()
    # Reciprocal of the loss scale, kept as a tensor so scaled gradients
    # can be un-scaled cheaply inside the graph.
    self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
    # Inverse covariance factors kept in float16. matrix_A_inv is a vector:
    # for one-hot embedding inputs the activation covariance is diagonal.
    self.matrix_A_inv = Parameter(
        Tensor(np.zeros([vocab_size]).astype(np.float16)),
        requires_grad=False)
    self.matrix_G_inv = Parameter(
        Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16)),
        requires_grad=False)
    self.fake_G = Tensor(
        np.zeros([embedding_size, embedding_size]).astype(np.float16))
    self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32))
    self.dampingG = Tensor(np.identity(embedding_size), mstype.float32)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                              requires_grad=False)
    self.freq = Tensor(frequency, mstype.int32)
    self.axis = 0
    self.damping = damping
    self.sqrt = P.Sqrt()
    self.mul = P.Mul()
    self.cast = P.Cast()
    # Ascend custom (Cus*) cube ops that build and invert the factors.
    self.cube_matmul = P.CusMatMulCube(transpose_a=True)
    self.vector_matmul = P.CusBatchMatMul()
    self.cholesky = P.CusCholeskyTrsm()
    self.matrix_combine = P.CusMatrixCombine()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.inv = P.Inv()
    # Backward hook: routes the embedding output's gradient through
    # save_gradient so the G statistics can be captured during backprop.
    self.getG = P.InsertGradientOf(self.save_gradient)
    self.batch_size = batch_size
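# A plain-numpy sketch of why matrix_A_inv above can be stored as a vector
# (the vocab size, ids, and shapes here are made-up illustration values):
# for one-hot rows x, the covariance factor sum_i x_i x_i^T is diagonal,
# holding per-token counts, so only its diagonal needs storing and inverting.
import numpy as np

vocab = 5
ids = np.array([0, 2, 2, 4])
onehot = np.eye(vocab)[ids]                    # (4, 5) one-hot activations
A = onehot.T @ onehot                          # (5, 5) covariance factor
print(np.array_equal(A, np.diag(np.diag(A))))  # True: off-diagonals are zero
print(np.diag(A))                              # token counts: [1. 0. 2. 0. 1.]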
def __init__(self):
    super(ComputeDescriptor, self).__init__()
    self.reshape = P.Reshape()
    self.transpose = P.Transpose()
    self.cast = P.Cast()
    self.rsum = P.ReduceSum()
    # Broadcast ops with hard-coded target shapes (apparently batch 1,
    # 192 atoms, 138 neighbors, 3 Cartesian axes, 4 descriptor channels).
    self.broadcastto = P.BroadcastTo((1, 192 * 138))
    self.broadcastto1 = P.BroadcastTo((1, 192, 138, 3))
    self.broadcastto2 = P.BroadcastTo((1, 192, 138, 3, 3))
    self.broadcastto3 = P.BroadcastTo((1, 192, 138, 4))
    self.broadcastto4 = P.BroadcastTo((1, 192, 138, 4, 3))
    self.expdims = P.ExpandDims()
    self.concat = P.Concat(axis=3)
    self.gather = P.GatherV2()
    self.mul = P.Mul()
    self.slice = P.Slice()
    self.square = P.Square()
    self.inv = P.Inv()
    self.sqrt = P.Sqrt()
    self.ones = P.OnesLike()
    self.eye = P.Eye()
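# Smoke test for the element-wise ops registered above (a sketch; PyNative
# mode is assumed so the primitives can be called eagerly). Inv is the
# element-wise reciprocal, which a descriptor like this presumably pairs
# with Square/Sqrt for reciprocal-distance terms.
import numpy as np
from mindspore import Tensor, context
from mindspore.ops import operations as P

context.set_context(mode=context.PYNATIVE_MODE)
r = Tensor(np.array([1.0, 2.0, 4.0], np.float32))
print(P.Inv()(r))              # [1.   0.5  0.25]
print(P.Inv()(P.Square()(r)))  # [1.     0.25   0.0625]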
def __init__(self, strategy1, strategy2):
    super().__init__()
    self.matmul = P.MatMul().set_strategy(strategy1)
    self.inv = P.Inv().set_strategy(strategy2)
    self.matmul2 = P.MatMul().set_strategy(strategy1)
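# The excerpt stops at __init__; a plausible construct for this parallel test
# cell (an assumption, not the original code) simply chains the sharded ops:
#
#     def construct(self, x, y, b):
#         out = self.matmul(x, y)
#         out = self.inv(out)
#         return self.matmul2(out, b)
#
# Note that set_strategy is the pre-1.0 MindSpore spelling; later releases
# renamed it to shard.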
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G,
             A_inv_max, G_inv_max, weight_decay=0.0, loss_scale=1.0,
             num_hidden_layers=24, batch_size=12, damping=0.03, frequency=10,
             decay_filter=lambda x: 'layernorm' not in x.name.lower()
             and 'bias' not in x.name.lower()):
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, "
                         "but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    # Running maxima of the inverse factors, used to rescale the update.
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.matmul = P.MatMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.gather = P.GatherV2()
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.matrix_max_inv = ()
    self.num_hidden_layers = num_hidden_layers
    # One scalar matrix_max parameter per dense layer tracked by THOR:
    # six weight matrices per hidden layer plus five outside the encoder.
    fc_layer_num = num_hidden_layers * 6 + 5
    for i in range(fc_layer_num):
        self.matrix_max_inv = self.matrix_max_inv + (
            Parameter(initializer(1, [1], mstype.float32),
                      name="matrix_max" + str(i), requires_grad=False),)
    self.log = P.Log()
    self.exp = P.Exp()
    self.sqrt = P.Sqrt()
    self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    self.expand = P.ExpandDims()
    self.square = P.Square()
    self.inv = P.Inv()
    self.batch_size = batch_size
    self.damping = damping
    self.freq = Tensor(frequency, mstype.int32)
    self.one = Tensor(1, mstype.int32)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                              name="cov_step", requires_grad=False)
    # Distributed reducer for the second-order statistics; mean and degree
    # come from the auto-parallel context.
    mean = _get_mirror_mean()
    degree = _get_device_num()
    self.grad_reducer_g = DistributedGradReducerThor1(self.parameters, 3,
                                                      mean, degree)
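# Scale sketch (placeholder values, not from the source): with the default
# num_hidden_layers=24 the loop above allocates 24 * 6 + 5 = 149 scalar
# matrix_max parameters, and the A_inv_max / G_inv_max arguments are expected
# to match, one scalar per tracked dense layer.
from mindspore import Parameter
from mindspore.common import dtype as mstype
from mindspore.common.initializer import initializer

fc_layer_num = 24 * 6 + 5  # 149
A_inv_max = [Parameter(initializer(0, [1], mstype.float32),
                       name="A_inv_max" + str(i), requires_grad=False)
             for i in range(fc_layer_num)]
G_inv_max = [Parameter(initializer(0, [1], mstype.float32),
                       name="G_inv_max" + str(i), requires_grad=False)
             for i in range(fc_layer_num)]
print(len(A_inv_max), len(G_inv_max))  # 149 149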