def __init__( self, vocab_size, embedding_size, embedding_shape, use_one_hot_embeddings=False, initializer_range=0.02, batch_size=12, damping=0.03, loss_scale=1, frequency=100, ): super(Embedding_Thor, self).__init__() self.vocab_size = vocab_size self.use_one_hot_embeddings = use_one_hot_embeddings self.embedding_table = Parameter( initializer(TruncatedNormal(initializer_range), [vocab_size, embedding_size])) self.thor = True self.expand = P.ExpandDims() self.shape_flat = (-1, ) self.gather = P.Gather() self.one_hot = P.OneHot() self.on_value = Tensor(1.0, mstype.float32) self.off_value = Tensor(0.0, mstype.float32) self.array_mul = P.MatMul() self.reshape = P.Reshape() self.em_shape = tuple(embedding_shape) self.shape = P.Shape() self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.matrix_A_inv = Parameter(Tensor( np.zeros([vocab_size]).astype(np.float16)), requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.zeros([embedding_size, embedding_size]).astype(np.float16)), requires_grad=False) self.fake_G = Tensor( np.zeros([embedding_size, embedding_size]).astype(np.float16)) self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32)) self.dampingG = Tensor(np.identity(embedding_size), mstype.float32) self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False) self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.damping = damping self.gather = P.Gather() self.sqrt = P.Sqrt() self.mul = P.Mul() self.cast = P.Cast() self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.vector_matmul = P.CusBatchMatMul() self.cholesky = P.CusCholeskyTrsm() self.matrix_combine = P.CusMatrixCombine() self.reduce_sum = P.ReduceSum(keep_dims=False) self.inv = P.Inv() self.getG = P.InsertGradientOf(self.save_gradient) self.batch_size = batch_size
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', damping=0.03, loss_scale=1, frequency=278, has_bias=True, activation=None): super(Dense_Thor, self).__init__() self.in_channels = Validator.check_positive_int(in_channels) self.out_channels = Validator.check_positive_int(out_channels) self.has_bias = Validator.check_bool(has_bias) self.thor = True if isinstance(weight_init, Tensor): if weight_init.ndim != 2 or weight_init.shape[0] != out_channels or \ weight_init.shape[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.ndim != 1 or bias_init.shape[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") self.matmul = P.MatMul(transpose_b=True) self.bias_add = P.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None self.matrix_A_inv = Parameter(Tensor(np.zeros([128, 128, 16, 16]).astype(np.float16)), name='matrix_A_inv', requires_grad=False) self.matrix_G_inv = Parameter(Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)), name="matrix_G_inv", requires_grad=False) self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)) self.matmul = P.MatMul(transpose_b=True) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.vector_matmul = P.CusBatchMatMul() self.pad = P.Pad(((0, 24), (0, 24))) self.pad1 = P.Pad(((0, 8), (0, 8))) self.slice = P.Slice() self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) self.fused_abs_max1 = P.CusFusedAbsMax1([1000, 1000]) self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.dampingA = Tensor(np.identity(2048), mstype.float32) self.dampingG = Tensor(np.identity(1024), mstype.float32) self.add = P.TensorAdd() self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient)
def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_mode='same', padding=0, dilation=1, group=1, data_format='NCHW', has_bias=False, weight_init='normal', damping=0.03, loss_scale=1, frequency=278, bias_init='zeros'): self.thor = True ksizes = (1, kernel_size, kernel_size, 1) self.hw = kernel_size * kernel_size strides = (1, stride, stride, 1) kernel_size = twice(kernel_size) super(Conv2d_Thor, self).__init__( in_channels, out_channels, kernel_size, stride, pad_mode, padding, dilation, group, data_format, has_bias, weight_init, bias_init, ) self.conv2d = P.Conv2D(out_channel=self.out_channels, kernel_size=self.kernel_size, mode=1, pad_mode=self.pad_mode, pad=self.padding, stride=self.stride, dilation=self.dilation, group=self.group ) self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.transpose02314 = P.CusTranspose02314() self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1] self.matrix_G_dim = self.out_channels self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape(self.matrix_A_dim, self.in_channels, True) self.matrix_G_device_shape, self.matrix_G_device_dim = caculate_device_shape(self.matrix_G_dim, self.in_channels, False) self.matrix_A_device_temp_shape = ( self.matrix_A_device_shape[0], self.matrix_A_device_shape[2], self.matrix_A_device_shape[1], self.matrix_A_device_shape[3]) self.matrix_G_device_temp_shape = ( self.matrix_G_device_shape[0], self.matrix_G_device_shape[2], self.matrix_G_device_shape[1], self.matrix_G_device_shape[3]) self.matrix_A_inv = Parameter( Tensor(np.reshape(np.identity(self.matrix_A_device_dim).astype(np.float16), self.matrix_A_device_shape)), name='matrix_A_inv', requires_grad=False) self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False) self.matrix_G_inv = Parameter( Tensor(np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)), name="matrix_G_inv", requires_grad=False) self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False) self.fake_G = Tensor( np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)) self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = Tensor(damping) self.vector_matmul = P.CusBatchMatMul() self.diag_block_dim = 128 self.channels_slice_flag = False if self.in_channels % C0 != 0: self.channels_slice_flag = True self.padA_flag = False if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \ and self.matrix_A_dim > self.diag_block_dim: self.padA_flag = True pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim self.padA = P.Pad(((0, pad_dim), (0, pad_dim))) self.device_shape_pad_flag = False if self.matrix_A_dim != self.matrix_A_device_dim: self.device_shape_pad_flag = True self.device_shape_pad = P.Pad(((0, 0), (0, C0 - self.in_channels), (0, 0), (0, C0 - self.in_channels))) self.slice = P.Slice() self.gather = P.GatherV2() self.freq = Tensor(frequency, mstype.int32) self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.axis = 0 dampingA_dim = self.matrix_A_dim if (self.matrix_A_dim % self.diag_block_dim) != 0 and self.matrix_A_dim > self.diag_block_dim: dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim dampingG_dim = self.matrix_G_dim if (self.matrix_G_dim % self.diag_block_dim) != 0 and self.matrix_G_dim > self.diag_block_dim: dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32) self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32) self.fused_abs_max1 = P.CusFusedAbsMax1([self.matrix_A_dim, self.matrix_A_dim]) self.fused_abs_max2 = P.CusFusedAbsMax1() self.log = P.Log() self.exp = P.Exp() self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient)
def __init__(self, in_channels, out_channels, weight_init='normal', bias_init='zeros', damping=0.03, loss_scale=1, frequency=100, has_bias=False, activation=None, batch_size=12): super(Dense_Thor, self).__init__() self.in_channels = Validator.check_positive_int(in_channels) self.out_channels = Validator.check_positive_int(out_channels) self.has_bias = Validator.check_bool(has_bias) self.thor = True if isinstance(weight_init, Tensor): if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \ weight_init.shape()[1] != in_channels: raise ValueError("weight_init shape error") self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight") if self.has_bias: if isinstance(bias_init, Tensor): if bias_init.dim() != 1 or bias_init.shape( )[0] != out_channels: raise ValueError("bias_init shape error") self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias") self.matmul = P.MatMul(transpose_b=True) self.bias_add = P.BiasAdd() self.activation = get_activation(activation) self.activation_flag = self.activation is not None self.matrix_A_inv = Parameter(Tensor( np.zeros([in_channels, in_channels]).astype(np.float16)), name='matrix_A_inv', requires_grad=False) self.matrix_G_inv = Parameter(Tensor( np.zeros([out_channels, out_channels]).astype(np.float16)), name="matrix_G_inv", requires_grad=False) self.fake_G = Tensor( np.zeros([out_channels, out_channels]).astype(np.float16)) self.matmul = P.MatMul(transpose_b=True) self.cube_matmul = P.CusMatMulCube(transpose_a=True) self.matrix_combine = P.CusMatrixCombine() self.cholesky = P.CusCholeskyTrsm() self.shape = P.Shape() self.reshape = P.Reshape() self.transpose = P.Transpose() self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False) self.mul = P.Mul() self.cast = P.Cast() self.damping = damping self.loss_scale = Tensor(1 / loss_scale, mstype.float16) self.vector_matmul = P.CusBatchMatMul() self.gather = P.GatherV2() self.assignadd = P.AssignAdd() self.freq = Tensor(frequency, mstype.int32) self.axis = 0 self.abs = P.Abs() self.reduce_max = P.ReduceMax(keep_dims=False) self.log = P.Log() self.exp = P.Exp() self.dampingA = Tensor(np.identity(in_channels), mstype.float32) self.dampingG = Tensor(np.identity(out_channels), mstype.float32) self.sqrt = P.Sqrt() self.getG = P.InsertGradientOf(self.save_gradient) self.batch_size = batch_size