Beispiel #1
0
    def __init__(
        self,
        vocab_size,
        embedding_size,
        embedding_shape,
        use_one_hot_embeddings=False,
        initializer_range=0.02,
        batch_size=12,
        damping=0.03,
        loss_scale=1,
        frequency=100,
    ):
        super(Embedding_Thor, self).__init__()
        self.vocab_size = vocab_size
        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.embedding_table = Parameter(
            initializer(TruncatedNormal(initializer_range),
                        [vocab_size, embedding_size]))
        self.thor = True
        self.expand = P.ExpandDims()
        self.shape_flat = (-1, )
        self.gather = P.Gather()
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.array_mul = P.MatMul()
        self.reshape = P.Reshape()
        self.em_shape = tuple(embedding_shape)
        self.shape = P.Shape()
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)

        self.matrix_A_inv = Parameter(Tensor(
            np.zeros([vocab_size]).astype(np.float16)),
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.zeros([embedding_size, embedding_size]).astype(np.float16)),
                                      requires_grad=False)
        self.fake_G = Tensor(
            np.zeros([embedding_size, embedding_size]).astype(np.float16))
        self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32))
        self.dampingG = Tensor(np.identity(embedding_size), mstype.float32)
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  requires_grad=False)
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.damping = damping
        self.gather = P.Gather()
        self.sqrt = P.Sqrt()
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.vector_matmul = P.CusBatchMatMul()
        self.cholesky = P.CusCholeskyTrsm()
        self.matrix_combine = P.CusMatrixCombine()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.inv = P.Inv()
        self.getG = P.InsertGradientOf(self.save_gradient)
        self.batch_size = batch_size
Beispiel #2
0
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 has_bias=True,
                 activation=None):
        super(Dense_Thor, self).__init__()
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.has_bias = Validator.check_bool(has_bias)
        self.thor = True
        if isinstance(weight_init, Tensor):
            if weight_init.ndim != 2 or weight_init.shape[0] != out_channels or \
                    weight_init.shape[1] != in_channels:
                raise ValueError("weight_init shape error")

        self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight")

        if self.has_bias:
            if isinstance(bias_init, Tensor):
                if bias_init.ndim != 1 or bias_init.shape[0] != out_channels:
                    raise ValueError("bias_init shape error")

            self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias")

        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()

        self.activation = get_activation(activation)
        self.activation_flag = self.activation is not None

        self.matrix_A_inv = Parameter(Tensor(np.zeros([128, 128, 16, 16]).astype(np.float16)), name='matrix_A_inv',
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16)), name="matrix_G_inv",
                                      requires_grad=False)
        self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16))

        self.matmul = P.MatMul(transpose_b=True)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = Tensor(damping)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.vector_matmul = P.CusBatchMatMul()
        self.pad = P.Pad(((0, 24), (0, 24)))
        self.pad1 = P.Pad(((0, 8), (0, 8)))
        self.slice = P.Slice()
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False)
        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False)
        self.fused_abs_max1 = P.CusFusedAbsMax1([1000, 1000])
        self.fused_abs_max2 = P.CusFusedAbsMax1()
        self.log = P.Log()
        self.exp = P.Exp()
        self.dampingA = Tensor(np.identity(2048), mstype.float32)
        self.dampingG = Tensor(np.identity(1024), mstype.float32)
        self.add = P.TensorAdd()
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)
Beispiel #3
0
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 pad_mode='same',
                 padding=0,
                 dilation=1,
                 group=1,
                 data_format='NCHW',
                 has_bias=False,
                 weight_init='normal',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 bias_init='zeros'):
        self.thor = True
        ksizes = (1, kernel_size, kernel_size, 1)
        self.hw = kernel_size * kernel_size
        strides = (1, stride, stride, 1)
        kernel_size = twice(kernel_size)
        super(Conv2d_Thor, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            pad_mode,
            padding,
            dilation,
            group,
            data_format,
            has_bias,
            weight_init,
            bias_init,
        )
        self.conv2d = P.Conv2D(out_channel=self.out_channels,
                               kernel_size=self.kernel_size,
                               mode=1,
                               pad_mode=self.pad_mode,
                               pad=self.padding,
                               stride=self.stride,
                               dilation=self.dilation,
                               group=self.group
                               )

        self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.transpose02314 = P.CusTranspose02314()
        self.matrix_A_dim = self.in_channels * self.kernel_size[0] * self.kernel_size[1]
        self.matrix_G_dim = self.out_channels
        self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape(self.matrix_A_dim,
                                                                                     self.in_channels, True)
        self.matrix_G_device_shape, self.matrix_G_device_dim = caculate_device_shape(self.matrix_G_dim,
                                                                                     self.in_channels, False)
        self.matrix_A_device_temp_shape = (
            self.matrix_A_device_shape[0], self.matrix_A_device_shape[2], self.matrix_A_device_shape[1],
            self.matrix_A_device_shape[3])
        self.matrix_G_device_temp_shape = (
            self.matrix_G_device_shape[0], self.matrix_G_device_shape[2], self.matrix_G_device_shape[1],
            self.matrix_G_device_shape[3])
        self.matrix_A_inv = Parameter(
            Tensor(np.reshape(np.identity(self.matrix_A_device_dim).astype(np.float16), self.matrix_A_device_shape)),
            name='matrix_A_inv', requires_grad=False)
        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32), name="A_inv_max", requires_grad=False)
        self.matrix_G_inv = Parameter(
            Tensor(np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape)),
            name="matrix_G_inv", requires_grad=False)

        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32), name="G_inv_max", requires_grad=False)
        self.fake_G = Tensor(
            np.reshape(np.identity(self.matrix_G_device_dim).astype(np.float16), self.matrix_G_device_shape))

        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = Tensor(damping)
        self.vector_matmul = P.CusBatchMatMul()
        self.diag_block_dim = 128
        self.channels_slice_flag = False
        if self.in_channels % C0 != 0:
            self.channels_slice_flag = True

        self.padA_flag = False
        if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \
                and self.matrix_A_dim > self.diag_block_dim:
            self.padA_flag = True
            pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim
            self.padA = P.Pad(((0, pad_dim), (0, pad_dim)))
        self.device_shape_pad_flag = False
        if self.matrix_A_dim != self.matrix_A_device_dim:
            self.device_shape_pad_flag = True
            self.device_shape_pad = P.Pad(((0, 0), (0, C0 - self.in_channels), (0, 0), (0, C0 - self.in_channels)))
        self.slice = P.Slice()
        self.gather = P.GatherV2()
        self.freq = Tensor(frequency, mstype.int32)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.axis = 0

        dampingA_dim = self.matrix_A_dim
        if (self.matrix_A_dim % self.diag_block_dim) != 0 and self.matrix_A_dim > self.diag_block_dim:
            dampingA_dim = (self.matrix_A_dim // self.diag_block_dim + 1) * self.diag_block_dim
        dampingG_dim = self.matrix_G_dim
        if (self.matrix_G_dim % self.diag_block_dim) != 0 and self.matrix_G_dim > self.diag_block_dim:
            dampingG_dim = (self.matrix_G_dim // self.diag_block_dim + 1) * self.diag_block_dim

        self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32)
        self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32)
        self.fused_abs_max1 = P.CusFusedAbsMax1([self.matrix_A_dim, self.matrix_A_dim])
        self.fused_abs_max2 = P.CusFusedAbsMax1()
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)
Beispiel #4
0
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 damping=0.03,
                 loss_scale=1,
                 frequency=100,
                 has_bias=False,
                 activation=None,
                 batch_size=12):
        super(Dense_Thor, self).__init__()
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.has_bias = Validator.check_bool(has_bias)
        self.thor = True
        if isinstance(weight_init, Tensor):
            if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \
                    weight_init.shape()[1] != in_channels:
                raise ValueError("weight_init shape error")

        self.weight = Parameter(initializer(weight_init,
                                            [out_channels, in_channels]),
                                name="weight")

        if self.has_bias:
            if isinstance(bias_init, Tensor):
                if bias_init.dim() != 1 or bias_init.shape(
                )[0] != out_channels:
                    raise ValueError("bias_init shape error")

            self.bias = Parameter(initializer(bias_init, [out_channels]),
                                  name="bias")

        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()

        self.activation = get_activation(activation)
        self.activation_flag = self.activation is not None
        self.matrix_A_inv = Parameter(Tensor(
            np.zeros([in_channels, in_channels]).astype(np.float16)),
                                      name='matrix_A_inv',
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.zeros([out_channels, out_channels]).astype(np.float16)),
                                      name="matrix_G_inv",
                                      requires_grad=False)
        self.fake_G = Tensor(
            np.zeros([out_channels, out_channels]).astype(np.float16))

        self.matmul = P.MatMul(transpose_b=True)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  name="cov_step",
                                  requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = damping
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.vector_matmul = P.CusBatchMatMul()
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.abs = P.Abs()
        self.reduce_max = P.ReduceMax(keep_dims=False)
        self.log = P.Log()
        self.exp = P.Exp()
        self.dampingA = Tensor(np.identity(in_channels), mstype.float32)
        self.dampingG = Tensor(np.identity(out_channels), mstype.float32)
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)
        self.batch_size = batch_size