def __init__(self, dim, n_heads):
    """Build the operators and constants for a multi-head attention-style cell.

    Args:
        dim: Total feature width; also the input and output width of the
            bias-free output projection.
        n_heads: Number of heads (``h``) the feature width is split into.
    """
    super().__init__()
    # h
    self.n_heads = n_heads
    # v = V / h (floor division, so any remainder of dim / n_heads is dropped)
    self.size_per_head = dim // n_heads
    # shape = (h, v)
    self.reshape_tail = (self.n_heads, self.size_per_head)

    # Scaling factor 1 / sqrt(v), kept as a float32 tensor.
    inv_sqrt_head = 1.0 / np.sqrt(float(self.size_per_head))
    self.scores_mul = ms.Tensor(inv_sqrt_head, ms.float32)
    # int32 ones whose only non-unit axis has length h.
    self.exones = P.Ones()((1, 1, n_heads, 1, 1), ms.int32)

    self.output = Dense(dim, dim, has_bias=False)

    # Arithmetic / reduction primitives.
    self.mul = P.Mul()
    self.div = P.Div()
    self.softmax = P.Softmax()
    self.reducesum = P.ReduceSum(keep_dims=True)

    # Batched matmul: plain, and with the right operand transposed.
    self.bmm = P.BatchMatMul()
    self.bmmt = P.BatchMatMul(transpose_b=True)

    # Layout helpers.
    self.squeeze = P.Squeeze(-2)
    self.transpose = P.Transpose()
    self.trans_shape = (0, 1, 3, 2, 4)
def __init__(self):
    """Instantiate the primitive operators this cell uses."""
    super().__init__()
    # Element-wise product and a sum reduction that keeps reduced axes.
    self.Mul = P.Mul()
    self.ReduceSum = P.ReduceSum(keep_dims=True)
    # Batched matmul variants: plain, left operand transposed,
    # right operand transposed.
    self.BatchMatMul = P.BatchMatMul()
    self.BatchMatMul_a = P.BatchMatMul(transpose_a=True)
    self.BatchMatMul_b = P.BatchMatMul(transpose_b=True)
def __init__(self,
             batch_size=512,
             d_model=768,
             seq_length=1024,
             num_attention_heads=12,
             dim_per_head=64,
             has_attention_mask=True,
             do_return_2d_tensor=True,
             attention_dropout=0.0,
             compute_type=mstype.float32):
    """Configure a masked self-attention cell.

    Args:
        batch_size: Batch size baked into the static reshape targets.
        d_model: Model width; the fused projection produces ``3 * d_model``.
        seq_length: Sequence length baked into the static reshape targets.
        num_attention_heads: Number of attention heads.
        dim_per_head: Width of each head.
        has_attention_mask: Must be truthy; an ``assert`` rejects anything else.
        do_return_2d_tensor: Selects a 2-D ``(batch * seq, d_model)`` return
            shape instead of the 3-D ``(batch, seq, d_model)`` one.
        attention_dropout: Dropout rate; ``1 - attention_dropout`` is handed
            to ``nn.Dropout``.
        compute_type: dtype of the scale and mask constants.
    """
    super(MaskedSelfAttention, self).__init__()

    # Plain configuration values.
    self.batch_size = batch_size
    self.d_model = d_model
    self.seq_length = seq_length
    self.num_heads = num_attention_heads
    self.dim_per_head = dim_per_head
    self.has_attention_mask = has_attention_mask
    assert has_attention_mask

    # Constant tensors: attention scale 1 / sqrt(dim_per_head) and the mask constant.
    inv_sqrt_dim = 1.0 / math.sqrt(float(self.dim_per_head))
    self.scale = Tensor([inv_sqrt_dim], dtype=compute_type)
    self.mask_data = Tensor([-10000.0], dtype=compute_type)

    self.split_head_shape = (
        self.batch_size,
        self.seq_length,
        self.num_heads,
        self.dim_per_head,
    )

    # Fused q/k/v projection followed by the output projection.
    self.c_attn = Conv1D(d_model, d_model * 3)
    self.c_proj = Conv1D(d_model, d_model)

    # P.Split(axis, output_num)
    self.split_for_qkv = P.Split(1, 3)
    self.reshape = P.Reshape()
    self.transpose = P.Transpose()
    self.trans_shape = (0, 2, 1, 3)
    self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
    self.matmul = P.BatchMatMul()
    self.multiply = P.Mul()

    # Operators only needed when a mask is applied.
    if self.has_attention_mask:
        self.expand_dims = P.ExpandDims()
        self.sub = P.Sub()
        self.add = P.TensorAdd()
        self.cast = P.Cast()
        self.get_dtype = P.DType()

    self.shape_return = (
        (batch_size * seq_length, d_model)
        if do_return_2d_tensor
        else (batch_size, seq_length, d_model)
    )

    self.softmax = nn.Softmax()
    self.softmax_cast = P.Cast()
    self.dropout = nn.Dropout(1 - attention_dropout)
    self.use_attention_dropout = attention_dropout > 0
def __init__(self,
             is_training,
             query_size,
             key_size,
             num_units,
             normalize=False,
             initializer_range=0.1,
             compute_type=mstype.float16):
    """Configure a Bahdanau attention cell.

    Args:
        is_training: Stored as ``self.is_training``.
        query_size: Input width of the query projection.
        key_size: Input width of the key projection.
        num_units: Output width of both projections and length of the
            ``linear_att`` vector.
        normalize: When truthy, also creates the ``normalize_scalar`` and
            ``normalize_bias`` parameters.
        initializer_range: Bound of the uniform initialisation used for
            ``linear_att`` and both projection weights.
        compute_type: dtype the two projections are cast to.
    """
    super(BahdanauAttention, self).__init__()

    self.is_training = is_training
    self.mask = None
    self.query_size = query_size
    self.key_size = key_size
    self.normalize = normalize
    self.num_units = num_units

    # Attention vector, drawn uniformly from [-range, range).
    att_values = np.random.uniform(-initializer_range, initializer_range, size=[num_units])
    self.linear_att = Parameter(Tensor(att_values, dtype=mstype.float32), name='linear_att')

    if self.normalize:
        scalar_values = np.array([1.0 / num_units])
        self.normalize_scalar = Parameter(Tensor(scalar_values, dtype=mstype.float32),
                                          name='normalize_scalar')
        self.normalize_bias = Parameter(Tensor(np.zeros(num_units), dtype=mstype.float32),
                                        name='normalize_bias')

    # Bias-free projections of query and key into the shared attention space.
    self.linear_q = nn.Dense(query_size,
                             num_units,
                             has_bias=False,
                             weight_init=Uniform(initializer_range)).to_float(compute_type)
    self.linear_k = nn.Dense(key_size,
                             num_units,
                             has_bias=False,
                             weight_init=Uniform(initializer_range)).to_float(compute_type)

    # Sub-cells.
    self.norm = nn.Norm(axis=-1)
    self.tanh = nn.Tanh()
    self.softmax = nn.Softmax(axis=-1)

    # Shape / layout primitives.
    self.transpose = P.Transpose()
    self.transpose_orders = (1, 0, 2)
    self.shape_op = P.Shape()
    self.expand = P.ExpandDims()
    self.tile = P.Tile()
    self.reshape = P.Reshape()
    self.cast = P.Cast()

    # Arithmetic primitives.
    self.mul = P.Mul()
    self.matmul = P.MatMul()
    self.batchMatmul = P.BatchMatMul()
    self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
def __init__(self, strategy1, strategy2):
    """Create two sharded batched matmuls, a fused batch-norm op and its parameters.

    Args:
        strategy1: Shard strategy applied to the first ``BatchMatMul``.
        strategy2: Shard strategy applied to the second ``BatchMatMul``.
    """
    super().__init__()

    def ones_param(param_name):
        # Every batch-norm parameter is a float32 vector of 64 ones.
        return Parameter(Tensor(np.ones([64]), dtype=ms.float32), name=param_name)

    self.matmul1 = P.BatchMatMul().shard(strategy1)
    self.norm = P.FusedBatchNormEx()
    self.gamma = ones_param("gamma")
    self.beta = ones_param("beta")
    self.mean = ones_param("mean")
    self.var = ones_param("var")
    self.matmul2 = P.BatchMatMul().shard(strategy2)
def __init__(self, feature_in_dim, feature_out_dim, dropout=0.2):
    """Configure the attention-convolution cell.

    Args:
        feature_in_dim: Input feature width; the output weight has
            ``2 * feature_in_dim`` rows.
        feature_out_dim: Output feature width (columns of the output weight).
        dropout: Dropout rate; the layer is built with
            ``keep_prob = 1 - dropout``.
    """
    super(AttenConv, self).__init__()

    # Xavier-uniform output weight of shape (2 * in, out).
    weight_shape = [feature_in_dim * 2, feature_out_dim]
    self.out_weight = Parameter(initializer("XavierUniform", weight_shape, dtype=mstype.float32))

    # Layout primitives.
    self.cast = P.Cast()
    self.squeeze = P.Squeeze(1)
    self.concat = P.Concat(axis=1)
    self.expanddims = P.ExpandDims()

    # Arithmetic primitives.
    self.softmax = P.Softmax(axis=-1)
    self.matmul = P.MatMul()
    self.matmul_3 = P.BatchMatMul()
    self.matmul_t = P.BatchMatMul(transpose_b=True)

    self.dropout = nn.Dropout(keep_prob=1 - dropout)
def __init__(self,
             length,
             depth,
             max_relative_position,
             initializer_range,
             use_one_hot_embeddings=False):
    """Configure the relative-position embedding generator.

    Args:
        length: Forwarded to ``RelaPosMatrixGenerator``.
        depth: Width of each embedding row.
        max_relative_position: Clipping distance; the table holds
            ``2 * max_relative_position + 1`` rows.
        initializer_range: Argument of the ``TruncatedNormal`` initializer.
        use_one_hot_embeddings: Stored flag; how it is consumed is outside
            this constructor.
    """
    super(RelaPosEmbeddingsGenerator, self).__init__()
    self.depth = depth
    self.vocab_size = max_relative_position * 2 + 1
    self.use_one_hot_embeddings = use_one_hot_embeddings

    # Embedding table with one row per relative position.
    table_init = initializer(TruncatedNormal(initializer_range), [self.vocab_size, self.depth])
    self.embeddings_table = Parameter(table_init, name='embeddings_for_position')

    self.relative_positions_matrix = RelaPosMatrixGenerator(
        length=length,
        max_relative_position=max_relative_position,
    )

    # One-hot operator and its on/off values.
    self.one_hot = P.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.0, mstype.float32)

    self.reshape = P.Reshape()
    self.shape = P.Shape()
    self.gather = P.GatherV2()  # index_select
    self.matmul = P.BatchMatMul()
def __init__(self, index):
    """Create a batched-matmul + ReLU cell with its own weight.

    Args:
        index: Suffix appended to ``"matmul_w"`` to name the weight.
    """
    super().__init__()
    self.matmul = P.BatchMatMul()
    self.relu = P.ReLU()

    # float32 weight of ones with shape (8, 8, 8, 8), named per instance.
    weight_name = "matmul_w" + str(index)
    weight_value = Tensor(np.ones([8, 8, 8, 8]), dtype=ms.float32)
    self.weight = Parameter(weight_value, weight_name)
def __init__(self, hidden_size, output_size, max_length, dropout_p=0.1):
    """Configure the attention decoder.

    Args:
        hidden_size: Width of the embedding and of the GRU state.
        output_size: Embedding vocabulary size and width of the final
            projection.
        max_length: Output width of the attention-weight projection.
        dropout_p: Dropout rate; the layer uses ``keep_prob = 1 - dropout_p``.
    """
    super(AttnDecoderRNN, self).__init__()
    self.hidden_size = hidden_size
    self.output_size = output_size
    self.dropout_p = dropout_p
    self.max_length = max_length

    half = mstype.float16
    doubled_width = self.hidden_size * 2

    # Sub-cells; the dense layers and the GRU are cast to float16.
    self.embedding = nn.Embedding(self.output_size, self.hidden_size)
    self.attn = nn.Dense(in_channels=doubled_width,
                         out_channels=self.max_length).to_float(half)
    self.attn_combine = nn.Dense(in_channels=doubled_width,
                                 out_channels=self.hidden_size).to_float(half)
    self.dropout = nn.Dropout(keep_prob=1.0 - self.dropout_p)
    self.gru = GRU(hidden_size, hidden_size).to_float(half)
    self.out = nn.Dense(in_channels=self.hidden_size,
                        out_channels=self.output_size).to_float(half)

    # Layout primitives.
    self.transpose = P.Transpose()
    self.concat = P.Concat(axis=2)
    self.concat1 = P.Concat(axis=1)
    self.unsqueeze = P.ExpandDims()
    self.squeeze = P.Squeeze(1)
    self.squeeze1 = P.Squeeze(0)
    self.cast = P.Cast()

    # Arithmetic / activation primitives.
    self.softmax = P.Softmax(axis=1)
    self.log_softmax = P.LogSoftmax(axis=1)
    self.relu = P.ReLU()
    self.bmm = P.BatchMatMul()
def __init__(self, config, scale=1.0, layer_idx=None):
    """Configure the attention cell.

    Args:
        config: Object providing ``embedding_size``, ``num_heads``,
            ``compute_dtype``, ``dropout_rate`` and ``use_past``.
        scale: Forwarded to the output ``Mapping``; when truthy a
            ``sqrt(size_per_head)`` scale factor tensor is also created.
        layer_idx: Optional layer index; when given, ``self.coeff`` is set to
            ``sqrt(layer_idx * sqrt(size_per_head))`` as a tensor.
    """
    super(Attention, self).__init__()
    width = config.embedding_size

    def make_projection():
        # Square dense layer in the configured compute dtype.
        return nn.Dense(width, width).to_float(config.compute_dtype)

    self.get_attention_mask = AttentionMask(config)
    self.projection = Mapping(width, width, config.compute_dtype, scale)

    # Head geometry.
    self.n_head = config.num_heads
    self.size_per_head = config.embedding_size // self.n_head

    # Primitives.
    self.split = P.Split(axis=-1, output_num=3)
    self.transpose = P.Transpose()
    self.reshape = P.Reshape()
    self.concat_k = P.Concat(axis=3)
    self.concat_v = P.Concat(axis=2)
    self.batch_matmul = P.BatchMatMul()

    self.multiply_data = Tensor([-10000.0], dtype=mstype.float32)

    # Optional scaling constants.
    self.scale = scale
    if self.scale:
        self.scale_factor = Tensor(math.sqrt(self.size_per_head))
    if layer_idx is not None:
        self.coeff = math.sqrt(layer_idx * math.sqrt(self.size_per_head))
        self.coeff = Tensor(self.coeff)

    self.use_past = config.use_past

    # Two independent dropout layers built from the same rate.
    self.dropout = nn.Dropout(1 - config.dropout_rate)
    self.prob_dropout = nn.Dropout(1 - config.dropout_rate)

    self.dense1 = make_projection()
    self.dense2 = make_projection()
    self.dense3 = make_projection()
def __init__(self,
             in_channels,
             out_channels,
             weight_init='normal',
             bias_init='zeros',
             has_bias=True,
             activation=None):
    """Configure a dense (fully connected) layer.

    Args:
        in_channels: Positive int, input width.
        out_channels: Positive int, output width.
        weight_init: Initializer spec or a Tensor of shape
            ``(out_channels, in_channels)``.
        bias_init: Initializer spec or a Tensor of shape ``(out_channels,)``.
        has_bias: Whether a bias parameter and ``BiasAdd`` op are created.
        activation: ``None``, an activation name, a Cell or a Primitive.

    Raises:
        ValueError: If a Tensor initializer has the wrong shape.
        TypeError: If ``activation`` does not resolve to a Cell or Primitive.
    """
    super(Dense, self).__init__()
    self.in_channels = Validator.check_positive_int(in_channels)
    self.out_channels = Validator.check_positive_int(out_channels)
    self.has_bias = Validator.check_bool(has_bias)
    self.shape_op = P.Shape()

    # A Tensor weight initializer must already be (out_channels, in_channels).
    if isinstance(weight_init, Tensor) and not (
            weight_init.dim() == 2
            and weight_init.shape[0] == out_channels
            and weight_init.shape[1] == in_channels):
        raise ValueError("Weight init shape error.")
    self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]), name="weight")

    self.bias = None
    if self.has_bias:
        # A Tensor bias initializer must already be (out_channels,).
        if isinstance(bias_init, Tensor) and not (
                bias_init.dim() == 1 and bias_init.shape[0] == out_channels):
            raise ValueError("Bias init shape error.")
        self.bias = Parameter(initializer(bias_init, [out_channels]), name="bias")
        self.bias_add = P.BiasAdd()

    self.tensor_add = P.TensorAdd()
    # Both matmul flavours transpose the weight operand.
    self.matmul = P.MatMul(transpose_b=True)
    self.batch_matmul = P.BatchMatMul(transpose_b=True)

    # Strings are resolved through get_activation; anything else is used as given.
    if isinstance(activation, str):
        self.activation = get_activation(activation)
    else:
        self.activation = activation
    if activation is not None and not isinstance(self.activation, (Cell, Primitive)):
        raise TypeError("The activation must be str or Cell or Primitive, but got {}.".format(activation))
    self.activation_flag = self.activation is not None
def matmul_op_select(x1_shape, x2_shape, transpose_x1, transpose_x2):
    """Choose the multiplication primitive that fits the operand ranks.

    Args:
        x1_shape: Shape of the left operand.
        x2_shape: Shape of the right operand.
        transpose_x1: Requested transpose flag for the left operand.
        transpose_x2: Requested transpose flag for the right operand.

    Returns:
        ``P.Mul`` when both operands are 1-D, ``P.MatMul`` when both have
        rank <= 2, otherwise ``P.BatchMatMul``. The transpose flag of a 1-D
        operand is always forced to ``False``.
    """
    rank1 = len(x1_shape)
    rank2 = len(x2_shape)

    if rank1 == 1 and rank2 == 1:
        return P.Mul()

    if rank1 <= 2 and rank2 <= 2:
        flag1 = transpose_x1 if rank1 != 1 else False
        flag2 = transpose_x2 if rank2 != 1 else False
        return P.MatMul(flag1, flag2)

    # From here on at least one operand has rank > 2, so a 1-D operand
    # implies the other one is the high-rank operand.
    if rank1 == 1:
        return P.BatchMatMul(False, transpose_x2)
    if rank2 == 1:
        return P.BatchMatMul(transpose_x1, False)
    return P.BatchMatMul(transpose_x1, transpose_x2)
def __init__(self, config, is_training=True):
    """Configure the decoder.

    Args:
        config: Object providing ``hidden_size``, ``trg_vocab_size``,
            ``decoder_embedding_size``, ``compute_type``, ``max_length``
            and ``dtype``.
        is_training: Accepted for interface compatibility; not read here.
    """
    super(Decoder, self).__init__()
    self.hidden_size = config.hidden_size
    self.vocab_size = config.trg_vocab_size
    self.embedding_size = config.decoder_embedding_size
    self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)

    # The GRU input is the embedding plus 2 * hidden_size extra features.
    self.rnn = GRU(input_size=self.embedding_size + self.hidden_size * 2,
                   hidden_size=self.hidden_size).to_float(config.compute_type)
    self.text_len = config.max_length

    self.shape = P.Shape()
    self.transpose = P.Transpose()
    self.p = P.Print()
    self.cast = P.Cast()
    self.concat = P.Concat(axis=2)
    self.squeeze = P.Squeeze(axis=0)
    # The original assigned this attribute twice; one assignment is enough.
    self.expandims = P.ExpandDims()
    self.log_softmax = P.LogSoftmax(axis=1)

    # Output projection over embedding + 3 * hidden_size features, with
    # explicitly provided initial weight and bias.
    fc_in_width = self.embedding_size + self.hidden_size * 3
    weight, bias = dense_default_state(fc_in_width, self.vocab_size)
    self.fc = nn.Dense(fc_in_width,
                       self.vocab_size,
                       weight_init=weight,
                       bias_init=bias).to_float(config.compute_type)

    self.attention = Attention(config)
    self.bmm = P.BatchMatMul()
    # NOTE(review): 0.7 is passed positionally; whether it is a keep
    # probability or a drop rate depends on the MindSpore version - confirm.
    self.dropout = nn.Dropout(0.7)
    self.dtype = config.dtype
def __init__(self,
             in_channels,
             out_channels,
             weight_init='normal',
             bias_init='zeros',
             damping=0.03,
             loss_scale=1,
             frequency=278,
             batch_size=32,
             has_bias=True,
             activation=None):
    """Configure a dense layer carrying the extra state used by THOR on GPU.

    Args:
        in_channels: Positive int, input width.
        out_channels: Positive int, output width.
        weight_init: Initializer spec or a Tensor of shape
            ``(out_channels, in_channels)``.
        bias_init: Initializer spec or a Tensor of shape ``(out_channels,)``.
        damping: Initial value of the non-trainable ``damping`` parameter.
        loss_scale: Its reciprocal is stored as a float16 tensor.
        frequency: Stored as the int32 tensor ``self.freq``.
        batch_size: Stored as a float16 tensor.
        has_bias: Whether a bias parameter is created.
        activation: Passed to ``get_activation``.

    Raises:
        ValueError: If a Tensor initializer has the wrong shape.
    """
    super(Dense_Thor_GPU, self).__init__()
    self.in_channels = Validator.check_positive_int(in_channels)
    self.out_channels = Validator.check_positive_int(out_channels)
    self.has_bias = Validator.check_bool(has_bias)
    self.thor = True

    # A Tensor weight initializer must already be (out_channels, in_channels).
    if isinstance(weight_init, Tensor) and not (
            weight_init.ndim == 2
            and weight_init.shape[0] == out_channels
            and weight_init.shape[1] == in_channels):
        raise ValueError("weight_init shape error")
    self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]))

    if self.has_bias:
        # A Tensor bias initializer must already be (out_channels,).
        if isinstance(bias_init, Tensor) and not (
                bias_init.ndim == 1 and bias_init.shape[0] == out_channels):
            raise ValueError("bias_init shape error")
        self.bias = Parameter(initializer(bias_init, [out_channels]))

    self.matmul = P.MatMul(transpose_b=True)
    self.bias_add = P.BiasAdd()

    self.activation = get_activation(activation)
    self.activation_flag = self.activation is not None

    # Zero-initialised, non-trainable buffers whose shapes come from
    # caculate_matmul_shape for the given split size.
    split_dim = 128
    a_inv_shape, g_inv_shape = caculate_matmul_shape(self.in_channels, self.out_channels, split_dim)
    self.matrix_A_inv = Parameter(Tensor(np.zeros(a_inv_shape).astype(np.float32)), requires_grad=False)
    self.matrix_G_inv = Parameter(Tensor(np.zeros(g_inv_shape).astype(np.float32)), requires_grad=False)
    self.broadcast_to = P.BroadcastTo(a_inv_shape)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False)

    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.transpose = P.Transpose()
    self.mul = P.Mul()
    self.cube_matmul = P.MatMul(transpose_a=True)

    self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
    self.batch_size = Tensor(batch_size, mstype.float16)
    # Hook self.save_gradient into the backward pass.
    self.getG = P.InsertGradientOf(self.save_gradient)

    # Damping value plus identity matrices sized for the input and output sides.
    self.damping = Parameter(Tensor(damping), requires_grad=False)
    self.dampingA = Tensor(np.identity(in_channels), mstype.float32)
    self.dampingG = Tensor(np.identity(out_channels), mstype.float32)

    self.cast = P.Cast()
    self.gather = P.Gather()
    self.freq = Tensor(frequency, mstype.int32)
    self.axis = 0
    self.add = P.Add()
    self.sqrt = P.Sqrt()
    self.cholesky = P.CholeskyTrsm(split_dim=split_dim)
    self.vector_matmul = P.BatchMatMul(transpose_a=True)
def __init__(self, config):
    """Create the operators and the constant lower-triangular mask.

    Args:
        config: Object providing ``seq_length``.
    """
    super(AttentionMask, self).__init__()
    self.reshape = P.Reshape()
    self.mul = P.BatchMatMul()
    self.multiply = P.Mul()

    # (seq_length, seq_length) matrix with ones on and below the diagonal.
    seq_len = config.seq_length
    lower_ones = np.tril(np.ones(shape=(seq_len, seq_len)))
    self.lower_triangle_mask = Tensor(lower_ones, mstype.float32)
def __init__(self, shape, offset, reduce_scatter_flag, split_num):
    """Create an embedding-lookup + batched-matmul test cell.

    Args:
        shape: Shape of the constant int32 index tensor (filled with ones).
        offset: Stored as ``self.offset``.
        reduce_scatter_flag: Stored as ``self.reduce_scatter_flag``.
        split_num: Stored as ``self.split_num``.
    """
    super().__init__()
    # Plain configuration.
    self.offset = offset
    self.reduce_scatter_flag = reduce_scatter_flag
    self.split_num = split_num

    # Constant lookup indices.
    self.index = Tensor(np.ones(shape), dtype=ms.int32)

    # Operators.
    self.elu = inner.EmbeddingLookup()
    self.mm = P.BatchMatMul()
def __init__(self, config, scale=1.0, layer_idx=None):
    """Configure the attention cell with data/model-parallel shard strategies.

    Args:
        config: Object providing ``embedding_size``, ``num_heads``,
            ``compute_dtype``, ``dropout_rate``, ``use_past`` and the
            parallel degrees ``dp`` and ``mp``.
        scale: Forwarded to the output ``Mapping``; when truthy a
            ``sqrt(size_per_head)`` scale factor tensor is also created.
        layer_idx: Optional layer index; when given, ``self.coeff`` is set to
            ``sqrt(layer_idx * sqrt(size_per_head))`` as a tensor.
    """
    super(Attention, self).__init__()
    dp = config.dp
    mp = config.mp
    width = config.embedding_size

    def make_sharded_dense():
        # Square dense layer in the compute dtype with its matmul and
        # bias-add primitives sharded over (dp, mp).
        layer = nn.Dense(width, width).to_float(config.compute_dtype)
        layer.matmul.shard(((dp, 1), (mp, 1)))
        layer.bias_add.shard(((dp, mp), (mp,)))
        return layer

    self.get_attention_mask = AttentionMask(config)
    self.projection = Mapping(config, width, width, scale)

    # Transposes used to split and merge heads.
    self.transpose = P.Transpose().shard(((dp, 1, mp, 1),))
    self.merger_head_transpose = P.Transpose().shard(((dp, mp, 1, 1),))
    self.reshape = P.Reshape()

    # Head geometry.
    self.n_head = config.num_heads
    self.size_per_head = config.embedding_size // self.n_head

    self.concat_k = P.Concat(axis=3)
    self.concat_v = P.Concat(axis=2)
    self.multiply_data = Tensor([-10000.0], dtype=mstype.float32)
    self.batch_matmul = P.BatchMatMul().shard(((dp, mp, 1, 1), (dp, mp, 1, 1)))

    self.scale = scale
    self.real_div = P.RealDiv().shard(((dp, mp, 1, 1), ()))
    self.sub = P.Sub().shard(((1,), (dp, 1, 1, 1))).add_prim_attr("_side_effect", True)
    self.mul = P.Mul().shard(((dp, 1, 1, 1), (1,))).add_prim_attr("_side_effect", True)
    self.add = P.TensorAdd().shard(((dp, 1, 1, 1), (dp, mp, 1, 1)))

    # Optional scaling constants.
    if self.scale:
        self.scale_factor = Tensor(math.sqrt(self.size_per_head))
    if layer_idx is not None:
        self.coeff = math.sqrt(layer_idx * math.sqrt(self.size_per_head))
        self.coeff = Tensor(self.coeff)

    self.use_past = config.use_past

    # Dropout sharded along the data-parallel axis only.
    self.dropout = nn.Dropout(1 - config.dropout_rate)
    self.dropout.dropout_gen_mask.shard(((dp, 1, 1),))
    self.dropout.dropout_do_mask.shard(((dp, 1, 1),))

    # Second dropout, sharded along both parallel axes.
    self.prob_dropout = nn.Dropout(1 - config.dropout_rate)
    self.prob_dropout.dropout_gen_mask.shard(((dp, mp, 1, 1),))
    self.prob_dropout.dropout_do_mask.shard(((dp, mp, 1, 1),))

    self.softmax = nn.Softmax()
    self.softmax.softmax.shard(((dp, mp, 1),))
    self.expand_dims = P.ExpandDims().shard(((dp, 1, 1),))

    self.dense1 = make_sharded_dense()
    self.dense2 = make_sharded_dense()
    self.dense3 = make_sharded_dense()
def __init__(self, config):
    """Create the sharded operators and the constant lower-triangular mask.

    Args:
        config: Object providing ``seq_length`` and the data-parallel
            degree ``dp``.
    """
    super(AttentionMask, self).__init__()
    dp = config.dp

    self.reshape = P.Reshape()
    # yzz: use 64, 1, 1?
    self.mul = P.BatchMatMul().shard(((dp, 1, 1), (dp, 1, 1)))
    self.expand_dim = P.ExpandDims().shard(((1, 1),))
    self.multiply = P.Mul().shard(((dp, 1, 1), (1, 1, 1)))

    # (seq_length, seq_length) matrix with ones on and below the diagonal.
    seq_len = config.seq_length
    lower_ones = np.tril(np.ones(shape=(seq_len, seq_len)))
    self.lower_triangle_mask = Tensor(lower_ones, mstype.float32)
def __init__(self,
             transpose_a=False,
             transpose_b=False,
             strategy0=None,
             strategy1=None):
    """Create a batched matmul and a tensor add, each with its own strategy.

    Args:
        transpose_a: Transpose flag for the left matmul operand.
        transpose_b: Transpose flag for the right matmul operand.
        strategy0: Strategy given to the ``BatchMatMul`` primitive.
        strategy1: Strategy given to the ``TensorAdd`` primitive.
    """
    super(BatchMatMul, self).__init__()
    self.add = P.TensorAdd(strategy=strategy1)
    self.batchmatmul = P.BatchMatMul(
        transpose_a,
        transpose_b,
        strategy=strategy0,
    )
def __init__(self,
             shape,
             offset,
             strategy1=None,
             strategy2=None,
             target="Device"):
    """Create an embedding-lookup + batched-matmul test cell.

    Args:
        shape: Shape of the constant int32 index tensor (filled with ones).
        offset: Stored as ``self.offset``.
        strategy1: Strategy set on the ``EmbeddingLookup`` primitive.
        strategy2: Strategy set on the ``BatchMatMul`` primitive.
        target: Value of the lookup's ``"primitive_target"`` attribute.
    """
    super().__init__()
    self.offset = offset
    self.index = Tensor(np.ones(shape), dtype=ms.int32)

    # Lookup primitive: strategy first, then the execution-target attribute.
    lookup = P.EmbeddingLookup().set_strategy(strategy1)
    self.elu = lookup.add_prim_attr("primitive_target", target)

    self.mm = P.BatchMatMul().set_strategy(strategy2)
def __init__(self,
             mul_weight,
             batch_matmul_weight,
             transpose_b=False,
             strategy1=None,
             strategy2=None):
    """Create a Mul followed by a BatchMatMul, each with a weight parameter.

    Args:
        mul_weight: Initial value of parameter ``"w1"``.
        batch_matmul_weight: Initial value of parameter ``"w2"``.
        transpose_b: Transpose flag for the right matmul operand.
        strategy1: Strategy set on the ``Mul`` primitive.
        strategy2: Strategy set on the ``BatchMatMul`` primitive.
    """
    super().__init__()
    # Primitives with their parallel strategies.
    self.mul = P.Mul().set_strategy(strategy1)
    matmul_prim = P.BatchMatMul(transpose_b=transpose_b)
    self.batch_matmul = matmul_prim.set_strategy(strategy2)

    # Weights, registered in the same order as before ("w1" then "w2").
    self.mul_weight = Parameter(mul_weight, "w1")
    self.batch_matmul_weight = Parameter(batch_matmul_weight, "w2")
def affine_grid_generator(self, height, width, theta):
    """
    This function returns a sampling grid, which when used with the bilinear
    sampler on the input feature map, will create an output feature map that
    is an affine transformation [1] of the input feature map.

    Input
    -----
    - height: desired height of grid/output. Used to downsample or upsample.
    - width: desired width of grid/output. Used to downsample or upsample.
    - theta: affine transform matrices of shape (num_batch, 2, 3). For each
      image in the batch, we have 6 theta parameters of the form (2x3) that
      define the affine transformation T.

    Returns
    -------
    - normalized grid (-1, 1) of shape (num_batch, 2, H, W). The 2nd
      dimension has 2 components: (x, y) which are the sampling points of
      the original image for each point in the target image.

    Note
    ----
    [1]: the affine transformation allows cropping, translation, and
         isotropic scaling.
    """
    shape = P.Shape()
    num_batch = shape(theta)[0]

    # One Cast primitive serves both conversions to float32.
    cast = P.Cast()
    theta = cast(theta, mindspore.float32)

    # transform the sampling grid - batch multiply
    matmul = P.BatchMatMul()
    tile = P.Tile()
    # Repeat the stored grid once per batch element.
    # NOTE(review): assumes self.sampling_grid is (1, 3, H*W) - confirm.
    sampling_grid = tile(self.sampling_grid, (num_batch, 1, 1))
    sampling_grid = cast(sampling_grid, mindspore.float32)

    batch_grids = matmul(theta, sampling_grid)
    # batch_grids has shape (num_batch, 2, H*W);
    # reshape to (num_batch, 2, H, W)
    reshape = P.Reshape()
    batch_grids = reshape(batch_grids, (num_batch, 2, height, width))
    return batch_grids
def __init__(self, transpose_x1=False, transpose_x2=False):
    """Validate the transpose flags and create both matmul primitives.

    Args:
        transpose_x1: Bool; transpose flag for the first operand.
        transpose_x2: Bool; transpose flag for the second operand.
    """
    super(MatMul, self).__init__()

    # Both flags must be real bools.
    for arg_name, flag in (('transpose_x1', transpose_x1), ('transpose_x2', transpose_x2)):
        validator.check_value_type(arg_name, flag, [bool], self.cls_name)

    self.transpose_x1 = transpose_x1
    self.transpose_x2 = transpose_x2
    self.shape_op = P.Shape()

    # The 2-D and the batched primitive share the same flags.
    self.matmul_op = P.MatMul(self.transpose_x1, self.transpose_x2)
    self.batch_matmul_op = P.BatchMatMul(self.transpose_x1, self.transpose_x2)
def __init__(self,
             batch_size,
             from_seq_length,
             to_seq_length,
             num_attention_heads=1,
             size_per_head=512,
             use_one_hot_embeddings=False,
             initializer_range=0.02,
             do_return_2d_tensor=False,
             use_relative_positions=False,
             dtype=mstype.float32,
             compute_type=mstype.float32):
    """Configure the relative-position value attention cell.

    Args:
        batch_size: Batch size baked into the static return shape.
        from_seq_length: Query-side sequence length.
        to_seq_length: Key/value-side sequence length; also the ``length``
            of the relative-position embedding generator.
        num_attention_heads: Number of attention heads.
        size_per_head: Width of each head.
        use_one_hot_embeddings: Forwarded to the embedding generator.
        initializer_range: Forwarded to the embedding generator.
        do_return_2d_tensor: Selects a 2-D return shape instead of 3-D.
        use_relative_positions: Stored as ``self.use_relative_positions``.
        dtype: dtype of the ``scores_mul`` constant.
        compute_type: Destination type of the saturating cast.
    """
    super(BertAttentionRelativePositionValues, self).__init__()
    self.batch_size = batch_size
    self.from_seq_length = from_seq_length
    self.to_seq_length = to_seq_length
    self.use_relative_positions = use_relative_positions
    self.size_per_head = size_per_head
    self.num_attention_heads = num_attention_heads

    # Permutations used by the transposes.
    self.trans_shape_position = (1, 2, 0, 3)
    self.trans_shape_relative = (2, 0, 1, 3)
    self.trans_shape = (0, 2, 1, 3)

    # 1 / sqrt(size_per_head) as a one-element tensor.
    self.scores_mul = Tensor([1.0 / math.sqrt(float(self.size_per_head))], dtype=dtype)

    self.reshape = P.Reshape()
    # The original assigned self.multiply twice; one assignment is enough.
    self.multiply = P.Mul()
    self.transpose = P.Transpose()
    self.batch_num = batch_size * num_attention_heads
    self.matmul = P.BatchMatMul()

    self.do_return_2d_tensor = do_return_2d_tensor
    merged_width = num_attention_heads * size_per_head
    if self.do_return_2d_tensor:
        self.shp_return = (batch_size * from_seq_length, merged_width)
    else:
        self.shp_return = (batch_size, from_seq_length, merged_width)

    self.cast_compute_type = SaturateCast(dst_type=compute_type)
    self._generate_relative_positions_embeddings = RelaPosEmbeddingsGenerator(
        length=self.to_seq_length,
        depth=self.size_per_head,
        max_relative_position=16,
        initializer_range=initializer_range,
        use_one_hot_embeddings=use_one_hot_embeddings)

    self.fill = P.Fill()
    self.type = P.DType()
    self.cast = P.Cast()
def __init__(self, config):
    """Create the operators and the constant causal mask.

    Args:
        config: Object providing ``batch_size`` and ``seq_length``.
    """
    super(CreateAttentionMaskFromInputMask, self).__init__()
    self.input_mask = None

    self.cast = P.Cast()
    # The original assigned self.shape twice; one assignment is enough.
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.batch_matmul = P.BatchMatMul()
    self.multiply = P.Mul()

    # mask future positions: np.tril zeroes everything above the diagonal
    # of the last two axes, independently for each batch entry.
    ones = np.ones(shape=(config.batch_size, config.seq_length, config.seq_length))
    self.lower_triangle_mask = Tensor(np.tril(ones), dtype=mstype.float32)
def __init__(self, config):
    """Create the operators and constants for building an attention mask.

    Args:
        config: Object providing ``input_mask_from_dataset``, ``batch_size``
            and ``seq_length``.
    """
    super(CreateAttentionMaskFromInputMask, self).__init__()
    batch, seq_len = config.batch_size, config.seq_length

    self.input_mask_from_dataset = config.input_mask_from_dataset
    # With no mask from the dataset, fall back to an all-ones int32 mask.
    self.input_mask = None
    if not self.input_mask_from_dataset:
        self.input_mask = initializer("ones", [batch, seq_len], mstype.int32).to_tensor()

    self.cast = P.Cast()
    self.reshape = P.Reshape()
    # Note: self.shape is a static target shape here, not a Shape primitive.
    self.shape = (batch, 1, seq_len)

    # float32 column of ones with shape (batch, seq_len, 1).
    self.broadcast_ones = initializer("ones", [batch, seq_len, 1], mstype.float32).to_tensor()
    self.batch_matmul = P.BatchMatMul()
def __init__(self,
             weight1,
             strategy1=None,
             strategy2=None,
             strategy3=None,
             is_parameter=True):
    """Create a broadcast -> batched matmul -> mul test network.

    Args:
        weight1: Weight value; wrapped in a ``Parameter`` named ``"w1"``
            when ``is_parameter`` is truthy, otherwise stored as given.
        strategy1: Shard strategy for ``BroadcastTo``.
        strategy2: Shard strategy for ``BatchMatMul``.
        strategy3: Shard strategy for ``Mul``.
        is_parameter: Whether ``weight1`` becomes a ``Parameter``.
    """
    super(MatMulNet, self).__init__()
    # Fixed broadcast target shape.
    self.shape = (8, 64, 64)

    self.broadcast = P.BroadcastTo(self.shape).shard(strategy1)
    self.matmul = P.BatchMatMul().shard(strategy2)
    self.mul = P.Mul().shard(strategy3)

    self.weight1 = Parameter(weight1, "w1") if is_parameter else weight1
def __init__(self, 头数, 尺寸, 丢弃率=0.1):
    """Configure the multi-head attention cell.

    Args:
        头数: Number of attention heads.
        尺寸: Model width; every linear layer maps this width to itself.
        丢弃率: Dropout rate; ``1 - 丢弃率`` is handed to ``nn.Dropout``.
    """
    super(多头_注意力, self).__init__()
    self.d_model = 尺寸
    # Per-head width, kept both as a Python int and as a float32 tensor.
    self.d_k = 尺寸 // 头数
    self.d_k_Tensor = Tensor(尺寸 // 头数, mindspore.float32)
    self.h = 头数

    # Query / value / key projections (creation order kept as in the original).
    self.q_linear = 全连接层(尺寸, 尺寸)
    self.v_linear = 全连接层(尺寸, 尺寸)
    self.k_linear = 全连接层(尺寸, 尺寸)
    self.dropout = nn.Dropout(1 - 丢弃率)
    self.out = 全连接层(尺寸, 尺寸)

    self.reshape = P.Reshape()
    # The original assigned self.transpose twice; one assignment is enough.
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.batch_matmul = P.BatchMatMul()
    self.add = P.TensorAdd()
    self.sqrt = P.Sqrt()
    self.softmax = P.Softmax(-1)
def __init__(self,
             in_channel,
             out_channel,
             in_drop_ratio=0.0,
             coef_drop_ratio=0.0,
             residual=False,
             coef_activation=nn.LeakyReLU(),
             activation=nn.ELU()):
    """Configure a single graph-attention head.

    Args:
        in_channel: Positive int, input feature width.
        out_channel: Positive int, output feature width.
        in_drop_ratio: Dropout rate for the two input dropout layers.
        coef_drop_ratio: Dropout rate for the attention coefficients.
        residual: Whether a residual connection is used.
        coef_activation: Activation cell for the attention coefficients.
        activation: Activation cell for the output.

    Note:
        The default activation cells are created once, when the function is
        defined, and are therefore shared by every instance that relies on
        the defaults.
    """
    super(AttentionHead, self).__init__()
    self.in_channel = Validator.check_positive_int(in_channel)
    self.out_channel = Validator.check_positive_int(out_channel)
    self.in_drop_ratio = in_drop_ratio
    self.in_drop = nn.Dropout(keep_prob=1 - in_drop_ratio)
    self.in_drop_2 = nn.Dropout(keep_prob=1 - in_drop_ratio)

    # Shared feature projection plus two single-output transforms.
    self.feature_transform = GNNFeatureTransform(
        in_channels=self.in_channel,
        out_channels=self.out_channel,
        has_bias=False)
    self.f_1_transform = GNNFeatureTransform(in_channels=self.out_channel, out_channels=1)
    self.f_2_transform = GNNFeatureTransform(in_channels=self.out_channel, out_channels=1)

    self.softmax = nn.Softmax()
    self.coef_drop = nn.Dropout(keep_prob=1 - coef_drop_ratio)
    self.batch_matmul = P.BatchMatMul()
    self.bias_add = P.BiasAdd()
    self.bias = Parameter(initializer('zeros', self.out_channel), name='bias')

    self.residual = Validator.check_bool(residual)
    # Always define the flag. The original set it only when a projection
    # was needed, so reading it with residual=True and equal channel sizes
    # (or with residual=False) raised AttributeError.
    needs_projection = self.residual and in_channel != out_channel
    self.residual_transform_flag = needs_projection
    if self.residual:
        if needs_projection:
            # Channel sizes differ: project the input before adding it back.
            self.residual_transform = GNNFeatureTransform(
                in_channels=self.in_channel,
                out_channels=self.out_channel)
        else:
            self.residual_transform = None

    self.coef_activation = coef_activation
    self.activation = activation
def __init__(self,
             length,
             depth,
             max_relative_position,
             initializer_range,
             use_one_hot_embeddings=False):
    """Configure the relative-position embedding generator.

    Args:
        length: Forwarded to ``RelaPosMatrixGenerator``.
        depth: Width of each embedding row.
        max_relative_position: Clipping distance; the table holds
            ``2 * max_relative_position + 1`` rows.
        initializer_range: Argument of the ``TruncatedNormal`` initializer.
        use_one_hot_embeddings: Stored flag; how it is consumed is outside
            this constructor.
    """
    super(RelaPosEmbeddingsGenerator, self).__init__()
    self.depth = depth
    self.vocab_size = max_relative_position * 2 + 1
    self.use_one_hot_embeddings = use_one_hot_embeddings

    # Embedding table with one row per relative position.
    table_init = initializer(TruncatedNormal(initializer_range), [self.vocab_size, self.depth])
    self.embeddings_table = Parameter(table_init)

    self.relative_positions_matrix = RelaPosMatrixGenerator(
        length=length,
        max_relative_position=max_relative_position,
    )
    self.one_hot = nn.OneHot(depth=self.vocab_size)

    self.reshape = P.Reshape()
    self.shape = P.Shape()
    self.gather = P.Gather()  # index_select
    self.matmul = P.BatchMatMul()