def __init__(self, config, scale=1.0):
    super(Output, self).__init__()
    input_size = config.embedding_size
    output_size = config.embedding_size * config.expand_ratio
    # Feed-forward output block: expand to output_size, apply GELU, project back to input_size.
    self.mapping = Mapping(input_size, output_size, config.compute_dtype)
    self.projection = Mapping(output_size, input_size, config.compute_dtype, scale)
    self.activation = nn.GELU()
    # nn.Dropout here expects the keep probability, hence 1 - dropout_rate.
    self.dropout = nn.Dropout(1 - config.dropout_rate)
def __init__(self, config, scale=1.0):
    super(Output, self).__init__()
    input_size = config.embedding_size
    output_size = config.embedding_size * config.expand_ratio
    self.mapping = Mapping_output(config, input_size, output_size)
    self.projection = Mapping(config, output_size, input_size, scale)
    self.activation = nn.GELU()
    # Shard the GELU primitive: split the (batch, seq, hidden) input dp ways on the
    # batch dimension and mp ways on the hidden dimension.
    self.activation.gelu.shard(((config.dp, 1, config.mp),))
    self.dropout = nn.Dropout(1 - config.dropout_rate)
    # Dropout mask generation/application are split along the batch dimension only.
    self.dropout.dropout_gen_mask.shard(((config.dp, 1, 1),))
    self.dropout.dropout_do_mask.shard(((config.dp, 1, 1),))
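For reference, a minimal sketch of the kind of config object the snippet above reads its fields from; the class name and all values below are illustrative assumptions, not the original config:

# Hypothetical config carrying the fields used above (values are placeholders).
from dataclasses import dataclass


@dataclass
class OutputConfig:
    embedding_size: int = 1024   # model hidden size
    expand_ratio: int = 4        # feed-forward expansion factor
    dropout_rate: float = 0.1    # dropout probability (keep_prob = 1 - dropout_rate)
    dp: int = 8                  # data-parallel degree (batch dimension split)
    mp: int = 1                  # model-parallel degree (hidden dimension split)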
def __init__(self, dim, mult=4, initializer_range=0.02, hidden_dropout_prob=0.1,
             compute_type=mstype.float32):
    super(FeedForward, self).__init__()
    self.hidden_size = dim
    # Two mappings: dim -> dim * mult (expansion) and dim * mult -> dim (projection).
    self.w1 = Mapping(dim, dim * mult, initializer_range, compute_type)
    self.w2 = Mapping(dim * mult, dim, initializer_range, compute_type)
    self.act = nn.GELU()
    # nn.Dropout expects the keep probability, so convert from the dropout probability.
    self.dropout = nn.Dropout(1 - hidden_dropout_prob)
def __init__(self, config, num_labels):
    super(SequenceSummary, self).__init__()
    # Dense head mapping the summary representation to num_labels logits.
    self.summary = nn.Dense(config.d_model, num_labels,
                            weight_init=weight_variable([config.d_model, num_labels]),
                            has_bias=True).to_float(config.compute_type)
    self.gelu = nn.GELU()
    self.first_dropout = nn.Dropout(1 - config.hidden_dropout)
    self.last_dropout = nn.Dropout(1 - config.hidden_dropout)
    self.expand_dims = P.ExpandDims()
    self.shape = P.Shape()
    self.size = P.Size()
    self.slice = P.GatherV2()
    self.squeeze = P.Squeeze(-2)
def __init__(self, in_channels=768, out_channels=768, hidden_size=3072, hidden_dropout=0.1):
    super(FeedForward, self).__init__()
    # GPT-2 style MLP: Conv1D expansion, GELU, Conv1D projection, with a residual connection.
    self.c_fc = Conv1D(in_channels, hidden_size)
    self.c_proj = Conv1D(hidden_size, out_channels)
    self.layernorm = LayerNorm(in_channels=in_channels)
    self.residual_connect = ResidualConnection(dropout_prob=hidden_dropout)
    self.gelu = nn.GELU()
    self.dropout = nn.Dropout(1 - hidden_dropout)
    self.use_dropout = hidden_dropout > 0
    self.reshape = P.Reshape()
def __init__(self):
    super(Net_gelu, self).__init__()
    self.gelu = nn.GELU()
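Only the constructor is shown above; a complete cell would add a construct method along these lines (a minimal sketch, assuming the cell simply forwards its input through the activation):

import mindspore.nn as nn


class Net_gelu(nn.Cell):
    def __init__(self):
        super(Net_gelu, self).__init__()
        self.gelu = nn.GELU()

    def construct(self, x):
        # Apply GELU element-wise to the input tensor.
        return self.gelu(x)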
def gelu(x):
    """Apply gelu function."""
    return nn.GELU()(x)
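A quick usage check for the functional wrapper above (assumes MindSpore is installed; the input values are arbitrary):

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

x = Tensor(np.array([-1.0, 0.0, 1.0], dtype=np.float32))
print(gelu(x))  # GELU is applied element-wise; gelu(0) == 0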