def __init__(self, d_model, nhead, dim_feedforward=2048,
             attention_dropout_rate=0.0, residual_dropout_rate=0.1):
    super(TransformerDecoderLayer, self).__init__()
    # Self-attention over the decoder input and cross-attention over the
    # encoder memory.
    self.self_attn = MultiheadAttention(
        d_model, nhead, dropout=attention_dropout_rate)
    self.multihead_attn = MultiheadAttention(
        d_model, nhead, dropout=attention_dropout_rate)
    # Position-wise feed-forward network implemented with 1x1 convolutions.
    self.conv1 = Conv2D(
        in_channels=d_model,
        out_channels=dim_feedforward,
        kernel_size=(1, 1))
    self.conv2 = Conv2D(
        in_channels=dim_feedforward,
        out_channels=d_model,
        kernel_size=(1, 1))
    # One LayerNorm and one residual dropout per sub-layer.
    self.norm1 = LayerNorm(d_model)
    self.norm2 = LayerNorm(d_model)
    self.norm3 = LayerNorm(d_model)
    self.dropout1 = Dropout(residual_dropout_rate)
    self.dropout2 = Dropout(residual_dropout_rate)
    self.dropout3 = Dropout(residual_dropout_rate)
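# Hypothetical construction sketch for the decoder layer above. It assumes
# TransformerDecoderLayer is defined/importable in scope; the hyperparameter
# values are illustrative, not taken from the original configuration.
decoder_layer = TransformerDecoderLayer(
    d_model=512,
    nhead=8,
    dim_feedforward=2048,
    attention_dropout_rate=0.0,
    residual_dropout_rate=0.1)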
def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1,
             activation="relu", attn_dropout=None, act_dropout=None,
             normalize_before=False, weight_attr=None, bias_attr=None,
             attention_type="bigbird", block_size=1, window_size=3,
             num_global_blocks=1, num_rand_blocks=1, seed=None):
    # Save the constructor arguments so the layer can be cloned later
    # (see TransformerEncoder below).
    self._config = locals()
    self._config.pop("self")
    self._config.pop("__class__", None)  # py3

    super(TransformerEncoderLayer, self).__init__()
    attn_dropout = dropout if attn_dropout is None else attn_dropout
    act_dropout = dropout if act_dropout is None else act_dropout
    self.normalize_before = normalize_before

    weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
    bias_attrs = _convert_param_attr_to_list(bias_attr, 2)

    # Sparse (BigBird-style) self-attention.
    self.self_attn = MultiHeadAttention(
        d_model,
        nhead,
        dropout=attn_dropout,
        weight_attr=weight_attrs[0],
        bias_attr=bias_attrs[0],
        attention_type=attention_type,
        block_size=block_size,
        window_size=window_size,
        num_global_blocks=num_global_blocks,
        num_rand_blocks=num_rand_blocks,
        seed=seed)
    # Position-wise feed-forward network.
    self.linear1 = Linear(
        d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1])
    self.dropout = Dropout(act_dropout, mode="upscale_in_train")
    self.linear2 = Linear(
        dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1])
    self.norm1 = LayerNorm(d_model, epsilon=1e-12)
    self.norm2 = LayerNorm(d_model, epsilon=1e-12)
    self.dropout1 = Dropout(dropout, mode="upscale_in_train")
    self.dropout2 = Dropout(dropout, mode="upscale_in_train")
    self.activation = getattr(F, activation)
    self.d_model = d_model
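# Hypothetical construction sketch for a BigBird-style encoder layer. It
# assumes TransformerEncoderLayer is defined/importable in scope; all
# hyperparameter values below are illustrative assumptions.
encoder_layer = TransformerEncoderLayer(
    d_model=768,
    nhead=12,
    dim_feedforward=3072,
    dropout=0.1,
    attention_type="bigbird",
    block_size=16,
    window_size=3,
    num_global_blocks=2,
    num_rand_blocks=3)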
def __init__(self, encoder_layer, num_layers):
    super(TransformerEncoder, self).__init__()
    # Reuse the given layer as the first layer and rebuild it from its saved
    # _config for the remaining num_layers - 1 layers.
    self.layers = LayerList([
        (encoder_layer if i == 0 else
         type(encoder_layer)(**encoder_layer._config))
        for i in range(num_layers)
    ])
    self.num_layers = num_layers
    self.norm = LayerNorm(self.layers[0].d_model, epsilon=1e-12)
    self.normalize_before = self.layers[0].normalize_before
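# Hypothetical stacking sketch, reusing the encoder_layer built above: the
# encoder clones the template layer num_layers - 1 times via its saved
# _config. The layer count is an illustrative assumption.
encoder = TransformerEncoder(encoder_layer, num_layers=12)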
def __init__(self):
    super(ModelCase4, self).__init__()
    self.bn1 = BatchNorm2D(3)
    self.ln1 = LayerNorm([3 * 16 * 16])
    self.relu1 = ReLU()
    self.fc1 = paddle.nn.Linear(3 * 16 * 16, 3 * 16 * 16)
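# Hypothetical usage sketch for ModelCase4. The forward pass is not shown
# above, so the ordering here (BatchNorm2D -> flatten -> LayerNorm -> ReLU ->
# Linear) and the input shape are assumptions for illustration only.
import paddle

model = ModelCase4()
x = paddle.rand([4, 3, 16, 16])            # NCHW input matching BatchNorm2D(3)
y = model.bn1(x)
y = paddle.flatten(y, start_axis=1)        # [4, 3 * 16 * 16]
y = model.fc1(model.relu1(model.ln1(y)))   # [4, 3 * 16 * 16]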