def __init__(self, attention_cell='multi_head', units=128, hidden_size=512, num_heads=4,
             scaled=True, dropout=0.0, use_residual=True, output_attention=False,
             weight_initializer=None, bias_initializer='zeros', prefix=None, params=None):
    super(TransformerDecoderCell, self).__init__(prefix=prefix, params=params)
    self._units = units
    self._num_heads = num_heads
    self._dropout = dropout
    self._use_residual = use_residual
    self._output_attention = output_attention
    self._scaled = scaled
    with self.name_scope():
        self.dropout_layer = nn.Dropout(dropout)
        # Masked self-attention over the previously generated target positions.
        self.attention_cell_in = _get_attention_cell(attention_cell,
                                                     units=units,
                                                     num_heads=num_heads,
                                                     scaled=scaled,
                                                     dropout=dropout)
        # Encoder-decoder (inter) attention over the source memory.
        self.attention_cell_inter = _get_attention_cell(attention_cell,
                                                        units=units,
                                                        num_heads=num_heads,
                                                        scaled=scaled,
                                                        dropout=dropout)
        self.proj_in = nn.Dense(units=units, flatten=False, use_bias=False,
                                weight_initializer=weight_initializer,
                                bias_initializer=bias_initializer,
                                prefix='proj_in_')
        self.proj_inter = nn.Dense(units=units, flatten=False, use_bias=False,
                                   weight_initializer=weight_initializer,
                                   bias_initializer=bias_initializer,
                                   prefix='proj_inter_')
        self.ffn = PositionwiseFFN(hidden_size=hidden_size,
                                   units=units,
                                   use_residual=use_residual,
                                   dropout=dropout,
                                   weight_initializer=weight_initializer,
                                   bias_initializer=bias_initializer)
        self.layer_norm_in = nn.LayerNorm()
        self.layer_norm_inter = nn.LayerNorm()
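# A minimal usage sketch (not part of the original source), assuming this __init__ belongs to a
# gluonnlp-style TransformerDecoderCell HybridBlock whose forward pass takes the decoder inputs,
# the encoder memory and optional masks (the exact call signature below is an assumption):
#
#     import mxnet as mx
#     cell = TransformerDecoderCell(units=512, hidden_size=2048, num_heads=8, dropout=0.1)
#     cell.initialize()
#     batch_size, src_len, tgt_len = 4, 10, 7
#     step_input = mx.nd.random.normal(shape=(batch_size, tgt_len, 512))
#     mem_value = mx.nd.random.normal(shape=(batch_size, src_len, 512))
#     outputs, additional_outputs = cell(step_input, mem_value, None, None)
#
# The cell applies masked self-attention (attention_cell_in -> proj_in -> residual ->
# layer_norm_in), then encoder-decoder attention (attention_cell_inter -> proj_inter ->
# residual -> layer_norm_inter), and finally the position-wise FFN, with dropout applied
# after each attention projection.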
def __init__(self, cell_type='lstm', attention_cell='scaled_luong', num_layers=2,
             hidden_size=128, dropout=0.0, use_residual=True, output_attention=False,
             i2h_weight_initializer=None, h2h_weight_initializer=None,
             i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
             prefix=None, params=None):
    super(GNMTDecoder, self).__init__(prefix=prefix, params=params)
    self._cell_type = _get_cell_type(cell_type)
    self._num_layers = num_layers
    self._hidden_size = hidden_size
    self._dropout = dropout
    self._use_residual = use_residual
    self._output_attention = output_attention
    with self.name_scope():
        self.attention_cell = _get_attention_cell(attention_cell, units=hidden_size)
        self.dropout_layer = nn.Dropout(dropout)
        # Stack of recurrent cells; layer i is prefixed 'rnn{i}_' so its parameters stay distinct.
        self.rnn_cells = nn.HybridSequential()
        for i in range(num_layers):
            self.rnn_cells.add(
                self._cell_type(hidden_size=self._hidden_size,
                                i2h_weight_initializer=i2h_weight_initializer,
                                h2h_weight_initializer=h2h_weight_initializer,
                                i2h_bias_initializer=i2h_bias_initializer,
                                h2h_bias_initializer=h2h_bias_initializer,
                                prefix='rnn%d_' % i))
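# A minimal usage sketch (not part of the original source), assuming the gluonnlp-style decoder
# interface in which the decoder state is first built from the encoder outputs; the method names
# init_state_from_encoder and decode_seq are assumptions based on that interface:
#
#     decoder = GNMTDecoder(cell_type='lstm', attention_cell='scaled_luong',
#                           num_layers=2, hidden_size=128, dropout=0.2)
#     decoder.initialize()
#     decoder_states = decoder.init_state_from_encoder(encoder_outputs, encoder_valid_length)
#     outputs, states, additional_outputs = decoder.decode_seq(target_embeddings, decoder_states)
#
# At every decoding step the attention_cell attends over the encoder outputs, and the resulting
# context is fed through the stacked RNN cells with dropout between layers.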
def __init__(self, conv_channels, embed_dim, normalization_constant=0.5,
             attention_cell='dot', weight_initializer=None, bias_initializer='zeros',
             prefix=None, params=None):
    super(FConvAttentionLayer, self).__init__(prefix=prefix, params=params)
    self._normalization_constant = normalization_constant
    with self.name_scope():
        # Projects from the output of the convolution to the embedding dimension.
        self.in_projection = nn.Dense(embed_dim, flatten=False, in_units=conv_channels,
                                      weight_initializer=weight_initializer,
                                      bias_initializer=bias_initializer,
                                      prefix='in_proj_')
        self.attention_layer = _get_attention_cell(attention_cell)
        # Projects from the embedding dimension back to the convolution channel size.
        self.out_projection = nn.Dense(conv_channels, flatten=False, in_units=embed_dim,
                                       weight_initializer=weight_initializer,
                                       bias_initializer=bias_initializer,
                                       prefix='out_proj_')
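# A minimal usage sketch (not part of the original source), assuming this layer is called with
# the convolutional decoder state, the target embeddings and the encoder keys/values, roughly as
# in the fairseq convolutional seq2seq attention (the call signature below is an assumption):
#
#     attn = FConvAttentionLayer(conv_channels=512, embed_dim=256)
#     attn.initialize()
#     x, attn_scores = attn(conv_state, target_embedding, encoder_keys, encoder_values)
#
# in_projection maps the conv_channels-dimensional decoder state into the embedding space where
# the dot-product attention is computed, and out_projection maps the attended context back to
# conv_channels. normalization_constant (0.5 by default) is presumably used to rescale the
# residual sums, since multiplying a sum of two same-variance terms by sqrt(0.5) keeps its
# variance roughly unchanged.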