def build_decoder(trg_emb):
    # Relies on the module-level `wargs` config object, `copy`, and `wlog`.
    if wargs.encoder_type == 'gru':
        from models.gru_decoder import StackedGRUDecoder
        return StackedGRUDecoder(trg_emb=trg_emb,
                                 enc_hid_size=wargs.d_enc_hid,
                                 dec_hid_size=wargs.d_dec_hid,
                                 n_layers=wargs.n_dec_layers,
                                 attention_type=wargs.attention_type,
                                 rnn_dropout_prob=wargs.rnn_dropout,
                                 out_dropout_prob=wargs.output_dropout)
    if wargs.decoder_type == 'att':
        from models.self_att_model import SelfAttDecoder, SelfAttDecoderLayer, \
            PositionwiseFeedForward, clones
        from models.attention import MultiHeadedAttention
        c = copy.deepcopy
        attn = MultiHeadedAttention(h=wargs.n_head,
                                    d_model=wargs.d_model,
                                    dropout=wargs.att_dropout)
        wlog('clones -> {}'.format(2))
        ff = PositionwiseFeedForward(d_model=wargs.d_model,
                                     d_ff=wargs.d_ff_filter,
                                     dropout=wargs.relu_dropout)
        # Each layer receives two deep copies of the attention module
        # (self-attention and encoder-decoder attention) plus one of the FFN.
        return SelfAttDecoder(trg_emb=trg_emb,
                              layer=SelfAttDecoderLayer(wargs.d_model,
                                                        c(attn), c(attn), c(ff),
                                                        dropout=wargs.residual_dropout),
                              N=wargs.n_enc_layers)

def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)  # in-place form; nn.init.xavier_uniform is deprecated
    return model

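# Usage sketch (added here for illustration, not part of the original source).
# It assumes the remaining Annotated-Transformer-style modules used by make_model
# (EncoderDecoder, Encoder, Decoder, Embeddings, PositionalEncoding, Generator,
# MultiHeadedAttention, PositionwiseFeedForward) are defined in the same scope.
if __name__ == '__main__':
    toy_model = make_model(src_vocab=11, tgt_vocab=11, N=2)  # small 2-layer model
    n_params = sum(p.numel() for p in toy_model.parameters())
    print('toy model has {} parameters'.format(n_params))
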
def __init__(self, dec_attn_config, enc_dec_attn_config, num_heads, dim,
             hidden_dim, layer_i, causal=True, dropout_p=0.1):
    ''' Initialize the transformer layer '''
    super(TransformerDecoderLayer, self).__init__()

    self.causal = causal
    self.uuid = uuid.uuid4()
    self.enc_dec_attn_config = enc_dec_attn_config

    # The feed-forward sublayer is optional and enabled per layer via the config.
    if dec_attn_config['ffn_layer'][layer_i]:
        self.ffn = TransformerSublayer(TransformerFFN(dim, hidden_dim), dim, dropout_p)
        print('dec layer %i has ffn' % layer_i)

    self.self_attention = TransformerSublayer(
        MultiHeadedAttention(dec_attn_config, dim, num_heads), dim, dropout_p)

    # Add encoder-decoder attention either in every layer (flag == 1) or only in
    # the layers whose entry in the per-layer list is 1.
    if self.enc_dec_attn_config['enc_dec_attn_layer'] == 1 or \
            (type(self.enc_dec_attn_config['enc_dec_attn_layer']) is list and
             self.enc_dec_attn_config['enc_dec_attn_layer'][layer_i] == 1):
        if self.enc_dec_attn_config['enc_dec_attn_num_heads'] == -1:
            src_num_heads = num_heads
        elif type(self.enc_dec_attn_config['enc_dec_attn_num_heads']) is not list:
            src_num_heads = self.enc_dec_attn_config['enc_dec_attn_num_heads']
        else:
            src_num_heads = self.enc_dec_attn_config['enc_dec_attn_num_heads'][layer_i]
        assert src_num_heads != 0
        self.source_attention = TransformerSublayer(
            MultiHeadedAttention(enc_dec_attn_config, dim, src_num_heads), dim, dropout_p)
        print('layer %i num of src heads %i' % (layer_i, src_num_heads))

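# Illustrative config sketch (added, hypothetical values): only the keys actually
# read by the constructor above are shown; any further keys that
# MultiHeadedAttention expects from these dicts are omitted here.
example_dec_attn_config = {
    'ffn_layer': [1, 1, 1, 1, 1, 1],           # per-layer flag: build the FFN sublayer
}
example_enc_dec_attn_config = {
    'enc_dec_attn_layer': [0, 0, 0, 0, 0, 1],  # add source attention only in the last layer
    'enc_dec_attn_num_heads': -1,              # -1 means: reuse the decoder's own head count
}
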
def __init__(self, num_heads, dim, hidden_dim, dropout_p=0.1):
    ''' Initialize the transformer layer '''
    super(TransformerEncoderLayer, self).__init__()

    self.ffn = TransformerSublayer(TransformerFFN(dim, hidden_dim), dim, dropout_p)
    self.self_attention = TransformerSublayer(
        MultiHeadedAttention(dim, num_heads), dim, dropout_p)

def __init__(self, num_heads, dim, hidden_dim, causal=True, span=1, dropout_p=0.1):
    ''' Initialize the transformer layer '''
    super(TransformerDecoderLayer, self).__init__()

    self.span = span
    self.causal = causal
    self.uuid = uuid.uuid4()

    self.ffn = TransformerSublayer(TransformerFFN(dim, hidden_dim), dim, dropout_p)
    self.self_attention = TransformerSublayer(
        MultiHeadedAttention(dim, num_heads), dim, dropout_p)
    self.source_attention = TransformerSublayer(
        MultiHeadedAttention(dim, num_heads), dim, dropout_p)

def __init__(self, attn_config, num_heads, dim, hidden_dim, layer_i, dropout_p=0.1):
    ''' Initialize the transformer layer '''
    super(TransformerEncoderLayer, self).__init__()

    # The feed-forward sublayer is optional and enabled per layer via the config.
    if attn_config['ffn_layer'][layer_i]:
        self.ffn = TransformerSublayer(TransformerFFN(dim, hidden_dim), dim, dropout_p)
        print('enc layer %i has ffn' % layer_i)

    self.self_attention = TransformerSublayer(
        MultiHeadedAttention(attn_config, dim, num_heads), dim, dropout_p)

class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout, window=5):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: number of attention heads in multi-head attention
        :param feed_forward_hidden: feed-forward hidden size, usually 4*hidden
        :param dropout: dropout rate
        :param window: convolution window size of the TFCNN sub-module
        """
        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden,
                                                    d_ff=feed_forward_hidden,
                                                    dropout=dropout)
        self.conv = TFCNN(dropout=dropout, num_filters=hidden,
                          hidden=hidden, window=window)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        # Self-attention with residual connection, then the convolutional
        # sublayer in place of the usual position-wise feed-forward.
        x = self.input_sublayer(
            x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.conv)
        # x = x + self.conv(self.attention.forward(x, x, x, mask=mask))
        return self.dropout(x)

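# Usage sketch (added, not from the original source). It assumes the supporting
# modules above (MultiHeadedAttention, PositionwiseFeedForward, TFCNN,
# SublayerConnection) are importable, and that the attention mask broadcasts
# over the head dimension, i.e. has shape (batch, 1, seq_len, seq_len).
import torch

block = TransformerBlock(hidden=256, attn_heads=4,
                         feed_forward_hidden=1024, dropout=0.1)
x = torch.randn(8, 32, 256)                        # (batch, seq_len, hidden)
mask = torch.ones(8, 1, 32, 32, dtype=torch.long)  # attend everywhere
out = block(x, mask)                               # -> (8, 32, 256)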