Example #1
    def __init__(self,
                 hidden,
                 attn_heads,
                 feed_forward_hidden,
                 dropout,
                 window=5):
        """
        :param hidden: hidden size of the transformer
        :param attn_heads: number of attention heads in multi-head attention
        :param feed_forward_hidden: feed-forward hidden size, usually 4 * hidden
        :param dropout: dropout rate
        :param window: convolution window size passed to TFCNN
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden,
                                                    d_ff=feed_forward_hidden,
                                                    dropout=dropout)
        self.conv = TFCNN(dropout=dropout,
                          num_filters=hidden,
                          hidden=hidden,
                          window=window)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)
Example #2
def build_decoder(trg_emb):

    if wargs.encoder_type == 'gru':
        from models.gru_decoder import StackedGRUDecoder
        return StackedGRUDecoder(trg_emb=trg_emb,
                                 enc_hid_size=wargs.d_enc_hid,
                                 dec_hid_size=wargs.d_dec_hid,
                                 n_layers=wargs.n_dec_layers,
                                 attention_type=wargs.attention_type,
                                 rnn_dropout_prob=wargs.rnn_dropout,
                                 out_dropout_prob=wargs.output_dropout)
    if wargs.decoder_type == 'att':
        from models.self_att_model import SelfAttDecoder, SelfAttDecoderLayer, \
                PositionwiseFeedForward, clones
        from models.attention import MultiHeadedAttention
        c = copy.deepcopy
        attn = MultiHeadedAttention(h=wargs.n_head,
                                    d_model=wargs.d_model,
                                    dropout=wargs.att_dropout)
        wlog('clones -> {}'.format(2))
        ff = PositionwiseFeedForward(d_model=wargs.d_model,
                                     d_ff=wargs.d_ff_filter,
                                     dropout=wargs.relu_dropout)
        return SelfAttDecoder(trg_emb=trg_emb,
                              layer=SelfAttDecoderLayer(
                                  wargs.d_model,
                                  c(attn),
                                  c(attn),
                                  c(ff),
                                  dropout=wargs.residual_dropout),
                              N=wargs.n_enc_layers)
Example #3
def make_model(src_vocab,
               tgt_vocab,
               N=6,
               d_model=512,
               d_ff=2048,
               h=8,
               dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
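A minimal usage sketch for make_model, assuming the supporting classes it references (EncoderDecoder, Encoder, EncoderLayer, Decoder, DecoderLayer, Embeddings, Generator, PositionalEncoding) from the same code base are in scope; the vocabulary sizes and layer count below are illustrative only.

# Illustrative hyperparameters, not taken from any of the examples above.
model = make_model(src_vocab=11, tgt_vocab=11, N=2)
# Quick sanity check: count trainable parameters.
print(sum(p.numel() for p in model.parameters()))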
Example #4
    def __init__(self,
                 dec_attn_config,
                 enc_dec_attn_config,
                 num_heads,
                 dim,
                 hidden_dim,
                 layer_i,
                 causal=True,
                 dropout_p=0.1):
        ''' Initialize the transformer layer '''
        super(TransformerDecoderLayer, self).__init__()

        self.causal = causal
        self.uuid = uuid.uuid4()

        self.enc_dec_attn_config = enc_dec_attn_config

        if dec_attn_config['ffn_layer'][layer_i]:
            self.ffn = TransformerSublayer(TransformerFFN(dim, hidden_dim),
                                           dim, dropout_p)
            print('dec layer %i has ffn' % layer_i)

        self.self_attention = TransformerSublayer(
            MultiHeadedAttention(dec_attn_config, dim, num_heads), dim,
            dropout_p)

        if self.enc_dec_attn_config['enc_dec_attn_layer'] == 1 or \
                (isinstance(self.enc_dec_attn_config['enc_dec_attn_layer'], list)
                 and self.enc_dec_attn_config['enc_dec_attn_layer'][layer_i] == 1):
            if self.enc_dec_attn_config['enc_dec_attn_num_heads'] == -1:
                src_num_heads = num_heads
            elif not isinstance(
                    self.enc_dec_attn_config['enc_dec_attn_num_heads'], list):
                src_num_heads = self.enc_dec_attn_config[
                    'enc_dec_attn_num_heads']
            else:
                src_num_heads = self.enc_dec_attn_config[
                    'enc_dec_attn_num_heads'][layer_i]
            assert src_num_heads != 0

            self.source_attention = TransformerSublayer(
                MultiHeadedAttention(enc_dec_attn_config, dim, src_num_heads),
                dim, dropout_p)

            print('layer %i num of src heads %i' % (layer_i, src_num_heads))
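The constructor above is driven entirely by two config dicts. A hypothetical sketch of their shape, inferred from the keys the code reads (values are illustrative; this code base's MultiHeadedAttention may expect additional keys not shown here):

num_layers = 6
dec_attn_config = {
    # One flag per decoder layer; a truthy value adds an FFN sublayer.
    'ffn_layer': [1] * num_layers,
}
enc_dec_attn_config = {
    # 1 enables encoder-decoder attention in every layer; a per-layer list
    # of 0/1 flags enables it selectively.
    'enc_dec_attn_layer': 1,
    # -1 reuses `num_heads`; an int or a per-layer list overrides it.
    'enc_dec_attn_num_heads': -1,
}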
Example #5
    def __init__(self, num_heads, dim, hidden_dim, dropout_p=0.1):
        ''' Initialize the transformer layer '''
        super(TransformerEncoderLayer, self).__init__()

        self.ffn = TransformerSublayer(TransformerFFN(dim, hidden_dim), dim,
                                       dropout_p)

        self.self_attention = TransformerSublayer(
            MultiHeadedAttention(dim, num_heads), dim, dropout_p)
Example #6
    def __init__(self,
                 num_heads,
                 dim,
                 hidden_dim,
                 causal=True,
                 span=1,
                 dropout_p=0.1):
        ''' Initialize the transformer layer '''
        super(TransformerDecoderLayer, self).__init__()

        self.span = span
        self.causal = causal
        self.uuid = uuid.uuid4()

        self.ffn = TransformerSublayer(TransformerFFN(dim, hidden_dim), dim,
                                       dropout_p)

        self.self_attention = TransformerSublayer(
            MultiHeadedAttention(dim, num_heads), dim, dropout_p)

        self.source_attention = TransformerSublayer(
            MultiHeadedAttention(dim, num_heads), dim, dropout_p)
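A hypothetical instantiation of the layer above; the sizes are illustrative, and TransformerSublayer, TransformerFFN and MultiHeadedAttention are assumed to be importable from the same code base.

layer = TransformerDecoderLayer(num_heads=8, dim=512, hidden_dim=2048,
                                causal=True, span=1, dropout_p=0.1)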
Example #7
    def __init__(self,
                 attn_config,
                 num_heads,
                 dim,
                 hidden_dim,
                 layer_i,
                 dropout_p=0.1):
        ''' Initialize the transformer layer '''
        super(TransformerEncoderLayer, self).__init__()

        if attn_config['ffn_layer'][layer_i]:
            self.ffn = TransformerSublayer(TransformerFFN(dim, hidden_dim),
                                           dim, dropout_p)
            print('enc layer %i has ffn' % layer_i)

        self.self_attention = TransformerSublayer(
            MultiHeadedAttention(attn_config, dim, num_heads), dim, dropout_p)
Example #8
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """
    def __init__(self,
                 hidden,
                 attn_heads,
                 feed_forward_hidden,
                 dropout,
                 window=5):
        """
        :param hidden: hidden size of the transformer
        :param attn_heads: number of attention heads in multi-head attention
        :param feed_forward_hidden: feed-forward hidden size, usually 4 * hidden
        :param dropout: dropout rate
        :param window: convolution window size passed to TFCNN
        """

        super().__init__()
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        self.feed_forward = PositionwiseFeedForward(d_model=hidden,
                                                    d_ff=feed_forward_hidden,
                                                    dropout=dropout)
        self.conv = TFCNN(dropout=dropout,
                          num_filters=hidden,
                          hidden=hidden,
                          window=window)
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        x = self.input_sublayer(
            x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        x = self.output_sublayer(x, self.conv)
        # x = x + self.conv(self.attention.forward(x, x, x, mask=mask))
        return self.dropout(x)
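A minimal forward-pass sketch for TransformerBlock; the tensor shapes and the mask layout are assumptions, since the exact mask format depends on this code base's MultiHeadedAttention.

import torch

# Illustrative sizes: batch of 8 sequences of length 32 with hidden=256.
block = TransformerBlock(hidden=256, attn_heads=4,
                         feed_forward_hidden=4 * 256, dropout=0.1, window=5)
x = torch.randn(8, 32, 256)                        # (batch, seq_len, hidden)
mask = torch.ones(8, 1, 32, 32, dtype=torch.bool)  # assumed attention-mask shape
out = block(x, mask)                               # same shape as x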