    # NOTE: these constructor snippets assume the original file's module-level
    # imports (torch.nn as nn, the package's `models`, `PositionwiseFeedForward`,
    # and the `MAX_SIZE` constant), which are not shown here.
    def __init__(self, config):
        """
        Initialize the required variables and sub-modules.
        :param config: configuration object
        """
        super(TransformerEncoderLayer, self).__init__()
        self.config = config
        # self attention
        self.self_attn = models.Multihead_Attention(
            model_dim=config.hidden_size,
            head_count=config.heads,
            dropout=config.dropout)
        self.feed_forward = PositionwiseFeedForward(d_model=config.hidden_size,
                                                    d_ff=config.d_ff,
                                                    dropout=config.dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-6)
        self.dropout = nn.Dropout(config.dropout)

        # Convolutional attention temperature for the self-attention distribution (to be deprecated)
        if config.convolutional:
            self.cnn_tau = nn.Sequential(
                nn.Conv1d(config.hidden_size,
                          config.heads,
                          kernel_size=3,
                          padding=1,
                          groups=config.heads), nn.ReLU(),
                nn.Dropout(config.dropout))
            self.ln_tau = nn.LayerNorm(config.heads, eps=1e-6)
            self.sigmoid = nn.Sigmoid()
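
# --- Hedged sketch (not part of the original file): a standalone shape check for the
# convolutional temperature branch built above. The sizes (hidden_size=512, heads=8,
# dropout=0.1) are illustrative assumptions; hidden_size must be divisible by heads for
# the grouped Conv1d. How the temperature is applied inside forward() is not shown in
# this snippet, so only the tensor shapes are demonstrated here.
import torch
import torch.nn as nn

hidden_size, heads, batch, seq_len = 512, 8, 2, 10
cnn_tau = nn.Sequential(
    nn.Conv1d(hidden_size, heads, kernel_size=3, padding=1, groups=heads),
    nn.ReLU(),
    nn.Dropout(0.1))
ln_tau = nn.LayerNorm(heads, eps=1e-6)

x = torch.randn(batch, hidden_size, seq_len)      # channels-first input for Conv1d
tau = cnn_tau(x)                                  # -> (batch, heads, seq_len)
tau = torch.sigmoid(ln_tau(tau.transpose(1, 2)))  # per-head temperature in (0, 1)
print(tau.shape)                                  # torch.Size([2, 10, 8])
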
    def __init__(self, config):
        """
        Initialize the required variables and sub-modules.
        :param config: configuration object
        """
        super(TransformerDecoderLayer, self).__init__()
        self.config = config
        # self attention
        self.self_attn = models.Multihead_Attention(
            model_dim=config.hidden_size,
            head_count=config.heads,
            dropout=config.dropout)

        self.context_attn = models.Multihead_Attention(
            model_dim=config.hidden_size,
            head_count=config.heads,
            dropout=config.dropout)
        self.feed_forward = PositionwiseFeedForward(config.hidden_size,
                                                    config.d_ff,
                                                    config.dropout)
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size, eps=1e-6)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size, eps=1e-6)
        self.dropout = config.dropout
        self.drop = nn.Dropout(config.dropout)
        mask = self._get_attn_subsequent_mask(MAX_SIZE)
        # Register self.mask as a buffer in TransformerDecoderLayer, so
        # it gets TransformerDecoderLayer's cuda behavior automatically.
        self.register_buffer('mask', mask)

        # Temperature for the self- and context-attention distributions (to be deprecated);
        # implemented here with linear layers rather than convolutions
        if config.convolutional:
            self.self_lin = nn.Sequential(
                nn.Linear(config.hidden_size, config.heads), nn.ReLU(),
                nn.Dropout(config.dropout))
            self.self_ln = nn.LayerNorm(config.heads, eps=1e-6)
            self.self_sigmoid = nn.Sigmoid()
            self.ctxt_lin = nn.Sequential(
                nn.Linear(config.hidden_size, config.heads), nn.ReLU(),
                nn.Dropout(config.dropout))
            self.ctxt_ln = nn.LayerNorm(config.heads, eps=1e-6)
            self.ctxt_sigmoid = nn.Sigmoid()
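
# --- Hedged sketch (not part of the original file): _get_attn_subsequent_mask is called
# above but not shown. A typical implementation returns an upper-triangular byte mask of
# shape (1, size, size) whose nonzero entries mark the future positions a decoding step
# must not attend to; the helper in this codebase may differ, and the free-function name
# below is hypothetical.
import torch

def get_attn_subsequent_mask(size):
    """Mask out subsequent (future) positions for decoder self-attention."""
    return torch.triu(torch.ones(1, size, size, dtype=torch.uint8), diagonal=1)

# e.g. get_attn_subsequent_mask(3)[0] ==
# tensor([[0, 1, 1],
#         [0, 0, 1],
#         [0, 0, 0]], dtype=torch.uint8)
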
    def __init__(self, config, embedding=None, padding_idx=0):
        """
        Initialize the embedding, optional convolutional/self-attention blocks, and the RNN.
        :param config: configuration object
        :param embedding: optional pre-built source embedding
        :param padding_idx: index of the padding token
        """
        super(rnn_encoder, self).__init__()

        self.embedding = embedding if embedding is not None else nn.Embedding(
            config.src_vocab_size, config.emb_size, padding_idx=padding_idx)
        self.hidden_size = config.hidden_size
        self.config = config
        self.padding_idx = padding_idx
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-6)

        if config.swish:
            # Multi-scale convolutional branches (kernel sizes 1, 3, and stacked 3+3) over the time dimension
            self.sw1 = nn.Sequential(
                nn.Conv1d(config.hidden_size, config.hidden_size,
                          kernel_size=1, padding=0), nn.BatchNorm1d(config.hidden_size), nn.ReLU())
            self.sw3 = nn.Sequential(
                nn.Conv1d(config.hidden_size, config.hidden_size,
                          kernel_size=1, padding=0), nn.ReLU(), nn.BatchNorm1d(config.hidden_size),
                nn.Conv1d(config.hidden_size, config.hidden_size,
                          kernel_size=3, padding=1), nn.ReLU(), nn.BatchNorm1d(config.hidden_size))
            self.sw33 = nn.Sequential(
                nn.Conv1d(config.hidden_size, config.hidden_size,
                          kernel_size=1, padding=0), nn.ReLU(), nn.BatchNorm1d(config.hidden_size),
                nn.Conv1d(config.hidden_size, config.hidden_size,
                          kernel_size=3, padding=1), nn.ReLU(), nn.BatchNorm1d(config.hidden_size),
                nn.Conv1d(config.hidden_size, config.hidden_size,
                          kernel_size=3, padding=1), nn.ReLU(), nn.BatchNorm1d(config.hidden_size))
            self.linear = nn.Sequential(nn.Linear(
                2*config.hidden_size, 2*config.hidden_size), nn.GLU(), nn.Dropout(config.dropout))
            self.filter_linear = nn.Linear(
                3*config.hidden_size, config.hidden_size)
            self.tanh = nn.Tanh()
            self.sigmoid = nn.Sigmoid()

        if config.selfatt:
            self.self_attn = models.Multihead_Attention(
                config.hidden_size, head_count=config.heads, dropout=config.dropout)

        if config.cell == 'gru':
            self.rnn = nn.GRU(input_size=config.emb_size, hidden_size=config.hidden_size,
                              num_layers=config.enc_num_layers, dropout=config.dropout,
                              bidirectional=config.bidirectional)
        else:
            self.rnn = nn.LSTM(input_size=config.emb_size, hidden_size=config.hidden_size,
                               num_layers=config.enc_num_layers, dropout=config.dropout,
                               bidirectional=config.bidirectional)

        self.dropout = nn.Dropout(config.dropout)
        self.emb_drop = nn.Dropout(config.emb_dropout)
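
# --- Hedged sketch (not part of the original file): a minimal configuration object
# carrying the fields that rnn_encoder's constructor reads. The values below are
# illustrative placeholders, not the repository's defaults; the real project presumably
# loads these from its own config file.
from types import SimpleNamespace

config = SimpleNamespace(
    src_vocab_size=30000,
    emb_size=256,
    hidden_size=512,
    enc_num_layers=2,
    cell='lstm',              # or 'gru'
    bidirectional=True,
    dropout=0.1,
    emb_dropout=0.1,
    heads=8,                  # read only when selfatt is True
    swish=False,              # enables the convolutional branches above
    selfatt=False)            # enables the Multihead_Attention block

# encoder = rnn_encoder(config)   # assuming the class and its imports are available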