def __init__(self, config): """ initialization of required variables and functions :param config: configuration """ super(TransformerEncoderLayer, self).__init__() self.config = config # self attention self.self_attn = models.Multihead_Attention( model_dim=config.hidden_size, head_count=config.heads, dropout=config.dropout) self.feed_forward = PositionwiseFeedForward(d_model=config.hidden_size, d_ff=config.d_ff, dropout=config.dropout) self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-6) self.dropout = nn.Dropout(config.dropout) # Convolutional Attention Temperature for self attention distribution, waiting to be deprecated if config.convolutional: self.cnn_tau = nn.Sequential( nn.Conv1d(config.hidden_size, config.heads, kernel_size=3, padding=1, groups=config.heads), nn.ReLU(), nn.Dropout(config.dropout)) self.ln_tau = nn.LayerNorm(config.heads, eps=1e-6) self.sigmoid = nn.Sigmoid()
def __init__(self, config): """ initialization for required variables and functions :param config: configuration """ super(TransformerDecoderLayer, self).__init__() self.config = config # self attention self.self_attn = models.Multihead_Attention( model_dim=config.hidden_size, head_count=config.heads, dropout=config.dropout) self.context_attn = models.Multihead_Attention( model_dim=config.hidden_size, head_count=config.heads, dropout=config.dropout) self.feed_forward = PositionwiseFeedForward(config.hidden_size, config.d_ff, config.dropout) self.layer_norm_1 = nn.LayerNorm(config.hidden_size, eps=1e-6) self.layer_norm_2 = nn.LayerNorm(config.hidden_size, eps=1e-6) self.dropout = config.dropout self.drop = nn.Dropout(config.dropout) mask = self._get_attn_subsequent_mask(MAX_SIZE) # Register self.mask as a buffer in TransformerDecoderLayer, so # it gets TransformerDecoderLayer's cuda behavior automatically. self.register_buffer('mask', mask) # Add convolutional temperature for attention distribution, to be deprecated if config.convolutional: self.self_lin = nn.Sequential( nn.Linear(config.hidden_size, config.heads), nn.ReLU(), nn.Dropout()) self.self_ln = nn.LayerNorm(config.heads, eps=1e-6) self.self_sigmoid = nn.Sigmoid() self.ctxt_lin = nn.Sequential( nn.Linear(config.hidden_size, config.heads), nn.ReLU(), nn.Dropout(config.dropout)) self.ctxt_ln = nn.LayerNorm(config.heads, eps=1e-6) self.ctxt_sigmoid = nn.Sigmoid()
def __init__(self, config, embedding=None, padding_idx=0):
    """
    Initialize the RNN encoder.

    :param config: model configuration
    :param embedding: optional pre-built embedding module; a new one is created if None
    :param padding_idx: index of the padding token
    """
    super(rnn_encoder, self).__init__()
    self.embedding = embedding if embedding is not None else nn.Embedding(
        config.src_vocab_size, config.emb_size, padding_idx=padding_idx)
    self.hidden_size = config.hidden_size
    self.config = config
    self.padding_idx = padding_idx
    self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-6)

    # 'swish' branch: parallel 1-D convolution stacks (kernel sizes 1, 1+3, 1+3+3),
    # a GLU-gated linear layer, and a filter projection over the concatenated branches.
    if config.swish:
        self.sw1 = nn.Sequential(
            nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=1, padding=0),
            nn.BatchNorm1d(config.hidden_size),
            nn.ReLU())
        self.sw3 = nn.Sequential(
            nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.BatchNorm1d(config.hidden_size),
            nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(config.hidden_size))
        self.sw33 = nn.Sequential(
            nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=1, padding=0),
            nn.ReLU(),
            nn.BatchNorm1d(config.hidden_size),
            nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(config.hidden_size),
            nn.Conv1d(config.hidden_size, config.hidden_size, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm1d(config.hidden_size))
        self.linear = nn.Sequential(
            nn.Linear(2 * config.hidden_size, 2 * config.hidden_size),
            nn.GLU(),
            nn.Dropout(config.dropout))
        self.filter_linear = nn.Linear(3 * config.hidden_size, config.hidden_size)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()

    # Optional multi-head self-attention
    if config.selfatt:
        self.self_attn = models.Multihead_Attention(
            config.hidden_size, head_count=config.heads, dropout=config.dropout)

    # Recurrent backbone: GRU or LSTM
    if config.cell == 'gru':
        self.rnn = nn.GRU(input_size=config.emb_size, hidden_size=config.hidden_size,
                          num_layers=config.enc_num_layers, dropout=config.dropout,
                          bidirectional=config.bidirectional)
    else:
        self.rnn = nn.LSTM(input_size=config.emb_size, hidden_size=config.hidden_size,
                           num_layers=config.enc_num_layers, dropout=config.dropout,
                           bidirectional=config.bidirectional)

    self.dropout = nn.Dropout(config.dropout)
    self.emb_drop = nn.Dropout(config.emb_dropout)
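
# Minimal construction sketch (illustrative, not part of the module): the config attributes
# mirror the ones read above; all values and the helper name are hypothetical.
def _rnn_encoder_example():
    from argparse import Namespace
    config = Namespace(src_vocab_size=50000, emb_size=512, hidden_size=512,
                       enc_num_layers=2, bidirectional=True, cell='lstm',
                       dropout=0.1, emb_dropout=0.1, swish=False, selfatt=False,
                       heads=8)
    # With swish and selfatt disabled, only the embedding, RNN and dropout modules are built.
    encoder = rnn_encoder(config, embedding=None, padding_idx=0)
    return encoder
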