def __init__(self, d_model, d_ff, dropout, activation, param_init, bottleneck_dim=0):
    super().__init__()

    self.bottleneck_dim = bottleneck_dim
    if bottleneck_dim > 0:
        # Factorize each projection through a low-rank bottleneck to reduce parameters.
        self.w_1_e = nn.Linear(d_model, bottleneck_dim)
        self.w_1_d = nn.Linear(bottleneck_dim, d_ff)
        self.w_2_e = nn.Linear(d_ff, bottleneck_dim)
        self.w_2_d = nn.Linear(bottleneck_dim, d_model)
    else:
        # Standard two-layer position-wise FFN: d_model -> d_ff -> d_model.
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)

    self.dropout = nn.Dropout(p=dropout)
    if activation == 'relu':
        self.activation = torch.relu
    elif activation == 'gelu':
        self.activation = lambda x: gelu(x)
    elif activation == 'gelu_accurate':
        self.activation = lambda x: gelu_accurate(x)
    elif activation == 'glu':
        self.activation = LinearGLUBlock(d_ff)
    elif activation == 'swish':
        self.activation = Swish()
    else:
        raise NotImplementedError(activation)
    logger.info('FFN activation: %s' % activation)

    if param_init == 'xavier_uniform':
        self.reset_parameters()
    else:
        logger.info('Parameter initialization is skipped.')
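# NOTE: the following forward() is an added sketch, not code from the original
# source. It shows one plausible way to chain the layers defined above, using the
# standard w_2(dropout(activation(w_1(x)))) pattern and routing through the
# encode/decode pairs when a bottleneck is configured.
def forward(self, xs):
    """Apply the position-wise feed-forward transformation (sketch)."""
    if self.bottleneck_dim > 0:
        xs = self.w_1_d(self.w_1_e(xs))        # d_model -> bottleneck -> d_ff
        xs = self.dropout(self.activation(xs))
        xs = self.w_2_d(self.w_2_e(xs))        # d_ff -> bottleneck -> d_model
    else:
        xs = self.w_2(self.dropout(self.activation(self.w_1(xs))))
    return xs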
def __init__(self, d_in, d_ff, d_out, dropout, activation, param_init):
    super(PositionwiseFeedForward, self).__init__()

    self.w_1 = nn.Linear(d_in, d_ff)
    self.w_2 = nn.Linear(d_ff, d_out)
    self.dropout = nn.Dropout(p=dropout)
    if activation == 'relu':
        self.activation = torch.relu
    elif activation == 'gelu':
        self.activation = lambda x: gelu(x)
    elif activation == 'gelu_accurate':
        self.activation = lambda x: gelu_accurate(x)
    elif activation == 'glu':
        self.activation = LinearGLUBlock(d_ff)
    else:
        raise NotImplementedError(activation)
    logger.info('FFN activation: %s' % activation)

    if param_init == 'xavier_uniform':
        self.reset_parameters()
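# NOTE: hypothetical usage sketch (added, not from the original source). It assumes
# this __init__ belongs to PositionwiseFeedForward and that a forward() like the
# sketch above is defined; hyperparameters and tensor shapes are illustrative only.
#
#   ffn = PositionwiseFeedForward(d_in=512, d_ff=2048, d_out=512,
#                                 dropout=0.1, activation='relu',
#                                 param_init='xavier_uniform')
#   xs = torch.randn(4, 100, 512)   # (batch, time, d_in)
#   ys = ffn(xs)                    # -> (batch, time, d_out)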