def __init__(self, params):
    super(TransformerLanguageModel, self).__init__()

    self.model_type = 'transformer_lm'
    self.normalize_before = False
    self.smoothing = params['smoothing']
    self.vocab_size = params['vocab_size']
    self.num_blocks = params['num_blocks']

    # Token embedding followed by positional encoding (no embedding dropout).
    self.embedding = nn.Embedding(self.vocab_size, params['d_model'])
    self.pos_embedding = PositionalEncoding(params['d_model'], 0.0)

    # Stack of post-norm Transformer encoder layers with GLU activation.
    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            params['n_heads'], params['d_model'], params['ffn_units'],
            slf_attn_dropout_rate=0.0,
            ffn_dropout_rate=0.0,
            residual_dropout_rate=params['residual_dropout_rate'],
            normalize_before=False,
            concat_after=False,
            activation='glu',
            drop_head_rate=params['enc_drop_head'])
        for _ in range(self.num_blocks)
    ])

    if self.normalize_before:
        self.after_norm = nn.LayerNorm(params['d_model'])

    self.output_project = nn.Linear(params['d_model'], self.vocab_size)

    # Optionally tie the output projection weights to the input embedding.
    if params['share_embedding']:
        self.output_project.weight = self.embedding.weight
        print('Sharing the embedding weights with the output projection layer!')

    self.crit = LabelSmoothingLoss(size=self.vocab_size,
                                   smoothing=self.smoothing,
                                   padding_idx=PAD)
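
# Usage sketch (illustrative, not from the original source): the language model
# reads a flat `params` dict with exactly the keys consumed above. The values
# below and the import path are assumptions for demonstration only.
from lm.transformer_lm import TransformerLanguageModel  # hypothetical module path

params = {
    'vocab_size': 5000,            # assumed vocabulary size
    'd_model': 256,                # assumed embedding / model width
    'n_heads': 4,                  # assumed number of attention heads
    'ffn_units': 2048,             # assumed feed-forward hidden size
    'num_blocks': 6,               # assumed number of Transformer blocks
    'residual_dropout_rate': 0.1,  # assumed residual dropout
    'enc_drop_head': 0.0,          # assumed DropHead rate
    'smoothing': 0.1,              # assumed label-smoothing weight
    'share_embedding': True,       # tie input embedding and output projection
}
lm = TransformerLanguageModel(params)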
def __init__(self, input_size, d_model=256, attention_heads=4, linear_units=2048,
             num_blocks=6, pos_dropout_rate=0.0, slf_attn_dropout_rate=0.0,
             ffn_dropout_rate=0.0, residual_dropout_rate=0.1, input_layer="conv2d",
             normalize_before=True, concat_after=False, activation='relu',
             type='transformer', weight_sharing=False):
    super(TransformerEncoder, self).__init__()

    self.normalize_before = normalize_before
    self.weight_sharing = weight_sharing
    self.num_blocks = num_blocks

    # Front end: project the input features to d_model and add positional
    # encoding, via either a linear layer or a convolutional subsampling module.
    if input_layer == "linear":
        self.embed = LinearWithPosEmbedding(input_size, d_model, pos_dropout_rate)
    elif input_layer == "conv2d":
        self.embed = Conv2dSubsampling(input_size, d_model, pos_dropout_rate)
    elif input_layer == 'conv2dv2':
        self.embed = Conv2dSubsamplingV2(input_size, d_model, pos_dropout_rate)

    # With weight sharing enabled, only a single encoder layer is instantiated;
    # self.num_blocks still records the requested depth.
    if weight_sharing:
        num_blocks = 1

    self.blocks = nn.ModuleList([
        TransformerEncoderLayer(
            attention_heads, d_model, linear_units,
            slf_attn_dropout_rate, ffn_dropout_rate,
            residual_dropout_rate=residual_dropout_rate,
            normalize_before=normalize_before,
            concat_after=concat_after,
            activation=activation)
        for _ in range(num_blocks)
    ])

    # Pre-norm encoders need a final LayerNorm after the last block.
    if self.normalize_before:
        self.after_norm = LayerNorm(d_model)
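
# Usage sketch (illustrative, not from the original source): instantiates the
# encoder with the defaults of the signature spelled out. The import path,
# `input_size`, and the enabled `weight_sharing` flag are assumptions only.
from encoder.transformer import TransformerEncoder  # hypothetical module path

encoder = TransformerEncoder(
    input_size=80,            # assumed acoustic feature dimension
    d_model=256,
    attention_heads=4,
    linear_units=2048,
    num_blocks=6,
    residual_dropout_rate=0.1,
    input_layer='conv2d',     # convolutional subsampling front end
    normalize_before=True,    # pre-norm, so after_norm is created
    activation='relu',
    weight_sharing=True,      # builds one layer; requested depth kept in self.num_blocks
)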