Example 1
import torch
import torch.nn as nn

# PositionalEncoding, TransformerEncoderLayer, LabelSmoothingLoss and PAD are
# assumed to be provided elsewhere in the same project.

class TransformerLanguageModel(nn.Module):
    def __init__(self, params):
        super(TransformerLanguageModel, self).__init__()

        self.model_type = 'transformer_lm'
        self.normalize_before = False
        self.smoothing = params['smoothing']
        self.vocab_size = params['vocab_size']
        self.num_blocks = params['num_blocks']

        self.embedding = nn.Embedding(self.vocab_size, params['d_model'])
        self.pos_embedding = PositionalEncoding(params['d_model'], 0.0)

        self.blocks = nn.ModuleList([
            TransformerEncoderLayer(
                params['n_heads'], params['d_model'], params['ffn_units'],
                slf_attn_dropout_rate=0.0, ffn_dropout_rate=0.0,
                residual_dropout_rate=params['residual_dropout_rate'],
                normalize_before=False, concat_after=False, activation='glu',
                drop_head_rate=params['enc_drop_head']) for _ in range(self.num_blocks)
        ])

        # A final LayerNorm is only needed in the pre-norm configuration;
        # normalize_before is hard-coded to False above, so this branch is skipped.
        if self.normalize_before:
            self.after_norm = nn.LayerNorm(params['d_model'])

        self.output_project = nn.Linear(params['d_model'], self.vocab_size)

        if params['share_embedding']:
            # Tie the output projection to the input embedding (see the sketch below).
            self.output_project.weight = self.embedding.weight
            print('Sharing the embedding weights with the output projection layer!')

        self.crit = LabelSmoothingLoss(size=self.vocab_size, smoothing=self.smoothing, padding_idx=PAD)
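
A minimal, self-contained sketch (plain PyTorch, with made-up sizes) of the weight tying applied when params['share_embedding'] is set: nn.Embedding stores its weight as (vocab_size, d_model) and nn.Linear(d_model, vocab_size) stores its weight as (vocab_size, d_model), so the two tensors can be shared directly.

import torch
import torch.nn as nn

vocab_size, d_model = 1000, 256                  # hypothetical sizes

embedding = nn.Embedding(vocab_size, d_model)    # weight: (vocab_size, d_model)
output_project = nn.Linear(d_model, vocab_size)  # weight: (vocab_size, d_model)

# Tie the parameters: both modules now point at the same tensor,
# so gradients from either path update the same weights.
output_project.weight = embedding.weight

tokens = torch.randint(0, vocab_size, (2, 8))    # dummy batch of token ids
logits = output_project(embedding(tokens))       # shape: (2, 8, vocab_size)

assert output_project.weight.data_ptr() == embedding.weight.data_ptr()
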
Example 2
import torch
import torch.nn as nn

# PositionalEncoding, TransformerDecoderLayer and LayerNorm are assumed to be
# provided elsewhere in the same project.

class TransformerDecoder(nn.Module):
    def __init__(self,
                 output_size,
                 d_model=256,
                 attention_heads=4,
                 linear_units=2048,
                 num_blocks=6,
                 pos_dropout_rate=0.0,
                 slf_attn_dropout_rate=0.0,
                 src_attn_dropout_rate=0.0,
                 ffn_dropout_rate=0.0,
                 residual_dropout_rate=0.1,
                 activation='relu',
                 normalize_before=True,
                 concat_after=False,
                 share_embedding=False,
                 weight_sharing=False):
        super(TransformerDecoder, self).__init__()

        self.normalize_before = normalize_before
        self.weight_sharing = weight_sharing
        self.num_blocks = num_blocks

        self.embedding = nn.Embedding(output_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, pos_dropout_rate)

        if weight_sharing:
            # Build only one block; the forward pass is expected to reuse it
            # self.num_blocks times so that all layers share the same parameters.
            num_blocks = 1
        self.blocks = nn.ModuleList([
            TransformerDecoderLayer(attention_heads,
                                    d_model,
                                    linear_units,
                                    slf_attn_dropout_rate,
                                    src_attn_dropout_rate,
                                    ffn_dropout_rate,
                                    residual_dropout_rate,
                                    normalize_before=normalize_before,
                                    concat_after=concat_after,
                                    activation=activation)
            for _ in range(num_blocks)
        ])

        if self.normalize_before:
            # Final LayerNorm for the pre-norm (normalize_before=True) configuration.
            self.after_norm = LayerNorm(d_model)

        self.output_layer = nn.Linear(d_model, output_size)

        if share_embedding:
            # Tie the output projection to the input embedding (same trick as in Example 1).
            assert self.embedding.weight.size() == self.output_layer.weight.size()
            self.output_layer.weight = self.embedding.weight
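
A minimal, self-contained sketch of the weight_sharing idea above: when weight_sharing is True only a single decoder block is instantiated, and the forward pass is assumed (it is not shown here) to reuse that block self.num_blocks times, so every "layer" shares the same parameters. torch.nn.TransformerDecoderLayer stands in for the project's own TransformerDecoderLayer.

import torch
import torch.nn as nn

d_model, nhead, num_blocks = 256, 4, 6   # hypothetical sizes
weight_sharing = True

def make_layer():
    # Stand-in for the project's TransformerDecoderLayer.
    return nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead,
                                      dim_feedforward=2048, batch_first=True)

# One shared block vs. num_blocks independent blocks.
blocks = nn.ModuleList([make_layer()] if weight_sharing
                       else [make_layer() for _ in range(num_blocks)])

tgt = torch.randn(2, 10, d_model)     # (batch, target_len, d_model)
memory = torch.randn(2, 20, d_model)  # (batch, source_len, d_model)

x = tgt
for i in range(num_blocks):
    # With sharing, every iteration applies the same block (blocks[0]).
    x = blocks[0 if weight_sharing else i](x, memory)

print(x.shape)  # torch.Size([2, 10, 256])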