Example #1
    def __init__(self, out_channels, embed_dim, num_heads, project_input=False, gated=False, downsample=False):
        super().__init__()
        self.attention = DownsampledMultiHeadAttention(
            out_channels, embed_dim, num_heads, dropout=0, bias=True,
            project_input=project_input, gated=gated, downsample=downsample,
        )
        # separate input projections for query, key and value
        self.in_proj_q = Linear(out_channels, embed_dim)
        self.in_proj_k = Linear(out_channels, embed_dim)
        self.in_proj_v = Linear(out_channels, embed_dim)
        self.ln = nn.LayerNorm(out_channels)
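
For context, a minimal sketch of the matching forward pass, modeled on fairseq's fconv_self_att SelfAttention; the residual connection and the attention call's keyword arguments are assumptions, not part of this snippet:

    def forward(self, x):
        # Hedged sketch: assumes DownsampledMultiHeadAttention accepts the
        # mask_future_timesteps and use_scalar_bias keyword arguments.
        residual = x
        query = self.in_proj_q(x)
        key = self.in_proj_k(x)
        value = self.in_proj_v(x)
        # causal multi-head attention over the projected inputs
        x, _ = self.attention(query, key, value, mask_future_timesteps=True, use_scalar_bias=True)
        # residual connection followed by layer normalization
        return self.ln(x + residual)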
Example #2
    def __init__(
        self, dictionary, embed_dim=512, out_embed_dim=256, max_positions=1024,
        convolutions=((512, 3),) * 8, attention=True, dropout=0.1,
        selfattention=False, attention_nheads=1, selfattention_nheads=1,
        project_input=False, gated_attention=False, downsample=False,
        pretrained=False, trained_decoder=None, left_pad=False,
    ):
        super().__init__(dictionary)
        self.register_buffer('version', torch.Tensor([2]))
        self.pretrained = pretrained
        self.pretrained_decoder = trained_decoder
        self.dropout = dropout
        self.left_pad = left_pad
        self.need_attn = True
        in_channels = convolutions[0][0]

        def expand_bool_array(val):
            if isinstance(val, bool):
                # expand True into [True, True, ...] and do the same with False
                return [val] * len(convolutions)
            return val

        attention = expand_bool_array(attention)
        selfattention = expand_bool_array(selfattention)

        if not isinstance(attention, list) or len(attention) != len(convolutions):
            raise ValueError('Attention is expected to be a list of booleans of '
                             'length equal to the number of layers.')

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
            left_pad=self.left_pad,
        )

        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()
        self.selfattention = nn.ModuleList()
        self.attproj = nn.ModuleList()
        for i, (out_channels, kernel_size) in enumerate(convolutions):
            self.projections.append(
                Linear(in_channels, out_channels) if in_channels != out_channels else None
            )
            self.convolutions.append(
                LinearizedConv1d(
                    in_channels, out_channels * 2, kernel_size,
                    padding=(kernel_size - 1), dropout=dropout,
                )
            )

            self.attention.append(
                DownsampledMultiHeadAttention(
                    out_channels, embed_dim, attention_nheads,
                    project_input=project_input, gated=False, downsample=False,
                ) if attention[i] else None
            )

            self.attproj.append(
                Linear(out_channels, embed_dim, dropout=dropout) if attention[i] else None
            )
            self.selfattention.append(
                SelfAttention(
                    out_channels, embed_dim, selfattention_nheads,
                    project_input=project_input, gated=gated_attention,
                    downsample=downsample,
                ) if selfattention[i] else None
            )
            in_channels = out_channels

        self.fc2 = Linear(in_channels, out_embed_dim)
        self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)

        # model fusion
        if self.pretrained:
            # independent gates are learned from the concatenated input
            self.gate1 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid())
            self.gate2 = nn.Sequential(Linear(out_embed_dim*2, out_embed_dim), nn.Sigmoid())
            # pretrained and trained models are joined
            self.joining = nn.Sequential(
                Linear(out_embed_dim*2, out_embed_dim*2),
                nn.LayerNorm(out_embed_dim*2),
                nn.GLU(),
                Linear(out_embed_dim, out_embed_dim*2),
                nn.LayerNorm(out_embed_dim*2),
                nn.GLU(),
                Linear(out_embed_dim, out_embed_dim),
                nn.LayerNorm(out_embed_dim)
            )
            # the pretrained model ends in an output layer mapping nhid -> vocab size,
            # but the two models are combined in their hidden states, so a forward
            # hook stores the hidden output of the pretrained model's forward pass
            self.pretrained_outputs = {}

            def save_output():
                def hook(module, inputs, output):
                    # capture the pretrained decoder's fc2 output on every forward
                    self.pretrained_outputs["out"] = output
                return hook

            self.pretrained_decoder.fc2.register_forward_hook(save_output())
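
For context, a minimal sketch of how the fusion components above come together at the end of the decoder's forward pass, modeled on fairseq's fconv_self_att decoder; the helper name _fuse is hypothetical and the exact wiring is an assumption:

    def _fuse(self, x):
        # Hedged sketch (hypothetical helper): fuse the trained decoder's
        # hidden state x with the pretrained decoder's captured hidden state.
        pretrained_x = self.pretrained_outputs["out"]  # stored by the hook above
        y = torch.cat([x, pretrained_x], dim=-1)
        gated_x1 = self.gate1(y) * x             # gate the trained path
        gated_x2 = self.gate2(y) * pretrained_x  # gate the pretrained path
        fusion = self.joining(torch.cat([gated_x1, gated_x2], dim=-1))
        return self.fc3(fusion)                  # project fused state to the vocabulary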