Example #1
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        self.onnx_trace = False
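Example #1 wraps every attention projection in quant_noise. A minimal stand-alone sketch of that call, assuming fairseq is installed; the layer sizes, p value and block size below are illustrative, not taken from the example:

import torch
import torch.nn as nn
from fairseq.modules.quant_noise import quant_noise

# Wrap a projection the same way the attention module above wraps q/k/v/out.
# p is the fraction of weight blocks dropped during training; block_size must
# evenly divide the weight dimensions.
proj = quant_noise(nn.Linear(512, 512, bias=True), 0.1, 8)

x = torch.randn(4, 512)
proj.train()
y_train = proj(x)   # forward pre-hook masks random weight blocks in training mode
proj.eval()
y_eval = proj(x)    # no noise is applied at evaluation time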
Example #2
    def __init__(self, weights, input_dim, num_classes, q_noise,
                 qn_block_size):
        super().__init__()
        tied_emb, _ = weights
        self.num_words, emb_dim = tied_emb.size()

        self.word_proj = quant_noise(TiedLinear(tied_emb, transpose=False),
                                     q_noise, qn_block_size)
        if input_dim != emb_dim:
            self.word_proj = nn.Sequential(
                quant_noise(nn.Linear(input_dim, emb_dim, bias=False), q_noise,
                            qn_block_size),
                self.word_proj,
            )

        self.class_proj = quant_noise(
            nn.Linear(input_dim, num_classes, bias=False), q_noise,
            qn_block_size)
        self.out_dim = self.num_words + num_classes

        self.register_buffer('_float_tensor', torch.FloatTensor(1))
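The word projection above reuses the tied embedding matrix as its output weights. A small sketch of that tied-weight idea in plain PyTorch; the shapes are made up for illustration:

import torch
import torch.nn.functional as F

tied_emb = torch.randn(1000, 256)    # hypothetical embedding: (num_words, emb_dim)
x = torch.randn(8, 256)              # decoder state already projected to emb_dim
word_scores = F.linear(x, tied_emb)  # (8, 1000): roughly what TiedLinear(tied_emb,
                                     # transpose=False) computes, minus quant_noise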
Example #3
    def __init__(self,
                 vocab_size,
                 input_dim,
                 cutoff,
                 dropout,
                 factor=4.,
                 adaptive_inputs=None,
                 tie_proj=False,
                 q_noise=0,
                 qn_block_size=8):
        super().__init__()

        if vocab_size > cutoff[-1]:
            cutoff = cutoff + [vocab_size]
        else:
            assert vocab_size == cutoff[-1], \
                'cannot specify cutoff larger than vocab size'

        output_dim = cutoff[0] + len(cutoff) - 1

        self.vocab_size = vocab_size
        self.cutoff = cutoff
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)
        self.input_dim = input_dim
        self.factor = factor
        self.q_noise = q_noise
        self.qn_block_size = qn_block_size

        self.lsm = nn.LogSoftmax(dim=1)

        if adaptive_inputs is not None:
            self.head = TiedHeadModule(adaptive_inputs.weights_for_band(0),
                                       input_dim,
                                       len(cutoff) - 1, self.q_noise,
                                       self.qn_block_size)
        else:
            self.head = quant_noise(
                nn.Linear(input_dim, output_dim, bias=False), self.q_noise,
                self.qn_block_size)

        self._make_tail(adaptive_inputs, tie_proj)

        def init_weights(m):
            if hasattr(m, 'weight') and not isinstance(
                    m, (TiedLinear, TiedHeadModule)):
                nn.init.xavier_uniform_(m.weight)

        self.apply(init_weights)

        self.register_buffer('version', torch.LongTensor([1]))
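The head size computed above is easiest to see with concrete numbers; the values below are illustrative:

vocab_size = 50000
cutoff = [10000, 30000]                     # frequency-ordered band boundaries
if vocab_size > cutoff[-1]:
    cutoff = cutoff + [vocab_size]          # -> [10000, 30000, 50000]
output_dim = cutoff[0] + len(cutoff) - 1    # 10000 head words + 2 cluster logits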
Example #4
    def _make_tail(self, adaptive_inputs=None, tie_proj=False):
        self.tail = nn.ModuleList()
        for i in range(len(self.cutoff) - 1):
            dim = int(self.input_dim // self.factor**(i + 1))

            tied_emb, tied_proj = adaptive_inputs.weights_for_band(i + 1) \
                if adaptive_inputs is not None else (None, None)

            if tied_proj is not None:
                if tie_proj:
                    proj = quant_noise(TiedLinear(tied_proj, transpose=True),
                                       self.q_noise, self.qn_block_size)
                else:
                    proj = quant_noise(
                        nn.Linear(tied_proj.size(0),
                                  tied_proj.size(1),
                                  bias=False), self.q_noise,
                        self.qn_block_size)
            else:
                proj = quant_noise(nn.Linear(self.input_dim, dim, bias=False),
                                   self.q_noise, self.qn_block_size)

            if tied_emb is None:
                out_proj = nn.Linear(dim,
                                     self.cutoff[i + 1] - self.cutoff[i],
                                     bias=False)
            else:
                out_proj = TiedLinear(tied_emb, transpose=False)

            m = nn.Sequential(
                proj,
                nn.Dropout(self.dropout_module.p),
                quant_noise(out_proj, self.q_noise, self.qn_block_size),
            )

            self.tail.append(m)
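Each tail band projects the input down by a growing power of factor before predicting its slice of the vocabulary. With hypothetical sizes:

input_dim, factor = 1024, 4.0
cutoff = [10000, 30000, 50000]
for i in range(len(cutoff) - 1):
    dim = int(input_dim // factor ** (i + 1))
    print(f"band {i}: {input_dim} -> {dim} -> {cutoff[i + 1] - cutoff[i]} words")
# band 0: 1024 -> 256 -> 20000 words
# band 1: 1024 -> 64 -> 20000 words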
Example #5
    def __init__(
        self,
        vocab_size: int,
        padding_idx: int,
        initial_dim: int,
        factor: float,
        output_dim: int,
        cutoff: List[int],
        q_noise: float = 0,
        qn_block_size: int = 8,
    ):
        super().__init__()

        if vocab_size > cutoff[-1]:
            cutoff = cutoff + [vocab_size]
        else:
            assert vocab_size == cutoff[-1], \
                'cannot specify cutoff larger than vocab size'

        self.cutoff = cutoff
        self.embedding_dim = output_dim
        self.padding_idx = padding_idx

        self.embeddings = nn.ModuleList()
        for i in range(len(self.cutoff)):
            prev = self.cutoff[i - 1] if i > 0 else 0
            size = self.cutoff[i] - prev
            dim = int(initial_dim // (factor**i))
            seq = nn.Sequential(
                nn.Embedding(size, dim, self.padding_idx),
                quant_noise(nn.Linear(dim, output_dim, bias=False), q_noise,
                            qn_block_size),
            )

            self.embeddings.append(seq)
            # only the first band's embedding keeps padding_idx; later bands
            # cover shifted vocabulary ranges, so it is cleared for them and
            # restored on the module once the loop is done
            self.padding_idx = None
        self.padding_idx = padding_idx

        def init_weights(m):
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1]**-0.5)
                nn.init.constant_(m.weight[padding_idx], 0)
            elif hasattr(m, 'weight'):
                nn.init.xavier_uniform_(m.weight)

        self.apply(init_weights)

        self.register_buffer('_float_tensor', torch.FloatTensor(1))
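Example #5 builds one shrinking embedding per vocabulary band and projects each back to output_dim. A stand-alone sketch of that pattern, without quant_noise; the sizes are illustrative:

import torch.nn as nn

cutoff, initial_dim, factor, output_dim = [10000, 30000, 50000], 1024, 4.0, 1024
bands = nn.ModuleList()
for i, hi in enumerate(cutoff):
    lo = cutoff[i - 1] if i > 0 else 0
    dim = int(initial_dim // factor ** i)        # 1024, 256, 64
    bands.append(nn.Sequential(
        nn.Embedding(hi - lo, dim),              # smaller tables for rarer bands
        nn.Linear(dim, output_dim, bias=False),  # every band ends up in the same space
    ))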
Example #6
    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise,
                           qn_block_size)