def __init__(
    self,
    embed_dim,
    num_heads,
    kdim=None,
    vdim=None,
    dropout=0.0,
    bias=True,
    add_bias_kv=False,
    add_zero_attn=False,
    self_attention=False,
    encoder_decoder_attention=False,
    q_noise=0.0,
    qn_block_size=8,
):
    super().__init__()
    self.embed_dim = embed_dim
    self.kdim = kdim if kdim is not None else embed_dim
    self.vdim = vdim if vdim is not None else embed_dim
    self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

    self.num_heads = num_heads
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__
    )

    self.head_dim = embed_dim // num_heads
    assert (
        self.head_dim * num_heads == self.embed_dim
    ), "embed_dim must be divisible by num_heads"
    self.scaling = self.head_dim ** -0.5

    self.self_attention = self_attention
    self.encoder_decoder_attention = encoder_decoder_attention

    assert not self.self_attention or self.qkv_same_dim, (
        "Self-attention requires query, key and value to be of the same size"
    )

    # Query/key/value and output projections, optionally wrapped with
    # quantization noise for Quant-Noise training.
    self.k_proj = quant_noise(
        nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.v_proj = quant_noise(
        nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.q_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )
    self.out_proj = quant_noise(
        nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
    )

    if add_bias_kv:
        self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
        self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
    else:
        self.bias_k = self.bias_v = None

    self.add_zero_attn = add_zero_attn

    self.reset_parameters()

    self.onnx_trace = False
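
# Illustrative sketch (not from the original source): with typical sizes the
# per-head arithmetic above works out as follows. The concrete numbers are
# assumed example values, not anything fixed by this module.
_example_embed_dim, _example_num_heads = 512, 8
_example_head_dim = _example_embed_dim // _example_num_heads     # 64
_example_scaling = _example_head_dim ** -0.5                     # ~0.125, applied to queries
assert _example_head_dim * _example_num_heads == _example_embed_dim  # divisibility requirement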
def __init__(self, weights, input_dim, num_classes, q_noise, qn_block_size):
    super().__init__()
    tied_emb, _ = weights
    self.num_words, emb_dim = tied_emb.size()

    # Project onto the tied embedding matrix; add an extra linear projection
    # when the model dimension does not match the embedding dimension.
    self.word_proj = quant_noise(
        TiedLinear(tied_emb, transpose=False), q_noise, qn_block_size
    )
    if input_dim != emb_dim:
        self.word_proj = nn.Sequential(
            quant_noise(
                nn.Linear(input_dim, emb_dim, bias=False), q_noise, qn_block_size
            ),
            self.word_proj,
        )

    self.class_proj = quant_noise(
        nn.Linear(input_dim, num_classes, bias=False), q_noise, qn_block_size
    )
    self.out_dim = self.num_words + num_classes

    self.register_buffer('_float_tensor', torch.FloatTensor(1))
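
# Illustrative sketch (assumed example values): the head output concatenates
# one logit per head-band word with one logit per tail cluster, hence
# out_dim = num_words + num_classes.
_example_num_words, _example_num_classes = 20000, 3
_example_out_dim = _example_num_words + _example_num_classes     # 20003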
def __init__(self, vocab_size, input_dim, cutoff, dropout, factor=4.,
             adaptive_inputs=None, tie_proj=False, q_noise=0, qn_block_size=8):
    super().__init__()

    if vocab_size > cutoff[-1]:
        cutoff = cutoff + [vocab_size]
    else:
        assert vocab_size == cutoff[-1], \
            'cannot specify cutoff larger than vocab size'

    # The head predicts the first (most frequent) band plus one slot per tail cluster.
    output_dim = cutoff[0] + len(cutoff) - 1

    self.vocab_size = vocab_size
    self.cutoff = cutoff
    self.dropout_module = FairseqDropout(
        dropout, module_name=self.__class__.__name__
    )
    self.input_dim = input_dim
    self.factor = factor
    self.q_noise = q_noise
    self.qn_block_size = qn_block_size

    self.lsm = nn.LogSoftmax(dim=1)

    if adaptive_inputs is not None:
        # Tie the head's word weights to the adaptive input embeddings.
        self.head = TiedHeadModule(
            adaptive_inputs.weights_for_band(0),
            input_dim, len(cutoff) - 1, self.q_noise, self.qn_block_size,
        )
    else:
        self.head = quant_noise(
            nn.Linear(input_dim, output_dim, bias=False),
            self.q_noise, self.qn_block_size,
        )

    self._make_tail(adaptive_inputs, tie_proj)

    def init_weights(m):
        if hasattr(m, 'weight') and not isinstance(m, TiedLinear) \
                and not isinstance(m, TiedHeadModule):
            nn.init.xavier_uniform_(m.weight)

    self.apply(init_weights)

    self.register_buffer('version', torch.LongTensor([1]))
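
# Illustrative sketch (assumed example values): the cutoff list partitions the
# vocabulary into a frequent head band plus tail clusters, and the head layer's
# output covers the head words plus one slot per tail cluster.
_example_vocab_size = 50000
_example_cutoff = [10000, 30000]                            # vocab_size > cutoff[-1]
_example_cutoff = _example_cutoff + [_example_vocab_size]   # -> [10000, 30000, 50000]
_example_output_dim = _example_cutoff[0] + len(_example_cutoff) - 1   # 10000 + 2 = 10002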
def _make_tail(self, adaptive_inputs=None, tie_proj=False):
    self.tail = nn.ModuleList()
    for i in range(len(self.cutoff) - 1):
        # Each tail cluster projects down by an additional power of `factor`.
        dim = int(self.input_dim // self.factor ** (i + 1))

        tied_emb, tied_proj = adaptive_inputs.weights_for_band(i + 1) \
            if adaptive_inputs is not None else (None, None)

        if tied_proj is not None:
            if tie_proj:
                proj = quant_noise(
                    TiedLinear(tied_proj, transpose=True),
                    self.q_noise, self.qn_block_size,
                )
            else:
                proj = quant_noise(
                    nn.Linear(tied_proj.size(0), tied_proj.size(1), bias=False),
                    self.q_noise, self.qn_block_size,
                )
        else:
            proj = quant_noise(
                nn.Linear(self.input_dim, dim, bias=False),
                self.q_noise, self.qn_block_size,
            )

        if tied_emb is None:
            out_proj = nn.Linear(
                dim, self.cutoff[i + 1] - self.cutoff[i], bias=False
            )
        else:
            out_proj = TiedLinear(tied_emb, transpose=False)

        m = nn.Sequential(
            proj,
            nn.Dropout(self.dropout_module.p),
            quant_noise(out_proj, self.q_noise, self.qn_block_size),
        )

        self.tail.append(m)
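
# Illustrative sketch (assumed example values): with input_dim = 512 and
# factor = 4, the tail clusters use progressively smaller projections.
_example_input_dim, _example_factor = 512, 4.
_example_tail_dims = [int(_example_input_dim // _example_factor ** (i + 1))
                      for i in range(2)]   # [128, 32] for two tail clusters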
def __init__(
    self,
    vocab_size: int,
    padding_idx: int,
    initial_dim: int,
    factor: float,
    output_dim: int,
    cutoff: List[int],
    q_noise: float = 0,
    qn_block_size: int = 8,
):
    super().__init__()

    if vocab_size > cutoff[-1]:
        cutoff = cutoff + [vocab_size]
    else:
        assert vocab_size == cutoff[-1], \
            'cannot specify cutoff larger than vocab size'

    self.cutoff = cutoff
    self.embedding_dim = output_dim
    self.padding_idx = padding_idx

    self.embeddings = nn.ModuleList()
    for i in range(len(self.cutoff)):
        prev = self.cutoff[i - 1] if i > 0 else 0
        size = self.cutoff[i] - prev
        dim = int(initial_dim // (factor ** i))
        seq = nn.Sequential(
            nn.Embedding(size, dim, self.padding_idx),
            quant_noise(
                nn.Linear(dim, output_dim, bias=False), q_noise, qn_block_size
            ),
        )

        self.embeddings.append(seq)
        # Only the first (most frequent) band keeps the padding index.
        self.padding_idx = None
    self.padding_idx = padding_idx

    def init_weights(m):
        if isinstance(m, nn.Embedding):
            nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5)
            nn.init.constant_(m.weight[padding_idx], 0)
        elif hasattr(m, 'weight'):
            nn.init.xavier_uniform_(m.weight)

    self.apply(init_weights)

    self.register_buffer('_float_tensor', torch.FloatTensor(1))
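
# Illustrative sketch (assumed example values): each band embeds its slice of
# the vocabulary at a dimension reduced by a power of `factor`, then projects
# back up to output_dim.
_example_vocab, _example_initial_dim, _example_factor = 50000, 1024, 4.
_example_cutoffs = [10000, 30000, 50000]
_example_band_sizes = [10000, 20000, 20000]                        # cutoff[i] - cutoff[i-1]
_example_band_dims = [int(_example_initial_dim // (_example_factor ** i))
                      for i in range(3)]                           # [1024, 256, 64]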
def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
    return quant_noise(
        nn.Linear(input_dim, output_dim), q_noise, qn_block_size
    )