Example #1
def get_position_emb(num, dim):
    # Build `num` sinusoidal positional embeddings of width `dim` as a numpy
    # array, skipping the two positions fairseq reserves at the front.
    from fairseq.modules.sinusoidal_positional_embedding import SinusoidalPositionalEmbedding
    return SinusoidalPositionalEmbedding.get_embedding(
        num + 2, dim, 1).detach().numpy()[2:]
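The `+ 2` and the `[2:]` slice account for the positions fairseq reserves at the start of its sinusoidal table (indices up to `padding_idx + 1`). A minimal usage sketch, assuming fairseq is installed; the sizes are illustrative only:

pos = get_position_emb(128, 1024)
print(pos.shape)  # expected: (128, 1024) numpy array of sinusoidal embeddings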
Example #2
    def convert_model_weight(self, opt: Opt) -> Dict[str, Any]:
        """
        Convert state_dict between fairseq and ParlAI.

        :param opt:
            ParlAI opt

        :return state_dict:
            a state dict ready to be loaded into the ParlAI model.
        """
        # deal with embeddings
        state = self.state
        agent = self.agent
        state_dict = state['model']
        return_dict = OrderedDict()
        for each_key in state_dict.keys():
            mapped_key = each_key
            if mapped_key == 'encoder.version' or mapped_key == 'decoder.version':
                continue

            # 1. replace if embedding
            for emb in EMBEDDING_DICT_MAPPING:
                mapped_key = mapped_key.replace(emb,
                                                EMBEDDING_DICT_MAPPING[emb])

            # 2. Replace attention
            if 'encoder' in each_key and 'self_attn' in each_key:
                mapped_key = mapped_key.replace('self_attn', 'attention')
            elif 'decoder' in each_key and 'self_attn' in each_key:
                mapped_key = mapped_key.replace('self_attn', 'self_attention')
            elif 'decoder' in each_key and 'encoder_attn' in each_key:
                mapped_key = mapped_key.replace('encoder_attn',
                                                'encoder_attention')

            # 3. Replace multihead linear layers
            #    fairseq sometimes fuses the Q, K and V projections into a single weight
            if 'in_proj_weight' in mapped_key or 'in_proj_bias' in mapped_key:
                for weightorbias in {'weight', 'bias'}:
                    attention_project_name = 'in_proj_{}'.format(weightorbias)
                    if attention_project_name in mapped_key:
                        weight = state_dict[each_key]
                        size = int(weight.size(0) / 3)
                        weights = weight.split(size, 0)
                        # For Q, K, V in order
                        return_dict[mapped_key.replace(
                            attention_project_name,
                            'q_lin.{}'.format(weightorbias))] = weights[0]
                        return_dict[mapped_key.replace(
                            attention_project_name,
                            'k_lin.{}'.format(weightorbias))] = weights[1]
                        return_dict[mapped_key.replace(
                            attention_project_name,
                            'v_lin.{}'.format(weightorbias))] = weights[2]
                continue
            elif ('v_proj' in mapped_key or 'k_proj' in mapped_key
                  or 'q_proj' in mapped_key):
                mapped_key = mapped_key.replace('v_proj', 'v_lin')
                mapped_key = mapped_key.replace('q_proj', 'q_lin')
                mapped_key = mapped_key.replace('k_proj', 'k_lin')

            # 4. Replace FFN layers
            for old, new in FFN_MAPPING.items():
                mapped_key = mapped_key.replace(old, new)

            # 5. Fix layer norms
            if 'encoder.' in mapped_key:
                mapped_key = mapped_key.replace('attention_layer_norm',
                                                'norm1')
                mapped_key = mapped_key.replace('final_layer_norm', 'norm2')
            else:
                mapped_key = mapped_key.replace('self_attention_layer_norm',
                                                'norm1')
                mapped_key = mapped_key.replace('encoder_attention_layer_norm',
                                                'norm2')
                mapped_key = mapped_key.replace('final_layer_norm', 'norm3')

            for _key in ['encoder', 'decoder']:
                mapped_key = mapped_key.replace(f'{_key}.layer_norm',
                                                f'{_key}.norm_embeddings')
                mapped_key = mapped_key.replace(f'{_key}.layernorm_embedding',
                                                f'{_key}.norm_embeddings')

            weight = state_dict[each_key]
            return_dict[mapped_key] = weight

        # 6. Shuffle embedding matrix given dictionary.
        enc_emb_key = 'encoder.embeddings.weight'
        bart_dict = os.path.join(opt['datapath'],
                                 'models/bart/bart.large/dict.txt')
        with PathManager.open(bart_dict) as f:
            offset_dict = {
                i: l.split()[0]
                for i, l in enumerate(f.readlines())
            }
        new_embs = return_dict[enc_emb_key].clone()
        for idx, new_idx in offset_dict.items():
            try:
                new_embs[int(new_idx) + 4] = return_dict[enc_emb_key][idx + 4]
            except ValueError:
                # new_idx is not an int, e.g. one of fairseq's madeupword tokens
                if 'madeupword' in new_idx:
                    pad_idx = int(new_idx.split('madeupword')[1])
                    new_embs[-(4 - pad_idx)] = return_dict[
                        'encoder.embeddings.weight'][idx + 4]
        return_dict['encoder.embeddings.weight'] = new_embs

        # 7. Swap special tokens
        #    Fairseq swaps the bos and eos token order for seq2seq models.
        #
        #   ParlAI s2s models expect:
        #       Encoder: TOKENS </s>
        #       Decoder: <s> TOKENS </s>
        #   Fairseq models get:
        #       Encoder: TOKENS </s>
        #       Decoder: </s> TOKENS </s>
        #
        #   So we reorder the first four embedding rows from fairseq's
        #   [<s>, <pad>, </s>, <unk>] to ParlAI's [<pad>, <s>, </s>, <unk>]
        #   and, unless opt['retain_bos_emb'] is set, reuse the </s> embedding
        #   for <s> so that ParlAI's start token matches fairseq's.
        #
        size_dict = return_dict[enc_emb_key].size(0)
        if size_dict == len(agent.dict) + 1 and '<mask>' not in agent.dict:
            return_dict[enc_emb_key] = return_dict[enc_emb_key][:size_dict - 1, :]
            size_dict -= 1
        specials, words = return_dict[enc_emb_key].split([4, size_dict - 4], 0)
        bos, pad, eos, unk = specials
        if not self.opt['retain_bos_emb']:
            bos = eos
        specials = torch.stack([pad, bos, eos, unk])
        fp16_pad = (8 - (len(specials) + len(words)) % 8) % 8
        fp16_pad_ez = torch.zeros(fp16_pad, specials.size(1)).type_as(specials)
        return_dict[enc_emb_key] = torch.cat(
            [
                specials,  # special tokens
                words,  # word embeddings
                fp16_pad_ez,  # fp16 requires embeddings size to be a multiple of 8
            ],
            0,
        )

        return_dict['decoder.embeddings.weight'] = return_dict[enc_emb_key]
        return_dict['embeddings.weight'] = return_dict[enc_emb_key]

        # 8. Positional Embeddings
        if 'encoder.position_embeddings.weight' in return_dict:
            return_dict['encoder.position_embeddings.weight'] = return_dict[
                'encoder.position_embeddings.weight'][2:, :]
            return_dict['decoder.position_embeddings.weight'] = return_dict[
                'decoder.position_embeddings.weight'][2:, :]
        else:
            # sinusoidal embeddings
            from fairseq.modules.sinusoidal_positional_embedding import (
                SinusoidalPositionalEmbedding, )

            emb = SinusoidalPositionalEmbedding.get_embedding(
                128 + 2, opt['embedding_size'], 1)
            del return_dict['encoder.position_embeddings._float_tensor']
            del return_dict['decoder.position_embeddings._float_tensor']

            return_dict['encoder.position_embeddings.weight'] = emb[2:]
            return_dict['decoder.position_embeddings.weight'] = emb[2:]

        return_dict['START'] = torch.LongTensor([1])  # type: ignore
        return return_dict
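Step 3 above, splitting fairseq's fused in_proj parameters into separate Q/K/V linears, is the least obvious part of the conversion. A self-contained sketch of the same idea, using illustrative names and sizes rather than the converter's own:

import torch

d_model = 8
# fairseq may store the fused Q/K/V projection as a single (3 * d_model, d_model) weight
in_proj_weight = torch.randn(3 * d_model, d_model)

# split along dim 0 into equal chunks; fairseq orders them Q, K, V
q_w, k_w, v_w = in_proj_weight.split(in_proj_weight.size(0) // 3, 0)
assert q_w.shape == (d_model, d_model)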
Example #3
    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        normalized_attention=False,
        normalized_attention_logsoftmax=False,
        normalized_attention_by_entropy=False,
        positional_embeddings_in_attention=False,
        symmetric_kv_context_params=False,
        symmetric_kv_positional_params=False,
        #normalized_attention_by_positional_score=False,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__)
        self.positional_embeddings_in_attention = positional_embeddings_in_attention
        self.symmetric_kv_context_params = symmetric_kv_context_params
        self.symmetric_kv_positional_params = symmetric_kv_positional_params
        #self.normalized_attention_by_positional_score=normalized_attention_by_positional_score

        self.head_dim = embed_dim // num_heads
        assert (self.head_dim * num_heads == self.embed_dim
                ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and "
            "value to be of the same size")

        self.k_proj = quant_noise(nn.Linear(self.kdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.v_proj = quant_noise(nn.Linear(self.vdim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        self.q_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                  q_noise, qn_block_size)
        if self.symmetric_kv_context_params:
            assert self.kdim == embed_dim, (
                "Symmetric context attention requires kdim == embed_dim")
            self.q_proj.weight = self.k_proj.weight

        self.out_proj = quant_noise(nn.Linear(embed_dim, embed_dim, bias=bias),
                                    q_noise, qn_block_size)

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.normalized_attention = normalized_attention
        self.normalized_attention_logsoftmax = normalized_attention_logsoftmax
        self.normalized_attention_by_entropy = normalized_attention_by_entropy
        if self.normalized_attention:
            self.attention_gain = quant_noise(
                nn.Linear(embed_dim, num_heads, bias=True), q_noise,
                qn_block_size)

        if self.positional_embeddings_in_attention:
            self.pos_k_proj = quant_noise(
                nn.Linear(self.kdim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            self.pos_q_proj = quant_noise(
                nn.Linear(embed_dim, embed_dim, bias=bias), q_noise,
                qn_block_size)
            if self.symmetric_kv_positional_params:
                assert self.kdim == embed_dim, (
                    "Symmetric positional attention requires kdim == embed_dim"
                )
                self.pos_q_proj.weight = self.pos_k_proj.weight
            self.pos_embeddings = SinusoidalPositionalEmbedding(
                embed_dim, None)

        self.reset_parameters()

        self.onnx_trace = False
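The snippet only shows the constructor, so the class name is not visible. Assuming it is a fairseq MultiheadAttention variant, a hypothetical instantiation for encoder self-attention might look like the following; the class name and sizes are placeholders, not from the source:

attn = MultiheadAttentionVariant(  # hypothetical name for the class above
    embed_dim=512,      # must be divisible by num_heads
    num_heads=8,
    dropout=0.1,
    self_attention=True,
    positional_embeddings_in_attention=True,
)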