Example #1
    def extract_features_aligned_to_words(
            self,
            sentence: str,
            return_all_hiddens: bool = False):
        """Extract RoBERTa features, aligned to spaCy's word-level tokenizer.

        Returns a spaCy ``Doc`` whose tokens expose their aligned feature
        vector through ``token.vector``.
        """
        from fairseq.models.roberta import alignment_utils
        from spacy.tokens import Doc

        nlp = alignment_utils.spacy_nlp()
        tokenizer = alignment_utils.spacy_tokenizer()

        # tokenize both with GPT-2 BPE and spaCy
        bpe_toks = self.encode(sentence)
        spacy_toks = tokenizer(sentence)
        spacy_toks_ws = [t.text_with_ws for t in spacy_toks]
        alignment = alignment_utils.align_bpe_to_words(self, bpe_toks,
                                                       spacy_toks_ws)

        # extract features and align them
        features = self.extract_features(bpe_toks,
                                         return_all_hiddens=return_all_hiddens)
        features = features.squeeze(0)
        aligned_feats = alignment_utils.align_features_to_words(
            self, features, alignment)

        # wrap in spaCy Doc
        doc = Doc(
            nlp.vocab,
            words=["<s>"] + [x.text for x in spacy_toks] + ["</s>"],
            spaces=[True] + [x.endswith(" ")
                             for x in spacy_toks_ws[:-1]] + [True, False],
        )
        assert len(doc) == aligned_feats.size(0)
        doc.user_token_hooks["vector"] = lambda token: aligned_feats[token.i]
        return doc
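
A minimal usage sketch for Example #1, assuming fairseq and spaCy are installed; the hub model name and the sample sentence below are only illustrative:

import torch

# Load a pretrained RoBERTa hub interface (weights are downloaded on first use).
roberta = torch.hub.load("pytorch/fairseq", "roberta.base")
roberta.eval()  # disable dropout so the extracted features are deterministic

doc = roberta.extract_features_aligned_to_words("I said, 'hello RoBERTa.'")
for tok in doc:
    # each spaCy token (including <s> and </s>) carries its aligned feature vector
    print(tok.text, tok.vector.shape)
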
Example #2
    def extract_attention_to_words(self, sentence: str, sentence2: str,
                                   features):
        """Align BPE-level attention over `sentence2` to spaCy word level.

        `features` holds one attention value per BPE token of the encoded
        sentence pair; returns the whitespace-preserving spaCy tokens of
        `sentence2` together with the normalized word-level attention.
        """
        from fairseq.models.roberta import alignment_utils

        nlp = alignment_utils.spacy_nlp()
        tokenizer = alignment_utils.spacy_tokenizer()

        # encode the sentence pair with GPT-2 BPE: <s> s1 </s> </s> s2 </s>
        bpe_toks = self.encode(sentence, sentence2)
        s1_bpe_len = len(self.encode(sentence))
        s2_bpe_len = len(self.encode(sentence2))
        # keep <s> plus the BPE tokens of the second sentence (incl. its </s>)
        s2_bpe_toks = torch.cat([bpe_toks[0:1], bpe_toks[s1_bpe_len + 1:]],
                                dim=0)
        # slice the attention values to the same positions and add a feature
        # dimension, since the alignment utilities expect a 2-D tensor
        features = torch.cat(
            [features[0:1], features[s1_bpe_len + 1:s1_bpe_len + s2_bpe_len]],
            dim=0)
        features = features[:, None]
        # tokenize only the second sentence with spaCy, keeping whitespace
        spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence2)]
        assert features.size(0) == len(s2_bpe_toks)
        alignment = alignment_utils.align_bpe_to_words(self, s2_bpe_toks,
                                                       spacy_toks_ws)
        aligned_attn = alignment_utils.align_features_to_words(
            self, features, alignment)
        # drop the <s>/</s> positions and renormalize so the word-level
        # attention sums to one (a softmax over the scores would be an
        # alternative normalization)
        aligned_attn = aligned_attn.squeeze()[1:-1]
        aligned_attn = aligned_attn / torch.sum(aligned_attn)
        assert aligned_attn.size(0) == len(spacy_toks_ws)
        return spacy_toks_ws, aligned_attn
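
A similar sketch for Example #2, assuming the method above has been added to the same RoBERTa hub interface (it is not part of stock fairseq). The attention values passed in are random placeholders, since how they are obtained (e.g. from an attention head) is model-specific and not shown in the snippet:

import torch

roberta = torch.hub.load("pytorch/fairseq", "roberta.base")
roberta.eval()

sent1 = "The cat sat on the mat."
sent2 = "A cat is sitting."

# One attention weight per BPE token of the encoded sentence pair (placeholders).
attn_over_bpe = torch.rand(len(roberta.encode(sent1, sent2)))

words, word_attn = roberta.extract_attention_to_words(sent1, sent2, attn_over_bpe)
for word, attn in zip(words, word_attn):
    print(f"{word.strip():<10s} {attn.item():.3f}")
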