コード例 #1
0
    def extract_features_aligned_to_words(
            self,
            sentence: str,
            return_all_hiddens: bool = False) -> torch.Tensor:
        """Extract RoBERTa features, aligned to spaCy's word-level tokenizer.

        Args:
            sentence: Text to encode and tokenize.
            return_all_hiddens: Forwarded unchanged to
                ``self.extract_features``.

        Returns:
            A spaCy ``Doc`` (not a bare tensor, despite the annotation) whose
            words are ``<s>``, the spaCy tokens of ``sentence`` and ``</s>``.
            The doc's ``"vector"`` token hook returns, for each token, the row
            of the aligned feature matrix at that token's index.

        Raises:
            AssertionError: If the number of aligned feature rows differs
                from the number of tokens in the resulting ``Doc``.
        """
        from fairseq.models.roberta import alignment_utils
        from spacy.tokens import Doc

        nlp = alignment_utils.spacy_nlp()
        tokenizer = alignment_utils.spacy_tokenizer()

        # Tokenize with GPT-2 BPE and with spaCy. spaCy runs only once: the
        # plain and the whitespace-preserving token texts are both read from
        # the same token sequence.
        bpe_toks = self.encode(sentence)
        spacy_toks = tokenizer(sentence)
        words = [tok.text for tok in spacy_toks]
        words_ws = [tok.text_with_ws for tok in spacy_toks]
        alignment = alignment_utils.align_bpe_to_words(self, bpe_toks,
                                                       words_ws)

        # Extract features and align them to the spaCy words.
        features = self.extract_features(bpe_toks,
                                         return_all_hiddens=return_all_hiddens)
        # Drop the leading size-1 dimension (presumably the batch axis).
        features = features.squeeze(0)
        aligned_feats = alignment_utils.align_features_to_words(
            self, features, alignment)

        # Wrap in a spaCy Doc. "<s>" and the last real word are followed by a
        # space, "</s>" is not; every other word keeps its own trailing
        # whitespace flag.
        doc = Doc(
            nlp.vocab,
            words=["<s>"] + words + ["</s>"],
            spaces=[True] + [ws.endswith(" ")
                             for ws in words_ws[:-1]] + [True, False],
        )
        assert len(doc) == aligned_feats.size(0)
        doc.user_token_hooks["vector"] = lambda token: aligned_feats[token.i]
        return doc
コード例 #2
0
def extract_features_aligned_to_words(
        model,
        tokens: list,
        use_all_layers: bool = True,
        return_all_hiddens: bool = False) -> torch.Tensor:
    """Extract model features for pre-tokenized words and wrap them in a Doc.

    Args:
        model: Object exposing ``extract_features(tokens,
            return_all_hiddens=...)``; it is also handed to the alignment
            helpers.
        tokens: The words of one sentence, already split.
        use_all_layers: Currently unused; kept so existing callers keep
            working.
        return_all_hiddens: Forwarded to ``model.extract_features``.

    Returns:
        A spaCy ``Doc`` (not a bare tensor, despite the annotation) with the
        words ``<s>``, ``*tokens``, ``</s>``. Its ``"vector"`` token hook
        returns the aligned feature row at each token's index.
    """
    nlp = spacy_nlp()
    alignment, bpe_tok = get_alignments_and_tokens(model, tokens)
    features = model.extract_features(bpe_tok,
                                      return_all_hiddens=return_all_hiddens)
    if isinstance(features, (list, tuple)):
        # One entry per layer: average every layer except the first. If only
        # a single entry exists, use it rather than dividing by zero.
        layers = features[1:] or features[-1:]
        final_features = sum(layers) / len(layers)
    else:
        # A single tensor (presumably what the model returns when
        # return_all_hiddens is False -- confirm against the model). Slicing
        # it with [1:] would cut along its first axis instead of selecting
        # layers, so it is used as-is.
        final_features = features
    final_features = final_features.squeeze(0)
    aligned_feats = align_features_to_words(model, final_features, alignment)
    doc = Doc(nlp.vocab, words=['<s>'] + list(tokens) + ['</s>'])
    doc.user_token_hooks['vector'] = lambda token: aligned_feats[token.i]
    return doc
コード例 #3
0
    def extract_attention_to_words(self, sentence: str, sentence2: str,
                                   features) -> torch.Tensor:
        """Align per-BPE-token scores of ``sentence2`` to its spaCy words.

        The two sentences are encoded together as a pair. The entries of
        ``features`` for the first position and for the ``sentence2`` part of
        the pair are kept, aligned to the spaCy tokens of ``sentence2``, and
        normalized so that the per-word values sum to one.

        Args:
            sentence: First sentence of the pair. Only its encoded length is
                used, to locate where ``sentence2`` starts.
            sentence2: Second sentence; the returned words come from it.
            features: Scores indexed along the first dimension by position in
                the pair encoding (presumably a 1-D tensor of attention
                weights -- confirm with the caller).

        Returns:
            A ``(words, weights)`` tuple (not a bare tensor, despite the
            annotation): the whitespace-preserving spaCy token texts of
            ``sentence2`` and a tensor with one normalized value per word.

        Raises:
            AssertionError: If the selected scores and BPE tokens differ in
                length, or if the number of aligned values differs from the
                number of spaCy tokens.
        """
        from fairseq.models.roberta import alignment_utils

        tokenizer = alignment_utils.spacy_tokenizer()

        # Encode the pair, and each sentence alone to learn its BPE length.
        pair_bpe_toks = self.encode(sentence, sentence2)
        s1_bpe_len = len(self.encode(sentence))
        s2_bpe_len = len(self.encode(sentence2))

        # Keep position 0 plus everything after sentence 1 and the one extra
        # token that follows it (presumably the pair layout is
        # "<s> s1 </s> </s> s2 </s>" -- confirm against ``self.encode``).
        s2_bpe_toks = torch.cat(
            [pair_bpe_toks[0:1], pair_bpe_toks[s1_bpe_len + 1:]], dim=0)
        s2_scores = torch.cat(
            [features[0:1], features[s1_bpe_len + 1:s1_bpe_len + s2_bpe_len]],
            dim=0)
        # Add a trailing axis so each position is a length-1 feature row.
        s2_scores = s2_scores[:, None]
        assert s2_scores.size(0) == len(s2_bpe_toks)

        spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence2)]
        alignment = alignment_utils.align_bpe_to_words(self, s2_bpe_toks,
                                                       spacy_toks_ws)
        aligned_attn = alignment_utils.align_features_to_words(
            self, s2_scores, alignment)

        # Drop the first and last entries (the positions around the words),
        # then rescale so the remaining values sum to one.
        aligned_attn = aligned_attn.squeeze()[1:-1]
        aligned_attn = aligned_attn / torch.sum(aligned_attn)
        assert aligned_attn.size(0) == len(spacy_toks_ws)
        return spacy_toks_ws, aligned_attn
コード例 #4
0
def extract_features_aligned_to_words_batched(
        model,
        sentences: list,
        use_all_layers: bool = True,
        return_all_hiddens: bool = False) -> torch.Tensor:
    """Extract word-aligned features for several sentences in one batch.

    Each sentence is split on whitespace, converted to BPE tokens, padded
    into one batch and passed through ``model.extract_features`` once.

    Args:
        model: Object exposing ``extract_features(tokens,
            return_all_hiddens=...)``; it is also handed to the alignment
            helpers.
        sentences: Sentence strings; words are obtained with ``str.split()``.
        use_all_layers: Currently unused; kept so existing callers keep
            working.
        return_all_hiddens: Forwarded to ``model.extract_features``.

    Returns:
        A list (not a tensor, despite the annotation) with one spaCy ``Doc``
        per input sentence, in input order. Each doc holds the words ``<s>``,
        the sentence's words and ``</s>``, and its ``"vector"`` token hook
        returns that sentence's aligned feature row at the token's index.
        An empty ``sentences`` yields an empty list.
    """
    if not sentences:
        # Nothing to collate or encode.
        return []

    nlp = spacy_nlp()
    bpe_toks = []
    alignments = []
    spacy_tokens = []
    for sentence in sentences:
        toks = sentence.split()
        alignment, bpe_tok = get_alignments_and_tokens(model, toks)
        bpe_toks.append(bpe_tok)
        alignments.append(alignment)
        spacy_tokens.append(toks)

    # pad_idx=1 is hard-coded; presumably the model dictionary's padding
    # index -- confirm against the model.
    bpe_toks_collated = collate_tokens(bpe_toks, pad_idx=1)

    features = model.extract_features(bpe_toks_collated,
                                      return_all_hiddens=return_all_hiddens)
    if isinstance(features, (list, tuple)):
        # One entry per layer: average every layer except the first. If only
        # a single entry exists, use it rather than dividing by zero.
        layers = features[1:] or features[-1:]
        final_features = sum(layers) / len(layers)
    else:
        # A single batched tensor (presumably what the model returns when
        # return_all_hiddens is False -- confirm against the model). Slicing
        # it with [1:] would drop the first sentence and average the others
        # together, so it is used as-is.
        final_features = features

    results = []
    for bpe_tok, final_feature, alignment, toks in zip(bpe_toks,
                                                       final_features,
                                                       alignments,
                                                       spacy_tokens):
        # Keep only as many positions as this sentence has BPE tokens; the
        # rest is assumed to be right-padding added by collation.
        aligned_feats = align_features_to_words(
            model, final_feature[:bpe_tok.shape[0]], alignment)
        doc = Doc(
            nlp.vocab,
            words=['<s>'] + list(toks) + ['</s>'],
        )
        # Bind this sentence's features as a default argument. A plain
        # closure would look the variable up when called, i.e. after the
        # loop has finished, so every doc would return the last sentence's
        # vectors.
        doc.user_token_hooks['vector'] = (
            lambda token, feats=aligned_feats: feats[token.i])
        results.append(copy.copy(doc))

    return results