Example #1
    def extract_features_aligned_to_words(
            self,
            sentence: str,
            return_all_hiddens: bool = False) -> "Doc":
        """Extract RoBERTa features, aligned to spaCy's word-level tokenizer.

        Returns a spaCy Doc whose per-token ``vector`` hook yields the aligned features.
        """
        from fairseq.models.roberta import alignment_utils
        from spacy.tokens import Doc

        nlp = alignment_utils.spacy_nlp()
        tokenizer = alignment_utils.spacy_tokenizer()

        # tokenize both with GPT-2 BPE and spaCy
        bpe_toks = self.encode(sentence)
        spacy_toks = tokenizer(sentence)
        spacy_toks_ws = [t.text_with_ws for t in spacy_toks]
        alignment = alignment_utils.align_bpe_to_words(self, bpe_toks,
                                                       spacy_toks_ws)

        # extract features and align them
        features = self.extract_features(bpe_toks,
                                         return_all_hiddens=return_all_hiddens)
        features = features.squeeze(0)
        aligned_feats = alignment_utils.align_features_to_words(
            self, features, alignment)

        # wrap in spaCy Doc
        doc = Doc(
            nlp.vocab,
            words=["<s>"] + [x.text for x in spacy_toks] + ["</s>"],
            spaces=[True] + [x.endswith(" ")
                             for x in spacy_toks_ws[:-1]] + [True, False],
        )
        assert len(doc) == aligned_feats.size(0)
        doc.user_token_hooks["vector"] = lambda token: aligned_feats[token.i]
        return doc
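
For context, a minimal usage sketch for the method above, assuming the pytorch/fairseq torch.hub entry point and an installed spaCy English model (alignment_utils.spacy_nlp() loads one); the example sentence mirrors the fairseq README:

import torch

# load a pretrained RoBERTa through torch.hub (downloads the checkpoint on first use)
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval()

doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."')
for tok in doc:
    # the "vector" user hook installed above exposes the aligned features per word
    print('{:10}{}'.format(str(tok), tok.vector[:5]))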
Example #2
from typing import List

def extract_aligned_roberta(roberta, sentence: str,
                            tokens: List[str],
                            return_all_hiddens: bool = False):
    '''Aligns RoBERTa embeddings to a given word-level tokenization of a sentence.

    Adapted from:
    https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py

    Inputs:
    1. roberta: a fairseq RoBERTa hub interface instance
    2. sentence: the input sentence as a string
    3. tokens: the word-level tokens to which the features should be aligned

    Outputs: RoBERTa features aligned to `tokens`, plus the unaligned features
    '''
    
    from fairseq.models.roberta import alignment_utils

    # tokenize with GPT-2 BPE and compute the alignment to the given tokens
    bpe_toks = roberta.encode(sentence)
    # in the fairseq example the word tokens come from spaCy; here they are the
    # caller's gold tokens
    alignment = alignment_utils.align_bpe_to_words(roberta, bpe_toks, tokens)

    # extract features and align them
    # (the LM head is only used when masked tokens are involved, which is not the case here)
    features = roberta.extract_features(bpe_toks,
                                        return_all_hiddens=return_all_hiddens)
    sent_features = features
    if return_all_hiddens:
        # with return_all_hiddens=True, extract_features returns one tensor per
        # layer; keep the full list in x and align the last layer
        x = features
        features = features[-1]
    else:
        x = None
    features = features.squeeze(0)  # batch size is 1
    # align_features_to_words here is a local variant of
    # alignment_utils.align_features_to_words that takes an extra argument
    # (presumably a relaxed tolerance for the internal sanity check)
    aligned_feats = align_features_to_words(roberta, features, alignment, 1e-3)

    return aligned_feats[1:-1], x, sent_features  # exclude <s> and </s> positions
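
A hypothetical usage sketch for extract_aligned_roberta; the sentence, gold tokens, and model choice are illustrative, and the local align_features_to_words variant referenced above must be in scope:

import torch

roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
roberta.eval()

sentence = "The quick brown fox jumps over the lazy dog."
gold_tokens = ["The", "quick", "brown", "fox", "jumps",
               "over", "the", "lazy", "dog", "."]

aligned, all_layers, raw = extract_aligned_roberta(roberta, sentence, gold_tokens)
assert aligned.size(0) == len(gold_tokens)  # one feature vector per gold token
print(aligned.shape)  # e.g. torch.Size([10, 768]) for roberta.base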
Example #3
from typing import List

def extract_aligned_roberta_multiple(roberta,
                                     sentence: str,
                                     sentence2: str,
                                     tokens: List[str],
                                     tokens2: List[str],
                                     return_all_hiddens=False):
    '''Like extract_aligned_roberta, but for a sentence pair encoded jointly.'''

    from fairseq.models.roberta import alignment_utils

    # add_SEP_token (defined elsewhere) builds the combined word-level token
    # list for the sentence pair
    full_tokens = add_SEP_token("<s>", tokens, tokens2, single=False)

    # tokenize the pair with GPT-2 BPE and compute the alignment to the given tokens
    bpe_toks = roberta.encode(sentence, sentence2)
    # in the fairseq example the word tokens come from spaCy; here they are the
    # caller's gold tokens
    alignment = alignment_utils.align_bpe_to_words(roberta, bpe_toks, full_tokens)

    features = roberta.extract_features(bpe_toks,
                                        return_all_hiddens=return_all_hiddens)
    sent_features = features
    if return_all_hiddens:
        # with return_all_hiddens=True, extract_features returns one tensor per
        # layer; take the last layer for alignment
        features = features[-1].squeeze(0)  # batch size is 1
    else:
        features = features.squeeze(0)  # batch size is 1
    # local variant of alignment_utils.align_features_to_words with an extra
    # argument (presumably a relaxed tolerance)
    aligned_feats = align_features_to_words(roberta, features, alignment, 1e-3)

    return aligned_feats[1:-1], sent_features  # exclude <s> and </s> positions
Example #4
    def forward(self, seq_list):
        with torch.no_grad():
            seq_embeddings = []
            for seq in seq_list:
                sent = ' '.join(seq)
                encoded = self.roberta.encode(sent)
                alignment = alignment_utils.align_bpe_to_words(
                    self.roberta, encoded, seq)
                features = self.roberta.extract_features(
                    encoded, return_all_hiddens=False)
                features = features.squeeze(0)  # batch size is 1
                aligned = align_features_to_words(self.roberta, features,
                                                  alignment)
                seq_embeddings.append(aligned[1:-1])  # skip <s>/</s> embeddings
            # all sequences must have the same length for torch.stack to work
            return torch.stack(seq_embeddings, dim=0).to(self.device)
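
The forward above assumes an enclosing module that owns a frozen RoBERTa instance and a target device, and that all sequences in seq_list have the same length. A sketch of such a wrapper follows; the class name and constructor arguments are assumptions, and align_features_to_words is taken to be fairseq's alignment_utils.align_features_to_words (or a compatible local variant):

import torch
import torch.nn as nn
from fairseq.models.roberta import alignment_utils
from fairseq.models.roberta.alignment_utils import align_features_to_words

class RobertaWordEmbedder(nn.Module):
    """Hypothetical wrapper: embeds batches of equal-length, pre-tokenized
    sequences with a frozen RoBERTa encoder (forward() as defined above)."""

    def __init__(self, model_name='roberta.base', device='cpu'):
        super().__init__()
        self.device = device
        self.roberta = torch.hub.load('pytorch/fairseq', model_name)
        self.roberta.eval()  # frozen; forward() runs under torch.no_grad()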
Example #5
from typing import List

def extract_aligned_roberta(roberta,
                            sentence: str,
                            tokens: List[str],
                            return_all_hiddens: bool = False):
    '''Aligns RoBERTa embeddings to a given word-level tokenization of a sentence.

    Adapted from:
    https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py

    Inputs:
    1. roberta: a fairseq RoBERTa hub interface instance
    2. sentence: the input sentence as a string
    3. tokens: the word-level tokens to which the features should be aligned

    Outputs: RoBERTa features aligned to `tokens`, plus the unaligned sentence features
    '''

    from fairseq.models.roberta import alignment_utils

    # note: full_tokens is built here but not used below; the alignment uses `tokens` directly
    full_tokens = add_SEP_token("<s>", tokens, single=True)

    # tokenize with GPT-2 BPE and compute the alignment to the given tokens
    bpe_toks = roberta.encode(sentence)
    # in the fairseq example the word tokens come from spaCy; here they are the
    # caller's gold tokens
    alignment = alignment_utils.align_bpe_to_words(roberta, bpe_toks, tokens)

    # extract features and align them
    # (the LM head is only used when masked tokens are involved, which is not the case here)
    features = roberta.extract_features(bpe_toks,
                                        return_all_hiddens=return_all_hiddens)
    sent_features = features
    if return_all_hiddens:
        # with return_all_hiddens=True, extract_features returns one tensor per
        # layer (e.g. 25 for roberta.large: embeddings + 24 transformer layers);
        # take the last layer for alignment
        features = features[-1].squeeze(0)  # batch size is 1
    else:
        # otherwise a single tensor (the last layer) is returned
        features = features.squeeze(0)  # batch size is 1
    # local variant of alignment_utils.align_features_to_words with an extra
    # argument (presumably a relaxed tolerance)
    aligned_feats = align_features_to_words(roberta, features, alignment, 1e-3)

    return aligned_feats[1:-1], sent_features  # exclude <s> and </s> positions
Example #6
    def extract_attention_to_words(self, sentence: str, sentence2: str,
                                   features):
        """Align BPE-level attention values over `sentence2` to spaCy word tokens.

        Returns the spaCy tokens of `sentence2` (with trailing whitespace) and the
        corresponding word-level attention, renormalized to sum to 1.
        """
        from fairseq.models.roberta import alignment_utils

        tokenizer = alignment_utils.spacy_tokenizer()

        # tokenize the pair with GPT-2 BPE and sentence2 with spaCy
        bpe_toks = self.encode(sentence, sentence2)
        s1_bpe_len = len(self.encode(sentence))
        s2_bpe_len = len(self.encode(sentence2))
        # keep <s> plus the BPE positions that belong to sentence2
        s2_bpe_toks = torch.cat([bpe_toks[0:1], bpe_toks[s1_bpe_len + 1:]],
                                dim=0)
        features = torch.cat(
            [features[0:1], features[s1_bpe_len + 1:s1_bpe_len + s2_bpe_len]],
            dim=0)
        # treat each position's attention value as a 1-dim feature vector so
        # that align_features_to_words can aggregate it per word
        features = features[:, None]
        spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence2)]

        assert features.size(0) == len(s2_bpe_toks)
        alignment = alignment_utils.align_bpe_to_words(self, s2_bpe_toks,
                                                       spacy_toks_ws)
        aligned_attn = alignment_utils.align_features_to_words(
            self, features, alignment)
        # drop the <s>/</s> positions and renormalize over the remaining words
        aligned_attn = aligned_attn.squeeze()[1:-1]
        aligned_attn = aligned_attn / torch.sum(aligned_attn)
        assert aligned_attn.size(0) == len(spacy_toks_ws)
        return spacy_toks_ws, aligned_attn