def extract_features_aligned_to_words(
    self, sentence: str, return_all_hiddens: bool = False
) -> torch.Tensor:
    """Extract RoBERTa features, aligned to spaCy's word-level tokenizer.

    The sentence is encoded with ``self.encode`` (BPE) and tokenized with the
    spaCy tokenizer from ``alignment_utils``; the BPE-level features are then
    mapped onto the spaCy tokens.

    Args:
        sentence: Raw input text.
        return_all_hiddens: Forwarded unchanged to ``self.extract_features``.
            NOTE(review): the result is used as a single tensor
            (``.squeeze(0)``); presumably a per-layer list returned for
            ``True`` would fail here -- confirm against ``extract_features``.

    Returns:
        A spaCy ``Doc`` (not a tensor, despite the annotation, which is kept
        for interface stability). Its words are ``"<s>"``, the spaCy tokens of
        ``sentence`` and ``"</s>"``; its ``"vector"`` token hook returns
        ``aligned_feats[token.i]``.

    Raises:
        AssertionError: If the number of aligned feature rows differs from
            the number of tokens in the resulting ``Doc``.
    """
    from fairseq.models.roberta import alignment_utils
    from spacy.tokens import Doc

    nlp = alignment_utils.spacy_nlp()
    tokenizer = alignment_utils.spacy_tokenizer()

    # Tokenize with both BPE and spaCy. The spaCy pass is done once and
    # reused for the plain-text and the text-with-whitespace views.
    bpe_toks = self.encode(sentence)
    spacy_toks = tokenizer(sentence)
    spacy_toks_ws = [t.text_with_ws for t in spacy_toks]
    alignment = alignment_utils.align_bpe_to_words(self, bpe_toks, spacy_toks_ws)

    # Extract BPE-level features and align them to the spaCy tokens.
    features = self.extract_features(
        bpe_toks, return_all_hiddens=return_all_hiddens
    )
    features = features.squeeze(0)
    aligned_feats = alignment_utils.align_features_to_words(
        self, features, alignment
    )

    # Wrap in a spaCy Doc. "<s>" and the last real token are marked as
    # followed by a space, "</s>" is not; the remaining tokens keep the
    # trailing-space information from the tokenizer.
    doc = Doc(
        nlp.vocab,
        words=["<s>"] + [x.text for x in spacy_toks] + ["</s>"],
        spaces=[True]
        + [x.endswith(" ") for x in spacy_toks_ws[:-1]]
        + [True, False],
    )
    assert len(doc) == aligned_feats.size(0)
    doc.user_token_hooks["vector"] = lambda token: aligned_feats[token.i]
    return doc
def extract_features_aligned_to_words(
    model,
    tokens: list,
    use_all_layers: bool = True,
    return_all_hiddens: bool = False,
) -> torch.Tensor:
    """Extract model features for pre-tokenized words and wrap them in a Doc.

    Args:
        model: Object providing ``extract_features``; it is also passed to
            ``get_alignments_and_tokens`` and ``align_features_to_words``.
        tokens: The words of one sentence, already split.
        use_all_layers: When a per-layer output is available, average every
            entry except the first (``True``) or use only the last entry
            (``False``). Has no effect when the model returns a single output.
        return_all_hiddens: Forwarded unchanged to ``model.extract_features``.

    Returns:
        A spaCy ``Doc`` (not a tensor, despite the annotation, which is kept
        for interface stability) with words ``'<s>'``, ``tokens`` and
        ``'</s>'``, whose ``'vector'`` token hook returns
        ``aligned_feats[token.i]``.
    """
    nlp = spacy_nlp()
    alignment, bpe_tok = get_alignments_and_tokens(model, tokens)
    features = model.extract_features(
        bpe_tok, return_all_hiddens=return_all_hiddens
    )

    # The output is treated as "one entry per layer" when the caller asked
    # for all hidden states or the model returned a list/tuple. Otherwise it
    # is a single feature tensor and must not be sliced/averaged: doing so on
    # a batch-of-one tensor divides by zero.
    per_layer = return_all_hiddens or isinstance(features, (list, tuple))
    if not per_layer:
        final_features = features
    elif use_all_layers and len(features) > 1:
        # Entry 0 is skipped -- presumably the embedding output; confirm
        # against the model's extract_features.
        final_features = sum(features[1:]) / (len(features) - 1)
    else:
        # NOTE(review): assumes the last entry is the final layer -- confirm.
        final_features = features[-1]

    final_features = final_features.squeeze(0)
    aligned_feats = align_features_to_words(model, final_features, alignment)

    doc = Doc(nlp.vocab, words=['<s>', *tokens, '</s>'])
    doc.user_token_hooks['vector'] = lambda token: aligned_feats[token.i]
    return doc
def extract_attention_to_words(
    self, sentence: str, sentence2: str, features
) -> torch.Tensor:
    """Align per-BPE-token weights of ``sentence2`` to its spaCy tokens.

    The pair ``(sentence, sentence2)`` is encoded jointly with
    ``self.encode``. The first position and the positions belonging to
    ``sentence2`` are cut out of both the token ids and ``features``, aligned
    to the spaCy tokens of ``sentence2``, and normalized to sum to one.

    Args:
        sentence: First sentence of the pair; only its encoded length is used.
        sentence2: Second sentence; the one whose tokens are returned.
        features: Values indexed along dim 0 by position in the jointly
            encoded pair. Assumes a 1-D tensor of attention weights -- TODO
            confirm with callers.

    Returns:
        A tuple ``(spacy_toks_ws, aligned_attn)`` (not a bare tensor, despite
        the annotation, which is kept for interface stability):
        ``spacy_toks_ws`` is the list of ``text_with_ws`` strings of the spaCy
        tokens of ``sentence2``; ``aligned_attn`` holds one value per token,
        divided by their sum.

    Raises:
        AssertionError: If the sliced features and token ids differ in
            length, or the aligned result does not have one entry per token.
    """
    from fairseq.models.roberta import alignment_utils

    tokenizer = alignment_utils.spacy_tokenizer()

    # Encode the pair jointly, and each sentence alone to learn its length.
    bpe_toks = self.encode(sentence, sentence2)
    s1_bpe_len = len(self.encode(sentence))
    s2_bpe_len = len(self.encode(sentence2))

    # Keep position 0 plus everything after sentence 1. The extra "+ 1"
    # presumably skips an additional separator inserted between the two
    # sentences -- confirm against self.encode.
    s2_start = s1_bpe_len + 1
    s2_bpe_toks = torch.cat([bpe_toks[0:1], bpe_toks[s2_start:]], dim=0)
    features = torch.cat(
        [features[0:1], features[s2_start:s1_bpe_len + s2_bpe_len]], dim=0
    )
    # Add a trailing axis before handing the values to the alignment helper.
    features = features[:, None]

    spacy_toks_ws = [t.text_with_ws for t in tokenizer(sentence2)]

    assert features.size(0) == len(s2_bpe_toks)
    alignment = alignment_utils.align_bpe_to_words(
        self, s2_bpe_toks, spacy_toks_ws
    )
    aligned_attn = alignment_utils.align_features_to_words(
        self, features, alignment
    )

    # Drop the first and last rows (the positions outside the words), then
    # normalize the remaining per-word values by their sum.
    aligned_attn = aligned_attn.squeeze()[1:-1]
    aligned_attn = aligned_attn / torch.sum(aligned_attn)
    assert aligned_attn.size(0) == len(spacy_toks_ws)
    return spacy_toks_ws, aligned_attn
def extract_features_aligned_to_words_batched(
    model,
    sentences: list,
    use_all_layers: bool = True,
    return_all_hiddens: bool = False,
) -> torch.Tensor:
    """Extract word-aligned features for several sentences in one model call.

    Each sentence is split on whitespace, converted to BPE tokens and an
    alignment via ``get_alignments_and_tokens``, and the BPE sequences are
    padded into one batch with ``collate_tokens``.

    Args:
        model: Object providing ``extract_features``; it is also passed to
            ``get_alignments_and_tokens`` and ``align_features_to_words``.
        sentences: Sentences as strings; words are obtained with
            ``str.split()``.
        use_all_layers: When a per-layer output is available, average every
            entry except the first (``True``) or use only the last entry
            (``False``). Has no effect when the model returns a single output.
        return_all_hiddens: Forwarded unchanged to ``model.extract_features``.

    Returns:
        A list with one spaCy ``Doc`` per input sentence (not a tensor,
        despite the annotation, which is kept for interface stability). Each
        Doc has words ``'<s>'``, the sentence's words and ``'</s>'``, and a
        ``'vector'`` token hook returning that sentence's aligned features.
        An empty ``sentences`` yields an empty list.
    """
    if not sentences:
        return []

    nlp = spacy_nlp()
    bpe_toks = []
    alignments = []
    words_per_sentence = []
    for sentence in sentences:
        words = sentence.split()
        alignment, bpe_tok = get_alignments_and_tokens(model, words)
        bpe_toks.append(bpe_tok)
        alignments.append(alignment)
        words_per_sentence.append(words)

    # pad_idx=1 is presumably the model dictionary's padding index -- confirm.
    bpe_toks_collated = collate_tokens(bpe_toks, pad_idx=1)
    features = model.extract_features(
        bpe_toks_collated, return_all_hiddens=return_all_hiddens
    )

    # The output is treated as "one entry per layer" when the caller asked
    # for all hidden states or the model returned a list/tuple. Otherwise it
    # is already the batch of features; slicing/averaging it would mix up
    # sentences instead of layers.
    per_layer = return_all_hiddens or isinstance(features, (list, tuple))
    if not per_layer:
        final_features = features
    elif use_all_layers and len(features) > 1:
        # Entry 0 is skipped -- presumably the embedding output; confirm
        # against the model's extract_features.
        final_features = sum(features[1:]) / (len(features) - 1)
    else:
        # NOTE(review): assumes the last entry is the final layer -- confirm.
        final_features = features[-1]

    results = []
    for bpe_tok, final_feature, alignment, words in zip(
        bpe_toks, final_features, alignments, words_per_sentence
    ):
        # Trim the padded positions before aligning.
        aligned_feats = align_features_to_words(
            model, final_feature[0:bpe_tok.shape[0]], alignment
        )
        doc = Doc(nlp.vocab, words=['<s>', *words, '</s>'])
        # Bind this iteration's features as a default argument. A plain
        # closure would look up `aligned_feats` at call time, so every Doc
        # would return the vectors of the last sentence.
        doc.user_token_hooks['vector'] = (
            lambda token, feats=aligned_feats: feats[token.i]
        )
        results.append(copy.copy(doc))
    return results