@classmethod
def from_pytt(cls, fields, *, is_grad=False) -> "Activations":
    """Create Activations from the output tuples produced by PyTorch Transformers.
    Includes converting torch tensors to xp, and handling missing values.
    """
    # lh: last hidden
    # po: pooler_output
    # ah: all_hidden
    # aa: all_attention
    if len(fields) != 4:
        # The model did not return all four outputs; default the missing ones.
        lh = fields[0]
        po = tuple()
        ah = []
        aa = []
    else:
        lh, po, ah, aa = fields
    # Convert last_hidden_state to xp
    lh = torch2xp(lh)
    xp = get_array_module(lh)
    # Normalize "None" value for pooler output
    if isinstance(po, tuple):
        po = xp.zeros((0,), dtype=lh.dtype)
    else:
        po = torch2xp(po)
    ah = list(map(torch2xp, ah))
    aa = list(map(torch2xp, aa))
    return cls(lh, po, ah, aa, is_grad=is_grad)
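# Illustrative sketch (plain numpy, no torch) of the missing-value handling in
# from_pytt above: the model output tuple here is hypothetical, and the real
# helper torch2xp would convert torch tensors before this point.
import numpy

pytt_output = (numpy.ones((5, 8), dtype="f"),)   # hypothetical: last hidden only
if len(pytt_output) != 4:
    lh, po, ah, aa = pytt_output[0], tuple(), [], []
else:
    lh, po, ah, aa = pytt_output
if isinstance(po, tuple):
    # Placeholder for a missing pooler output, matching the hidden state dtype.
    po = numpy.zeros((0,), dtype=lh.dtype)
assert po.shape == (0,)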
@classmethod
def join(cls, sub_acts: List["Activations"]) -> "Activations":
    """Concatenate activations from subsequences."""
    xp = get_array_module(sub_acts[0].lh)
    lh: Array = xp.vstack([x.lh for x in sub_acts])
    po: Array = xp.vstack([x.po for x in sub_acts])
    # Transpose the lists, so that the inner list items refer
    # to the subsequences. Then we can vstack those.
    ah = list(map(xp.vstack, zip(*[x.ah for x in sub_acts])))
    # Joining the attention matrices is currently disabled.
    # aa = list(map(xp.vstack, zip(*[x.aa for x in sub_acts])))
    aa = []
    return cls(lh, po, ah, aa, is_grad=sub_acts[0].is_grad)
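# Small sketch (not part of the library) of the zip(*...) transpose used in
# join: sub1 and sub2 are made-up per-layer activations for two subsequences.
import numpy

sub1 = [numpy.ones((3, 4)), numpy.ones((3, 4)) * 2]   # two layers, 3 word pieces
sub2 = [numpy.ones((2, 4)), numpy.ones((2, 4)) * 2]   # same layers, 2 word pieces
# zip(*...) regroups the arrays per layer, so each vstack concatenates along
# the word-piece axis, leaving one array per layer for the joined sequence.
joined = list(map(numpy.vstack, zip(*[sub1, sub2])))
assert len(joined) == 2 and joined[0].shape == (5, 4)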
def pad_batch_activations(batch: List[Activations], *, to: int = 0) -> Activations:
    """Pad a batch of Activations to a common sequence length and stack them
    into (batch, length, width) arrays."""
    if not batch:
        return Activations.blank()
    xp = get_array_module(batch[0].lh)
    lh = pad_batch([x.lh for x in batch], xp=xp, to=to)
    if lh.size:
        lh = lh.reshape((len(batch), -1, lh.shape[-1]))
    po = pad_batch([x.po for x in batch], xp=xp, to=to)
    if po.size:
        po = po.reshape((len(batch), -1, po.shape[-1]))
    # Transpose the lists, and then pad_batch the items
    ah = [
        pad_batch(list(seq), xp=xp, to=to)
        for seq in zip(*[x.ah for x in batch])
    ]
    aa = [
        pad_batch(list(seq), xp=xp, to=to)
        for seq in zip(*[x.aa for x in batch])
    ]
    return Activations(lh, po, ah, aa, is_grad=batch[0].is_grad)
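# Minimal sketch of what a pad_batch helper could do, assuming it zero-pads
# each 2d sequence to the longest length (or to `to`) and stacks the rows.
# This is a simplified stand-in, not the library's actual implementation.
import numpy

def pad_batch_sketch(seqs, *, xp=numpy, to=0):
    length = max([to] + [len(seq) for seq in seqs])
    width = seqs[0].shape[-1]
    padded = xp.zeros((len(seqs), length, width), dtype=seqs[0].dtype)
    for i, seq in enumerate(seqs):
        padded[i, : len(seq)] = seq
    # Return one stacked 2d array, like an xp.vstack over the padded sequences.
    return padded.reshape((len(seqs) * length, width))

lh = pad_batch_sketch([numpy.ones((3, 4)), numpy.ones((5, 4))])
lh = lh.reshape((2, -1, lh.shape[-1]))   # the reshape done in the function above
assert lh.shape == (2, 5, 4)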
def get_similarity_via_tensor(doc1, doc2):
    """Cosine similarity computed from the tensor-derived vectors."""
    v1 = doc1.vector
    v2 = doc2.vector
    xp = get_array_module(v1)
    return xp.dot(v1, v2) / (doc1.vector_norm * doc2.vector_norm)
def set_annotations(self, docs, activations):
    """Assign the extracted features to the Doc objects and overwrite the
    vector and similarity hooks.

    docs (iterable): A batch of `Doc` objects.
    activations (iterable): A batch of activations.
    """
    for doc, doc_acts in zip(docs, activations):
        xp = get_array_module(doc_acts.lh)
        wp_tensor = doc_acts.lh
        doc.tensor = self.model.ops.allocate((len(doc), self.model.nO))
        doc._.pytt_last_hidden_state = wp_tensor
        doc._.pytt_pooler_output = doc_acts.po
        doc._.pytt_all_hidden_states = doc_acts.ah
        doc._.pytt_all_attentions = doc_acts.aa
        doc._.pytt_d_last_hidden_state = xp.zeros((0,), dtype=wp_tensor.dtype)
        doc._.pytt_d_pooler_output = xp.zeros((0,), dtype=wp_tensor.dtype)
        doc._.pytt_d_all_hidden_states = []
        doc._.pytt_d_all_attentions = []
        if wp_tensor.shape != (len(doc._.pytt_word_pieces), self.model.nO):
            print("# word pieces: ", len(doc._.pytt_word_pieces))
            print("# tensor rows: ", wp_tensor.shape[0])
            for sent in doc.sents:
                if sent._.pytt_start is None or sent._.pytt_end is None:
                    print("Text: ", sent.text)
                    print("WPs: ", sent._.pytt_word_pieces_)
                    print(sent._.pytt_start, sent._.pytt_end)
            raise ValueError(
                "Mismatch between tensor shape and word pieces. This usually "
                "means we did something wrong in the sentence reshaping, "
                "or possibly finding the separator tokens."
            )
        # Count how often each word-piece token is represented. This allows
        # a weighted sum, so that we can make sure doc.tensor.sum()
        # equals wp_tensor.sum().
        # TODO: Obviously incrementing the rows individually is bad. Need
        # to make this more efficient. Maybe just copy to CPU, do our stuff,
        # copy back to GPU?
        align_sizes = [0 for _ in range(len(doc._.pytt_word_pieces))]
        for word_piece_slice in doc._.pytt_alignment:
            for i in word_piece_slice:
                align_sizes[i] += 1
        for i, word_piece_slice in enumerate(doc._.pytt_alignment):
            for j in word_piece_slice:
                doc.tensor[i] += wp_tensor[j] / align_sizes[j]
        # To make this weighting work, we "align" the boundary tokens against
        # every token in their sentence.
        if doc.tensor.sum() != wp_tensor.sum():
            for sent in doc.sents:
                if sent._.pytt_start is not None and sent._.pytt_end is not None:
                    cls_vector = wp_tensor[sent._.pytt_start]
                    sep_vector = wp_tensor[sent._.pytt_end]
                    doc.tensor[sent.start:sent.end + 1] += cls_vector / len(sent)
                    doc.tensor[sent.start:sent.end + 1] += sep_vector / len(sent)
        doc.user_hooks["vector"] = get_doc_vector_via_tensor
        doc.user_span_hooks["vector"] = get_span_vector_via_tensor
        doc.user_token_hooks["vector"] = get_token_vector_via_tensor
        doc.user_hooks["similarity"] = get_similarity_via_tensor
        doc.user_span_hooks["similarity"] = get_similarity_via_tensor
        doc.user_token_hooks["similarity"] = get_similarity_via_tensor
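# Small worked example (plain numpy, hypothetical alignment) of the weighting
# in set_annotations: each word-piece row is divided by how many tokens it is
# aligned to, so the summed doc tensor matches the summed word-piece tensor.
import numpy

wp_tensor = numpy.arange(8, dtype="f").reshape((4, 2))   # 4 word pieces, width 2
alignment = [[0, 1], [1, 2], [3]]                         # 3 tokens -> word pieces
align_sizes = [0] * wp_tensor.shape[0]
for wp_slice in alignment:
    for j in wp_slice:
        align_sizes[j] += 1
doc_tensor = numpy.zeros((len(alignment), wp_tensor.shape[1]), dtype="f")
for i, wp_slice in enumerate(alignment):
    for j in wp_slice:
        doc_tensor[i] += wp_tensor[j] / align_sizes[j]
assert numpy.isclose(doc_tensor.sum(), wp_tensor.sum())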
@classmethod
def join(cls, sub_acts: List["Activations"]) -> "Activations":
    """Concatenate activations from subsequences."""
    xp = get_array_module(sub_acts[0].lh)
    lh: Array = xp.vstack([x.lh for x in sub_acts])
    return cls(lh, [], [], [], is_grad=sub_acts[0].is_grad)
def pad_batch_activations(batch: List[Activations]) -> Activations:
    """Pad a batch of Activations to a common length and stack the last
    hidden states into a (batch, length, width) array."""
    xp = get_array_module(batch[0].lh)
    lh = pad_batch([x.lh for x in batch], xp=xp)
    lh = lh.reshape((len(batch), -1, lh.shape[-1]))
    return Activations(lh, [], [], [], is_grad=batch[0].is_grad)