 def split_by_doc(self) -> List[TransformerData]:
     """Split a TransformerData that represents a batch into a list with
     one TransformerData per Doc.
     """
     flat_spans = []
     for doc_spans in self.spans:
         flat_spans.extend(doc_spans)
     token_positions = get_token_positions(flat_spans)
     outputs = []
     start = 0  # span offset of the current doc within the flat batch
     prev_tokens = 0  # wordpieces consumed by earlier docs
     for doc_spans in self.spans:
         if len(doc_spans) == 0 or len(doc_spans[0]) == 0:
             outputs.append(TransformerData.empty())
             continue
         start_i = token_positions[doc_spans[0][0]]
         end_i = token_positions[doc_spans[-1][-1]] + 1
         end = start + len(doc_spans)
         doc_tokens = self.wordpieces[start:end]
         doc_align = self.align[start_i:end_i]
         # Re-base the alignment indices so they point into this doc's wordpieces.
         doc_align.data = doc_align.data - prev_tokens
         if self.attention:
             attn = [torch2xp(t[start:end]) for t in self.attention]
         else:
             attn = None
         outputs.append(
             TransformerData(
                 wordpieces=doc_tokens,
                 tensors=[torch2xp(t[start:end]) for t in self.tensors],
                 align=doc_align,
                 attention=attn,
             ))
         prev_tokens += doc_tokens.input_ids.size
         start += len(doc_spans)
     return outputs
 def split_by_doc(self) -> List[TransformerData]:
     """Split a TransformerData that represents a batch into a list with
     one TransformerData per Doc.
     """
     flat_spans = []
     for doc_spans in self.spans:
         flat_spans.extend(doc_spans)
     token_positions = get_token_positions(flat_spans)
     outputs = []
     start = 0
     prev_tokens = 0
     for doc_spans in self.spans:
         if len(doc_spans) == 0 or len(doc_spans[0]) == 0:
             outputs.append(TransformerData.empty())
             token_count = 0
         else:
             start_i = token_positions[doc_spans[0][0]]
             end_i = token_positions[doc_spans[-1][-1]] + 1
             end = start + len(doc_spans)
             doc_tokens = slice_hf_tokens(self.tokens, start, end)
             doc_tensors = [torch2xp(t[start:end]) for t in self.tensors]
             doc_align = self.align[start_i:end_i]
             doc_align.data = doc_align.data - prev_tokens
             outputs.append(
                 TransformerData(
                     tokens=doc_tokens,
                     tensors=doc_tensors,  # type: ignore
                     align=doc_align,
                 ))
             token_count = sum(
                 len(texts) for texts in doc_tokens["input_texts"])
         prev_tokens += token_count
         start += len(doc_spans)
     return outputs
import numpy
from thinc.api import torch2xp, xp2torch


def test_pytorch_roundtrip_conversion():
    import torch

    xp_tensor = numpy.zeros((2, 3), dtype="f")
    torch_tensor = xp2torch(xp_tensor)
    assert isinstance(torch_tensor, torch.Tensor)
    new_xp_tensor = torch2xp(torch_tensor)
    assert numpy.array_equal(xp_tensor, new_xp_tensor)
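
For comparison, xp2torch also accepts a requires_grad flag, which matters once the converted tensor takes part in backprop. A minimal sketch of that variant (assumptions: thinc's requires_grad keyword, and that torch2xp hands back a plain, gradient-free array):

import numpy
from thinc.api import torch2xp, xp2torch


def roundtrip_with_grad_sketch():
    # Sketch only: exercises xp2torch's requires_grad keyword argument.
    xp_tensor = numpy.ones((2, 3), dtype="f")
    torch_tensor = xp2torch(xp_tensor, requires_grad=True)
    assert torch_tensor.requires_grad
    # Converting back gives a plain array; the values are unchanged.
    new_xp_tensor = torch2xp(torch_tensor)
    assert numpy.array_equal(xp_tensor, new_xp_tensor)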
 @classmethod
 def from_batch_encoding(cls,
                         token_data: BatchEncoding) -> "WordpieceBatch":
     assert isinstance(token_data, (BatchEncoding, dict))
     pad_token = token_data.get("pad_token", "[PAD]")
     lengths = [
         len([tok for tok in tokens if tok != pad_token])
         for tokens in token_data["input_texts"]
     ]
     n_seq = len(lengths)
     return cls(
         strings=token_data["input_texts"],
         input_ids=torch2xp(token_data["input_ids"]).reshape((n_seq, -1)),
         attention_mask=torch2xp(token_data["attention_mask"]).reshape(
             (n_seq, -1)),
         lengths=lengths,
         token_type_ids=(torch2xp(token_data["token_type_ids"]).reshape(
             (n_seq, -1)) if "token_type_ids" in token_data else None))
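
For illustration, a hedged usage sketch of the constructor above. It assumes the surrounding WordpieceBatch class can be imported from spacy_transformers.data_classes, and it fills in the "input_texts" key the method expects (spacy-transformers normally attaches that key itself when it runs the tokenizer):

from transformers import AutoTokenizer

from spacy_transformers.data_classes import WordpieceBatch  # assumed import path

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
texts = ["hello world!", "a second, slightly longer sentence"]
token_data = tokenizer(texts, return_tensors="pt", padding=True)
# Mimic the "input_texts" key: the string form of every wordpiece per sequence.
token_data["input_texts"] = [
    tokenizer.convert_ids_to_tokens(ids.tolist())
    for ids in token_data["input_ids"]
]
wp = WordpieceBatch.from_batch_encoding(token_data)
assert len(wp.lengths) == len(texts)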
 def split_by_doc(self) -> List[TransformerData]:
     """Split a TransformerData that represents a batch into a list with
     one TransformerData per Doc.
     """
     flat_spans = []
     for doc_spans in self.spans:
         flat_spans.extend(doc_spans)
     token_positions = get_token_positions(flat_spans)
     outputs = []
     start = 0
     prev_tokens = 0
     for doc_spans in self.spans:
         if len(doc_spans) == 0 or len(doc_spans[0]) == 0:
             outputs.append(TransformerData.empty())
             continue
         start_i = token_positions[doc_spans[0][0]]
         end_i = token_positions[doc_spans[-1][-1]] + 1
         end = start + len(doc_spans)
         doc_tokens = self.wordpieces[start:end]
         doc_align = self.align[start_i:end_i]
         doc_align.data = doc_align.data - prev_tokens
         model_output = ModelOutput()
         last_hidden_state = self.model_output.last_hidden_state
         # Slice every entry of the HF ModelOutput along the batch (span) axis.
         # Tuples such as hidden_states or attentions are kept only when every
         # member shares the batch size of last_hidden_state.
         for key, output in self.model_output.items():
             if isinstance(output, torch.Tensor):
                 model_output[key] = torch2xp(output[start:end])
             elif (isinstance(output, tuple)
                   and all(isinstance(t, torch.Tensor) for t in output)
                   and all(t.shape[0] == last_hidden_state.shape[0]
                           for t in output)):
                 model_output[key] = [
                     torch2xp(t[start:end]) for t in output
                 ]
         outputs.append(
             TransformerData(
                 wordpieces=doc_tokens,
                 model_output=model_output,
                 align=doc_align,
             ))
         prev_tokens += doc_tokens.input_ids.size
         start += len(doc_spans)
     return outputs
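
All three split_by_doc variants share the same bookkeeping: each Doc owns a contiguous block of rows in the padded batch, sized by how many spans it contributed, and the alignment is re-based by subtracting the wordpieces consumed by earlier docs. A stripped-down, self-contained sketch of the row-slicing part with plain numpy arrays (all names here are illustrative, not part of the library):

from typing import List

import numpy


def split_padded_batch(hidden: numpy.ndarray,
                       spans_per_doc: List[int]) -> List[numpy.ndarray]:
    # hidden has shape (n_spans_total, seq_len, width); walk a running offset
    # so each doc gets back exactly the rows its spans produced.
    outputs = []
    start = 0
    for n_spans in spans_per_doc:
        outputs.append(hidden[start:start + n_spans])
        start += n_spans
    return outputs


hidden = numpy.zeros((5, 8, 4), dtype="f")  # 5 spans total, seq_len 8, width 4
pieces = split_padded_batch(hidden, [2, 0, 3])
assert [p.shape[0] for p in pieces] == [2, 0, 3]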
from typing import List

import torch
from thinc.api import ArgsKwargs, torch2xp, xp2torch
from thinc.types import Floats2d


def convert_transformer_outputs(model, inputs_outputs, is_train):
    layer_inputs, torch_outputs = inputs_outputs
    torch_tokvecs: torch.Tensor = torch_outputs[0]
    # Free the memory as soon as we can
    torch_outputs = None
    lengths = list(layer_inputs.input_len)
    tokvecs: List[Floats2d] = model.ops.unpad(torch2xp(torch_tokvecs), lengths)
    # Remove the BOS and EOS markers.
    tokvecs = [arr[1:-1] for arr in tokvecs]

    def backprop(d_tokvecs: List[Floats2d]) -> ArgsKwargs:
        # Restore entries for bos and eos markers.
        row = model.ops.alloc2f(1, d_tokvecs[0].shape[1])
        d_tokvecs = [model.ops.xp.vstack((row, arr, row)) for arr in d_tokvecs]
        return ArgsKwargs(
            args=(torch_tokvecs,),
            kwargs={"grad_tensors": xp2torch(model.ops.pad(d_tokvecs))},
        )

    return tokvecs, backprop
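
The forward pass above unpads the stacked hidden states into per-sequence arrays and trims the BOS/EOS rows; the backprop callback reverses both steps and hands the gradient back through thinc's ArgsKwargs convention, whose kwargs become the grad_tensors argument of torch.autograd.backward. A small runnable sketch of the pad/unpad bookkeeping, using thinc's NumpyOps directly; the sizes are made up for illustration:

from thinc.api import NumpyOps

ops = NumpyOps()
lengths = [4, 2, 3]
padded = ops.xp.arange(3 * 4 * 2, dtype="f").reshape((3, 4, 2))
# unpad() recovers one array per sequence, each cut to its true length.
seqs = ops.unpad(padded, lengths)
assert [s.shape[0] for s in seqs] == lengths
# Strip the BOS and EOS rows, as the forward pass above does.
trimmed = [arr[1:-1] for arr in seqs]
assert [t.shape[0] for t in trimmed] == [2, 0, 1]
# pad() stacks them back into a single array for the gradient call.
repadded = ops.pad(trimmed)
assert repadded.shape[0] == len(lengths)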