Example #1
# Assumed imports: BatchEncoding comes from transformers; TokenSpan is not
# shown in the source project, so it is sketched here as a (start, end)
# named tuple over token indices.
from typing import List, NamedTuple

from transformers import BatchEncoding


class TokenSpan(NamedTuple):
    start: int
    end: int


def chars(encoding: BatchEncoding) -> List[TokenSpan]:
    num_tokens = len(encoding["input_ids"])
    tokens: List[TokenSpan] = []
    i = 0
    while i < num_tokens:
        start, end = i, i + 1
        char_span = encoding.token_to_chars(start)
        while end < num_tokens and char_span == encoding.token_to_chars(end):
            # Some tokenizers map subsequent tokens to the same char span
            # (e.g. "ô" in the roberta tokenizer).
            end += 1
        tokens.append(TokenSpan(start, end))
        i = end
    return tokens
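A minimal usage sketch (requires a fast tokenizer, since token_to_chars() is only available there; the checkpoint name and input are illustrative):

# Usage sketch for chars(); assumes a *fast* Hugging Face tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
encoding = tokenizer("Hôtel by the river")
for span in chars(encoding):
    print(span.start, span.end)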
Example #2
from typing import Dict

from transformers import BatchEncoding


def slice_hf_tokens(inputs: BatchEncoding, start: int, end: int) -> Dict:
    output = {}
    for key, value in inputs.items():
        if not hasattr(value, "__getitem__"):
            output[key] = value
        else:
            output[key] = value[start:end]
    return output
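A minimal usage sketch (tokenizer and texts are illustrative), slicing the first two sequences out of a padded batch:

# Usage sketch for slice_hf_tokens().
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer(["first text", "second text", "third text"], padding=True)
subset = slice_hf_tokens(batch, 0, 2)
print(len(subset["input_ids"]))  # 2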
Example #3

@classmethod
def from_batch_encoding(cls,
                        token_data: BatchEncoding) -> "WordpieceBatch":
    assert isinstance(token_data, (BatchEncoding, dict))
    pad_token = token_data.get("pad_token", "[PAD]")
    # Sequence lengths are counted on the unpadded token strings.
    lengths = [
        len([tok for tok in tokens if tok != pad_token])
        for tokens in token_data["input_texts"]
    ]
    n_seq = len(lengths)
    # torch2xp converts a torch tensor to a numpy/cupy array.
    return cls(
        strings=token_data["input_texts"],
        input_ids=torch2xp(token_data["input_ids"]).reshape((n_seq, -1)),
        attention_mask=torch2xp(token_data["attention_mask"]).reshape(
            (n_seq, -1)),
        lengths=lengths,
        token_type_ids=(torch2xp(token_data["token_type_ids"]).reshape(
            (n_seq, -1)) if "token_type_ids" in token_data else None),
    )
Example #4

@classmethod
def from_batch_encoding(cls,
                        token_data: BatchEncoding) -> "WordpieceBatch":
    assert isinstance(token_data, (BatchEncoding, dict))
    pad_token = token_data.get("pad_token", "[PAD]")
    lengths = [
        len([tok for tok in tokens if tok != pad_token])
        for tokens in token_data["input_texts"]
    ]

    numpy_ops = NumpyOps()

    # asarray2i / asarray2f build 2-d int / float numpy arrays on CPU.
    return cls(
        strings=token_data["input_texts"],
        input_ids=numpy_ops.asarray2i(token_data["input_ids"]),
        attention_mask=numpy_ops.asarray2f(token_data["attention_mask"]),
        lengths=lengths,
        token_type_ids=(numpy_ops.asarray2i(token_data["token_type_ids"])
                        if "token_type_ids" in token_data else None),
    )
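The two variants differ only in how tensor data is converted: Example #3 pushes torch tensors through torch2xp (producing numpy or cupy arrays depending on the active backend) and reshapes per sequence, while Example #4 converts directly to typed CPU numpy arrays via NumpyOps. Assuming thinc 8.x, both helpers come from the same package:

# Imports assumed by Examples #3 and #4 (thinc 8.x).
from thinc.api import NumpyOps, torch2xp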
Example #5
import logging

from tqdm import tqdm
from transformers import BatchEncoding

log = logging.getLogger(__name__)


def batch_encode_plus(tokenizer, examples, max_length, progress_bar=False):
    log.info(f"1. Tokenizer encoding examples .... total: {len(examples)}")
    total = len(examples)
    epoch_iterator = tqdm(range(0, total, 100),
                          desc="Iteration",
                          disable=not progress_bar)

    batch_outputs = {}
    for step in epoch_iterator:
        # Encode in chunks of 100 examples to keep memory bounded.
        batch_encoding = tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b)
             for example in examples[step:step + 100]],
            max_length=max_length,
            # Deprecated in recent transformers; the modern spelling is
            # padding="max_length" (with truncation=True).
            pad_to_max_length=True,
        )

        for key, value in batch_encoding.items():
            if key not in batch_outputs:
                batch_outputs[key] = []
            batch_outputs[key].extend(value)

    return BatchEncoding(batch_outputs)
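A minimal usage sketch; InputExample is a stand-in for any object exposing text_a/text_b attributes, and the checkpoint name is illustrative:

# Usage sketch for batch_encode_plus().
from dataclasses import dataclass

from transformers import AutoTokenizer


@dataclass
class InputExample:
    text_a: str
    text_b: str


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
pairs = [InputExample("a question", "a passage") for _ in range(250)]
encoded = batch_encode_plus(tokenizer, pairs, max_length=128, progress_bar=True)
print(len(encoded["input_ids"]))  # 250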
Example #6

    # Scores sentences with a masked language model by masking one position
    # at a time and scoring the original token there (pseudo-log-likelihood
    # style scoring). Assumes the module imports numpy as np, torch,
    # typing.List/Callable, and transformers.BatchEncoding.
    def __call__(self,
                 sentences: List[str],
                 batch_size: int = 64,
                 agg_func: Callable = np.mean) -> List[float]:
        """
        Make scoring for all hypotheses.

        :param sentences: list of sentences
        :param batch_size: max size of batch
        :param agg_func: how to aggregate all log probs for sentence

        :returns: scores for each sentence
        """
        tokenized_sentences = self.tokenizer(
            sentences,
            add_special_tokens=True,
            padding=True,
            truncation='only_first',
        )

        # Try to place the mask token at each reasonable position.
        scores: List[List[float]] = [[] for _ in range(len(sentences))]
        sentences_lengths = np.sum(tokenized_sentences['attention_mask'],
                                   axis=-1)
        for mask_index in range(max(sentences_lengths)):
            # create valid BatchEncoding object to make scoring
            # find indices of sentences to make scoring
            # some sentences can be already finished
            indices_to_process = [
                i for i in range(len(sentences))
                if mask_index < sentences_lengths[i]
            ]

            input_dict = {}
            # Replace the token at mask_index with the mask token for each
            # still-active sentence.
            input_dict['input_ids'] = torch.tensor(
                [
                    tokenized_sentences['input_ids'][i][:mask_index] +
                    [self.tokenizer.mask_token_id] +
                    tokenized_sentences['input_ids'][i][mask_index + 1:]
                    for i in indices_to_process
                ],
                dtype=torch.long,
                device=self.device,
            )
            input_dict['attention_mask'] = torch.tensor(
                [
                    tokenized_sentences['attention_mask'][i]
                    for i in indices_to_process
                ],
                dtype=torch.long,
                device=self.device,
            )
            # Note: token_type_ids are only returned by BERT-style tokenizers.
            input_dict['token_type_ids'] = torch.tensor(
                [
                    tokenized_sentences['token_type_ids'][i]
                    for i in indices_to_process
                ],
                dtype=torch.long,
                device=self.device,
            )
            current_scores = []
            # The gold token ids that the mask replaced.
            candidates = [
                tokenized_sentences['input_ids'][i][mask_index]
                for i in indices_to_process
            ]
            num_batches = int(np.ceil(len(indices_to_process) / batch_size))
            for i in range(num_batches):
                lower_idx = batch_size * i
                upper_idx = batch_size * (i + 1)
                input_batch_dict = {
                    key: value[lower_idx:upper_idx, :]
                    for key, value in input_dict.items()
                }
                model_input_batch = BatchEncoding(input_batch_dict)
                candidates_batch = candidates[lower_idx:upper_idx]
                batch_scores = self._score_contexts(model_input_batch,
                                                    mask_index,
                                                    candidates_batch)
                current_scores += batch_scores
                current_scores += batch_scores
            for idx, score in zip(indices_to_process, current_scores):
                scores[idx].append(score)
        agg_scores = [agg_func(score_list) for score_list in scores]
        return agg_scores
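A hedged usage sketch; only __call__ is shown above, so the class name MaskedLMScorer and its constructor arguments are assumptions:

# Usage sketch; MaskedLMScorer and its constructor are hypothetical names.
import numpy as np

scorer = MaskedLMScorer(model_name="bert-base-uncased", device="cpu")
scores = scorer(["The cat sat on the mat.", "Mat the on sat cat the."],
                batch_size=32,
                agg_func=np.mean)
print(scores)  # one aggregated log-probability per sentence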