from typing import List

from transformers import BatchEncoding
from transformers.tokenization_utils_base import TokenSpan


def chars(encoding: BatchEncoding) -> List[TokenSpan]:
    num_tokens = len(encoding["input_ids"])
    tokens: List[TokenSpan] = []
    i = 0
    while i < num_tokens:
        start, end = i, i + 1
        char_span = encoding.token_to_chars(start)
        while end < num_tokens and char_span == encoding.token_to_chars(end):
            # Some tokenizers map subsequent tokens to the same char span
            # (e.g. "ô" in the roberta tokenizer), so group them into one
            # TokenSpan.
            end += 1
        tokens.append(TokenSpan(start, end))
        i = end
    return tokens
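# A minimal usage sketch for chars(), assuming a *fast* tokenizer (slow
# tokenizers do not implement token_to_chars). The checkpoint name
# "roberta-base" and the helper name demo_chars are illustrative, not taken
# from the original code.
def demo_chars() -> None:
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("roberta-base")
    encoding = tok("un hôtel")
    for span in chars(encoding):
        # Each TokenSpan covers the run of tokens sharing one char span.
        print(span.start, span.end)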
from typing import Dict

from transformers import BatchEncoding


def slice_hf_tokens(inputs: BatchEncoding, start: int, end: int) -> Dict:
    # Slice every indexable field (input_ids, attention_mask, ...) and pass
    # non-indexable fields through unchanged.
    output = {}
    for key, value in inputs.items():
        if not hasattr(value, "__getitem__"):
            output[key] = value
        else:
            output[key] = value[start:end]
    return output
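# Usage sketch: slicing a padded batch along the batch dimension yields a
# plain dict for the selected rows. demo_slice and the checkpoint name are
# hypothetical, not part of the original module.
def demo_slice() -> None:
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    batch = tok(["first text", "second text", "third text"],
                padding=True, return_tensors="pt")
    first_two = slice_hf_tokens(batch, 0, 2)
    print({key: value.shape for key, value in first_two.items()})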
from thinc.api import torch2xp
from transformers import BatchEncoding


@classmethod
def from_batch_encoding(cls, token_data: BatchEncoding) -> "WordpieceBatch":
    assert isinstance(token_data, (BatchEncoding, dict))
    pad_token = token_data.get("pad_token", "[PAD]")
    # Count the non-padding tokens in each sequence.
    lengths = [
        len([tok for tok in tokens if tok != pad_token])
        for tokens in token_data["input_texts"]
    ]
    n_seq = len(lengths)
    return cls(
        strings=token_data["input_texts"],
        input_ids=torch2xp(token_data["input_ids"]).reshape((n_seq, -1)),
        attention_mask=torch2xp(token_data["attention_mask"]).reshape(
            (n_seq, -1)),
        lengths=lengths,
        token_type_ids=(
            torch2xp(token_data["token_type_ids"]).reshape((n_seq, -1))
            if "token_type_ids" in token_data
            else None
        ),
    )
from thinc.api import NumpyOps
from transformers import BatchEncoding


@classmethod
def from_batch_encoding(cls, token_data: BatchEncoding) -> "WordpieceBatch":
    assert isinstance(token_data, (BatchEncoding, dict))
    pad_token = token_data.get("pad_token", "[PAD]")
    # Count the non-padding tokens in each sequence.
    lengths = [
        len([tok for tok in tokens if tok != pad_token])
        for tokens in token_data["input_texts"]
    ]
    numpy_ops = NumpyOps()
    return cls(
        strings=token_data["input_texts"],
        input_ids=numpy_ops.asarray2i(token_data["input_ids"]),
        attention_mask=numpy_ops.asarray2f(token_data["attention_mask"]),
        lengths=lengths,
        token_type_ids=(
            numpy_ops.asarray2i(token_data["token_type_ids"])
            if "token_type_ids" in token_data
            else None
        ),
    )
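# Usage sketch: from_batch_encoding expects the tokenizer output to be
# augmented with an "input_texts" key (wordpiece strings per sequence) and
# optionally a "pad_token" key. How that dict is produced is an assumption
# here; demo_wordpiece_batch and the checkpoint name are illustrative only.
def demo_wordpiece_batch() -> "WordpieceBatch":
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    token_data = tok(["a first text", "a second, longer text"], padding=True)
    token_data["input_texts"] = [
        tok.convert_ids_to_tokens(ids) for ids in token_data["input_ids"]
    ]
    token_data["pad_token"] = tok.pad_token
    return WordpieceBatch.from_batch_encoding(token_data)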
import logging

from tqdm import tqdm
from transformers import BatchEncoding

log = logging.getLogger(__name__)


def batch_encode_plus(tokenizer, examples, max_length, progress_bar=False):
    log.info("1. Tokenizer encoding examples .... total: %d", len(examples))
    total = len(examples)
    epoch_iterator = tqdm(range(0, total, 100), desc="Iteration",
                          disable=not progress_bar)
    batch_outputs = {}
    for step in epoch_iterator:
        # Encode text pairs in chunks of 100 examples, padding each sequence
        # to max_length so the chunks can be concatenated afterwards.
        batch_encoding = tokenizer.batch_encode_plus(
            [(example.text_a, example.text_b)
             for example in examples[step:step + 100]],
            max_length=max_length,
            pad_to_max_length=True,
        )
        for key, value in batch_encoding.items():
            batch_outputs.setdefault(key, []).extend(value)
    return BatchEncoding(batch_outputs)
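# Usage sketch: any object with text_a/text_b attributes matches the access
# pattern above; the InputExample dataclass is an illustrative stand-in. Note
# the function is written against the older pad_to_max_length API; newer
# transformers versions prefer padding="max_length".
def demo_batch_encode_plus() -> BatchEncoding:
    from dataclasses import dataclass

    from transformers import AutoTokenizer

    @dataclass
    class InputExample:
        text_a: str
        text_b: str

    tok = AutoTokenizer.from_pretrained("bert-base-uncased")
    examples = [InputExample("a premise", "a hypothesis")] * 5
    return batch_encode_plus(tok, examples, max_length=32, progress_bar=True)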
from typing import Callable, List

import numpy as np
import torch
from transformers import BatchEncoding


def __call__(self, sentences: List[str], batch_size: int = 64,
             agg_func: Callable = np.mean) -> List[float]:
    """
    Score all hypotheses.

    :param sentences: list of sentences to score
    :param batch_size: maximum batch size
    :param agg_func: how to aggregate the per-token log probs of a sentence
    :returns: one score per sentence
    """
    tokenized_sentences = self.tokenizer(
        sentences,
        add_special_tokens=True,
        padding=True,
        truncation='only_first',
    )
    # Place the mask token at each reasonable position in turn.
    scores = [[] for _ in sentences]
    sentences_lengths = np.sum(tokenized_sentences['attention_mask'], axis=-1)
    for mask_index in range(max(sentences_lengths)):
        # Build a valid BatchEncoding for scoring. Only sentences that still
        # have a token at mask_index need processing; shorter sentences are
        # already finished.
        indices_to_process = [
            i for i in range(len(sentences))
            if mask_index < sentences_lengths[i]
        ]
        input_dict = {}
        # Replace the token at mask_index with the mask token.
        input_dict['input_ids'] = torch.tensor(
            [
                tokenized_sentences['input_ids'][i][:mask_index]
                + [self.tokenizer.mask_token_id]
                + tokenized_sentences['input_ids'][i][mask_index + 1:]
                for i in indices_to_process
            ],
            dtype=torch.long, device=self.device)
        input_dict['attention_mask'] = torch.tensor(
            [tokenized_sentences['attention_mask'][i]
             for i in indices_to_process],
            dtype=torch.long, device=self.device)
        input_dict['token_type_ids'] = torch.tensor(
            [tokenized_sentences['token_type_ids'][i]
             for i in indices_to_process],
            dtype=torch.long, device=self.device)
        current_scores = []
        # The original (unmasked) tokens are the prediction targets.
        candidates = [
            tokenized_sentences['input_ids'][i][mask_index]
            for i in indices_to_process
        ]
        num_batches = int(np.ceil(len(indices_to_process) / batch_size))
        for i in range(num_batches):
            lower_idx = batch_size * i
            upper_idx = batch_size * (i + 1)
            input_batch_dict = {
                key: value[lower_idx:upper_idx, :]
                for key, value in input_dict.items()
            }
            model_input_batch = BatchEncoding(input_batch_dict)
            candidates_batch = candidates[lower_idx:upper_idx]
            batch_scores = self._score_contexts(
                model_input_batch, mask_index, candidates_batch)
            current_scores += batch_scores
        for idx, score in zip(indices_to_process, current_scores):
            scores[idx].append(score)
    agg_scores = [agg_func(score_list) for score_list in scores]
    return agg_scores
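# A minimal sketch of the surrounding scorer class, assuming _score_contexts
# reads the masked-position logits from a masked LM and returns the log
# probability of each original (candidate) token, as in pseudo-log-likelihood
# scoring. The class name, checkpoint name, and this _score_contexts body are
# assumptions, not the original implementation.
class MLMScorerSketch:
    def __init__(self, model_name: str = "bert-base-uncased",
                 device: str = "cpu") -> None:
        from transformers import AutoModelForMaskedLM, AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)
        self.model.eval()
        self.device = device

    def _score_contexts(self, inputs: BatchEncoding, mask_index: int,
                        candidates: List[int]) -> List[float]:
        with torch.no_grad():
            logits = self.model(**inputs).logits  # (batch, seq, vocab)
        # Log prob of each candidate token at the masked position.
        log_probs = torch.log_softmax(logits[:, mask_index, :], dim=-1)
        return [log_probs[row, tok].item()
                for row, tok in enumerate(candidates)]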