def test_decode_best_spans(self):
    tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

    text_1 = tokenizer.encode("question sequence", add_special_tokens=False)
    text_2 = tokenizer.encode("title sequence", add_special_tokens=False)
    text_3 = tokenizer.encode("text sequence " * 4, add_special_tokens=False)
    input_ids = [[101] + text_1 + [102] + text_2 + [102] + text_3]
    reader_input = BatchEncoding({"input_ids": input_ids})

    start_logits = [[0] * len(input_ids[0])]
    end_logits = [[0] * len(input_ids[0])]
    relevance_logits = [0]
    reader_output = DPRReaderOutput(start_logits, end_logits, relevance_logits)

    start_index, end_index = 8, 9
    start_logits[0][start_index] = 10
    end_logits[0][end_index] = 10
    predicted_spans = tokenizer.decode_best_spans(reader_input, reader_output)

    self.assertEqual(predicted_spans[0].start_index, start_index)
    self.assertEqual(predicted_spans[0].end_index, end_index)
    self.assertEqual(predicted_spans[0].doc_id, 0)
def batch_encode_packets(
    self,
    flows: Union[pd.DataFrame, np.ndarray],
    target_class: Optional[str] = None,
    add_special_tokens: bool = True,
    return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    return_attention_mask: Optional[bool] = True,
) -> BatchEncoding:
    if isinstance(flows, pd.DataFrame):
        flows = flows.values

    if flows.shape[1] // 2 != self.max_model_input_sizes:
        logger.debug(
            f'input number of features ({flows.shape[1] // 2}) does not match '
            f'max_model_input_sizes ({self.max_model_input_sizes})'
        )

    clusters = self.packet_quantizer.transform(flows)

    if add_special_tokens:
        first_token = self.convert_tokens_to_ids(target_class) if target_class is not None else self.bos_token_id
        expander = partial(self._expand_with_special_tokens, first_token=first_token)
        clusters = np.apply_along_axis(expander, axis=1, arr=clusters)
    else:
        clusters = np.apply_along_axis(self._pad_flow, axis=1, arr=clusters)

    result = {'input_ids': clusters.astype(np.int64)}
    if return_attention_mask:
        token_mask = (clusters != self.pad_token_id).astype(np.int64)
        result.update({'attention_mask': token_mask})

    return BatchEncoding(result, tensor_type=TensorType(return_tensors), prepend_batch_axis=False)
def get_word_seg(text):
    token_batch = tokenizer(text, return_tensors="pt")
    token_ids = token_batch["input_ids"]
    token_ids = token_ids.numpy().tolist()[0]
    length = len(token_ids) - 2  # number of tokens without [CLS]/[SEP]

    # Build 2 * length - 1 masked copies of the sentence: row 2*i masks token i+1 alone,
    # row 2*i-1 masks tokens i and i+1 jointly (103 is BERT's [MASK] token id).
    batch_token_ids = np.array([token_ids] * (2 * length - 1))
    batch_segment_ids = np.zeros_like(batch_token_ids)
    for i in range(length):
        if i > 0:
            batch_token_ids[2 * i - 1, i] = 103
            batch_token_ids[2 * i - 1, i + 1] = 103
        batch_token_ids[2 * i, i + 1] = 103

    attention_mask = token_batch["attention_mask"].repeat(2 * length - 1, 1)
    input_ids = torch.from_numpy(batch_token_ids)
    token_type_ids = torch.from_numpy(batch_segment_ids)
    input_dict = {
        'input_ids': input_ids,
        'token_type_ids': token_type_ids,
        'attention_mask': attention_mask
    }
    inputs = BatchEncoding(input_dict)
    outputs = model(**inputs)
    vectors, _ = outputs[:2]
    vectors = vectors.detach().numpy()

    seg_list = []
    for threshold in range(length):
        # threshold = 8
        print(threshold)
        word_token_ids = [[token_ids[1]]]
        for i in range(1, length):
            # Compare each token's vector when masked alone vs. masked together with its
            # neighbour; a large difference suggests the two tokens belong to the same word.
            d1 = dist(vectors[2 * i, i + 1], vectors[2 * i - 1, i + 1])
            d2 = dist(vectors[2 * i - 2, i], vectors[2 * i - 1, i])
            d = (d1 + d2) / 2
            if d >= threshold:
                word_token_ids[-1].append(token_ids[i + 1])
            else:
                word_token_ids.append([token_ids[i + 1]])

        words = [
            tokenizer.decode(ids).replace(" ", "") for ids in word_token_ids
        ]
        print(words)
        seg_list.append(words)

    return seg_list
def __init__(
    self,
    descriptions: BatchEncoding,
    patients: BatchEncoding,
    doctors: BatchEncoding,
    neg_samples: int = 9,
    max_length: int = 256,
    evaluation: bool = False
):
    super().__init__()
    self.descriptions = descriptions.copy()
    self.patients = patients.copy()
    self.doctors = doctors.copy()
    self.evaluation = evaluation
    print(len(self))

    # Usually the description length might be shorter
    self.__trim_or_pad_input(self.descriptions, max_length)
    self.__trim_or_pad_input(self.patients, max_length)
    self.__trim_or_pad_input(self.doctors, max_length)

    self.neg_samples = neg_samples
def test_group_by_length_with_batch_encoding(self):
    # Get some inputs of random lengths
    data = []
    for _ in range(6):
        input_ids = torch.randint(0, 25, (100,)).tolist()
        data.append(BatchEncoding({"input_ids": input_ids}))
    # Put one bigger than the others to check it ends up in first position
    data[3]["input_ids"] = torch.randint(0, 25, (105,)).tolist()

    indices = list(LengthGroupedSampler(4, dataset=data))
    # The biggest element should be first
    self.assertEqual(len(data[indices[0]]["input_ids"]), 105)
    # The indices should be a permutation of range(6)
    self.assertEqual(sorted(indices), list(range(6)))
def __call__(
    self,
    text: Union[TextInput, List[TextInput]],
    text_pair: Optional[Union[TextInput, List[TextInput]]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    **kwargs,
) -> BatchEncoding:
    """
    Tokenize the text into a sequence of image blocks.

    Parameters
    ----------
    text : Union[TextInput, List[TextInput]]
        A single text or a list of texts
    text_pair : Optional[Union[TextInput, List[TextInput]]], optional
        A single text or a list of texts, by default None
    add_special_tokens : bool, optional
        Whether to add special tokens to the data, by default True
    padding : Union[bool, str, PaddingStrategy], optional
        The padding strategy, by default False
    truncation : Union[bool, str, TruncationStrategy], optional
        The truncation strategy, by default False
    max_length : Optional[int], optional
        Maximum sequence length, overriding the class variable, by default None
    pad_to_multiple_of : Optional[int], optional
        Pad the sequence length to a multiple of this value, by default None
    return_tensors : Optional[Union[str, TensorType]], optional
        Return tensors as `'pt'`, `'tf'` or `'np'`, by default None
    return_token_type_ids : Optional[bool], optional
        Whether to return token type ids, by default None
    return_attention_mask : Optional[bool], optional
        Whether to return the attention mask, by default None
    return_overflowing_tokens : bool, optional
        Whether to return overflowing tokens, by default False
    return_special_tokens_mask : bool, optional
        Whether to return the special tokens mask, by default False
    return_length : bool, optional
        Whether to return the sequence length, by default False

    Returns
    -------
    BatchEncoding
        A BatchEncoding object
    """
    if self.special_tokens is None:
        self.special_tokens = {
            "CLS": self.text2embeddings("[CLS]"),
            "SEP": self.text2embeddings("[SEP]"),
        }

    if add_special_tokens and text_pair:
        actual_max_length = (
            self.max_length
            - len(self.special_tokens["SEP"]) * 2
            - len(self.special_tokens["CLS"])
        )
    else:
        actual_max_length = self.max_length

    batch_outputs = {}
    text = text if isinstance(text, list) else [text]
    text_pair = text_pair if isinstance(text_pair, list) else [text_pair]

    if isinstance(padding, str):
        padding = PaddingStrategy(padding)
    if isinstance(truncation, str):
        truncation = TruncationStrategy(truncation)

    for first_text, second_text in zip_longest(text, text_pair, fillvalue=None):
        first_embeddings = self.text2embeddings(first_text)
        second_embeddings = self.text2embeddings(second_text)
        outputs = self.prepare_for_model(
            first_embeddings,
            second_embeddings,
            add_special_tokens=add_special_tokens,
            padding=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
            truncation=truncation,
            max_length=max_length or actual_max_length,
            pad_to_multiple_of=None,  # we pad in batch afterward
            return_attention_mask=False,  # we pad in batch afterward
            return_token_type_ids=return_token_type_ids,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_length=return_length,
            return_tensors=None,  # we convert the whole batch to tensors at the end
            prepend_batch_axis=False,
        )

        for key, value in outputs.items():
            if key not in batch_outputs:
                batch_outputs[key] = []
            batch_outputs[key].append(value)

    batch_outputs = self.pad(
        batch_outputs,
        padding=padding,
        max_length=max_length or actual_max_length,
        pad_to_multiple_of=pad_to_multiple_of,
        return_attention_mask=return_attention_mask,
    )
    batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
    return batch_outputs
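# Hedged usage sketch (not part of the original source). Assumes `embedding_tokenizer` is an
# instance of the class defining __call__ above; its class name is not shown here, and the call
# signature simply mirrors the Hugging Face tokenizer API documented in the docstring.
encoding = embedding_tokenizer(
    ["first question", "second question"],
    text_pair=["first passage", "second passage"],
    padding="longest",
    return_tensors="pt",
)
print(encoding["input_ids"].shape, encoding["attention_mask"].shape)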
def pad(
    self,
    encoded_inputs: Union[
        BatchEncoding,
        List[BatchEncoding],
        Dict[str, EncodedInput],
        Dict[str, List[EncodedInput]],
        List[Dict[str, EncodedInput]],
    ],
    padding: Union[bool, str, PaddingStrategy] = True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchEncoding:
    # If we have a list of dicts, let's convert it to a dict of lists
    # We do this to allow using this method as a collate_fn function in a PyTorch DataLoader
    if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
        encoded_inputs = {
            key: [example[key] for example in encoded_inputs]
            for key in encoded_inputs[0].keys()
        }

    # The model's main input name, usually `input_ids`, has to be passed for padding
    if self.model_input_names[0] not in encoded_inputs:
        raise ValueError(
            "You should supply an encoding or a list of encodings to this method "
            f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
        )

    required_input = encoded_inputs[self.model_input_names[0]]

    if required_input is None:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = []
        return encoded_inputs

    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
    # and rebuild them afterwards if no return_tensors is specified
    # Note that we lose the specific device the tensor may be on for PyTorch
    first_element = required_input[0]
    if isinstance(first_element, (list, tuple)):
        # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
        index = 0
        while len(required_input[index]) == 0:
            index += 1
        if index < len(required_input):
            first_element = required_input[index][0]

    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
    if not isinstance(first_element, (int, list, tuple)):
        if is_torch_available() and is_torch(first_element):
            return_tensors = "pt" if return_tensors is None else return_tensors
        elif isinstance(first_element, np.ndarray):
            return_tensors = "np" if return_tensors is None else return_tensors
        else:
            raise ValueError(
                f"type of {first_element} unknown: {type(first_element)}. "
                f"Should be one of a python, numpy or pytorch object."
            )

        for key, value in encoded_inputs.items():
            encoded_inputs[key] = to_py_obj(value)

    required_input = encoded_inputs[self.model_input_names[0]]
    if required_input and not isinstance(required_input[0], (list, tuple)):
        encoded_inputs = self._pad(
            encoded_inputs,
            max_length=max_length,
            padding_strategy=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

    batch_size = len(required_input)
    assert all(
        len(v) == batch_size for v in encoded_inputs.values()
    ), "Some items in the output dictionary have a different batch size than others."

    if padding == PaddingStrategy.LONGEST:
        max_length = max(len(inputs) for inputs in required_input)
        padding = PaddingStrategy.MAX_LENGTH

    batch_outputs = {}
    for i in range(batch_size):
        inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
        outputs = self._pad(
            inputs,
            max_length=max_length,
            padding_strategy=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        for key, value in outputs.items():
            if key not in batch_outputs:
                batch_outputs[key] = []
            batch_outputs[key].append(value)

    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
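# Hedged usage sketch (not from the original source): the comments in `pad` above note that it can
# serve as a DataLoader collate_fn, turning a list of per-example BatchEncoding dicts into one
# padded batch. Assumes the `transformers` and `torch` packages are installed.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
examples = [tokenizer(t) for t in ["a short text", "a somewhat longer example text"]]

def collate_fn(features):
    # Pad every feature to the longest sequence in the batch and return PyTorch tensors.
    return tokenizer.pad(features, padding=True, return_tensors="pt")

loader = DataLoader(examples, batch_size=2, collate_fn=collate_fn)
batch = next(iter(loader))
print(batch["input_ids"].shape)  # (2, longest_sequence_length_in_batch)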
def prepare_for_model(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    prepend_batch_axis: bool = False,
    **kwargs,
):
    pair = bool(pair_ids is not None)
    len_ids = len(ids)
    len_pair_ids = len(pair_ids) if pair else 0

    if return_token_type_ids and not add_special_tokens:
        raise ValueError(
            "Asking to return token_type_ids while setting add_special_tokens to False "
            "results in an undefined behavior. Please set add_special_tokens to True or "
            "set return_token_type_ids to None."
        )

    # Load from model defaults
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Compute the total size of the returned encodings
    total_len = len_ids + len_pair_ids + (
        self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0
    )

    # Truncation: Handle max sequence length
    overflowing_tokens = []
    if truncation != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
        ids, pair_ids, overflowing_tokens = self.truncate_sequences(
            ids,
            pair_ids=pair_ids,
            num_tokens_to_remove=total_len - max_length,
            truncation_strategy=truncation,
            stride=stride,
        )

    if return_overflowing_tokens:
        encoded_inputs["overflowing_tokens"] = overflowing_tokens
        encoded_inputs["num_truncated_tokens"] = total_len - max_length

    # Add special tokens
    if add_special_tokens:
        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
        token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
    else:
        sequence = np.concatenate([ids, pair_ids], axis=0) if pair else ids
        token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])

    # Build output dictionary
    encoded_inputs["input_ids"] = sequence
    if return_token_type_ids:
        encoded_inputs["token_type_ids"] = token_type_ids
    if return_special_tokens_mask:
        if add_special_tokens:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
        else:
            encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

    # Padding
    if padding != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
        encoded_inputs = self.pad(
            encoded_inputs,
            max_length=max_length,
            padding=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

    if return_length:
        encoded_inputs["length"] = len(encoded_inputs["input_ids"])

    batch_outputs = BatchEncoding(
        encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
    )

    return batch_outputs
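# Hedged usage sketch (not from the original source), reusing the bert-base-uncased `tokenizer`
# created in the pad sketch above. `prepare_for_model` consumes ids obtained by chaining
# `tokenize` and `convert_tokens_to_ids`, as the docstring of `_prepare_for_model` below notes.
# Shown with a standard Hugging Face tokenizer; the custom override above operates on embeddings
# from text2embeddings and may behave differently.
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("question sequence"))
pair_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("title sequence"))
encoded = tokenizer.prepare_for_model(
    ids,
    pair_ids=pair_ids,
    add_special_tokens=True,
    return_tensors="pt",
    prepend_batch_axis=True,
)
print(encoded["input_ids"].shape)  # (1, sequence_length) because prepend_batch_axis=True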
def _infer(self, sents: List[List[str]], inst_directions: List[str], do_basic_tokenization=True):
    """
    Main function for inference.

    Args:
        sents: A list of inputs tokenized by a basic tokenizer.
        inst_directions: A list of str where each str indicates the direction of the corresponding
            instance (i.e., INST_BACKWARD for ITN or INST_FORWARD for TN).
        do_basic_tokenization: whether to do a pre-processing step to separate punctuation marks,
            recommended to set to True.

    Returns:
        all_tag_preds: A list of lists where each list contains the raw tag predictions for the
            corresponding input words in sents.
        nb_spans: A list of ints where each int indicates the number of semiotic spans in the input words.
        span_starts: A list of lists where each list contains the starting locations of semiotic spans
            in the input words.
        span_ends: A list of lists where each list contains the ending locations of semiotic spans
            in the input words.
    """
    self.eval()

    # Append prefix
    texts = []
    for ix, sent in enumerate(sents):
        if inst_directions[ix] == constants.INST_BACKWARD:
            prefix = constants.ITN_PREFIX
        elif inst_directions[ix] == constants.INST_FORWARD:
            prefix = constants.TN_PREFIX
        if do_basic_tokenization:
            texts.append([prefix] + sent)
        else:
            texts.append(prefix + " " + sent)

    # Apply the model
    if do_basic_tokenization:
        is_split_into_words = True
    else:
        is_split_into_words = False

    encodings = self._tokenizer(
        texts, is_split_into_words=is_split_into_words, padding=True, truncation=True, return_tensors='pt'
    )

    inputs = encodings
    encodings_reduced = None

    # Check that the length of 'input_ids' is at least the length of the original input.
    # If an input symbol is missing from the tokenizer's vocabulary (such as an emoji or a Chinese
    # character), it could be skipped.
    if do_basic_tokenization:
        len_texts = [len(x) for x in texts]
    else:
        len_texts = [len(x.split()) for x in texts]
    len_ids = [
        len(self._tokenizer.convert_ids_to_tokens(x, skip_special_tokens=True))
        for x in encodings['input_ids']
    ]
    idx_valid = [i for i, (t, enc) in enumerate(zip(len_texts, len_ids)) if enc >= t]

    if len(idx_valid) != len(texts):
        logging.warning(
            'Some of the examples have symbols that were skipped during the tokenization. Such examples will be skipped.'
        )
        for i in range(len(texts)):
            if i not in idx_valid:
                logging.warning(f'Invalid input: {texts[i]}')
        # skip these sentences and fall back to the input
        # exclude invalid examples from the encodings
        encodings_reduced = {k: tensor[idx_valid, :] for k, tensor in encodings.items()}
        for k, tensor in encodings_reduced.items():
            if tensor.ndim == 1:
                encodings_reduced[k] = tensor.unsqueeze(dim=0)
        inputs = BatchEncoding(data=encodings_reduced)

    # skip the batch if no valid inputs are present
    if encodings_reduced and encodings_reduced['input_ids'].numel() == 0:
        # -1 to exclude the tag for the prompt token
        all_tag_preds = [[constants.SAME_TAG] * (len(x) - 1) for x in texts]
        nb_spans = [0] * len(texts)
        span_starts = [[] for _ in texts]
        span_ends = [[] for _ in texts]
        return all_tag_preds, nb_spans, span_starts, span_ends

    logits = self.model(**inputs.to(self.device)).logits
    pred_indexes = torch.argmax(logits, dim=-1).tolist()

    # Extract all_tag_preds for words
    all_tag_preds = []
    batch_size, max_len = encodings['input_ids'].size()
    pred_idx = 0
    for ix in range(batch_size):
        if ix in idx_valid:
            # remove the first special token and the task prefix token
            raw_tag_preds = [constants.ALL_TAG_LABELS[p] for p in pred_indexes[pred_idx][2:]]
            tag_preds, previous_word_idx = [], None
            word_ids = encodings.word_ids(batch_index=ix)[2:]
            for jx, word_idx in enumerate(word_ids):
                if word_idx is None:
                    continue
                if word_idx != previous_word_idx:
                    tag_preds.append(raw_tag_preds[jx])  # without special token at index 0
                previous_word_idx = word_idx
            pred_idx += 1
        else:
            # for excluded examples, use SAME tags for all words
            tag_preds = [constants.SAME_TAG] * (len(texts[ix]) - 1)
        all_tag_preds.append(tag_preds)

    # Post-correction of simple tagger mistakes, i.e. an I- tag preceding the B- tag of a span
    all_tag_preds = [
        self._postprocess_tag_preds(words, inst_dir, ps)
        for words, inst_dir, ps in zip(sents, inst_directions, all_tag_preds)
    ]

    # Decoding
    nb_spans, span_starts, span_ends = self.decode_tag_preds(all_tag_preds)

    return all_tag_preds, nb_spans, span_starts, span_ends
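# Hedged usage sketch (not from the original source): how _infer's inputs and outputs relate,
# assuming `model` is an instance of the duplex tagger class above and `constants` is its
# constants module (INST_FORWARD marks the TN direction, per the docstring).
sents = [["it", "costs", "one", "hundred", "dollars"]]
all_tag_preds, nb_spans, span_starts, span_ends = model._infer(
    sents, inst_directions=[constants.INST_FORWARD], do_basic_tokenization=True
)
# all_tag_preds[0] holds one raw tag per input word; nb_spans[0] is the number of detected
# semiotic spans, with their word-level boundaries in span_starts[0] / span_ends[0].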
def _prepare_for_model(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
    truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
    max_length: Optional[int] = None,
    stride: int = 0,
    return_tensors: Optional[str] = None,
    prepend_batch_axis: bool = False,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    verbose: bool = True,
) -> BatchEncoding:
    """
    Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by
    the model. It adds special tokens, truncates sequences if overflowing while taking into account
    the special tokens, and manages a moving window (with user-defined stride) for overflowing tokens.

    Args:
        ids: list of tokenized input ids. Can be obtained from a string by chaining the `tokenize`
            and `convert_tokens_to_ids` methods.
        pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the
            `tokenize` and `convert_tokens_to_ids` methods.
    """
    pair = bool(pair_ids is not None)
    len_ids = len(ids)
    len_pair_ids = len(pair_ids) if pair else 0

    # Load from model defaults
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Truncation: Handle max sequence length
    total_len = len_ids + len_pair_ids + (
        self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0
    )
    if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length:
        ids, pair_ids, overflowing_tokens = self.truncate_sequences(
            ids,
            pair_ids=pair_ids,
            num_tokens_to_remove=total_len - max_length,
            truncation_strategy=truncation_strategy,
            stride=stride,
        )
        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

    # Add special tokens
    if add_special_tokens:
        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
        token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
    else:
        sequence = ids + pair_ids if pair else ids
        token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])

    # Build output dictionary
    encoded_inputs["input_ids"] = sequence
    if return_token_type_ids:
        encoded_inputs["token_type_ids"] = token_type_ids
    if return_special_tokens_mask:
        if add_special_tokens:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
        else:
            encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

    # Check lengths
    if max_length is None and len(encoded_inputs["input_ids"]) > self.model_max_length and verbose:
        logger.warning(
            "Token indices sequence length is longer than the specified maximum sequence length "
            "for this model ({} > {}). Running this sequence through the model will result in "
            "indexing errors".format(len(ids), self.model_max_length)
        )

    # Padding
    encoded_inputs = self.pad(
        encoded_inputs,
        max_length=max_length,
        padding=padding_strategy.value,
        return_attention_mask=return_attention_mask,
    )

    if return_length:
        encoded_inputs["length"] = len(encoded_inputs["input_ids"])

    batch_outputs = BatchEncoding(
        encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
    )

    return batch_outputs