def _get_tars_formatted_sentence(self, label, sentence):
    """
    Build a new sentence that pairs ``label`` with the text of ``sentence``.

    The tokenized text of ``sentence`` and ``label`` are joined with
    ``self.separator``; ``label`` comes first when ``self.prefix`` is truthy
    and last otherwise. Every label of type ``self.label_type`` on the input
    sentence whose value equals ``label`` is copied onto the new sentence as a
    span label with value ``"entity"`` under ``self.static_label_type``.

    :param label: the label text to pair with the sentence
    :param sentence: the sentence whose text and labels are read
    :return: the newly created sentence
    """
    tokenized_text = sentence.to_tokenized_string()

    if self.prefix:
        paired_text = f"{label} {self.separator} {tokenized_text}"
        # The label and separator are placed in front of the original text,
        # so token indices move by the number of space-separated pieces.
        index_offset = len(label.split(" ")) + len(self.separator.split(" "))
    else:
        paired_text = f"{tokenized_text} {self.separator} {label}"
        index_offset = 0

    # The paired text is already tokenized, so the tokenizer is not used again.
    tars_sentence = Sentence(paired_text, use_tokenizer=False)

    for annotation in sentence.get_labels(self.label_type):
        if annotation.value != label:
            continue

        shifted_tokens = []
        for token in annotation.span:
            shifted_tokens.append(tars_sentence.get_token(token.idx + index_offset))

        entity_label = SpanLabel(Span(shifted_tokens), value="entity")
        tars_sentence.add_complex_label(self.static_label_type, entity_label)

    return tars_sentence
def _label(self, sentence: Sentence):
    """
    Add a complex label to the sentence for every match of every registered pattern.

    Each pattern in ``self._regexp_mapping`` is run over the original text of
    the sentence. For every match, the matched character span is converted to
    a token span and attached to the sentence as a ``SpanLabel`` under the
    label the pattern is registered for.

    :param sentence: the sentence to search and to add labels to
    :raises Exception: if the token collection rejects a match span with a
        ``ValueError``; the original error is kept as the cause
    """
    collection = RegexpTagger.TokenCollection(sentence)

    # Computed once rather than per pattern.
    # NOTE(review): assumes adding labels does not alter the original text - confirm.
    original_text = sentence.to_original_text()

    for label, pattern in self._regexp_mapping.items():
        for match in pattern.finditer(original_text):
            span: Tuple[int, int] = match.span()
            try:
                token_span = collection.get_token_span(span)
            except ValueError as err:
                # Chain explicitly so the underlying ValueError stays visible
                # as the direct cause in the traceback.
                raise Exception(
                    f"The match span {span} for label '{label}' is overlapping with a token!"
                ) from err

            sentence.add_complex_label(label, SpanLabel(token_span, label))