def featurize(
    self,
    text: Any,
    entities: List[Dict],
    label: Optional[Union[int, str, List[Union[int, str]]]] = None,
) -> Optional[Instance]:
    """Build an instance carrying the featurized text, its entity tags and a label.

    Parameters
    ----------
    text
        Input handed to the backbone featurizer and to the backbone
        tokenizer's ``nlp`` pipeline.
    entities
        Span annotations converted to per-token tags by ``tags_from_offsets``
        using ``self._label_encoding``.
    label
        Optional label, forwarded unchanged to ``self.add_label``.

    Returns
    -------
    Optional[Instance]
        Whatever ``self.add_label`` returns, or ``None`` (after logging a
        warning) when the spans could not be aligned with the tokens.
    """
    instance = self.backbone.featurizer(
        text,
        to_field=self.forward_arg_name,
        aggregate=True,
        exclude_record_keys=True,
    )

    doc = self.backbone.tokenizer.nlp(text)
    entity_tags = tags_from_offsets(doc, entities, self._label_encoding)

    # A "-" tag means at least one span did not line up with the tokens;
    # such examples are dropped instead of being featurized.
    if "-" in entity_tags:
        self.__LOGGER.warning(
            f"Could not align spans with tokens for following example: '{text}' {entities}"
        )
        return None

    # Read the text field back under the same name it was requested with
    # (``to_field`` above) rather than assuming it is literally "text".
    text_field = cast(TextField, instance[self.forward_arg_name])
    instance.add_field(
        "entities",
        SequenceLabelField(
            entity_tags,
            sequence_field=text_field,
            label_namespace=self._entity_tags_namespace,
        ),
    )

    return self.add_label(instance, label, to_field=self.label_name)
def featurize(
    self,
    text: Union[str, List[str]],
    labels: Optional[Union[List[str], List[int], List[dict]]] = None,
) -> Optional[Instance]:
    """Turn a text and its optional labels into an instance.

    Parameters
    ----------
    text
        Either a plain str, or a list of str that is treated as a list of
        pretokenized tokens (tokenization is only requested for a str).
    labels
        Either a list of tag labels in the BIOUL or BIO format, or a list of
        span labels. A span label is a dictionary with the keys:
            'start': int, char index of the start of the span
            'end': int, char index of the end of the span (exclusive)
            'label': str, label of the span
        Span labels are used with the `spacy.gold.biluo_tags_from_offsets` method.
        An empty list is handled like a list of span labels.

    Returns
    -------
    Optional[Instance]
        The instance, with a "labels" field when ``labels`` was given, or
        ``None`` when span labels could not be aligned with the tokens.
    """
    featurized = self.backbone.featurizer(
        text,
        to_field="text",
        tokenize=isinstance(text, str),
        aggregate=True,
    )

    # Nothing to attach: the bare featurized text is the result.
    if labels is None:
        return featurized

    tag_sequence = labels
    given_as_spans = labels == [] or isinstance(labels[0], dict)
    if given_as_spans:
        # Span labels have to be converted to per-token tags first.
        parsed = self.backbone.tokenizer.nlp(text)
        tag_sequence = tags_from_offsets(parsed, labels, self._label_encoding)
        # Misaligned examples are discarded for now.
        if "-" in tag_sequence:
            self.__LOGGER.warning(
                f"Could not align spans with tokens for following example: '{text}' {labels}"
            )
            return None

    label_field = SequenceLabelField(
        tag_sequence,
        sequence_field=cast(TextField, featurized["text"]),
        label_namespace=vocabulary.LABELS_NAMESPACE,
    )
    featurized.add_field("labels", label_field)
    return featurized
def featurize(
    self,
    text: Union[str, List[str]],
    entities: Optional[List[dict]] = None,
    tags: Optional[Union[List[str], List[int]]] = None,
) -> Optional[Instance]:
    """Create an instance from raw or pretokenized text.

    Parameters
    ----------
    text
        Either a plain str, or a list of str that is treated as a list of
        pretokenized tokens.
    entities
        A list of span labels, only consulted when ``text`` is a str.
        A span label is a dictionary with the keys:
            'start': int, char index of the start of the span
            'end': int, char index of the end of the span (exclusive)
            'label': str, label of the span
        They are used with the `spacy.gold.biluo_tags_from_offsets` method.
    tags
        A list of tags in the BIOUL or BIO format. Only used as given when
        ``text`` is pretokenized; for a str the tags are derived from
        ``entities`` (or are empty when ``entities`` is None).

    Returns
    -------
    Optional[Instance]
        The instance built by ``self._featurize_tokens`` with an extra
        "raw_text" metadata field, or ``None`` when the spans could not be
        aligned with the tokens.
    """
    if not isinstance(text, str):
        # The text is already pre-tokenized, use it (and the tags) as given.
        tokens = text
    else:
        parsed = self.backbone.tokenizer.nlp(text)
        tokens = []
        for spacy_token in parsed:
            tokens.append(spacy_token.text)

        if entities is None:
            tags = []
        else:
            tags = tags_from_offsets(parsed, entities, self._label_encoding)

        # Misaligned examples are discarded for now.
        if "-" in tags:
            self.__LOGGER.warning(
                f"Could not align spans with tokens for following example: '{text}' {entities}"
            )
            return None

    result = self._featurize_tokens(tokens, tags)
    raw_text_field = MetadataField(text)
    result.add_field("raw_text", raw_text_field)
    return result
def featurize(
    self,
    text: Union[str, List[str], Dict[str, str]],
    entities: List[Dict],
    label: Optional[Union[str, List[str]]] = None,
) -> Instance:
    """Build an instance carrying the featurized text, its entity tags and a label.

    Parameters
    ----------
    text
        Input handed to the backbone featurizer and to the backbone
        tokenizer's ``nlp`` pipeline.
    entities
        Span annotations converted to per-token tags by ``tags_from_offsets``
        using ``self._label_encoding``.
    label
        Optional label, forwarded unchanged to ``self._add_label``.

    Returns
    -------
    Instance
        Whatever ``self._add_label`` returns for the prepared instance.

    Raises
    ------
    FeaturizeError
        If the spans could not be aligned with the tokens, or if creating or
        attaching the "entities" field fails.
    """
    instance = self.backbone.featurizer(
        text,
        to_field=self._TEXT_ARG_NAME_IN_FORWARD,
        aggregate=True,
        exclude_record_keys=True,
    )

    doc = self.backbone.tokenizer.nlp(text)
    entity_tags = tags_from_offsets(doc, entities, self._label_encoding)

    # A "-" tag means at least one span did not line up with the tokens.
    if "-" in entity_tags:
        raise FeaturizeError(
            f"Could not align spans with tokens for following example: '{text}' {entities}"
        )

    try:
        # Read the text field back under the same name it was requested with
        # (``to_field`` above) rather than assuming it is literally "text".
        # The lookup stays inside the ``try`` so a failure is still reported
        # as a FeaturizeError.
        text_field = cast(TextField, instance[self._TEXT_ARG_NAME_IN_FORWARD])
        instance.add_field(
            "entities",
            SequenceLabelField(
                entity_tags,
                sequence_field=text_field,
                label_namespace=self._entity_tags_namespace,
            ),
        )
    except Exception as error:
        raise FeaturizeError(
            f"Could not create SequenceLabelField for {(text, entity_tags)}"
        ) from error

    return self._add_label(instance, label, to_field=self._LABEL_ARG_NAME_IN_FORWARD)
def featurize(
    self,
    text: Union[str, List[str]],
    entities: Optional[List[dict]] = None,
    tags: Optional[Union[List[str], List[int]]] = None,
) -> Instance:
    """Create an instance from raw or pretokenized text.

    Parameters
    ----------
    text
        Either a plain str, or a list of str that is treated as a list of
        pretokenized tokens.
    entities
        A list of span labels, only consulted when ``text`` is a str.
        A span label is a dictionary with the keys:
            'start': int, char index of the start of the span
            'end': int, char index of the end of the span (exclusive)
            'label': str, label of the span
        They are used with the `spacy.gold.biluo_tags_from_offsets` method.
    tags
        A list of tags in the BIOUL or BIO format. Only used as given when
        ``text`` is pretokenized; for a str the tags are derived from
        ``entities`` (or are empty when ``entities`` is None).

    Returns
    -------
    Instance
        The featurized tokens with a "raw_text" metadata field and, while
        ``self.training`` is truthy, a "tags" field.

    Raises
    ------
    FeaturizeError
        If the spans could not be aligned with the tokens, or if creating or
        attaching the "tags" field fails.
    """
    if not isinstance(text, str):
        # The text is already pre-tokenized, wrap each entry in a Token.
        tokens = list(map(Token, text))
    else:
        parsed = self.backbone.tokenizer.nlp(text)
        tokens = list(map(spacy_to_allennlp_token, parsed))

        if entities is None:
            tags = []
        else:
            tags = tags_from_offsets(parsed, entities, self._label_encoding)

        # Misaligned examples are rejected for now.
        if "-" in tags:
            raise FeaturizeError(
                f"Could not align spans with tokens for following example: '{text}' {entities}"
            )

    result = self.backbone.featurizer(
        tokens,
        to_field="text",
        tokenize=False,
        aggregate=True,
    )

    # The tag field is only attached while training.
    if self.training:
        try:
            tag_field = SequenceLabelField(
                tags,
                sequence_field=cast(TextField, result["text"]),
                label_namespace=vocabulary.LABELS_NAMESPACE,
            )
            result.add_field("tags", tag_field)
        except Exception as err:
            raise FeaturizeError(
                f"Could not create SequenceLabelField for {(tokens, tags)}"
            ) from err

    result.add_field("raw_text", MetadataField(text))
    return result