コード例 #1
0
    def featurize(
        self,
        text: Any,
        entities: List[Dict],
        label: Optional[Union[int, str, List[Union[int, str]]]] = None,
    ) -> Optional[Instance]:
        """Turn a text with entity spans (and an optional label) into an instance.

        Parameters
        ----------
        text
            Input handed to the backbone featurizer and to the backbone
            tokenizer's `nlp` pipeline.
        entities
            Span annotations, converted to a tag sequence by `tags_from_offsets`.
        label
            Optional label(s), attached to the instance through `self.add_label`.

        Returns
        -------
        instance
            Whatever `self.add_label` returns for the featurized instance,
            or `None` when the spans could not be aligned with the tokens
            (a warning is logged in that case).
        """
        featurized = self.backbone.featurizer(
            text,
            to_field=self.forward_arg_name,
            aggregate=True,
            exclude_record_keys=True,
        )

        spacy_doc = self.backbone.tokenizer.nlp(text)
        tag_sequence = tags_from_offsets(spacy_doc, entities, self._label_encoding)

        # A "-" among the tags is treated as "span could not be aligned":
        # the example is skipped instead of being featurized.
        if "-" in tag_sequence:
            self.__LOGGER.warning(
                f"Could not align spans with tokens for following example: '{text}' {entities}"
            )
            return None

        # NOTE(review): the field is read back under the literal key "text" although
        # the featurizer was called with `to_field=self.forward_arg_name` -- confirm
        # that both always agree.
        token_field = cast(TextField, featurized["text"])
        entities_field = SequenceLabelField(
            tag_sequence,
            sequence_field=token_field,
            label_namespace=self._entity_tags_namespace,
        )
        featurized.add_field("entities", entities_field)

        return self.add_label(featurized, label, to_field=self.label_name)
コード例 #2
0
    def featurize(
        self,
        text: Union[str, List[str]],
        labels: Optional[Union[List[str], List[int], List[dict]]] = None,
    ) -> Optional[Instance]:
        """Create an instance from a text and, optionally, its token labels.

        Parameters
        ----------
        text
            Either a plain str, or a list of str. A list is taken to be
            already tokenized and is not tokenized again.
        labels
            Either a list of tags (BIOUL or BIO format) or a list of span labels.

            A span label is a dictionary with the keys:

            'start': int, char index where the span starts
            'end': int, char index where the span ends (exclusive)
            'label': str, label of the span

            Span labels are used with the `spacy.gold.biluo_tags_from_offsets` method.

        Returns
        -------
        instance
            The featurized instance, with a "labels" field if `labels` was given.
            `None` if span labels could not be aligned with the tokens
            (a warning is logged in that case).
        """
        is_raw_string = isinstance(text, str)
        instance = self.backbone.featurizer(
            text, to_field="text", tokenize=is_raw_string, aggregate=True
        )

        # Nothing to attach: hand back the bare instance.
        if labels is None:
            return instance

        # An empty list or a list of dicts is interpreted as span labels,
        # which first have to be converted into a tag sequence.
        # NOTE(review): this branch does not look at the type of `text`, so a
        # pre-tokenized list would be passed to `nlp` as well -- confirm that
        # combination is supported or never occurs.
        given_as_spans = labels == [] or isinstance(labels[0], dict)
        if given_as_spans:
            spacy_doc = self.backbone.tokenizer.nlp(text)
            converted = tags_from_offsets(spacy_doc, labels, self._label_encoding)
            # misaligned examples are dropped for the time being
            if "-" in converted:
                self.__LOGGER.warning(
                    f"Could not align spans with tokens for following example: '{text}' {labels}"
                )
                return None
            labels = converted

        token_field = cast(TextField, instance["text"])
        labels_field = SequenceLabelField(
            labels,
            sequence_field=token_field,
            label_namespace=vocabulary.LABELS_NAMESPACE,
        )
        instance.add_field("labels", labels_field)

        return instance
コード例 #3
0
    def featurize(
        self,
        text: Union[str, List[str]],
        entities: Optional[List[dict]] = None,
        tags: Optional[Union[List[str], List[int]]] = None,
    ) -> Optional[Instance]:
        """Create an instance from raw or pre-tokenized text.

        Parameters
        ----------
        text
            Either a plain str, or a list of str. A list is taken to be
            already tokenized.
        entities
            A list of span labels, only looked at when `text` is a str.

            A span label is a dictionary with the keys:

            'start': int, char index where the span starts
            'end': int, char index where the span ends (exclusive)
            'label': str, label of the span

            Span labels are used with the `spacy.gold.biluo_tags_from_offsets` method.
        tags
            A list of tags in the BIOUL or BIO format. Only used as given when
            `text` is pre-tokenized; for a str `text` it is replaced by the tags
            derived from `entities` (or by an empty list if `entities` is None).

        Returns
        -------
        instance
            The result of `self._featurize_tokens` with an additional "raw_text"
            metadata field holding `text`. `None` if the spans could not be
            aligned with the tokens (a warning is logged in that case).
        """
        if not isinstance(text, str):
            # the caller already did the tokenization
            tokens = text
        else:
            spacy_doc = self.backbone.tokenizer.nlp(text)
            tokens = [spacy_token.text for spacy_token in spacy_doc]

            if entities is None:
                tags = []
            else:
                tags = tags_from_offsets(spacy_doc, entities, self._label_encoding)

            # misaligned examples are dropped for the time being
            if "-" in tags:
                self.__LOGGER.warning(
                    f"Could not align spans with tokens for following example: '{text}' {entities}"
                )
                return None

        instance = self._featurize_tokens(tokens, tags)
        raw_text_field = MetadataField(text)
        instance.add_field("raw_text", raw_text_field)

        return instance
コード例 #4
0
    def featurize(
        self,
        text: Union[str, List[str], Dict[str, str]],
        entities: List[Dict],
        label: Optional[Union[str, List[str]]] = None,
    ) -> Instance:
        """Turn a text with entity spans (and an optional label) into an instance.

        Parameters
        ----------
        text
            Input handed to the backbone featurizer and to the backbone
            tokenizer's `nlp` pipeline.
        entities
            Span annotations, converted to a tag sequence by `tags_from_offsets`.
        label
            Optional label(s), attached to the instance through `self._add_label`.

        Returns
        -------
        instance
            Whatever `self._add_label` returns for the featurized instance.

        Raises
        ------
        FeaturizeError
            If the spans could not be aligned with the tokens, or if the
            "entities" field could not be created and added.
        """
        instance = self.backbone.featurizer(
            text,
            to_field=self._TEXT_ARG_NAME_IN_FORWARD,
            aggregate=True,
            exclude_record_keys=True,
        )

        spacy_doc = self.backbone.tokenizer.nlp(text)
        entity_tags = tags_from_offsets(spacy_doc, entities, self._label_encoding)

        # A "-" among the tags is treated as "span could not be aligned".
        misaligned = "-" in entity_tags
        if misaligned:
            raise FeaturizeError(
                f"Could not align spans with tokens for following example: '{text}' {entities}"
            )

        # Any failure while building or attaching the field is reported as a
        # FeaturizeError, with the original exception kept as its cause.
        try:
            # NOTE(review): the field is read back under the literal key "text"
            # although the featurizer was called with
            # `to_field=self._TEXT_ARG_NAME_IN_FORWARD` -- confirm both always agree.
            token_field = cast(TextField, instance["text"])
            entities_field = SequenceLabelField(
                entity_tags,
                sequence_field=token_field,
                label_namespace=self._entity_tags_namespace,
            )
            instance.add_field("entities", entities_field)
        except Exception as cause:
            raise FeaturizeError(
                f"Could not create SequenceLabelField for {(text, entity_tags)}"
            ) from cause

        return self._add_label(
            instance, label, to_field=self._LABEL_ARG_NAME_IN_FORWARD
        )
コード例 #5
0
    def featurize(
        self,
        text: Union[str, List[str]],
        entities: Optional[List[dict]] = None,
        tags: Optional[Union[List[str], List[int]]] = None,
    ) -> Instance:
        """Create an instance from raw or pre-tokenized text.

        Parameters
        ----------
        text
            Either a plain str, or a list of str. A list is taken to be
            already tokenized.
        entities
            A list of span labels, only looked at when `text` is a str.

            A span label is a dictionary with the keys:

            'start': int, char index where the span starts
            'end': int, char index where the span ends (exclusive)
            'label': str, label of the span

            Span labels are used with the `spacy.gold.biluo_tags_from_offsets` method.
        tags
            A list of tags in the BIOUL or BIO format. Only used as given when
            `text` is pre-tokenized; for a str `text` it is replaced by the tags
            derived from `entities` (or by an empty list if `entities` is None).

        Returns
        -------
        instance
            The featurized instance with a "raw_text" metadata field holding
            `text` and, while `self.training` is truthy, a "tags" field.

        Raises
        ------
        FeaturizeError
            If the spans could not be aligned with the tokens, or if the
            "tags" field could not be created and added.
        """
        if not isinstance(text, str):
            # the caller already did the tokenization
            tokens = list(map(Token, text))
        else:
            spacy_doc = self.backbone.tokenizer.nlp(text)
            tokens = list(map(spacy_to_allennlp_token, spacy_doc))

            if entities is None:
                tags = []
            else:
                tags = tags_from_offsets(spacy_doc, entities, self._label_encoding)

            # misaligned examples are rejected with an error
            if "-" in tags:
                raise FeaturizeError(
                    f"Could not align spans with tokens for following example: '{text}' {entities}"
                )

        # The tokens are final at this point, so the featurizer must not tokenize again.
        instance = self.backbone.featurizer(
            tokens, to_field="text", tokenize=False, aggregate=True
        )

        # The tags field is only attached in training mode. Any failure while
        # building or attaching it is reported as a FeaturizeError, with the
        # original exception kept as its cause.
        if self.training:
            try:
                token_field = cast(TextField, instance["text"])
                tags_field = SequenceLabelField(
                    tags,
                    sequence_field=token_field,
                    label_namespace=vocabulary.LABELS_NAMESPACE,
                )
                instance.add_field("tags", tags_field)
            except Exception as cause:
                raise FeaturizeError(
                    f"Could not create SequenceLabelField for {(tokens, tags)}"
                ) from cause

        raw_text_field = MetadataField(text)
        instance.add_field("raw_text", raw_text_field)

        return instance