Beispiel #1
0
    def _process_entity_annotations(
            self,
            pack: DataPack,
            label: str,
            word_begin: int,
            word_end: int,
            current_entity_mention: Optional[Tuple[int, str]],
    ) -> Optional[Tuple[int, str]]:

        ner_type = label.strip("()*")

        if "(" in label:
            # Entering into a span for a particular ner.
            current_entity_mention = (word_begin, ner_type)
        if ")" in label:
            if current_entity_mention is None:
                raise ValueError(
                    "current_entity_mention is None when meet right blanket.")
            # Exiting a span, add and then reset the current span.
            kwargs_i = {"ner_type": current_entity_mention[1]}
            entity = EntityMention(pack, current_entity_mention[0], word_end)
            entity.set_fields(**kwargs_i)
            pack.add_or_get_entry(entity)

            current_entity_mention = None

        return current_entity_mention
Beispiel #2
0
    def _process(self, input_pack: DataPack):
        """Add an `EntityMention` for every named-entity chunk found.

        For each `Sentence` in `input_pack`, the tokens produced by
        `self.token_component` are passed to `ne_chunk` as ``(text, pos)``
        pairs. Every element of the result that has a ``label`` attribute
        (e.g. ``Tree('GPE', [('New', 'NNP'), ('York', 'NNP')])``) becomes an
        `EntityMention` spanning its tokens, with ``ner_type`` set to the
        chunk label. Other elements (e.g. ``('This', 'DT')``) are single
        tokens outside any entity.

        Args:
            input_pack: the pack to read sentences/tokens from and to add
                the entity mentions to.
        """
        for sentence in input_pack.get(Sentence):
            sentence_tokens = list(
                input_pack.get(entry_type=Token,
                               range_annotation=sentence,
                               component=self.token_component))
            tagged_words = [(tok.text, tok.pos) for tok in sentence_tokens]

            # Position, within `sentence_tokens`, of the first token of the
            # chunk currently being visited.
            cursor = 0
            for node in ne_chunk(tagged_words):
                if not hasattr(node, 'label'):
                    # A plain (word, tag) pair: one token, no entity.
                    cursor += 1
                    continue

                # A labelled subtree covering `width` consecutive tokens.
                width = len(node)
                first_token = sentence_tokens[cursor]
                last_token = sentence_tokens[cursor + width - 1]
                mention = EntityMention(input_pack,
                                        first_token.span.begin,
                                        last_token.span.end)
                mention.set_fields(ner_type=node.label())
                input_pack.add_or_get_entry(mention)
                cursor += width
Beispiel #3
0
    def _parse_pack(self, data: dict) -> Iterator[DataPack]:
        r"""Build a DataPack from one document output by the Prodigy
        Annotator, carrying over its text, tokens and span annotations.

        Args:
            data: a dict that contains information for one document. The
                keys read here are `text`, `tokens` (items with `start` and
                `end`), `spans` (items with `start`, `end` and `label`) and
                `meta` (with `id`).

        Yields:
            A single DataPack holding a `Document` over the whole text, one
            `Token` per token item, one `EntityMention` per span item (its
            `ner_type` set to the span label), and the document id from
            `meta`.
        """
        pack = DataPack()
        raw_text = data['text']
        token_items = data['tokens']
        entity_items = data['spans']

        doc_entry = Document(pack, 0, len(raw_text))
        pack.set_text(raw_text, replace_func=self.text_replace_operation)
        pack.add_or_get_entry(doc_entry)

        for item in token_items:
            pack.add_or_get_entry(Token(pack, item['start'], item['end']))

        for item in entity_items:
            mention = EntityMention(pack, item['start'], item['end'])
            mention.set_fields(ner_type=item['label'])
            pack.add_or_get_entry(mention)

        pack.meta.doc_id = data['meta']['id']

        yield pack
Beispiel #4
0
    def pack(self,
             data_pack: DataPack,
             output_dict: Optional[Dict[str, Dict[str, List[str]]]] = None):
        """
        Write the prediction results back to datapack by writing the
        predicted ner tag to the original tokens and adding an
        `EntityMention` for every completed entity.

        Tags are read by their first character: ``B`` opens a span, ``E``
        closes the open span when the types agree, ``S`` is a one-token
        entity, and ``I``/``O`` (or anything else) add nothing. The entity
        type is the tag text from index 2 onwards (e.g. ``PER`` in
        ``B-PER``).

        Args:
            data_pack: the pack holding the tokens referenced by the
                predictions; new entity mentions are added to it.
            output_dict: the predictions. ``output_dict["Token"]["tid"][i][j]``
                is the id of the j-th token of the i-th instance and
                ``output_dict["Token"]["ner"][i][j]`` is its predicted tag.
                Nothing is done when this is None.
        """

        if output_dict is None:
            return

        token_output = output_dict["Token"]

        # The span opened by the most recent "B" tag as (begin, ner_type),
        # or None when no span is open. It is cleared whenever an entity is
        # emitted so that a stray "E" tag later on cannot pair up with a
        # begin offset that has already been consumed.
        open_mention: Optional[Tuple[int, str]] = None

        for i, instance_tids in enumerate(token_output["tid"]):
            # an instance
            for j, tid in enumerate(instance_tids):
                token: Token = data_pack.get_entry(tid)  # type: ignore
                ner_tag: str = token_output["ner"][i][j]

                token.set_fields(ner=ner_tag)
                token_ner = token.get_field("ner")

                # Slicing keeps an empty tag from raising; it simply matches
                # none of the prefixes below.
                prefix = token_ner[:1]
                ner_type = token_ner[2:]

                if prefix == "B":
                    open_mention = (token.span.begin, ner_type)
                    continue

                if prefix == "S":
                    # A one-token entity starts at this very token.
                    mention_begin = token.span.begin
                elif (prefix == "E" and open_mention is not None
                      and ner_type == open_mention[1]):
                    mention_begin = open_mention[0]
                else:
                    # "I", "O", an "E" with no matching open span, or an
                    # unrecognised tag: nothing to emit for this token.
                    continue

                entity = EntityMention(data_pack, mention_begin,
                                       token.span.end)
                entity.set_fields(ner_type=ner_type)
                data_pack.add_or_get_entry(entity)
                open_mention = None
Beispiel #5
0
    def _process_ner(self, doc, input_pack):
        """Perform spaCy's NER Pipeline on the document.

        Args:
            doc: The document
            input_pack: Input pack to fill

        Returns:

        """
        ner_doc = self.nlp(doc)

        for item in ner_doc.ents:
            entity = EntityMention(input_pack, item.start_char,
                                   item.end_char)
            kwargs_i = {"ner_type": item.label_}
            entity.set_fields(**kwargs_i)
            input_pack.add_or_get_entry(entity)