Example #1
def test_doc_retokenize_spans_entity_merge_iob():
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-abc"), 0, 3),
        (doc.vocab.strings.add("ent-d"), 3, 4),
    ]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "B"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:1])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"

    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-de"), 3, 5),
        (doc.vocab.strings.add("ent-fg"), 5, 7),
    ]
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
    assert doc[5].ent_iob_ == "B"
    assert doc[6].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
Example #2
def test_matcher_ent_iob_key(en_vocab):
    """Test that patterns with ent_iob works correctly."""
    matcher = Matcher(en_vocab)
    matcher.add("Rule", [[{"ENT_IOB": "I"}]])
    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
    matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
    matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
    assert len(matches1) == 1
    assert matches1[0] == "York"
    assert len(matches2) == 0

    matcher = Matcher(en_vocab)  # Test iob pattern with operators
    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
    doc = Doc(
        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
    )
    doc.ents = [Span(doc, 4, 7, label="PERSON")]
    matches = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches) == 3
    assert matches[0] == "Maria"
    assert matches[1] == "Maria Esperanza"
    assert matches[2] == "Esperanza"
Example #3
def test_displacy_parse_ents(en_vocab):
    """Test that named entities on a Doc are converted into displaCy's format."""
    doc = Doc(en_vocab,
              words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    ents = displacy.parse_ents(doc)
    assert isinstance(ents, dict)
    assert ents["text"] == "But Google is starting from behind "
    assert ents["ents"] == [{
        "start": 4,
        "end": 10,
        "label": "ORG",
        "kb_id": "",
        "kb_url": "#"
    }]

    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")]
    ents = displacy.parse_ents(doc)
    assert isinstance(ents, dict)
    assert ents["text"] == "But Google is starting from behind "
    assert ents["ents"] == [{
        "start": 4,
        "end": 10,
        "label": "ORG",
        "kb_id": "Q95",
        "kb_url": "#"
    }]
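Note that the dict returned by parse_ents matches displaCy's "manual" input format, so it can also be rendered without a Doc. A minimal sketch, not taken from the example above:

from spacy import displacy

parsed = {
    "text": "But Google is starting from behind ",
    "ents": [{"start": 4, "end": 10, "label": "ORG"}],
}
# manual=True tells displaCy to use the pre-parsed dict instead of a Doc
html = displacy.render(parsed, style="ent", manual=True)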
Example #5
def test_add_overlapping_entities(en_vocab):
    text = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=text)
    entity = Span(doc, 0, 4, label=391)
    doc.ents = [entity]

    new_entity = Span(doc, 0, 1, label=392)
    with pytest.raises(ValueError):
        doc.ents = list(doc.ents) + [new_entity]
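doc.ents cannot hold overlapping spans, which is why the assignment above raises ValueError. A minimal sketch, assuming the spaCy v3 API, where overlapping annotations can instead be kept in a span group:

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp.make_doc("Louisiana Office of Conservation")
# doc.spans allows overlapping spans, unlike doc.ents
doc.spans["orgs"] = [Span(doc, 0, 4, label="ORG"), Span(doc, 0, 1, label="GPE")]
print([(s.text, s.label_) for s in doc.spans["orgs"]])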
Example #6
def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
Example #7
def test_doc_add_entities_set_ents_iob(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert len(list(doc.ents)) == 0
    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]
Example #8
def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)

    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]

    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
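The _ner_example helper is not shown in the snippet above. A plausible minimal stand-in (hypothetical, assuming the spaCy v3 training API) only needs to return one Example with gold entities so that initialize can infer the labels:

from spacy.tokens import Doc
from spacy.training import Example

def _ner_example(ner):
    # Illustrative only: the words and character offsets are made up, not from the source
    doc = Doc(ner.vocab, words=["Joe", "loves", "London"])
    return Example.from_dict(doc, {"entities": [(0, 3, "PERSON"), (10, 16, "GPE")]})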
Example #9
def test_displacy_spans(en_vocab):
    """Test that displaCy can render Spans."""
    doc = Doc(en_vocab,
              words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    html = displacy.render(doc[1:4], style="ent")
    assert html.startswith("<div")
Example #10
def spacy_doc_from_sentences(sentences: List[List[str]], labels: List[str],
                             nlp: Language) -> Doc:
    # Create initial doc
    all_tokens = list(chain.from_iterable(sentences))
    # Mark that every token is followed by space
    spaces = [True] * len(all_tokens)
    doc = Doc(nlp.vocab, words=all_tokens, spaces=spaces)

    # Set sentence boundaries
    tok_idx = 0
    for sentence in sentences:
        for sentence_idx in range(len(sentence)):
            # The first token of each sentence gets is_sent_start=True, all others False
            doc[tok_idx].is_sent_start = sentence_idx == 0
            tok_idx += 1

    if labels:
        if len(labels) != len(all_tokens):
            raise ValueError(
                f"Number of labels ({len(labels)}) does not match number of tokens ({len(all_tokens)})"
            )

        # Create entities after converting IOB (actually BIO) to BILUO
        doc.ents = spans_from_biluo_tags(doc, iob_to_biluo(labels))

    return doc
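spans_from_biluo_tags and iob_to_biluo come from spacy.gold in spaCy v2; in v3 the equivalents live in spacy.training (biluo_tags_to_spans). A minimal sketch of the same conversion, assuming the v3 names:

import spacy
from spacy.tokens import Doc
from spacy.training import biluo_tags_to_spans, iob_to_biluo

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["I", "visited", "New", "York"])
tags = ["O", "O", "B-GPE", "I-GPE"]  # IOB (BIO) tags
doc.ents = biluo_tags_to_spans(doc, iob_to_biluo(tags))
print([(ent.text, ent.label_) for ent in doc.ents])  # [('New York', 'GPE')]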
Example #11
    def __call__(self, doc: Doc) -> Doc:
        normalizers: List[Callable[[Token], str]] = [lambda x: x.text]
        if self.lower:
            normalizers.append(lambda x: x.text.lower())
        if self.lemma:
            normalizers.append(lambda x: x.lemma_)
        if self.normalizer is not None:
            normalizers.append(self.normalizer)

        spans: Iterable[Tuple[int, int]] = []
        for normalizer in normalizers:
            spans = itertools.chain(
                spans,
                self._search_by_normalizer(doc, normalizer,
                                           ignore_space=False))
            if self.ignore_space:
                spans = itertools.chain(
                    spans,
                    self._search_by_normalizer(doc,
                                               normalizer,
                                               ignore_space=True),
                )

        ents = list(doc.ents)
        for i, j in spans:
            ent = Span(doc, i, j, label=self.label)
            ents.append(ent)
        selected = textspan.remove_span_overlaps_idx([(s.start, s.end)
                                                      for s in ents])
        doc.ents = tuple(ents[i] for i in selected)
        return doc
Example #12
def _mk_spacy_doc(tokens, entities):
    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=tokens, spaces=[True for _ in tokens])
    for ent in entities:
        span = doc.char_span(ent["start"], ent["end"], label=ent["entity"])
        doc.ents = list(doc.ents) + [span]
    return doc
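One caveat with this pattern: Doc.char_span returns None when the character offsets do not line up with token boundaries, so blindly appending can put None into doc.ents. A hedged variant of the helper above that skips misaligned spans:

import spacy
from spacy.tokens import Doc

def _mk_spacy_doc_safe(tokens, entities):
    # Hypothetical variant of the helper above: drop misaligned char spans
    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=tokens, spaces=[True for _ in tokens])
    spans = [doc.char_span(e["start"], e["end"], label=e["entity"]) for e in entities]
    doc.ents = [s for s in spans if s is not None]
    return doc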
Example #13
def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    header = True
    with open(input_path, "r") as in_f, open(output_path, "w") as out_f:
        for line in tqdm(in_f):
            if header:
                header = False
                continue
            sentence, tokens = pd.read_csv(StringIO(line),
                                           header=None,
                                           usecols=[0, 1]).values[0]
            tokens = eval(tokens)
            dict_line = line_to_dict(sentence, tokens)
            eg = dict_line

            if eg["answer"] != "accept":
                continue
            tokens = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(tokens, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            doc.ents = [
                doc.char_span(s["start"], s["end"], label=s["label"])
                for s in eg.get("spans", [])
            ]
            doc_bin.add(doc)
        doc_bin.to_disk(output_path)
        print(f"Processed {len(doc_bin)} documents: {output_path}")
Example #14
 def __call__(self, doc: Doc) -> Doc:
     for sent in doc.sents:
         blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
         mlist = blist.mrph_list()
         tlist = blist.tag_list()
         for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
             sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
         if len(mlist) != len(sent):
             t, m = None, None
             for t, m in zip(sent, mlist):
                 if t.text != m.midasi:
                     break
             raise ValueError(
                 f"""Internal error occured
         Sentence: {sent.text}
         mlist : {[m.midasi for m in mlist]}
         tokens: {[t.text for t in sent]}
         diff  : {m.midasi}, {t.text}
         """
             )
         for m, token in zip(mlist, sent):
             token._.set(KNP_USER_KEYS.morph.element, m)
     doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))  # type: ignore
     doc.noun_chunks_iterator = knp_noun_chunker  # type: ignore
     # TODO: https://github.com/python/mypy/issues/3004
     return doc
Example #15
 def _proc(self, doc: Doc, pattern: Union[Pattern, str], label: str) -> Doc:
     spans = self.get_spans(doc, pattern, label or self._DEFAULT_LABEL)
     doc.ents = filter_spans(tuple(spans) + doc.ents)  # type: ignore
     # TODO: https://github.com/python/mypy/issues/3004
     if self.merge:
         merge_spans(doc, spans)
     return doc
Example #16
def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    pos = pos or [""] * len(words)
    tags = tags or [""] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [""] * len(words)
    for value in deps + tags + pos:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc
Example #17
    def get_doc(self,
                words=[],
                pos=None,
                heads=None,
                deps=None,
                tags=None,
                ents=None):
        """Create Doc object from given vocab, words and annotations."""

        vocab = Vocab()
        pos = pos or [""] * len(words)
        tags = tags or [""] * len(words)
        heads = heads or [0] * len(words)
        deps = deps or [""] * len(words)
        for value in deps + tags + pos:
            vocab.strings.add(value)

        doc = Doc(vocab, words=words)
        attrs = doc.to_array([POS, HEAD, DEP])
        for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
            attrs[i, 0] = doc.vocab.strings[p]
            attrs[i, 1] = head
            attrs[i, 2] = doc.vocab.strings[dep]
        doc.from_array([POS, HEAD, DEP], attrs)
        if ents:
            doc.ents = [
                Span(doc, start, end, label=doc.vocab.strings[label])
                for start, end, label in ents
            ]
        if tags:
            for token in doc:
                token.tag_ = tags[token.i]
        return doc
Example #18
def test_has_annotation(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
    for attr in attrs:
        assert not doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    doc[0].tag_ = "A"
    doc[0].pos_ = "X"
    doc[0].set_morph("Feat=Val")
    doc[0].lemma_ = "a"
    doc[0].dep_ = "dep"
    doc[0].head = doc[1]
    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")

    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)

    doc[1].tag_ = "A"
    doc[1].pos_ = "X"
    doc[1].set_morph("")
    doc[1].lemma_ = "a"
    doc[1].dep_ = "dep"
    doc.ents = [Span(doc, 0, 2, label="HELLO")]

    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)
Example #19
 def __call__(self, doc: Doc) -> Doc:
     entities = []
     for sent in doc.sents:
         labels = self.predict_labels([str(token) for token in list(sent)])
         spans = decode(labels, list(sent), doc)
         entities.extend(spans)
     doc.ents = entities
     return doc
Example #20
def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    assert [ent.text for ent in doc.ents]
Example #22
def get_doc(vocab,
            words=[],
            pos=None,
            heads=None,
            deps=None,
            tags=None,
            ents=None,
            lemmas=None):
    """Create Doc object from given vocab, words and annotations."""
    if deps and not heads:
        heads = [0] * len(deps)
    headings = []
    values = []
    annotations = [pos, heads, deps, lemmas, tags]
    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
    for a, annot in enumerate(annotations):
        if annot is not None:
            if len(annot) != len(words):
                raise ValueError(Errors.E189)
            headings.append(possible_headings[a])
            if annot is not heads:
                values.extend(annot)
    for value in values:
        vocab.strings.add(value)

    doc = Doc(vocab, words=words)

    # if there are any other annotations, set them
    if headings:
        attrs = doc.to_array(headings)

        j = 0
        for annot in annotations:
            if annot:
                if annot is heads:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = heads[i]
                        else:
                            attrs[i, j] = heads[i]
                else:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = doc.vocab.strings[annot[i]]
                        else:
                            attrs[i, j] = doc.vocab.strings[annot[i]]
                j += 1
        doc.from_array(headings, attrs)

    # finally, set the entities
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    return doc
Example #23
    def __call__(self, doc: Doc) -> Doc:
        if self.crfs_tagger is None:
            raise ValueError("Tagger has not been trained")

        entities = []
        tokens = [token for token in doc]
        predicted_bilou_labels = self.predict_labels(tokens)
        entities.extend(self.decode_bilou(predicted_bilou_labels, tokens, doc))
        doc.ents = entities[:]
        return doc
Example #24
def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)

    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]

    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]
Example #25
 def __call__(self, doc: Doc) -> Doc:
     doc_ent = []
     for sentence in doc.sents:
         tokens = list(sentence)
         labels = self.predict_labels([str(token) for token in tokens])
         entities = decode_bilou(labels, tokens, doc)
         # print("tokens:%s\nfeatures:%s\nlabels:%s\nentities:%s\n"%(str(tokens), str(features), str(labels), str(entities)))
         for entity in entities:
             doc_ent.append(entity)
     doc.ents = doc_ent
     return doc
Example #26
def tag_lexical_head(doc: Doc) -> Doc:
    """Tag the lexical head of a set with the entity tag 'LH'."""
    if len(doc) == 0:
        return doc

    # ensure that numbers are also regarded as nouns if being stand-alone
    if doc[0].tag_ == 'CD' and (len(doc) < 2
                                or not doc[1].tag_.startswith('NN')):
        doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[LEXICAL_HEAD])]
        return doc

    chunk_words = {w for chunk in doc.noun_chunks for w in chunk}
    lexhead_start = None
    for chunk in doc.noun_chunks:
        # find the lexical head by looking for plural nouns (and ignore things like parentheses, conjunctions, ..)
        elem = chunk.root
        if elem.i == 0 and elem.tag_ == 'NNP' and words_util.is_english_plural_word(
                elem.text):
            # fix plural nouns that are parsed incorrectly as proper nouns due to capitalization in the beginning
            elem.tag = doc.vocab.strings['NNS']
        if elem.tag_ not in ['NN', 'NNS']:
            break
        if len(doc) > elem.i + 1:
            if doc[elem.i + 1].text[0] in ["'", "´", "`"]:
                continue
            if doc[elem.i + 1].text in ['(', ')', '–'] and doc[-1].text != ')':
                continue
            if doc[elem.i + 1].tag_ in [
                    'NN', 'NNS'
            ] or (len(doc) > elem.i + 2
                  and doc[elem.i + 1].text in ['and', 'or', ',']
                  and doc[elem.i + 2] in chunk_words):
                lexhead_start = lexhead_start if lexhead_start is not None else chunk.start
                continue
        lexhead_start = lexhead_start if lexhead_start is not None else chunk.start
        doc.ents = [
            Span(doc, i, i + 1, label=doc.vocab.strings[LEXICAL_HEAD])
            for i in range(lexhead_start, chunk.end)
        ]
        break
    return doc
Example #27
    def __call__(self, doc: Doc) -> Doc:
        """Find matches in document and add them as entities.

        Args:
            doc: The Doc object in the pipeline.

        Returns:
            The Doc with added entities, if available.

        Example:
            >>> import spacy
            >>> from spaczz.pipeline import SpaczzRuler
            >>> nlp = spacy.blank("en")
            >>> ruler = SpaczzRuler(nlp)
            >>> doc = nlp.make_doc("My name is Anderson, Grunt")
            >>> ruler.add_patterns([{"label": "NAME", "pattern": "Grant Andersen",
                "type": "fuzzy", "kwargs": {"fuzzy_func": "token_sort"}}])
            >>> doc = ruler(doc)
            >>> "Anderson, Grunt" in [ent.text for ent in doc.ents]
            True
        """
        matches = list(self.fuzzy_matcher(doc) + self.regex_matcher(doc))
        unique_matches = set([(m_id, start, end)
                              for m_id, start, end in matches if start != end])
        sorted_matches = sorted(unique_matches,
                                key=lambda m: (m[2] - m[1], m[1]),
                                reverse=True)
        entities = list(doc.ents)
        new_entities = []
        seen_tokens: Set[int] = set()
        for match_id, start, end in sorted_matches:
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                if match_id in self._ent_ids:
                    label, ent_id = self._ent_ids[match_id]
                    span = Span(doc, start, end, label=label)
                    span._.set("spaczz_ent", True)
                    if ent_id:
                        for token in span:
                            token.ent_id_ = ent_id
                else:
                    span = Span(doc, start, end, label=match_id)
                    span._.set("spaczz_ent", True)
                new_entities.append(span)
                entities = [
                    e for e in entities
                    if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities
        return doc
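The ruler above resolves overlaps by hand (sort matches by length, track seen tokens); spacy.util.filter_spans does the same longest-span-wins filtering in one call. A minimal sketch, not from the source:

import spacy
from spacy.tokens import Span
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp.make_doc("New York City is in New York State")
candidates = [Span(doc, 0, 3, label="GPE"), Span(doc, 0, 2, label="GPE")]
# filter_spans keeps the longest non-overlapping spans, so only "New York City" survives
doc.ents = filter_spans(candidates)
print([ent.text for ent in doc.ents])  # ['New York City']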
Example #28
def replace_ner_spans(doc: Doc, source: str):
    """Given a Spacy Doc object and the name of an annotation source, replaces
    the current named entities by the ones specified in the source"""

    # We create Spacy spans based on the annotation layer
    spans = []
    if source in doc.spans:
        for span in doc.spans[source]:
            spans.append(span)
    doc.ents = tuple(spans)

    return doc
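Since the old entities are discarded here, it can be useful to park them somewhere first. A hedged sketch, assuming spaCy v3 span groups, that keeps a copy of the current entities before replacing them:

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp.make_doc("I visited New York")
doc.ents = [Span(doc, 2, 4, label="GPE")]
# Keep a copy of the current entities in a span group before overwriting doc.ents
doc.spans["previous_ents"] = list(doc.ents)
doc.ents = []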
Example #29
def test_doc_ents_setter():
    """Test that both strings and integers can be used to set entities in
    tuple format via doc.ents."""
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)]
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
    vocab = Vocab()
    ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
    doc = Doc(vocab, words=words, ents=ents)
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
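For finer control than the plain setter, spaCy v3 also provides Doc.set_ents, which lets untouched tokens be marked as missing rather than "O". A minimal sketch, assuming the v3 API:

import spacy
from spacy.tokens import Doc, Span

nlp = spacy.blank("en")
doc = Doc(nlp.vocab, words=["a", "b", "c", "d", "e"])
# default="missing" leaves the remaining tokens without any IOB annotation
doc.set_ents([Span(doc, 0, 2, label="HELLO")], default="missing")
print([t.ent_iob_ for t in doc])  # ['B', 'I', '', '', '']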
Example #30
 def __call__(self, doc: Doc) -> Doc:
     if not self.tagger:
         raise ValueError('train() method should be called first!')
     entities = list()
     #print(doc.ents)
     for sent in doc.sents:
         tokens = list(sent)
         tags = self.predict_labels(tokens)
         entities.append(decode_bilou(tags, tokens, doc))
     doc.ents = [item for sublist in entities for item in sublist]
     #print(doc.ents)
     return doc
Example #31
def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    orig_iobs = [t.ent_iob_ for t in doc]
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == orig_iobs
Example #32
def test_doc_retokenize_spans_entity_split_iob():
    # Test entity IOB stays consistent after splitting
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [(doc[0], 1), (doc[0], 2), doc[1]])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "I"
Example #33
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.is_nered
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.is_nered
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.is_nered
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_nered
Example #34
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.has_annotation("ENT_IOB")
Example #35
def read_files(file: Path, nlp: "Language") -> Iterable[Example]:
    """Custom reader that keeps the tokenization of the gold data,
    and also adds the gold GGP annotations as we do not attempt to predict these."""
    doc_bin = DocBin().from_disk(file)
    docs = doc_bin.get_docs(nlp.vocab)
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        yield Example(pred, gold)
Example #36
def test_serialize_after_adding_entity():
    # Re issue #514
    vocab = spacy.en.English.Defaults.create_vocab()
    entity_recognizer = spacy.en.English.Defaults.create_entity()

    doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())
    entity_recognizer.add_label('Food')
    entity_recognizer(doc)

    label_id = vocab.strings[u'Food']
    doc.ents = [(label_id, 5,6)]

    assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]

    byte_string = doc.to_bytes()