def test_doc_retokenize_spans_entity_merge_iob():
    # Test entity IOB stays consistent after merging
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-abc"), 0, 3),
        (doc.vocab.strings.add("ent-d"), 3, 4),
    ]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "B"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[0:1])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    words = ["a", "b", "c", "d", "e", "f", "g", "h", "i"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [
        (doc.vocab.strings.add("ent-de"), 3, 5),
        (doc.vocab.strings.add("ent-fg"), 5, 7),
    ]
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"
    assert doc[5].ent_iob_ == "B"
    assert doc[6].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[2:4])
        retokenizer.merge(doc[4:6])
        retokenizer.merge(doc[7:9])
    assert len(doc) == 6
    assert doc[3].ent_iob_ == "B"
    assert doc[4].ent_iob_ == "I"

def test_matcher_ent_iob_key(en_vocab):
    """Test that patterns with ENT_IOB work correctly."""
    matcher = Matcher(en_vocab)
    matcher.add("Rule", [[{"ENT_IOB": "I"}]])
    doc1 = Doc(en_vocab, words=["I", "visited", "New", "York", "and", "California"])
    doc1.ents = [Span(doc1, 2, 4, label="GPE"), Span(doc1, 5, 6, label="GPE")]
    doc2 = Doc(en_vocab, words=["I", "visited", "my", "friend", "Alicia"])
    doc2.ents = [Span(doc2, 4, 5, label="PERSON")]
    matches1 = [doc1[start:end].text for _, start, end in matcher(doc1)]
    matches2 = [doc2[start:end].text for _, start, end in matcher(doc2)]
    assert len(matches1) == 1
    assert matches1[0] == "York"
    assert len(matches2) == 0
    matcher = Matcher(en_vocab)
    # Test iob pattern with operators
    matcher.add("Rule", [[{"ENT_IOB": "I", "OP": "+"}]])
    doc = Doc(
        en_vocab, words=["I", "visited", "my", "friend", "Anna", "Maria", "Esperanza"]
    )
    doc.ents = [Span(doc, 4, 7, label="PERSON")]
    matches = [doc[start:end].text for _, start, end in matcher(doc)]
    assert len(matches) == 3
    assert matches[0] == "Maria"
    assert matches[1] == "Maria Esperanza"
    assert matches[2] == "Esperanza"

def test_displacy_parse_ents(en_vocab):
    """Test that named entities on a Doc are converted into displaCy's format."""
    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    ents = displacy.parse_ents(doc)
    assert isinstance(ents, dict)
    assert ents["text"] == "But Google is starting from behind "
    assert ents["ents"] == [
        {"start": 4, "end": 10, "label": "ORG", "kb_id": "", "kb_url": "#"}
    ]
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"], kb_id="Q95")]
    ents = displacy.parse_ents(doc)
    assert isinstance(ents, dict)
    assert ents["text"] == "But Google is starting from behind "
    assert ents["ents"] == [
        {"start": 4, "end": 10, "label": "ORG", "kb_id": "Q95", "kb_url": "#"}
    ]

def test_add_overlapping_entities(en_vocab):
    text = ["Louisiana", "Office", "of", "Conservation"]
    doc = Doc(en_vocab, words=text)
    entity = Span(doc, 0, 4, label=391)
    doc.ents = [entity]
    new_entity = Span(doc, 0, 1, label=392)
    with pytest.raises(ValueError):
        doc.ents = list(doc.ents) + [new_entity]

def test_issue2728(en_vocab):
    """Test that displaCy ENT visualizer escapes HTML correctly."""
    doc = Doc(en_vocab, words=["test", "<RELEASE>", "test"])
    doc.ents = [Span(doc, 0, 1, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html
    doc.ents = [Span(doc, 1, 2, label="TEST")]
    html = displacy.render(doc, style="ent")
    assert "&lt;RELEASE&gt;" in html

def test_doc_add_entities_set_ents_iob(en_vocab):
    doc = Doc(en_vocab, words=["This", "is", "a", "lion"])
    ner = EntityRecognizer(en_vocab)
    ner.begin_training([])
    ner(doc)
    assert len(list(doc.ents)) == 0
    assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
    doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["", "", "", "B"]
    doc.ents = [(doc.vocab.strings["WORD"], 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "", ""]

def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]

def test_displacy_spans(en_vocab):
    """Test that displaCy can render Spans."""
    doc = Doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    html = displacy.render(doc[1:4], style="ent")
    assert html.startswith("<div")

def spacy_doc_from_sentences(
    sentences: List[List[str]], labels: List[str], nlp: Language
) -> Doc:
    # Create initial doc
    all_tokens = list(chain.from_iterable(sentences))
    # Mark that every token is followed by a space
    spaces = [True] * len(all_tokens)
    doc = Doc(nlp.vocab, words=all_tokens, spaces=spaces)
    # Set sentence boundaries
    tok_idx = 0
    for sentence in sentences:
        for sentence_idx in range(len(sentence)):
            # First token should have is_sent_start set to True, all others False
            doc[tok_idx].is_sent_start = sentence_idx == 0
            tok_idx += 1
    if labels:
        if len(labels) != len(all_tokens):
            raise ValueError(
                f"Number of labels ({len(labels)}) does not match number of tokens ({len(all_tokens)})"
            )
        # Create entities after converting IOB (actually BIO) to BILUO
        doc.ents = spans_from_biluo_tags(doc, iob_to_biluo(labels))
    return doc

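# A minimal usage sketch for spacy_doc_from_sentences above. The sentences and BIO
# labels are illustrative, and it assumes iob_to_biluo/spans_from_biluo_tags are
# importable in the same module as the helper.
import spacy

nlp = spacy.blank("en")
doc = spacy_doc_from_sentences(
    [["Paris", "is", "nice"], ["So", "is", "Rome"]],
    ["B-LOC", "O", "O", "O", "O", "B-LOC"],
    nlp,
)
assert [(ent.text, ent.label_) for ent in doc.ents] == [("Paris", "LOC"), ("Rome", "LOC")]
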
def __call__(self, doc: Doc) -> Doc:
    normalizers: List[Callable[[Token], str]] = [lambda x: x.text]
    if self.lower:
        normalizers.append(lambda x: x.text.lower())
    if self.lemma:
        normalizers.append(lambda x: x.lemma_)
    if self.normalizer is not None:
        normalizers.append(self.normalizer)
    spans: Iterable[Tuple[int, int]] = []
    for normalizer in normalizers:
        spans = itertools.chain(
            spans, self._search_by_normalizer(doc, normalizer, ignore_space=False)
        )
        if self.ignore_space:
            spans = itertools.chain(
                spans,
                self._search_by_normalizer(doc, normalizer, ignore_space=True),
            )
    ents = list(doc.ents)
    for i, j in spans:
        ent = Span(doc, i, j, label=self.label)
        ents.append(ent)
    selected = textspan.remove_span_overlaps_idx([(s.start, s.end) for s in ents])
    doc.ents = tuple(ents[i] for i in selected)
    return doc

def _mk_spacy_doc(tokens, entities):
    nlp = spacy.blank("en")
    doc = Doc(nlp.vocab, words=tokens, spaces=[True for _ in tokens])
    for ent in entities:
        span = doc.char_span(ent["start"], ent["end"], label=ent["entity"])
        doc.ents = list(doc.ents) + [span]
    return doc

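# Hypothetical usage sketch for _mk_spacy_doc above. The tokens, offsets and label
# are illustrative; character offsets are relative to the whitespace-joined token text.
tokens = ["New", "York", "is", "big"]
entities = [{"start": 0, "end": 8, "entity": "GPE"}]
doc = _mk_spacy_doc(tokens, entities)
assert [(ent.text, ent.label_) for ent in doc.ents] == [("New York", "GPE")]
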
def convert_file(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    header = True
    with open(input_path, "r") as in_f, open(output_path, "w") as out_f:
        for line in tqdm(in_f):
            if header:
                header = False
                continue
            sentence, tokens = pd.read_csv(
                StringIO(line), header=None, usecols=[0, 1]
            ).values[0]
            tokens = eval(tokens)
            dict_line = line_to_dict(sentence, tokens)
            eg = dict_line
            if eg["answer"] != "accept":
                continue
            tokens = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(tokens, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            doc.ents = [
                doc.char_span(s["start"], s["end"], label=s["label"])
                for s in eg.get("spans", [])
            ]
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path}")

def __call__(self, doc: Doc) -> Doc:
    for sent in doc.sents:
        blist = self.knp.parse_juman_result(sent._.get(JUMAN_LINES))
        mlist = blist.mrph_list()
        tlist = blist.tag_list()
        for l, comp in zip([blist, mlist, tlist], ["bunsetsu", "morph", "tag"]):
            sent._.set(getattr(KNP_USER_KEYS, comp).list_, l)
        if len(mlist) != len(sent):
            t, m = None, None
            for t, m in zip(sent, mlist):
                if t.text != m.midasi:
                    break
            raise ValueError(
                f"""Internal error occurred
                Sentence: {sent.text}
                mlist : {[m.midasi for m in mlist]}
                tokens: {[t.text for t in sent]}
                diff : {m.midasi}, {t.text}
                """
            )
        for m, token in zip(mlist, sent):
            token._.set(KNP_USER_KEYS.morph.element, m)
    doc.ents = filter_spans(doc.ents + tuple(_extract_knp_ent(doc)))  # type: ignore
    doc.noun_chunks_iterator = knp_noun_chunker  # type: ignore
    # TODO: https://github.com/python/mypy/issues/3004
    return doc

def _proc(self, doc: Doc, pattern: Union[Pattern, str], label: str) -> Doc:
    spans = self.get_spans(doc, pattern, label or self._DEFAULT_LABEL)
    doc.ents = filter_spans(tuple(spans) + doc.ents)  # type: ignore
    # TODO: https://github.com/python/mypy/issues/3004
    if self.merge:
        merge_spans(doc, spans)
    return doc

def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    pos = pos or [""] * len(words)
    tags = tags or [""] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [""] * len(words)
    for value in deps + tags + pos:
        vocab.strings.add(value)
    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc

def get_doc(self, words=[], pos=None, heads=None, deps=None, tags=None, ents=None):
    """Create Doc object from given vocab, words and annotations."""
    vocab = Vocab()
    pos = pos or [""] * len(words)
    tags = tags or [""] * len(words)
    heads = heads or [0] * len(words)
    deps = deps or [""] * len(words)
    for value in deps + tags + pos:
        vocab.strings.add(value)
    doc = Doc(vocab, words=words)
    attrs = doc.to_array([POS, HEAD, DEP])
    for i, (p, head, dep) in enumerate(zip(pos, heads, deps)):
        attrs[i, 0] = doc.vocab.strings[p]
        attrs[i, 1] = head
        attrs[i, 2] = doc.vocab.strings[dep]
    doc.from_array([POS, HEAD, DEP], attrs)
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    if tags:
        for token in doc:
            token.tag_ = tags[token.i]
    return doc

def test_has_annotation(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
    for attr in attrs:
        assert not doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)
    doc[0].tag_ = "A"
    doc[0].pos_ = "X"
    doc[0].set_morph("Feat=Val")
    doc[0].lemma_ = "a"
    doc[0].dep_ = "dep"
    doc[0].head = doc[1]
    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)
    doc[1].tag_ = "A"
    doc[1].pos_ = "X"
    doc[1].set_morph("")
    doc[1].lemma_ = "a"
    doc[1].dep_ = "dep"
    doc.ents = [Span(doc, 0, 2, label="HELLO")]
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)

def __call__(self, doc: Doc) -> Doc:
    entities = []
    for sent in doc.sents:
        labels = self.predict_labels([str(token) for token in list(sent)])
        spans = decode(labels, list(sent), doc)
        entities.extend(spans)
    doc.ents = entities
    return doc

def test_issue1547():
    """Test that entity labels still match after merging tokens."""
    words = ["\n", "worda", ".", "\n", "wordb", "-", "Biosphere", "2", "-", " \n"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [Span(doc, 6, 8, label=doc.vocab.strings["PRODUCT"])]
    with doc.retokenize() as retokenizer:
        retokenizer.merge(doc[5:7])
    assert [ent.text for ent in doc.ents]

def get_doc(
    vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None, lemmas=None
):
    """Create Doc object from given vocab, words and annotations."""
    if deps and not heads:
        heads = [0] * len(deps)
    headings = []
    values = []
    annotations = [pos, heads, deps, lemmas, tags]
    possible_headings = [POS, HEAD, DEP, LEMMA, TAG]
    for a, annot in enumerate(annotations):
        if annot is not None:
            if len(annot) != len(words):
                raise ValueError(Errors.E189)
            headings.append(possible_headings[a])
            if annot is not heads:
                values.extend(annot)
    for value in values:
        vocab.strings.add(value)
    doc = Doc(vocab, words=words)
    # if there are any other annotations, set them
    if headings:
        attrs = doc.to_array(headings)
        j = 0
        for annot in annotations:
            if annot:
                if annot is heads:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = heads[i]
                        else:
                            attrs[i, j] = heads[i]
                else:
                    for i in range(len(words)):
                        if attrs.ndim == 1:
                            attrs[i] = doc.vocab.strings[annot[i]]
                        else:
                            attrs[i, j] = doc.vocab.strings[annot[i]]
                j += 1
        doc.from_array(headings, attrs)
    # finally, set the entities
    if ents:
        doc.ents = [
            Span(doc, start, end, label=doc.vocab.strings[label])
            for start, end, label in ents
        ]
    return doc

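# A minimal usage sketch for the get_doc test helper above. The words and label are
# illustrative; because the helper looks labels up in the string store, the entity
# label is added to the vocab first.
from spacy.vocab import Vocab

vocab = Vocab()
vocab.strings.add("ORG")
doc = get_doc(vocab, words=["Apple", "is", "a", "company"], ents=[(0, 1, "ORG")])
assert [(ent.text, ent.label_) for ent in doc.ents] == [("Apple", "ORG")]
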
def __call__(self, doc: Doc) -> Doc:
    if self.crfs_tagger is None:
        raise ValueError("Tagger has not been trained")
    entities = []
    tokens = [token for token in doc]
    predicted_bilou_labels = self.predict_labels(tokens)
    entities.extend(self.decode_bilou(predicted_bilou_labels, tokens, doc))
    doc.ents = entities[:]
    return doc

def test_doc_add_entities_set_ents_iob(en_vocab):
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    config = {
        "learn_tokens": False,
        "min_action_freq": 30,
        "update_with_oracle_cut_size": 100,
    }
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model, **config)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    doc.ents = [("ANIMAL", 3, 4)]
    assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
    doc.ents = [("WORD", 0, 2)]
    assert [w.ent_iob_ for w in doc] == ["B", "I", "O", "O"]

def __call__(self, doc: Doc) -> Doc:
    doc_ent = []
    for sentence in doc.sents:
        tokens = list(sentence)
        labels = self.predict_labels([str(token) for token in tokens])
        entities = decode_bilou(labels, tokens, doc)
        # print("tokens:%s\nfeatures:%s\nlabels:%s\nentities:%s\n"%(str(tokens), str(features), str(labels), str(entities)))
        for entity in entities:
            doc_ent.append(entity)
    doc.ents = doc_ent
    return doc

def tag_lexical_head(doc: Doc) -> Doc:
    """Tag the lexical head of a set with the entity tag 'LH'."""
    if len(doc) == 0:
        return doc
    # ensure that numbers are also regarded as nouns if being stand-alone
    if doc[0].tag_ == 'CD' and (len(doc) < 2 or not doc[1].tag_.startswith('NN')):
        doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[LEXICAL_HEAD])]
        return doc
    chunk_words = {w for chunk in doc.noun_chunks for w in chunk}
    lexhead_start = None
    for chunk in doc.noun_chunks:
        # find the lexical head by looking for plural nouns (and ignore things like parentheses, conjunctions, ..)
        elem = chunk.root
        if elem.i == 0 and elem.tag_ == 'NNP' and words_util.is_english_plural_word(elem.text):
            # fix plural nouns that are parsed incorrectly as proper nouns due to capitalization in the beginning
            elem.tag = doc.vocab.strings['NNS']
        if elem.tag_ not in ['NN', 'NNS']:
            break
        if len(doc) > elem.i + 1:
            if doc[elem.i + 1].text[0] in ["'", "´", "`"]:
                continue
            if doc[elem.i + 1].text in ['(', ')', '–'] and doc[-1].text != ')':
                continue
            if doc[elem.i + 1].tag_ in ['NN', 'NNS'] or (
                len(doc) > elem.i + 2
                and doc[elem.i + 1].text in ['and', 'or', ',']
                and doc[elem.i + 2] in chunk_words
            ):
                lexhead_start = lexhead_start if lexhead_start is not None else chunk.start
                continue
        lexhead_start = lexhead_start if lexhead_start is not None else chunk.start
        doc.ents = [
            Span(doc, i, i + 1, label=doc.vocab.strings[LEXICAL_HEAD])
            for i in range(lexhead_start, chunk.end)
        ]
        break
    return doc

def __call__(self, doc: Doc) -> Doc:
    """Find matches in document and add them as entities.

    Args:
        doc: The Doc object in the pipeline.

    Returns:
        The Doc with added entities, if available.

    Example:
        >>> import spacy
        >>> from spaczz.pipeline import SpaczzRuler
        >>> nlp = spacy.blank("en")
        >>> ruler = SpaczzRuler(nlp)
        >>> doc = nlp.make_doc("My name is Anderson, Grunt")
        >>> ruler.add_patterns([{"label": "NAME", "pattern": "Grant Andersen", "type": "fuzzy", "kwargs": {"fuzzy_func": "token_sort"}}])
        >>> doc = ruler(doc)
        >>> "Anderson, Grunt" in [ent.text for ent in doc.ents]
        True
    """
    matches = list(self.fuzzy_matcher(doc) + self.regex_matcher(doc))
    unique_matches = set(
        [(m_id, start, end) for m_id, start, end in matches if start != end]
    )
    sorted_matches = sorted(
        unique_matches, key=lambda m: (m[2] - m[1], m[1]), reverse=True
    )
    entities = list(doc.ents)
    new_entities = []
    seen_tokens: Set[int] = set()
    for match_id, start, end in sorted_matches:
        if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
            continue
        # check for end - 1 here because boundaries are inclusive
        if start not in seen_tokens and end - 1 not in seen_tokens:
            if match_id in self._ent_ids:
                label, ent_id = self._ent_ids[match_id]
                span = Span(doc, start, end, label=label)
                span._.set("spaczz_ent", True)
                if ent_id:
                    for token in span:
                        token.ent_id_ = ent_id
            else:
                span = Span(doc, start, end, label=match_id)
                span._.set("spaczz_ent", True)
            new_entities.append(span)
            entities = [e for e in entities if not (e.start < end and e.end > start)]
            seen_tokens.update(range(start, end))
    doc.ents = entities + new_entities
    return doc

def replace_ner_spans(doc: Doc, source: str):
    """Given a Spacy Doc object and the name of an annotation source, replaces
    the current named entities by the ones specified in the source"""
    # We create Spacy spans based on the annotation layer
    spans = []
    if source in doc.spans:
        for span in doc.spans[source]:
            spans.append(span)
    doc.ents = tuple(spans)
    return doc

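# Hypothetical usage sketch for replace_ner_spans above, assuming a spaCy v3 Doc whose
# doc.spans holds an annotation layer; the layer name "gazetteer" and the text are illustrative.
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Barack Obama visited Paris")
doc.spans["gazetteer"] = [Span(doc, 0, 2, label="PERSON")]
doc = replace_ner_spans(doc, "gazetteer")
assert [(ent.text, ent.label_) for ent in doc.ents] == [("Barack Obama", "PERSON")]
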
def test_doc_ents_setter():
    """Test that both strings and integers can be used to set entities in
    tuple format via doc.ents."""
    words = ["a", "b", "c", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [("HELLO", 0, 2), (doc.vocab.strings.add("WORLD"), 3, 5)]
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]
    vocab = Vocab()
    ents = [("HELLO", 0, 2), (vocab.strings.add("WORLD"), 3, 5)]
    ents = ["B-HELLO", "I-HELLO", "O", "B-WORLD", "I-WORLD"]
    doc = Doc(vocab, words=words, ents=ents)
    assert [e.label_ for e in doc.ents] == ["HELLO", "WORLD"]

def __call__(self, doc: Doc) -> Doc:
    if not self.tagger:
        raise ValueError('train() method should be called first!')
    entities = list()
    # print(doc.ents)
    for sent in doc.sents:
        tokens = list(sent)
        tags = self.predict_labels(tokens)
        entities.append(decode_bilou(tags, tokens, doc))
    doc.ents = [item for sublist in entities for item in sublist]
    # print(doc.ents)
    return doc

def test_ents_reset(en_vocab):
    """Ensure that resetting doc.ents does not change anything"""
    text = ["This", "is", "a", "lion"]
    doc = Doc(en_vocab, words=text)
    cfg = {"model": DEFAULT_NER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    ner = EntityRecognizer(en_vocab, model)
    ner.initialize(lambda: [_ner_example(ner)])
    ner(doc)
    orig_iobs = [t.ent_iob_ for t in doc]
    doc.ents = list(doc.ents)
    assert [t.ent_iob_ for t in doc] == orig_iobs

def test_doc_retokenize_spans_entity_split_iob():
    # Test entity IOB stays consistent after splitting
    words = ["abc", "d", "e"]
    doc = Doc(Vocab(), words=words)
    doc.ents = [(doc.vocab.strings.add("ent-abcd"), 0, 2)]
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    with doc.retokenize() as retokenizer:
        retokenizer.split(doc[0], ["a", "b", "c"], [(doc[0], 1), (doc[0], 2), doc[1]])
    assert doc[0].ent_iob_ == "B"
    assert doc[1].ent_iob_ == "I"
    assert doc[2].ent_iob_ == "I"
    assert doc[3].ent_iob_ == "I"

def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.is_nered
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.is_nered
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.is_nered
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.is_nered

def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.has_annotation("ENT_IOB")

def read_files(file: Path, nlp: "Language") -> Iterable[Example]:
    """Custom reader that keeps the tokenization of the gold data, and also adds
    the gold GGP annotations as we do not attempt to predict these."""
    doc_bin = DocBin().from_disk(file)
    docs = doc_bin.get_docs(nlp.vocab)
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        yield Example(pred, gold)

def test_serialize_after_adding_entity():
    # Re issue #514
    vocab = spacy.en.English.Defaults.create_vocab()
    entity_recognizer = spacy.en.English.Defaults.create_entity()
    doc = Doc(vocab, words=u'This is a sentence about pasta .'.split())
    entity_recognizer.add_label('Food')
    entity_recognizer(doc)
    label_id = vocab.strings[u'Food']
    doc.ents = [(label_id, 5, 6)]
    assert [(ent.label_, ent.text) for ent in doc.ents] == [(u'Food', u'pasta')]
    byte_string = doc.to_bytes()
