from typing import List

import numpy
import pytest

from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, SENT_START
from spacy.lang.en import English
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc, Span
from spacy.util import filter_spans
from spacy.vocab import Vocab

# NB: these snippets are excerpted from several codebases; library-specific
# helpers referenced in the excerpted methods below (itertoolz, aug_utils,
# etc.) come from their own modules and are not imported here.


def test_has_annotation(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "world"])
    attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "HEAD", "ENT_IOB", "ENT_TYPE")
    for attr in attrs:
        assert not doc.has_annotation(attr)
    doc[0].tag_ = "A"
    doc[0].pos_ = "X"
    doc[0].set_morph("Feat=Val")
    doc[0].lemma_ = "a"
    doc[0].dep_ = "dep"
    doc[0].head = doc[1]
    doc.set_ents([Span(doc, 0, 1, label="HELLO")], default="missing")
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)
    doc[1].tag_ = "A"
    doc[1].pos_ = "X"
    doc[1].set_morph("")
    doc[1].lemma_ = "a"
    doc[1].dep_ = "dep"
    doc.ents = [Span(doc, 0, 2, label="HELLO")]
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)


def test_doc_from_array_sent_starts(en_vocab):
    # fmt: off
    words = ["I", "live", "in", "New", "York", ".", "I", "like", "cats", "."]
    heads = [0, 0, 0, 0, 0, 0, 6, 6, 6, 6]
    deps = ["ROOT", "dep", "dep", "dep", "dep", "dep", "ROOT", "dep", "dep", "dep"]
    # fmt: on
    doc = Doc(en_vocab, words=words, heads=heads, deps=deps)
    # HEAD overrides SENT_START without warning
    attrs = [SENT_START, HEAD]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    # no warning using default attrs
    attrs = doc._get_array_attrs()
    arr = doc.to_array(attrs)
    with pytest.warns(None) as record:
        new_doc.from_array(attrs, arr)
    assert len(record) == 0
    # only SENT_START uses SENT_START
    attrs = [SENT_START]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert not new_doc.has_annotation("DEP")
    # only HEAD uses HEAD
    attrs = [HEAD, DEP]
    arr = doc.to_array(attrs)
    new_doc = Doc(en_vocab, words=words)
    new_doc.from_array(attrs, arr)
    assert [t.is_sent_start for t in doc] == [t.is_sent_start for t in new_doc]
    assert new_doc.has_annotation("DEP")


def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.has_annotation("ENT_IOB")


def _add_valid_doc(self, doc: Doc) -> None:
    self.docs.append(doc)
    self._doc_ids.append(id(doc))
    self.n_docs += 1
    self.n_tokens += len(doc)
    if doc.has_annotation("SENT_START"):
        self.n_sents += itertoolz.count(doc.sents)


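# Illustrative sketch (not from the original module): the has_annotation
# guard above matters because iterating doc.sents on an unsegmented Doc
# raises a ValueError; itertoolz.count(doc.sents) is then equivalent to
# sum(1 for _ in doc.sents).
def _demo_sent_count_guard():
    doc = Doc(Vocab(), words=["Hello", "world"])
    assert not doc.has_annotation("SENT_START")
    with pytest.raises(ValueError):
        list(doc.sents)
    # Setting any boundary besides the implicit one at token 0 flips the flag.
    doc[1].is_sent_start = False
    assert doc.has_annotation("SENT_START")
    assert sum(1 for _ in doc.sents) == 1

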
# words, sent_starts, sent_ends, and n_sents are supplied via
# @pytest.mark.parametrize in the original test module (cases omitted here).
def test_sentencizer_complex(en_vocab, words, sent_starts, sent_ends, n_sents):
    doc = Doc(en_vocab, words=words)
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
    assert doc.has_annotation("SENT_START")
    assert [t.is_sent_start for t in doc] == sent_starts
    assert [t.is_sent_end for t in doc] == sent_ends
    assert len(list(doc.sents)) == n_sents


def test_sentencizer(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "!", "This", "is", "a", "test", "."])
    sentencizer = Sentencizer(punct_chars=None)
    doc = sentencizer(doc)
    assert doc.has_annotation("SENT_START")
    sent_starts = [t.is_sent_start for t in doc]
    sent_ends = [t.is_sent_end for t in doc]
    assert sent_starts == [True, False, True, False, False, False, False]
    assert sent_ends == [False, True, False, False, False, False, True]
    assert len(list(doc.sents)) == 2


def test_issue1834():
    """Test that sentence boundaries & parse/tag flags are not lost
    during serialization."""
    words = ["This", "is", "a", "first", "sentence", ".", "And", "another", "one"]
    doc = Doc(Vocab(), words=words)
    doc[6].is_sent_start = True
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert not new_doc.has_annotation("DEP")
    assert not new_doc.has_annotation("TAG")
    doc = Doc(
        Vocab(),
        words=words,
        tags=["TAG"] * len(words),
        heads=[0, 0, 0, 0, 0, 0, 6, 6, 6],
        deps=["dep"] * len(words),
    )
    new_doc = Doc(doc.vocab).from_bytes(doc.to_bytes())
    assert new_doc[6].sent_start
    assert new_doc.has_annotation("DEP")
    assert new_doc.has_annotation("TAG")


def test_issue3468():
    """Test that sentence boundaries are set correctly so that
    Doc.has_annotation("SENT_START") can be restored after serialization."""
    nlp = English()
    nlp.add_pipe("sentencizer")
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.has_annotation("SENT_START")
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.has_annotation("SENT_START")
    assert len(list(new_doc.sents)) == 1


def __call__(self, doc: Doc) -> Doc:
    """
    Slightly modified from spacy.pipeline.functions.merge_entities to
    accommodate stopword trimming.
    """
    with doc.retokenize() as retokenizer:
        # Merge discovered entities / noun chunks.
        # Ones found via `PipedPhraseMatcher` have label "CUSTOM".
        ents = [
            ent
            for ent in doc.ents
            if self.filter_entities is None or ent.label_ in self.filter_entities
        ]
        custom = set(tok.i for ent in ents for tok in ent if ent.label_ == "CUSTOM")
        noun_chunks = []
        if doc.has_annotation("DEP"):
            # ensure precedence of CUSTOM phrases
            noun_chunks = [
                noun
                for noun in doc.noun_chunks
                if not any(tok.i in custom for tok in noun)
            ]
        # Eliminate overlapping spans, keeping the longest. NB: given the
        # earlier filtering, CUSTOM phrases should never be subsumed or
        # broken up.
        phrases = filter_spans(
            [
                p
                for p in ents + noun_chunks
                if p.label_ == "CUSTOM" or len(p) <= self.max_phrase_len
            ]
        )
        for phrase in phrases:
            attrs = {
                "tag": phrase.root.tag,
                "dep": phrase.root.dep,
                "ent_type": phrase.label,
            }
            # Need to trim leading/trailing stopwords.
            if phrase.label_ != "CUSTOM" and self.stopwords is not None:
                while phrase and phrase[0].lower_ in self.stopwords:
                    phrase = phrase[1:]
                while phrase and phrase[-1].lower_ in self.stopwords:
                    phrase = phrase[:-1]
                if not phrase:
                    continue
            retokenizer.merge(phrase, attrs=attrs)
    return doc


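# Minimal sketch of the overlap resolution used above: spacy.util.filter_spans
# keeps the longest span from each overlapping group (the first span wins
# ties). The demo function name is made up for illustration.
def _demo_filter_spans():
    doc = Doc(Vocab(), words=["New", "York", "City", "is", "big"])
    spans = [Span(doc, 0, 2, label="X"), Span(doc, 0, 3, label="Y")]
    kept = filter_spans(spans)
    assert [(s.start, s.end, s.label_) for s in kept] == [(0, 3, "Y")]

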
def get_noun_phrases(doc: Doc) -> List[Span]:
    """Compile a list of noun phrases in sense2vec's format (without
    determiners). Separated out to make it easier to customize, e.g. for
    languages that don't implement a noun_chunks iterator out-of-the-box,
    or use different label schemes.

    doc (Doc): The Doc to get noun phrases from.
    RETURNS (list): The noun phrases as a list of Span objects.
    """
    trim_labels = ("advmod", "amod", "compound")
    spans = []
    if doc.has_annotation("DEP"):
        for np in doc.noun_chunks:
            while len(np) > 1 and np[0].dep_ not in trim_labels:
                np = np[1:]
            spans.append(np)
    return spans


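# Usage sketch for get_noun_phrases (demo only, not part of sense2vec).
# doc.noun_chunks needs English's syntax iterator plus DEP/POS annotation,
# so a parse is supplied by hand instead of loading a pretrained model.
def _demo_get_noun_phrases():
    nlp = English()
    doc = Doc(
        nlp.vocab,
        words=["the", "quick", "brown", "fox"],
        heads=[3, 3, 3, 3],
        deps=["det", "amod", "amod", "ROOT"],
        pos=["DET", "ADJ", "ADJ", "NOUN"],
    )
    # The determiner is trimmed off; the amod modifiers are kept.
    assert [np.text for np in get_noun_phrases(doc)] == ["quick brown fox"]

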
def to_tokenized_text(doc: Doc) -> List[List[str]]:
    """
    Transform ``doc`` into an ordered, nested list of token-texts for each sentence.

    Args:
        doc

    Returns:
        A list of tokens' texts for each sentence in ``doc``.

    Note:
        If ``doc`` hasn't been segmented into sentences, the entire document
        is treated as a single sentence.
    """
    if doc.has_annotation("SENT_START"):
        return [[token.text for token in sent] for sent in doc.sents]
    else:
        return [[token.text for token in doc]]


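# Usage sketch for to_tokenized_text (demo only): with the built-in
# sentencizer, Doc.has_annotation("SENT_START") is True and the output is
# split per sentence; a fresh unsegmented Doc falls back to one "sentence".
def _demo_to_tokenized_text():
    nlp = English()
    nlp.add_pipe("sentencizer")
    doc = nlp("Hello world. Another sentence.")
    assert to_tokenized_text(doc) == [
        ["Hello", "world", "."],
        ["Another", "sentence", "."],
    ]
    unsegmented = Doc(nlp.vocab, words=["Hello", "world"])
    assert to_tokenized_text(unsegmented) == [["Hello", "world"]]

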
def test_issue3012(en_vocab):
    """Test that the is_tagged attribute doesn't get overwritten when we
    from_array without tag information."""
    words = ["This", "is", "10", "%", "."]
    tags = ["DT", "VBZ", "CD", "NN", "."]
    pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"]
    ents = ["O", "O", "B-PERCENT", "I-PERCENT", "O"]
    doc = Doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents)
    assert doc.has_annotation("TAG")
    expected = ("10", "NUM", "CD", "PERCENT")
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
    header = [ENT_IOB, ENT_TYPE]
    ent_array = doc.to_array(header)
    doc.from_array(header, ent_array)
    assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected
    # Serializing then deserializing
    doc_bytes = doc.to_bytes()
    doc2 = Doc(en_vocab).from_bytes(doc_bytes)
    assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected


def apply_transforms(self, doc: Doc, lang: types.LangLike, **kwargs) -> Doc:
    """
    Sequentially apply some subset of data augmentation transforms to ``doc``,
    then return a new ``Doc`` created from the augmented text using ``lang``.

    Args:
        doc
        lang
        **kwargs: If, for whatever reason, you have to pass keyword argument
            values into transforms that vary or depend on characteristics of
            ``doc``, specify them here. The transforms' call signatures will
            be inspected, and values will be passed along, as needed.

    Returns:
        :class:`spacy.tokens.Doc`
    """
    if doc.has_annotation("SENT_START"):
        nested_aug_toks = [aug_utils.to_aug_toks(sent) for sent in doc.sents]
    else:
        nested_aug_toks = [aug_utils.to_aug_toks(doc)]
    tfs = self._get_random_transforms()
    new_nested_aug_toks = []
    for aug_toks in nested_aug_toks:
        # this is a bit of a hack, but whatchagonnado
        if kwargs:
            for tf in tfs:
                tf_kwargs = utils.get_kwargs_for_func(tf, kwargs)
                aug_toks = tf(aug_toks, **tf_kwargs)
        else:
            for tf in tfs:
                aug_toks = tf(aug_toks)
        new_nested_aug_toks.append(aug_toks)
    return self._make_new_spacy_doc(new_nested_aug_toks, lang)


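# Sketch of the signature-based kwarg filtering the docstring above describes.
# utils.get_kwargs_for_func is textacy's own helper; this standalone version
# is an assumption about its behavior, not its actual source.
import inspect


def _get_kwargs_for_func(func, kwargs):
    # Keep only the kwargs that appear in the transform's call signature.
    params = inspect.signature(func).parameters
    return {name: value for name, value in kwargs.items() if name in params}

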
def test_has_annotation_sents(en_vocab):
    doc = Doc(en_vocab, words=["Hello", "beautiful", "world"])
    attrs = ("SENT_START", "IS_SENT_START", "IS_SENT_END")
    for attr in attrs:
        assert not doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)
    # The first token (index 0) is always assumed to be a sentence start,
    # and ignored by the check in doc.has_annotation
    doc[1].is_sent_start = False
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert not doc.has_annotation(attr, require_complete=True)
    doc[2].is_sent_start = False
    for attr in attrs:
        assert doc.has_annotation(attr)
        assert doc.has_annotation(attr, require_complete=True)


def test_issue599(en_vocab):
    doc = Doc(en_vocab)
    doc2 = Doc(doc.vocab)
    doc2.from_bytes(doc.to_bytes())
    assert doc2.has_annotation("DEP")


def test_tokenlast_has_sent_end_true():
    doc = Doc(Vocab(), words=["hello", "world"])
    assert doc[0].is_sent_end is None
    assert doc[1].is_sent_end is True
    assert not doc.has_annotation("SENT_START")