def test_exclude_types(self, spacy_doc):
    """Check that ``extract.entities`` never yields a span with an excluded label.

    Exercised with a single label string, with collections of labels
    (set, tuple, list), and with the special "NUMERIC" value.
    """
    # A single label passed as a plain string.
    for label in ["PERSON", "GPE"]:
        spans = extract.entities(spacy_doc, exclude_types=label)
        for span in spans:
            assert span.label_ != label

    # Several labels passed as a set, a tuple, or a list.
    for labels in [{"PERSON", "GPE"}, ("DATE", "ORG"), ["LOC"]]:
        spans = extract.entities(spacy_doc, exclude_types=labels)
        for span in spans:
            assert span.label_ not in labels

    # special numeric cases!
    # The comparison uses the value returned by the private parser rather
    # than the raw argument (presumably "NUMERIC" expands to concrete
    # labels; confirm against ``_parse_ent_types``).
    for labels in ["NUMERIC", ("NUMERIC",), {"PERSON", "NUMERIC"}]:
        parsed_labels = extract._parse_ent_types(labels, "exclude")
        spans = extract.entities(spacy_doc, exclude_types=labels)
        for span in spans:
            assert span.label_ not in parsed_labels
def test_include_types(self, spacy_doc):
    """Check that ``extract.entities`` only yields spans with an included label.

    Exercised with a single label string, with collections of labels
    (set, tuple, list), and with the special "NUMERIC" value.
    """
    # A single label passed as a plain string.
    for label in ["PERSON", "GPE"]:
        spans = extract.entities(spacy_doc, include_types=label)
        for span in spans:
            assert span.label_ == label

    # Several labels passed as a set, a tuple, or a list.
    for labels in [{"PERSON", "GPE"}, ("DATE", "ORG"), ["LOC"]]:
        spans = extract.entities(spacy_doc, include_types=labels)
        for span in spans:
            assert span.label_ in labels

    # special numeric cases!
    # The comparison uses the value returned by the private parser rather
    # than the raw argument (presumably "NUMERIC" expands to concrete
    # labels; confirm against ``_parse_ent_types``).
    for labels in ["NUMERIC", ("NUMERIC",), {"PERSON", "NUMERIC"}]:
        parsed_labels = basics._parse_ent_types(labels, "include")
        spans = extract.entities(spacy_doc, include_types=labels)
        for span in spans:
            assert span.label_ in parsed_labels
def test_extract_functionality(doc):
    """Smoke-test several extractors and textrank on ``doc``.

    For each extractor the full result is materialised and only the
    first ten items are type-checked.
    """
    limit = 10

    # n-grams: every result is a Span of exactly n tokens.
    all_bigrams = list(
        extract.ngrams(
            doc, 2, filter_stops=True, filter_punct=True, filter_nums=False
        )
    )
    for gram in all_bigrams[:limit]:
        assert isinstance(gram, Span)
        assert len(gram) == 2

    all_trigrams = list(
        extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2)
    )
    for gram in all_trigrams[:limit]:
        assert isinstance(gram, Span)
        assert len(gram) == 3

    # Named entities with "numeric" types excluded: labelled, never QUANTITY.
    all_entities = list(
        extract.entities(doc, drop_determiners=False, exclude_types="numeric")
    )
    for entity in all_entities[:limit]:
        assert isinstance(entity, Span)
        assert entity.label_
        assert entity.label_ != "QUANTITY"

    # Part-of-speech regex matches using the English "NP" pattern.
    np_pattern = constants.POS_REGEX_PATTERNS["en"]["NP"]
    all_pos_matches = list(extract.pos_regex_matches(doc, np_pattern))
    for pos_match in all_pos_matches[:limit]:
        assert isinstance(pos_match, Span)

    # Semistructured statements: 3-item lists whose first item is text.
    all_statements = list(extract.semistructured_statements(doc, "I", cue="be"))
    for statement in all_statements[:limit]:
        assert isinstance(statement, list)
        assert isinstance(statement[0], compat.unicode_)
        assert len(statement) == 3

    # Key terms: tuples of text and a strictly positive float.
    ranked_terms = keyterms.textrank(doc, n_keyterms=10)
    for ranked in ranked_terms:
        assert isinstance(ranked, tuple)
        assert isinstance(ranked[0], compat.unicode_)
        assert isinstance(ranked[1], float)
        assert ranked[1] > 0.0
def test_extract_functionality(doc):
    """Smoke-test several extractors and textrank on ``doc``.

    For each extractor the full result is materialised and only the
    first ten items are type-checked.
    """
    # n-grams: every result is a Span of exactly n tokens.
    bigrams = list(
        extract.ngrams(
            doc, 2, filter_stops=True, filter_punct=True, filter_nums=False
        )
    )[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    trigrams = list(
        extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, min_freq=2)
    )[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    # Named entities with "numeric" types excluded: labelled, never QUANTITY.
    nes = list(
        extract.entities(doc, drop_determiners=False, exclude_types="numeric")
    )[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    # Raw string: "\." is not a valid escape in a normal string literal and
    # triggers a deprecation/syntax warning; the pattern value is unchanged.
    regex_matches = list(extract.regex_matches(doc, r"Mr\. Speaker"))[:10]
    for match in regex_matches:
        assert isinstance(match, Span)

    # Semistructured statements: 3-item lists whose first item is a str.
    stmts = list(
        extract.semistructured_statements(doc, entity="I", cue="be")
    )[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], str)
        assert len(stmt) == 3

    # Key terms: tuples of a str and a strictly positive float.
    kts = kt.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], str)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
def test_default(self, spacy_doc):
    """With determiners kept, every extracted entity is a labelled ``Span``.

    Also checks that the first token of each span has a truthy ``ent_type``.
    """
    entities = list(extract.entities(spacy_doc, drop_determiners=False))

    # Three separate passes, in the same order as the checks are listed.
    for entity in entities:
        assert isinstance(entity, Span)
    for entity in entities:
        assert entity.label_
    for entity in entities:
        assert entity[0].ent_type
def test_drop_determiners(self, spacy_doc):
    """With ``drop_determiners=True`` no entity starts with a DET token.

    Every returned span must still carry a non-empty label.
    """
    entities = list(extract.entities(spacy_doc, drop_determiners=True))

    assert all(entity[0].pos_ != "DET" for entity in entities)
    for entity in entities:
        assert entity.label_
def test_min_freq(self, spacy_doc):
    """Requiring ``min_freq=2`` is expected to yield no entities for the fixture doc."""
    frequent_entities = list(extract.entities(spacy_doc, min_freq=2))
    assert not frequent_entities