Example #1
0
 def test_exclude_types(self, spacy_doc):
     """Entities returned with ``exclude_types`` never carry an excluded label.

     Exercises three spellings of the argument: a single label string, a
     collection of labels (set, tuple, list), and specs that involve the
     special "NUMERIC" value.
     """
     # One label, passed as a bare string.
     for label in ("PERSON", "GPE"):
         entities = extract.entities(spacy_doc, exclude_types=label)
         assert not any(ent.label_ == label for ent in entities)

     # Several labels, passed as a set / tuple / list.
     for labels in ({"PERSON", "GPE"}, ("DATE", "ORG"), ["LOC"]):
         entities = extract.entities(spacy_doc, exclude_types=labels)
         assert not any(ent.label_ in labels for ent in entities)

     # Special numeric cases: compare against the parsed form of the spec
     # instead of the raw value (presumably "NUMERIC" stands for a group of
     # labels -- confirm against extract._parse_ent_types).
     for spec in ("NUMERIC", ("NUMERIC",), {"PERSON", "NUMERIC"}):
         excluded = extract._parse_ent_types(spec, "exclude")
         entities = extract.entities(spacy_doc, exclude_types=spec)
         assert not any(ent.label_ in excluded for ent in entities)
Example #2
0
 def test_include_types(self, spacy_doc):
     """Entities returned with ``include_types`` only carry included labels.

     Exercises three spellings of the argument: a single label string, a
     collection of labels (set, tuple, list), and specs that involve the
     special "NUMERIC" value.
     """
     # One label, passed as a bare string.
     for label in ("PERSON", "GPE"):
         for ent in extract.entities(spacy_doc, include_types=label):
             assert ent.label_ == label

     # Several labels, passed as a set / tuple / list.
     for labels in ({"PERSON", "GPE"}, ("DATE", "ORG"), ["LOC"]):
         for ent in extract.entities(spacy_doc, include_types=labels):
             assert ent.label_ in labels

     # Special numeric cases: compare against the parsed form of the spec
     # instead of the raw value (presumably "NUMERIC" stands for a group of
     # labels -- confirm against basics._parse_ent_types).
     for spec in ("NUMERIC", ("NUMERIC", ), {"PERSON", "NUMERIC"}):
         allowed = basics._parse_ent_types(spec, "include")
         for ent in extract.entities(spacy_doc, include_types=spec):
             assert ent.label_ in allowed
Example #3
0
def test_extract_functionality(doc):
    """Smoke-test the extraction helpers and textrank key terms on ``doc``.

    Each extractor is run to completion, and the first ten results are
    checked for the expected type and basic shape.
    """
    sample_size = 10

    # Bigrams: every sampled item is a two-token Span.
    two_grams = extract.ngrams(
        doc, 2, filter_stops=True, filter_punct=True, filter_nums=False
    )
    for gram in list(two_grams)[:sample_size]:
        assert isinstance(gram, Span)
        assert len(gram) == 2

    # Trigrams occurring at least twice: every sampled item is a
    # three-token Span.
    three_grams = extract.ngrams(
        doc, 3, filter_stops=True, filter_punct=True, min_freq=2
    )
    for gram in list(three_grams)[:sample_size]:
        assert isinstance(gram, Span)
        assert len(gram) == 3

    # Named entities with numeric types excluded: labelled Spans, none of
    # them "QUANTITY".
    named_entities = extract.entities(
        doc, drop_determiners=False, exclude_types="numeric"
    )
    for entity in list(named_entities)[:sample_size]:
        assert isinstance(entity, Span)
        assert entity.label_
        assert entity.label_ != "QUANTITY"

    # Part-of-speech regex matches using the English "NP" pattern.
    np_pattern = constants.POS_REGEX_PATTERNS["en"]["NP"]
    np_matches = extract.pos_regex_matches(doc, np_pattern)
    for np_match in list(np_matches)[:sample_size]:
        assert isinstance(np_match, Span)

    # Semi-structured statements about "I" with cue "be": three-item lists
    # whose first item is text.
    statements = extract.semistructured_statements(doc, "I", cue="be")
    for statement in list(statements)[:sample_size]:
        assert isinstance(statement, list)
        assert isinstance(statement[0], compat.unicode_)
        assert len(statement) == 3

    # Key terms: tuples of (text, positive float score).
    ranked_terms = keyterms.textrank(doc, n_keyterms=10)
    for ranked in ranked_terms:
        assert isinstance(ranked, tuple)
        assert isinstance(ranked[0], compat.unicode_)
        assert isinstance(ranked[1], float)
        assert ranked[1] > 0.0
Example #4
0
def test_extract_functionality(doc):
    """Smoke-test the extraction helpers and textrank key terms on ``doc``.

    Each extractor is run to completion, and the first ten results are
    checked for the expected type and basic shape.
    """
    # Bigrams: every sampled item is a two-token Span.
    bigrams = list(
        extract.ngrams(doc,
                       2,
                       filter_stops=True,
                       filter_punct=True,
                       filter_nums=False))[:10]
    for bigram in bigrams:
        assert isinstance(bigram, Span)
        assert len(bigram) == 2

    # Trigrams occurring at least twice: every sampled item is a
    # three-token Span.
    trigrams = list(
        extract.ngrams(doc,
                       3,
                       filter_stops=True,
                       filter_punct=True,
                       min_freq=2))[:10]
    for trigram in trigrams:
        assert isinstance(trigram, Span)
        assert len(trigram) == 3

    # Named entities with numeric types excluded: labelled Spans, none of
    # them "QUANTITY".
    nes = list(
        extract.entities(doc, drop_determiners=False,
                         exclude_types="numeric"))[:10]
    for ne in nes:
        assert isinstance(ne, Span)
        assert ne.label_
        assert ne.label_ != "QUANTITY"

    # Raw string: "\." is not a valid escape in a normal literal and makes
    # the compiler emit a warning; the pattern text itself is unchanged.
    regex_matches = list(extract.regex_matches(doc, r"Mr\. Speaker"))[:10]
    for match in regex_matches:
        assert isinstance(match, Span)

    # Semi-structured statements about "I" with cue "be": three-item lists
    # whose first item is a string.
    stmts = list(extract.semistructured_statements(doc, entity="I",
                                                   cue="be"))[:10]
    for stmt in stmts:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], str)
        assert len(stmt) == 3

    # Key terms: tuples of (text, positive float score).
    kts = kt.textrank(doc, topn=10)
    for keyterm in kts:
        assert isinstance(keyterm, tuple)
        assert isinstance(keyterm[0], str)
        assert isinstance(keyterm[1], float)
        assert keyterm[1] > 0.0
Example #5
0
 def test_default(self, spacy_doc):
     """With determiners kept, every entity is a labelled Span.

     Also checks that the first token of each entity has a truthy
     ``ent_type``.
     """
     found = list(extract.entities(spacy_doc, drop_determiners=False))
     # Three separate passes: the type check completes for every item before
     # any attribute is read.
     for ent in found:
         assert isinstance(ent, Span)
     for ent in found:
         assert ent.label_
     for ent in found:
         assert ent[0].ent_type
Example #6
0
 def test_drop_determiners(self, spacy_doc):
     """With ``drop_determiners=True`` no entity starts with a "DET" token.

     Every returned entity must still carry a non-empty label.
     """
     found = list(extract.entities(spacy_doc, drop_determiners=True))
     assert all(ent[0].pos_ != "DET" for ent in found)
     for ent in found:
         assert ent.label_
Example #7
0
 def test_min_freq(self, spacy_doc):
     """``min_freq=2`` yields no entities for the fixture document."""
     frequent_entities = list(extract.entities(spacy_doc, min_freq=2))
     assert not frequent_entities