Esempio n. 1
0
def test_named_entities_exclude_types(spacy_doc):
    """Entities whose type is excluded must never appear in the output."""
    # a single type passed as a bare string
    for excluded in ["PERSON", "GPE"]:
        spans = extract.named_entities(spacy_doc, exclude_types=excluded)
        assert not any(span.label_ == excluded for span in spans)
    # collections of types: set, tuple, and list forms are all accepted
    for excluded in [{"PERSON", "GPE"}, ("DATE", "ORG"), ["LOC"]]:
        spans = extract.named_entities(spacy_doc, exclude_types=excluded)
        assert not any(span.label_ in excluded for span in spans)
    # the special "NUMERIC" alias expands to several concrete entity types,
    # so compare against the parsed set rather than the raw argument
    for excluded in ["NUMERIC", ("NUMERIC",), {"PERSON", "NUMERIC"}]:
        parsed = extract._parse_ne_types(excluded, "exclude")
        spans = extract.named_entities(spacy_doc, exclude_types=excluded)
        assert not any(span.label_ in parsed for span in spans)
Esempio n. 2
0
 def test_named_entities(self):
     """Every extracted entity exposes a label and a token-level entity type."""
     entities = list(
         extract.named_entities(self.spacy_doc, drop_determiners=False))
     self.assertTrue(all(ent.label_ for ent in entities))
     self.assertTrue(all(ent[0].ent_type for ent in entities))
Esempio n. 3
0
def test_named_entities(spacy_doc):
    """Each yielded entity should carry a label and a token entity type."""
    spans = list(extract.named_entities(spacy_doc, drop_determiners=False))
    assert all(span.label_ for span in spans)
    assert all(span[0].ent_type for span in spans)
Esempio n. 4
0
 def test_named_entities_min_freq(self):
     """No entity repeats in the fixture doc, so min_freq=2 filters them all."""
     observed = [
         ent.text
         for ent in extract.named_entities(
             self.spacy_doc, drop_determiners=True, min_freq=2)
     ]
     self.assertEqual(observed, [])
Esempio n. 5
0
def test_named_entities_good(spacy_doc):
    """Only entities whose label is in ``include_types`` should be yielded."""
    wanted = {'PERSON', 'GPE'}
    spans = list(
        extract.named_entities(
            spacy_doc, include_types=wanted, drop_determiners=False))
    assert all(span.label_ in wanted for span in spans)
Esempio n. 6
0
def test_named_entities_min_freq(spacy_doc):
    """With min_freq=2, a doc without repeated entities yields nothing."""
    observed = [
        ent.text
        for ent in extract.named_entities(
            spacy_doc, drop_determiners=True, min_freq=2)
    ]
    assert observed == []
Esempio n. 7
0
def test_named_entities_include_types(spacy_doc):
    """Only entities of the requested type(s) should be yielded."""
    # a single type passed as a bare string
    for wanted in ["PERSON", "GPE"]:
        spans = extract.named_entities(spacy_doc, include_types=wanted)
        assert all(span.label_ == wanted for span in spans)
    # collections of types: set, tuple, and list forms are all accepted
    for wanted in [{"PERSON", "GPE"}, ("DATE", "ORG"), ["LOC"]]:
        spans = extract.named_entities(spacy_doc, include_types=wanted)
        assert all(span.label_ in wanted for span in spans)
    # the special "NUMERIC" alias expands to several concrete entity types,
    # so compare against the parsed set rather than the raw argument
    for wanted in ["NUMERIC", ("NUMERIC",), {"PERSON", "NUMERIC"}]:
        parsed = extract._parse_ne_types(wanted, "include")
        spans = extract.named_entities(spacy_doc, include_types=wanted)
        assert all(span.label_ in parsed for span in spans)
Esempio n. 8
0
 def test_named_entities_determiner(self):
     """Leading determiners are kept when ``drop_determiners=False``."""
     observed = [
         ent.text
         for ent in extract.named_entities(
             self.spacy_doc, drop_determiners=False)
         if ent[0].pos_ == 'DET'
     ]
     self.assertEqual(observed, ['the Middle East', 'the United States'])
Esempio n. 9
0
 def test_named_entities_good(self):
     """``good_ne_types`` restricts output to PERSON and GPE entities."""
     entities = extract.named_entities(
         self.spacy_doc, good_ne_types={'PERSON', 'GPE'},
         drop_determiners=True)
     observed = [ent.text for ent in entities]
     self.assertEqual(observed, ['Kuwait', 'Donald Trump', 'United States'])
Esempio n. 10
0
    def named_entities(self, **kwargs):
        """
        Extract the named entities (PERSON, ORG, LOC, etc.) found in this
        doc, in order of appearance, optionally filtered by entity type and
        frequency.

        .. seealso:: :func:`extract.named_entities() <textacy.extract.named_entities>`
        for all function kwargs.
        """
        # thin delegation — all filtering options are handled downstream
        return extract.named_entities(self.spacy_doc, **kwargs)
Esempio n. 11
0
 def test_named_entities(self):
     """The fixture doc yields exactly these entities, determiners dropped."""
     observed = [
         ent.text
         for ent in extract.named_entities(
             self.spacy_doc, drop_determiners=True)
     ]
     self.assertEqual(observed, [
         'Two weeks ago', 'Kuwait', 'Arab', '30 minutes', 'Middle East',
         'Egyptian', 'Donald Trump', 'United States'
     ])
Esempio n. 12
0
def extract_entities(data):
    """
    Extract PERSON/ORG/LOC named entities from raw text.

    Args:
        data (str): raw text, parsed with the ``en_core_web_md`` model.

    Returns:
        list[str]: one string per matching entity; ``["empty"]`` when no
            entity matches, so callers always receive a non-empty list.
    """
    doc = Doc(data, lang="en_core_web_md")
    matches = extract.named_entities(doc, include_types=["PERSON", "ORG", "LOC"])
    # NOTE(review): only the *first* token of each entity span is kept, so
    # multi-word names are truncated — presumably intentional; confirm.
    tokens = [str(ent[0]) for ent in matches]
    # idiomatic emptiness check instead of len(...) == 0
    return tokens if tokens else ["empty"]
Esempio n. 13
0
def test_extract_functionality(doc):
    """Smoke-test the main extract/keyterms entry points on a shared doc."""
    # bigrams: stopwords and punctuation filtered, numbers allowed
    for bg in list(
            extract.ngrams(doc, 2, filter_stops=True, filter_punct=True,
                           filter_nums=False))[:10]:
        assert isinstance(bg, SpacySpan)
        assert len(bg) == 2

    # trigrams: additionally require a minimum frequency of 2
    for tg in list(
            extract.ngrams(doc, 3, filter_stops=True, filter_punct=True,
                           min_freq=2))[:10]:
        assert isinstance(tg, SpacySpan)
        assert len(tg) == 3

    # named entities: numeric entity types excluded via the alias
    for ent in list(
            extract.named_entities(doc, drop_determiners=False,
                                   exclude_types='numeric'))[:10]:
        assert isinstance(ent, SpacySpan)
        assert ent.label_
        assert ent.label_ != 'QUANTITY'

    # noun-phrase spans matched via the shipped POS regex patterns
    for np_match in list(
            extract.pos_regex_matches(
                doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10]:
        assert isinstance(np_match, SpacySpan)

    # semi-structured statements come back as 3-element lists of strings
    for stmt in list(
            extract.semistructured_statements(doc, 'I', cue='be'))[:10]:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3

    # TextRank keyterms are (term, positive score) tuples
    for kt in keyterms.textrank(doc, n_keyterms=10):
        assert isinstance(kt, tuple)
        assert isinstance(kt[0], compat.unicode_)
        assert isinstance(kt[1], float)
        assert kt[1] > 0.0
Esempio n. 14
0
def _text_postprocessing(doc, keywords, extract_ngrams=False):
    """
    Replace lemmatized keywords that match a named entity in ``doc`` with the
    entity's surface (original-case) text, then strip stray spaces.

    Args:
        doc: parsed document to pull named entities from.
        keywords (list[str]): keyword lemmas; mutated in place.
        extract_ngrams (bool): when False, keywords are treated as single
            terms and any space characters inside them are removed.

    Returns:
        list[str]: the (mutated) ``keywords`` list.
    """
    ents = list(named_entities(doc))
    ent_lemmas = [ent.lemma_ for ent in ents]

    # swap each keyword that equals a named-entity lemma for that entity's
    # surface text (iterate over a copy since keywords is mutated)
    for i, word in enumerate(keywords.copy()):
        try:
            keywords[i] = ents[ent_lemmas.index(word)].text
        except ValueError:
            # keyword is not a named entity; leave it unchanged
            continue

    # single-term keywords: drop redundant space characters
    if not extract_ngrams:
        for i, word in enumerate(keywords):
            # guard against empty strings, which would crash on word[0]
            if word and word[0] == ' ':
                keywords[i] = word.replace(' ', '')

    return keywords
Esempio n. 15
0
    def named_entities(self, **kwargs):
        """
        Yield named entities (PERSON, ORG, LOC, etc.) from the doc, in order
        of appearance, optionally filtered by entity type and frequency.

        Args:
            **kwargs:
                good_ne_types (set[str] or 'numeric', optional): named entity
                    types to include; if "numeric", all numeric entity types
                    are included
                bad_ne_types (set[str] or 'numeric', optional): named entity
                    types to exclude; if "numeric", all numeric entity types
                    are excluded
                min_freq (int, optional): remove named entities that occur in
                    `doc` fewer than `min_freq` times
                drop_determiners (bool, optional): remove leading determiners
                    (e.g. "the") from named entities (e.g. "the United
                    States" => "United States")

        Yields:
            ``spacy.Span``: the next named entity passing all specified
                filters, in order of appearance in the document

        .. seealso:: :func:`extract.named_entities() <textacy.extract.named_entities>`
        """
        yield from extract.named_entities(self.spacy_doc, **kwargs)
Esempio n. 16
0
def get_named_entities(doc):
    """Return the named entities of a textacy or spaCy doc."""
    # single isinstance call with a type tuple instead of two chained checks
    assert isinstance(doc, (textacy.Doc, spacy.tokens.Doc)), \
        "Only {} are supported".format(possible_docs)
    return extract.named_entities(doc)
Esempio n. 17
0
def test_named_entities_min_freq(spacy_doc):
    """A min_freq of 2 filters out every entity in the fixture doc."""
    assert not list(extract.named_entities(spacy_doc, min_freq=2))
Esempio n. 18
0
def test_named_entities_drop_determiners(spacy_doc):
    """drop_determiners=True strips leading determiners from every entity."""
    spans = list(extract.named_entities(spacy_doc, drop_determiners=True))
    assert all(span[0].pos_ != "DET" for span in spans)
    assert all(span.label_ for span in spans)
Esempio n. 19
0
def test_named_entities(spacy_doc):
    """Entities come back as labeled SpacySpan objects."""
    spans = list(extract.named_entities(spacy_doc, drop_determiners=False))
    for span in spans:
        assert isinstance(span, SpacySpan)
        assert span.label_
        assert span[0].ent_type
Esempio n. 20
0
def extract_named_entities(doc, min_freq=1):
    """Extract named entities from ``doc``, dropping leading determiners and
    keeping only entities that occur at least ``min_freq`` times."""
    return extract.named_entities(
        doc, drop_determiners=True, min_freq=min_freq)
Esempio n. 21
0
def test_named_entities_drop_determiners(spacy_doc):
    """drop_determiners=True must strip leading determiners from entities."""
    ents = list(extract.named_entities(spacy_doc, drop_determiners=True))
    # BUG FIX: determiners carry the coarse POS 'DET' on ``pos_``; ``tag_``
    # holds fine-grained tags like 'DT', so ``tag_ == 'DET'`` was vacuously
    # false and the original assertion could never fail.
    assert not any(ent[0].pos_ == 'DET' for ent in ents)
    assert all(ent.label_ for ent in ents)