def test_named_entities_exclude_types(spacy_doc):
    """Entities whose label matches ``exclude_types`` must be filtered out."""
    # a single type passed as a bare string
    for exclude_types in ["PERSON", "GPE"]:
        result = extract.named_entities(spacy_doc, exclude_types=exclude_types)
        assert all(span.label_ != exclude_types for span in result)
    # collections of types: set, tuple, and list forms
    for exclude_types in [{"PERSON", "GPE"}, ("DATE", "ORG"), ["LOC"]]:
        result = extract.named_entities(spacy_doc, exclude_types=exclude_types)
        assert all(span.label_ not in exclude_types for span in result)
    # special numeric cases!
    for exclude_types in ["NUMERIC", ("NUMERIC",), {"PERSON", "NUMERIC"}]:
        exclude_types_parsed = extract._parse_ne_types(exclude_types, "exclude")
        result = extract.named_entities(spacy_doc, exclude_types=exclude_types)
        assert all(span.label_ not in exclude_types_parsed for span in result)
def test_named_entities(self):
    """Every extracted entity has a label, and its first token an ent type."""
    ents = list(extract.named_entities(self.spacy_doc, drop_determiners=False))
    self.assertTrue(all(e.label_ for e in ents))
    self.assertTrue(all(e[0].ent_type for e in ents))
def test_named_entities(spacy_doc):
    """Every extracted entity has a label, and its first token an ent type."""
    ents = list(extract.named_entities(spacy_doc, drop_determiners=False))
    assert all(e.label_ for e in ents)
    assert all(e[0].ent_type for e in ents)
def test_named_entities_min_freq(self):
    """With ``min_freq=2`` no entity in the fixture doc qualifies."""
    observed = [
        ent.text
        for ent in extract.named_entities(
            self.spacy_doc, drop_determiners=True, min_freq=2)
    ]
    self.assertEqual(observed, [])
def test_named_entities_good(spacy_doc):
    """Only entities whose label is in ``include_types`` are returned."""
    wanted = {'PERSON', 'GPE'}
    ents = list(
        extract.named_entities(
            spacy_doc, include_types=wanted, drop_determiners=False))
    assert all(ent.label_ in wanted for ent in ents)
def test_named_entities_min_freq(spacy_doc):
    """With ``min_freq=2`` no entity in the fixture doc qualifies."""
    observed = [
        ent.text
        for ent in extract.named_entities(
            spacy_doc, drop_determiners=True, min_freq=2)
    ]
    assert observed == []
def test_named_entities_include_types(spacy_doc):
    """Only entities whose label matches ``include_types`` come back."""
    # a single type passed as a bare string
    for include_types in ["PERSON", "GPE"]:
        ents = extract.named_entities(spacy_doc, include_types=include_types)
        assert all(ent.label_ == include_types for ent in ents)
    # collections of types: set, tuple, and list forms
    for include_types in [{"PERSON", "GPE"}, ("DATE", "ORG"), ["LOC"]]:
        ents = extract.named_entities(spacy_doc, include_types=include_types)
        assert all(ent.label_ in include_types for ent in ents)
    # special numeric cases!
    for include_types in ["NUMERIC", ("NUMERIC", ), {"PERSON", "NUMERIC"}]:
        include_types_parsed = extract._parse_ne_types(include_types, "include")
        ents = extract.named_entities(spacy_doc, include_types=include_types)
        assert all(ent.label_ in include_types_parsed for ent in ents)
def test_named_entities_determiner(self):
    """Determiner-led entities keep their determiner when not dropped."""
    expected = ['the Middle East', 'the United States']
    observed = [
        ent.text
        for ent in extract.named_entities(
            self.spacy_doc, drop_determiners=False)
        if ent[0].pos_ == 'DET'
    ]
    self.assertEqual(observed, expected)
def test_named_entities_good(self):
    """Filtering by ``good_ne_types`` keeps only PERSON/GPE entities."""
    expected = ['Kuwait', 'Donald Trump', 'United States']
    observed = [
        ent.text
        for ent in extract.named_entities(
            self.spacy_doc, good_ne_types={'PERSON', 'GPE'},
            drop_determiners=True)
    ]
    self.assertEqual(observed, expected)
def named_entities(self, **kwargs):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC,
    etc.) from doc, optionally filtering by the entity types and
    frequencies.

    .. seealso:: :func:`extract.named_entities() <textacy.extract.named_entities>`
    for all function kwargs.
    """
    # thin delegation to the module-level extractor
    doc = self.spacy_doc
    return extract.named_entities(doc, **kwargs)
def test_named_entities(self):
    """Extracted entity texts match the known fixture contents exactly."""
    expected = [
        'Two weeks ago', 'Kuwait', 'Arab', '30 minutes', 'Middle East',
        'Egyptian', 'Donald Trump', 'United States'
    ]
    observed = [
        ent.text
        for ent in extract.named_entities(
            self.spacy_doc, drop_determiners=True)
    ]
    self.assertEqual(observed, expected)
def extract_entities(data):
    """Return the first token of each PERSON/ORG/LOC entity found in *data*.

    Falls back to ``["empty"]`` when no entity is found.
    """
    doc = Doc(data, lang="en_core_web_md")
    ents = extract.named_entities(doc, include_types=["PERSON", "ORG", "LOC"])
    # NOTE(review): only the entity's first token is kept, not the whole
    # span text — confirm that is intended rather than str(ent).
    tokens = [str(ent[0]) for ent in ents]
    return tokens if tokens else ["empty"]
def test_extract_functionality(doc):
    """Smoke-test the main ``extract``/``keyterms`` entry points on *doc*."""
    # bigrams: stops/punct filtered, numbers allowed
    for bg in list(extract.ngrams(
            doc, 2, filter_stops=True, filter_punct=True,
            filter_nums=False))[:10]:
        assert isinstance(bg, SpacySpan)
        assert len(bg) == 2
    # trigrams with a frequency floor
    for tg in list(extract.ngrams(
            doc, 3, filter_stops=True, filter_punct=True, min_freq=2))[:10]:
        assert isinstance(tg, SpacySpan)
        assert len(tg) == 3
    # named entities, excluding the numeric entity types
    for ne in list(extract.named_entities(
            doc, drop_determiners=False, exclude_types='numeric'))[:10]:
        assert isinstance(ne, SpacySpan)
        assert ne.label_
        assert ne.label_ != 'QUANTITY'
    # noun-phrase POS-regex matches
    for match in list(extract.pos_regex_matches(
            doc, constants.POS_REGEX_PATTERNS['en']['NP']))[:10]:
        assert isinstance(match, SpacySpan)
    # semi-structured statements of the form "I ... be ..."
    for stmt in list(
            extract.semistructured_statements(doc, 'I', cue='be'))[:10]:
        assert isinstance(stmt, list)
        assert isinstance(stmt[0], compat.unicode_)
        assert len(stmt) == 3
    # TextRank keyterms come back as (term, positive-score) pairs
    for kt in keyterms.textrank(doc, n_keyterms=10):
        assert isinstance(kt, tuple)
        assert isinstance(kt[0], compat.unicode_)
        assert isinstance(kt[1], float)
        assert kt[1] > 0.0
def _text_postprocessing(doc, keywords, extract_ngrams=False):
    """Replace keyword lemmas that are named entities with the entity's
    original (cased) surface text, and strip stray leading spaces.

    Args:
        doc: spaCy/textacy doc the keywords were extracted from.
        keywords (list[str]): keyword lemmas; mutated in place and returned.
        extract_ngrams (bool): when False, keywords beginning with a space
            have their space characters removed.

    Returns:
        list[str]: the (mutated) ``keywords`` list.
    """
    # map each entity lemma to its surface text once, instead of a linear
    # list.index() scan per keyword; setdefault keeps the first occurrence,
    # matching list.index() semantics
    lemma_to_text = {}
    for ent in named_entities(doc):
        lemma_to_text.setdefault(ent.lemma_, ent.text)
    # if a keyword is a named entity, it is replaced by its uppercase form
    for i, word in enumerate(keywords):
        if word in lemma_to_text:
            keywords[i] = lemma_to_text[word]
    # if there is a redundant space character, it will be removed
    if not extract_ngrams:
        for i, word in enumerate(keywords):
            # startswith() is safe on empty strings, unlike word[0]
            if word.startswith(' '):
                keywords[i] = word.replace(' ', '')
    return keywords
def named_entities(self, **kwargs):
    """
    Yield an ordered sequence of named entities (PERSON, ORG, LOC, etc.)
    from doc, optionally filtering by the entity types and frequencies.

    Args:
        **kwargs:
            good_ne_types (set[str] or 'numeric', optional): named entity
                types to include; if "numeric", all numeric entity types
                are included
            bad_ne_types (set[str] or 'numeric', optional): named entity
                types to exclude; if "numeric", all numeric entity types
                are excluded
            min_freq (int, optional): remove named entities that occur in
                `doc` fewer than `min_freq` times
            drop_determiners (bool, optional): remove leading determiners
                (e.g. "the") from named entities (e.g. "the United States"
                => "United States")

    Yields:
        ``spacy.Span``: the next named entity passing all specified
            filters, in order of appearance in the document

    .. seealso:: :func:`extract.named_entities() <textacy.extract.named_entities>`
    """
    # delegate directly instead of an explicit for/yield loop
    yield from extract.named_entities(self.spacy_doc, **kwargs)
def get_named_entities(doc):
    """Return the named entities extracted from *doc*."""
    # accept either a textacy Doc or a raw spaCy Doc
    assert isinstance(doc, (textacy.Doc, spacy.tokens.Doc)), \
        "Only {} are supported".format(possible_docs)
    return extract.named_entities(doc)
def test_named_entities_min_freq(spacy_doc):
    """A frequency floor of 2 filters out every entity in the fixture."""
    ents = list(extract.named_entities(spacy_doc, min_freq=2))
    assert not ents
def test_named_entities_drop_determiners(spacy_doc):
    """Leading determiners are stripped; labels stay intact."""
    for span in extract.named_entities(spacy_doc, drop_determiners=True):
        assert span[0].pos_ != "DET"
        assert span.label_
def test_named_entities(spacy_doc):
    """Each yielded item is a labeled Span whose head token has an ent type."""
    for span in extract.named_entities(spacy_doc, drop_determiners=False):
        assert isinstance(span, SpacySpan)
        assert span.label_
        assert span[0].ent_type
def extract_named_entities(doc, min_freq=1):
    """Thin wrapper: named entities with leading determiners dropped.

    Args:
        doc: the document to extract entities from.
        min_freq (int): minimum number of occurrences required.
    """
    return extract.named_entities(
        doc, drop_determiners=True, min_freq=min_freq)
def test_named_entities_drop_determiners(spacy_doc):
    """No extracted entity starts with a determiner tag after dropping."""
    for ent in extract.named_entities(spacy_doc, drop_determiners=True):
        assert ent[0].tag_ != 'DET'
        assert ent.label_