def test_frequencies():
    assert (
        frequencies(["cat", "pig", "cat", "eel", "pig", "dog", "dog", "dog"])
        == {"cat": 2, "eel": 1, "pig": 2, "dog": 3}
    )
    assert frequencies([]) == {}
    assert frequencies("onomatopoeia") == {
        "a": 2, "e": 1, "i": 1, "m": 1, "o": 4, "n": 1, "p": 1, "t": 1,
    }

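# For reference: the ``frequencies`` under test counts how many times each element
# appears in an iterable and returns a plain dict (here it comes from toolz/cytoolz
# ``itertoolz``). A minimal stand-in with the same behavior -- an illustrative
# sketch, not the project's actual implementation:
from collections import Counter

def frequencies(seq):
    # Count occurrences of each element in ``seq``; matches the assertions
    # in test_frequencies() above.
    return dict(Counter(seq))
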
def noun_chunks(doc, drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc,
    optionally filtering by frequency and dropping leading determiners.

    Args:
        doc (``spacy.Doc``)
        drop_determiners (bool, optional): remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq (int, optional): remove chunks that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next noun chunk from ``doc`` in order of appearance
        in the document
    """
    ncs = doc.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:] for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(normalized_str(nc) for nc in ncs)
        ncs = (nc for nc in ncs
               if freqs[normalized_str(nc)] >= min_freq)

    for nc in ncs:
        yield nc

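# Hypothetical usage sketch for noun_chunks(), assuming a loaded spaCy pipeline
# (the model name "en_core_web_sm" is illustrative) and that DET, itertoolz, and
# normalized_str are imported as the function body requires:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog. The fox was quick.")
for nc in noun_chunks(doc, drop_determiners=True):
    print(nc.text)  # e.g. "quick brown fox", "lazy dog", "fox"
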
def named_entities(doc, good_ne_types=None, bad_ne_types=None,
                   min_freq=1, drop_determiners=True):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by the entity types and frequencies.

    Args:
        doc (``spacy.Doc``)
        good_ne_types (set[str] or 'numeric', optional): named entity types to
            include; if "numeric", all numeric entity types are included
        bad_ne_types (set[str] or 'numeric', optional): named entity types to
            exclude; if "numeric", all numeric entity types are excluded
        min_freq (int, optional): remove named entities that occur in `doc`
            fewer than `min_freq` times
        drop_determiners (bool, optional): remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States")

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
        filters in order of appearance in the document
    """
    nes = doc.ents
    if good_ne_types:
        if good_ne_types == 'numeric':
            good_ne_types = NUMERIC_NE_TYPES
        nes = (ne for ne in nes if ne.label_ in good_ne_types)
    if bad_ne_types:
        if bad_ne_types == 'numeric':
            bad_ne_types = NUMERIC_NE_TYPES
        nes = (ne for ne in nes if ne.label_ not in bad_ne_types)
    if drop_determiners is True:
        nes = (ne if ne[0].pos != DET else ne[1:] for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.text for ne in nes)
        nes = (ne for ne in nes if freqs[ne.text] >= min_freq)

    for ne in nes:
        yield ne

def words(doc, filter_stops=True, filter_punct=True, filter_nums=False,
          good_pos_tags=None, bad_pos_tags=None, min_freq=1):
    """
    Extract an ordered sequence of words from a spacy-parsed doc, optionally
    filtering words by part-of-speech (etc.) and frequency.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``)
        filter_stops (bool, optional): if True, remove stop words from word list
        filter_punct (bool, optional): if True, remove punctuation from word list
        filter_nums (bool, optional): if True, remove number-like words
            (e.g. 10, 'ten') from word list
        good_pos_tags (set[str], optional): remove words whose part-of-speech tag
            is NOT in the specified tags, using the universal POS tagset
        bad_pos_tags (set[str], optional): remove words whose part-of-speech tag
            IS in the specified tags, using the universal POS tagset
        min_freq (int, optional): remove words that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
        in order of appearance in the document
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if good_pos_tags:
        words_ = (w for w in words_ if w.pos_ in good_pos_tags)
    if bad_pos_tags:
        words_ = (w for w in words_ if w.pos_ not in bad_pos_tags)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(normalized_str(w) for w in words_)
        words_ = (w for w in words_
                  if freqs[normalized_str(w)] >= min_freq)

    for word in words_:
        yield word

def to_bag_of_terms(self, ngrams=(1, 2, 3), named_entities=True,
                    lemmatize=True, lowercase=False,
                    weighting='count', as_strings=False, **kwargs):
    """
    Transform ``Doc`` into a bag-of-terms: the set of unique terms in ``Doc``
    mapped to their frequency of occurrence, where "terms" includes ngrams
    and/or named entities.

    Args:
        ngrams (int or Set[int]): n of which n-grams to include; ``(1, 2, 3)``
            (default) includes unigrams (words), bigrams, and trigrams; `2`
            if only bigrams are wanted; falsy (e.g. False) to not include any
        named_entities (bool): if True (default), include named entities;
            note: if ngrams are also included, any ngrams that exactly overlap
            with an entity are skipped to prevent double-counting
        lemmatize (bool): if True, words are lemmatized before counting; for
            example, 'happy', 'happier', and 'happiest' would be grouped
            together as 'happy', with a count of 3
        lowercase (bool): if True and ``lemmatize`` is False, words are
            lower-cased before counting; for example, 'happy' and 'Happy'
            would be grouped together as 'happy', with a count of 2
        weighting ({'count', 'freq', 'binary'}): Type of weight to assign to
            terms. If 'count' (default), weights are the absolute number of
            occurrences (count) of term in doc. If 'binary', all counts are
            set equal to 1. If 'freq', term counts are normalized by the
            total token count, giving their relative frequency of occurrence.
        as_strings (bool): if True, words are returned as strings; if False
            (default), words are returned as their unique integer ids
        kwargs:
            - filter_stops (bool)
            - filter_punct (bool)
            - filter_nums (bool)
            - include_pos (str or Set[str])
            - exclude_pos (str or Set[str])
            - min_freq (int)
            - include_types (str or Set[str])
            - exclude_types (str or Set[str])
            - drop_determiners (bool)

            See :func:`extract.words() <textacy.extract.words>`,
            :func:`extract.ngrams() <textacy.extract.ngrams>`, and
            :func:`extract.named_entities() <textacy.extract.named_entities>`
            for more information on these parameters.

    Returns:
        dict: mapping of a unique term id or string (depending on the value
        of ``as_strings``) to its absolute, relative, or binary frequency
        of occurrence (depending on the value of ``weighting``).

    See Also:
        :meth:`Doc.to_terms_list() <Doc.to_terms_list>`
    """
    if weighting not in {'count', 'freq', 'binary'}:
        raise ValueError('weighting "{}" is invalid'.format(weighting))
    terms_list = self.to_terms_list(
        ngrams=ngrams, named_entities=named_entities,
        lemmatize=lemmatize, lowercase=lowercase,
        as_strings=as_strings, **kwargs)
    bot = itertoolz.frequencies(terms_list)
    if weighting == 'freq':
        n_tokens = self.n_tokens
        bot = {term: weight / n_tokens for term, weight in bot.items()}
    elif weighting == 'binary':
        bot = {term: 1 for term in bot.keys()}
    return bot

def ngrams(
    doc,
    n,
    filter_stops=True,
    filter_punct=True,
    filter_nums=False,
    include_pos=None,
    exclude_pos=None,
    min_freq=1,
):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc.
        filter_stops (bool): if True, remove ngrams that start or end with a stop word
        filter_punct (bool): if True, remove ngrams that contain any
            punctuation-only tokens
        filter_nums (bool): if True, remove ngrams that contain any numbers
            or number-like tokens (e.g. 10, 'ten')
        include_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE NOT included in this param
        exclude_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE included in this param
        min_freq (int): remove ngrams that occur in ``doc`` fewer than
            ``min_freq`` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified filters,
        in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of
            str, or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for
        details, check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    if n < 1:
        raise ValueError("n must be greater than or equal to 1")

    ngrams_ = (doc[i:i + n] for i in compat.range_(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if include_pos:
        if isinstance(include_pos, compat.unicode_):
            include_pos = include_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ == include_pos for w in ngram))
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ in include_pos for w in ngram))
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, compat.unicode_):
            exclude_pos = exclude_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ != exclude_pos for w in ngram))
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ not in exclude_pos for w in ngram))
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(ngram.lower_ for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[ngram.lower_] >= min_freq)

    for ngram in ngrams_:
        yield ngram

def ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
           include_pos=None, exclude_pos=None, min_freq=1):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc.
        filter_stops (bool): if True, remove ngrams that start or end with a stop word
        filter_punct (bool): if True, remove ngrams that contain any
            punctuation-only tokens
        filter_nums (bool): if True, remove ngrams that contain any numbers
            or number-like tokens (e.g. 10, 'ten')
        include_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE NOT included in this param
        exclude_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE included in this param
        min_freq (int, optional): remove ngrams that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified filters,
        in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
        TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str,
            or a falsy value

    .. note:: Filtering by part-of-speech tag uses the universal POS tag set,
       http://universaldependencies.org/u/pos/
    """
    if n < 1:
        raise ValueError('n must be greater than or equal to 1')

    ngrams_ = (doc[i: i + n] for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if include_pos:
        if isinstance(include_pos, unicode_type):
            include_pos = include_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ == include_pos for w in ngram))
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ in include_pos for w in ngram))
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, unicode_type):
            exclude_pos = exclude_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ != exclude_pos for w in ngram))
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ not in exclude_pos for w in ngram))
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[normalized_str(ngram)] >= min_freq)

    for ngram in ngrams_:
        yield ngram

def named_entities(doc, include_types=None, exclude_types=None,
                   drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States")
        min_freq (int): remove named entities that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
        filters in order of appearance in the document

    Raises:
        TypeError: if `include_types` or `exclude_types` is not a str, a set of
            str, or a falsy value
    """
    if isinstance(doc, textacy.Doc):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    if include_types:
        if isinstance(include_types, unicode_type):
            include_types = include_types.upper()
            if include_types == 'NUMERIC':
                include_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ == include_types)
        if isinstance(include_types, (set, frozenset, list, tuple)):
            include_types = {type_.upper() for type_ in include_types}
            nes = (ne for ne in nes if ne.label_ in include_types)
        elif not isinstance(include_types, unicode_type):
            msg = 'invalid `include_types` type: "{}"'.format(type(include_types))
            raise TypeError(msg)
    if exclude_types:
        if isinstance(exclude_types, unicode_type):
            exclude_types = exclude_types.upper()
            if exclude_types == 'NUMERIC':
                exclude_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ != exclude_types)
        if isinstance(exclude_types, (set, frozenset, list, tuple)):
            exclude_types = {type_.upper() for type_ in exclude_types}
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
        elif not isinstance(exclude_types, unicode_type):
            msg = 'invalid `exclude_types` type: "{}"'.format(type(exclude_types))
            raise TypeError(msg)
    if drop_determiners is True:
        nes = (ne if ne[0].pos != DET else ne[1:] for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.text for ne in nes)
        nes = (ne for ne in nes if freqs[ne.text] >= min_freq)

    for ne in nes:
        yield ne

def words(doc, filter_stops=True, filter_punct=True, filter_nums=False,
          include_pos=None, exclude_pos=None, min_freq=1):
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove number-like words (e.g. 10, 'ten')
            from word list
        include_pos (str or Set[str]): remove words whose part-of-speech tag
            IS NOT included in this param
        exclude_pos (str or Set[str]): remove words whose part-of-speech tag
            IS in the specified tags
        min_freq (int): remove words that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
        in order of appearance in the document

    Raises:
        TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str,
            or a falsy value

    .. note:: Filtering by part-of-speech tag uses the universal POS tag set,
       http://universaldependencies.org/u/pos/
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        if isinstance(include_pos, unicode_type):
            include_pos = include_pos.upper()
            words_ = (w for w in words_ if w.pos_ == include_pos)
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            words_ = (w for w in words_ if w.pos_ in include_pos)
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, unicode_type):
            exclude_pos = exclude_pos.upper()
            words_ = (w for w in words_ if w.pos_ != exclude_pos)
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            words_ = (w for w in words_ if w.pos_ not in exclude_pos)
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(normalized_str(w) for w in words_)
        words_ = (w for w in words_
                  if freqs[normalized_str(w)] >= min_freq)

    for word in words_:
        yield word

def to_bag_of_terms(self, ngrams=(1, 2, 3), named_entities=True,
                    normalize='lemma', weighting='count',
                    as_strings=False, **kwargs):
    """
    Transform :class:`Doc` into a bag-of-terms: the set of unique terms in
    :class:`Doc` mapped to their frequency of occurrence, where "terms"
    includes ngrams and/or named entities.

    Args:
        ngrams (int or Set[int]): n of which n-grams to include; ``(1, 2, 3)``
            (default) includes unigrams (words), bigrams, and trigrams; `2`
            if only bigrams are wanted; falsy (e.g. False) to not include any
        named_entities (bool): if True (default), include named entities;
            note: if ngrams are also included, any ngrams that exactly overlap
            with an entity are skipped to prevent double-counting
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appear
            in doc; if a callable, must accept a ``spacy.Token`` or
            ``spacy.Span`` and return a str,
            e.g. :func:`textacy.spacy_utils.normalized_str()`
        weighting ({'count', 'freq', 'binary'}): Type of weight to assign to
            terms. If 'count' (default), weights are the absolute number of
            occurrences (count) of term in doc. If 'binary', all counts are
            set equal to 1. If 'freq', term counts are normalized by the
            total token count, giving their relative frequency of occurrence.
        as_strings (bool): if True, words are returned as strings; if False
            (default), words are returned as their unique integer ids
        kwargs:
            - filter_stops (bool)
            - filter_punct (bool)
            - filter_nums (bool)
            - include_pos (str or Set[str])
            - exclude_pos (str or Set[str])
            - min_freq (int)
            - include_types (str or Set[str])
            - exclude_types (str or Set[str])
            - drop_determiners (bool)

            See :func:`extract.words() <textacy.extract.words>`,
            :func:`extract.ngrams() <textacy.extract.ngrams>`, and
            :func:`extract.named_entities() <textacy.extract.named_entities>`
            for more information on these parameters.

    Returns:
        dict: mapping of a unique term id or string (depending on the value
        of ``as_strings``) to its absolute, relative, or binary frequency
        of occurrence (depending on the value of ``weighting``).

    See Also:
        :meth:`Doc.to_terms_list() <Doc.to_terms_list>`
    """
    if weighting not in {'count', 'freq', 'binary'}:
        raise ValueError('weighting "{}" is invalid'.format(weighting))
    terms_list = self.to_terms_list(
        ngrams=ngrams, named_entities=named_entities,
        normalize=normalize, as_strings=as_strings, **kwargs)
    bot = itertoolz.frequencies(terms_list)
    if weighting == 'freq':
        n_tokens = self.n_tokens
        bot = {term: weight / n_tokens for term, weight in bot.items()}
    elif weighting == 'binary':
        bot = {term: 1 for term in bot.keys()}
    return bot

def named_entities(doc, include_types=None, exclude_types=None,
                   drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): Remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               do *not* keep their entity type annotations. This is irritating
               but unavoidable, since the only way to re-annotate them is to
               modify ``doc`` directly, and this function is not meant to have
               any side-effects. If you're only using the text of the returned
               spans, this is no big deal; if you're using NE-like attributes
               downstream, however, this is something to watch out for.

        min_freq (int): remove named entities that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
        filters in order of appearance in the document

    Raises:
        TypeError: if `include_types` or `exclude_types` is not a str, a set of
            str, or a falsy value
    """
    if isinstance(doc, textacy.Doc):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    if include_types:
        if isinstance(include_types, unicode_):
            include_types = include_types.upper()
            if include_types == 'NUMERIC':
                include_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ == include_types)
        if isinstance(include_types, (set, frozenset, list, tuple)):
            include_types = {type_.upper() for type_ in include_types}
            nes = (ne for ne in nes if ne.label_ in include_types)
        elif not isinstance(include_types, unicode_):
            msg = 'invalid `include_types` type: "{}"'.format(type(include_types))
            raise TypeError(msg)
    if exclude_types:
        if isinstance(exclude_types, unicode_):
            exclude_types = exclude_types.upper()
            if exclude_types == 'NUMERIC':
                exclude_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ != exclude_types)
        if isinstance(exclude_types, (set, frozenset, list, tuple)):
            exclude_types = {type_.upper() for type_ in exclude_types}
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
        elif not isinstance(exclude_types, unicode_):
            msg = 'invalid `exclude_types` type: "{}"'.format(type(exclude_types))
            raise TypeError(msg)
    if drop_determiners is True:
        nes = (ne if ne[0].pos != DET else ne[1:] for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.lower_ for ne in nes)
        nes = (ne for ne in nes if freqs[ne.lower_] >= min_freq)

    for ne in nes:
        yield ne

def ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
           good_pos_tags=None, bad_pos_tags=None, min_freq=1):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 gives bigrams, 3 gives trigrams, etc.
        filter_stops (bool, optional): if True, remove ngrams that start or end
            with a stop word
        filter_punct (bool, optional): if True, remove ngrams that contain any
            punctuation-only tokens
        filter_nums (bool, optional): if True, remove ngrams that contain any
            numbers or number-like tokens (e.g. 10, 'ten')
        good_pos_tags (set[str], optional): remove ngrams whose constituent
            tokens' part-of-speech tags are NOT all in the specified tags,
            using the universal POS tagset
        bad_pos_tags (set[str], optional): remove ngrams if any of their
            constituent tokens' part-of-speech tags are in the specified tags,
            using the universal POS tagset
        min_freq (int, optional): remove ngrams that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified filters,
        in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
    """
    if n < 1:
        raise ValueError('n must be greater than or equal to 1')

    ngrams_ = (doc[i: i + n] for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if good_pos_tags:
        ngrams_ = (ngram for ngram in ngrams_
                   if all(w.pos_ in good_pos_tags for w in ngram))
    if bad_pos_tags:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.pos_ in bad_pos_tags for w in ngram))
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[normalized_str(ngram)] >= min_freq)

    for ngram in ngrams_:
        yield ngram

def yake(
    doc: Doc,
    *,
    normalize: Optional[str] = "lemma",
    ngrams: Union[int, Collection[int]] = (1, 2, 3),
    include_pos: Optional[Union[str, Collection[str]]] = ("NOUN", "PROPN", "ADJ"),
    window_size: int = 2,
    topn: Union[int, float] = 10,
) -> List[Tuple[str, float]]:
    """
    Extract key terms from a document using the YAKE algorithm.

    Args:
        doc: spaCy ``Doc`` from which to extract keyterms. Must be
            sentence-segmented; optionally POS-tagged.
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms;
            if None, use the form of terms as they appeared in ``doc``.

            .. note:: Unlike the other keyterm extraction functions, this one
               doesn't accept a callable for ``normalize``.

        ngrams: n of which n-grams to consider as keyterm candidates.
            For example, ``(1, 2, 3)`` includes all unigrams, bigrams, and
            trigrams, while ``2`` includes bigrams only.
        include_pos: One or more POS tags with which to filter for good candidate
            keyterms. If None, include tokens of all POS tags (which also allows
            keyterm extraction from docs without POS-tagging.)
        window_size: Number of words to the right and left of a given word to use
            as context when computing the "relatedness to context" component of
            its score. Note that the resulting sliding window's full width is
            ``1 + (2 * window_size)``.
        topn: Number of top-ranked terms to return as key terms.
            If an integer, represents the absolute number; if a float, value
            must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        Sorted list of top ``topn`` key terms and their corresponding YAKE scores.

    References:
        Campos, Mangaravite, Pasquali, Jorge, Nunes, and Jatowt. (2018).
        A Text Feature Based Automatic Keyword Extraction Method for Single
        Documents. Advances in Information Retrieval. ECIR 2018.
        Lecture Notes in Computer Science, vol 10772, pp. 684-691.
    """
    # validate / transform args
    ngrams = cast(Tuple[int, ...], utils.to_collection(ngrams, int, tuple))
    include_pos = cast(Set[str], utils.to_collection(include_pos, str, set))
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(
                f"topn = {topn} is invalid; "
                "must be an int, or a float between 0.0 and 1.0"
            )

    # bail out on empty docs
    if not doc:
        return []

    stop_words: Set[str] = set()
    seen_candidates: Set[str] = set()
    # compute key values on a per-word basis
    word_occ_vals = _get_per_word_occurrence_values(
        doc, normalize, stop_words, window_size)
    # doc doesn't have any words...
    if not word_occ_vals:
        return []

    word_freqs = {w_id: len(vals["is_uc"]) for w_id, vals in word_occ_vals.items()}
    word_scores = _compute_word_scores(doc, word_occ_vals, word_freqs, stop_words)
    # compute scores for candidate terms based on scores of constituent words
    term_scores: Dict[str, float] = {}
    # do single-word candidates separately; it's faster and simpler
    if 1 in ngrams:
        candidates = _get_unigram_candidates(doc, include_pos)
        _score_unigram_candidates(
            candidates,
            word_freqs, word_scores, term_scores,
            stop_words, seen_candidates, normalize,
        )
    # now compute combined scores for higher-n ngram candidates
    candidates = list(
        ke_utils.get_ngram_candidates(
            doc, [n for n in ngrams if n > 1], include_pos=include_pos,
        )
    )
    attr_name = _get_attr_name(normalize, True)
    ngram_freqs = itertoolz.frequencies(
        " ".join(getattr(word, attr_name) for word in ngram)
        for ngram in candidates)
    _score_ngram_candidates(
        candidates, ngram_freqs, word_scores, term_scores,
        seen_candidates, normalize,
    )
    # build up a list of key terms in order of increasing score
    if isinstance(topn, float):
        topn = int(round(len(seen_candidates) * topn))
    sorted_term_scores = sorted(
        term_scores.items(),
        key=operator.itemgetter(1),
        reverse=False,
    )
    return ke_utils.get_filtered_topn_terms(
        sorted_term_scores, topn, match_threshold=0.8)

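# Hypothetical usage sketch for yake(), assuming a spaCy pipeline that provides
# sentence segmentation and POS tags (the model name is illustrative):
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Key term extraction identifies the words and phrases that best "
    "summarize a document, without requiring any training data."
)
for term, score in yake(doc, ngrams=(1, 2), topn=5):
    print(term, round(score, 4))  # lower YAKE scores indicate stronger key terms
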
def words(
    doc,
    filter_stops=True,
    filter_punct=True,
    filter_nums=False,
    include_pos=None,
    exclude_pos=None,
    min_freq=1,
):
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove number-like words (e.g. 10, 'ten')
            from word list
        include_pos (str or Set[str]): remove words whose part-of-speech tag
            IS NOT included in this param
        exclude_pos (str or Set[str]): remove words whose part-of-speech tag
            IS in the specified tags
        min_freq (int): remove words that occur in ``doc`` fewer than
            ``min_freq`` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
        in order of appearance in the document

    Raises:
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of
            str, or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for
        details, check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        if isinstance(include_pos, compat.unicode_):
            include_pos = include_pos.upper()
            words_ = (w for w in words_ if w.pos_ == include_pos)
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            words_ = (w for w in words_ if w.pos_ in include_pos)
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, compat.unicode_):
            exclude_pos = exclude_pos.upper()
            words_ = (w for w in words_ if w.pos_ != exclude_pos)
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            words_ = (w for w in words_ if w.pos_ not in exclude_pos)
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(w.lower_ for w in words_)
        words_ = (w for w in words_ if freqs[w.lower_] >= min_freq)

    for word in words_:
        yield word

def named_entities(doc, include_types=None, exclude_types=None,
                   drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): Remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               are, effectively, *new* entities, and not saved to the
               ``SpacyDoc`` from which they came. This is irritating but
               unavoidable, since this function is not meant to have side-effects
               on document state. If you're only using the text of the returned
               spans, this is no big deal, but watch out if you're counting on
               determiner-less entities associated with the doc downstream.

        min_freq (int): remove named entities that occur in ``doc`` fewer than
            ``min_freq`` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
        filters in order of appearance in the document

    Raises:
        TypeError: if ``include_types`` or ``exclude_types`` is not a str, a set
            of str, or a falsy value
    """
    if hasattr(doc, "spacy_doc"):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    # HACK: spacy's models have been erroneously tagging whitespace as entities
    # https://github.com/explosion/spaCy/commit/1e6725e9b734862e61081a916baf440697b9971e
    nes = (ne for ne in nes if not ne.text.isspace())
    include_types = _parse_ne_types(include_types, "include")
    exclude_types = _parse_ne_types(exclude_types, "exclude")
    if include_types:
        if isinstance(include_types, compat.unicode_):
            nes = (ne for ne in nes if ne.label_ == include_types)
        elif isinstance(include_types, (set, frozenset, list, tuple)):
            nes = (ne for ne in nes if ne.label_ in include_types)
    if exclude_types:
        if isinstance(exclude_types, compat.unicode_):
            nes = (ne for ne in nes if ne.label_ != exclude_types)
        elif isinstance(exclude_types, (set, frozenset, list, tuple)):
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
    if drop_determiners is True:
        nes = (
            ne if ne[0].pos != DET
            else SpacySpan(ne.doc, ne.start + 1, ne.end,
                           label=ne.label, vector=ne.vector)
            for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.lower_ for ne in nes)
        nes = (ne for ne in nes if freqs[ne.lower_] >= min_freq)

    for ne in nes:
        yield ne

def named_entities(doc, include_types=None, exclude_types=None,
                   drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): Remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               are, effectively, *new* entities, and not saved to the
               ``SpacyDoc`` from which they came. This is irritating but
               unavoidable, since this function is not meant to have side-effects
               on document state. If you're only using the text of the returned
               spans, this is no big deal, but watch out if you're counting on
               determiner-less entities associated with the doc downstream.

        min_freq (int): remove named entities that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
        filters in order of appearance in the document

    Raises:
        TypeError: if `include_types` or `exclude_types` is not a str, a set of
            str, or a falsy value
    """
    if hasattr(doc, 'spacy_doc'):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    if include_types:
        if isinstance(include_types, compat.unicode_):
            include_types = include_types.upper()
            if include_types == 'NUMERIC':
                include_types = constants.NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ == include_types)
        if isinstance(include_types, (set, frozenset, list, tuple)):
            include_types = {type_.upper() for type_ in include_types}
            nes = (ne for ne in nes if ne.label_ in include_types)
        elif not isinstance(include_types, compat.unicode_):
            msg = 'invalid `include_types` type: "{}"'.format(type(include_types))
            raise TypeError(msg)
    if exclude_types:
        if isinstance(exclude_types, compat.unicode_):
            exclude_types = exclude_types.upper()
            if exclude_types == 'NUMERIC':
                exclude_types = constants.NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ != exclude_types)
        if isinstance(exclude_types, (set, frozenset, list, tuple)):
            exclude_types = {type_.upper() for type_ in exclude_types}
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
        elif not isinstance(exclude_types, compat.unicode_):
            msg = 'invalid `exclude_types` type: "{}"'.format(type(exclude_types))
            raise TypeError(msg)
    if drop_determiners is True:
        nes = (
            ne if ne[0].pos != DET
            else SpacySpan(ne.doc, ne.start + 1, ne.end,
                           label=ne.label, vector=ne.vector)
            for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.lower_ for ne in nes)
        nes = (ne for ne in nes if freqs[ne.lower_] >= min_freq)

    for ne in nes:
        yield ne

def ngrams(
    doc: Union[Doc, Span],
    n: int,
    *,
    filter_stops: bool = True,
    filter_punct: bool = True,
    filter_nums: bool = False,
    include_pos: Optional[Union[str, Set[str]]] = None,
    exclude_pos: Optional[Union[str, Set[str]]] = None,
    min_freq: int = 1,
) -> Iterable[Span]:
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc
        n: Number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc.
        filter_stops: If True, remove ngrams that start or end with a stop word
        filter_punct: If True, remove ngrams that contain any punctuation-only tokens
        filter_nums: If True, remove ngrams that contain any numbers
            or number-like tokens (e.g. 10, 'ten')
        include_pos: Remove ngrams if any of their constituent tokens'
            part-of-speech tags ARE NOT included in this param
        exclude_pos: Remove ngrams if any of their constituent tokens'
            part-of-speech tags ARE included in this param
        min_freq: Remove ngrams that occur in ``doc`` fewer than ``min_freq`` times

    Yields:
        Next ngram from ``doc`` passing all specified filters, in order of
        appearance in the document

    Raises:
        ValueError: if ``n`` < 1
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of
            str, or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for
        details, check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    if n < 1:
        raise ValueError("n must be greater than or equal to 1")

    ngrams_ = (doc[i:i + n] for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if include_pos:
        include_pos = cast(Set[str], utils.to_collection(include_pos, str, set))
        include_pos = {pos.upper() for pos in include_pos}
        ngrams_ = (ngram for ngram in ngrams_
                   if all(w.pos_ in include_pos for w in ngram))
    if exclude_pos:
        exclude_pos = cast(Set[str], utils.to_collection(exclude_pos, str, set))
        exclude_pos = {pos.upper() for pos in exclude_pos}
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.pos_ in exclude_pos for w in ngram))
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(ngram.lower_ for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[ngram.lower_] >= min_freq)

    for ngram in ngrams_:
        yield ngram

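# Hypothetical usage sketch for ngrams(): keep bigrams made up entirely of
# adjectives and nouns, dropping any that start or end with a stop word
# (assumes ``doc`` is a parsed spaCy Doc):
bigrams = list(ngrams(doc, 2, filter_stops=True, include_pos={"ADJ", "NOUN"}))
bigram_texts = [bg.text for bg in bigrams]
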
def entities(
    doc: Doc,
    *,
    include_types: Optional[Union[str, Set[str]]] = None,
    exclude_types: Optional[Union[str, Set[str]]] = None,
    drop_determiners: bool = True,
    min_freq: int = 1,
) -> Iterable[Span]:
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a ``Doc``, optionally filtering by entity types and frequencies.

    Args:
        doc
        include_types: Remove entities whose type IS NOT in this param;
            if "NUMERIC", all numeric entity types ("DATE", "MONEY", "ORDINAL",
            etc.) are included
        exclude_types: Remove entities whose type IS in this param; if "NUMERIC",
            all numeric entity types ("DATE", "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners: Remove leading determiners (e.g. "the") from entities
            (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               are, effectively, *new* entities, and not saved to the ``Doc``
               from which they came. This is irritating but unavoidable, since
               this function is not meant to have side-effects on document state.
               If you're only using the text of the returned spans, this is no
               big deal, but watch out if you're counting on determiner-less
               entities associated with the doc downstream.

        min_freq: Remove entities that occur in ``doc`` fewer than ``min_freq`` times

    Yields:
        Next entity from ``doc`` passing all specified filters in order of
        appearance in the document

    Raises:
        TypeError: if ``include_types`` or ``exclude_types`` is not a str, a set
            of str, or a falsy value
    """
    ents = doc.ents
    # HACK: spacy's models have been erroneously tagging whitespace as entities
    # https://github.com/explosion/spaCy/commit/1e6725e9b734862e61081a916baf440697b9971e
    ents = (ent for ent in ents if not ent.text.isspace())
    include_types = _parse_ent_types(include_types, "include")
    exclude_types = _parse_ent_types(exclude_types, "exclude")
    if include_types:
        if isinstance(include_types, str):
            ents = (ent for ent in ents if ent.label_ == include_types)
        elif isinstance(include_types, (set, frozenset, list, tuple)):
            ents = (ent for ent in ents if ent.label_ in include_types)
    if exclude_types:
        if isinstance(exclude_types, str):
            ents = (ent for ent in ents if ent.label_ != exclude_types)
        elif isinstance(exclude_types, (set, frozenset, list, tuple)):
            ents = (ent for ent in ents if ent.label_ not in exclude_types)
    if drop_determiners is True:
        ents = (
            ent if ent[0].pos != DET
            else Span(ent.doc, ent.start + 1, ent.end,
                      label=ent.label, vector=ent.vector)
            for ent in ents)
    if min_freq > 1:
        ents = list(ents)
        freqs = itertoolz.frequencies(ent.lower_ for ent in ents)
        ents = (ent for ent in ents if freqs[ent.lower_] >= min_freq)

    for ent in ents:
        yield ent

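# Hypothetical usage sketch for entities(), assuming an NER-capable spaCy
# pipeline (the model name is illustrative):
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple opened an office in Berlin in 2021, and Apple hired 50 people.")
orgs_and_places = list(entities(doc, include_types={"ORG", "GPE"}))
non_numeric = list(entities(doc, exclude_types="NUMERIC"))
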
def to_bag_of_terms(doc, ngrams=(1, 2, 3), entities=True, normalize="lemma",
                    weighting="count", as_strings=False, **kwargs):
    """
    Transform ``Doc`` into a bag-of-terms: the set of unique terms in ``Doc``
    mapped to their frequency of occurrence, where "terms" includes ngrams
    and/or entities.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        ngrams (int or Set[int]): n of which n-grams to include; ``(1, 2, 3)``
            (default) includes unigrams (words), bigrams, and trigrams; `2`
            if only bigrams are wanted; falsy (e.g. False) to not include any
        entities (bool): If True (default), include named entities;
            note: if ngrams are also included, any ngrams that exactly overlap
            with an entity are skipped to prevent double-counting
        normalize (str or callable): If "lemma", lemmatize terms; if "lower",
            lowercase terms; if falsy, use the form of terms as they appear
            in ``doc``; if a callable, must accept a ``Token`` or ``Span`` and
            return a str, e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
        weighting ({"count", "freq", "binary"}): Type of weight to assign to
            terms. If "count" (default), weights are the absolute number of
            occurrences (count) of term in doc. If "binary", all counts are
            set equal to 1. If "freq", term counts are normalized by the
            total token count, giving their relative frequency of occurrence.
        as_strings (bool): If True, words are returned as strings; if False
            (default), words are returned as their unique integer ids.
        kwargs:
            - filter_stops (bool)
            - filter_punct (bool)
            - filter_nums (bool)
            - include_pos (str or Set[str])
            - exclude_pos (str or Set[str])
            - min_freq (int)
            - include_types (str or Set[str])
            - exclude_types (str or Set[str])
            - drop_determiners (bool)

            See :func:`textacy.extract.words()`, :func:`textacy.extract.ngrams()`,
            and :func:`textacy.extract.entities()` for details.

    Returns:
        dict: mapping of a unique term id or string (depending on the value
        of ``as_strings``) to its absolute, relative, or binary frequency
        of occurrence (depending on the value of ``weighting``).

    See Also:
        :func:`to_terms_list()`, which is used under the hood.
    """
    if weighting not in {"count", "freq", "binary"}:
        raise ValueError('weighting "{}" is invalid'.format(weighting))
    terms_list = to_terms_list(
        doc, ngrams=ngrams, entities=entities,
        normalize=normalize, as_strings=as_strings, **kwargs)
    bot = itertoolz.frequencies(terms_list)
    if weighting == "freq":
        n_tokens = len(doc)
        bot = {term: weight / n_tokens for term, weight in bot.items()}
    elif weighting == "binary":
        bot = {term: 1 for term in bot.keys()}
    return bot

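# Hypothetical usage sketch for to_bag_of_terms(), assuming the companion
# to_terms_list() function defined in the same module and a parsed spaCy Doc:
bot = to_bag_of_terms(
    doc, ngrams=(1, 2), entities=True,
    normalize="lemma", weighting="freq", as_strings=True,
)
top_terms = sorted(bot.items(), key=lambda kv: kv[1], reverse=True)[:10]
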
def words(
    doc,
    *,
    filter_stops=True,
    filter_punct=True,
    filter_nums=False,
    include_pos=None,
    exclude_pos=None,
    min_freq=1,
):
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doc (:class:`spacy.tokens.Doc` or :class:`spacy.tokens.Span`)
        filter_stops (bool): If True, remove stop words from word list.
        filter_punct (bool): If True, remove punctuation from word list.
        filter_nums (bool): If True, remove number-like words (e.g. 10, "ten")
            from word list.
        include_pos (str or Set[str]): Remove words whose part-of-speech tag
            IS NOT included in this param.
        exclude_pos (str or Set[str]): Remove words whose part-of-speech tag
            IS in the specified tags.
        min_freq (int): Remove words that occur in ``doc`` fewer than
            ``min_freq`` times.

    Yields:
        :class:`spacy.tokens.Token`: Next token from ``doc`` passing specified
        filters in order of appearance in the document.

    Raises:
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of
            str, or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for
        details, check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        include_pos = {
            pos.upper() for pos in utils.to_collection(include_pos, str, set)
        }
        words_ = (w for w in words_ if w.pos_ in include_pos)
    if exclude_pos:
        exclude_pos = {
            pos.upper() for pos in utils.to_collection(exclude_pos, str, set)
        }
        words_ = (w for w in words_ if w.pos_ not in exclude_pos)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(w.lower_ for w in words_)
        words_ = (w for w in words_ if freqs[w.lower_] >= min_freq)

    for word in words_:
        yield word

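# Hypothetical usage sketch for words(): keep nouns and verbs that are not
# stop words and occur at least twice in the document (assumes ``doc`` is a
# parsed spaCy Doc):
content_words = [
    w.text
    for w in words(doc, filter_stops=True, include_pos={"NOUN", "VERB"}, min_freq=2)
]
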