Example #1
def test_frequencies():
    assert (frequencies(["cat", "pig", "cat", "eel",
                        "pig", "dog", "dog", "dog"]) ==
            {"cat": 2, "eel": 1, "pig": 2, "dog": 3})
    assert frequencies([]) == {}
    assert frequencies("onomatopoeia") == {"a": 2, "e": 1, "i": 1, "m": 1,
                                           "o": 4, "n": 1, "p": 1, "t": 1}
Example #2
def noun_chunks(doc, drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc, optionally
    filtering by frequency and dropping leading determiners.

    Args:
        doc (``spacy.Doc``)
        drop_determiners (bool, optional): remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq (int, optional): remove chunks that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next noun chunk from ``doc`` in order of appearance
             in the document
    """
    ncs = doc.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:]
               for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(normalized_str(nc) for nc in ncs)
        ncs = (nc for nc in ncs
               if freqs[normalized_str(nc)] >= min_freq)

    for nc in ncs:
        yield nc
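The ``min_freq`` branch above uses the pattern that recurs throughout these examples:
materialize the lazy generator once, count normalized forms with ``itertoolz.frequencies``,
then re-filter against those counts. A stripped-down sketch of that pattern with plain
strings (``str.lower`` standing in for ``normalized_str``), assuming ``cytoolz`` is installed:

from cytoolz import itertoolz  # plain toolz.itertoolz behaves the same

tokens = ["Dog", "dog", "cat", "DOG", "eel", "cat"]
normalize = str.lower  # stand-in for the normalized_str() helper used above
freqs = itertoolz.frequencies(normalize(t) for t in tokens)  # {'dog': 3, 'cat': 2, 'eel': 1}
frequent = [t for t in tokens if freqs[normalize(t)] >= 2]
# frequent == ['Dog', 'dog', 'cat', 'DOG', 'cat']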
Example #3
def named_entities(doc,
                   good_ne_types=None, bad_ne_types=None, min_freq=1,
                   drop_determiners=True):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from a
    spacy-parsed doc, optionally filtering by the entity types and frequencies.

    Args:
        doc (``spacy.Doc``)
        good_ne_types (set[str] or 'numeric', optional): named entity types to
            include; if "numeric", all numeric entity types are included
        bad_ne_types (set[str] or 'numeric', optional): named entity types to
            exclude; if "numeric", all numeric entity types are excluded
        min_freq (int, optional): remove named entities that occur in `doc` fewer
            than `min_freq` times
        drop_determiners (bool, optional): remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States")

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
            filters in order of appearance in the document
    """
    nes = doc.ents
    if good_ne_types:
        if good_ne_types == 'numeric':
            good_ne_types = NUMERIC_NE_TYPES
        nes = (ne for ne in nes
               if ne.label_ in good_ne_types)
    if bad_ne_types:
        if bad_ne_types == 'numeric':
            bad_ne_types = NUMERIC_NE_TYPES
        nes = (ne for ne in nes
               if ne.label_ not in bad_ne_types)
    if drop_determiners is True:
        nes = (ne if ne[0].pos != DET else ne[1:]
               for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.text for ne in nes)
        nes = (ne for ne in nes
               if freqs[ne.text] >= min_freq)

    for ne in nes:
        yield ne
Example #4
def words(doc,
          filter_stops=True, filter_punct=True, filter_nums=False,
          good_pos_tags=None, bad_pos_tags=None, min_freq=1):
    """
    Extract an ordered sequence of words from a spacy-parsed doc, optionally
    filtering words by part-of-speech (etc.) and frequency.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``)
        filter_stops (bool, optional): if True, remove stop words from word list
        filter_punct (bool, optional): if True, remove punctuation from word list
        filter_nums (bool, optional): if True, remove number-like words
            (e.g. 10, 'ten') from word list
        good_pos_tags (set[str], optional): remove words whose part-of-speech tag
            is NOT in the specified tags, drawn from the universal POS tag set
        bad_pos_tags (set[str], optional): remove words whose part-of-speech tag
            IS in the specified tags, drawn from the universal POS tag set
        min_freq (int, optional): remove words that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
            in order of appearance in the document
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if good_pos_tags:
        words_ = (w for w in words_ if w.pos_ in good_pos_tags)
    if bad_pos_tags:
        words_ = (w for w in words_ if w.pos_ not in bad_pos_tags)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(normalized_str(w) for w in words_)
        words_ = (w for w in words_
                  if freqs[normalized_str(w)] >= min_freq)

    for word in words_:
        yield word
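A hypothetical usage sketch for the function above, assuming spaCy and its small English
model (en_core_web_sm) are installed, and that the helpers the code relies on
(``itertoolz``, ``normalized_str``) are importable alongside it:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The lazy dog sleeps. The lazy dog barks at ten cats.")
# nouns and verbs only, no number-like tokens, occurring at least twice
for tok in words(doc, filter_nums=True, good_pos_tags={"NOUN", "VERB"}, min_freq=2):
    print(tok.text, tok.pos_)  # e.g. "dog NOUN" (printed twice)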
Example #5
    def to_bag_of_terms(self, ngrams=(1, 2, 3), named_entities=True,
                        lemmatize=True, lowercase=False,
                        weighting='count', as_strings=False, **kwargs):
        """
        Transform ``Doc`` into a bag-of-terms: the set of unique terms in ``Doc``
        mapped to their frequency of occurrence, where "terms" includes ngrams
        and/or named entities.

        Args:
            ngrams (int or Set[int]): n of which n-grams to include; ``(1, 2, 3)``
                (default) includes unigrams (words), bigrams, and trigrams; `2`
                if only bigrams are wanted; falsy (e.g. False) to not include any
            named_entities (bool): if True (default), include named entities;
                note: if ngrams are also included, any ngrams that exactly
                overlap with an entity are skipped to prevent double-counting
            lemmatize (bool): if True, words are lemmatized before counting;
                for example, 'happy', 'happier', and 'happiest' would be grouped
                together as 'happy', with a count of 3
            lowercase (bool): if True and ``lemmatize`` is False, words are lower-
                cased before counting; for example, 'happy' and 'Happy' would be
                grouped together as 'happy', with a count of 2
            weighting ({'count', 'freq', 'binary'}): Type of weight to assign to
                terms. If 'count' (default), weights are the absolute number of
                occurrences (count) of term in doc. If 'binary', all counts are
                set equal to 1. If 'freq', term counts are normalized by the
                total token count, giving their relative frequency of occurrence.
            as_strings (bool): if True, words are returned as strings; if False
                (default), words are returned as their unique integer ids
            kwargs:
                - filter_stops (bool)
                - filter_punct (bool)
                - filter_nums (bool)
                - include_pos (str or Set[str])
                - exclude_pos (str or Set[str])
                - min_freq (int)
                - include_types (str or Set[str])
                - exclude_types (str or Set[str])
                - drop_determiners (bool)

                See :func:`extract.words() <textacy.extract.words>`,
                :func:`extract.ngrams() <textacy.extract.ngrams>`,
                and :func:`extract.named_entities() <textacy.extract.named_entities>`
                for more information on these parameters.

        Returns:
            dict: mapping of a unique term id or string (depending on the value
                of ``as_strings``) to its absolute, relative, or binary frequency
                of occurrence (depending on the value of ``weighting``).

        See Also:
            :meth:`Doc.to_terms_list() <Doc.to_terms_list>`
        """
        if weighting not in {'count', 'freq', 'binary'}:
            raise ValueError('weighting "{}" is invalid'.format(weighting))
        terms_list = self.to_terms_list(
            ngrams=ngrams, named_entities=named_entities,
            lemmatize=lemmatize, lowercase=lowercase,
            as_strings=as_strings, **kwargs)
        bot = itertoolz.frequencies(terms_list)
        if weighting == 'freq':
            n_tokens = self.n_tokens
            bot = {term: weight / n_tokens for term, weight in bot.items()}
        elif weighting == 'binary':
            bot = {term: 1 for term in bot.keys()}
        return bot
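The three ``weighting`` options reduce to small transformations of the
``itertoolz.frequencies`` output; a self-contained sketch of that logic, independent
of the surrounding class:

from cytoolz import itertoolz

terms = ["cat", "dog", "dog", "dog"]
counts = itertoolz.frequencies(terms)                 # 'count': {'cat': 1, 'dog': 3}
n_tokens = 4
freq = {t: c / n_tokens for t, c in counts.items()}   # 'freq': {'cat': 0.25, 'dog': 0.75}
binary = {t: 1 for t in counts}                       # 'binary': {'cat': 1, 'dog': 1}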
Example #6
def ngrams(
    doc,
    n,
    filter_stops=True,
    filter_punct=True,
    filter_nums=False,
    include_pos=None,
    exclude_pos=None,
    min_freq=1,
):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc.
        filter_stops (bool): if True, remove ngrams that start or end
            with a stop word
        filter_punct (bool): if True, remove ngrams that contain
            any punctuation-only tokens
        filter_nums (bool): if True, remove ngrams that contain
            any numbers or number-like tokens (e.g. 10, 'ten')
        include_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE NOT included in this param
        exclude_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE included in this param
        min_freq (int): remove ngrams that occur in ``doc`` fewer than
            ``min_freq`` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified
        filters, in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of str,
            or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for details,
        check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    if n < 1:
        raise ValueError("n must be greater than or equal to 1")

    ngrams_ = (doc[i:i + n] for i in compat.range_(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if include_pos:
        if isinstance(include_pos, compat.unicode_):
            include_pos = include_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ == include_pos for w in ngram))
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ in include_pos for w in ngram))
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, compat.unicode_):
            exclude_pos = exclude_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ != exclude_pos for w in ngram))
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ not in exclude_pos for w in ngram))
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(ngram.lower_ for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[ngram.lower_] >= min_freq)

    for ngram in ngrams_:
        yield ngram
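A hypothetical usage sketch for the function above, assuming spaCy and the
en_core_web_sm model are installed and that the function runs in its original module,
where ``compat`` and ``itertoolz`` are already imported:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Natural language processing helps. Natural language processing scales.")
# bigrams whose tokens are all tagged NOUN, occurring at least twice
for bg in ngrams(doc, 2, include_pos="NOUN", min_freq=2):
    print(bg.text)  # e.g. "language processing" (printed twice)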
Example #7
def ngrams(doc, n,
           filter_stops=True, filter_punct=True, filter_nums=False,
           include_pos=None, exclude_pos=None, min_freq=1):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc.
        filter_stops (bool): if True, remove ngrams that start or end
            with a stop word
        filter_punct (bool): if True, remove ngrams that contain
            any punctuation-only tokens
        filter_nums (bool): if True, remove ngrams that contain
            any numbers or number-like tokens (e.g. 10, 'ten')
        include_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE NOT included in this param
        exclude_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE included in this param
        min_freq (int, optional): remove ngrams that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified
            filters, in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
        TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str,
            or a falsy value

    .. note:: Filtering by part-of-speech tag uses the universal POS tag set,
        http://universaldependencies.org/u/pos/
    """
    if n < 1:
        raise ValueError('n must be greater than or equal to 1')

    ngrams_ = (doc[i: i + n]
               for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if include_pos:
        if isinstance(include_pos, unicode_type):
            include_pos = include_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ == include_pos for w in ngram))
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ in include_pos for w in ngram))
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, unicode_type):
            exclude_pos = exclude_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ != exclude_pos for w in ngram))
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ not in exclude_pos for w in ngram))
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[normalized_str(ngram)] >= min_freq)

    for ngram in ngrams_:
        yield ngram
Example #8
def named_entities(doc,
                   include_types=None,
                   exclude_types=None,
                   drop_determiners=True,
                   min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States")
        min_freq (int): remove named entities that occur in `doc` fewer
            than `min_freq` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
            filters in order of appearance in the document

    Raises:
        TypeError: if `include_types` or `exclude_types` is not a str, a set of
            str, or a falsy value
    """
    if isinstance(doc, textacy.Doc):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    if include_types:
        if isinstance(include_types, unicode_type):
            include_types = include_types.upper()
            if include_types == 'NUMERIC':
                include_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ == include_types)
        if isinstance(include_types, (set, frozenset, list, tuple)):
            include_types = {type_.upper() for type_ in include_types}
            nes = (ne for ne in nes if ne.label_ in include_types)
        elif not isinstance(include_types, unicode_type):  # plain str handled above
            msg = 'invalid `include_types` type: "{}"'.format(
                type(include_types))
            raise TypeError(msg)
    if exclude_types:
        if isinstance(exclude_types, unicode_type):
            exclude_types = exclude_types.upper()
            if exclude_types == 'NUMERIC':
                exclude_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ != exclude_types)
        if isinstance(exclude_types, (set, frozenset, list, tuple)):
            exclude_types = {type_.upper() for type_ in exclude_types}
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
        elif not isinstance(exclude_types, unicode_type):  # plain str handled above
            msg = 'invalid `exclude_types` type: "{}"'.format(
                type(exclude_types))
            raise TypeError(msg)
    if drop_determiners is True:
        nes = (ne if ne[0].pos != DET else ne[1:] for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.text for ne in nes)
        nes = (ne for ne in nes if freqs[ne.text] >= min_freq)

    for ne in nes:
        yield ne
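A hypothetical usage sketch for the function above, assuming spaCy and the
en_core_web_sm model are installed and that the module-level names it uses
(``textacy``, ``DET``, ``itertoolz``) are available as in its original module:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple opened an office in Paris. Apple also hired 40 engineers.")
# keep only ORG and GPE entities; "40" (CARDINAL) is filtered out
for ent in named_entities(doc, include_types={"ORG", "GPE"}, drop_determiners=True):
    print(ent.text, ent.label_)  # e.g. "Apple ORG", "Paris GPE", "Apple ORG"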
Example #9
def named_entities(doc,
                   include_types=None, exclude_types=None,
                   drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States")
        min_freq (int): remove named entities that occur in `doc` fewer
            than `min_freq` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
            filters in order of appearance in the document

    Raises:
        TypeError: if `include_types` or `exclude_types` is not a str, a set of
            str, or a falsy value
    """
    if isinstance(doc, textacy.Doc):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    if include_types:
        if isinstance(include_types, unicode_type):
            include_types = include_types.upper()
            if include_types == 'NUMERIC':
                include_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ == include_types)
        if isinstance(include_types, (set, frozenset, list, tuple)):
            include_types = {type_.upper() for type_ in include_types}
            nes = (ne for ne in nes if ne.label_ in include_types)
        elif not isinstance(include_types, unicode_type):  # plain str handled above
            msg = 'invalid `include_types` type: "{}"'.format(type(include_types))
            raise TypeError(msg)
    if exclude_types:
        if isinstance(exclude_types, unicode_type):
            exclude_types = exclude_types.upper()
            if exclude_types == 'NUMERIC':
                exclude_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ != exclude_types)
        if isinstance(exclude_types, (set, frozenset, list, tuple)):
            exclude_types = {type_.upper() for type_ in exclude_types}
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
        elif not isinstance(exclude_types, unicode_type):  # plain str handled above
            msg = 'invalid `exclude_types` type: "{}"'.format(type(exclude_types))
            raise TypeError(msg)
    if drop_determiners is True:
        nes = (ne if ne[0].pos != DET else ne[1:] for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.text for ne in nes)
        nes = (ne for ne in nes
               if freqs[ne.text] >= min_freq)

    for ne in nes:
        yield ne
Example #10
def words(doc,
          filter_stops=True, filter_punct=True, filter_nums=False,
          include_pos=None, exclude_pos=None, min_freq=1):
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove number-like words (e.g. 10, 'ten')
            from word list
        include_pos (str or Set[str]): remove words whose part-of-speech tag
            IS NOT included in this param
        exclude_pos (str or Set[str]): remove words whose part-of-speech tag
            IS in the specified tags
        min_freq (int): remove words that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
            in order of appearance in the document

    Raises:
        TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str,
            or a falsy value

    .. note:: Filtering by part-of-speech tag uses the universal POS tag set,
        http://universaldependencies.org/u/pos/
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        if isinstance(include_pos, unicode_type):
            include_pos = include_pos.upper()
            words_ = (w for w in words_ if w.pos_ == include_pos)
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            words_ = (w for w in words_ if w.pos_ in include_pos)
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, unicode_type):
            exclude_pos = exclude_pos.upper()
            words_ = (w for w in words_ if w.pos_ != exclude_pos)
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            words_ = (w for w in words_ if w.pos_ not in exclude_pos)
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(normalized_str(w) for w in words_)
        words_ = (w for w in words_
                  if freqs[normalized_str(w)] >= min_freq)

    for word in words_:
        yield word
Example #11
    def to_bag_of_terms(self,
                        ngrams=(1, 2, 3),
                        named_entities=True,
                        normalize='lemma',
                        weighting='count',
                        as_strings=False,
                        **kwargs):
        """
        Transform :class:`Doc` into a bag-of-terms: the set of unique terms in
        :class:`Doc` mapped to their frequency of occurrence, where "terms"
        includes ngrams and/or named entities.

        Args:
            ngrams (int or Set[int]): n of which n-grams to include; ``(1, 2, 3)``
                (default) includes unigrams (words), bigrams, and trigrams; `2`
                if only bigrams are wanted; falsy (e.g. False) to not include any
            named_entities (bool): if True (default), include named entities;
                note: if ngrams are also included, any ngrams that exactly
                overlap with an entity are skipped to prevent double-counting
            normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
                lowercase terms; if false-y, use the form of terms as they appear
                in doc; if a callable, must accept a ``spacy.Token`` or ``spacy.Span``
                and return a str, e.g. :func:`textacy.spacy_utils.normalized_str()`
            weighting ({'count', 'freq', 'binary'}): Type of weight to assign to
                terms. If 'count' (default), weights are the absolute number of
                occurrences (count) of term in doc. If 'binary', all counts are
                set equal to 1. If 'freq', term counts are normalized by the
                total token count, giving their relative frequency of occurrence.
            as_strings (bool): if True, words are returned as strings; if False
                (default), words are returned as their unique integer ids
            kwargs:
                - filter_stops (bool)
                - filter_punct (bool)
                - filter_nums (bool)
                - include_pos (str or Set[str])
                - exclude_pos (str or Set[str])
                - min_freq (int)
                - include_types (str or Set[str])
                - exclude_types (str or Set[str])
                - drop_determiners (bool)

                See :func:`extract.words() <textacy.extract.words>`,
                :func:`extract.ngrams() <textacy.extract.ngrams>`,
                and :func:`extract.named_entities() <textacy.extract.named_entities>`
                for more information on these parameters.

        Returns:
            dict: mapping of a unique term id or string (depending on the value
            of ``as_strings``) to its absolute, relative, or binary frequency
            of occurrence (depending on the value of ``weighting``).

        See Also:
            :meth:`Doc.to_terms_list() <Doc.to_terms_list>`
        """
        if weighting not in {'count', 'freq', 'binary'}:
            raise ValueError('weighting "{}" is invalid'.format(weighting))
        terms_list = self.to_terms_list(ngrams=ngrams,
                                        named_entities=named_entities,
                                        normalize=normalize,
                                        as_strings=as_strings,
                                        **kwargs)
        bot = itertoolz.frequencies(terms_list)
        if weighting == 'freq':
            n_tokens = self.n_tokens
            bot = {term: weight / n_tokens for term, weight in bot.items()}
        elif weighting == 'binary':
            bot = {term: 1 for term in bot.keys()}
        return bot
Example #12
def named_entities(doc,
                   include_types=None,
                   exclude_types=None,
                   drop_determiners=True,
                   min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): Remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               do *not* keep their entity type annotations. This is irritating
               but unavoidable, since the only way to re-annotate them is to
               modify ``doc`` directly, and this function is not meant to have
               any side-effects. If you're only using the text of the returned
               spans, this is no big deal; if you're using NE-like attributes
               downstream, however, this is something to watch out for.

        min_freq (int): remove named entities that occur in `doc` fewer
            than `min_freq` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
            filters in order of appearance in the document

    Raises:
        TypeError: if `include_types` or `exclude_types` is not a str, a set of
            str, or a falsy value
    """
    if isinstance(doc, textacy.Doc):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    if include_types:
        if isinstance(include_types, unicode_):
            include_types = include_types.upper()
            if include_types == 'NUMERIC':
                include_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ == include_types)
        if isinstance(include_types, (set, frozenset, list, tuple)):
            include_types = {type_.upper() for type_ in include_types}
            nes = (ne for ne in nes if ne.label_ in include_types)
        elif not isinstance(include_types, unicode_):  # plain str handled above
            msg = 'invalid `include_types` type: "{}"'.format(
                type(include_types))
            raise TypeError(msg)
    if exclude_types:
        if isinstance(exclude_types, unicode_):
            exclude_types = exclude_types.upper()
            if exclude_types == 'NUMERIC':
                exclude_types = NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ != exclude_types)
        if isinstance(exclude_types, (set, frozenset, list, tuple)):
            exclude_types = {type_.upper() for type_ in exclude_types}
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
        elif not isinstance(exclude_types, unicode_):  # plain str handled above
            msg = 'invalid `exclude_types` type: "{}"'.format(
                type(exclude_types))
            raise TypeError(msg)
    if drop_determiners is True:
        nes = (ne if ne[0].pos != DET else ne[1:] for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.lower_ for ne in nes)
        nes = (ne for ne in nes if freqs[ne.lower_] >= min_freq)

    for ne in nes:
        yield ne
Example #13
def ngrams(doc, n,
           filter_stops=True, filter_punct=True, filter_nums=False,
           good_pos_tags=None, bad_pos_tags=None, min_freq=1):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a spacy-parsed
    doc, optionally filtering n-grams by the types and parts-of-speech of the
    constituent words.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 gives bigrams, 3 gives trigrams, etc.
        filter_stops (bool, optional): if True, remove ngrams that start or end
            with a stop word
        filter_punct (bool, optional): if True, remove ngrams that contain
            any punctuation-only tokens
        filter_nums (bool, optional): if True, remove ngrams that contain
            any numbers or number-like tokens (e.g. 10, 'ten')
        good_pos_tags (set[str], optional): remove ngrams whose constituent
            tokens' part-of-speech tags are NOT all in the specified tags,
            using the universal POS tagset
        bad_pos_tags (set[str], optional): remove ngrams if any of their constituent
            tokens' part-of-speech tags are in the specified tags,
            using the universal POS tagset
        min_freq (int, optional): remove ngrams that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified filters,
            in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
    """
    if n < 1:
        raise ValueError('n must be greater than or equal to 1')

    ngrams_ = (doc[i: i + n]
               for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if good_pos_tags:
        ngrams_ = (ngram for ngram in ngrams_
                   if all(w.pos_ in good_pos_tags for w in ngram))
    if bad_pos_tags:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.pos_ in bad_pos_tags for w in ngram))
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[normalized_str(ngram)] >= min_freq)

    for ngram in ngrams_:
        yield ngram
Example #14
def yake(
    doc: Doc,
    *,
    normalize: Optional[str] = "lemma",
    ngrams: Union[int, Collection[int]] = (1, 2, 3),
    include_pos: Optional[Union[str,
                                Collection[str]]] = ("NOUN", "PROPN", "ADJ"),
    window_size: int = 2,
    topn: Union[int, float] = 10,
) -> List[Tuple[str, float]]:
    """
    Extract key terms from a document using the YAKE algorithm.

    Args:
        doc: spaCy ``Doc`` from which to extract keyterms.
            Must be sentence-segmented; optionally POS-tagged.
        normalize: If "lemma", lemmatize terms; if "lower", lowercase terms;
            if None, use the form of terms as they appeared in ``doc``.

            .. note:: Unlike the other keyterm extraction functions, this one
               doesn't accept a callable for ``normalize``.

        ngrams: n of which n-grams to consider as keyterm candidates.
            For example, ``(1, 2, 3)`` includes all unigrams, bigrams, and trigrams,
            while ``2`` includes bigrams only.
        include_pos: One or more POS tags with which to filter for good candidate keyterms.
            If None, include tokens of all POS tags
            (which also allows keyterm extraction from docs without POS-tagging).
        window_size: Number of words to the right and left of a given word
            to use as context when computing the "relatedness to context"
            component of its score. Note that the resulting sliding window's
            full width is ``1 + (2 * window_size)``.
        topn: Number of top-ranked terms to return as key terms.
            If an integer, represents the absolute number; if a float, value
            must be in the interval (0.0, 1.0], which is converted to an int by
            ``int(round(len(candidates) * topn))``

    Returns:
        Sorted list of top ``topn`` key terms and their corresponding YAKE scores.

    References:
        Campos, Mangaravite, Pasquali, Jorge, Nunes, and Jatowt. (2018).
        A Text Feature Based Automatic Keyword Extraction Method for Single Documents.
        Advances in Information Retrieval. ECIR 2018.
        Lecture Notes in Computer Science, vol 10772, pp. 684-691.
    """
    # validate / transform args
    ngrams = cast(Tuple[int, ...], utils.to_collection(ngrams, int, tuple))
    include_pos = cast(Set[str], utils.to_collection(include_pos, str, set))
    if isinstance(topn, float):
        if not 0.0 < topn <= 1.0:
            raise ValueError(f"topn = {topn} is invalid; "
                             "must be an int, or a float between 0.0 and 1.0")

    # bail out on empty docs
    if not doc:
        return []

    stop_words: Set[str] = set()
    seen_candidates: Set[str] = set()
    # compute key values on a per-word basis
    word_occ_vals = _get_per_word_occurrence_values(doc, normalize, stop_words,
                                                    window_size)
    # doc doesn't have any words...
    if not word_occ_vals:
        return []

    word_freqs = {
        w_id: len(vals["is_uc"])
        for w_id, vals in word_occ_vals.items()
    }
    word_scores = _compute_word_scores(doc, word_occ_vals, word_freqs,
                                       stop_words)
    # compute scores for candidate terms based on scores of constituent words
    term_scores: Dict[str, float] = {}
    # do single-word candidates separately; it's faster and simpler
    if 1 in ngrams:
        candidates = _get_unigram_candidates(doc, include_pos)
        _score_unigram_candidates(
            candidates,
            word_freqs,
            word_scores,
            term_scores,
            stop_words,
            seen_candidates,
            normalize,
        )
    # now compute combined scores for higher-n ngram candidates
    candidates = list(
        ke_utils.get_ngram_candidates(
            doc,
            [n for n in ngrams if n > 1],
            include_pos=include_pos,
        ))
    attr_name = _get_attr_name(normalize, True)
    ngram_freqs = itertoolz.frequencies(" ".join(
        getattr(word, attr_name) for word in ngram) for ngram in candidates)
    _score_ngram_candidates(
        candidates,
        ngram_freqs,
        word_scores,
        term_scores,
        seen_candidates,
        normalize,
    )
    # build up a list of key terms in order of increasing score
    if isinstance(topn, float):
        topn = int(round(len(seen_candidates) * topn))
    sorted_term_scores = sorted(
        term_scores.items(),
        key=operator.itemgetter(1),
        reverse=False,
    )
    return ke_utils.get_filtered_topn_terms(sorted_term_scores,
                                            topn,
                                            match_threshold=0.8)
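A hypothetical usage sketch, assuming spaCy with the en_core_web_sm model is installed
and that this ``yake`` function (plus its private helpers) is importable as defined above:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp(
    "Automatic keyword extraction identifies the most relevant terms in a document. "
    "Keyword extraction methods such as YAKE score candidate terms from word statistics."
)
for term, score in yake(doc, ngrams=(1, 2), topn=5):
    print(term, round(score, 4))  # lower YAKE scores indicate stronger key terms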
Example #15
def words(
    doc,
    filter_stops=True,
    filter_punct=True,
    filter_nums=False,
    include_pos=None,
    exclude_pos=None,
    min_freq=1,
):
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove number-like words (e.g. 10, 'ten')
            from word list
        include_pos (str or Set[str]): remove words whose part-of-speech tag
            IS NOT included in this param
        exclude_pos (str or Set[str]): remove words whose part-of-speech tag
            IS in the specified tags
        min_freq (int): remove words that occur in ``doc`` fewer than
            ``min_freq`` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
        in order of appearance in the document

    Raises:
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of str,
            or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for details,
        check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        if isinstance(include_pos, compat.unicode_):
            include_pos = include_pos.upper()
            words_ = (w for w in words_ if w.pos_ == include_pos)
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            words_ = (w for w in words_ if w.pos_ in include_pos)
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, compat.unicode_):
            exclude_pos = exclude_pos.upper()
            words_ = (w for w in words_ if w.pos_ != exclude_pos)
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            words_ = (w for w in words_ if w.pos_ not in exclude_pos)
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(w.lower_ for w in words_)
        words_ = (w for w in words_ if freqs[w.lower_] >= min_freq)

    for word in words_:
        yield word
Example #16
def named_entities(doc,
                   include_types=None,
                   exclude_types=None,
                   drop_determiners=True,
                   min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): Remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               are, effectively, *new* entities, and not saved to the ``SpacyDoc``
               from which they came. This is irritating but unavoidable, since
               this function is not meant to have side-effects on document state.
               If you're only using the text of the returned spans, this is no
               big deal, but watch out if you're counting on determiner-less
               entities associated with the doc downstream.

        min_freq (int): remove named entities that occur in ``doc`` fewer
            than ``min_freq`` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
        filters in order of appearance in the document

    Raises:
        TypeError: if ``include_types`` or ``exclude_types`` is not a str, a set of
            str, or a falsy value
    """
    if hasattr(doc, "spacy_doc"):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    # HACK: spacy's models have been erroneously tagging whitespace as entities
    # https://github.com/explosion/spaCy/commit/1e6725e9b734862e61081a916baf440697b9971e
    nes = (ne for ne in nes if not ne.text.isspace())
    include_types = _parse_ne_types(include_types, "include")
    exclude_types = _parse_ne_types(exclude_types, "exclude")
    if include_types:
        if isinstance(include_types, compat.unicode_):
            nes = (ne for ne in nes if ne.label_ == include_types)
        elif isinstance(include_types, (set, frozenset, list, tuple)):
            nes = (ne for ne in nes if ne.label_ in include_types)
    if exclude_types:
        if isinstance(exclude_types, compat.unicode_):
            nes = (ne for ne in nes if ne.label_ != exclude_types)
        elif isinstance(exclude_types, (set, frozenset, list, tuple)):
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
    if drop_determiners is True:
        nes = (ne if ne[0].pos != DET else SpacySpan(
            ne.doc, ne.start + 1, ne.end, label=ne.label, vector=ne.vector)
               for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.lower_ for ne in nes)
        nes = (ne for ne in nes if freqs[ne.lower_] >= min_freq)

    for ne in nes:
        yield ne
Example #17
def named_entities(doc,
                   include_types=None, exclude_types=None,
                   drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a spacy-parsed doc, optionally filtering by entity types and frequencies.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        include_types (str or Set[str]): remove named entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types (str or Set[str]): remove named entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners (bool): Remove leading determiners (e.g. "the")
            from named entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               are, effectively, *new* entities, and not saved to the ``SpacyDoc``
               from which they came. This is irritating but unavoidable, since
               this function is not meant to have side-effects on document state.
               If you're only using the text of the returned spans, this is no
               big deal, but watch out if you're counting on determiner-less
               entities associated with the doc downstream.

        min_freq (int): remove named entities that occur in `doc` fewer
            than `min_freq` times

    Yields:
        ``spacy.Span``: the next named entity from ``doc`` passing all specified
        filters in order of appearance in the document

    Raises:
        TypeError: if `include_types` or `exclude_types` is not a str, a set of
            str, or a falsy value
    """
    if hasattr(doc, 'spacy_doc'):
        nes = doc.spacy_doc.ents
    else:
        nes = doc.ents
    if include_types:
        if isinstance(include_types, compat.unicode_):
            include_types = include_types.upper()
            if include_types == 'NUMERIC':
                include_types = constants.NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ == include_types)
        if isinstance(include_types, (set, frozenset, list, tuple)):
            include_types = {type_.upper() for type_ in include_types}
            nes = (ne for ne in nes if ne.label_ in include_types)
        elif not isinstance(include_types, compat.unicode_):  # plain str handled above
            msg = 'invalid `include_types` type: "{}"'.format(type(include_types))
            raise TypeError(msg)
    if exclude_types:
        if isinstance(exclude_types, compat.unicode_):
            exclude_types = exclude_types.upper()
            if exclude_types == 'NUMERIC':
                exclude_types = constants.NUMERIC_NE_TYPES  # we now go to next if block
            else:
                nes = (ne for ne in nes if ne.label_ != exclude_types)
        if isinstance(exclude_types, (set, frozenset, list, tuple)):
            exclude_types = {type_.upper() for type_ in exclude_types}
            nes = (ne for ne in nes if ne.label_ not in exclude_types)
        elif not isinstance(exclude_types, compat.unicode_):  # plain str handled above
            msg = 'invalid `exclude_types` type: "{}"'.format(type(exclude_types))
            raise TypeError(msg)
    if drop_determiners is True:
        nes = (
            ne if ne[0].pos != DET else SpacySpan(ne.doc, ne.start + 1, ne.end, label=ne.label, vector=ne.vector)
            for ne in nes)
    if min_freq > 1:
        nes = list(nes)
        freqs = itertoolz.frequencies(ne.lower_ for ne in nes)
        nes = (ne for ne in nes
               if freqs[ne.lower_] >= min_freq)

    for ne in nes:
        yield ne
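Example #18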
def ngrams(
    doc: Union[Doc, Span],
    n: int,
    *,
    filter_stops: bool = True,
    filter_punct: bool = True,
    filter_nums: bool = False,
    include_pos: Optional[Union[str, Set[str]]] = None,
    exclude_pos: Optional[Union[str, Set[str]]] = None,
    min_freq: int = 1,
) -> Iterable[Span]:
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc
        n: Number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc.
        filter_stops: If True, remove ngrams that start or end with a stop word
        filter_punct: If True, remove ngrams that contain any punctuation-only tokens
        filter_nums: If True, remove ngrams that contain any numbers
            or number-like tokens (e.g. 10, 'ten')
        include_pos: Remove ngrams if any of their constituent tokens' part-of-speech tags
            ARE NOT included in this param
        exclude_pos: Remove ngrams if any of their constituent tokens' part-of-speech tags
            ARE included in this param
        min_freq: Remove ngrams that occur in ``doc`` fewer than ``min_freq`` times

    Yields:
        Next ngram from ``doc`` passing all specified filters, in order of appearance
        in the document

    Raises:
        ValueError: if ``n`` < 1
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of str,
            or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for details,
        check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    if n < 1:
        raise ValueError("n must be greater than or equal to 1")

    ngrams_ = (doc[i:i + n] for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if include_pos:
        include_pos = cast(Set[str],
                           utils.to_collection(include_pos, str, set))
        include_pos = {pos.upper() for pos in include_pos}
        ngrams_ = (ngram for ngram in ngrams_
                   if all(w.pos_ in include_pos for w in ngram))
    if exclude_pos:
        exclude_pos = cast(Set[str],
                           utils.to_collection(exclude_pos, str, set))
        exclude_pos = {pos.upper() for pos in exclude_pos}
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.pos_ in exclude_pos for w in ngram))
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(ngram.lower_ for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[ngram.lower_] >= min_freq)

    for ngram in ngrams_:
        yield ngram
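Example #19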
def entities(
    doc: Doc,
    *,
    include_types: Optional[Union[str, Set[str]]] = None,
    exclude_types: Optional[Union[str, Set[str]]] = None,
    drop_determiners: bool = True,
    min_freq: int = 1,
) -> Iterable[Span]:
    """
    Extract an ordered sequence of named entities (PERSON, ORG, LOC, etc.) from
    a ``Doc``, optionally filtering by entity types and frequencies.

    Args:
        doc
        include_types: Remove entities whose type IS NOT
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are included
        exclude_types: Remove entities whose type IS
            in this param; if "NUMERIC", all numeric entity types ("DATE",
            "MONEY", "ORDINAL", etc.) are excluded
        drop_determiners: Remove leading determiners (e.g. "the")
            from entities (e.g. "the United States" => "United States").

            .. note:: Entities from which a leading determiner has been removed
               are, effectively, *new* entities, and not saved to the ``Doc``
               from which they came. This is irritating but unavoidable, since
               this function is not meant to have side-effects on document state.
               If you're only using the text of the returned spans, this is no
               big deal, but watch out if you're counting on determiner-less
               entities associated with the doc downstream.

        min_freq: Remove entities that occur in ``doc`` fewer
            than ``min_freq`` times

    Yields:
        Next entity from ``doc`` passing all specified filters in order of appearance
        in the document

    Raises:
        TypeError: if ``include_types`` or ``exclude_types`` is not a str, a set of
            str, or a falsy value
    """
    ents = doc.ents
    # HACK: spacy's models have been erroneously tagging whitespace as entities
    # https://github.com/explosion/spaCy/commit/1e6725e9b734862e61081a916baf440697b9971e
    ents = (ent for ent in ents if not ent.text.isspace())
    include_types = _parse_ent_types(include_types, "include")
    exclude_types = _parse_ent_types(exclude_types, "exclude")
    if include_types:
        if isinstance(include_types, str):
            ents = (ent for ent in ents if ent.label_ == include_types)
        elif isinstance(include_types, (set, frozenset, list, tuple)):
            ents = (ent for ent in ents if ent.label_ in include_types)
    if exclude_types:
        if isinstance(exclude_types, str):
            ents = (ent for ent in ents if ent.label_ != exclude_types)
        elif isinstance(exclude_types, (set, frozenset, list, tuple)):
            ents = (ent for ent in ents if ent.label_ not in exclude_types)
    if drop_determiners is True:
        ents = (ent if ent[0].pos != DET else Span(ent.doc,
                                                   ent.start + 1,
                                                   ent.end,
                                                   label=ent.label,
                                                   vector=ent.vector)
                for ent in ents)
    if min_freq > 1:
        ents = list(ents)
        freqs = itertoolz.frequencies(ent.lower_ for ent in ents)
        ents = (ent for ent in ents if freqs[ent.lower_] >= min_freq)

    for ent in ents:
        yield ent
Example #20
def to_bag_of_terms(doc,
                    ngrams=(1, 2, 3),
                    entities=True,
                    normalize="lemma",
                    weighting="count",
                    as_strings=False,
                    **kwargs):
    """
    Transform ``Doc`` into a bag-of-terms: the set of unique terms in ``Doc``
    mapped to their frequency of occurrence, where "terms" includes ngrams and/or entities.

    Args:
        doc (:class:`spacy.tokens.Doc`)
        ngrams (int or Set[int]): n of which n-grams to include; ``(1, 2, 3)``
            (default) includes unigrams (words), bigrams, and trigrams; `2`
            if only bigrams are wanted; falsy (e.g. False) to not include any
        entities (bool): If True (default), include named entities;
            note: if ngrams are also included, any ngrams that exactly
            overlap with an entity are skipped to prevent double-counting
        normalize (str or callable): If "lemma", lemmatize terms; if "lower",
            lowercase terms; if falsy, use the form of terms as they appear
            in ``doc``; if a callable, must accept a ``Token`` or ``Span``
            and return a str, e.g. :func:`textacy.spacier.utils.get_normalized_text()`.
        weighting ({"count", "freq", "binary"}): Type of weight to assign to
            terms. If "count" (default), weights are the absolute number of
            occurrences (count) of term in doc. If "binary", all counts are
            set equal to 1. If "freq", term counts are normalized by the
            total token count, giving their relative frequency of occurrence.
        as_strings (bool): If True, words are returned as strings; if False
            (default), words are returned as their unique integer ids.
        kwargs:
            - filter_stops (bool)
            - filter_punct (bool)
            - filter_nums (bool)
            - include_pos (str or Set[str])
            - exclude_pos (str or Set[str])
            - min_freq (int)
            - include_types (str or Set[str])
            - exclude_types (str or Set[str])
            - drop_determiners (bool)

            See :func:`textacy.extract.words()`, :func:`textacy.extract.ngrams()`,
            and :func:`textacy.extract.entities()`  for details.

    Returns:
        dict: mapping of a unique term id or string (depending on the value
        of ``as_strings``) to its absolute, relative, or binary frequency
        of occurrence (depending on the value of ``weighting``).

    See Also:
        :func:`to_terms_list()`, which is used under the hood.
    """
    if weighting not in {"count", "freq", "binary"}:
        raise ValueError('weighting "{}" is invalid'.format(weighting))
    terms_list = to_terms_list(doc,
                               ngrams=ngrams,
                               entities=entities,
                               normalize=normalize,
                               as_strings=as_strings,
                               **kwargs)
    bot = itertoolz.frequencies(terms_list)
    if weighting == "freq":
        n_tokens = len(doc)
        bot = {term: weight / n_tokens for term, weight in bot.items()}
    elif weighting == "binary":
        bot = {term: 1 for term in bot.keys()}
    return bot
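A hypothetical usage sketch for the function above, assuming spaCy and the
en_core_web_sm model are installed and that ``to_terms_list`` is importable from the
same module:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The dog chased the cat. The dog barked at the cat and the mailman.")
bot = to_bag_of_terms(doc, ngrams=(1, 2), entities=True,
                      normalize="lemma", weighting="freq", as_strings=True)
# mapping of normalized term string -> relative frequency, e.g. {'dog': 0.125, ...}
print(sorted(bot.items(), key=lambda kv: kv[1], reverse=True)[:5])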
Example #21
def words(
    doc,
    *,
    filter_stops=True,
    filter_punct=True,
    filter_nums=False,
    include_pos=None,
    exclude_pos=None,
    min_freq=1,
):
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doc (:class:`spacy.tokens.Doc` or :class:`spacy.tokens.Span`)
        filter_stops (bool): If True, remove stop words from word list.
        filter_punct (bool): If True, remove punctuation from word list.
        filter_nums (bool): If True, remove number-like words (e.g. 10, "ten")
            from word list.
        include_pos (str or Set[str]): Remove words whose part-of-speech tag
            IS NOT included in this param.
        exclude_pos (str or Set[str]): Remove words whose part-of-speech tag
            IS in the specified tags.
        min_freq (int): Remove words that occur in ``doc`` fewer than
            ``min_freq`` times.

    Yields:
        :class:`spacy.tokens.Token`: Next token from ``doc`` passing specified filters
        in order of appearance in the document.

    Raises:
        TypeError: if ``include_pos`` or ``exclude_pos`` is not a str, a set of str,
            or a falsy value

    Note:
        Filtering by part-of-speech tag uses the universal POS tag set; for details,
        check spaCy's docs: https://spacy.io/api/annotation#pos-tagging
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        include_pos = {
            pos.upper()
            for pos in utils.to_collection(include_pos, str, set)
        }
        words_ = (w for w in words_ if w.pos_ in include_pos)
    if exclude_pos:
        exclude_pos = {
            pos.upper()
            for pos in utils.to_collection(exclude_pos, str, set)
        }
        words_ = (w for w in words_ if w.pos_ not in exclude_pos)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(w.lower_ for w in words_)
        words_ = (w for w in words_ if freqs[w.lower_] >= min_freq)

    for word in words_:
        yield word