Example #1
def noun_chunks(doc, drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc, optionally
    filtering by frequency and dropping leading determiners.

    Args:
        doc (``spacy.Doc``)
        drop_determiners (bool, optional): remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq (int, optional): remove chunks that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next noun chunk from ``doc`` in order of appearance
             in the document
    """
    ncs = doc.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:]
               for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(normalized_str(nc) for nc in ncs)
        ncs = (nc for nc in ncs
               if freqs[normalized_str(nc)] >= min_freq)

    for nc in ncs:
        yield nc
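A minimal usage sketch for the function above (assumes spaCy and an English model are installed, and that noun_chunks() plus its module-level dependencies -- DET from spacy.symbols, cytoolz's itertoolz, and normalized_str -- are in scope; the model name below is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')  # model name is an assumption
doc = nlp('The quick brown fox jumps over the lazy dog.')

# leading determiners dropped, no frequency filtering
for chunk in noun_chunks(doc, drop_determiners=True, min_freq=1):
    print(chunk.text)  # e.g. "quick brown fox", "lazy dog"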
Example #2
def noun_chunks(doc, drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc, optionally
    filtering by frequency and dropping leading determiners.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        drop_determiners (bool): remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq (int): remove chunks that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next noun chunk from ``doc`` in order of appearance
             in the document
    """
    if isinstance(doc, textacy.Doc):
        ncs = doc.spacy_doc.noun_chunks
    else:
        ncs = doc.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:]
               for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(normalized_str(nc) for nc in ncs)
        ncs = (nc for nc in ncs
               if freqs[normalized_str(nc)] >= min_freq)

    for nc in ncs:
        yield nc
Example #3
def test_normalize_str(spacy_doc):
    normalized_strs = [
        'the', 'unit', 'test', 'be', 'not', 'go', 'well', '.', '-PRON-',
        'love', 'Python', ',', 'but', '-PRON-', 'do', 'not', 'love',
        'backwards', 'incompatibility', '.', 'no', 'programmer', 'be',
        'permanently', 'damage', 'for', 'textacy', "'s", 'sake', '.',
        'thank', 'God', 'for', 'Stack', 'Overflow', '.']
    assert [spacy_utils.normalized_str(tok) for tok in spacy_doc if not tok.is_space] == normalized_strs
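The expected values above hint at what normalized_str() does: most tokens are reduced to their lemmas (pronoun lemmas show up as spaCy's '-PRON-' placeholder), while proper nouns such as 'Python' and 'Overflow' keep their original text. A rough, illustrative sketch of that rule (not textacy's actual implementation):

def normalized_str_sketch(tok):
    # keep proper nouns (and acronym-like all-caps tokens) as-is; lemmatize the rest
    if tok.pos_ == 'PROPN' or (tok.is_upper and len(tok.text) > 1):
        return tok.text
    return tok.lemma_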
Example #4
def terms_to_semantic_network(terms,
                              window_width=10,
                              edge_weighting='cooc_freq'):
    """
    Convert an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with edges linking it to other terms
    that co-occur within ``window_width`` terms of itself.

    Args:
        terms (list(str) or list(``spacy.Token``))
        window_width (int, optional): size of sliding window over `terms` that
            determines which are said to co-occur; if = 2, only adjacent terms
            will have edges in network
        edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary',
            all co-occurring terms will have network edges with weight = 1;
            if 'cooc_freq', edges will have a weight equal to the number of times
            that the connected nodes co-occur in a sliding window

    Returns:
        :class:`networkx.Graph()`

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to normalize so that like terms
          are counted together (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    if window_width < 2:
        raise ValueError('Window width must be >= 2.')

    if isinstance(terms[0], str):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], spacy_token):
        windows = ((normalized_str(tok) for tok in window)
                   for window in itertoolz.sliding_window(window_width, terms))
    else:
        msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(type(terms[0]))
        raise TypeError(msg)

    graph = nx.Graph()

    if edge_weighting == 'cooc_freq':
        cooc_mat = defaultdict(lambda: defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from(
            (w1, w2, {'weight': cooc_mat[w1][w2]})
            for w1, w2s in cooc_mat.items() for w2 in w2s)

    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))

    return graph
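A minimal usage sketch with plain-string terms (assumes networkx, cytoolz's itertoolz, and the function above are importable; the terms list is illustrative):

terms = ['graph', 'node', 'edge', 'graph', 'weight', 'node', 'edge', 'graph']
graph = terms_to_semantic_network(terms, window_width=3, edge_weighting='cooc_freq')
print(graph.number_of_nodes(), graph.number_of_edges())
print(graph['graph']['node'])  # edge weight = number of windows containing both terms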
Example #5
def terms_to_semantic_network(terms,
                              window_width=10,
                              edge_weighting='cooc_freq'):
    """
    Convert an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with edges linking it to other terms
    that co-occur within ``window_width`` terms of itself.

    Args:
        terms (list(str) or list(``spacy.Token``))
        window_width (int, optional): size of sliding window over `terms` that
            determines which are said to co-occur; if = 2, only adjacent terms
            will have edges in network
        edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary',
            all co-occurring terms will have network edges with weight = 1;
            if 'cooc_freq', edges will have a weight equal to the number of times
            that the connected nodes co-occur in a sliding window

    Returns:
        :class:`networkx.Graph()`

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech, etc.
          from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be merged
          into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to normalize so that like terms
          are counted together (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    if window_width < 2:
        raise ValueError('Window width must be >= 2.')

    if isinstance(terms[0], str):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], spacy_token):
        windows = ((normalized_str(tok) for tok in window)
                   for window in itertoolz.sliding_window(window_width, terms))
    else:
        msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(
            type(terms[0]))
        raise TypeError(msg)

    graph = nx.Graph()

    if edge_weighting == 'cooc_freq':
        cooc_mat = defaultdict(lambda: defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from((w1, w2, {
            'weight': cooc_mat[w1][w2]
        }) for w1, w2s in cooc_mat.items() for w2 in w2s)

    elif edge_weighting == 'binary':
        graph.add_edges_from(w1_w2 for window in windows
                             for w1_w2 in itertools.combinations(window, 2))

    return graph
Example #6
    def test_normalize_str(self):
        normalized_strs = [
            'the', 'unit', 'test', 'be', 'not', 'go', 'well', '.', 'i', 'love',
            'Python', ',', 'but', 'i', 'do', 'not', 'love', 'some', 'of', 'Guido',
            "'s", 'decision', '.', 'no', 'computer', 'programmer', 'be', 'harm',
            'in', 'the', 'making', 'of', 'this', 'package', '.', 'thank', 'God',
            'for', 'Stack', 'Overflow', '.']
        self.assertEqual([spacy_utils.normalized_str(tok) for tok in self.spacy_doc if not tok.is_space],
                         normalized_strs)
Example #7
    def test_normalize_str(self):
        normalized_strs = [
            'the', 'unit', 'test', 'be', 'not', 'go', 'well', '.', '-PRON-', 'love',
            'Python', ',', 'but', '-PRON-', 'do', 'not', 'love', 'some', 'of', 'Guido',
            "'s", 'decision', '.', 'no', 'computer', 'programmer', 'be', 'harm',
            'in', 'the', 'making', 'of', 'this', 'package', '.', 'thank', 'God',
            'for', 'Stack', 'Overflow', '.']
        self.assertEqual([spacy_utils.normalized_str(tok) for tok in self.spacy_doc if not tok.is_space],
                         normalized_strs)
Example #8
def words(doc,
          filter_stops=True, filter_punct=True, filter_nums=False,
          good_pos_tags=None, bad_pos_tags=None, min_freq=1):
    """
    Extract an ordered sequence of words from a spacy-parsed doc, optionally
    filtering words by part-of-speech (etc.) and frequency.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``)
        filter_stops (bool, optional): if True, remove stop words from word list
        filter_punct (bool, optional): if True, remove punctuation from word list
        filter_nums (bool, optional): if True, remove number-like words
            (e.g. 10, 'ten') from word list
        good_pos_tags (set[str], optional): remove words whose part-of-speech tag
            is NOT in the specified tags, using the universal POS tag set
        bad_pos_tags (set[str], optional): remove words whose part-of-speech tag
            IS in the specified tags, using the universal POS tag set
        min_freq (int, optional): remove words that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
            in order of appearance in the document
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if good_pos_tags:
        words_ = (w for w in words_ if w.pos_ in good_pos_tags)
    if bad_pos_tags:
        words_ = (w for w in words_ if w.pos_ not in bad_pos_tags)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(normalized_str(w) for w in words_)
        words_ = (w for w in words_
                  if freqs[normalized_str(w)] >= min_freq)

    for word in words_:
        yield word
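A minimal usage sketch for the function above (assumes spaCy and an English model; the model name is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')  # model name is an assumption
doc = nlp('Ten quick brown foxes jump over the lazy dog.')

# keep only nouns and adjectives, with stop words and punctuation removed
for w in words(doc, filter_stops=True, filter_punct=True,
               good_pos_tags={'NOUN', 'ADJ'}):
    print(w.text, w.pos_)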
Example #9
    def term_count(self, term):
        """
        Get the number of occurrences ("count") of term in doc.

        Args:
            term (str or ``spacy.Token`` or ``spacy.Span``)

        Returns:
            int
        """
        # figure out what object we're dealing with here; convert as necessary
        if isinstance(term, unicode_type):
            term_text = term
            term_id = self.spacy_stringstore[term_text]
            term_len = term_text.count(' ') + 1
        elif isinstance(term, stoken):
            term_text = spacy_utils.normalized_str(term)
            term_id = self.spacy_stringstore[term_text]
            term_len = 1
        elif isinstance(term, sspan):
            term_text = spacy_utils.normalized_str(term)
            term_id = self.spacy_stringstore[term_text]
            term_len = len(term)

        term_count_ = self._term_counts[term_id]
        if term_count_ > 0:
            return term_count_
        # have we not already counted the appropriate `n` n-grams?
        if not any(self.spacy_stringstore[t].count(' ') == term_len
                   for t in self._term_counts):
            get_id = lambda x: self.spacy_stringstore[spacy_utils.normalized_str(x)]
            if term_len == 1:
                self._term_counts += Counter(get_id(w) for w in self.words())
            else:
                self._term_counts += Counter(
                    get_id(ng) for ng in self.ngrams(term_len))
            term_count_ = self._term_counts[term_id]
            if term_count_ > 0:
                return term_count_
        # last resort: try a regular expression
        return sum(1 for _ in re.finditer(re.escape(term_text), self.text))
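The method above keys its cached counts by string-store ids rather than raw strings, and falls back to a regex scan over the raw text when a term was never counted. A standalone sketch of the id-keyed counting idea using spaCy's vocabulary string store (illustrative only, not textacy's API):

from collections import Counter

import spacy

nlp = spacy.load('en_core_web_sm')  # model name is an assumption
doc = nlp('Cats chase cats; dogs chase cats.')

strings = nlp.vocab.strings  # maps strings <-> integer ids
term_counts = Counter(strings[tok.lemma_] for tok in doc if not tok.is_punct)
print(term_counts[strings['cat']])  # look up a count by the term's integer id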
Example #10
def sents_to_semantic_network(sents, edge_weighting='cosine'):
    """
    Convert a list of sentences into a semantic network, where each sentence is
    represented by a node with edges linking it to other sentences weighted by
    the (cosine or jaccard) similarity of their constituent words.

    Args:
        sents (list(str) or list(:class:`spacy.Span`))
        edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric
            to use for weighting edges between sentences; if 'cosine', use the
            cosine similarity between sentences represented as tf-idf word vectors;
            if 'jaccard', use the set intersection divided by the set union of
            all words in a given sentence pair

    Returns:
        :class:`networkx.Graph()`: nodes are the integer indexes of the sentences
            in the input ``sents`` list, *not* the actual text of the sentences!

    Notes:
        * If passing sentences as strings, be sure to filter out stopwords, punctuation,
          certain parts of speech, etc. beforehand
        * Consider normalizing the strings so that like terms are counted together
          (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    n_sents = len(sents)
    if isinstance(sents[0], str):
        pass
    elif isinstance(sents[0], spacy_span):
        sents = [
            ' '.join(
                normalized_str(tok)
                for tok in extract.words(sent,
                                         filter_stops=True,
                                         filter_punct=True,
                                         filter_nums=False)) for sent in sents
        ]
    else:
        msg = 'Input sents must be strings or spacy Spans, not {}.'.format(
            type(sents[0]))
        raise TypeError(msg)

    if edge_weighting == 'cosine':
        term_sent_matrix = TfidfVectorizer().fit_transform(sents)
    elif edge_weighting == 'jaccard':
        term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents)
    weights = (term_sent_matrix * term_sent_matrix.T).A.tolist()

    graph = nx.Graph()
    graph.add_edges_from((i, j, {
        'weight': weights[i][j]
    }) for i in range(n_sents) for j in range(i + 1, n_sents))

    return graph
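A minimal usage sketch with plain-string sentences (assumes scikit-learn's vectorizers, networkx, and the function above are importable; the sentences are illustrative and already reduced to content words):

sents = [
    'cats chase mice',
    'dogs chase cats',
    'mice eat cheese',
]
graph = sents_to_semantic_network(sents, edge_weighting='cosine')
# nodes are sentence indexes 0..2; edge weights are pairwise cosine similarities
print(list(graph.edges(data=True)))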
Example #11
    def term_count(self, term):
        """
        Get the number of occurrences ("count") of term in doc.

        Args:
            term (str or ``spacy.Token`` or ``spacy.Span``)

        Returns:
            int
        """
        # figure out what object we're dealing with here; convert as necessary
        if isinstance(term, str):
            term_text = term
            term_id = self.spacy_stringstore[term_text]
            term_len = term_text.count(' ') + 1
        elif isinstance(term, stoken):
            term_text = spacy_utils.normalized_str(term)
            term_id = self.spacy_stringstore[term_text]
            term_len = 1
        elif isinstance(term, sspan):
            term_text = spacy_utils.normalized_str(term)
            term_id = self.spacy_stringstore[term_text]
            term_len = len(term)

        term_count_ = self._term_counts[term_id]
        if term_count_ > 0:
            return term_count_
        # have we not already counted the appropriate `n` n-grams?
        if not any(self.spacy_stringstore[t].count(' ') == term_len
                   for t in self._term_counts):
            get_id = lambda x: self.spacy_stringstore[spacy_utils.normalized_str(x)]
            if term_len == 1:
                self._term_counts += Counter(get_id(w) for w in self.words())
            else:
                self._term_counts += Counter(get_id(ng) for ng in self.ngrams(term_len))
            term_count_ = self._term_counts[term_id]
            if term_count_ > 0:
                return term_count_
        # last resort: try a regular expression
        return sum(1 for _ in re.finditer(re.escape(term_text), self.text))
Example #12
def sents_to_semantic_network(sents,
                              edge_weighting='cosine'):
    """
    Convert a list of sentences into a semantic network, where each sentence is
    represented by a node with edges linking it to other sentences weighted by
    the (cosine or jaccard) similarity of their constituent words.

    Args:
        sents (list(str) or list(:class:`spacy.Span`))
        edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric
            to use for weighting edges between sentences; if 'cosine', use the
            cosine similarity between sentences represented as tf-idf word vectors;
            if 'jaccard', use the set intersection divided by the set union of
            all words in a given sentence pair

    Returns:
        :class:`networkx.Graph`: nodes are the integer indexes of the sentences
            in the input ``sents`` list, *not* the actual text of the sentences!

    Notes:
        * If passing sentences as strings, be sure to filter out stopwords, punctuation,
          certain parts of speech, etc. beforehand
        * Consider normalizing the strings so that like terms are counted together
          (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    n_sents = len(sents)
    if isinstance(sents[0], unicode_type):
        pass
    elif isinstance(sents[0], SpacySpan):
        sents = [' '.join(normalized_str(tok) for tok in
                          extract.words(sent, filter_stops=True, filter_punct=True, filter_nums=False))
                 for sent in sents]
    else:
        msg = 'Input sents must be strings or spacy Spans, not {}.'.format(type(sents[0]))
        raise TypeError(msg)

    if edge_weighting == 'cosine':
        term_sent_matrix = TfidfVectorizer().fit_transform(sents)
    elif edge_weighting == 'jaccard':
        term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents)
    weights = (term_sent_matrix * term_sent_matrix.T).A.tolist()

    graph = nx.Graph()
    graph.add_edges_from(
        (i, j, {'weight': weights[i][j]})
        for i in range(n_sents) for j in range(i + 1, n_sents))

    return graph
Example #13
    def term_counts(self, lemmatize='auto', ngram_range=(1, 1),
                    include_nes=False, include_ncs=False, include_kts=False):
        """
        Get the number of occurrences ("counts") of each unique term in doc;
        terms may be words, n-grams, named entities, noun phrases, and key terms.

        Args:
            lemmatize (bool or 'auto', optional): if True, lemmatize all terms
                when getting their frequencies; if 'auto', lemmatize all terms
                that aren't proper nouns or acronyms
            ngram_range (tuple(int), optional): (min n, max n) values for n-grams
                to include in terms list; default (1, 1) only includes unigrams
            include_nes (bool, optional): if True, include named entities in terms list
            include_ncs (bool, optional): if True, include noun chunks in terms list
            include_kts (bool, optional): if True, include key terms in terms list

        Returns:
            :class:`collections.Counter() <collections.Counter>`: mapping of unique
                term ids to corresponding term counts
        """
        if lemmatize == 'auto':
            get_id = lambda x: self.spacy_stringstore[spacy_utils.normalized_str(x)]
        elif lemmatize is True:
            get_id = lambda x: self.spacy_stringstore[x.lemma_]
        else:
            get_id = lambda x: self.spacy_stringstore[x.text]

        for n in range(ngram_range[0], ngram_range[1] + 1):
            if n == 1:
                self._term_counts = self._term_counts | Counter(
                    get_id(word) for word in self.words())
            else:
                self._term_counts = self._term_counts | Counter(
                    get_id(ngram) for ngram in self.ngrams(n))
        if include_nes is True:
            self._term_counts = self._term_counts | Counter(
                get_id(ne) for ne in self.named_entities())
        if include_ncs is True:
            self._term_counts = self._term_counts | Counter(
                get_id(nc) for nc in self.noun_chunks())
        if include_kts is True:
            # HACK: key terms are currently returned as strings
            # TODO: cache key terms, and return them as spacy spans
            get_id = lambda x: self.spacy_stringstore[x]
            self._term_counts = self._term_counts | Counter(
                get_id(kt) for kt, _ in self.key_terms())

        return self._term_counts
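Note that the | operator on two collections.Counter objects takes the element-wise maximum rather than summing, so re-running this method with the same options does not inflate counts that were already cached. A small standalone illustration:

from collections import Counter

cached = Counter({'cat': 3, 'dog': 1})
fresh = Counter({'cat': 3, 'fish': 2})
cached = cached | fresh  # element-wise max, not addition
print(cached)            # Counter({'cat': 3, 'fish': 2, 'dog': 1})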
Example #14
def test_normalize_str(spacy_doc):
    normalized_strs = [
        "the",
        "unit",
        "test",
        "be",
        "not",
        "go",
        "well",
        ".",
        "-PRON-",
        "love",
        "Python",
        ",",
        "but",
        "-PRON-",
        "do",
        "not",
        "love",
        "backwards",
        "incompatibility",
        ".",
        "no",
        "programmer",
        "be",
        "permanently",
        "damage",
        "for",
        "textacy",
        "'s",
        "sake",
        ".",
        "thank",
        "God",
        "for",
        "Stack",
        "Overflow",
        ".",
    ]
    assert [
        spacy_utils.normalized_str(tok) for tok in spacy_doc if not tok.is_space
    ] == normalized_strs
Example #15
def words(doc,
          filter_stops=True, filter_punct=True, filter_nums=False,
          include_pos=None, exclude_pos=None, min_freq=1):
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove number-like words (e.g. 10, 'ten')
            from word list
        include_pos (str or Set[str]): remove words whose part-of-speech tag
            IS NOT included in this param
        exclude_pos (str or Set[str]): remove words whose part-of-speech tag
            IS in the specified tags
        min_freq (int): remove words that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
            in order of appearance in the document

    Raises:
        TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str,
            or a falsy value

    .. note:: Filtering by part-of-speech tag uses the universal POS tag set,
        http://universaldependencies.org/u/pos/
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        if isinstance(include_pos, unicode_):
            include_pos = include_pos.upper()
            words_ = (w for w in words_ if w.pos_ == include_pos)
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            words_ = (w for w in words_ if w.pos_ in include_pos)
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, unicode_):
            exclude_pos = exclude_pos.upper()
            words_ = (w for w in words_ if w.pos_ != exclude_pos)
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            words_ = (w for w in words_ if w.pos_ not in exclude_pos)
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(normalized_str(w) for w in words_)
        words_ = (w for w in words_
                  if freqs[normalized_str(w)] >= min_freq)

    for word in words_:
        yield word
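A minimal usage sketch for this variant; include_pos and exclude_pos accept either a single tag or a set-like collection of tags (the model name below is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')  # model name is an assumption
doc = nlp('Ten quick brown foxes jump over the lazy dog.')

nouns = [w.text for w in words(doc, include_pos='NOUN')]
content = [w.text for w in words(doc, exclude_pos={'DET', 'ADP', 'PUNCT'})]
print(nouns, content)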
Example #16
def ngrams(doc, n,
           filter_stops=True, filter_punct=True, filter_nums=False,
           good_pos_tags=None, bad_pos_tags=None, min_freq=1):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a spacy-parsed
    doc, optionally filtering n-grams by the types and parts-of-speech of the
    constituent words.

    Args:
        doc (``spacy.Doc`` or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 gives bigrams, 3 gives trigrams, etc.
        filter_stops (bool, optional): if True, remove ngrams that start or end
            with a stop word
        filter_punct (bool, optional): if True, remove ngrams that contain
            any punctuation-only tokens
        filter_nums (bool, optional): if True, remove ngrams that contain
            any numbers or number-like tokens (e.g. 10, 'ten')
        good_pos_tags (set[str], optional): remove ngrams whose constituent
            tokens' part-of-speech tags are NOT all in the specified tags,
            using the universal POS tagset
        bad_pos_tags (set[str], optional): remove ngrams if any of their constituent
            tokens' part-of-speech tags are in the specified tags,
            using the universal POS tagset
        min_freq (int, optional): remove ngrams that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified filters,
            in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
    """
    if n < 1:
        raise ValueError('n must be greater than or equal to 1')

    ngrams_ = (doc[i: i + n]
               for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if good_pos_tags:
        ngrams_ = (ngram for ngram in ngrams_
                   if all(w.pos_ in good_pos_tags for w in ngram))
    if bad_pos_tags:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.pos_ in bad_pos_tags for w in ngram))
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[normalized_str(ngram)] >= min_freq)

    for ngram in ngrams_:
        yield ngram
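A minimal usage sketch (assumes spaCy and an English model; the model name is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')  # model name is an assumption
doc = nlp('Natural language processing makes unstructured text usable.')

# bigrams whose tokens are all nouns or adjectives
for bg in ngrams(doc, 2, good_pos_tags={'NOUN', 'ADJ'}):
    print(bg.text)  # e.g. "language processing", "unstructured text"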
Example #17
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None):
    """
    Extract key terms from a document using the [SGRank]_ algorithm.

    Args:
        doc (``spacy.Doc``)
        window_width (int, optional): width of sliding window in which term
            co-occurrences are said to occur
        n_keyterms (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the interval (0.0, 1.0],
            representing the fraction of top-ranked terms to return as keyterms
        idf (dict, optional): mapping of
            {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency}
            for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1);
            NOTE: results are better with idf information

    Returns:
        list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their
            corresponding SGRank scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]

    References:
        .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and
           Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase
           Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117.
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                '`n_keyterms` must be an int, or a float between 0.0 and 1.0')
    n_toks = len(doc)
    min_term_freq = min(n_toks // 1500, 4)

    # build full list of candidate terms
    terms = list(
        itertoolz.concat(
            extract.ngrams(doc,
                           n,
                           filter_stops=True,
                           filter_punct=True,
                           filter_nums=False,
                           good_pos_tags={'NOUN', 'ADJ'},
                           min_freq=min_term_freq) for n in range(1, 7)))
    # if inverse document frequencies available, also add verbs
    # verbs without IDF downweighting dominate the results, and not in a good way
    if idf:
        terms.extend(
            itertoolz.concat(
                extract.ngrams(doc,
                               n,
                               filter_stops=True,
                               filter_punct=True,
                               filter_nums=False,
                               good_pos_tags={'VERB'},
                               min_freq=min_term_freq) for n in range(1, 7)))

    terms_as_strs = {
        id(term): spacy_utils.normalized_str(term)
        for term in terms
    }

    # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available
    n_top_20pct = int(len(terms) * 0.2)
    term_counts = Counter(terms_as_strs[id(term)] for term in terms)
    if idf:
        mod_tfidfs = {
            term: count * idf[term] if ' ' not in term else count
            for term, count in term_counts.items()
        }
        top_term_texts = {
            term
            for term, _ in sorted(mod_tfidfs.items(),
                                  key=itemgetter(1),
                                  reverse=True)[:n_top_20pct]
        }
    else:
        top_term_texts = {
            term
            for term, _ in term_counts.most_common(n_top_20pct)
        }

    terms = [
        term for term in terms if terms_as_strs[id(term)] in top_term_texts
    ]

    # compute term weights from statistical attributes
    term_weights = {}
    set_terms_as_str = {terms_as_strs[id(term)] for term in terms}
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_str = terms_as_strs[id(term)]
        pos_first_occ_factor = log(n_toks_plus_1 / (term.start + 1))
        # TODO: assess if len(t) puts too much emphasis on long terms
        # alternative: term_len = 1 if ' ' not in term else sqrt(len(term))
        term_len = 1 if ' ' not in term else len(term)
        term_count = term_counts[term_str]
        subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str
                           if t2 != term_str and term_str in t2)
        term_freq_factor = (term_count - subsum_count)
        if idf and ' ' not in term_str:
            term_freq_factor *= idf[term_str]
        term_weights[term_str] = term_freq_factor * pos_first_occ_factor * term_len

    # filter terms to only those with positive weights
    terms = [
        term for term in terms if term_weights[terms_as_strs[id(term)]] > 0
    ]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term.start <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            if t1 is t2:
                continue
            n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1
            try:
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    log(window_width / abs(t1.start - t2.start))
            except ZeroDivisionError:  # HACK: pretend that they're 1 token apart
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    log(window_width)
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = ((sum_logdists[t1][t2] / n_coocs[t1][t2])
                                    * term_weights[t1] * term_weights[t2])
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {
            'weight': weight / sum_edge_weights
        }) for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    if isinstance(n_keyterms, float):
        n_keyterms = int(len(term_ranks) * n_keyterms)

    return sorted(term_ranks.items(), key=itemgetter(1),
                  reverse=True)[:n_keyterms]
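A minimal usage sketch (assumes spaCy, networkx, cytoolz, and the function above with its extract and spacy_utils helpers are importable; 'article.txt' is a placeholder path for any reasonably long plain-text document, and the model name is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')  # model name is an assumption
with open('article.txt') as f:      # placeholder path
    doc = nlp(f.read())

for term, score in sgrank(doc, n_keyterms=5):
    print(term, round(score, 4))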
Example #18
def key_terms_from_semantic_network(doc,
                                    window_width=2,
                                    edge_weighting='binary',
                                    ranking_algo='pagerank',
                                    join_key_words=False,
                                    n_keyterms=10,
                                    **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, connected by edges and weights specified by parameters.

    Args:
        doc (``spacy.Doc``):
        window_width (int, optional): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting (str {'binary', 'cooc_freq'}, optional): method used to
            determine weights of edges between nodes in the semantic network;
            if 'binary', edge weight is set to 1 for any two terms co-occurring
            within `window_width` terms; if 'cooc_freq', edge weight is set to
            the number of times that any two terms co-occur
        ranking_algo (str {'pagerank', 'divrank', 'bestcoverage'}, optional):
            algorithm with which to rank nodes in the semantic network;
            `pagerank` is the canonical (and default) algorithm, but it prioritizes
            node centrality at the expense of node diversity; the other two
            attempt to balance centrality with diversity
        join_key_words (bool, optional): if True, join consecutive key words
            together into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score
        n_keyterms (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the interval (0.0, 1.0],
            representing the fraction of top-ranked terms to return as keyterms

    Returns:
        list((str, float)): sorted list of top ``n_keyterms`` key terms and their
            corresponding ranking scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]
    """
    word_list = [spacy_utils.normalized_str(word) for word in doc]
    good_word_list = [
        spacy_utils.normalized_str(word) for word in doc if not word.is_stop
        and not word.is_punct and word.pos_ in {'NOUN', 'ADJ'}
    ]

    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                '`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(n_keyterms * len(set(good_word_list)))

    graph = terms_to_semantic_network(good_word_list,
                                      window_width=window_width,
                                      edge_weighting=edge_weighting)

    # rank nodes by algorithm, and sort in descending order
    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(graph,
                                           r=None,
                                           lambda_=kwargs.get('lambda_', 0.5),
                                           alpha=kwargs.get('alpha', 0.5))
    elif ranking_algo == 'bestcoverage':
        word_ranks = rank_nodes_by_bestcoverage(graph,
                                                k=n_keyterms,
                                                c=kwargs.get('c', 1),
                                                alpha=kwargs.get('alpha', 1.0))

    # bail out here if all we wanted was key *words* and not *terms*
    if join_key_words is False:
        return [(word, score) for word, score in sorted(
            word_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]]

    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = {
        word: rank
        for word, rank in sorted(
            word_ranks.items(), key=itemgetter(1), reverse=True)[:top_n]
    }

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for key, group in itertools.groupby(word_list,
                                        lambda word: word in top_word_ranks):
        if key is True:
            words = list(group)
            term = ' '.join(words)
            if term in seen_joined_key_terms:
                continue
            seen_joined_key_terms.add(term)
            joined_key_terms.append(
                (term, sum(word_ranks[word] for word in words)))

    return sorted(joined_key_terms, key=itemgetter(1),
                  reverse=True)[:n_keyterms]
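A minimal usage sketch (assumes the function above plus its helpers -- terms_to_semantic_network, the divrank/bestcoverage rankers, spacy_utils, networkx -- are importable; 'article.txt' is a placeholder path and the model name is an assumption):

import spacy

nlp = spacy.load('en_core_web_sm')  # model name is an assumption
with open('article.txt') as f:      # placeholder path
    doc = nlp(f.read())

keyterms = key_terms_from_semantic_network(
    doc, window_width=2, edge_weighting='binary',
    ranking_algo='pagerank', join_key_words=True, n_keyterms=10)
print(keyterms)  # [(term, score), ...] sorted by descending score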
Example #19
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None):
    """
    Extract key terms from a document using the [SGRank]_ algorithm.

    Args:
        doc (``spacy.Doc``)
        window_width (int, optional): width of sliding window in which term
            co-occurrences are said to occur
        n_keyterms (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the interval (0.0, 1.0],
            representing the fraction of top-ranked terms to return as keyterms
        idf (dict, optional): mapping of
            {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency}
            for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1);
            NOTE: results are better with idf information

    Returns:
        list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their
            corresponding SGRank scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]

    References:
        .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and
           Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase
           Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117.
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
    n_toks = len(doc)
    min_term_freq = min(n_toks // 1500, 4)

    # build full list of candidate terms
    terms = list(itertoolz.concat(
        extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                       good_pos_tags={'NOUN', 'ADJ'}, min_freq=min_term_freq)
        for n in range(1, 7)))
    # if inverse document frequencies available, also add verbs
    # verbs without IDF downweighting dominate the results, and not in a good way
    if idf:
        terms.extend(itertoolz.concat(
            extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False,
                           good_pos_tags={'VERB'}, min_freq=min_term_freq)
            for n in range(1, 7)))

    terms_as_strs = {id(term): spacy_utils.normalized_str(term)
                     for term in terms}

    # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available
    n_top_20pct = int(len(terms) * 0.2)
    term_counts = Counter(terms_as_strs[id(term)] for term in terms)
    if idf:
        mod_tfidfs = {term: count * idf[term] if ' ' not in term else count
                      for term, count in term_counts.items()}
        top_term_texts = {term for term, _ in sorted(
            mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct]}
    else:
        top_term_texts = {term for term, _ in term_counts.most_common(n_top_20pct)}

    terms = [term for term in terms
             if terms_as_strs[id(term)] in top_term_texts]

    # compute term weights from statistical attributes
    term_weights = {}
    set_terms_as_str = {terms_as_strs[id(term)] for term in terms}
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_str = terms_as_strs[id(term)]
        pos_first_occ_factor = math.log(n_toks_plus_1 / (term.start + 1))
        # TODO: assess if len(t) puts too much emphasis on long terms
        # alternative: term_len = 1 if ' ' not in term else math.sqrt(len(term))
        term_len = 1 if ' ' not in term else len(term)
        term_count = term_counts[term_str]
        subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str
                           if t2 != term_str and term_str in t2)
        term_freq_factor = (term_count - subsum_count)
        if idf and ' ' not in term_str:
            term_freq_factor *= idf[term_str]
        term_weights[term_str] = term_freq_factor * pos_first_occ_factor * term_len

    # filter terms to only those with positive weights
    terms = [term for term in terms
             if term_weights[terms_as_strs[id(term)]] > 0]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term.start <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            if t1 is t2:
                continue
            n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1
            try:
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    math.log(window_width / abs(t1.start - t2.start))
            except ZeroDivisionError:  # HACK: pretend that they're 1 token apart
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    math.log(window_width)
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2]
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {'weight': weight / sum_edge_weights})
                                 for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    if isinstance(n_keyterms, float):
        n_keyterms = int(len(term_ranks) * n_keyterms)

    return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]
Example #20
def ngrams(doc, n,
           filter_stops=True, filter_punct=True, filter_nums=False,
           include_pos=None, exclude_pos=None, min_freq=1):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc.
        filter_stops (bool): if True, remove ngrams that start or end
            with a stop word
        filter_punct (bool): if True, remove ngrams that contain
            any punctuation-only tokens
        filter_nums (bool): if True, remove ngrams that contain
            any numbers or number-like tokens (e.g. 10, 'ten')
        include_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE NOT included in this param
        exclude_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE included in this param
        min_freq (int, optional): remove ngrams that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified
            filters, in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
        TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str,
            or a falsy value

    .. note:: Filtering by part-of-speech tag uses the universal POS tag set,
        http://universaldependencies.org/u/pos/
    """
    if n < 1:
        raise ValueError('n must be greater than or equal to 1')

    ngrams_ = (doc[i: i + n]
               for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if include_pos:
        if isinstance(include_pos, unicode_):
            include_pos = include_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ == include_pos for w in ngram))
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ in include_pos for w in ngram))
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, unicode_):
            exclude_pos = exclude_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ != exclude_pos for w in ngram))
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ not in exclude_pos for w in ngram))
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[normalized_str(ngram)] >= min_freq)

    for ngram in ngrams_:
        yield ngram
Example #21
def key_terms_from_semantic_network(doc, window_width=2, edge_weighting='binary',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10, **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, connected by edges and weights specified by parameters.

    Args:
        doc (``spacy.Doc``):
        window_width (int, optional): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting (str {'binary', 'cooc_freq'}, optional): method used to
            determine weights of edges between nodes in the semantic network;
            if 'binary', edge weight is set to 1 for any two terms co-occurring
            within `window_width` terms; if 'cooc_freq', edge weight is set to
            the number of times that any two terms co-occur
        ranking_algo (str {'pagerank', 'divrank', 'bestcoverage'}, optional):
            algorithm with which to rank nodes in the semantic network;
            `pagerank` is the canonical (and default) algorithm, but it prioritizes
            node centrality at the expense of node diversity; the other two
            attempt to balance centrality with diversity
        join_key_words (bool, optional): if True, join consecutive key words
            together into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score
        n_keyterms (int or float, optional): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the interval (0.0, 1.0],
            representing the fraction of top-ranked terms to return as keyterms

    Returns:
        list((str, float)): sorted list of top ``n_keyterms`` key terms and their
            corresponding ranking scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]
    """
    word_list = [spacy_utils.normalized_str(word) for word in doc]
    good_word_list = [spacy_utils.normalized_str(word)
                      for word in doc
                      if not word.is_stop and not word.is_punct and word.pos_ in {'NOUN', 'ADJ'}]

    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(n_keyterms * len(set(good_word_list)))

    graph = terms_to_semantic_network(
        good_word_list, window_width=window_width, edge_weighting=edge_weighting)

    # rank nodes by algorithm, and sort in descending order
    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(
            graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5))
    elif ranking_algo == 'bestcoverage':
        word_ranks = rank_nodes_by_bestcoverage(
            graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0))

    # bail out here if all we wanted was key *words* and not *terms*
    if join_key_words is False:
        return [(word, score) for word, score in
                sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]]

    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = {word: rank for word, rank in
                      sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:top_n]}

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks):
        if key is True:
            words = list(group)
            term = ' '.join(words)
            if term in seen_joined_key_terms:
                continue
            seen_joined_key_terms.add(term)
            joined_key_terms.append((term, sum(word_ranks[word] for word in words)))

    return sorted(joined_key_terms, key=itemgetter(1), reverse=True)[:n_keyterms]
Example #22
def words(doc,
          filter_stops=True, filter_punct=True, filter_nums=False,
          include_pos=None, exclude_pos=None, min_freq=1):
    """
    Extract an ordered sequence of words from a document processed by spaCy,
    optionally filtering words by part-of-speech tag and frequency.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        filter_stops (bool): if True, remove stop words from word list
        filter_punct (bool): if True, remove punctuation from word list
        filter_nums (bool): if True, remove number-like words (e.g. 10, 'ten')
            from word list
        include_pos (str or Set[str]): remove words whose part-of-speech tag
            IS NOT included in this param
        exclude_pos (str or Set[str]): remove words whose part-of-speech tag
            IS in the specified tags
        min_freq (int): remove words that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Token``: the next token from ``doc`` passing specified filters
            in order of appearance in the document

    Raises:
        TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str,
            or a falsy value

    .. note:: Filtering by part-of-speech tag uses the universal POS tag set,
        http://universaldependencies.org/u/pos/
    """
    words_ = (w for w in doc if not w.is_space)
    if filter_stops is True:
        words_ = (w for w in words_ if not w.is_stop)
    if filter_punct is True:
        words_ = (w for w in words_ if not w.is_punct)
    if filter_nums is True:
        words_ = (w for w in words_ if not w.like_num)
    if include_pos:
        if isinstance(include_pos, unicode_type):
            include_pos = include_pos.upper()
            words_ = (w for w in words_ if w.pos_ == include_pos)
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            words_ = (w for w in words_ if w.pos_ in include_pos)
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, unicode_type):
            exclude_pos = exclude_pos.upper()
            words_ = (w for w in words_ if w.pos_ != exclude_pos)
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            words_ = (w for w in words_ if w.pos_ not in exclude_pos)
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        words_ = list(words_)
        freqs = itertoolz.frequencies(normalized_str(w) for w in words_)
        words_ = (w for w in words_
                  if freqs[normalized_str(w)] >= min_freq)

    for word in words_:
        yield word
Example #23
def ngrams(doc, n,
           filter_stops=True, filter_punct=True, filter_nums=False,
           include_pos=None, exclude_pos=None, min_freq=1):
    """
    Extract an ordered sequence of n-grams (``n`` consecutive words) from a
    spacy-parsed doc, optionally filtering n-grams by the types and
    parts-of-speech of the constituent words.

    Args:
        doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``)
        n (int): number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc.
        filter_stops (bool): if True, remove ngrams that start or end
            with a stop word
        filter_punct (bool): if True, remove ngrams that contain
            any punctuation-only tokens
        filter_nums (bool): if True, remove ngrams that contain
            any numbers or number-like tokens (e.g. 10, 'ten')
        include_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE NOT included in this param
        exclude_pos (str or Set[str]): remove ngrams if any of their constituent
            tokens' part-of-speech tags ARE included in this param
        min_freq (int): remove ngrams that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next ngram from ``doc`` passing all specified
            filters, in order of appearance in the document

    Raises:
        ValueError: if ``n`` < 1
        TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str,
            or a falsy value

    .. note:: Filtering by part-of-speech tag uses the universal POS tag set,
        http://universaldependencies.org/u/pos/
    """
    if n < 1:
        raise ValueError('n must be greater than or equal to 1')

    ngrams_ = (doc[i: i + n]
               for i in range(len(doc) - n + 1))
    ngrams_ = (ngram for ngram in ngrams_
               if not any(w.is_space for w in ngram))
    if filter_stops is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not ngram[0].is_stop and not ngram[-1].is_stop)
    if filter_punct is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.is_punct for w in ngram))
    if filter_nums is True:
        ngrams_ = (ngram for ngram in ngrams_
                   if not any(w.like_num for w in ngram))
    if include_pos:
        if isinstance(include_pos, unicode_type):
            include_pos = include_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ == include_pos for w in ngram))
        elif isinstance(include_pos, (set, frozenset, list, tuple)):
            include_pos = {pos.upper() for pos in include_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ in include_pos for w in ngram))
        else:
            msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos))
            raise TypeError(msg)
    if exclude_pos:
        if isinstance(exclude_pos, unicode_type):
            exclude_pos = exclude_pos.upper()
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ != exclude_pos for w in ngram))
        elif isinstance(exclude_pos, (set, frozenset, list, tuple)):
            exclude_pos = {pos.upper() for pos in exclude_pos}
            ngrams_ = (ngram for ngram in ngrams_
                       if all(w.pos_ not in exclude_pos for w in ngram))
        else:
            msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos))
            raise TypeError(msg)
    if min_freq > 1:
        ngrams_ = list(ngrams_)
        freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_)
        ngrams_ = (ngram for ngram in ngrams_
                   if freqs[normalized_str(ngram)] >= min_freq)

    for ngram in ngrams_:
        yield ngram
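
A matching usage sketch for `ngrams()` above, under the same assumptions as before (an installed spaCy English model and the snippet's module-level names such as `unicode_type` being available); the model name and sample sentence are illustrative only.

# usage sketch: bigrams whose tokens are all nouns or adjectives
import spacy

nlp = spacy.load('en_core_web_sm')   # assumed model name
doc = nlp('Sparse matrix factorization remains a classic dimensionality reduction technique.')

for span in ngrams(doc, 2, filter_stops=True, filter_punct=True,
                   include_pos={'NOUN', 'ADJ'}):
    print(span.text)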
Example #24
0
def mySgRank(doc, window_width=1500, n_keyterms=10, idf=None):
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                '`n_keyterms` must be an int, or a float between 0.0 and 1.0')
    n_toks = len(doc)
    min_term_freq = min(n_toks // 1500, 4)

    # build full list of candidate terms
    terms = list(
        itertoolz.concat(
            extract.ngrams(doc,
                           n,
                           filter_stops=True,
                           filter_punct=True,
                           filter_nums=False,
                           min_freq=min_term_freq) for n in range(1, 7)))
    # if inverse document frequencies are available, also add verb candidates
    # (restricted via `include_pos`); without IDF downweighting, verbs dominate
    # the results, and not in a good way
    if idf:
        terms.extend(
            itertoolz.concat(
                extract.ngrams(doc,
                               n,
                               filter_stops=True,
                               filter_punct=True,
                               filter_nums=False,
                               include_pos='VERB',
                               min_freq=min_term_freq) for n in range(1, 7)))

    terms_as_strs = {
        id(term): spacy_utils.normalized_str(term)
        for term in terms
    }

    # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available
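    # (below, single-word terms are ranked by count * IDF while multi-word
    # terms fall back to their raw counts)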
    n_top_20pct = int(len(terms) * 0.2)
    term_counts = Counter(terms_as_strs[id(term)] for term in terms)
    if idf:
        mod_tfidfs = {
            term: count * idf[term] if ' ' not in term else count
            for term, count in term_counts.items()
        }
        top_term_texts = {
            term
            for term, _ in sorted(mod_tfidfs.items(),
                                  key=itemgetter(1),
                                  reverse=True)[:n_top_20pct]
        }
    else:
        top_term_texts = {
            term
            for term, _ in term_counts.most_common(n_top_20pct)
        }

    terms = [
        term for term in terms if terms_as_strs[id(term)] in top_term_texts
    ]

    # compute term weights from statistical attributes
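    # for each candidate term t, the loop below computes
    #   freq_factor  = count(t) - total count of other candidates that contain t
    #                  (multiplied by idf(t) for single-word terms, if idf is given)
    #   pos_factor   = log((n_toks + 1) / (t.start + 1)), favoring earlier terms
    #   len_factor   = 1 for single-word terms, else the term's token count
    #   weight(t)    = freq_factor * pos_factor * len_factor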
    term_weights = {}
    set_terms_as_str = {terms_as_strs[id(term)] for term in terms}
    n_toks_plus_1 = n_toks + 1
    for term in terms:
        term_str = terms_as_strs[id(term)]
        pos_first_occ_factor = math.log(n_toks_plus_1 / (term.start + 1))
        # TODO: assess if len(t) puts too much emphasis on long terms
        # alternative: term_len = 1 if ' ' not in term_str else math.sqrt(len(term))
        term_len = 1 if ' ' not in term_str else len(term)
        term_count = term_counts[term_str]
        subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str
                           if t2 != term_str and term_str in t2)
        term_freq_factor = (term_count - subsum_count)
        if idf and ' ' not in term_str:
            term_freq_factor *= idf[term_str]
        term_weights[term_str] = (
            term_freq_factor * pos_first_occ_factor * term_len)

    # filter terms to only those with positive weights
    terms = [
        term for term in terms if term_weights[terms_as_strs[id(term)]] > 0
    ]

    n_coocs = defaultdict(lambda: defaultdict(int))
    sum_logdists = defaultdict(lambda: defaultdict(float))

    # iterate over windows
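    # within each window, every pair of co-occurring candidate terms accumulates
    # a closeness score log(window_width / token_distance), so pairs that occur
    # closer together contribute more per co-occurrence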
    for start_ind in range(n_toks):
        end_ind = start_ind + window_width
        window_terms = (term for term in terms
                        if start_ind <= term.start <= end_ind)
        # get all token combinations within window
        for t1, t2 in itertools.combinations(window_terms, 2):
            if t1 is t2:
                continue
            n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1
            try:
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    math.log(window_width / abs(t1.start - t2.start))
            except ZeroDivisionError:  # HACK: pretend that they're 1 token apart
                sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \
                    math.log(window_width)
        if end_ind > n_toks:
            break

    # compute edge weights between co-occurring terms (nodes)
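    # edge(t1 -> t2) = (sum_logdists[t1][t2] / n_coocs[t1][t2])
    #                  * term_weights[t1] * term_weights[t2]
    # i.e. the average closeness score over all co-occurrences, scaled by the
    # weights of both endpoint terms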
    edge_weights = defaultdict(lambda: defaultdict(float))
    for t1, t2s in sum_logdists.items():
        for t2 in t2s:
            edge_weights[t1][t2] = (
                (sum_logdists[t1][t2] / n_coocs[t1][t2])
                * term_weights[t1] * term_weights[t2])
    # normalize edge weights by sum of outgoing edge weights per term (node)
    norm_edge_weights = []
    for t1, t2s in edge_weights.items():
        sum_edge_weights = sum(t2s.values())
        norm_edge_weights.extend((t1, t2, {
            'weight': weight / sum_edge_weights
        }) for t2, weight in t2s.items())

    # build the weighted directed graph from edges, rank nodes by pagerank
    graph = nx.DiGraph()
    graph.add_edges_from(norm_edge_weights)
    term_ranks = nx.pagerank_scipy(graph)

    if isinstance(n_keyterms, float):
        n_keyterms = int(len(term_ranks) * n_keyterms)

    return sorted(term_ranks.items(), key=itemgetter(1),
                  reverse=True)[:n_keyterms]
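
A usage sketch for `mySgRank()` above. It assumes the names referenced in the body (`extract`, `spacy_utils`, `itertoolz`, `itertools`, `math`, `Counter`, `defaultdict`, `itemgetter`, `nx`) are imported as in the original module and that a spaCy English model is installed; `article.txt` is a hypothetical input file.

# usage sketch: top-10 key terms as (term, score) pairs, highest-scoring first
import spacy

nlp = spacy.load('en_core_web_sm')   # assumed model name
with open('article.txt') as f:       # hypothetical input file
    doc = nlp(f.read())

for term, score in mySgRank(doc, window_width=1500, n_keyterms=10):
    print('{:.4f}\t{}'.format(score, term))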