def noun_chunks(doc, drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc,
    optionally filtering by frequency and dropping leading determiners.

    Args:
        doc (``spacy.Doc``)
        drop_determiners (bool, optional): remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq (int, optional): remove chunks that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next noun chunk from ``doc`` in order of appearance
        in the document
    """
    ncs = doc.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:] for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(normalized_str(nc) for nc in ncs)
        ncs = (nc for nc in ncs if freqs[normalized_str(nc)] >= min_freq)
    for nc in ncs:
        yield nc
def noun_chunks(doc, drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc,
    optionally filtering by frequency and dropping leading determiners.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        drop_determiners (bool): remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq (int): remove chunks that occur in `doc` fewer than
            `min_freq` times

    Yields:
        ``spacy.Span``: the next noun chunk from ``doc`` in order of appearance
        in the document
    """
    if isinstance(doc, textacy.Doc):
        ncs = doc.spacy_doc.noun_chunks
    else:
        ncs = doc.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:] for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(normalized_str(nc) for nc in ncs)
        ncs = (nc for nc in ncs if freqs[normalized_str(nc)] >= min_freq)
    for nc in ncs:
        yield nc
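# --- Usage sketch (illustrative, not part of the library source): calling the
# noun_chunks() extractor above on a plain spaCy doc. The "en_core_web_sm"
# model name and sample text are assumptions; DET, itertoolz, and
# normalized_str are assumed to be imported at module level as in the code above.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('The quick brown fox jumps over the lazy dog. The dog sleeps.')

# leading determiners are dropped, e.g. "the lazy dog" -> "lazy dog"
for nc in noun_chunks(doc, drop_determiners=True):
    print(nc.text)

# only chunks whose normalized form occurs at least twice survive min_freq=2
for nc in noun_chunks(doc, drop_determiners=True, min_freq=2):
    print(nc.text)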
def test_normalize_str(spacy_doc):
    normalized_strs = [
        'the', 'unit', 'test', 'be', 'not', 'go', 'well', '.', '-PRON-',
        'love', 'Python', ',', 'but', '-PRON-', 'do', 'not', 'love',
        'backwards', 'incompatibility', '.', 'no', 'programmer', 'be',
        'permanently', 'damage', 'for', 'textacy', "'s", 'sake', '.',
        'thank', 'God', 'for', 'Stack', 'Overflow', '.']
    assert [spacy_utils.normalized_str(tok)
            for tok in spacy_doc if not tok.is_space] == normalized_strs
def terms_to_semantic_network(terms, window_width=10, edge_weighting='cooc_freq'):
    """
    Convert an ordered list of non-overlapping terms into a semantic network,
    where each term is represented by a node with edges linking it to other
    terms that co-occur within ``window_width`` terms of itself.

    Args:
        terms (list(str) or list(``spacy.Token``))
        window_width (int, optional): size of sliding window over `terms` that
            determines which are said to co-occur; if = 2, only adjacent terms
            will have edges in network
        edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary', all
            co-occurring terms will have network edges with weight = 1; if
            'cooc_freq', edges will have a weight equal to the number of times
            that the connected nodes co-occur in a sliding window

    Returns:
        :class:`networkx.Graph()`

    Notes:
        - Be sure to filter out stopwords, punctuation, certain parts of speech,
          etc. from the terms list before passing it to this function
        - Multi-word terms, such as named entities and compound nouns, must be
          merged into single strings or spacy.Tokens beforehand
        - If terms are already strings, be sure to normalize so that like terms
          are counted together (see
          :func:`normalized_str() <textacy.spacy_utils.normalized_str>`)
    """
    if window_width < 2:
        raise ValueError('Window width must be >= 2.')

    if isinstance(terms[0], str):
        windows = itertoolz.sliding_window(window_width, terms)
    elif isinstance(terms[0], spacy_token):
        windows = ((normalized_str(tok) for tok in window)
                   for window in itertoolz.sliding_window(window_width, terms))
    else:
        msg = 'Input terms must be strings or spacy Tokens, not {}.'.format(type(terms[0]))
        raise TypeError(msg)

    graph = nx.Graph()

    if edge_weighting == 'cooc_freq':
        cooc_mat = defaultdict(lambda: defaultdict(int))
        for window in windows:
            for w1, w2 in itertools.combinations(sorted(window), 2):
                cooc_mat[w1][w2] += 1
        graph.add_edges_from(
            (w1, w2, {'weight': cooc_mat[w1][w2]})
            for w1, w2s in cooc_mat.items() for w2 in w2s)

    elif edge_weighting == 'binary':
        graph.add_edges_from(
            w1_w2 for window in windows
            for w1_w2 in itertools.combinations(window, 2))

    return graph
def terms_to_semantic_network(terms, window_width=10, edge_weighting='cooc_freq'): """ Convert an ordered list of non-overlapping terms into a semantic network, where each terms is represented by a node with edges linking it to other terms that co-occur within ``window_width`` terms of itself. Args: terms (list(str) or list(``spacy.Token``)) window_width (int, optional): size of sliding window over `terms` that determines which are said to co-occur; if = 2, only adjacent terms will have edges in network edge_weighting (str {'cooc_freq', 'binary'}, optional): if 'binary', all co-occurring terms will have network edges with weight = 1; if 'cooc_freq', edges will have a weight equal to the number of times that the connected nodes co-occur in a sliding window Returns: :class:`networkx.Graph()` Notes: - Be sure to filter out stopwords, punctuation, certain parts of speech, etc. from the terms list before passing it to this function - Multi-word terms, such as named entities and compound nouns, must be merged into single strings or spacy.Tokens beforehand - If terms are already strings, be sure to normalize so that like terms are counted together (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`) """ if window_width < 2: raise ValueError('Window width must be >= 2.') if isinstance(terms[0], str): windows = itertoolz.sliding_window(window_width, terms) elif isinstance(terms[0], spacy_token): windows = ((normalized_str(tok) for tok in window) for window in itertoolz.sliding_window(window_width, terms)) else: msg = 'Input terms must be strings or spacy Tokens, not {}.'.format( type(terms[0])) raise TypeError(msg) graph = nx.Graph() if edge_weighting == 'cooc_freq': cooc_mat = defaultdict(lambda: defaultdict(int)) for window in windows: for w1, w2 in itertools.combinations(sorted(window), 2): cooc_mat[w1][w2] += 1 graph.add_edges_from((w1, w2, { 'weight': cooc_mat[w1][w2] }) for w1, w2s in cooc_mat.items() for w2 in w2s) elif edge_weighting == 'binary': graph.add_edges_from(w1_w2 for window in windows for w1_w2 in itertools.combinations(window, 2)) return graph
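# --- Usage sketch (illustrative, not from the source): building a small
# co-occurrence network from an already-normalized list of terms with the
# terms_to_semantic_network() function above. The sample term list is an
# assumption; networkx, itertoolz, and itertools are assumed imported as above.
terms = ['network', 'graph', 'node', 'edge', 'graph',
         'node', 'weight', 'edge', 'graph', 'network']
graph = terms_to_semantic_network(terms, window_width=3, edge_weighting='cooc_freq')

# each edge weight counts how often the two terms shared a 3-term window
for t1, t2, data in graph.edges(data=True):
    print(t1, t2, data['weight'])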
def test_normalize_str(self):
    normalized_strs = [
        'the', 'unit', 'test', 'be', 'not', 'go', 'well', '.', 'i', 'love',
        'Python', ',', 'but', 'i', 'do', 'not', 'love', 'some', 'of',
        'Guido', "'s", 'decision', '.', 'no', 'computer', 'programmer',
        'be', 'harm', 'in', 'the', 'making', 'of', 'this', 'package', '.',
        'thank', 'God', 'for', 'Stack', 'Overflow', '.']
    self.assertEqual(
        [spacy_utils.normalized_str(tok)
         for tok in self.spacy_doc if not tok.is_space],
        normalized_strs)
def test_normalize_str(self):
    normalized_strs = [
        'the', 'unit', 'test', 'be', 'not', 'go', 'well', '.', '-PRON-',
        'love', 'Python', ',', 'but', '-PRON-', 'do', 'not', 'love', 'some',
        'of', 'Guido', "'s", 'decision', '.', 'no', 'computer', 'programmer',
        'be', 'harm', 'in', 'the', 'making', 'of', 'this', 'package', '.',
        'thank', 'God', 'for', 'Stack', 'Overflow', '.']
    self.assertEqual(
        [spacy_utils.normalized_str(tok)
         for tok in self.spacy_doc if not tok.is_space],
        normalized_strs)
def words(doc, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags=None, bad_pos_tags=None, min_freq=1): """ Extract an ordered sequence of words from a spacy-parsed doc, optionally filtering words by part-of-speech (etc.) and frequency. Args: doc (``spacy.Doc`` or ``spacy.Span``) filter_stops (bool, optional): if True, remove stop words from word list filter_punct (bool, optional): if True, remove punctuation from word list filter_nums (bool, optional): if True, remove number-like words (e.g. 10, 'ten') from word list good_pos_tags (set[str], optional): remove words whose part-of-speech tag is NOT in the specified tags, using the set of universal POS tagset bad_pos_tags (set[str], optional): remove words whose part-of-speech tag IS in the specified tags, using the set of universal POS tagset min_freq (int, optional): remove words that occur in `doc` fewer than `min_freq` times Yields: ``spacy.Token``: the next token from ``doc`` passing specified filters in order of appearance in the document """ words_ = (w for w in doc if not w.is_space) if filter_stops is True: words_ = (w for w in words_ if not w.is_stop) if filter_punct is True: words_ = (w for w in words_ if not w.is_punct) if filter_nums is True: words_ = (w for w in words_ if not w.like_num) if good_pos_tags: words_ = (w for w in words_ if w.pos_ in good_pos_tags) if bad_pos_tags: words_ = (w for w in words_ if w.pos_ not in bad_pos_tags) if min_freq > 1: words_ = list(words_) freqs = itertoolz.frequencies(normalized_str(w) for w in words_) words_ = (w for w in words_ if freqs[normalized_str(w)] >= min_freq) for word in words_: yield word
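# --- Usage sketch (illustrative, not part of the source): token filtering with
# the words() variant above that takes good_pos_tags/bad_pos_tags. The
# "en_core_web_sm" model name and sample sentence are assumptions.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('The 2 cats sat on the mat, and the cats purred.')

# keep only nouns and verbs; drop stop words, punctuation, and number-like tokens
toks = words(doc, filter_stops=True, filter_punct=True, filter_nums=True,
             good_pos_tags={'NOUN', 'VERB'})
print([tok.text for tok in toks])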
def term_count(self, term): """ Get the number of occurrences ("count") of term in doc. Args: term (str or ``spacy.Token`` or ``spacy.Span``) Returns: int """ # figure out what object we're dealing with here; convert as necessary if isinstance(term, unicode_type): term_text = term term_id = self.spacy_stringstore[term_text] term_len = term_text.count(' ') + 1 elif isinstance(term, stoken): term_text = spacy_utils.normalized_str(term) term_id = self.spacy_stringstore[term_text] term_len = 1 elif isinstance(term, sspan): term_text = spacy_utils.normalized_str(term) term_id = self.spacy_stringstore[term_text] term_len = len(term) term_count_ = self._term_counts[term_id] if term_count_ > 0: return term_count_ # have we not already counted the appropriate `n` n-grams? if not any(self.spacy_stringstore[t].count(' ') == term_len for t in self._term_counts): get_id = lambda x: self.spacy_stringstore[spacy_utils. normalized_str(x)] if term_len == 1: self._term_counts += Counter(get_id(w) for w in self.words()) else: self._term_counts += Counter( get_id(ng) for ng in self.ngrams(term_len)) term_count_ = self._term_counts[term_id] if term_count_ > 0: return term_count_ # last resort: try a regular expression return sum(1 for _ in re.finditer(re.escape(term_text), self.text))
def sents_to_semantic_network(sents, edge_weighting='cosine'): """ Convert a list of sentences into a semantic network, where each sentence is represented by a node with edges linking it to other sentences weighted by the (cosine or jaccard) similarity of their constituent words. Args: sents (list(str) or list(:class:`spacy.Span`)) edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric to use for weighting edges between sentences; if 'cosine', use the cosine similarity between sentences represented as tf-idf word vectors; if 'jaccard', use the set intersection divided by the set union of all words in a given sentence pair Returns: :class:`networkx.Graph()`: nodes are the integer indexes of the sentences in the input ``sents`` list, *not* the actual text of the sentences! Notes: * If passing sentences as strings, be sure to filter out stopwords, punctuation, certain parts of speech, etc. beforehand * Consider normalizing the strings so that like terms are counted together (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`) """ n_sents = len(sents) if isinstance(sents[0], str): pass elif isinstance(sents[0], spacy_span): sents = [ ' '.join( normalized_str(tok) for tok in extract.words(sent, filter_stops=True, filter_punct=True, filter_nums=False)) for sent in sents ] else: msg = 'Input sents must be strings or spacy Spans, not {}.'.format( type(sents[0])) raise TypeError(msg) if edge_weighting == 'cosine': term_sent_matrix = TfidfVectorizer().fit_transform(sents) elif edge_weighting == 'jaccard': term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents) weights = (term_sent_matrix * term_sent_matrix.T).A.tolist() graph = nx.Graph() graph.add_edges_from((i, j, { 'weight': weights[i][j] }) for i in range(n_sents) for j in range(i + 1, n_sents)) return graph
def term_count(self, term): """ Get the number of occurrences ("count") of term in doc. Args: term (str or ``spacy.Token`` or ``spacy.Span``) Returns: int """ # figure out what object we're dealing with here; convert as necessary if isinstance(term, str): term_text = term term_id = self.spacy_stringstore[term_text] term_len = term_text.count(' ') + 1 elif isinstance(term, stoken): term_text = spacy_utils.normalized_str(term) term_id = self.spacy_stringstore[term_text] term_len = 1 elif isinstance(term, sspan): term_text = spacy_utils.normalized_str(term) term_id = self.spacy_stringstore[term_text] term_len = len(term) term_count_ = self._term_counts[term_id] if term_count_ > 0: return term_count_ # have we not already counted the appropriate `n` n-grams? if not any(self.spacy_stringstore[t].count(' ') == term_len for t in self._term_counts): get_id = lambda x: self.spacy_stringstore[spacy_utils.normalized_str(x)] if term_len == 1: self._term_counts += Counter(get_id(w) for w in self.words()) else: self._term_counts += Counter(get_id(ng) for ng in self.ngrams(term_len)) term_count_ = self._term_counts[term_id] if term_count_ > 0: return term_count_ # last resort: try a regular expression return sum(1 for _ in re.finditer(re.escape(term_text), self.text))
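# --- Illustrative sketch (not from the source) of the regular-expression
# fallback at the end of term_count() above: counting literal, possibly
# multi-word occurrences of a term in raw text. Standard library only; the
# sample text is an assumption.
import re

text = 'the cat sat on the mat; the cat napped'
term_text = 'the cat'
count = sum(1 for _ in re.finditer(re.escape(term_text), text))
print(count)  # 2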
def sents_to_semantic_network(sents, edge_weighting='cosine'): """ Convert a list of sentences into a semantic network, where each sentence is represented by a node with edges linking it to other sentences weighted by the (cosine or jaccard) similarity of their constituent words. Args: sents (list(str) or list(:class:`spacy.Span`)) edge_weighting (str {'cosine', 'jaccard'}, optional): similarity metric to use for weighting edges between sentences; if 'cosine', use the cosine similarity between sentences represented as tf-idf word vectors; if 'jaccard', use the set intersection divided by the set union of all words in a given sentence pair Returns: :class:`networkx.Graph`: nodes are the integer indexes of the sentences in the input ``sents`` list, *not* the actual text of the sentences! Notes: * If passing sentences as strings, be sure to filter out stopwords, punctuation, certain parts of speech, etc. beforehand * Consider normalizing the strings so that like terms are counted together (see :func:`normalized_str() <textacy.spacy_utils.normalized_str>`) """ n_sents = len(sents) if isinstance(sents[0], unicode_type): pass elif isinstance(sents[0], SpacySpan): sents = [' '.join(normalized_str(tok) for tok in extract.words(sent, filter_stops=True, filter_punct=True, filter_nums=False)) for sent in sents] else: msg = 'Input sents must be strings or spacy Spans, not {}.'.format(type(sents[0])) raise TypeError(msg) if edge_weighting == 'cosine': term_sent_matrix = TfidfVectorizer().fit_transform(sents) elif edge_weighting == 'jaccard': term_sent_matrix = CountVectorizer(binary=True).fit_transform(sents) weights = (term_sent_matrix * term_sent_matrix.T).A.tolist() graph = nx.Graph() graph.add_edges_from( (i, j, {'weight': weights[i][j]}) for i in range(n_sents) for j in range(i + 1, n_sents)) return graph
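# --- Illustrative sketch (not from the source) of the edge-weighting step in
# sents_to_semantic_network(): pairwise sentence weights come from multiplying
# the term-sentence matrix by its transpose. With TfidfVectorizer (rows are
# L2-normalized by default) that product is the cosine similarity; with
# CountVectorizer(binary=True) it is the size of the shared-word intersection.
# scikit-learn and the sample sentences are assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer

sents = ['the cat sat on the mat',
         'the cat napped on the sofa',
         'stock prices fell sharply']
tfidf = TfidfVectorizer().fit_transform(sents)
weights = (tfidf * tfidf.T).A
print(weights.round(2))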
def term_counts(self, lemmatize='auto', ngram_range=(1, 1), include_nes=False, include_ncs=False, include_kts=False): """ Get the number of occurrences ("counts") of each unique term in doc; terms may be words, n-grams, named entities, noun phrases, and key terms. Args: lemmatize (bool or 'auto', optional): if True, lemmatize all terms when getting their frequencies; if 'auto', lemmatize all terms that aren't proper nouns or acronyms ngram_range (tuple(int), optional): (min n, max n) values for n-grams to include in terms list; default (1, 1) only includes unigrams include_nes (bool, optional): if True, include named entities in terms list include_ncs (bool, optional): if True, include noun chunks in terms list include_kts (bool, optional): if True, include key terms in terms list Returns: :class:`collections.Counter() <collections.Counter>`: mapping of unique term ids to corresponding term counts """ if lemmatize == 'auto': get_id = lambda x: self.spacy_stringstore[spacy_utils.normalized_str(x)] elif lemmatize is True: get_id = lambda x: self.spacy_stringstore[x.lemma_] else: get_id = lambda x: self.spacy_stringstore[x.text] for n in range(ngram_range[0], ngram_range[1] + 1): if n == 1: self._term_counts = self._term_counts | Counter( get_id(word) for word in self.words()) else: self._term_counts = self._term_counts | Counter( get_id(ngram) for ngram in self.ngrams(n)) if include_nes is True: self._term_counts = self._term_counts | Counter( get_id(ne) for ne in self.named_entities()) if include_ncs is True: self._term_counts = self._term_counts | Counter( get_id(nc) for nc in self.noun_chunks()) if include_kts is True: # HACK: key terms are currently returned as strings # TODO: cache key terms, and return them as spacy spans get_id = lambda x: self.spacy_stringstore[x] self._term_counts = self._term_counts | Counter( get_id(kt) for kt, _ in self.key_terms()) return self._term_counts
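# --- Illustrative note (not from the source) on the Counter "|" (union)
# operator used throughout term_counts() above: union keeps the element-wise
# maximum, so merging a fresh count of the same terms does not inflate counts
# that are already stored. Standard library only.
from collections import Counter

existing = Counter({'cat': 3, 'mat': 1})
fresh = Counter({'cat': 3, 'dog': 2})
print(existing | fresh)  # Counter({'cat': 3, 'dog': 2, 'mat': 1})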
def test_normalize_str(spacy_doc):
    normalized_strs = [
        "the", "unit", "test", "be", "not", "go", "well", ".", "-PRON-",
        "love", "Python", ",", "but", "-PRON-", "do", "not", "love",
        "backwards", "incompatibility", ".", "no", "programmer", "be",
        "permanently", "damage", "for", "textacy", "'s", "sake", ".",
        "thank", "God", "for", "Stack", "Overflow", ".",
    ]
    assert [
        spacy_utils.normalized_str(tok) for tok in spacy_doc if not tok.is_space
    ] == normalized_strs
def words(doc, filter_stops=True, filter_punct=True, filter_nums=False, include_pos=None, exclude_pos=None, min_freq=1): """ Extract an ordered sequence of words from a document processed by spaCy, optionally filtering words by part-of-speech tag and frequency. Args: doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``) filter_stops (bool): if True, remove stop words from word list filter_punct (bool): if True, remove punctuation from word list filter_nums (bool): if True, remove number-like words (e.g. 10, 'ten') from word list include_pos (str or Set[str]): remove words whose part-of-speech tag IS NOT included in this param exclude_pos (str or Set[str]): remove words whose part-of-speech tag IS in the specified tags min_freq (int): remove words that occur in `doc` fewer than `min_freq` times Yields: ``spacy.Token``: the next token from ``doc`` passing specified filters in order of appearance in the document Raises: TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str, or a falsy value .. note:: Filtering by part-of-speech tag uses the universal POS tag set, http://universaldependencies.org/u/pos/ """ words_ = (w for w in doc if not w.is_space) if filter_stops is True: words_ = (w for w in words_ if not w.is_stop) if filter_punct is True: words_ = (w for w in words_ if not w.is_punct) if filter_nums is True: words_ = (w for w in words_ if not w.like_num) if include_pos: if isinstance(include_pos, unicode_): include_pos = include_pos.upper() words_ = (w for w in words_ if w.pos_ == include_pos) elif isinstance(include_pos, (set, frozenset, list, tuple)): include_pos = {pos.upper() for pos in include_pos} words_ = (w for w in words_ if w.pos_ in include_pos) else: msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos)) raise TypeError(msg) if exclude_pos: if isinstance(exclude_pos, unicode_): exclude_pos = exclude_pos.upper() words_ = (w for w in words_ if w.pos_ != exclude_pos) elif isinstance(exclude_pos, (set, frozenset, list, tuple)): exclude_pos = {pos.upper() for pos in exclude_pos} words_ = (w for w in words_ if w.pos_ not in exclude_pos) else: msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos)) raise TypeError(msg) if min_freq > 1: words_ = list(words_) freqs = itertoolz.frequencies(normalized_str(w) for w in words_) words_ = (w for w in words_ if freqs[normalized_str(w)] >= min_freq) for word in words_: yield word
def ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags=None, bad_pos_tags=None, min_freq=1): """ Extract an ordered sequence of n-grams (``n`` consecutive words) from a spacy-parsed doc, optionally filtering n-grams by the types and parts-of-speech of the constituent words. Args: doc (``spacy.Doc`` or ``spacy.Span``) n (int): number of tokens per n-gram; 2 gives bigrams, 3 gives trigrams, etc. filter_stops (bool, optional): if True, remove ngrams that start or end with a stop word filter_punct (bool, optional): if True, remove ngrams that contain any punctuation-only tokens filter_nums (bool, optional): if True, remove ngrams that contain any numbers or number-like tokens (e.g. 10, 'ten') good_pos_tags (set[str], optional): remove ngrams whose constituent tokens' part-of-speech tags are NOT all in the specified tags, using the universal POS tagset bad_pos_tags (set[str], optional): remove ngrams if any of their constituent tokens' part-of-speech tags are in the specified tags, using the universal POS tagset min_freq (int, optional): remove ngrams that occur in `doc` fewer than `min_freq` times Yields: ``spacy.Span``: the next ngram from ``doc`` passing all specified filters, in order of appearance in the document Raises: ValueError: if ``n`` < 1 """ if n < 1: raise ValueError('n must be greater than or equal to 1') ngrams_ = (doc[i: i + n] for i in range(len(doc) - n + 1)) ngrams_ = (ngram for ngram in ngrams_ if not any(w.is_space for w in ngram)) if filter_stops is True: ngrams_ = (ngram for ngram in ngrams_ if not ngram[0].is_stop and not ngram[-1].is_stop) if filter_punct is True: ngrams_ = (ngram for ngram in ngrams_ if not any(w.is_punct for w in ngram)) if filter_nums is True: ngrams_ = (ngram for ngram in ngrams_ if not any(w.like_num for w in ngram)) if good_pos_tags: ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ in good_pos_tags for w in ngram)) if bad_pos_tags: ngrams_ = (ngram for ngram in ngrams_ if not any(w.pos_ in bad_pos_tags for w in ngram)) if min_freq > 1: ngrams_ = list(ngrams_) freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_) ngrams_ = (ngram for ngram in ngrams_ if freqs[normalized_str(ngram)] >= min_freq) for ngram in ngrams_: yield ngram
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None): """ Extract key terms from a document using the [SGRank]_ algorithm. Args: doc (``spacy.Doc``) window_width (int, optional): width of sliding window in which term co-occurrences are said to occur n_keyterms (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0, 1), representing the fraction of top-ranked terms to return as keyterms idf (dict, optional): mapping of {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency} for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1); NOTE: results are better with idf information Returns: list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their corresponding SGRank scores Raises: ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0] References: .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117. """ if isinstance(n_keyterms, float): if not 0.0 < n_keyterms <= 1.0: raise ValueError( '`n_keyterms` must be an int, or a float between 0.0 and 1.0') n_toks = len(doc) min_term_freq = min(n_toks // 1500, 4) # build full list of candidate terms terms = list( itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags={'NOUN', 'ADJ'}, min_freq=min_term_freq) for n in range(1, 7))) # if inverse document frequencies available, also add verbs # verbs without IDF downweighting dominate the results, and not in a good way if idf: terms.extend( itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags={'VERB'}, min_freq=min_term_freq) for n in range(1, 7))) terms_as_strs = { id(term): spacy_utils.normalized_str(term) for term in terms } # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available n_top_20pct = int(len(terms) * 0.2) term_counts = Counter(terms_as_strs[id(term)] for term in terms) if idf: mod_tfidfs = { term: count * idf[term] if ' ' not in term else count for term, count in term_counts.items() } top_term_texts = { term for term, _ in sorted(mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct] } else: top_term_texts = { term for term, _ in term_counts.most_common(n_top_20pct) } terms = [ term for term in terms if terms_as_strs[id(term)] in top_term_texts ] # compute term weights from statistical attributes term_weights = {} set_terms_as_str = {terms_as_strs[id(terms)] for terms in terms} n_toks_plus_1 = n_toks + 1 for term in terms: term_str = terms_as_strs[id(term)] pos_first_occ_factor = log(n_toks_plus_1 / (term.start + 1)) # TODO: assess if len(t) puts too much emphasis on long terms # alternative: term_len = 1 if ' ' not in term else sqrt(len(term)) term_len = 1 if ' ' not in term else len(term) term_count = term_counts[term_str] subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str if t2 != term_str and term_str in t2) term_freq_factor = (term_count - subsum_count) if idf and ' ' not in term_str: term_freq_factor *= idf[term_str] term_weights[ term_str] = term_freq_factor * pos_first_occ_factor * term_len # filter terms to only those with positive weights terms = [ term for term in terms if term_weights[terms_as_strs[id(term)]] > 0 ] n_coocs = defaultdict(lambda: defaultdict(int)) sum_logdists = defaultdict(lambda: 
defaultdict(float)) # iterate over windows for start_ind in range(n_toks): end_ind = start_ind + window_width window_terms = (term for term in terms if start_ind <= term.start <= end_ind) # get all token combinations within window for t1, t2 in itertools.combinations(window_terms, 2): if t1 is t2: continue n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1 try: sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ log(window_width / abs(t1.start - t2.start)) except ZeroDivisionError: # HACK: pretend that they're 1 token apart sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ log(window_width) if end_ind > n_toks: break # compute edge weights between co-occurring terms (nodes) edge_weights = defaultdict(lambda: defaultdict(float)) for t1, t2s in sum_logdists.items(): for t2 in t2s: edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2] ) * term_weights[t1] * term_weights[t2] # normalize edge weights by sum of outgoing edge weights per term (node) norm_edge_weights = [] for t1, t2s in edge_weights.items(): sum_edge_weights = sum(t2s.values()) norm_edge_weights.extend((t1, t2, { 'weight': weight / sum_edge_weights }) for t2, weight in t2s.items()) # build the weighted directed graph from edges, rank nodes by pagerank graph = nx.DiGraph() graph.add_edges_from(norm_edge_weights) term_ranks = nx.pagerank_scipy(graph) if isinstance(n_keyterms, float): n_keyterms = int(len(term_ranks) * n_keyterms) return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]
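# --- Usage sketch (illustrative only) for the sgrank() implementation above.
# The "en_core_web_sm" model name and the 'article.txt' input path are
# hypothetical; SGRank works best on reasonably long documents and, per the
# docstring, gives better results when an idf mapping is supplied.
import spacy

nlp = spacy.load('en_core_web_sm')
text = open('article.txt').read()  # hypothetical path to a longer document
doc = nlp(text)

for term, score in sgrank(doc, window_width=1500, n_keyterms=10):
    print('{:.4f}\t{}'.format(score, term))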
def key_terms_from_semantic_network(doc, window_width=2, edge_weighting='binary', ranking_algo='pagerank', join_key_words=False, n_keyterms=10, **kwargs): """ Extract key terms from a document by ranking nodes in a semantic network of terms, connected by edges and weights specified by parameters. Args: doc (``spacy.Doc``): window_width (int, optional): width of sliding window in which term co-occurrences are said to occur edge_weighting (str {'binary', 'cooc_freq'}, optional): method used to determine weights of edges between nodes in the semantic network; if 'binary', edge weight is set to 1 for any two terms co-occurring within `window_width` terms; if 'cooc_freq', edge weight is set to the number of times that any two terms co-occur ranking_algo (str {'pagerank', 'divrank', 'bestcoverage'}, optional): algorithm with which to rank nodes in the semantic network; `pagerank` is the canonical (and default) algorithm, but it prioritizes node centrality at the expense of node diversity; the other two attempt to balance centrality with diversity join_key_words (bool, optional): if True, join consecutive key words together into longer key terms, taking the sum of the constituent words' scores as the joined key term's combined score n_keyterms (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0, 1), representing the fraction of top-ranked terms to return as keyterms Returns: list((str, float)): sorted list of top ``n_keyterms`` key terms and their corresponding ranking scores Raises: ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0] """ word_list = [spacy_utils.normalized_str(word) for word in doc] good_word_list = [ spacy_utils.normalized_str(word) for word in doc if not word.is_stop and not word.is_punct and word.pos_ in {'NOUN', 'ADJ'} ] if isinstance(n_keyterms, float): if not 0.0 < n_keyterms <= 1.0: raise ValueError( '`n_keyterms` must be an int, or a float between 0.0 and 1.0') n_keyterms = int(n_keyterms * len(set(good_word_list))) graph = terms_to_semantic_network(good_word_list, window_width=window_width, edge_weighting=edge_weighting) # rank nodes by algorithm, and sort in descending order if ranking_algo == 'pagerank': word_ranks = nx.pagerank_scipy(graph, weight='weight') elif ranking_algo == 'divrank': word_ranks = rank_nodes_by_divrank(graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5)) elif ranking_algo == 'bestcoverage': word_ranks = rank_nodes_by_bestcoverage(graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0)) # bail out here if all we wanted was key *words* and not *terms* if join_key_words is False: return [(word, score) for word, score in sorted( word_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]] top_n = int(0.25 * len(word_ranks)) top_word_ranks = { word: rank for word, rank in sorted( word_ranks.items(), key=itemgetter(1), reverse=True)[:top_n] } # join consecutive key words into key terms seen_joined_key_terms = set() joined_key_terms = [] for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks): if key is True: words = list(group) term = ' '.join(words) if term in seen_joined_key_terms: continue seen_joined_key_terms.add(term) joined_key_terms.append( (term, sum(word_ranks[word] for word in words))) return sorted(joined_key_terms, key=itemgetter(1), reverse=True)[:n_keyterms]
def sgrank(doc, window_width=1500, n_keyterms=10, idf=None): """ Extract key terms from a document using the [SGRank]_ algorithm. Args: doc (``spacy.Doc``) window_width (int, optional): width of sliding window in which term co-occurrences are said to occur n_keyterms (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0, 1), representing the fraction of top-ranked terms to return as keyterms idf (dict, optional): mapping of {`normalized_str(term) <textacy.spacy_utils.normalized_str>`: inverse document frequency} for re-weighting of unigrams (n-grams with n > 1 have df assumed = 1); NOTE: results are better with idf information Returns: list[(str, float)]: sorted list of top ``n_keyterms`` key terms and their corresponding SGRank scores Raises: ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0] References: .. [SGRank] Danesh, Sumner, and Martin. "SGRank: Combining Statistical and Graphical Methods to Improve the State of the Art in Unsupervised Keyphrase Extraction". Lexical and Computational Semantics (* SEM 2015) (2015): 117. """ if isinstance(n_keyterms, float): if not 0.0 < n_keyterms <= 1.0: raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0') n_toks = len(doc) min_term_freq = min(n_toks // 1500, 4) # build full list of candidate terms terms = list(itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags={'NOUN', 'ADJ'}, min_freq=min_term_freq) for n in range(1, 7))) # if inverse document frequencies available, also add verbs # verbs without IDF downweighting dominate the results, and not in a good way if idf: terms.extend(itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, good_pos_tags={'VERB'}, min_freq=min_term_freq) for n in range(1, 7))) terms_as_strs = {id(term): spacy_utils.normalized_str(term) for term in terms} # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available n_top_20pct = int(len(terms) * 0.2) term_counts = Counter(terms_as_strs[id(term)] for term in terms) if idf: mod_tfidfs = {term: count * idf[term] if ' ' not in term else count for term, count in term_counts.items()} top_term_texts = {term for term, _ in sorted( mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct]} else: top_term_texts = {term for term, _ in term_counts.most_common(n_top_20pct)} terms = [term for term in terms if terms_as_strs[id(term)] in top_term_texts] # compute term weights from statistical attributes term_weights = {} set_terms_as_str = {terms_as_strs[id(terms)] for terms in terms} n_toks_plus_1 = n_toks + 1 for term in terms: term_str = terms_as_strs[id(term)] pos_first_occ_factor = math.log(n_toks_plus_1 / (term.start + 1)) # TODO: assess if len(t) puts too much emphasis on long terms # alternative: term_len = 1 if ' ' not in term else math.sqrt(len(term)) term_len = 1 if ' ' not in term else len(term) term_count = term_counts[term_str] subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str if t2 != term_str and term_str in t2) term_freq_factor = (term_count - subsum_count) if idf and ' ' not in term_str: term_freq_factor *= idf[term_str] term_weights[term_str] = term_freq_factor * pos_first_occ_factor * term_len # filter terms to only those with positive weights terms = [term for term in terms if term_weights[terms_as_strs[id(term)]] > 0] n_coocs = defaultdict(lambda: defaultdict(int)) sum_logdists = defaultdict(lambda: 
defaultdict(float)) # iterate over windows for start_ind in range(n_toks): end_ind = start_ind + window_width window_terms = (term for term in terms if start_ind <= term.start <= end_ind) # get all token combinations within window for t1, t2 in itertools.combinations(window_terms, 2): if t1 is t2: continue n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1 try: sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ math.log(window_width / abs(t1.start - t2.start)) except ZeroDivisionError: # HACK: pretend that they're 1 token apart sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ math.log(window_width) if end_ind > n_toks: break # compute edge weights between co-occurring terms (nodes) edge_weights = defaultdict(lambda: defaultdict(float)) for t1, t2s in sum_logdists.items(): for t2 in t2s: edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2]) * term_weights[t1] * term_weights[t2] # normalize edge weights by sum of outgoing edge weights per term (node) norm_edge_weights = [] for t1, t2s in edge_weights.items(): sum_edge_weights = sum(t2s.values()) norm_edge_weights.extend((t1, t2, {'weight': weight / sum_edge_weights}) for t2, weight in t2s.items()) # build the weighted directed graph from edges, rank nodes by pagerank graph = nx.DiGraph() graph.add_edges_from(norm_edge_weights) term_ranks = nx.pagerank_scipy(graph) if isinstance(n_keyterms, float): n_keyterms = int(len(term_ranks) * n_keyterms) return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]
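# --- Illustrative sketch (not from the source) of the final ranking step in
# sgrank(): PageRank over a weighted directed graph whose edges carry the
# normalized co-occurrence weights. nx.pagerank is used here; the code above
# calls nx.pagerank_scipy, a scipy-backed variant available in older networkx
# releases. The toy edge weights are assumptions.
import networkx as nx
from operator import itemgetter

graph = nx.DiGraph()
graph.add_edges_from([
    ('term a', 'term b', {'weight': 0.7}),
    ('term a', 'term c', {'weight': 0.3}),
    ('term b', 'term a', {'weight': 1.0}),
    ('term c', 'term b', {'weight': 1.0}),
])
ranks = nx.pagerank(graph, weight='weight')
print(sorted(ranks.items(), key=itemgetter(1), reverse=True))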
def ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, include_pos=None, exclude_pos=None, min_freq=1): """ Extract an ordered sequence of n-grams (``n`` consecutive words) from a spacy-parsed doc, optionally filtering n-grams by the types and parts-of-speech of the constituent words. Args: doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``) n (int): number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc. filter_stops (bool): if True, remove ngrams that start or end with a stop word filter_punct (bool): if True, remove ngrams that contain any punctuation-only tokens filter_nums (bool): if True, remove ngrams that contain any numbers or number-like tokens (e.g. 10, 'ten') include_pos (str or Set[str]): remove ngrams if any of their constituent tokens' part-of-speech tags ARE NOT included in this param exclude_pos (str or Set[str]): remove ngrams if any of their constituent tokens' part-of-speech tags ARE included in this param min_freq (int, optional): remove ngrams that occur in `doc` fewer than `min_freq` times Yields: ``spacy.Span``: the next ngram from ``doc`` passing all specified filters, in order of appearance in the document Raises: ValueError: if ``n`` < 1 TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str, or a falsy value .. note:: Filtering by part-of-speech tag uses the universal POS tag set, http://universaldependencies.org/u/pos/ """ if n < 1: raise ValueError('n must be greater than or equal to 1') ngrams_ = (doc[i: i + n] for i in range(len(doc) - n + 1)) ngrams_ = (ngram for ngram in ngrams_ if not any(w.is_space for w in ngram)) if filter_stops is True: ngrams_ = (ngram for ngram in ngrams_ if not ngram[0].is_stop and not ngram[-1].is_stop) if filter_punct is True: ngrams_ = (ngram for ngram in ngrams_ if not any(w.is_punct for w in ngram)) if filter_nums is True: ngrams_ = (ngram for ngram in ngrams_ if not any(w.like_num for w in ngram)) if include_pos: if isinstance(include_pos, unicode_): include_pos = include_pos.upper() ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ == include_pos for w in ngram)) elif isinstance(include_pos, (set, frozenset, list, tuple)): include_pos = {pos.upper() for pos in include_pos} ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ in include_pos for w in ngram)) else: msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos)) raise TypeError(msg) if exclude_pos: if isinstance(exclude_pos, unicode_): exclude_pos = exclude_pos.upper() ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ != exclude_pos for w in ngram)) elif isinstance(exclude_pos, (set, frozenset, list, tuple)): exclude_pos = {pos.upper() for pos in exclude_pos} ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ not in exclude_pos for w in ngram)) else: msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos)) raise TypeError(msg) if min_freq > 1: ngrams_ = list(ngrams_) freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_) ngrams_ = (ngram for ngram in ngrams_ if freqs[normalized_str(ngram)] >= min_freq) for ngram in ngrams_: yield ngram
def key_terms_from_semantic_network(doc, window_width=2, edge_weighting='binary', ranking_algo='pagerank', join_key_words=False, n_keyterms=10, **kwargs): """ Extract key terms from a document by ranking nodes in a semantic network of terms, connected by edges and weights specified by parameters. Args: doc (``spacy.Doc``): window_width (int, optional): width of sliding window in which term co-occurrences are said to occur edge_weighting (str {'binary', 'cooc_freq'}, optional): method used to determine weights of edges between nodes in the semantic network; if 'binary', edge weight is set to 1 for any two terms co-occurring within `window_width` terms; if 'cooc_freq', edge weight is set to the number of times that any two terms co-occur ranking_algo (str {'pagerank', 'divrank', 'bestcoverage'}, optional): algorithm with which to rank nodes in the semantic network; `pagerank` is the canonical (and default) algorithm, but it prioritizes node centrality at the expense of node diversity; the other two attempt to balance centrality with diversity join_key_words (bool, optional): if True, join consecutive key words together into longer key terms, taking the sum of the constituent words' scores as the joined key term's combined score n_keyterms (int or float, optional): if int, number of top-ranked terms to return as keyterms; if float, must be in the open interval (0, 1), representing the fraction of top-ranked terms to return as keyterms Returns: list((str, float)): sorted list of top ``n_keyterms`` key terms and their corresponding ranking scores Raises: ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0] """ word_list = [spacy_utils.normalized_str(word) for word in doc] good_word_list = [spacy_utils.normalized_str(word) for word in doc if not word.is_stop and not word.is_punct and word.pos_ in {'NOUN', 'ADJ'}] if isinstance(n_keyterms, float): if not 0.0 < n_keyterms <= 1.0: raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0') n_keyterms = int(n_keyterms * len(set(good_word_list))) graph = terms_to_semantic_network( good_word_list, window_width=window_width, edge_weighting=edge_weighting) # rank nodes by algorithm, and sort in descending order if ranking_algo == 'pagerank': word_ranks = nx.pagerank_scipy(graph, weight='weight') elif ranking_algo == 'divrank': word_ranks = rank_nodes_by_divrank( graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5)) elif ranking_algo == 'bestcoverage': word_ranks = rank_nodes_by_bestcoverage( graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0)) # bail out here if all we wanted was key *words* and not *terms* if join_key_words is False: return [(word, score) for word, score in sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]] top_n = int(0.25 * len(word_ranks)) top_word_ranks = {word: rank for word, rank in sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:top_n]} # join consecutive key words into key terms seen_joined_key_terms = set() joined_key_terms = [] for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks): if key is True: words = list(group) term = ' '.join(words) if term in seen_joined_key_terms: continue seen_joined_key_terms.add(term) joined_key_terms.append((term, sum(word_ranks[word] for word in words))) return sorted(joined_key_terms, key=itemgetter(1), reverse=True)[:n_keyterms]
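# --- Illustrative sketch (not from the source) of the "join consecutive key
# words" step in key_terms_from_semantic_network(): itertools.groupby runs over
# the full word list, keyed on membership in the set of top-ranked words, so
# adjacent key words collapse into a single multi-word term. The word list and
# top-word set are assumptions.
import itertools

word_list = ['deep', 'learning', 'model', 'beat', 'the', 'baseline',
             'deep', 'learning', 'win']
top_words = {'deep', 'learning', 'baseline'}

joined = []
for is_key, group in itertools.groupby(word_list, lambda w: w in top_words):
    if is_key:
        joined.append(' '.join(group))
print(joined)  # ['deep learning', 'baseline', 'deep learning']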
def words(doc, filter_stops=True, filter_punct=True, filter_nums=False, include_pos=None, exclude_pos=None, min_freq=1): """ Extract an ordered sequence of words from a document processed by spaCy, optionally filtering words by part-of-speech tag and frequency. Args: doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``) filter_stops (bool): if True, remove stop words from word list filter_punct (bool): if True, remove punctuation from word list filter_nums (bool): if True, remove number-like words (e.g. 10, 'ten') from word list include_pos (str or Set[str]): remove words whose part-of-speech tag IS NOT included in this param exclude_pos (str or Set[str]): remove words whose part-of-speech tag IS in the specified tags min_freq (int): remove words that occur in `doc` fewer than `min_freq` times Yields: ``spacy.Token``: the next token from ``doc`` passing specified filters in order of appearance in the document Raises: TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str, or a falsy value .. note:: Filtering by part-of-speech tag uses the universal POS tag set, http://universaldependencies.org/u/pos/ """ words_ = (w for w in doc if not w.is_space) if filter_stops is True: words_ = (w for w in words_ if not w.is_stop) if filter_punct is True: words_ = (w for w in words_ if not w.is_punct) if filter_nums is True: words_ = (w for w in words_ if not w.like_num) if include_pos: if isinstance(include_pos, unicode_type): include_pos = include_pos.upper() words_ = (w for w in words_ if w.pos_ == include_pos) elif isinstance(include_pos, (set, frozenset, list, tuple)): include_pos = {pos.upper() for pos in include_pos} words_ = (w for w in words_ if w.pos_ in include_pos) else: msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos)) raise TypeError(msg) if exclude_pos: if isinstance(exclude_pos, unicode_type): exclude_pos = exclude_pos.upper() words_ = (w for w in words_ if w.pos_ != exclude_pos) elif isinstance(exclude_pos, (set, frozenset, list, tuple)): exclude_pos = {pos.upper() for pos in exclude_pos} words_ = (w for w in words_ if w.pos_ not in exclude_pos) else: msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos)) raise TypeError(msg) if min_freq > 1: words_ = list(words_) freqs = itertoolz.frequencies(normalized_str(w) for w in words_) words_ = (w for w in words_ if freqs[normalized_str(w)] >= min_freq) for word in words_: yield word
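# --- Usage sketch (illustrative, not part of the source) for the
# include_pos/exclude_pos variant of words() above: a single POS string or a
# set of POS strings is accepted. The "en_core_web_sm" model name and sample
# sentence are assumptions; unicode_type is assumed to be the module's str
# compat alias.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('The three cats quietly watched the birds outside.')

nouns = [w.text for w in words(doc, include_pos='NOUN')]
not_verbs = [w.text for w in words(doc, exclude_pos={'VERB', 'AUX'})]
print(nouns)
print(not_verbs)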
def ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, include_pos=None, exclude_pos=None, min_freq=1): """ Extract an ordered sequence of n-grams (``n`` consecutive words) from a spacy-parsed doc, optionally filtering n-grams by the types and parts-of-speech of the constituent words. Args: doc (``textacy.Doc``, ``spacy.Doc``, or ``spacy.Span``) n (int): number of tokens per n-gram; 2 => bigrams, 3 => trigrams, etc. filter_stops (bool): if True, remove ngrams that start or end with a stop word filter_punct (bool): if True, remove ngrams that contain any punctuation-only tokens filter_nums (bool): if True, remove ngrams that contain any numbers or number-like tokens (e.g. 10, 'ten') include_pos (str or Set[str]): remove ngrams if any of their constituent tokens' part-of-speech tags ARE NOT included in this param exclude_pos (str or Set[str]): remove ngrams if any of their constituent tokens' part-of-speech tags ARE included in this param min_freq (int, optional): remove ngrams that occur in `doc` fewer than `min_freq` times Yields: ``spacy.Span``: the next ngram from ``doc`` passing all specified filters, in order of appearance in the document Raises: ValueError: if ``n`` < 1 TypeError: if `include_pos` or `exclude_pos` is not a str, a set of str, or a falsy value .. note:: Filtering by part-of-speech tag uses the universal POS tag set, http://universaldependencies.org/u/pos/ """ if n < 1: raise ValueError('n must be greater than or equal to 1') ngrams_ = (doc[i: i + n] for i in range(len(doc) - n + 1)) ngrams_ = (ngram for ngram in ngrams_ if not any(w.is_space for w in ngram)) if filter_stops is True: ngrams_ = (ngram for ngram in ngrams_ if not ngram[0].is_stop and not ngram[-1].is_stop) if filter_punct is True: ngrams_ = (ngram for ngram in ngrams_ if not any(w.is_punct for w in ngram)) if filter_nums is True: ngrams_ = (ngram for ngram in ngrams_ if not any(w.like_num for w in ngram)) if include_pos: if isinstance(include_pos, unicode_type): include_pos = include_pos.upper() ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ == include_pos for w in ngram)) elif isinstance(include_pos, (set, frozenset, list, tuple)): include_pos = {pos.upper() for pos in include_pos} ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ in include_pos for w in ngram)) else: msg = 'invalid `include_pos` type: "{}"'.format(type(include_pos)) raise TypeError(msg) if exclude_pos: if isinstance(exclude_pos, unicode_type): exclude_pos = exclude_pos.upper() ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ != exclude_pos for w in ngram)) elif isinstance(exclude_pos, (set, frozenset, list, tuple)): exclude_pos = {pos.upper() for pos in exclude_pos} ngrams_ = (ngram for ngram in ngrams_ if all(w.pos_ not in exclude_pos for w in ngram)) else: msg = 'invalid `exclude_pos` type: "{}"'.format(type(exclude_pos)) raise TypeError(msg) if min_freq > 1: ngrams_ = list(ngrams_) freqs = itertoolz.frequencies(normalized_str(ngram) for ngram in ngrams_) ngrams_ = (ngram for ngram in ngrams_ if freqs[normalized_str(ngram)] >= min_freq) for ngram in ngrams_: yield ngram
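# --- Usage sketch (illustrative, not part of the source): extracting bigrams
# with the ngrams() variant above. The "en_core_web_sm" model name and sample
# sentence are assumptions.
import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('Natural language processing with spaCy makes information extraction easier.')

# bigrams whose tokens are all nouns or proper nouns
for ng in ngrams(doc, 2, include_pos={'NOUN', 'PROPN'}):
    print(ng.text)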
def mySgRank(doc, window_width=1500, n_keyterms=10, idf=None): if isinstance(n_keyterms, float): if not 0.0 < n_keyterms <= 1.0: raise ValueError( '`n_keyterms` must be an int, or a float between 0.0 and 1.0') n_toks = len(doc) min_term_freq = min(n_toks // 1500, 4) # build full list of candidate terms terms = list( itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, min_freq=min_term_freq) for n in range(1, 7))) # if inverse document frequencies available, also add verbs # verbs without IDF downweighting dominate the results, and not in a good way if idf: terms.extend( itertoolz.concat( extract.ngrams(doc, n, filter_stops=True, filter_punct=True, filter_nums=False, min_freq=min_term_freq) for n in range(1, 7))) terms_as_strs = { id(term): spacy_utils.normalized_str(term) for term in terms } # pre-filter terms to the top 20% ranked by TF or modified TF*IDF, if available n_top_20pct = int(len(terms) * 0.2) term_counts = Counter(terms_as_strs[id(term)] for term in terms) if idf: mod_tfidfs = { term: count * idf[term] if ' ' not in term else count for term, count in term_counts.items() } top_term_texts = { term for term, _ in sorted(mod_tfidfs.items(), key=itemgetter(1), reverse=True)[:n_top_20pct] } else: top_term_texts = { term for term, _ in term_counts.most_common(n_top_20pct) } terms = [ term for term in terms if terms_as_strs[id(term)] in top_term_texts ] # compute term weights from statistical attributes term_weights = {} set_terms_as_str = {terms_as_strs[id(terms)] for terms in terms} n_toks_plus_1 = n_toks + 1 for term in terms: term_str = terms_as_strs[id(term)] pos_first_occ_factor = math.log(n_toks_plus_1 / (term.start + 1)) # TODO: assess if len(t) puts too much emphasis on long terms # alternative: term_len = 1 if ' ' not in term else math.sqrt(len(term)) term_len = 1 if ' ' not in term else len(term) term_count = term_counts[term_str] subsum_count = sum(term_counts[t2] for t2 in set_terms_as_str if t2 != term_str and term_str in t2) term_freq_factor = (term_count - subsum_count) if idf and ' ' not in term_str: term_freq_factor *= idf[term_str] term_weights[ term_str] = term_freq_factor * pos_first_occ_factor * term_len # filter terms to only those with positive weights terms = [ term for term in terms if term_weights[terms_as_strs[id(term)]] > 0 ] n_coocs = defaultdict(lambda: defaultdict(int)) sum_logdists = defaultdict(lambda: defaultdict(float)) # iterate over windows for start_ind in range(n_toks): end_ind = start_ind + window_width window_terms = (term for term in terms if start_ind <= term.start <= end_ind) # get all token combinations within window for t1, t2 in itertools.combinations(window_terms, 2): if t1 is t2: continue n_coocs[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += 1 try: sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ math.log(window_width / abs(t1.start - t2.start)) except ZeroDivisionError: # HACK: pretend that they're 1 token apart sum_logdists[terms_as_strs[id(t1)]][terms_as_strs[id(t2)]] += \ math.log(window_width) if end_ind > n_toks: break # compute edge weights between co-occurring terms (nodes) edge_weights = defaultdict(lambda: defaultdict(float)) for t1, t2s in sum_logdists.items(): for t2 in t2s: edge_weights[t1][t2] = (sum_logdists[t1][t2] / n_coocs[t1][t2] ) * term_weights[t1] * term_weights[t2] # normalize edge weights by sum of outgoing edge weights per term (node) norm_edge_weights = [] for t1, t2s in edge_weights.items(): sum_edge_weights = sum(t2s.values()) 
norm_edge_weights.extend((t1, t2, { 'weight': weight / sum_edge_weights }) for t2, weight in t2s.items()) # build the weighted directed graph from edges, rank nodes by pagerank graph = nx.DiGraph() graph.add_edges_from(norm_edge_weights) term_ranks = nx.pagerank_scipy(graph) if isinstance(n_keyterms, float): n_keyterms = int(len(term_ranks) * n_keyterms) return sorted(term_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]
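# --- Illustrative sketch (not from the source): building the idf mapping that
# sgrank()/mySgRank() accept, i.e. {normalized unigram: inverse document
# frequency} over a corpus of already-normalized word lists. The corpus and the
# add-one smoothed formula are assumptions.
import math

corpus = [['cat', 'sit', 'mat'], ['cat', 'nap'], ['stock', 'price', 'fall']]
n_docs = len(corpus)
doc_freqs = {}
for words_ in corpus:
    for w in set(words_):
        doc_freqs[w] = doc_freqs.get(w, 0) + 1
idf = {w: math.log(n_docs / df) + 1.0 for w, df in doc_freqs.items()}
print(idf['cat'], idf['stock'])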