def to_semantic_network(self, nodes='words', normalize='lemma',
                        edge_weighting='default', window_width=10):
    """
    Build a semantic network from this ``Doc``, using either its words or its
    sentences as nodes, with a configurable scheme for weighting the edges.

    Args:
        nodes ({'words', 'sents'}): which component of the doc becomes the
            nodes of the network
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appear
            in doc; if a callable, must accept a ``spacy.Token`` or
            ``spacy.Span`` (if ``nodes`` = 'words' or 'sents', respectively)
            and return a str, e.g. :func:`textacy.spacy_utils.normalized_str()`
        edge_weighting (str): weighting applied to edges between nodes;
            if ``nodes == 'words'``, options are {'cooc_freq', 'binary'},
            if ``nodes == 'sents'``, options are {'cosine', 'jaccard'};
            'default' resolves to 'cooc_freq' for words and 'cosine' for sents
        window_width (int): size of the sliding window over terms that
            determines which are said to co-occur; only forwarded when
            ``nodes == 'words'``

    Returns:
        :class:`networkx.Graph <networkx.Graph>`: where nodes represent either
        terms or sentences in doc; edges, the relationships between them

    Raises:
        ValueError: if ``nodes`` is neither 'words' nor 'sents'

    See Also:
        :func:`terms_to_semantic_network() <textacy.network.terms_to_semantic_network>`
        :func:`sents_to_semantic_network() <textacy.network.sents_to_semantic_network>`
    """
    if nodes == 'words':
        # 'default' means co-occurrence frequency for a word network
        weighting = 'cooc_freq' if edge_weighting == 'default' else edge_weighting
        terms = list(textacy.extract.words(self))
        return network.terms_to_semantic_network(
            terms,
            normalize=normalize,
            window_width=window_width,
            edge_weighting=weighting)

    if nodes == 'sents':
        # 'default' means cosine similarity for a sentence network
        weighting = 'cosine' if edge_weighting == 'default' else edge_weighting
        sentences = list(self.sents)
        return network.sents_to_semantic_network(
            sentences,
            normalize=normalize,
            edge_weighting=weighting)

    # neither supported node type matched
    raise ValueError(
        'nodes "{}" not valid; must be in {}'.format(
            nodes, {'words', 'sents'}))
def to_semantic_network(self, nodes='words', edge_weighting='default',
                        window_width=10):
    """
    Build a semantic network from this ``Doc``, using either its words or its
    sentences as nodes, with a configurable scheme for weighting the edges.

    Args:
        nodes ({'words', 'sents'}): which component of the doc becomes the
            nodes of the network
        edge_weighting (str): weighting applied to edges between nodes;
            if ``nodes == 'words'``, options are {'cooc_freq', 'binary'},
            if ``nodes == 'sents'``, options are {'cosine', 'jaccard'};
            'default' resolves to 'cooc_freq' for words and 'cosine' for sents
        window_width (int): size of the sliding window over terms that
            determines which are said to co-occur; only forwarded when
            ``nodes == 'words'``

    Returns:
        :class:`networkx.Graph <networkx.Graph>`: where nodes represent either
        terms or sentences in doc; edges, the relationships between them

    Raises:
        ValueError: if ``nodes`` is neither 'words' nor 'sents'

    See Also:
        :func:`terms_to_semantic_network() <textacy.network.terms_to_semantic_network>`
        :func:`sents_to_semantic_network() <textacy.network.sents_to_semantic_network>`
    """
    if nodes == 'words':
        # 'default' means co-occurrence frequency for a word network
        weighting = 'cooc_freq' if edge_weighting == 'default' else edge_weighting
        terms = list(textacy.extract.words(self))
        return network.terms_to_semantic_network(
            terms,
            window_width=window_width,
            edge_weighting=weighting)

    if nodes == 'sents':
        # 'default' means cosine similarity for a sentence network
        weighting = 'cosine' if edge_weighting == 'default' else edge_weighting
        sentences = list(self.sents)
        return network.sents_to_semantic_network(
            sentences,
            edge_weighting=weighting)

    # neither supported node type matched
    raise ValueError(
        'nodes "{}" not valid; must be in {}'.format(
            nodes, {'words', 'sents'}))
def key_terms_from_semantic_network(doc, normalize='lemma',
                                    window_width=2, edge_weighting='binary',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10, **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, connected by edges and weights specified by parameters.

    Candidate terms are the tokens of ``doc`` that are not stop words, not
    punctuation, and whose part of speech is NOUN, PROPN or ADJ.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if false-y, use the form of terms as they appeared
            in ``doc``; if a callable, must accept a ``spacy.Token`` and return
            a str, e.g. :func:`textacy.spacy_utils.normalized_str()`
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting ({'binary', 'cooc_freq'}): method used to determine
            weights of edges between nodes in the semantic network; if 'binary',
            edge weight is set to 1 for any two terms co-occurring within
            `window_width` terms; if 'cooc_freq', edge weight is set to the
            number of times that any two terms co-occur
        ranking_algo ({'pagerank', 'divrank', 'bestcoverage'}): algorithm with
            which to rank nodes in the semantic network; `pagerank` is the
            canonical (and default) algorithm, but it prioritizes node
            centrality at the expense of node diversity; the other two attempt
            to balance centrality with diversity
        join_key_words (bool): if True, join consecutive key words together
            into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score; only words ranked
            in the top 25% are eligible for joining
        n_keyterms (int or float): if int, number of top-ranked terms to return
            as keyterms; if float, must be in the half-open interval
            (0.0, 1.0], and is converted to an integer by
            ``int(round(len(doc) * n_keyterms))``
        **kwargs: tuning parameters forwarded to the ranking algorithm:
            ``lambda_`` (default 0.5) and ``alpha`` (default 0.5) for 'divrank';
            ``c`` (default 1) and ``alpha`` (default 1.0) for 'bestcoverage'

    Returns:
        List[Tuple[str, float]]: sorted list of top ``n_keyterms`` key terms
        and their corresponding ranking scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0], or if
            ``ranking_algo`` is not one of the supported algorithms
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                '`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(round(len(doc) * n_keyterms))

    # fail fast: an unknown algorithm would otherwise leave ``word_ranks``
    # unassigned and surface much later as an UnboundLocalError
    valid_ranking_algos = ('pagerank', 'divrank', 'bestcoverage')
    if ranking_algo not in valid_ranking_algos:
        raise ValueError(
            'ranking_algo "{}" not valid; must be in {}'.format(
                ranking_algo, valid_ranking_algos))

    # pick the token -> str normalizer once instead of repeating the list
    # comprehensions for every ``normalize`` option
    if normalize == 'lemma':
        def to_term(word):
            return word.lemma_
    elif normalize == 'lower':
        def to_term(word):
            return word.lower_
    elif not normalize:
        def to_term(word):
            return word.text
    else:
        to_term = normalize

    include_pos = {'NOUN', 'PROPN', 'ADJ'}
    word_list = []  # every token, in order; needed to find consecutive key words
    good_word_list = []  # candidate terms only; these become graph nodes
    for word in doc:
        term = to_term(word)
        word_list.append(term)
        # HACK: omit empty strings, which happen as a bug in spacy as of v1.5
        # and may well happen with ``normalize`` as a callable
        # an empty string should never be considered a keyterm
        if (term and not word.is_stop and not word.is_punct and
                word.pos_ in include_pos):
            good_word_list.append(term)

    graph = terms_to_semantic_network(
        good_word_list, window_width=window_width, edge_weighting=edge_weighting)

    # rank nodes by algorithm
    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(
            graph, r=None, lambda_=kwargs.get('lambda_', 0.5),
            alpha=kwargs.get('alpha', 0.5))
    else:  # 'bestcoverage', guaranteed by the validation above
        word_ranks = rank_nodes_by_bestcoverage(
            graph, k=n_keyterms, c=kwargs.get('c', 1),
            alpha=kwargs.get('alpha', 1.0))

    # sort in descending order of score
    ranked_words = sorted(word_ranks.items(), key=itemgetter(1), reverse=True)

    # bail out here if all we wanted was key *words* and not *terms*
    if not join_key_words:
        return [(word, score) for word, score in ranked_words[:n_keyterms]]

    # only the top quarter of ranked words may be joined into key terms
    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = dict(ranked_words[:top_n])

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for is_key_word, group in itertools.groupby(
            word_list, lambda word: word in top_word_ranks):
        if not is_key_word:
            continue
        words = list(group)
        term = ' '.join(words)
        if term in seen_joined_key_terms:
            continue
        seen_joined_key_terms.add(term)
        joined_key_terms.append(
            (term, sum(word_ranks[word] for word in words)))

    # ties on score are broken by the term itself, also in descending order
    return sorted(joined_key_terms, key=itemgetter(1, 0), reverse=True)[:n_keyterms]
def key_terms_from_semantic_network(doc, window_width=2, edge_weighting='binary',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10, **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, connected by edges and weights specified by parameters.

    Candidate terms are the tokens of ``doc`` that are not stop words, not
    punctuation, and whose part of speech is NOUN or ADJ; every token is
    normalized with ``spacy_utils.normalized_str``.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting ({'binary', 'cooc_freq'}): method used to determine
            weights of edges between nodes in the semantic network; if 'binary',
            edge weight is set to 1 for any two terms co-occurring within
            `window_width` terms; if 'cooc_freq', edge weight is set to the
            number of times that any two terms co-occur
        ranking_algo ({'pagerank', 'divrank', 'bestcoverage'}): algorithm with
            which to rank nodes in the semantic network; `pagerank` is the
            canonical (and default) algorithm, but it prioritizes node
            centrality at the expense of node diversity; the other two attempt
            to balance centrality with diversity
        join_key_words (bool): if True, join consecutive key words together
            into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score; only words ranked
            in the top 25% are eligible for joining
        n_keyterms (int or float): if int, number of top-ranked terms to return
            as keyterms; if float, must be in the half-open interval
            (0.0, 1.0], and is converted to an integer as that fraction of the
            number of unique candidate terms (truncated)
        **kwargs: tuning parameters forwarded to the ranking algorithm:
            ``lambda_`` (default 0.5) and ``alpha`` (default 0.5) for 'divrank';
            ``c`` (default 1) and ``alpha`` (default 1.0) for 'bestcoverage'

    Returns:
        list((str, float)): sorted list of top ``n_keyterms`` key terms and
        their corresponding ranking scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0], or if
            ``ranking_algo`` is not one of the supported algorithms
    """
    # fail fast, before any per-token work: an unknown algorithm would
    # otherwise leave ``word_ranks`` unassigned and surface much later as an
    # UnboundLocalError
    valid_ranking_algos = ('pagerank', 'divrank', 'bestcoverage')
    if ranking_algo not in valid_ranking_algos:
        raise ValueError(
            'ranking_algo "{}" not valid; must be in {}'.format(
                ranking_algo, valid_ranking_algos))
    if isinstance(n_keyterms, float) and not 0.0 < n_keyterms <= 1.0:
        raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')

    include_pos = {'NOUN', 'ADJ'}
    word_list = []  # every token, in order; needed to find consecutive key words
    good_word_list = []  # candidate terms only; these become graph nodes
    for word in doc:
        # normalize each token once and reuse the result for both lists
        term = spacy_utils.normalized_str(word)
        word_list.append(term)
        if not word.is_stop and not word.is_punct and word.pos_ in include_pos:
            good_word_list.append(term)

    if isinstance(n_keyterms, float):
        # a fraction of the number of *unique* candidate terms
        n_keyterms = int(n_keyterms * len(set(good_word_list)))

    graph = terms_to_semantic_network(
        good_word_list, window_width=window_width, edge_weighting=edge_weighting)

    # rank nodes by algorithm
    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(
            graph, r=None, lambda_=kwargs.get('lambda_', 0.5),
            alpha=kwargs.get('alpha', 0.5))
    else:  # 'bestcoverage', guaranteed by the validation above
        word_ranks = rank_nodes_by_bestcoverage(
            graph, k=n_keyterms, c=kwargs.get('c', 1),
            alpha=kwargs.get('alpha', 1.0))

    # sort in descending order of score
    ranked_words = sorted(word_ranks.items(), key=itemgetter(1), reverse=True)

    # bail out here if all we wanted was key *words* and not *terms*
    if not join_key_words:
        return [(word, score) for word, score in ranked_words[:n_keyterms]]

    # only the top quarter of ranked words may be joined into key terms
    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = dict(ranked_words[:top_n])

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for is_key_word, group in itertools.groupby(
            word_list, lambda word: word in top_word_ranks):
        if not is_key_word:
            continue
        words = list(group)
        term = ' '.join(words)
        if term in seen_joined_key_terms:
            continue
        seen_joined_key_terms.add(term)
        joined_key_terms.append(
            (term, sum(word_ranks[word] for word in words)))

    return sorted(joined_key_terms, key=itemgetter(1), reverse=True)[:n_keyterms]
def key_terms_from_semantic_network(doc,
                                    window_width=2,
                                    edge_weighting='binary',
                                    ranking_algo='pagerank',
                                    join_key_words=False,
                                    n_keyterms=10,
                                    **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, connected by edges and weights specified by parameters.

    Candidate terms are the tokens of ``doc`` that are not stop words, not
    punctuation, and whose part of speech is NOUN or ADJ; every token is
    normalized with ``spacy_utils.normalized_str``.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting ({'binary', 'cooc_freq'}): method used to determine
            weights of edges between nodes in the semantic network; if 'binary',
            edge weight is set to 1 for any two terms co-occurring within
            `window_width` terms; if 'cooc_freq', edge weight is set to the
            number of times that any two terms co-occur
        ranking_algo ({'pagerank', 'divrank', 'bestcoverage'}): algorithm with
            which to rank nodes in the semantic network; `pagerank` is the
            canonical (and default) algorithm, but it prioritizes node
            centrality at the expense of node diversity; the other two attempt
            to balance centrality with diversity
        join_key_words (bool): if True, join consecutive key words together
            into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score; only words ranked
            in the top 25% are eligible for joining
        n_keyterms (int or float): if int, number of top-ranked terms to return
            as keyterms; if float, must be in the half-open interval
            (0.0, 1.0], and is converted to an integer as that fraction of the
            number of unique candidate terms (truncated)
        **kwargs: tuning parameters forwarded to the ranking algorithm:
            ``lambda_`` (default 0.5) and ``alpha`` (default 0.5) for 'divrank';
            ``c`` (default 1) and ``alpha`` (default 1.0) for 'bestcoverage'

    Returns:
        list((str, float)): sorted list of top ``n_keyterms`` key terms and
        their corresponding ranking scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0], or if
            ``ranking_algo`` is not one of the supported algorithms
    """
    # fail fast, before any per-token work: an unknown algorithm would
    # otherwise leave ``word_ranks`` unassigned and surface much later as an
    # UnboundLocalError
    valid_ranking_algos = ('pagerank', 'divrank', 'bestcoverage')
    if ranking_algo not in valid_ranking_algos:
        raise ValueError(
            'ranking_algo "{}" not valid; must be in {}'.format(
                ranking_algo, valid_ranking_algos))
    if isinstance(n_keyterms, float) and not 0.0 < n_keyterms <= 1.0:
        raise ValueError(
            '`n_keyterms` must be an int, or a float between 0.0 and 1.0')

    include_pos = {'NOUN', 'ADJ'}
    word_list = []  # every token, in order; needed to find consecutive key words
    good_word_list = []  # candidate terms only; these become graph nodes
    for word in doc:
        # normalize each token once and reuse the result for both lists
        term = spacy_utils.normalized_str(word)
        word_list.append(term)
        if not word.is_stop and not word.is_punct and word.pos_ in include_pos:
            good_word_list.append(term)

    if isinstance(n_keyterms, float):
        # a fraction of the number of *unique* candidate terms
        n_keyterms = int(n_keyterms * len(set(good_word_list)))

    graph = terms_to_semantic_network(good_word_list,
                                      window_width=window_width,
                                      edge_weighting=edge_weighting)

    # rank nodes by algorithm
    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(graph,
                                           r=None,
                                           lambda_=kwargs.get('lambda_', 0.5),
                                           alpha=kwargs.get('alpha', 0.5))
    else:  # 'bestcoverage', guaranteed by the validation above
        word_ranks = rank_nodes_by_bestcoverage(graph,
                                                k=n_keyterms,
                                                c=kwargs.get('c', 1),
                                                alpha=kwargs.get('alpha', 1.0))

    # sort in descending order of score
    ranked_words = sorted(word_ranks.items(), key=itemgetter(1), reverse=True)

    # bail out here if all we wanted was key *words* and not *terms*
    if not join_key_words:
        return [(word, score) for word, score in ranked_words[:n_keyterms]]

    # only the top quarter of ranked words may be joined into key terms
    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = dict(ranked_words[:top_n])

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for is_key_word, group in itertools.groupby(
            word_list, lambda word: word in top_word_ranks):
        if not is_key_word:
            continue
        words = list(group)
        term = ' '.join(words)
        if term in seen_joined_key_terms:
            continue
        seen_joined_key_terms.add(term)
        joined_key_terms.append(
            (term, sum(word_ranks[word] for word in words)))

    return sorted(joined_key_terms, key=itemgetter(1),
                  reverse=True)[:n_keyterms]