Code example #1
File: doc.py  Project: tozammel/textacy
    def to_semantic_network(self,
                            nodes='words',
                            normalize='lemma',
                            edge_weighting='default',
                            window_width=10):
        """
        Transform ``Doc`` into a semantic network, where nodes are either 'words'
        or 'sents' and edges between nodes may be weighted in different ways.

        Args:
            nodes ({'words', 'sents'}): type of doc component to use as nodes
                in the semantic network
            normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
                lowercase terms; if false-y, use the form of terms as they appear
                in doc; if a callable, must accept a ``spacy.Token`` or ``spacy.Span``
                (if ``nodes`` = 'words' or 'sents', respectively) and return a
                str, e.g. :func:`textacy.spacy_utils.normalized_str()`
            edge_weighting (str): type of weighting to apply to edges
                between nodes; if ``nodes == 'words'``, options are {'cooc_freq', 'binary'},
                if ``nodes == 'sents'``, options are {'cosine', 'jaccard'}; if
                'default', 'cooc_freq' or 'cosine' will be automatically used
            window_width (int): size of sliding window over terms that
                determines which are said to co-occur; only applicable if ``nodes == 'words'``

        Returns:
            :class:`networkx.Graph <networkx.Graph>`: where nodes represent either
                terms or sentences in doc; edges, the relationships between them

        Raises:
            ValueError: if ``nodes`` is neither 'words' nor 'sents'

        See Also:
            :func:`terms_to_semantic_network() <textacy.network.terms_to_semantic_network>`
            :func:`sents_to_semantic_network() <textacy.network.sents_to_semantic_network>`
        """
        if nodes == 'words':
            if edge_weighting == 'default':
                edge_weighting = 'cooc_freq'
            return network.terms_to_semantic_network(
                list(textacy.extract.words(self)),
                normalize=normalize,
                window_width=window_width,
                edge_weighting=edge_weighting)
        elif nodes == 'sents':
            if edge_weighting == 'default':
                edge_weighting = 'cosine'
            return network.sents_to_semantic_network(
                list(self.sents),
                normalize=normalize,
                edge_weighting=edge_weighting)
        else:
            msg = 'nodes "{}" not valid; must be in {}'.format(
                nodes, {'words', 'sents'})
            raise ValueError(msg)
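
A quick usage sketch for this method (not part of the source above) might look like the following. It assumes a spaCy 1.x-era English model is installed and that ``textacy.Doc`` is the top-level document class, as in this version of the library; the sample text is illustrative only.

    import textacy

    # Illustrative input; any English prose works.
    text = ('Semantic networks link words that co-occur. '
            'Ranking nodes in such networks surfaces key terms.')
    doc = textacy.Doc(text, lang='en')

    # Build a word co-occurrence network; 'default' resolves to 'cooc_freq'.
    graph = doc.to_semantic_network(nodes='words', normalize='lemma',
                                    edge_weighting='default', window_width=5)

    # The result is an ordinary networkx.Graph, so standard graph tooling applies.
    for u, v, data in graph.edges(data=True):
        print(u, v, data.get('weight'))
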
Code example #2
File: doc.py  Project: chartbeat-labs/textacy
    def to_semantic_network(self, nodes='words',
                            edge_weighting='default', window_width=10):
        """
        Transform ``Doc`` into a semantic network, where nodes are either 'words'
        or 'sents' and edges between nodes may be weighted in different ways.

        Args:
            nodes ({'words', 'sents'}): type of doc component to use as nodes
                in the semantic network
            edge_weighting (str): type of weighting to apply to edges
                between nodes; if ``nodes == 'words'``, options are {'cooc_freq', 'binary'},
                if ``nodes == 'sents'``, options are {'cosine', 'jaccard'}; if
                'default', 'cooc_freq' or 'cosine' will be automatically used
            window_width (int): size of sliding window over terms that
                determines which are said to co-occur; only applicable if ``nodes == 'words'``

        Returns:
            :class:`networkx.Graph <networkx.Graph>`: where nodes represent either
                terms or sentences in doc; edges, the relationships between them

        Raises:
            ValueError: if ``nodes`` is neither 'words' nor 'sents'

        See Also:
            :func:`terms_to_semantic_network() <textacy.network.terms_to_semantic_network>`
            :func:`sents_to_semantic_network() <textacy.network.sents_to_semantic_network>`
        """
        if nodes == 'words':
            if edge_weighting == 'default':
                edge_weighting = 'cooc_freq'
            return network.terms_to_semantic_network(
                list(textacy.extract.words(self)),
                window_width=window_width,
                edge_weighting=edge_weighting)
        elif nodes == 'sents':
            if edge_weighting == 'default':
                edge_weighting = 'cosine'
            return network.sents_to_semantic_network(
                list(self.sents), edge_weighting=edge_weighting)
        else:
            msg = 'nodes "{}" not valid; must be in {}'.format(
                nodes, {'words', 'sents'})
            raise ValueError(msg)
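
``terms_to_semantic_network`` itself is not shown in these examples. A minimal sketch of the sliding-window co-occurrence weighting it is documented to perform might look like this; the function below illustrates the technique and is not textacy's actual implementation.

    import itertools

    import networkx as nx

    def cooc_network(words, window_width=2, edge_weighting='cooc_freq'):
        """Illustrative windowed co-occurrence graph (not textacy's code)."""
        graph = nx.Graph()
        # Slide a window of `window_width` terms across the sequence; each
        # pair of distinct terms inside a window counts as one co-occurrence.
        for i in range(max(1, len(words) - window_width + 1)):
            for w1, w2 in itertools.combinations(words[i:i + window_width], 2):
                if w1 == w2:
                    continue
                if edge_weighting == 'binary' or not graph.has_edge(w1, w2):
                    graph.add_edge(w1, w2, weight=1)
                else:  # 'cooc_freq': accumulate the co-occurrence count
                    graph[w1][w2]['weight'] += 1
        return graph

    g = cooc_network(['semantic', 'network', 'of', 'terms'], window_width=3)
    print(g['semantic']['network'])  # {'weight': 1}
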
Code example #3
def key_terms_from_semantic_network(doc, normalize='lemma',
                                    window_width=2, edge_weighting='binary',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10, **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, with edges and edge weights determined by the parameters below.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        normalize (str or callable): if 'lemma', lemmatize terms; if 'lower',
            lowercase terms; if None, use the form of terms as they appeared in
            ``doc``; if a callable, must accept a ``spacy.Token`` and return a str,
            e.g. :func:`textacy.spacy_utils.normalized_str()`
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting ({'binary', 'cooc_freq'}): method used to
            determine weights of edges between nodes in the semantic network;
            if 'binary', edge weight is set to 1 for any two terms co-occurring
            within `window_width` terms; if 'cooc_freq', edge weight is set to
            the number of times that any two terms co-occur
        ranking_algo ({'pagerank', 'divrank', 'bestcoverage'}):
            algorithm with which to rank nodes in the semantic network;
            `pagerank` is the canonical (and default) algorithm, but it prioritizes
            node centrality at the expense of node diversity; the other two
            attempt to balance centrality with diversity
        join_key_words (bool): if True, join consecutive key words
            together into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score
        n_keyterms (int or float): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the half-open interval
            (0.0, 1.0] and is converted to an integer by ``round(len(doc) * n_keyterms)``

    Returns:
        List[Tuple[str, float]]: sorted list of top ``n_keyterms`` key terms and
            their corresponding ranking scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]
    """
    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(round(len(doc) * n_keyterms))

    include_pos = {'NOUN', 'PROPN', 'ADJ'}
    if normalize == 'lemma':
        word_list = [word.lemma_ for word in doc]
        good_word_list = [word.lemma_ for word in doc
                          if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
    elif normalize == 'lower':
        word_list = [word.lower_ for word in doc]
        good_word_list = [word.lower_ for word in doc
                          if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
    elif not normalize:
        word_list = [word.text for word in doc]
        good_word_list = [word.text for word in doc
                          if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
    else:
        word_list = [normalize(word) for word in doc]
        good_word_list = [normalize(word) for word in doc
                          if not word.is_stop and not word.is_punct and word.pos_ in include_pos]

    # HACK: omit empty strings, which happen as a bug in spacy as of v1.5
    # and may well happen with ``normalize`` as a callable
    # an empty string should never be considered a keyterm
    good_word_list = [word for word in good_word_list if word]
    graph = terms_to_semantic_network(
        good_word_list, window_width=window_width, edge_weighting=edge_weighting)

    # rank nodes by algorithm, and sort in descending order
    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(
            graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5))
    elif ranking_algo == 'bestcoverage':
        word_ranks = rank_nodes_by_bestcoverage(
            graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0))

    # bail out here if all we wanted was key *words* and not *terms*
    if join_key_words is False:
        return [(word, score) for word, score in
                sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]]

    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = {word: rank for word, rank in
                      sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:top_n]}

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks):
        if key is True:
            words = list(group)
            term = ' '.join(words)
            if term in seen_joined_key_terms:
                continue
            seen_joined_key_terms.add(term)
            joined_key_terms.append((term, sum(word_ranks[word] for word in words)))

    return sorted(joined_key_terms, key=itemgetter(1, 0), reverse=True)[:n_keyterms]
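
A usage sketch for this function (assuming the module-level imports seen in the body above -- ``nx``, ``itertools``, ``itemgetter``, ``terms_to_semantic_network`` -- are in scope, and that a spaCy 1.x-era English model is installed; the sample text is illustrative only):

    import spacy

    nlp = spacy.load('en')
    doc = nlp('Semantic networks of terms can surface the key terms of a '
              'document. Key term extraction ranks nodes in the term network.')

    # Key *words*: top 5 lemmas by PageRank over a binary co-occurrence graph.
    print(key_terms_from_semantic_network(
        doc, normalize='lemma', window_width=2, edge_weighting='binary',
        ranking_algo='pagerank', n_keyterms=5))

    # Key *terms*: join consecutive top-ranked words into longer phrases;
    # the float cutoff is converted via round(len(doc) * 0.1).
    print(key_terms_from_semantic_network(doc, join_key_words=True,
                                          n_keyterms=0.1))
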
Code example #4
File: keyterms.py  Project: chartbeat-labs/textacy
def key_terms_from_semantic_network(doc, window_width=2, edge_weighting='binary',
                                    ranking_algo='pagerank', join_key_words=False,
                                    n_keyterms=10, **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, with edges and edge weights determined by the parameters below.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting ({'binary', 'cooc_freq'}): method used to
            determine weights of edges between nodes in the semantic network;
            if 'binary', edge weight is set to 1 for any two terms co-occurring
            within `window_width` terms; if 'cooc_freq', edge weight is set to
            the number of times that any two terms co-occur
        ranking_algo ({'pagerank', 'divrank', 'bestcoverage'}):
            algorithm with which to rank nodes in the semantic network;
            `pagerank` is the canonical (and default) algorithm, but it prioritizes
            node centrality at the expense of node diversity; the other two
            attempt to balance centrality with diversity
        join_key_words (bool): if True, join consecutive key words
            together into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score
        n_keyterms (int or float): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the half-open interval
            (0.0, 1.0], representing the fraction of top-ranked terms to return as keyterms

    Returns:
        list((str, float)): sorted list of top ``n_keyterms`` key terms and their
            corresponding ranking scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]
    """
    word_list = [spacy_utils.normalized_str(word) for word in doc]
    good_word_list = [spacy_utils.normalized_str(word)
                      for word in doc
                      if not word.is_stop and not word.is_punct and word.pos_ in {'NOUN', 'ADJ'}]

    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(n_keyterms * len(set(good_word_list)))

    graph = terms_to_semantic_network(
        good_word_list, window_width=window_width, edge_weighting=edge_weighting)

    # rank nodes by algorithm, and sort in descending order
    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(
            graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5))
    elif ranking_algo == 'bestcoverage':
        word_ranks = rank_nodes_by_bestcoverage(
            graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0))

    # bail out here if all we wanted was key *words* and not *terms*
    if join_key_words is False:
        return [(word, score) for word, score in
                sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]]

    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = {word: rank for word, rank in
                      sorted(word_ranks.items(), key=itemgetter(1), reverse=True)[:top_n]}

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks):
        if key is True:
            words = list(group)
            term = ' '.join(words)
            if term in seen_joined_key_terms:
                continue
            seen_joined_key_terms.add(term)
            joined_key_terms.append((term, sum(word_ranks[word] for word in words)))

    return sorted(joined_key_terms, key=itemgetter(1), reverse=True)[:n_keyterms]
Code example #5
def key_terms_from_semantic_network(doc,
                                    window_width=2,
                                    edge_weighting='binary',
                                    ranking_algo='pagerank',
                                    join_key_words=False,
                                    n_keyterms=10,
                                    **kwargs):
    """
    Extract key terms from a document by ranking nodes in a semantic network of
    terms, with edges and edge weights determined by the parameters below.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        window_width (int): width of sliding window in which term
            co-occurrences are said to occur
        edge_weighting ({'binary', 'cooc_freq'}): method used to
            determine weights of edges between nodes in the semantic network;
            if 'binary', edge weight is set to 1 for any two terms co-occurring
            within `window_width` terms; if 'cooc_freq', edge weight is set to
            the number of times that any two terms co-occur
        ranking_algo ({'pagerank', 'divrank', 'bestcoverage'}):
            algorithm with which to rank nodes in the semantic network;
            `pagerank` is the canonical (and default) algorithm, but it prioritizes
            node centrality at the expense of node diversity; the other two
            attempt to balance centrality with diversity
        join_key_words (bool): if True, join consecutive key words
            together into longer key terms, taking the sum of the constituent words'
            scores as the joined key term's combined score
        n_keyterms (int or float): if int, number of top-ranked terms
            to return as keyterms; if float, must be in the half-open interval
            (0.0, 1.0], representing the fraction of top-ranked terms to return as keyterms

    Returns:
        list((str, float)): sorted list of top ``n_keyterms`` key terms and their
            corresponding ranking scores

    Raises:
        ValueError: if ``n_keyterms`` is a float but not in (0.0, 1.0]
    """
    word_list = [spacy_utils.normalized_str(word) for word in doc]
    good_word_list = [
        spacy_utils.normalized_str(word) for word in doc if not word.is_stop
        and not word.is_punct and word.pos_ in {'NOUN', 'ADJ'}
    ]

    if isinstance(n_keyterms, float):
        if not 0.0 < n_keyterms <= 1.0:
            raise ValueError(
                '`n_keyterms` must be an int, or a float between 0.0 and 1.0')
        n_keyterms = int(n_keyterms * len(set(good_word_list)))

    graph = terms_to_semantic_network(good_word_list,
                                      window_width=window_width,
                                      edge_weighting=edge_weighting)

    # rank nodes by algorithm, and sort in descending order
    if ranking_algo == 'pagerank':
        word_ranks = nx.pagerank_scipy(graph, weight='weight')
    elif ranking_algo == 'divrank':
        word_ranks = rank_nodes_by_divrank(graph,
                                           r=None,
                                           lambda_=kwargs.get('lambda_', 0.5),
                                           alpha=kwargs.get('alpha', 0.5))
    elif ranking_algo == 'bestcoverage':
        word_ranks = rank_nodes_by_bestcoverage(graph,
                                                k=n_keyterms,
                                                c=kwargs.get('c', 1),
                                                alpha=kwargs.get('alpha', 1.0))

    # bail out here if all we wanted was key *words* and not *terms*
    if join_key_words is False:
        return [(word, score) for word, score in sorted(
            word_ranks.items(), key=itemgetter(1), reverse=True)[:n_keyterms]]

    top_n = int(0.25 * len(word_ranks))
    top_word_ranks = {
        word: rank
        for word, rank in sorted(
            word_ranks.items(), key=itemgetter(1), reverse=True)[:top_n]
    }

    # join consecutive key words into key terms
    seen_joined_key_terms = set()
    joined_key_terms = []
    for key, group in itertools.groupby(word_list,
                                        lambda word: word in top_word_ranks):
        if key is True:
            words = list(group)
            term = ' '.join(words)
            if term in seen_joined_key_terms:
                continue
            seen_joined_key_terms.add(term)
            joined_key_terms.append(
                (term, sum(word_ranks[word] for word in words)))

    return sorted(joined_key_terms, key=itemgetter(1),
                  reverse=True)[:n_keyterms]
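
The ``join_key_words`` branch shared by examples #3, #4, and #5 hinges on ``itertools.groupby`` keyed on membership in the top-ranked set: consecutive top-ranked words come out as one group, i.e. one candidate phrase. A tiny standalone illustration of that grouping trick, with made-up words and ranks:

    import itertools

    word_ranks = {'semantic': 0.5, 'network': 0.25, 'term': 0.125}
    word_list = ['a', 'semantic', 'network', 'links', 'every', 'term']

    # groupby emits maximal runs of consecutive words sharing the same key;
    # runs where the key is True are runs of top-ranked words.
    for is_key, group in itertools.groupby(word_list, lambda w: w in word_ranks):
        if is_key:
            words = list(group)
            print(' '.join(words), sum(word_ranks[w] for w in words))
    # -> semantic network 0.75
    #    term 0.125
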