Example #1
def keywords(text, ratio=0.2, words=None, split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio,
                                       words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
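A minimal usage sketch for this variant. The import path assumes the gensim < 4.0 package layout (the summarization module was removed in gensim 4.0), and the sample text is purely illustrative:

from gensim.summarization import keywords  # gensim < 4.0 layout assumed

sample = ("Challenges in natural language processing frequently involve "
          "speech recognition, natural language understanding, and natural "
          "language generation.")
print(keywords(sample))               # newline-separated keywords
print(keywords(sample, scores=True))  # list of (keyword, score) tuples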
Example #2
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False, deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def get_graph(text):
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
Example #4
def get_graph(text):
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
def _strip_word(word):
    """Get cleaned `word`.

    Parameters
    ----------
    word : str
        Given word.

    Returns
    -------
    str
        Cleaned word.
    """
    stripped_word_list = list(_tokenize_by_word(word))
    return stripped_word_list[0] if stripped_word_list else ""
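For reference, a sketch of the helper's behavior: it runs the word tokenizer over a single token and keeps only the first result, so punctuation and casing are normalized the same way as in the main pipeline. The concrete outputs below are assumptions that depend on the tokenizer's lowercasing and deaccenting settings:

_strip_word("Trees,")  # -> "trees"  (punctuation stripped, lowercased; assumed)
_strip_word("...")     # -> ""       (nothing tokenizable survives)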
Example #6
def _strip_word(word):
    """Get cleaned `word`.

    Parameters
    ----------
    word : str
        Given word.

    Returns
    -------
    str
        Cleaned word.
    """
    stripped_word_list = list(_tokenize_by_word(word))
    return stripped_word_list[0] if stripped_word_list else ""
def get_graph(text):
    """Creates and returns graph from given text, cleans and tokenize text before building graph.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    :class:`~gensim.summarization.graph.Graph`
        Created graph.

    """
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
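A short sketch of inspecting the resulting co-occurrence graph. The nodes() and edges() calls follow gensim's Graph interface, the import path assumes the gensim < 4.0 layout, and the sample text is illustrative:

from gensim.summarization.keywords import get_graph  # gensim < 4.0 layout assumed

g = get_graph("Rare and common words alike become nodes; words that "
              "co-occur within the sliding window become connected by edges.")
print(g.nodes())        # candidate words kept for ranking
print(len(g.edges()))   # number of co-occurrence edges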
Example #8
def get_graph(text):
    """Creates and returns graph from given text, cleans and tokenize text before building graph.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    :class:`~gensim.summarization.graph.Graph`
        Created graph.

    """
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)

    return graph
def keywords(text,
             ratio=0.2,
             words=None,
             split=False,
             scores=False,
             pos_filter=('NN', 'JJ'),
             lemmatize=False,
             deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio,
                                       words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
Example #10
def keywords(text, ratio=0.2, words=None, split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def mz_keywords(text,
                blocksize=1024,
                scores=False,
                split=False,
                weighted=True,
                threshold=0.0):
    """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_

    Parameters
    ----------
    text: str
        Document for summarization.
    blocksize: int, optional
        Size of blocks to use in analysis.
    scores: bool, optional
        Whether to return score with keywords.
    split: bool, optional
        Whether to return results as list.
    weighted: bool, optional
        Whether to weight scores by word frequency.
        False can be useful for shorter texts, and allows automatic thresholding.
    threshold: float or 'auto', optional
        Minimum score for returned keywords. 'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1e-8;
        use 'auto' with `weighted=False`.

    Returns
    -------
    results: str
        newline-separated keywords if `split` == False **OR**
    results: list(str)
        list of keywords if `scores` == False **OR**
    results: list(tuple(str, float))
        list of (keyword, score) tuples if `scores` == True

    Results are returned in descending order of score regardless of the format.

    Note
    ----
    This algorithm looks for keywords that contribute to the structure of the
    text on scales of `blocksize` words or larger. It is suitable for extracting
    keywords representing the major themes of long texts.

    References
    ----------
    .. [1] Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in
           written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153,
           DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558

    """
    text = to_unicode(text)
    words = [word for word in _tokenize_by_word(text)]
    vocab = sorted(set(words))
    word_counts = numpy.array(
        [[words[i:i + blocksize].count(word) for word in vocab]
         for i in range(0, len(words), blocksize)]).astype('d')
    n_blocks = word_counts.shape[0]
    totals = word_counts.sum(axis=0)
    n_words = totals.sum()
    p = word_counts / totals
    log_p = numpy.log2(p)
    h = numpy.nan_to_num(p * log_p).sum(axis=0)
    analytic = __analytic_entropy(blocksize, n_blocks, n_words)
    h += analytic(totals).astype('d')
    if weighted:
        h *= totals / n_words
    if threshold == 'auto':
        threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
    weights = [(word, score) for (word, score) in zip(vocab, h)
               if score > threshold]
    weights.sort(key=lambda x: -x[1])
    result = weights if scores else [word for (word, score) in weights]
    if not (scores or split):
        result = '\n'.join(result)
    return result
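A hedged usage sketch for this function. The import path assumes the gensim < 4.0 layout, where it lives in gensim.summarization.mz_entropy, and "large_document.txt" is a placeholder for any long plain-text file:

from gensim.summarization.mz_entropy import mz_keywords  # gensim < 4.0 layout assumed

with open("large_document.txt") as fh:  # placeholder path
    text = fh.read()

# Frequency-weighted scores for a long text ...
print(mz_keywords(text, blocksize=1024, scores=True)[:10])
# ... or unweighted scores with the automatic threshold, as the docstring suggests for shorter texts.
print(mz_keywords(text, weighted=False, threshold="auto", split=True))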
def _strip_word(word):
    stripped_word_list = list(_tokenize_by_word(word))
    return stripped_word_list[0] if stripped_word_list else ""
Example #13
def mz_keywords(text, blocksize=1024, scores=False, split=False, weighted=True, threshold=0.0):
    """Extract keywords from text using the Montemurro and Zanette entropy algorithm. [1]_

    Parameters
    ----------
    text: str
        Document for summarization.
    blocksize: int, optional
        Size of blocks to use in analysis.
    scores: bool, optional
        Whether to return score with keywords.
    split: bool, optional
        Whether to return results as list.
    weighted: bool, optional
        Whether to weight scores by word frequency.
        False can be useful for shorter texts, and allows automatic thresholding.
    threshold: float or 'auto', optional
        Minimum score for returned keywords. 'auto' calculates the threshold as n_blocks / (n_blocks + 1.0) + 1e-8;
        use 'auto' with `weighted=False`.

    Returns
    -------
    results: str
        newline-separated keywords if `split` == False **OR**
    results: list(str)
        list of keywords if `scores` == False **OR**
    results: list(tuple(str, float))
        list of (keyword, score) tuples if `scores` == True

    Results are returned in descending order of score regardless of the format.

    Note
    ----
    This algorithm looks for keywords that contribute to the structure of the
    text on scales of `blocksize` words or larger. It is suitable for extracting
    keywords representing the major themes of long texts.

    References
    ----------
    .. [1] Marcello A Montemurro, Damian Zanette, "Towards the quantification of the semantic information encoded in
           written language". Advances in Complex Systems, Volume 13, Issue 2 (2010), pp. 135-153,
           DOI: 10.1142/S0219525910002530, https://arxiv.org/abs/0907.1558

    """
    text = to_unicode(text)
    words = [word for word in _tokenize_by_word(text)]
    vocab = sorted(set(words))
    word_counts = count_freqs_by_blocks(words, vocab, blocksize)
    n_blocks = word_counts.shape[0]
    totals = word_counts.sum(axis=0)
    n_words = totals.sum()
    p = word_counts / totals
    log_p = np.log2(p)
    h = np.nan_to_num(p * log_p).sum(axis=0)
    analytic = __analytic_entropy(blocksize, n_blocks, n_words)
    h += analytic(totals).astype('d', copy=False)
    if weighted:
        h *= totals / n_words
    if threshold == 'auto':
        threshold = n_blocks / (n_blocks + 1.0) + 1.0e-8
    weights = [(word, score) for (word, score) in zip(vocab, h) if score > threshold]
    weights.sort(key=lambda x: -x[1])
    result = weights if scores else [word for (word, score) in weights]
    if not (scores or split):
        result = '\n'.join(result)
    return result
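This variant delegates the per-block counting to a count_freqs_by_blocks helper that is not shown in the listing. Judging from the inline computation in the earlier mz_keywords example, it presumably looks something like the following sketch (the name and signature are taken from the call above; the body is an assumption):

import numpy as np

def count_freqs_by_blocks(words, vocab, blocksize):
    # Count occurrences of each vocabulary word inside consecutive blocks
    # of `blocksize` words, returning an (n_blocks, n_vocab) float array.
    return np.array(
        [[words[i:i + blocksize].count(word) for word in vocab]
         for i in range(0, len(words), blocksize)]
    ).astype('d')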
def keywords(text,
             ratio=0.2,
             words=None,
             split=False,
             scores=False,
             pos_filter=('NN', 'JJ'),
             lemmatize=False,
             deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------

    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of sentences is reduced by the provided ratio,
        else, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        If True, return keywords as a list instead of a newline-joined string.
    scores : bool, optional
        If True, return the score of each keyword together with the keyword.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    get_sentence_score_per_word(text)
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio,
                                       words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
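The graph.iter_edges() guard in this variant means that input too short to produce any co-occurrence edges returns an empty result instead of failing inside _pagerank. A sketch of the expected behavior, with output shapes inferred from _format_results rather than verified against a particular release:

keywords("Hello.")              # -> ""  (empty string when split=False, scores=False; assumed)
keywords("Hello.", split=True)  # -> []  (empty list when split=True; assumed)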
Example #15
def _strip_word(word):
    stripped_word_list = list(_tokenize_by_word(word))
    return stripped_word_list[0] if stripped_word_list else ""
Example #16
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'),
             lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------

    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of sentences is reduced by the provided ratio,
        else, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        If True, return keywords as a list instead of a newline-joined string.
    scores : bool, optional
        If True, return the score of each keyword together with the keyword.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
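Finally, a usage sketch combining the optional arguments of this last variant. The import path again assumes the gensim < 4.0 layout, the sample text is illustrative, and the POS tags are the Penn Treebank defaults already used by the function:

from gensim.summarization import keywords  # gensim < 4.0 layout assumed

sample = ("Deep learning systems for natural language processing learn "
          "distributed representations of words, sentences and documents.")
# Top 5 nouns/adjectives, lemmatized, returned together with their scores.
print(keywords(sample, words=5, pos_filter=('NN', 'JJ'),
               lemmatize=True, scores=True))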