def summarize_corpus(corpus, ratio=0.2): """ Returns a list of the most important documents of a corpus using a variation of the TextRank algorithm. The input must have at least INPUT_MIN_LENGTH documents for the summary to make sense. The length of the output can be specified using the ratio parameter, which determines how many documents will be chosen for the summary (defaults at 20% of the number of documents of the corpus). The most important documents are returned as a list sorted by the document score, highest first. """ hashable_corpus = _build_hasheable_corpus(corpus) if len(corpus) < INPUT_MIN_LENGTH: raise RuntimeError("Input corpus must have at least " + str(INPUT_MIN_LENGTH) + " documents.") graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def keywords(text, ratio=0.2, words=None, split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

def get_sentence_score_per_word(text):
    global sentence_score_per_word
    sentence_score_per_word = {}

    # From gensim's summarizer; I moved some code here because I wanted to access the pagerank results
    sentences = _clean_text_by_sentences(text)
    corpus = _build_corpus(sentences)
    hashable_corpus = _build_hasheable_corpus(corpus)
    sentences_by_corpus = dict(zip(hashable_corpus, sentences))

    graph = _build_sentence_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    pagerank_scores = _pagerank(graph)

    for sentence_id, score in pagerank_scores.items():
        sentence = sentences_by_corpus[sentence_id]
        for token in sentence.token.split():
            if token in sentence_score_per_word:
                score_dict = sentence_score_per_word[token]
                sentence_score_per_word[token]['n_sents'] = score_dict['n_sents'] + 1
                sentence_score_per_word[token]['cumulative_score'] = score_dict['cumulative_score'] + score
            else:
                sentence_score_per_word[token] = {'n_sents': 1, 'cumulative_score': score}

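# Small usage sketch for get_sentence_score_per_word above: after the call,
# the module-level `sentence_score_per_word` dict maps each processed token to
# the number of sentences it occurs in and the sum of those sentences'
# PageRank scores. `text` is a hypothetical multi-sentence input string.
get_sentence_score_per_word(text)

# Derive an average per-token sentence score from the collected statistics.
average_sentence_score = {
    token: stats['cumulative_score'] / stats['n_sents']
    for token, stats in sentence_score_per_word.items()
}
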
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False, deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

def summarize_corpus(corpus, ratio=0.2): """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_. Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` Note ---- The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense. Parameters ---------- corpus : list of list of (int, int) Given corpus. ratio : float, optional Number between 0 and 1 that determines the proportion of the number of sentences of the original text to be chosen for the summary, optional. Returns ------- list of str Most important documents of given `corpus` sorted by the document score, highest first. """ hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. if len(corpus) == 0: logger.warning("Input corpus is empty.") return [] # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) logger.info('Building graph') graph = _build_graph(hashable_corpus) logger.info('Filling graph') _set_graph_edge_weights(graph) logger.info('Removing unreachable nodes of graph') _remove_unreachable_nodes(graph) # Cannot calculate eigenvectors if number of unique documents in corpus < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return [] logger.info('Pagerank graph') pagerank_scores = _pagerank(graph) logger.info('Sorting pagerank scores') hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus, ratio=0.2): """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_. Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` Note ---- The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense. Parameters ---------- corpus : list of list of (int, int) Given corpus. ratio : float, optional Number between 0 and 1 that determines the proportion of the number of sentences of the original text to be chosen for the summary, optional. Returns ------- list of str Most important documents of given `corpus` sorted by the document score, highest first. """ hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. if len(corpus) == 0: # logger.warning("Input corpus is empty.") return [] # Warns the user if there are too few documents. # Return the whole document if len(corpus) < INPUT_MIN_LENGTH: #logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) sentence_length = len(corpus) return [list(doc) for doc in hashable_corpus[:-1]] graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) # Cannot calculate eigenvectors if number of unique documents in corpus < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: # logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return [] logger.info('Pagerank graph') pagerank_scores = _pagerank(graph) logger.info('Sorting pagerank scores') hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus, ratio=0.2, weight_threshold=1.e-3):
    """
    Returns a list of the most important documents of a corpus using a
    variation of the TextRank algorithm.

    The input must have at least INPUT_MIN_LENGTH (%d) documents for the
    summary to make sense.

    The length of the output can be specified using the ratio parameter,
    which determines how many documents will be chosen for the summary
    (defaults at 20%% of the number of documents of the corpus).

    The most important documents are returned as a list sorted by the
    document score, highest first.
    """ % INPUT_MIN_LENGTH
    hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.")

    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph, weight_threshold)
    _remove_unreachable_nodes(graph)

    pagerank_scores = _pagerank(graph)

    hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    # MODIFICATION:
    # As the input document may contain fewer than 5 sentences, the int
    # approximation of 20% of the sentences may be zero. In such cases,
    # return the whole corpus, ordered by relevance.
    if int(len(corpus) * ratio) == 0:
        return [list(doc) for doc in hashable_corpus]
    else:
        return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]

def summarize_corpus(corpus, ratio=0.2): """ Returns a list of the most important documents of a corpus using a variation of the TextRank algorithm. The input must have at least INPUT_MIN_LENGTH (%d) documents for the summary to make sense. The length of the output can be specified using the ratio parameter, which determines how many documents will be chosen for the summary (defaults at 20%% of the number of documents of the corpus). The most important documents are returned as a list sorted by the document score, highest first. """ % INPUT_MIN_LENGTH hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. if len(corpus) == 0: logger.warning("Input corpus is empty.") return [] # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: logger.warning( "Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) # Cannot calculate eigenvectors if number of unique documents in corpus < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: logger.warning( "Please add more sentences to the text. The number of reachable nodes is below 3" ) return [] pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus, ratio=0.2): """ Returns a list of the most important documents of a corpus using a variation of the TextRank algorithm. The input must have at least INPUT_MIN_LENGTH (%d) documents for the summary to make sense. The length of the output can be specified using the ratio parameter, which determines how many documents will be chosen for the summary (defaults at 20%% of the number of documents of the corpus). The most important documents are returned as a list sorted by the document score, highest first. """ % INPUT_MIN_LENGTH hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. if len(corpus) == 0: logger.warning("Input corpus is empty.") return [] # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) # Cannot calculate eigenvectors if number of unique documents in corpus < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return [] pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

def summarize_corpus(corpus): """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_. Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` Note ---- The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense. Parameters ---------- corpus : list of list of (int, int) Given corpus. ratio : float, optional Number between 0 and 1 that determines the proportion of the number of sentences of the original text to be chosen for the summary, optional. Returns ------- list of str Most important documents of given `corpus` sorted by the document score, highest first. """ hashable_corpus = _build_hasheable_corpus(corpus) logger.info('Building graph') graph = _build_graph(hashable_corpus) logger.info('Filling graph') _set_graph_edge_weights(graph) logger.info('Removing unreachable nodes of graph') _remove_unreachable_nodes(graph) logger.info('Pagerank graph') pagerank_scores = _pagerank(graph) return pagerank_scores
MAX_WORDS = 100

for i, case_filename in enumerate(case_filenames[:]):
    with open('{}/{}'.format(DATASET_LOCATION, case_filename)) as f:
        text = f.read().strip()
    text = text.split("\n", 6)[6]

    sentences = _clean_text_by_sentences(text)
    sent_for_nltk = [sent.text for sent in sentences]
    nltk_str = " ".join(sent_for_nltk)

    corpus = _build_corpus(sentences)
    hashable_corpus = _build_hasheable_corpus(corpus)
    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)
    pagerank_scores = _pagerank(graph)

    sentences_by_corpus = dict(zip(hashable_corpus, sentences))
    get_sentences = [sentences_by_corpus[tuple(doc)] for doc in hashable_corpus[:-1]]
    get_scores = [pagerank_scores.get(doc) for doc in hashable_corpus[:-1]]

    # Word frequencies, normalised by the most frequent word.
    word_frequencies = {}
    for word in nltk.word_tokenize(nltk_str):
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word] / maximum_frequency)

    sentence_scores = {}
    stopped_sentences = []

def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of returned lemmas is
        determined by the provided ratio; otherwise, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        If True, return the keywords as a list instead of a joined string.
    scores : bool, optional
        If True, return each keyword with its score.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True, lemmatize words.
    deacc : bool, optional
        If True, remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))

    get_sentence_score_per_word(text)

    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of returned lemmas is
        determined by the provided ratio; otherwise, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        If True, return the keywords as a list instead of a joined string.
    scores : bool, optional
        If True, return each keyword with its score.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True, lemmatize words.
    deacc : bool, optional
        If True, remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

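# A brief usage sketch for the keywords() variants above, which keep the call
# signature of gensim's (pre-4.0) gensim.summarization.keywords. The example
# text is adapted from gensim's documentation.
sample_text = (
    "Challenges in natural language processing frequently involve speech "
    "recognition, natural language understanding, natural language generation, "
    "connecting language and machine perception, dialog systems, or some "
    "combination thereof."
)

# Newline-joined string of the top-ranked keywords.
print(keywords(sample_text))

# A fixed number of keywords, each with its PageRank-based score.
print(keywords(sample_text, words=3, scores=True))
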
def fetch_summary_from_case(case_filename):
    MAX_WORDS = 100

    with open('{}/{}'.format(DATASET_LOCATION, case_filename)) as f:
        text = f.read().strip()
    text = text.split("\n", 6)[6]

    sentences = _clean_text_by_sentences(text)
    sent_for_nltk = [sent.text for sent in sentences]
    nltk_str = " ".join(sent_for_nltk)

    corpus = _build_corpus(sentences)
    hashable_corpus = _build_hasheable_corpus(corpus)
    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)
    pagerank_scores = _pagerank(graph)

    sentences_by_corpus = dict(zip(hashable_corpus, sentences))
    get_sentences = [sentences_by_corpus[tuple(doc)] for doc in hashable_corpus[:-1]]
    get_scores = [pagerank_scores.get(doc) for doc in hashable_corpus[:-1]]

    # Word frequencies, normalised by the most frequent word.
    word_frequencies = {}
    for word in nltk.word_tokenize(nltk_str):
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word] / maximum_frequency)

    sentence_scores = {}
    stopped_sentences = []

    # Re-join sentences that were split after a short token, an abbreviation or a path.
    sani_sent_list = []
    new_sent = sent_for_nltk[0]
    for j in range(len(sent_for_nltk) - 1):
        last_word = new_sent.split(" ")[-1]
        if last_word and last_word[-1] != ".":
            new_sent += "."
        last_word = last_word[:-1]
        if len(last_word) < 4 or "." in last_word or "/" in last_word:
            new_sent += (" " + sent_for_nltk[j + 1])
        else:
            sani_sent_list.append(new_sent)
            new_sent = sent_for_nltk[j + 1]
    if new_sent.split(" ")[-1][-1] != ".":
        new_sent += "."
    sani_sent_list.append(new_sent)

    # Frequency-based score per sentence (average word frequency).
    for sent in sani_sent_list:
        j = 0
        stopped_sent_words = []
        for word in nltk.word_tokenize(sent.lower()):
            j = j + 1
            if word in word_frequencies.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]
                stopped_sent_words.append(word)
        stopped_sentences.append(" ".join(stopped_sent_words))
        sentence_scores[sent] = sentence_scores[sent] / j

    for j, get_score in enumerate(get_scores):
        if get_scores[j] is None:
            get_scores[j] = 0

    # Combine frequency scores with the TextRank (PageRank) sentence scores.
    j = 0
    l = 0
    final_sentence_scores = {}
    for sent in sani_sent_list:
        j = j + l
        l = 0
        if sent not in final_sentence_scores.keys():
            final_sentence_scores[sent] = 0
        else:
            final_sentence_scores[sent] += sentence_scores[sent]
        for sentence in get_sentences[j:-1]:
            if sentence.text[-1] != '.':
                sentence.text += '.'
            if sent.endswith(sentence.text):
                final_sentence_scores[sent] += get_scores[j]
                l = l + 1
                break
            final_sentence_scores[sent] += get_scores[j]
            l = l + 1

    # Pick the top sentences and fit them into a MAX_WORDS budget via knapsack.
    summary_sentences = heapq.nlargest(30, final_sentence_scores, key=final_sentence_scores.get)
    size = [len(s.split(" ")) for s in summary_sentences]
    weights = [final_sentence_scores[s] / len(s.split(" ")) for s in summary_sentences]
    sol = knapsack(size, weights).solve(MAX_WORDS)
    max_weight, selected_sizes = sol
    summary = " ".join(summary_sentences[s] for s in selected_sizes)
    words_in_summary = len(summary.split(" "))
    return summary

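# A minimal sketch of the 0/1 knapsack selection the function above relies on.
# The `knapsack(size, weights).solve(capacity)` helper is not defined in the
# snippet; this hypothetical stand-in assumes it returns the best total weight
# together with the indices of the selected items, which matches how the
# result is unpacked above (`max_weight, selected_sizes = sol`).
class knapsack:
    def __init__(self, size, weights):
        self.size = size          # "cost" of each sentence (word count)
        self.weights = weights    # value of each sentence (score per word)

    def solve(self, capacity):
        n = len(self.size)
        # best[i][c] = best total value using items 0..i-1 within capacity c
        best = [[0.0] * (capacity + 1) for _ in range(n + 1)]
        for i in range(1, n + 1):
            w, v = self.size[i - 1], self.weights[i - 1]
            for c in range(capacity + 1):
                best[i][c] = best[i - 1][c]
                if w <= c and best[i - 1][c - w] + v > best[i][c]:
                    best[i][c] = best[i - 1][c - w] + v
        # Backtrack to recover which sentences were selected.
        selected, c = [], capacity
        for i in range(n, 0, -1):
            if best[i][c] != best[i - 1][c]:
                selected.append(i - 1)
                c -= self.size[i - 1]
        return best[n][capacity], sorted(selected)
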
def summarize_corpus(corpus, dictionary, sentences, ratio=0.2, redundancy_check=True, query=None):
    """Get a list of the most important documents of a corpus using a
    variation of the TextRank algorithm [1]_.

    Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer`

    Note
    ----
    The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
    to make sense.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary, optional.

    Returns
    -------
    list of str
        Most important documents of given `corpus` sorted by the document score, highest first.

    """
    # hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    '''if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)'''

    logger.info('Building graph')
    graph = _build_graph(sentences)

    logger.info('Filling graph')
    _set_graph_edge_weights(graph, dictionary)

    logger.info('Removing unreachable nodes of graph')
    _remove_unreachable_nodes(graph)

    # Cannot calculate eigenvectors if number of unique documents in corpus < 3.
    # Warns user to add more text. The function ends.
    if len(graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    logger.info('Pagerank graph')
    pagerank_scores = _pagerank(graph)

    logger.info('Sorting pagerank scores')
    sentences.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    if redundancy_check or (query is not None):
        selected = []
        counter = 0
        while len(selected) <= int(len(corpus) * ratio) and counter < len(corpus):
            sentence_words = sentences[counter].token.split()
            if redundancy_check and _is_redundant(sentence_words, selected):
                counter += 1
                continue
            if _is_related_to_query(sentence_words, query):
                selected.append(sentences[counter])
            counter += 1
        return selected

    return sentences[:int(len(corpus) * ratio)]

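# The variant above calls _is_redundant and _is_related_to_query without
# defining them. The sketch below is one hypothetical reading of those
# helpers, not the original implementation: redundancy as word overlap with
# already selected sentences (the 0.5 threshold is an assumption), and query
# relatedness as sharing at least one query token.
def _is_redundant(sentence_words, selected, threshold=0.5):
    """Return True if `sentence_words` overlaps too much with any selected sentence."""
    words = set(sentence_words)
    if not words:
        return False
    for other in selected:
        overlap = words & set(other.token.split())
        if float(len(overlap)) / len(words) > threshold:
            return True
    return False


def _is_related_to_query(sentence_words, query):
    """Return True if no query is given, or the sentence shares a token with it."""
    if query is None:
        return True
    return bool(set(sentence_words) & set(query))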