Example #1
def keywords(text,
             ratio=0.2,
             words=None,
             language="english",
             split=False,
             scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio,
                                       words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() keeps numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
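This appears to match the TextRank keyword-extraction pipeline as implemented in the summa library: build a co-occurrence graph over lemmas, rank nodes with PageRank, then map the top lemmas back to surface words. A minimal usage sketch, assuming the function above is importable; the sample text and the commented output shapes are illustrative, not taken from the example:

# Hypothetical usage; sample text and expected outputs are illustrative.
text = ("Automatic summarization is the process of shortening a text "
        "document with software to create a summary that retains the "
        "most important points of the original document.")

print(keywords(text, ratio=0.5))             # newline-separated keyword string
print(keywords(text, words=3, scores=True))  # e.g. [("document", 0.31), ...]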
Example #2
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns a dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
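The sentence-level variant works the same way, with whole sentences as graph nodes and similarity coefficients as edge weights. A minimal usage sketch, assuming the function above is importable; the sample text is illustrative:

# Hypothetical usage; the sample text is illustrative.
text = ("Rice is the seed of the grass species Oryza sativa. "
        "As a cereal grain, it is the most widely consumed staple food. "
        "It is the agricultural commodity with the third-highest worldwide production. "
        "Wild rice is also eaten in some parts of the world.")

print(summarize(text, ratio=0.5))             # top sentences as one string
print(summarize(text, words=25, split=True))  # the same selection as a list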
Example #3
def summarize(text, namscores, original='pagerank', ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns a dict of sentence -> score
    pagerank_scores = _pagerank(graph, namscores, original)

    # For the 'suraj' variant, each sentence's PageRank score is multiplied
    # by its externally supplied namscore before ranking.
    if original == 'suraj':
        pagerank_scores = {node: pagerank_scores[node] * namscores[node]
                           for node in graph.nodes()}

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
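Example #3 differs from Example #2 only in the ranking step: _pagerank receives the externally supplied namscores, and with original == 'suraj' each sentence's PageRank score is additionally multiplied by its namscore before ranking. A small illustration of that combination step with made-up numbers:

# Illustrative values only: element-wise product of PageRank scores and
# externally supplied namscores, keyed by graph node.
pagerank_scores = {"s1": 0.40, "s2": 0.35, "s3": 0.25}
namscores       = {"s1": 0.10, "s2": 0.90, "s3": 0.50}
combined = {node: pagerank_scores[node] * namscores[node] for node in pagerank_scores}
# combined == {"s1": 0.04, "s2": 0.315, "s3": 0.125}, so "s2" now ranks highest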
Example #4
def keywords(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text, language)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text # It's no longer used

    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() keeps numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
Example #5
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns a dict of sentence -> score
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)

import pandas as pd  # required for the score table built in summarize below

def summarize(text1,
              text2,
              ratio=0.2,
              words=None,
              language="english",
              split=False,
              scores=False,
              additional_stopwords=None):
    if not isinstance(text1, str):
        raise ValueError("text1 parameter must be a Unicode object (str)!")
    if not isinstance(text2, str):
        raise ValueError("text2 parameter must be a Unicode object (str)!")

    # Gets a list of processed sentences. Each sentence is created as a
    # syntactic unit carrying its source article number for later reference.
    sentencesText1 = _clean_text_by_sentences(
        text1, source=0, additional_stopwords=additional_stopwords)
    sentencesText2 = _clean_text_by_sentences(
        text2, source=1, additional_stopwords=additional_stopwords)
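    # The texts are cleaned a second time so that allSentences holds sentence
    # objects distinct from sentencesText1/sentencesText2; reusing those lists
    # would let the combined scores overwrite the per-text scores set below.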
    allSentences = _clean_text_by_sentences(
        text1, source=0,
        additional_stopwords=additional_stopwords) + _clean_text_by_sentences(
            text2, source=1, additional_stopwords=additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graphCombined = _build_graph([sentence.token for sentence in allSentences])
    _set_graph_edge_weights(graphCombined)

    graph1 = _build_graph([sentence.token for sentence in sentencesText1])
    _set_graph_edge_weights(graph1)

    graph2 = _build_graph([sentence.token for sentence in sentencesText2])
    _set_graph_edge_weights(graph2)

    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graphCombined)
    _remove_unreachable_nodes(graph1)
    _remove_unreachable_nodes(graph2)

    # PageRank cannot be run in an empty graph.
    if len(graphCombined.nodes()) == 0 or len(graph1.nodes()) == 0 or len(graph2.nodes()) == 0:
        return [] if split else ""

    # Ranks the sentences using the PageRank algorithm. Returns a dict of sentence -> score
    pagerank_scores_combined = _pagerank(graphCombined)
    pagerank_scores_1 = _pagerank(graph1)
    pagerank_scores_2 = _pagerank(graph2)

    # print("pagerank_scores_combined")
    # print(pagerank_scores_combined)
    # print("pagerank_scores_1")
    # print(pagerank_scores_1)
    # print("pagerank_scores_2")
    # print(pagerank_scores_2)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentencesText1, pagerank_scores_1)
    _add_scores_to_sentences(sentencesText2, pagerank_scores_2)
    _add_scores_to_sentences(allSentences, pagerank_scores_combined)

    # Builds a table comparing each sentence's combined-graph score with its
    # score in the corresponding per-text graph.
    sentenceScores = []
    for sentence in allSentences:
        # .get() guards against sentences pruned from the combined graph as
        # unreachable while surviving their per-text graph.
        if sentence.token in pagerank_scores_1:
            sentenceScores.append({
                "sentence": sentence.token,
                "combinedScore": pagerank_scores_combined.get(sentence.token),
                "summary1Score": pagerank_scores_1[sentence.token]
            })
        if sentence.token in pagerank_scores_2:
            sentenceScores.append({
                "sentence": sentence.token,
                "combinedScore": pagerank_scores_combined.get(sentence.token),
                "summary2Score": pagerank_scores_2[sentence.token]
            })

    df = pd.DataFrame(sentenceScores)
    print(df)

    # Prints each sentence with its per-text or combined score for inspection.
    print("_____TEXT1_____")
    for sentence in sentencesText1:
        print(sentence)
        print(sentence.score)
    print("_____TEXT2_____")
    for sentence in sentencesText2:
        print(sentence)
        print(sentence.score)
    print("_____TEXT_COMBINED_____")
    for sentence in allSentences:
        print(sentence)
        print(sentence.score)

    # Extracts the most important sentences with the selected criterion.
    summary1 = _extract_most_important_sentences(sentencesText1, ratio, words, 0)
    summary2 = _extract_most_important_sentences(sentencesText2, ratio, words, 1)
    summary1_combined = _extract_most_important_sentences(allSentences, ratio, words, 0)
    summary2_combined = _extract_most_important_sentences(allSentences, ratio, words, 1)

    # We want combined_graphs_similarity > separate_graphs_similarity
    separate_graphs_similarity = _get_similarity(summary1, summary2)
    combined_graphs_similarity = _get_similarity(summary1_combined,
                                                 summary2_combined)

    # print("Similarity of separate graphs: " + str(separate_graphs_similarity))
    # print("Similarity of combined graphs: " + str(combined_graphs_similarity))

    # Sorts the extracted sentences by their order of appearance in the original text.
    summary1.sort(key=lambda s: s.index)
    summary2.sort(key=lambda s: s.index)
    summary1_combined.sort(key=lambda s: s.index)
    summary2_combined.sort(key=lambda s: s.index)

    summary1Text = summaryText(summary1)
    summary2Text = summaryText(summary2)
    summary1CombinedText = summaryText(summary1_combined)
    summary2CombinedText = summaryText(summary2_combined)

    # Returns both summaries from each approach, together with the similarity
    # scores of the separately and jointly computed summary pairs.
    return (summary1Text, summary2Text, summary1CombinedText,
            summary2CombinedText, separate_graphs_similarity,
            combined_graphs_similarity)
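A sketch of how this two-document variant might be called; the file names are illustrative, and the helpers it relies on (summaryText, _get_similarity, and _extract_most_important_sentences with a source argument) are assumed to be defined alongside the function:

# Hypothetical usage; file names are illustrative.
with open("article1.txt") as f1, open("article2.txt") as f2:
    text1, text2 = f1.read(), f2.read()

(summary1, summary2,
 summary1_combined, summary2_combined,
 separate_sim, combined_sim) = summarize(text1, text2, ratio=0.3)

# The combined-graph summaries are expected to be more similar to each other
# than the separately computed ones, i.e. combined_sim > separate_sim.
print(separate_sim, combined_sim)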