Example #1
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the graph nodes (sentence tokens) with PageRank. Returns a dict of token -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
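A minimal usage sketch for the function above, assuming it is importable from a summa-style summarizer module; the sample text is illustrative:

# Hypothetical usage of summarize (illustrative text, not from the source).
text = ("Automatic summarization shortens a text to its key sentences. "
        "Graph-based methods such as TextRank score sentences with PageRank. "
        "The highest-scoring sentences are returned in their original order.")
print(summarize(text, ratio=0.5))               # top ~50% of sentences as one string
print(summarize(text, ratio=0.5, split=True))   # the same sentences as a list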
Example #2
def get_graph(text, language="english"):
    sentences = _clean_text_by_sentences(text, language)

    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    return graph
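A quick sketch of inspecting the returned graph; nodes() is the same accessor the code above uses, so the nodes are the processed sentence tokens:

# Hypothetical usage of get_graph (illustrative input text).
graph = get_graph("First sentence about graphs. Second sentence about ranking.")
print(graph.nodes())  # one node per processed sentence token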
Example #4
def summarize(text, namscores, original='pagerank', ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the graph nodes (sentence tokens) with PageRank. Returns a dict of token -> score.
    pagerank_scores = _pagerank(graph, namscores, original)

    # For the 'suraj' variant, rescales each PageRank score by the
    # corresponding namscore before extracting sentences.
    if original == 'suraj':
        pagerank_scores = {node: pagerank_scores[node] * namscores[node]
                           for node in graph.nodes()}

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
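A minimal sketch of the 'suraj' rescaling step in isolation, with made-up node names and scores, showing how namscores reweights the PageRank output:

# Hypothetical illustration: each PageRank score is multiplied by the
# matching namscore before sentence extraction.
pagerank = {"s1": 0.40, "s2": 0.35, "s3": 0.25}
namscores = {"s1": 0.5, "s2": 1.0, "s3": 1.5}
rescaled = {node: pagerank[node] * namscores[node] for node in pagerank}
# rescaled == {"s1": 0.2, "s2": 0.35, "s3": 0.375}; "s3" now outranks "s1".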
Example #5
def summarize(text, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, language)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Removes all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the graph nodes (sentence tokens) with PageRank. Returns a dict of token -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(sentences, ratio, words)

    # Sorts the extracted sentences by order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)

    return _format_results(extracted_sentences, split, scores)
Example #6
def _get_labels(text, language, by_sentence):
    syntactic_units = _clean_text_by_sentences(text, language) if by_sentence \
        else _clean_text_by_word(text, language).values()
    return {unit.token: unit.text for unit in syntactic_units}
def summarize(text1,
              text2,
              ratio=0.2,
              words=None,
              language="english",
              split=False,
              scores=False,
              additional_stopwords=None):
    if not isinstance(text1, str):
        raise ValueError("text1 must be a Unicode object (str)!")
    if not isinstance(text2, str):
        raise ValueError("text2 must be a Unicode object (str)!")

    # Gets a list of processed sentences. Each sentence is created as a
    # syntactic unit tagged with its source article number for later reference.
    sentencesText1 = _clean_text_by_sentences(
        text1, source=0, additional_stopwords=additional_stopwords)
    sentencesText2 = _clean_text_by_sentences(
        text2, source=1, additional_stopwords=additional_stopwords)
    # Cleans both texts again so the combined list holds distinct sentence
    # objects: the combined-graph scores added below must not overwrite the
    # per-text scores.
    allSentences = _clean_text_by_sentences(
        text1, source=0,
        additional_stopwords=additional_stopwords) + _clean_text_by_sentences(
            text2, source=1, additional_stopwords=additional_stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graphCombined = _build_graph([sentence.token for sentence in allSentences])
    _set_graph_edge_weights(graphCombined)

    graph1 = _build_graph([sentence.token for sentence in sentencesText1])
    _set_graph_edge_weights(graph1)

    graph2 = _build_graph([sentence.token for sentence in sentencesText2])
    _set_graph_edge_weights(graph2)

    # Removes all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graphCombined)
    _remove_unreachable_nodes(graph1)
    _remove_unreachable_nodes(graph2)

    # PageRank cannot be run on an empty graph.
    if len(graphCombined.nodes()) == 0 or len(graph1.nodes()) == 0 or len(
            graph2.nodes()) == 0:
        return [] if split else ""

    # Ranks the graph nodes (sentence tokens) with PageRank. Returns a dict of token -> score.
    pagerank_scores_combined = _pagerank(graphCombined)
    pagerank_scores_1 = _pagerank(graph1)
    pagerank_scores_2 = _pagerank(graph2)

    # print("pagerank_scores_combined")
    # print(pagerank_scores_combined)
    # print("pagerank_scores_1")
    # print(pagerank_scores_1)
    # print("pagerank_scores_2")
    # print(pagerank_scores_2)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentencesText1, pagerank_scores_1)
    _add_scores_to_sentences(sentencesText2, pagerank_scores_2)
    _add_scores_to_sentences(allSentences, pagerank_scores_combined)

    # Builds a table comparing each sentence's combined-graph score with its
    # individual-graph score.
    sentenceScores = []
    for sentence in allSentences:
        if sentence.token in pagerank_scores_1:
            row = {
                "sentence": sentence.token,
                "combinedScore": pagerank_scores_combined[sentence.token],
                "summary1Score": pagerank_scores_1[sentence.token]
            }
            sentenceScores.append(row)
        if sentence.token in pagerank_scores_2:
            row = {
                "sentence": sentence.token,
                "combinedScore": pagerank_scores_combined[sentence.token],
                "summary2Score": pagerank_scores_2[sentence.token]
            }
            sentenceScores.append(row)

    df = pd.DataFrame(sentenceScores)
    print(df)

    # Prints each sentence and its score for inspection.
    print("_____TEXT1_____")
    for sentence in sentencesText1:
        print(sentence)
        print(sentence.score)
    print("_____TEXT2_____")
    for sentence in sentencesText2:
        print(sentence)
        print(sentence.score)
    print("_____TEXT_COMBINED_____")
    for sentence in allSentences:
        print(sentence)
        print(sentence.score)
    # Extracts the most important sentences with the selected criterion.
    summary1 = _extract_most_important_sentences(sentencesText1, ratio, words,
                                                 0)
    summary2 = _extract_most_important_sentences(sentencesText2, ratio, words,
                                                 1)
    summary1_combined = _extract_most_important_sentences(
        allSentences, ratio, words, 0)
    summary2_combined = _extract_most_important_sentences(
        allSentences, ratio, words, 1)

    # We want combined_graphs_similarity > separate_graphs_similarity
    separate_graphs_similarity = _get_similarity(summary1, summary2)
    combined_graphs_similarity = _get_similarity(summary1_combined,
                                                 summary2_combined)

    # print("Similarity of separate graphs: " + str(separate_graphs_similarity))
    # print("Similarity of combined graphs: " + str(combined_graphs_similarity))

    # Sorts the extracted sentences by order of appearance in the original text.
    summary1.sort(key=lambda s: s.index)
    summary2.sort(key=lambda s: s.index)
    summary1_combined.sort(key=lambda s: s.index)
    summary2_combined.sort(key=lambda s: s.index)

    summary1Text = summaryText(summary1)
    summary2Text = summaryText(summary2)
    summary1CombinedText = summaryText(summary1_combined)
    summary2CombinedText = summaryText(summary2_combined)

    # Returns both per-text summaries, both combined-graph summaries, and the
    # two summary-similarity scores.
    return summary1Text, summary2Text, summary1CombinedText, summary2CombinedText, separate_graphs_similarity, combined_graphs_similarity
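A usage sketch for this two-text variant, assuming the module-level helpers above are available; the two articles are illustrative:

# Hypothetical usage: summarize two related articles separately and via the
# combined graph, then compare the similarity of the resulting summaries.
article_a = "Solar power adoption is rising. Panel costs keep falling each year."
article_b = "Renewable energy is growing fast. Cheaper panels drive solar growth."
s1, s2, s1_comb, s2_comb, sep_sim, comb_sim = summarize(article_a, article_b,
                                                        ratio=0.5)
print(sep_sim, comb_sim)  # the combined-graph approach aims for comb_sim >= sep_sim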