Example #1
# The imports below assume the layout of the summa package
# (https://pypi.org/project/summa/), on which these examples are based.
from summa import summarizer
from summa.summarizer import _pagerank, _remove_unreachable_nodes


def summarize(sentences,
              ratio=0.2,
              words=None,
              language="english",
              split=False,
              scores=False):
    # Note: ratio, words, language, split and scores are accepted for API
    # compatibility but are unused; this variant returns the raw PageRank
    # score of every input sentence instead of a formatted summary.

    # Flatten each tokenized sentence into a single string so it can serve
    # as a graph node.
    sentences = [''.join(str(x) for x in sentence) for sentence in sentences]

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = summarizer._build_graph(sentences)
    summarizer._set_graph_edge_weights(graph)

    # Remove all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # Ranks the graph nodes using the PageRank algorithm. Returns dict of node -> score.
    pagerank_scores = _pagerank(graph)

    # Sentences pruned from the graph receive a score of zero.
    return [pagerank_scores.get(sentence, 0) for sentence in sentences]
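
A minimal, hypothetical call of the variant above. The toy sentences are invented; each "sentence" is passed as a list of string chunks, which the function joins back into a single node string (summa must be installed):

# Hypothetical driver for the score-returning variant above.
sentences = [
    ["graph based ranking scores sentences"],
    ["sentence scores come from graph based ranking"],
    ["completely unrelated words"],
]
scores = summarize(sentences)
for sent, score in zip(sentences, scores):
    print(round(score, 3), sent[0])
# The third sentence shares no words with the others, so it is pruned
# from the graph and scored 0.
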
Example #2

from functools import partial

# cosine_similarity is a project-local helper not shown here (a sketch of a
# plausible implementation follows the example); the _-prefixed helpers are
# the same summa-style functions used throughout these examples.

def summarize(sentences, similarity_matrix, ratio=0.2, split=False):
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph(list(range(len(sentences))))

    _set_graph_edge_weights(graph, similarity_func=partial(
        cosine_similarity, similarity_matrix))

    # Remove all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""

    # Ranks the graph nodes using the PageRank algorithm. Returns dict of node -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(
        sentences, ratio, words=None)

    # Sorts the extracted sentences by their order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)
    return _format_results(extracted_sentences, split, score=None)
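
The cosine_similarity helper bound with functools.partial above is not shown in the example. Since the graph nodes are sentence indices, a plausible reading is a plain lookup into the precomputed matrix; the sketch below is an assumption, not the example's actual helper:

import numpy as np

def cosine_similarity(similarity_matrix, node_a, node_b):
    # Hypothetical helper: with the matrix bound via functools.partial,
    # the edge-weighting code can call it with two sentence indices and
    # get back their precomputed similarity.
    return similarity_matrix[node_a][node_b]

# Toy 3x3 matrix; entry (i, j) holds the similarity of sentences i and j.
matrix = np.array([[1.0, 0.4, 0.1],
                   [0.4, 1.0, 0.2],
                   [0.1, 0.2, 1.0]])
print(cosine_similarity(matrix, 0, 1))  # -> 0.4
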
Example #3
# detect() is assumed to come from langdetect; cut_sentences_by_rule,
# ja_clean_and_cut_sentences, attach_sentence_embeddings, JA_SUPPORT and the
# _-prefixed summa-style helpers are project-local imports not shown here.
def summarize_with_model(text, session, model, model_name, additional_stopwords):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    lang = detect(text)[:2]
    if lang == "en":
        paragraphs = text.split("\n")
        sentences = []
        paragraph_index = 0
        for paragraph in paragraphs:
            # Gets a list of processed sentences.
            if paragraph:
                tmp = _clean_text_by_sentences(
                    paragraph, additional_stopwords)
                if tmp:
                    for j, sent in enumerate(tmp):
                        sent.paragraph = paragraph_index
                        # Hacky way to overwrite token
                        sent.token = len(sentences) + j
                    sentences += tmp
                    paragraph_index += 1
    elif lang == "zh" or lang == "ko":  # zh-Hant is sometimes misclassified as ko
        if model_name != "xling":
            raise ValueError("Only 'xling' model supports zh.")
        sentences = cut_sentences_by_rule(text)
    elif lang == "ja":
        if model_name != "xling":
            raise ValueError("Only 'xling' model supports ja.")
        if not JA_SUPPORT:
            raise ImportError("Missing dependencies for Japanese support.")
        sentences = ja_clean_and_cut_sentences(text)
        for i, sent in enumerate(sentences):
            # Hacky way to overwrite token
            sent.token = i
    else:
        return ["Language not suppored! (supported languages: en, zh, ja)"], None, lang

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    similarities = attach_sentence_embeddings(
        session, sentences, model, batch_size=32)
    graph = _build_graph([x.token for x in sentences])
    _set_graph_edge_weights(graph, partial(cosine_similarity, similarities))

    # Remove all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [], None, lang

    # Ranks the graph nodes using the PageRank algorithm. Returns dict of node -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Sorts the sentences by score, highest first.
    sentences.sort(key=lambda s: s.score, reverse=True)
    return sentences, graph, lang
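
attach_sentence_embeddings is not shown here; in general, a pairwise cosine-similarity matrix can be computed from sentence embeddings in a single matrix product. A generic numpy sketch, not this project's implementation:

import numpy as np

def pairwise_cosine(embeddings):
    # Generic sketch: L2-normalize each embedding row, then one matrix
    # product yields every pairwise cosine similarity at once.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    unit = embeddings / np.clip(norms, 1e-12, None)
    return unit @ unit.T

embeddings = np.random.rand(5, 512)  # e.g. 5 sentences from a 512-dim encoder
similarities = pairwise_cosine(embeddings)
print(similarities.shape)  # (5, 5); the diagonal is ~1.0
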
Example #4
def summarize_custom(text, ratio=0.2, split=False, scores=False, words=None, stopwords=None):
    # Gets a list of processed sentences (clean_sentences and build_graph are
    # assumed to be this project's wrappers around the summa helpers).
    cleaned_sentences = clean_sentences(text, stopwords=stopwords)

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = build_graph([sentence.token for sentence in cleaned_sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # Ranks the graph nodes using the PageRank algorithm and attaches the
    # resulting scores to the sentence objects.
    pagerank_scores = _pagerank(graph)
    _add_scores_to_sentences(cleaned_sentences, pagerank_scores)

    # Extracts the most important sentences and restores document order.
    extracted_sentences = _extract_most_important_sentences(
        cleaned_sentences, ratio, words)
    extracted_sentences.sort(key=lambda s: s.index)
    return _format_results(extracted_sentences, split, scores)
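
A hypothetical call of summarize_custom. The text and stopword list are invented; scores=True is assumed to make _format_results return (sentence, score) pairs, as in stock summa:

text = ("TextRank builds a graph over sentences. "
        "Edges between sentences carry similarity weights. "
        "PageRank then scores every sentence in the graph.")
# ratio=0.5 keeps roughly half of the sentences.
print(summarize_custom(text, ratio=0.5, scores=True,
                       stopwords=["then", "every"]))
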
Example #5
def summarize(text, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")

    lang = detect(text)[:2]
    if lang == "en":
        paragraphs = text.split("\n")
        sentences = []
        paragraph_index = 0
        for paragraph in paragraphs:
            # Gets a list of processed sentences.
            if paragraph:
                tmp = _clean_text_by_sentences(
                    paragraph, additional_stopwords)
                if tmp:
                    for j, sent in enumerate(tmp):
                        sent.paragraph = paragraph_index
                        # Hacky way to overwrite token
                        sent.token = len(sentences) + j
                    sentences += tmp
                    paragraph_index += 1
    elif lang == "zh" or lang == "ko":  # zh-Hant is sometimes misclassified as ko
        sentences = cut_sentences_by_rule(text)
    elif lang == "ja":
        raise NotImplementedError("No ja support yet.")
    else:
        return ["Language not suppored! (supported languages: en, zh)"], None, lang
    similarities = attach_sentence_embeddings(
        lang, sentences, batch_size=32)
    graph = _build_graph([x.token for x in sentences])
    _set_graph_edge_weights(graph, partial(cosine_similarity, similarities))

    # Remove all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return [], None, lang

    # Ranks the graph nodes using the PageRank algorithm. Returns dict of node -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # Sorts the sentences by score, highest first.
    sentences.sort(key=lambda s: s.score, reverse=True)
    return sentences, graph, lang
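
The detect call at the top of this example is assumed to come from langdetect (the import is not shown). The [:2] slice folds regional variants such as zh-cn or zh-tw into the bare language codes the branches test for:

from langdetect import detect  # assumed source of detect()

for sample in ["This is an English sentence.", "这是一个中文句子。"]:
    code = detect(sample)        # e.g. "en", "zh-cn"
    print(code, "->", code[:2])  # e.g. "zh-cn" -> "zh"
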
Example #6
def get_sentence_scores_by_textrank(text):
    # text is raw article text, e.g. the output of a boilerplate remover
    # such as get_text_without_boilerplate(htmlcontent).

    # the following code was adapted from the source code
    # of the summa summarizer function by Federico Barrios et al.
    # ref: https://pypi.org/project/summa/

    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, "english", None)
    # TODO: update the language so that it is automatically determined

    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)

    # Remove all nodes whose edge weights are all equal to zero.
    _remove_unreachable_nodes(graph)

    # PageRank cannot be run in an empty graph.
    if len(graph.nodes()) == 0:
        return []

    # Ranks the graph nodes using the PageRank algorithm. Returns dict of node -> score.
    pagerank_scores = _pagerank(graph)

    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)

    # done with adapted code

    # Pair each sentence's score with its text; sorting the (score, text)
    # tuples in reverse yields the highest-scoring sentences first.
    scored_sentences = [(sentence.score, sentence.text) for sentence in sentences]

    return sorted(scored_sentences, reverse=True)
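
A hypothetical driver for the function above, printing the three highest-scoring sentences of a toy document:

doc = ("Graph-based ranking needs no training data. "
       "Sentences that share many words reinforce each other. "
       "The highest-ranked sentences then form the summary. "
       "Bananas are yellow.")
for score, sentence in get_sentence_scores_by_textrank(doc)[:3]:
    print(f"{score:.3f}  {sentence}")
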