# Assumed imports: the mixed summarizer._* / bare-name usage below matches
# the summa package's summarizer module.
from summa import summarizer
from summa.summarizer import _remove_unreachable_nodes, _pagerank


def summarize(sentences, ratio=0.2, words=None, language="english", split=False, scores=False):
    # Gets a list of processed sentences.
    # sentences = _clean_text_by_sentences(text, language)
    # Flattens each tokenized sentence back into a single string so the
    # sentence strings themselves can serve as graph nodes.
    for i, sentence in enumerate(sentences):
        sentences[i] = ''.join(str(x) for x in sentence)
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = summarizer._build_graph(sentences)
    summarizer._set_graph_edge_weights(graph)
    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)
    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score.
    pagerank_scores = _pagerank(graph)
    # Sentences dropped as unreachable receive a score of 0.
    return [pagerank_scores.get(sentence, 0) for sentence in sentences]
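
# Usage sketch (illustrative, not from the original source): because
# ''.join(str(x) for x in s) maps a plain string s to itself, raw sentence
# strings can stand in for tokenized input here. The third sentence shares no
# words with the others, so it becomes unreachable and scores 0.
example_sentences = [
    "Cats chase mice around the house.",
    "Mice fear cats and avoid the house.",
    "The weather was pleasant yesterday.",
]
example_scores = summarize(example_sentences)
print(example_scores)  # one PageRank score per input sentence
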
from functools import partial


def summarize(sentences, similarity_matrix, ratio=0.2, split=False):
    # Creates the graph over sentence indices and calculates the similarity
    # coefficient for every pair of nodes.
    graph = _build_graph(list(range(len(sentences))))
    _set_graph_edge_weights(graph, similarity_func=partial(
        cosine_similarity, similarity_matrix))
    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)
    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return [] if split else ""
    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score.
    pagerank_scores = _pagerank(graph)
    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)
    # Extracts the most important sentences with the selected criterion.
    extracted_sentences = _extract_most_important_sentences(
        sentences, ratio, words=None)
    # Sorts the extracted sentences by order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)
    return _format_results(extracted_sentences, split, score=None)
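
# A minimal sketch of the similarity_func wiring assumed above: with
# partial(cosine_similarity, similarity_matrix), the graph calls
# cosine_similarity(similarity_matrix, node_a, node_b) for each pair of
# sentence indices. If the matrix holds embeddings (one row per sentence),
# that is a cosine; if it already holds pairwise similarities, a plain lookup
# suffices. This is an assumption about the project-local helper, not its source.
import numpy as np


def cosine_similarity(embeddings, index_a, index_b):
    # Cosine of the angle between two sentence embeddings; 0 if either is zero.
    a, b = embeddings[index_a], embeddings[index_b]
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    return float(np.dot(a, b) / denom) if denom else 0.0
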
from functools import partial

from langdetect import detect  # assumed source of detect(), which returns codes like "en"/"zh-cn"


def summarize_with_model(text, session, model, model_name, additional_stopwords):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    lang = detect(text)[:2]
    if lang == "en":
        paragraphs = text.split("\n")
        sentences = []
        paragraph_index = 0
        for paragraph in paragraphs:
            if not paragraph:
                continue
            # Gets a list of processed sentences.
            tmp = _clean_text_by_sentences(paragraph, additional_stopwords)
            if tmp:
                for j, sent in enumerate(tmp):
                    sent.paragraph = paragraph_index
                    # Hacky way to overwrite token
                    sent.token = len(sentences) + j
                sentences += tmp
                paragraph_index += 1
    elif lang in ("zh", "ko"):  # zh-Hant sometimes gets misclassified as ko
        if model_name != "xling":
            raise ValueError("Only the 'xling' model supports zh.")
        sentences = cut_sentences_by_rule(text)
    elif lang == "ja":
        if model_name != "xling":
            raise ValueError("Only the 'xling' model supports ja.")
        if not JA_SUPPORT:
            raise ImportError("Missing dependencies for Japanese support.")
        sentences = ja_clean_and_cut_sentences(text)
        for i, sent in enumerate(sentences):
            # Hacky way to overwrite token
            sent.token = i
    else:
        return ["Language not supported! (supported languages: en, zh, ja)"], None, lang
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    similarities = attach_sentence_embeddings(
        session, sentences, model, batch_size=32)
    graph = _build_graph([x.token for x in sentences])
    _set_graph_edge_weights(graph, partial(cosine_similarity, similarities))
    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)
    # PageRank cannot be run on an empty graph. Keep the three-value return
    # shape so callers can unpack consistently.
    if len(graph.nodes()) == 0:
        return [], None, lang
    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score.
    pagerank_scores = _pagerank(graph)
    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)
    # Sorts the sentences from highest to lowest score.
    sentences.sort(key=lambda s: s.score, reverse=True)
    return sentences, graph, lang
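
# Illustrative usage (not from the original source): `session` and `model`
# stand for whatever encoder attach_sentence_embeddings expects (the session
# argument suggests a TF1-style setup, but that is an assumption).
def top_sentences_with_model(text, session, model, n=3):
    ranked, graph, lang = summarize_with_model(
        text, session, model, model_name="xling", additional_stopwords=None)
    if graph is None:
        return []  # unsupported language or empty graph
    return [s.text for s in ranked[:n]]  # highest-scoring sentences first
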
def summarize_custom(text, ratio=0.2, split=False, scores=False, words=None, stopwords=None):
    # Gets a list of processed sentences, optionally dropping extra stopwords.
    cleaned_sentences = clean_sentences(text, stopwords=stopwords)
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = build_graph([sentence.token for sentence in cleaned_sentences])
    _set_graph_edge_weights(graph)
    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)
    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score.
    pagerank_scores = _pagerank(graph)
    # Adds the scores to the sentence objects.
    _add_scores_to_sentences(cleaned_sentences, pagerank_scores)
    # Extracts the most important sentences by ratio, or by word count if given.
    extracted_sentences = _extract_most_important_sentences(
        cleaned_sentences, ratio, words)
    # Sorts the extracted sentences by order of appearance in the original text.
    extracted_sentences.sort(key=lambda s: s.index)
    return _format_results(extracted_sentences, split, scores)
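
# Usage sketch (illustrative only; the return shapes assume _format_results
# follows summa's convention: (text, score) pairs when scores=True, a list of
# sentence strings when split=True, otherwise one newline-joined string):
# summary_text = summarize_custom(document_text, ratio=0.2)
# pairs = summarize_custom(document_text, scores=True)
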
from functools import partial

from langdetect import detect  # assumed source of detect()


def summarize(text, additional_stopwords=None):
    if not isinstance(text, str):
        raise ValueError("Text parameter must be a Unicode object (str)!")
    lang = detect(text)[:2]
    if lang == "en":
        paragraphs = text.split("\n")
        sentences = []
        paragraph_index = 0
        for paragraph in paragraphs:
            if not paragraph:
                continue
            # Gets a list of processed sentences.
            tmp = _clean_text_by_sentences(paragraph, additional_stopwords)
            if tmp:
                for j, sent in enumerate(tmp):
                    sent.paragraph = paragraph_index
                    # Hacky way to overwrite token
                    sent.token = len(sentences) + j
                sentences += tmp
                paragraph_index += 1
    elif lang in ("zh", "ko"):  # zh-Hant sometimes gets misclassified as ko
        sentences = cut_sentences_by_rule(text)
    elif lang == "ja":
        raise NotImplementedError("No ja support yet.")
    else:
        return ["Language not supported! (supported languages: en, zh)"], None, lang
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    similarities = attach_sentence_embeddings(
        lang, sentences, batch_size=32)
    graph = _build_graph([x.token for x in sentences])
    _set_graph_edge_weights(graph, partial(cosine_similarity, similarities))
    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)
    # PageRank cannot be run on an empty graph. Keep the three-value return
    # shape so callers can unpack consistently.
    if len(graph.nodes()) == 0:
        return [], None, lang
    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score.
    pagerank_scores = _pagerank(graph)
    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)
    # Sorts the sentences from highest to lowest score.
    sentences.sort(key=lambda s: s.score, reverse=True)
    return sentences, graph, lang
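
# Illustrative helper (not in the original source): scores arrive sorted
# high-to-low, so a display-order summary re-sorts the top n sentences by
# their token index, which was overwritten above to encode document position.
def top_summary(text, n=5):
    sentences, graph, lang = summarize(text)
    if graph is None or not sentences:
        return ""
    top = sorted(sentences[:n], key=lambda s: s.token)
    return "\n".join(s.text for s in top)
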
def get_sentence_scores_by_textrank(text):
    # text = get_text_without_boilerplate(htmlcontent)
    # The following code was adapted from the source code of the summa
    # summarizer function by Federico Barrios et al.
    # ref: https://pypi.org/project/summa/
    # Gets a list of processed sentences.
    sentences = _clean_text_by_sentences(text, "english", None)
    # TODO: detect the language automatically instead of assuming English.
    # Creates the graph and calculates the similarity coefficient for every pair of nodes.
    graph = _build_graph([sentence.token for sentence in sentences])
    _set_graph_edge_weights(graph)
    # Remove all nodes with all edge weights equal to zero.
    _remove_unreachable_nodes(graph)
    # PageRank cannot be run on an empty graph.
    if len(graph.nodes()) == 0:
        return []
    # Ranks the sentences using the PageRank algorithm. Returns dict of sentence -> score.
    pagerank_scores = _pagerank(graph)
    # Adds the summa scores to the sentence objects.
    _add_scores_to_sentences(sentences, pagerank_scores)
    # Done with the adapted code.
    scored_sentences = [(sentence.score, sentence.text) for sentence in sentences]
    return sorted(scored_sentences, reverse=True)
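
# Usage sketch (illustrative only; assumes the summa-derived helpers above
# resolve). The result is a list of (score, sentence) tuples, best first.
sample_text = ("The cat sat on the mat. The dog sat on the log. "
               "Cats and dogs rarely agree about mats.")
for score, sentence_text in get_sentence_scores_by_textrank(sample_text)[:2]:
    print(round(score, 4), sentence_text)
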