def summary_highlight(text, coref, ratio):
    """Return ``text`` with the sentences picked by the summarizer highlighted.

    The text is summarized, the original text and the summary are both split
    into sentences, and each original sentence that matches the next pending
    summary sentence is wrapped in ``<mark><em>...</em></mark>``.

    Parameters
    ----------
    text : str
        Text to summarize and highlight.
    coref : object
        Flag kept for interface compatibility. The coreference replacement
        step is disabled in this function, so the flag has no effect: the
        unmodified text is summarized either way.
    ratio : float
        Passed through to ``summarize``.

    Returns
    -------
    str
        The original sentences joined by single spaces, with the matched
        summary sentences wrapped in highlight markup.
    """
    # Coreference resolution is switched off, so both settings of `coref`
    # summarize the plain text.
    coref_text = text
    sum_text = summarize(text, coref_text, ratio)

    original_sentence_list = _format_results(
        _clean_text_by_sentences(text), True)
    extracted_sentence_list = _format_results(
        _clean_text_by_sentences(sum_text), True)

    # Single forward pass with a pointer into the summary. This assumes the
    # summary sentences appear in the same order as in the original text.
    pending = 0
    extracted_total = len(extracted_sentence_list)
    for position, sentence in enumerate(original_sentence_list):
        if pending >= extracted_total:
            # Every summary sentence has been matched (or the summary is
            # empty): later duplicates must not be highlighted.
            break
        if sentence == extracted_sentence_list[pending]:
            # Mark by position, not by value lookup, so that the occurrence
            # actually being visited is the one highlighted when the text
            # contains repeated sentences.
            original_sentence_list[position] = (
                '<mark><em>' + sentence + '</em></mark>')
            pending += 1

    return " ".join(original_sentence_list)
def get_sentence_score_per_word(text):
    """Accumulate PageRank sentence scores per token in a module-level dict.

    Rebinds the global ``sentence_score_per_word`` to a fresh dict and fills
    it in place; nothing is returned. For every scored sentence, each
    whitespace-separated piece of ``sentence.token`` gets an entry of the form
    ``{'n_sents': <count>, 'cumulative_score': <sum of scores>}``.
    A token repeated inside one sentence is counted once per occurrence.
    """
    global sentence_score_per_word
    sentence_score_per_word = {}

    # The pipeline below mirrors the Gensim summarizer; it is repeated here
    # so that the raw PageRank scores are accessible.
    cleaned = _clean_text_by_sentences(text)
    hashable_corpus = _build_hasheable_corpus(_build_corpus(cleaned))
    sentences_by_corpus = dict(zip(hashable_corpus, cleaned))

    graph = _build_sentence_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    for sentence_id, score in _pagerank(graph).items():
        for token in sentences_by_corpus[sentence_id].token.split():
            stats = sentence_score_per_word.get(token)
            if stats is None:
                # First time this token is seen.
                sentence_score_per_word[token] = {
                    'n_sents': 1,
                    'cumulative_score': score
                }
            else:
                stats['n_sents'] = stats['n_sents'] + 1
                stats['cumulative_score'] = stats['cumulative_score'] + score
def summarize(text, ratio=0.2, word_count=None, split=False):
    """Summarize ``text`` using a variation of the TextRank algorithm.

    The input must be a string and should contain at least INPUT_MIN_LENGTH
    sentences for the summary to be meaningful. The summary is made of the
    most representative sentences, returned as one newline-separated string,
    or as a list of sentences when ``split`` is True.

    Output length is controlled by ``ratio`` (a number between 0 and 1: the
    share of the original sentences to keep, 0.2 by default) or by
    ``word_count`` (how many words the output will contain). When
    ``word_count`` is given, ``ratio`` is ignored.

    An empty input yields an empty result (``[]`` or ``u""``); an input with
    exactly one sentence raises ValueError.
    """
    sentences = _clean_text_by_sentences(text)
    sentence_count = len(sentences)

    # Nothing to summarize.
    if sentence_count == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # A single sentence is rejected up front (avoids a ZeroDivisionError).
    if sentence_count == 1:
        raise ValueError("input must have more than one sentence")

    # Short inputs are still processed, but flagged.
    if sentence_count < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    # With a word budget every document is ranked (ratio 1); the cut is made
    # later, during extraction.
    effective_ratio = ratio if word_count is None else 1
    most_important_docs = summarize_corpus(corpus, ratio=effective_ratio)

    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    extracted = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Restore the order in which the sentences appear in the original text.
    extracted.sort(key=lambda s: s.index)

    return _format_results(extracted, split)
def summarize(text, ratio=0.2, word_count=None, split=False):
    """Return a TextRank-style summary of the given text.

    ``text`` must be a string; fewer than INPUT_MIN_LENGTH sentences only
    triggers a warning, exactly one sentence raises ValueError, and no
    sentences at all returns an empty result.

    The summary consists of the most representative sentences. It is a
    newline-separated string by default, or a list of sentences if ``split``
    is True.

    ``ratio`` (between 0 and 1, default 0.2) is the fraction of the original
    sentences to keep. ``word_count`` sets how many words the output will
    contain; if it is provided, ``ratio`` is ignored.
    """

    def _nothing():
        # Empty result in the shape the caller asked for.
        return [] if split else u""

    sentences = _clean_text_by_sentences(text)
    how_many = len(sentences)

    if how_many == 0:
        logger.warning("Input text is empty.")
        return _nothing()

    if how_many == 1:
        # Rejected explicitly; avoids a ZeroDivisionError.
        raise ValueError("input must have more than one sentence")

    if how_many < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    if word_count is None:
        most_important_docs = summarize_corpus(corpus, ratio=ratio)
    else:
        # A word budget overrides the ratio: rank the whole corpus.
        most_important_docs = summarize_corpus(corpus, ratio=1)

    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return _nothing()

    picked = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Present the chosen sentences in original document order.
    picked.sort(key=lambda sentence: sentence.index)
    return _format_results(picked, split)
def summarize(text, ratio=0.2, word_count=None, split=False):
    """Summarize ``text`` using a variation of the TextRank algorithm.

    The input must be a string containing at least INPUT_MIN_LENGTH
    sentences; otherwise RuntimeError is raised. The summary is built from
    the most representative sentences and is returned as one
    newline-separated string, or as a list of sentences when ``split`` is
    True.

    ``ratio`` is a number between 0 and 1 giving the share of the original
    sentences to keep (default 0.2). ``word_count`` sets how many words the
    output will contain; when it is given, ``ratio`` is ignored.
    """
    sentences = _clean_text_by_sentences(text)

    if len(sentences) < INPUT_MIN_LENGTH:
        message = "Input text must have at least " + str(INPUT_MIN_LENGTH) + " sentences."
        raise RuntimeError(message)

    corpus = _build_corpus(sentences)

    # A word budget takes precedence over the ratio: rank everything and let
    # the extraction step apply the limit.
    if word_count is None:
        selection_ratio = ratio
    else:
        selection_ratio = 1
    most_important_docs = summarize_corpus(corpus, ratio=selection_ratio)

    extracted = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Put the selected sentences back into original text order.
    extracted.sort(key=lambda s: s.index)

    return _format_results(extracted, split)
def summarize(text, ratio=0.2, word_count=None, split=False):
    """Return a TextRank-style summary of the given text.

    ``text`` must be a string. If it has fewer than INPUT_MIN_LENGTH
    sentences, RuntimeError is raised.

    The result holds the most representative sentences: a newline-separated
    string by default, or a list of sentences if ``split`` is True.

    Length control: ``ratio`` (between 0 and 1, default 0.2) is the fraction
    of the original sentences to keep; ``word_count`` is the number of words
    the output will contain. If both are provided, ``ratio`` is ignored.
    """
    cleaned = _clean_text_by_sentences(text)

    too_short = len(cleaned) < INPUT_MIN_LENGTH
    if too_short:
        raise RuntimeError("Input text must have at least " + str(INPUT_MIN_LENGTH) + " sentences.")

    corpus = _build_corpus(cleaned)
    top_docs = summarize_corpus(corpus, ratio=(1 if word_count is not None else ratio))

    # Pick the sentences according to the selected criterion, then restore
    # their order of appearance in the original text.
    ranked = _extract_important_sentences(cleaned, corpus, top_docs, word_count)
    ranked.sort(key=lambda sentence: sentence.index)

    return _format_results(ranked, split)
def summarize(text, coref, ratio, word_count=None, split=False):
    """Get a summarized version of the given text.

    The summary consists of the most representative sentences and is
    returned as a string divided by newlines, or as a list when `split` is
    True.

    Note
    ----
    Ranking is computed on the sentences of `coref`, while the sentences
    that are returned are taken from `text`. This presumably expects both
    inputs to split into the same sentences in the same order; that is not
    checked here (TODO confirm with the caller).

    The emptiness, single-sentence and minimum-length checks are applied to
    the sentences of `coref`. The input should be longer than
    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` sentences for
    the summary to make sense. Note that newlines divide sentences.

    Parameters
    ----------
    text : str
        Given text; its sentences are the ones returned in the summary.
    coref : str
        Text whose sentences are used to build the ranked corpus
        (presumably a coreference-resolved variant of `text`).
    ratio : float
        Number between 0 and 1 that determines the proportion of the number
        of sentences of the original text to be chosen for the summary.
    word_count : int or None, optional
        Determines how many words will the output contain.
        If both parameters are provided, the ratio will be ignored.
    split : bool, optional
        If True, list of sentences will be returned. Otherwise joined
        strings will be returned.

    Returns
    -------
    list of str
        If `split` **OR**
    str
        Most representative sentences of given the text.

    Raises
    ------
    ValueError
        If `coref` contains exactly one sentence.
    """
    sentences = _clean_text_by_sentences(coref)
    original = _clean_text_by_sentences(text)
    ranked_count = len(sentences)

    # No sentence could be identified: return an empty result.
    if ranked_count == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # One sentence only: raise instead of hitting a ZeroDivisionError.
    if ranked_count == 1:
        raise ValueError("input must have more than one sentence")

    # Short input is processed anyway, with a warning.
    if ranked_count < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    # When a word budget is set, the ratio is replaced by 1.
    chosen_ratio = ratio if word_count is None else 1
    most_important_docs = summarize_corpus(corpus, ratio=chosen_ratio)

    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    # The corpus comes from `coref`, but the sentences handed to the
    # extraction step are those of `text`.
    extracted = _extract_important_sentences(original, corpus, most_important_docs, word_count)

    # Sort by order of appearance in the original text.
    extracted.sort(key=lambda s: s.index)

    return _format_results(extracted, split)
def summarize(text, ratio=0.2, word_count=None, split=False):
    """Get a summarized version of the given text.

    The summary consists of the most representative sentences and is
    returned as a string divided by newlines, or as a list when `split` is
    True.

    Note
    ----
    The input should be a string, and must be longer than
    :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` sentences for
    the summary to make sense; shorter inputs only produce a warning.
    Note that newlines divide sentences.

    Parameters
    ----------
    text : str
        Given text.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number
        of sentences of the original text to be chosen for the summary.
    word_count : int or None, optional
        Determines how many words will the output contain.
        If both parameters are provided, the ratio will be ignored.
    split : bool, optional
        If True, list of sentences will be returned. Otherwise joined
        strings will be returned.

    Returns
    -------
    list of str
        If `split` **OR**
    str
        Most representative sentences of given the text.

    Raises
    ------
    ValueError
        If the text contains exactly one sentence.
    """
    sentences = _clean_text_by_sentences(text)
    total_sentences = len(sentences)

    # No sentence could be identified: return an empty result.
    if total_sentences == 0:
        logger.warning("Input text is empty.")
        return [] if split else u""

    # One sentence only: raise instead of hitting a ZeroDivisionError.
    if total_sentences == 1:
        raise ValueError("input must have more than one sentence")

    # Short input is processed anyway, with a warning.
    if total_sentences < INPUT_MIN_LENGTH:
        logger.warning("Input text is expected to have at least %d sentences.", INPUT_MIN_LENGTH)

    corpus = _build_corpus(sentences)

    if word_count is not None:
        # A word budget overrides the ratio.
        corpus_ratio = 1
    else:
        corpus_ratio = ratio
    most_important_docs = summarize_corpus(corpus, ratio=corpus_ratio)

    if not most_important_docs:
        logger.warning("Couldn't get relevant sentences.")
        return [] if split else u""

    selected = _extract_important_sentences(sentences, corpus, most_important_docs, word_count)

    # Sort by order of appearance in the original text.
    selected.sort(key=lambda item: item.index)

    return _format_results(selected, split)