def summarize_corpus(corpus, ratio=0.2): """ Returns a list of the most important documents of a corpus using a variation of the TextRank algorithm. The input must have at least INPUT_MIN_LENGTH documents for the summary to make sense. The length of the output can be specified using the ratio parameter, which determines how many documents will be chosen for the summary (defaults at 20% of the number of documents of the corpus). The most important documents are returned as a list sorted by the document score, highest first. """ hashable_corpus = _build_hasheable_corpus(corpus) if len(corpus) < INPUT_MIN_LENGTH: raise RuntimeError("Input corpus must have at least " + str(INPUT_MIN_LENGTH) + " documents.") graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def keywords(text, ratio=0.2, words=None, split=False, scores=False):
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

def get_sentence_score_per_word(text):
    global sentence_score_per_word
    sentence_score_per_word = {}

    # From gensim's summarizer; I moved some code here because I wanted to access the pagerank results
    sentences = _clean_text_by_sentences(text)
    corpus = _build_corpus(sentences)
    hashable_corpus = _build_hasheable_corpus(corpus)
    sentences_by_corpus = dict(zip(hashable_corpus, sentences))

    graph = _build_sentence_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    pagerank_scores = _pagerank(graph)

    for sentence_id, score in pagerank_scores.items():
        sentence = sentences_by_corpus[sentence_id]
        for token in sentence.token.split():
            if token in sentence_score_per_word:
                score_dict = sentence_score_per_word[token]
                sentence_score_per_word[token]['n_sents'] = score_dict['n_sents'] + 1
                sentence_score_per_word[token]['cumulative_score'] = score_dict['cumulative_score'] + score
            else:
                sentence_score_per_word[token] = {'n_sents': 1, 'cumulative_score': score}

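# Small usage sketch for get_sentence_score_per_word above: after the call,
# the module-level `sentence_score_per_word` dict maps each processed token to
# the number of sentences it occurs in and the sum of those sentences'
# PageRank scores. `text` is a hypothetical multi-sentence input string.
get_sentence_score_per_word(text)

# Derive an average per-token sentence score from the collected statistics.
average_sentence_score = {
    token: stats['cumulative_score'] / stats['n_sents']
    for token, stats in sentence_score_per_word.items()
}
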
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False, deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

def summarize_corpus(corpus, ratio=0.2): """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_. Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` Note ---- The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense. Parameters ---------- corpus : list of list of (int, int) Given corpus. ratio : float, optional Number between 0 and 1 that determines the proportion of the number of sentences of the original text to be chosen for the summary, optional. Returns ------- list of str Most important documents of given `corpus` sorted by the document score, highest first. """ hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. if len(corpus) == 0: logger.warning("Input corpus is empty.") return [] # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) logger.info('Building graph') graph = _build_graph(hashable_corpus) logger.info('Filling graph') _set_graph_edge_weights(graph) logger.info('Removing unreachable nodes of graph') _remove_unreachable_nodes(graph) # Cannot calculate eigenvectors if number of unique documents in corpus < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return [] logger.info('Pagerank graph') pagerank_scores = _pagerank(graph) logger.info('Sorting pagerank scores') hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus, ratio=0.2): """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_. Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` Note ---- The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense. Parameters ---------- corpus : list of list of (int, int) Given corpus. ratio : float, optional Number between 0 and 1 that determines the proportion of the number of sentences of the original text to be chosen for the summary, optional. Returns ------- list of str Most important documents of given `corpus` sorted by the document score, highest first. """ hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. if len(corpus) == 0: # logger.warning("Input corpus is empty.") return [] # Warns the user if there are too few documents. # Return the whole document if len(corpus) < INPUT_MIN_LENGTH: #logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) sentence_length = len(corpus) return [list(doc) for doc in hashable_corpus[:-1]] graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) # Cannot calculate eigenvectors if number of unique documents in corpus < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: # logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return [] logger.info('Pagerank graph') pagerank_scores = _pagerank(graph) logger.info('Sorting pagerank scores') hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus, ratio=0.2, weight_threshold=1.e-3):
    """
    Returns a list of the most important documents of a corpus using a
    variation of the TextRank algorithm.

    The input must have at least INPUT_MIN_LENGTH (%d) documents for the
    summary to make sense.

    The length of the output can be specified using the ratio parameter,
    which determines how many documents will be chosen for the summary
    (defaults at 20%% of the number of documents of the corpus).

    The most important documents are returned as a list sorted by the
    document score, highest first.
    """ % INPUT_MIN_LENGTH
    hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least " + str(INPUT_MIN_LENGTH) + " documents.")

    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph, weight_threshold)
    _remove_unreachable_nodes(graph)

    pagerank_scores = _pagerank(graph)

    hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    # MODIFICATION:
    # As the input document may contain fewer than 5 sentences, the int
    # approximation of 20% of the sentences may be zero. In such cases,
    # return the whole corpus, ordered by relevance.
    if int(len(corpus) * ratio) == 0:
        return [list(doc) for doc in hashable_corpus]
    else:
        return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]

def summarize_corpus(corpus, ratio=0.2): """ Returns a list of the most important documents of a corpus using a variation of the TextRank algorithm. The input must have at least INPUT_MIN_LENGTH (%d) documents for the summary to make sense. The length of the output can be specified using the ratio parameter, which determines how many documents will be chosen for the summary (defaults at 20%% of the number of documents of the corpus). The most important documents are returned as a list sorted by the document score, highest first. """ % INPUT_MIN_LENGTH hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. if len(corpus) == 0: logger.warning("Input corpus is empty.") return [] # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: logger.warning( "Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) # Cannot calculate eigenvectors if number of unique documents in corpus < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: logger.warning( "Please add more sentences to the text. The number of reachable nodes is below 3" ) return [] pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus, ratio=0.2): """ Returns a list of the most important documents of a corpus using a variation of the TextRank algorithm. The input must have at least INPUT_MIN_LENGTH (%d) documents for the summary to make sense. The length of the output can be specified using the ratio parameter, which determines how many documents will be chosen for the summary (defaults at 20%% of the number of documents of the corpus). The most important documents are returned as a list sorted by the document score, highest first. """ % INPUT_MIN_LENGTH hashable_corpus = _build_hasheable_corpus(corpus) # If the corpus is empty, the function ends. if len(corpus) == 0: logger.warning("Input corpus is empty.") return [] # Warns the user if there are too few documents. if len(corpus) < INPUT_MIN_LENGTH: logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH) graph = _build_graph(hashable_corpus) _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) # Cannot calculate eigenvectors if number of unique documents in corpus < 3. # Warns user to add more text. The function ends. if len(graph.nodes()) < 3: logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return [] pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

def summarize_corpus(corpus): """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_. Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer` Note ---- The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary to make sense. Parameters ---------- corpus : list of list of (int, int) Given corpus. ratio : float, optional Number between 0 and 1 that determines the proportion of the number of sentences of the original text to be chosen for the summary, optional. Returns ------- list of str Most important documents of given `corpus` sorted by the document score, highest first. """ hashable_corpus = _build_hasheable_corpus(corpus) logger.info('Building graph') graph = _build_graph(hashable_corpus) logger.info('Filling graph') _set_graph_edge_weights(graph) logger.info('Removing unreachable nodes of graph') _remove_unreachable_nodes(graph) logger.info('Pagerank graph') pagerank_scores = _pagerank(graph) return pagerank_scores
MAX_WORDS = 100

for i, case_filename in enumerate(case_filenames[:]):
    with open('{}/{}'.format(DATASET_LOCATION, case_filename)) as f:
        text = f.read().strip()
    text = text.split("\n", 6)[6]

    sentences = _clean_text_by_sentences(text)
    sent_for_nltk = [sent.text for sent in sentences]
    nltk_str = " ".join(sent_for_nltk)

    corpus = _build_corpus(sentences)
    hashable_corpus = _build_hasheable_corpus(corpus)
    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)
    pagerank_scores = _pagerank(graph)

    sentences_by_corpus = dict(zip(hashable_corpus, sentences))
    get_sentences = [sentences_by_corpus[tuple(doc)] for doc in hashable_corpus[:-1]]
    get_scores = [pagerank_scores.get(doc) for doc in hashable_corpus[:-1]]

    # Word frequencies, normalised by the most frequent word.
    word_frequencies = {}
    for word in nltk.word_tokenize(nltk_str):
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word] / maximum_frequency)

    sentence_scores = {}
    stopped_sentences = []

def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of returned lemmas is
        determined by the provided ratio; otherwise, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        If True, return the keywords as a list instead of a joined string.
    scores : bool, optional
        If True, return each keyword with its score.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True, lemmatize words.
    deacc : bool, optional
        If True, remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))

    get_sentence_score_per_word(text)

    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of returned lemmas is
        determined by the provided ratio; otherwise, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        If True, return the keywords as a list instead of a joined string.
    scores : bool, optional
        If True, return each keyword with its score.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True, lemmatize words.
    deacc : bool, optional
        If True, remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by newlines.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used

    _remove_unreachable_nodes(graph)

    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)

    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)

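# A brief usage sketch for the keywords() variants above, which keep the call
# signature of gensim's (pre-4.0) gensim.summarization.keywords. The example
# text is adapted from gensim's documentation.
sample_text = (
    "Challenges in natural language processing frequently involve speech "
    "recognition, natural language understanding, natural language generation, "
    "connecting language and machine perception, dialog systems, or some "
    "combination thereof."
)

# Newline-joined string of the top-ranked keywords.
print(keywords(sample_text))

# A fixed number of keywords, each with its PageRank-based score.
print(keywords(sample_text, words=3, scores=True))
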
def fetch_summary_from_case(case_filename):
    MAX_WORDS = 100

    with open('{}/{}'.format(DATASET_LOCATION, case_filename)) as f:
        text = f.read().strip()
    text = text.split("\n", 6)[6]

    sentences = _clean_text_by_sentences(text)
    sent_for_nltk = [sent.text for sent in sentences]
    nltk_str = " ".join(sent_for_nltk)

    corpus = _build_corpus(sentences)
    hashable_corpus = _build_hasheable_corpus(corpus)
    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)
    pagerank_scores = _pagerank(graph)

    sentences_by_corpus = dict(zip(hashable_corpus, sentences))
    get_sentences = [sentences_by_corpus[tuple(doc)] for doc in hashable_corpus[:-1]]
    get_scores = [pagerank_scores.get(doc) for doc in hashable_corpus[:-1]]

    # Word frequencies, normalised by the most frequent word.
    word_frequencies = {}
    for word in nltk.word_tokenize(nltk_str):
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
    maximum_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word] / maximum_frequency)

    sentence_scores = {}
    stopped_sentences = []

    # Re-join sentences that were split after a short token, an abbreviation or a path.
    sani_sent_list = []
    new_sent = sent_for_nltk[0]
    for j in range(len(sent_for_nltk) - 1):
        last_word = new_sent.split(" ")[-1]
        if last_word and last_word[-1] != ".":
            new_sent += "."
        last_word = last_word[:-1]
        if len(last_word) < 4 or "." in last_word or "/" in last_word:
            new_sent += (" " + sent_for_nltk[j + 1])
        else:
            sani_sent_list.append(new_sent)
            new_sent = sent_for_nltk[j + 1]
    if new_sent.split(" ")[-1][-1] != ".":
        new_sent += "."
    sani_sent_list.append(new_sent)

    # Frequency-based score per sentence (average word frequency).
    for sent in sani_sent_list:
        j = 0
        stopped_sent_words = []
        for word in nltk.word_tokenize(sent.lower()):
            j = j + 1
            if word in word_frequencies.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]
                stopped_sent_words.append(word)
        stopped_sentences.append(" ".join(stopped_sent_words))
        sentence_scores[sent] = sentence_scores[sent] / j

    for j, get_score in enumerate(get_scores):
        if get_scores[j] is None:
            get_scores[j] = 0

    # Combine frequency scores with the TextRank (PageRank) sentence scores.
    j = 0
    l = 0
    final_sentence_scores = {}
    for sent in sani_sent_list:
        j = j + l
        l = 0
        if sent not in final_sentence_scores.keys():
            final_sentence_scores[sent] = 0
        else:
            final_sentence_scores[sent] += sentence_scores[sent]
        for sentence in get_sentences[j:-1]:
            if sentence.text[-1] != '.':
                sentence.text += '.'
            if sent.endswith(sentence.text):
                final_sentence_scores[sent] += get_scores[j]
                l = l + 1
                break
            final_sentence_scores[sent] += get_scores[j]
            l = l + 1

    # Pick the top sentences and fit them into a MAX_WORDS budget via knapsack.
    summary_sentences = heapq.nlargest(30, final_sentence_scores, key=final_sentence_scores.get)
    size = [len(s.split(" ")) for s in summary_sentences]
    weights = [final_sentence_scores[s] / len(s.split(" ")) for s in summary_sentences]
    sol = knapsack(size, weights).solve(MAX_WORDS)
    max_weight, selected_sizes = sol
    summary = " ".join(summary_sentences[s] for s in selected_sizes)
    words_in_summary = len(summary.split(" "))
    return summary

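# A minimal sketch of the 0/1 knapsack selection the function above relies on.
# The `knapsack(size, weights).solve(capacity)` helper is not defined in the
# snippet; this hypothetical stand-in assumes it returns the best total weight
# together with the indices of the selected items, which matches how the
# result is unpacked above (`max_weight, selected_sizes = sol`).
class knapsack:
    def __init__(self, size, weights):
        self.size = size          # "cost" of each sentence (word count)
        self.weights = weights    # value of each sentence (score per word)

    def solve(self, capacity):
        n = len(self.size)
        # best[i][c] = best total value using items 0..i-1 within capacity c
        best = [[0.0] * (capacity + 1) for _ in range(n + 1)]
        for i in range(1, n + 1):
            w, v = self.size[i - 1], self.weights[i - 1]
            for c in range(capacity + 1):
                best[i][c] = best[i - 1][c]
                if w <= c and best[i - 1][c - w] + v > best[i][c]:
                    best[i][c] = best[i - 1][c - w] + v
        # Backtrack to recover which sentences were selected.
        selected, c = [], capacity
        for i in range(n, 0, -1):
            if best[i][c] != best[i - 1][c]:
                selected.append(i - 1)
                c -= self.size[i - 1]
        return best[n][capacity], sorted(selected)
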
def summarize_corpus(corpus, dictionary, sentences, ratio=0.2, redundancy_check=True, query=None):
    """Get a list of the most important documents of a corpus using a
    variation of the TextRank algorithm [1]_.

    Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer`

    Note
    ----
    The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
    to make sense.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary, optional.

    Returns
    -------
    list of str
        Most important documents of given `corpus` sorted by the document score, highest first.

    """
    # hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    '''if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)'''

    logger.info('Building graph')
    graph = _build_graph(sentences)

    logger.info('Filling graph')
    _set_graph_edge_weights(graph, dictionary)

    logger.info('Removing unreachable nodes of graph')
    _remove_unreachable_nodes(graph)

    # Cannot calculate eigenvectors if number of unique documents in corpus < 3.
    # Warns user to add more text. The function ends.
    if len(graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    logger.info('Pagerank graph')
    pagerank_scores = _pagerank(graph)

    logger.info('Sorting pagerank scores')
    sentences.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    if redundancy_check or (query is not None):
        selected = []
        counter = 0
        while len(selected) <= int(len(corpus) * ratio) and counter < len(corpus):
            sentence_words = sentences[counter].token.split()
            if redundancy_check and _is_redundant(sentence_words, selected):
                counter += 1
                continue
            if _is_related_to_query(sentence_words, query):
                selected.append(sentences[counter])
            counter += 1
        return selected

    return sentences[:int(len(corpus) * ratio)]

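# The variant above calls _is_redundant and _is_related_to_query without
# defining them. The sketch below is one hypothetical reading of those
# helpers, not the original implementation: redundancy as word overlap with
# already selected sentences (the 0.5 threshold is an assumption), and query
# relatedness as sharing at least one query token.
def _is_redundant(sentence_words, selected, threshold=0.5):
    """Return True if `sentence_words` overlaps too much with any selected sentence."""
    words = set(sentence_words)
    if not words:
        return False
    for other in selected:
        overlap = words & set(other.token.split())
        if float(len(overlap)) / len(words) > threshold:
            return True
    return False


def _is_related_to_query(sentence_words, query):
    """Return True if no query is given, or the sentence shares a token with it."""
    if query is None:
        return True
    return bool(set(sentence_words) & set(query))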