def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    """Extract the highest-ranked keywords from `text` via the TextRank graph algorithm.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        Fraction of graph nodes kept as keywords; ignored when `words` is given.
    words : int, optional
        Number of keywords to return.
    split : bool, optional
        If True, return a list instead of a newline-joined string.
    scores : bool, optional
        If True, include the score with each keyword.
    pos_filter : tuple, optional
        Part-of-speech tags accepted into the graph.
    lemmatize : bool, optional
        If True, map each lemma back to a single originating word.
    deacc : bool, optional
        If True, strip accentuation during cleaning.

    Returns
    -------
    str or list
        Format depends on `split` / `scores` (see `_format_results`).
    """
    # Gets a dict of word -> lemma.
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges.
    # FIX: pos_filter default is now a tuple — the previous list default was a
    # shared mutable default argument, which any in-place mutation would
    # corrupt across calls.
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used.
    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score.
    pagerank_scores = _pagerank(graph)
    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word.
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined.
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def keywords(text, ratio=0.2, words=None, split=False, scores=False):
    """Return the top-ranked keywords of `text` using the TextRank graph algorithm."""
    # Map each raw word to its cleaned/lemmatized unit.
    token_units = _clean_text_by_word(text)
    ordered_words = list(_tokenize_by_word(text))

    # Build the co-occurrence graph, then drop nodes with no path to the rest.
    word_graph = _build_graph(_get_words_for_graph(token_units))
    _set_graph_edges(word_graph, token_units, ordered_words)
    del ordered_words  # only needed while wiring the edges
    _remove_unreachable_nodes(word_graph)

    # Score every node with PageRank, then select the best lemmas.
    lemma_scores = _pagerank(word_graph)
    top_lemmas = _extract_tokens(word_graph.nodes(), lemma_scores, ratio, words)

    lemma_map = _lemmas_to_words(token_units)
    ranked_words = _get_keywords_with_score(top_lemmas, lemma_map)

    # Split on whitespace (keeping numbers/punctuation) so unrelated
    # concepts are never merged into one combined keyword.
    merged = _get_combined_keywords(ranked_words, text.split())

    return _format_results(ranked_words, merged, split, scores)
def summarize_corpus(corpus, ratio=0.2):
    """Return the most important documents of `corpus` via a variation of TextRank.

    The input must have at least INPUT_MIN_LENGTH documents for the summary
    to make sense. `ratio` determines how many documents are chosen
    (default: 20% of the corpus). The result is sorted by document score,
    highest first.
    """
    docs = _build_hasheable_corpus(corpus)

    # Refuse corpora too small for the ranking to be meaningful.
    if len(corpus) < INPUT_MIN_LENGTH:
        raise RuntimeError("Input corpus must have at least " + str(INPUT_MIN_LENGTH) + " documents.")

    doc_graph = _build_graph(docs)
    _set_graph_edge_weights(doc_graph)
    _remove_unreachable_nodes(doc_graph)

    scores_by_doc = _pagerank(doc_graph)

    # Documents removed as unreachable have no score and therefore sort last.
    docs.sort(key=lambda d: scores_by_doc.get(d, 0), reverse=True)

    keep = int(len(corpus) * ratio)
    return [list(d) for d in docs[:keep]]
def summarize_corpus(corpus, ratio=0.2):
    """
    Returns a list of the most important documents of a corpus using a
    variation of the TextRank algorithm.

    The input must have at least INPUT_MIN_LENGTH documents for the summary
    to make sense.

    The length of the output can be specified using the ratio parameter,
    which determines how many documents will be chosen for the summary
    (defaults at 20% of the number of documents of the corpus).

    The most important documents are returned as a list sorted by the
    document score, highest first.
    """
    # Wrap each document in a hashable form so documents can serve as graph nodes.
    hashable_corpus = _build_hasheable_corpus(corpus)

    # Hard requirement: too few documents make the ranking meaningless.
    if len(corpus) < INPUT_MIN_LENGTH:
        raise RuntimeError("Input corpus must have at least " + str(INPUT_MIN_LENGTH) + " documents.")

    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    # Nodes without any connection cannot accumulate PageRank mass; drop them.
    _remove_unreachable_nodes(graph)

    pagerank_scores = _pagerank(graph)

    # Documents removed as unreachable are missing from the score dict and sort last.
    hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    # Keep the top `ratio` fraction, converting each doc back to a plain list.
    return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def get_graph(text):
    """Build and return the word co-occurrence graph for `text`."""
    cleaned_units = _clean_text_by_word(text)
    word_sequence = list(_tokenize_by_word(text))

    word_graph = _build_graph(_get_words_for_graph(cleaned_units))
    _set_graph_edges(word_graph, cleaned_units, word_sequence)
    return word_graph
def get_graph(text):
    """Create and return the keyword graph for `text`.

    The text is cleaned and tokenized; eligible words become graph nodes
    and co-occurrence relations become edges.
    """
    # word -> processed unit mapping produced by the cleaning step
    tokens = _clean_text_by_word(text)
    # original word order, needed to detect co-occurrence when adding edges
    split_text = list(_tokenize_by_word(text))
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    return graph
def summarize_corpus(corpus, ratio=0.2):
    """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_.

    Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer`

    Note
    ----
    The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
    to make sense.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary, optional.

    Returns
    -------
    list of str
        Most important documents of given `corpus` sorted by the document score, highest first.

    """
    hashable_docs = _build_hasheable_corpus(corpus)

    # An empty corpus cannot be summarized at all.
    if not corpus:
        logger.warning("Input corpus is empty.")
        return []

    # Small corpora are still processed, but the user is warned.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)

    logger.info('Building graph')
    doc_graph = _build_graph(hashable_docs)

    logger.info('Filling graph')
    _set_graph_edge_weights(doc_graph)

    logger.info('Removing unreachable nodes of graph')
    _remove_unreachable_nodes(doc_graph)

    # Eigenvector computation needs at least 3 unique reachable documents.
    if len(doc_graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    logger.info('Pagerank graph')
    rank_of = _pagerank(doc_graph)

    logger.info('Sorting pagerank scores')
    hashable_docs.sort(key=lambda doc: rank_of.get(doc, 0), reverse=True)

    top = int(len(corpus) * ratio)
    return [list(doc) for doc in hashable_docs[:top]]
def summarize_corpus(corpus, ratio=0.2):
    """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_.

    Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer`

    Note
    ----
    The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
    to make sense. Shorter corpora are returned whole, sorted as-is.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary, optional.

    Returns
    -------
    list of str
        Most important documents of given `corpus` sorted by the document score, highest first.

    """
    hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    # FIX: the warning had been commented out — restored so callers learn
    # why they received an empty result (logger is already used below).
    if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Too few documents for a meaningful ranking: warn and return every
    # document unchanged.
    # FIX: the previous `hashable_corpus[:-1]` silently dropped the last
    # document even though the stated intent was "return the whole document";
    # the unused `sentence_length` local is removed as well.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)
        return [list(doc) for doc in hashable_corpus]

    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    # Cannot calculate eigenvectors if number of unique documents in corpus < 3.
    # Warns user to add more text. The function ends.
    if len(graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    logger.info('Pagerank graph')
    pagerank_scores = _pagerank(graph)

    logger.info('Sorting pagerank scores')
    hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus, ratio=0.2, weight_threshold=1.e-3):
    """Return the most important documents of `corpus` using a variation of the TextRank algorithm.

    The input should have at least INPUT_MIN_LENGTH documents for the
    summary to make sense.

    Parameters
    ----------
    corpus : list
        Corpus whose documents are ranked.
    ratio : float, optional
        Fraction (0..1) of documents chosen for the summary (default: 20%).
    weight_threshold : float, optional
        Minimum edge weight kept when filling the similarity graph.

    Returns
    -------
    list
        Most important documents sorted by score, highest first; an empty
        list when `corpus` is empty. If ``int(len(corpus) * ratio)`` is 0,
        the whole corpus is returned, ordered by relevance.
    """
    # FIX: the leading string previously used `% INPUT_MIN_LENGTH` formatting,
    # which makes it an ordinary discarded expression rather than a docstring
    # (__doc__ was None). A plain docstring restores it.
    hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    # FIX: return [] instead of a bare `return` (None) so every code path
    # yields a list, as callers of the other paths expect.
    if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)

    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph, weight_threshold)
    _remove_unreachable_nodes(graph)

    pagerank_scores = _pagerank(graph)

    hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    # The int truncation of len(corpus) * ratio may be zero for very short
    # inputs (< 5 docs at the default ratio); in that case return the whole
    # corpus, ordered by relevance.
    if int(len(corpus) * ratio) == 0:
        return [list(doc) for doc in hashable_corpus]
    return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus, ratio=0.2):
    """Return the most important documents of `corpus` using a variation of the TextRank algorithm.

    The input is expected to have at least INPUT_MIN_LENGTH documents for
    the summary to make sense.

    Parameters
    ----------
    corpus : list
        Corpus whose documents are ranked.
    ratio : float, optional
        Fraction (0..1) of documents chosen for the summary (default: 20%).

    Returns
    -------
    list
        Most important documents sorted by score, highest first; an empty
        list when `corpus` is empty or has fewer than 3 reachable documents.
    """
    # FIX: the leading string previously used `% INPUT_MIN_LENGTH` formatting,
    # which makes it an ordinary discarded expression rather than a docstring
    # (__doc__ was None). A plain docstring restores it.
    hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)

    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    # Cannot calculate eigenvectors if number of unique documents in corpus < 3.
    # Warns user to add more text. The function ends.
    if len(graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    pagerank_scores = _pagerank(graph)

    hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def get_graph(text):
    """Creates and returns graph from given text, cleans and tokenize text before building graph.

    Parameters
    ----------
    text : str
        Sequence of values.

    Returns
    -------
    :class:`~gensim.summarization.graph.Graph`
        Created graph.

    """
    cleaned_units = _clean_text_by_word(text)
    words_in_order = list(_tokenize_by_word(text))

    result_graph = _build_graph(_get_words_for_graph(cleaned_units))
    _set_graph_edges(result_graph, cleaned_units, words_in_order)
    return result_graph
def get_graph(text):
    """Creates and returns graph from given text, cleans and tokenize text before building graph.

    Parameters
    ----------
    text : str
        Sequence of values.

    Returns
    -------
    :class:`~gensim.summarization.graph.Graph`
        Created graph.

    """
    # word -> processed unit mapping produced by the cleaning step
    tokens = _clean_text_by_word(text)
    # original word order, needed to detect co-occurrence when adding edges
    split_text = list(_tokenize_by_word(text))

    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    return graph
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    """Extract the top-ranked keywords of `text` with the TextRank algorithm.

    `ratio` is ignored when `words` is given; `split` and `scores` control
    the output format (see `_format_results`). `pos_filter` restricts which
    part-of-speech tags enter the graph, `lemmatize` collapses surface forms,
    and `deacc` strips accentuation while cleaning.
    """
    # word -> lemma mapping over the cleaned, decoded text
    unicode_text = to_unicode(text)
    units = _clean_text_by_word(unicode_text, deacc=deacc)
    ordered_words = list(_tokenize_by_word(unicode_text))

    # POS-filtered co-occurrence graph
    kw_graph = _build_graph(_get_words_for_graph(units, pos_filter))
    _set_graph_edges(kw_graph, units, ordered_words)
    del ordered_words  # only needed while wiring the edges
    _remove_unreachable_nodes(kw_graph)

    # PageRank each node, then select the best lemmas.
    node_scores = _pagerank(kw_graph)
    best_lemmas = _extract_tokens(kw_graph.nodes(), node_scores, ratio, words)

    # Many surface variations of one word would pollute the results.
    if lemmatize:
        lemma_map = {unit.token: [word] for word, unit in iteritems(units)}
    else:
        lemma_map = _lemmas_to_words(units)

    scored_keywords = _get_keywords_with_score(best_lemmas, lemma_map)

    # Splitting on whitespace keeps numbers and punctuation, so separate
    # concepts are never merged together.
    combined = _get_combined_keywords(scored_keywords, unicode_text.split())

    return _format_results(scored_keywords, combined, split, scores)
def summarize_corpus(corpus, ratio=0.2):
    """Return the most important documents of `corpus` using a variation of the TextRank algorithm.

    The input is expected to have at least INPUT_MIN_LENGTH documents for
    the summary to make sense.

    Parameters
    ----------
    corpus : list
        Corpus whose documents are ranked.
    ratio : float, optional
        Fraction (0..1) of documents chosen for the summary (default: 20%).

    Returns
    -------
    list
        Most important documents sorted by score, highest first; an empty
        list when `corpus` is empty or has fewer than 3 reachable documents.
    """
    # FIX: the leading string previously used `% INPUT_MIN_LENGTH` formatting,
    # which makes it an ordinary discarded expression rather than a docstring
    # (__doc__ was None). A plain docstring restores it.
    hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)

    graph = _build_graph(hashable_corpus)
    _set_graph_edge_weights(graph)
    _remove_unreachable_nodes(graph)

    # Cannot calculate eigenvectors if number of unique documents in corpus < 3.
    # Warns user to add more text. The function ends.
    if len(graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    pagerank_scores = _pagerank(graph)

    hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    return [list(doc) for doc in hashable_corpus[:int(len(corpus) * ratio)]]
def summarize_corpus(corpus):
    """Compute a PageRank score for every document of `corpus`.

    Builds a weighted similarity graph over the documents, prunes
    unreachable nodes, and ranks the remainder with PageRank.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus.

    Returns
    -------
    dict
        PageRank score per (hashable) document.

    """
    docs = _build_hasheable_corpus(corpus)

    logger.info('Building graph')
    doc_graph = _build_graph(docs)

    logger.info('Filling graph')
    _set_graph_edge_weights(doc_graph)

    logger.info('Removing unreachable nodes of graph')
    _remove_unreachable_nodes(doc_graph)

    logger.info('Pagerank graph')
    return _pagerank(doc_graph)
def keywords(text, ratio=0.2, words=None, split=False, scores=False):
    """Extract keywords from `text` using the TextRank graph algorithm.

    `ratio` limits the fraction of graph words kept (ignored when `words`
    is given); `split` and `scores` control the output format
    (see `_format_results`).
    """
    # Gets a dict of word -> lemma
    tokens = _clean_text_by_word(text)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens))
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used
    # Nodes without any connection cannot accumulate PageRank mass; drop them.
    _remove_unreachable_nodes(graph)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)
    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    lemmas_to_word = _lemmas_to_words(tokens)
    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of sentences is reduced by the provided ratio,
        else, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        Whether split keywords if True.
    scores : bool, optional
        Whether score of keyword.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by endl.

    """
    # Gets a dict of word -> lemma
    text = to_unicode(text)
    tokens = _clean_text_by_word(text, deacc=deacc)
    split_text = list(_tokenize_by_word(text))

    # Creates the graph and adds the edges
    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
    # NOTE(review): the return value of get_sentence_score_per_word is
    # discarded — presumably it is called for side effects; confirm,
    # otherwise this line is dead work.
    get_sentence_score_per_word(text)
    _set_graph_edges(graph, tokens, split_text)
    del split_text  # It's no longer used
    _remove_unreachable_nodes(graph)

    # An edgeless graph cannot be ranked: short-circuit with empty results.
    if not any(True for _ in graph.iter_edges()):
        return _format_results([], [], split, scores)

    # Ranks the tokens using the PageRank algorithm. Returns dict of lemma -> score
    pagerank_scores = _pagerank(graph)
    extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)

    # The results can be polluted by many variations of the same word
    if lemmatize:
        lemmas_to_word = {}
        for word, unit in iteritems(tokens):
            lemmas_to_word[unit.token] = [word]
    else:
        lemmas_to_word = _lemmas_to_words(tokens)

    keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)

    # text.split() to keep numbers and punctuation marks, so separated concepts are not combined
    combined_keywords = _get_combined_keywords(keywords, text.split())

    return _format_results(keywords, combined_keywords, split, scores)
def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=('NN', 'JJ'), lemmatize=False, deacc=True):
    """Get most ranked words of provided text and/or its combinations.

    Parameters
    ----------
    text : str
        Input text.
    ratio : float, optional
        If no "words" option is selected, the number of sentences is reduced
        by the provided ratio, else, the ratio is ignored.
    words : int, optional
        Number of returned words.
    split : bool, optional
        Whether split keywords if True.
    scores : bool, optional
        Whether score of keyword.
    pos_filter : tuple, optional
        Part of speech filters.
    lemmatize : bool, optional
        If True - lemmatize words.
    deacc : bool, optional
        If True - remove accentuation.

    Returns
    -------
    result: list of (str, float)
        If `scores`, keywords with scores **OR**
    result: list of str
        If `split`, keywords only **OR**
    result: str
        Keywords, joined by endl.

    """
    # word -> lemma mapping over the cleaned, decoded text
    unicode_text = to_unicode(text)
    units = _clean_text_by_word(unicode_text, deacc=deacc)
    word_sequence = list(_tokenize_by_word(unicode_text))

    # POS-filtered co-occurrence graph
    kw_graph = _build_graph(_get_words_for_graph(units, pos_filter))
    _set_graph_edges(kw_graph, units, word_sequence)
    del word_sequence  # only needed while wiring the edges
    _remove_unreachable_nodes(kw_graph)

    # An edgeless graph cannot be ranked; short-circuit with empty results.
    has_edges = any(True for _ in kw_graph.iter_edges())
    if not has_edges:
        return _format_results([], [], split, scores)

    # PageRank the nodes and pick the top lemmas.
    node_scores = _pagerank(kw_graph)
    best_lemmas = _extract_tokens(kw_graph.nodes(), node_scores, ratio, words)

    # Collapse the many surface variations of one word.
    if lemmatize:
        lemma_map = {unit.token: [word] for word, unit in iteritems(units)}
    else:
        lemma_map = _lemmas_to_words(units)

    scored = _get_keywords_with_score(best_lemmas, lemma_map)

    # Whitespace split keeps numbers/punctuation so distinct concepts stay apart.
    combined = _get_combined_keywords(scored, unicode_text.split())

    return _format_results(scored, combined, split, scores)
def summarize_corpus(corpus, dictionary, sentences, ratio=0.2, redundancy_check=True, query=None):
    """Get a list of the most important documents of a corpus using a variation of the TextRank algorithm [1]_.

    Used as helper for summarize :func:`~gensim.summarization.summarizer.summarizer`

    Note
    ----
    The input must have at least :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH` documents for the summary
    to make sense.

    Parameters
    ----------
    corpus : list of list of (int, int)
        Given corpus; only its length is used here to size the output.
    dictionary
        Passed through to `_set_graph_edge_weights` when weighting edges.
    sentences : list
        Sentence objects that become graph nodes; each exposes a `.token`
        string. NOTE: this list is sorted in place.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary, optional.
    redundancy_check : bool, optional
        If True, skip sentences `_is_redundant` flags against those already selected.
    query : optional
        Query forwarded to `_is_related_to_query`.
        NOTE(review): when `query` is None and `redundancy_check` is True the
        selection loop still runs — presumably `_is_related_to_query` accepts
        None and returns True; confirm against its implementation.

    Returns
    -------
    list
        Most important sentences sorted by the document score, highest first.

    """
    #hashable_corpus = _build_hasheable_corpus(corpus)

    # If the corpus is empty, the function ends.
    # NOTE(review): the guards below were disabled by turning them into a
    # string literal; an empty/short corpus is no longer rejected here.
    '''if len(corpus) == 0:
        logger.warning("Input corpus is empty.")
        return []

    # Warns the user if there are too few documents.
    if len(corpus) < INPUT_MIN_LENGTH:
        logger.warning("Input corpus is expected to have at least %d documents.", INPUT_MIN_LENGTH)'''

    logger.info('Building graph')
    graph = _build_graph(sentences)

    logger.info('Filling graph')
    _set_graph_edge_weights(graph, dictionary)

    logger.info('Removing unreachable nodes of graph')
    _remove_unreachable_nodes(graph)

    # Cannot calculate eigenvectors if number of unique documents in corpus < 3.
    # Warns user to add more text. The function ends.
    if len(graph.nodes()) < 3:
        logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3")
        return []

    logger.info('Pagerank graph')
    pagerank_scores = _pagerank(graph)

    logger.info('Sorting pagerank scores')
    # Sorts `sentences` in place, highest score first; unscored nodes sort last.
    sentences.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True)

    if redundancy_check or (query is not None):
        selected = []
        counter = 0
        # Walk sentences in rank order, skipping redundant ones, until the
        # target count is reached or the corpus is exhausted.
        # NOTE(review): the `<=` bound allows one sentence beyond
        # int(len(corpus) * ratio) — confirm whether that extra item is intended.
        while (len(selected) <= int(len(corpus) * ratio) and counter < len(corpus)):
            sentence_words = sentences[counter].token.split()
            if redundancy_check and _is_redundant(sentence_words, selected):
                counter += 1
                continue
            if _is_related_to_query(sentence_words, query):
                selected.append(sentences[counter])
            counter += 1
        return selected

    # No filtering requested: plain top-ratio slice of the ranked sentences.
    return sentences[:int(len(corpus) * ratio)]