def summarize(title, text, summary_length=5):
    """
    Summarizes a single document.

    Args:
        | title (str) -- the document title
        | text (str) -- the document text
        | summary_length (int) -- the preferred number of sentences in the summary (default=5)

    Returns:
        | summary (list) -- list of sentences selected for the summary.

    Currently uses a modified version of `PyTeaser <https://github.com/xiaoxu193/PyTeaser>`,
    which is based on `TextTeaser <https://github.com/MojoJolo/textteaser>`.
    """
    summary = []
    keys = keywords(text)
    title_tokens = tokenize(title)

    # Score sentences and use the top selections.
    ranks = score(sent_tokenize(text), title_tokens, keys).most_common(summary_length)

    for rank in ranks:
        summary.append(rank[0])

    return summary
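# Illustrative usage sketch (not part of the original module). It assumes the
# helpers used above (`keywords`, `tokenize`, `sent_tokenize`, `score`) are
# defined in this file; the title and text are invented placeholder values.
def _example_single_doc_summary():
    title = "Example article title"
    text = (
        "The first sentence introduces the topic. "
        "The second sentence adds supporting detail. "
        "The third sentence repeats the main point of the title. "
        "The fourth sentence wraps up."
    )

    # Request a two-sentence summary; each item is a sentence taken from `text`.
    for sent in summarize(title, text, summary_length=2):
        print(sent)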
def keywords(text):
    """
    Gets the top 10 keywords and their frequency scores from a document.
    Sorts them in descending order by number of occurrences.
    """
    from operator import itemgetter  # for sorting

    text = sub(r'[^\w ]', '', text)  # strip special chars
    text_with_stops = [x.strip('.').lower() for x in text.split()]
    numWords = len(text_with_stops)

    text = tokenize(text)
    freq = Counter()
    for word in text:
        freq[word] += 1

    minSize = min(10, len(freq))
    keywords = dict(freq.most_common(minSize))  # get the first 10 as a dict

    for k in keywords:
        articleScore = keywords[k] * 1.0 / numWords
        keywords[k] = articleScore * 1.5 + 1

    # Sort by score, descending.
    keywords = sorted(keywords.items(), key=itemgetter(1), reverse=True)
    return dict(keywords)
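# Worked example (illustrative only) of the scoring above: a word that occurs
# 4 times in a 100-word document gets
#   articleScore = 4 * 1.0 / 100  = 0.04
#   score        = 0.04 * 1.5 + 1 = 1.06
# so every keyword scores at least 1, and more frequent words score higher.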
def score(sentences, title_words, keywords):
    """
    Score sentences based on their features.

    Args:
        | sentences (list) -- list of sentences to score
        | title_words (list) -- list of words in the title
        | keywords (dict) -- dict mapping document keywords to their scores
    """
    num_sentences = len(sentences)
    ranks = Counter()
    for i, s in enumerate(sentences):
        sentence = tokenize(s)

        # Calculate features.
        title_score = score_title(title_words, sentence)
        s_length = sentence_length(sentence)
        s_position = sentence_position(i + 1, num_sentences)
        sbs_feature = sbs(sentence, keywords)
        dbs_feature = dbs(sentence, keywords)
        frequency = (sbs_feature + dbs_feature) / 2.0 * 10.0

        # Weighted average of feature scores.
        total_score = (title_score * 1.5 +
                       frequency * 2.0 +
                       s_length * 1.0 +
                       s_position * 1.0) / 4.0
        ranks[s] = total_score

    return ranks
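# Worked example (illustrative only) of the weighted average above: for a
# sentence with
#   title_score = 0.6, frequency = 4.0, s_length = 0.8, s_position = 0.2
# the total is
#   total_score = (0.6 * 1.5 + 4.0 * 2.0 + 0.8 * 1.0 + 0.2 * 1.0) / 4.0
#               = (0.9 + 8.0 + 0.8 + 0.2) / 4.0
#               = 2.475
# The keyword-frequency features (sbs/dbs) carry the largest weight.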
def multisummarize(docs, summary_length=5):
    """
    Summarizes multiple documents.

    Args:
        | docs (list) -- list of documents (i.e. texts)
        | summary_length (int) -- the preferred number of sentences in the summary (default=5)

    .. note::
        The current implementation is super naive, so the quality and
        coherence of its summaries are pretty damn terrible. Its purpose for
        now is just to provide *some* API for multidoc summarization.

    Returns:
        | summary (list) -- list of sentences selected for the summary.

    .. note::
        BTW: this is super slow. It takes well over a minute for 4 moderately-sized documents.
    """
    # Collect all sentences from the input documents.
    # Also collect position information about each sentence.
    sents = []
    for doc in docs:
        sents += [(sent, vectorize(sent), pos + 1) for pos, sent in enumerate(sent_tokenize(doc))]

    clusters = []

    # Cluster the sentences.
    for sent in sents:
        # sent = (sent, vec, pos)

        # Keep track of the maximum scoring cluster
        # (above some minimum similarity)
        # and the avg sim score.
        # The higher the min_sim,
        # the harder it is to join a cluster.
        min_sim = 0.2
        max_cluster = None, min_sim
        for cluster in clusters:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += (1 - cosine(sent[1], sent_c[1]))
            avg_sim = avg_sim / len(cluster)
            if avg_sim >= max_cluster[1]:
                max_cluster = cluster, avg_sim

        # If a cluster was found, add the sentence to it.
        if max_cluster[0]:
            max_cluster[0].append(sent)

        # Otherwise, create a new cluster.
        else:
            clusters.append([sent])

    # Rank the clusters.
    # Assuming that clusters with more sentences are more important,
    # take the top `summary_length`.
    ranked_clusters = sorted(clusters, key=lambda x: -len(x))[:summary_length]

    # For each sentence cluster, select the highest scoring sentence.
    # Again - very naive.
    ideal_length = 20
    summary_sentences = []
    for cluster in ranked_clusters:
        max_sent = '', 0
        for sent in cluster:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += 1 - cosine(sent[1], sent_c[1])
            avg_sim = avg_sim / len(cluster)

            pos = sent[2]
            length = fabs(ideal_length - len(tokenize(sent[0]))) / ideal_length

            # Score is the average similarity, penalized by distance from the
            # ideal length and weighted by the inverse of the position.
            score = (avg_sim - length / 2) / pos

            if score >= max_sent[1]:
                max_sent = sent[0], score

        summary_sentences.append(max_sent[0])

    return summary_sentences
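# Illustrative usage sketch (not part of the original module), guarded so it
# only runs when this file is executed directly. It assumes the helpers used
# above (`vectorize`, `sent_tokenize`, `tokenize`, `cosine`, `fabs`) are
# available in this file; the documents are invented placeholders.
if __name__ == '__main__':
    docs = [
        "Article one talks about solar power. Solar panels keep getting cheaper.",
        "Article two also covers solar power. Panel prices keep falling each year.",
        "Article three discusses wind energy. Turbines are growing larger and more efficient.",
    ]

    # Each returned item is one sentence chosen from one of the sentence clusters.
    for sent in multisummarize(docs, summary_length=3):
        print(sent)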