def __init__(self, id, plain_text):
    # Parse ranked, tokenized sentences.
    self.id = id
    self.sentences = sentencize(plain_text)
    self.wlength = None  # Calculated only if needed.
    self.tfidf = vectorize(plain_text)
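# `sentencize`, `vectorize`, and `tokenize` are helpers defined elsewhere
# in this package and are not shown here. Below is a minimal sketch of what
# they might look like, assuming NLTK for tokenization and scikit-learn for
# vectorization -- the actual implementations may differ:

from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import HashingVectorizer

# A hashing vectorizer keeps dimensionality consistent across calls without
# fitting on a shared corpus; the real implementation likely uses tf-idf
# vectors instead (note the `self.tfidf` attribute above).
_vectorizer = HashingVectorizer(n_features=2**10)

def sentencize(plain_text):
    # Split raw text into a list of sentences.
    return sent_tokenize(plain_text)

def vectorize(text):
    # Map text to a fixed-width term vector.
    return _vectorizer.transform([text]).toarray()[0]

def tokenize(text):
    # Word-tokenize a sentence (used for the length penalty below).
    return word_tokenize(text)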
from math import fabs

from nltk.tokenize import sent_tokenize
from scipy.spatial.distance import cosine


def multisummarize(docs, summary_length=5):
    """
    Summarize multiple documents.

    Args:
        | docs (list) -- list of documents (i.e. texts)
        | summary_length (int) -- the preferred sentence length of the summary (default=5)

    .. note::
        The current implementation is super naive, so the quality and
        coherence of its summaries are pretty terrible. But its purpose
        for now is to provide *some* API for multidoc summarization.

    Returns:
        | summary (list) -- list of sentences selected for the summary.

    .. note::
        BTW: this is also super slow; it takes well over a minute for
        four moderately-sized documents.
    """
    # Collect all sentences from the input documents,
    # along with each sentence's (1-indexed) position in its document.
    sents = []
    for doc in docs:
        sents += [(sent, vectorize(sent), pos + 1)
                  for pos, sent in enumerate(sent_tokenize(doc))]

    # Cluster the sentences.
    clusters = []
    for sent in sents:
        # sent = (text, vec, pos)
        # Keep track of the maximum-scoring cluster (above some minimum
        # similarity) and its average similarity score. The higher
        # min_sim is, the harder it is to join a cluster.
        min_sim = 0.2
        max_cluster = None, min_sim
        for cluster in clusters:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += 1 - cosine(sent[1], sent_c[1])
            avg_sim = avg_sim / len(cluster)
            if avg_sim >= max_cluster[1]:
                max_cluster = cluster, avg_sim

        # If a sufficiently similar cluster was found, add the sentence
        # to it. Otherwise, create a new cluster.
        if max_cluster[0]:
            max_cluster[0].append(sent)
        else:
            clusters.append([sent])

    # Rank the clusters, assuming that clusters with more sentences are
    # more important, and keep the top `summary_length`.
    ranked_clusters = sorted(clusters, key=lambda x: -len(x))[:summary_length]

    # From each sentence cluster, select the highest-scoring sentence.
    # Again -- very naive.
    ideal_length = 20
    summary_sentences = []
    for cluster in ranked_clusters:
        max_sent = '', 0
        for sent in cluster:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += 1 - cosine(sent[1], sent_c[1])
            avg_sim = avg_sim / len(cluster)
            pos = sent[2]
            length_penalty = fabs(ideal_length - len(tokenize(sent[0]))) / ideal_length

            # Score is the average similarity, penalized by distance
            # from the ideal length, weighted by the inverse of the
            # sentence's position in its document.
            score = (avg_sim - length_penalty / 2) / pos
            if score >= max_sent[1]:
                max_sent = sent[0], score
        summary_sentences.append(max_sent[0])

    return summary_sentences
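# A quick usage sketch with hypothetical document texts. The summary
# contains at most `summary_length` sentences, one per cluster:

if __name__ == '__main__':
    docs = [
        'The cat sat on the mat. It was a sunny day.',
        'A cat was sitting on a mat. The weather was sunny.',
        'Dogs prefer bones. Cats, on the other hand, prefer mats.',
    ]
    for sentence in multisummarize(docs, summary_length=2):
        print(sentence)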