def predict(self, collection, max_dates=10, max_summary_sents=1, ref_tl=None,
            input_titles=False, output_titles=False, output_body_sents=True):

    print('vectorizer...')
    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
    vectorizer.fit([s.raw for a in collection.articles() for s in a.sentences])

    print('date ranking...')
    ranked_dates = self.date_ranker.rank_dates(collection)

    start = collection.start.date()
    end = collection.end.date()
    ranked_dates = [d for d in ranked_dates if start <= d <= end]

    print('candidates & summarization...')
    dates_with_sents = self.sent_collector.collect_sents(
        ranked_dates,
        collection,
        vectorizer,
        include_titles=input_titles,
    )

    def sent_filter(sent):
        """
        Returns True if sentence is allowed to be in a summary.
        """
        lower = sent.raw.lower()
        if not any([kw in lower for kw in collection.keywords]):
            return False
        elif not output_titles and sent.is_title:
            return False
        elif not output_body_sents and not sent.is_sent:
            return False
        else:
            return True

    timeline = []
    l = 0
    for i, (d, d_sents) in enumerate(dates_with_sents):
        if l >= max_dates:
            break
        summary = self.summarizer.summarize(
            d_sents,
            k=max_summary_sents,
            vectorizer=vectorizer,
            filter=sent_filter
        )
        if summary:
            time = datetime.datetime(d.year, d.month, d.day)
            timeline.append((time, summary))
            l += 1

    timeline.sort(key=lambda x: x[0])
    return data.Timeline(timeline)

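# The keyword-based sent_filter above recurs, with the same logic, in several of
# the predict variants below. A minimal, self-contained sketch of its behaviour;
# Sentence here is a stand-in for the project's sentence objects, assumed to
# expose raw/is_title/is_sent:

from collections import namedtuple

Sentence = namedtuple('Sentence', ['raw', 'is_title', 'is_sent'])

def keyword_sent_filter(sent, keywords, output_titles=False, output_body_sents=True):
    """Return True if the sentence may appear in a summary."""
    lower = sent.raw.lower()
    if not any(kw in lower for kw in keywords):
        return False          # must mention at least one collection keyword
    if not output_titles and sent.is_title:
        return False          # titles excluded unless explicitly allowed
    if not output_body_sents and not sent.is_sent:
        return False          # non-body sentences excluded unless allowed
    return True

if __name__ == '__main__':
    s = Sentence(raw='The earthquake struck at dawn.', is_title=False, is_sent=True)
    print(keyword_sent_filter(s, keywords=['earthquake']))  # True
    print(keyword_sent_filter(s, keywords=['flood']))       # False
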
def predict(self, collection, max_dates=10, max_summary_sents=1, ref_tl=None,
            input_titles=False, output_titles=False, output_body_sents=True):

    print('date ranking...')
    # Rank top dates to be included in the timeline.
    ranked_dates = self.date_ranker.rank_dates(collection)

    start = collection.start.date()
    end = collection.end.date()
    ranked_dates = [d for d in ranked_dates if start <= d <= end]

    vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
    vectorizer.fit(
        [s.raw for a in collection.articles() for s in a.sentences])

    print('candidates & summarization...')
    # Select candidate sentences for each date.
    dates_with_sents = self.sent_collector.collect_sents(
        ranked_dates,
        collection,
        vectorizer,
        include_titles=False,
    )

    timeline = []
    l = 0
    # Summarize the sentences of each date with BART.
    for i, (d, d_sents) in enumerate(dates_with_sents):
        if l >= max_dates:
            break
        summary = self.summarizer.summarize(d_sents)
        if summary:
            time = datetime.datetime(d.year, d.month, d.day)
            timeline.append((time, summary))
            l += 1

    timeline.sort(key=lambda x: x[0])
    return data.Timeline(timeline)

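# The variant above delegates to an abstractive summarizer (per its comment, a
# BART model) rather than extractive sentence selection. A hedged sketch of what
# such a summarizer step could look like with Hugging Face transformers; the
# model name and wrapper function are assumptions, not the project's actual
# summarizer:

from transformers import pipeline

bart = pipeline("summarization", model="facebook/bart-large-cnn")

def summarize_date_sentences(d_sents, max_length=60, min_length=10):
    """Concatenate a date's candidate sentences and summarize them abstractively."""
    text = " ".join(s.raw for s in d_sents)
    out = bart(text, max_length=max_length, min_length=min_length, truncation=True)
    return [out[0]["summary_text"]]
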
def predict(self, j, cluster_dir, collection, max_dates=10, max_summary_sents=1,
            ref_tl=None, input_titles=False, output_titles=False,
            output_body_sents=True):

    # word embedding & cluster
    vectorizer = None
    embedder = SentenceTransformer('paraphrase-distilroberta-base-v1')
    #embedder = SentenceTransformer('paraphrase-distilroberta-base-v2')
    clusters = self.clusterer.cluster(collection, None, embedder)
    #doc_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
    #clusters = self.clusterer.cluster(collection, doc_vectorizer, None)
    clusters_num = len(clusters)
    print(f'ref_tl={ref_tl}')
    centroid_list = [c.centroid for c in clusters]

    # assign dates
    print('assigning cluster times...')
    for c in clusters:
        c.time = c.most_mentioned_time()
        if c.time is None:
            c.time = c.earliest_pub_time()

    print('ranking clusters...')
    ranked_clusters = self.cluster_ranker.rank(clusters, collection)

    batch = {
        'cluster': ranked_clusters,
        'ref': ref_tl
    }
    with open(cluster_dir / f'{collection.name}_{j}.pkl', 'wb') as f:
        pickle.dump(batch, file=f)

    print('vectorizing sentences...')

    def sent_filter(sent):
        return True

    print('summarization...')
    sys_l = 0
    sys_m = 0
    ref_m = max_dates * max_summary_sents

    date_to_summary = collections.defaultdict(list)
    for c in ranked_clusters:
        date = c.time.date()
        c_sents = self._select_sents_from_cluster(c)
        #print("C", date, len(c_sents), "M", sys_m, "L", sys_l)
        summary = self.summarizer.summarize(c_sents)
        if summary:
            if self.unique_dates and date in date_to_summary:
                continue
            date_to_summary[date] += summary
            sys_m += len(summary)
            if self.unique_dates:
                sys_l += 1

        if sys_m >= ref_m or sys_l >= max_dates:
            break

    timeline = []
    for d, summary in date_to_summary.items():
        t = datetime.datetime(d.year, d.month, d.day)
        timeline.append((t, summary))
    timeline.sort(key=lambda x: x[0])

    return data.Timeline(timeline), clusters_num

def predict(self, collection, max_dates=10, max_summary_sents=1, ref_tl=None,
            input_titles=False, output_titles=False, output_body_sents=True):

    print('clustering articles...')
    if self.clustering_rep == 'tfidf':
        print("\tusing tfidf")
        doc_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
        clusters = self.clusterer.cluster(collection, doc_vectorizer)
    # use sentence transformer
    elif self.sbert_sequence_len:
        print("\tusing {} with {} max tokens".format(
            self.clustering_rep, self.sbert_sequence_len))
        sbert_model = SentenceTransformer(self.clustering_rep)
        sbert_model.max_seq_length = self.sbert_sequence_len - 3
        clusters = self.clusterer.cluster(collection, sbert_model, sbert=True)
    else:
        raise NotImplementedError(
            "invalid clustering_rep and sbert_sequence_len combination")

    print('assigning cluster times...')
    for c in clusters:
        c.time = c.most_mentioned_time()
        if c.time is None:
            c.time = c.earliest_pub_time()

    print('ranking clusters...')
    ranked_clusters = self.cluster_ranker.rank(clusters, collection)

    if self.sbert_summarizer:
        print('using a SBERTSummarizer')
    elif self.summarizer_rep == 'tfidf':
        print('tfidf vectorizing sentences...')
        raw_sents = [
            s.raw for a in collection.articles()
            for s in a.sentences[:self.clip_sents]
        ]
        vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
        vectorizer.fit(raw_sents)
        using_sbert = False
    elif self.summarizer_rep == 'same':
        print("\treusing clustering sbert model")
        vectorizer = sbert_model
        using_sbert = True

    def sent_filter(sent):
        """
        Returns True if sentence is allowed to be in a summary.
        """
        lower = sent.raw.lower()
        if not any([kw in lower for kw in collection.keywords]):
            return False
        elif not output_titles and sent.is_title:
            return False
        elif not output_body_sents and not sent.is_sent:
            return False
        else:
            return True

    print('summarization...')
    sys_l = 0
    sys_m = 0
    ref_m = max_dates * max_summary_sents

    date_to_summary = collections.defaultdict(list)
    for c in ranked_clusters:
        date = c.time.date()
        if self.sbert_summarizer:
            summary = self.summarizer.summarize(c.articles,
                                                k=max_summary_sents,
                                                date=date)
        else:
            c_sents = self._select_sents_from_cluster(c)
            #print("C", date, len(c_sents), "M", sys_m, "L", sys_l)
            summary = self.summarizer.summarize(c_sents,
                                                k=max_summary_sents,
                                                vectorizer=vectorizer,
                                                filter=sent_filter,
                                                sbert=using_sbert)
        if summary:
            if self.unique_dates and date in date_to_summary:
                continue
            date_to_summary[date] += summary
            sys_m += len(summary)
            if self.unique_dates:
                sys_l += 1

        if sys_m >= ref_m or sys_l >= max_dates:
            break

    timeline = []
    for d, summary in date_to_summary.items():
        t = datetime.datetime(d.year, d.month, d.day)
        timeline.append((t, summary))
    timeline.sort(key=lambda x: x[0])

    return data.Timeline(timeline)

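# The clustering step above switches between a TF-IDF document representation
# and an SBERT model whose max_seq_length is clipped. A self-contained sketch of
# the TF-IDF side (scikit-learn only; the document texts are toy stand-ins):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = [
    "Oil spill reaches the coast as cleanup crews arrive.",
    "Cleanup crews battle the oil spill near the coast.",
    "Elections scheduled for next spring, officials say.",
]
doc_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
doc_vecs = doc_vectorizer.fit_transform(docs)   # sparse matrix (n_docs, n_terms)
print(cosine_similarity(doc_vecs).round(2))     # the first two documents are most similar
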
def predict(self, collection, max_dates=10, max_summary_sents=1, ref_tl=None):
    '''
    Predict timeline for given collection

    Args:
        collection
        max_dates: max number of timeline events
        max_summary_sents: max sentences per timeline event
    '''
    articles = list(collection.articles())
    print("Getting all {} document embeddings...".format(len(articles)))
    full_texts = ['{}. {}'.format(a.title, a.text) for a in articles]
    article_vecs = self.sbert.encode(full_texts,
                                     batch_size=self.batch_size,
                                     show_progress_bar=True,
                                     device=self.device,
                                     num_workers=24)

    params = list(itertools.product(self.cd_thresholds, self.cd_n_articles))
    print("Detecting communities with {} param options...".format(len(params)))
    n = 0  # for printing n detection runs
    clust_large_enough = []
    clust_too_small = []
    for thresh, min_n in params:
        n += 1
        avg_percentile = np.mean([
            stats.percentileofscore(self.cd_thresholds, thresh),
            stats.percentileofscore(self.cd_n_articles, min_n)
        ])
        ordered_clusters = self._community_detection(
            article_vecs, thresh, min_n,
            min(len(article_vecs), self.cd_init_max_size))
        n_clusts = len(ordered_clusters)
        if n_clusts >= (max_dates * self.min_comm_mult):
            print("\tdetecting communities [n={},\tthresh={},\tmin={}\tap={}]"
                  .format(n, thresh, min_n, avg_percentile))
            print("\t\t{} communities (enough)".format(n_clusts))
            clust_large_enough.append((avg_percentile, ordered_clusters))
        else:
            # print("\t\t{} communities (not enough)".format(n_clusts))
            clust_too_small.append(
                (n_clusts, avg_percentile, ordered_clusters))

    if len(clust_large_enough) > 0:
        best = sorted(clust_large_enough,
                      key=lambda element: element[0],
                      reverse=True)[0]
        ordered_clusters = best[1]
        print("\nUsing {} communities ({} percentile params)\n".format(
            len(ordered_clusters), best[0]))
    else:
        best = sorted(clust_too_small,
                      key=lambda element: (element[0], element[1]),
                      reverse=True)[0]
        ordered_clusters = best[2]
        print("\nNone with enough communities. "
              "Using {} communities ({} percentile params)\n".format(
                  best[0], best[1]))

    formated_clusts = []
    if self.cluster_ranking == 'date_mention':
        articles_arr = np.asarray(articles)
        for c in ordered_clusters:
            clust_dict = dict()
            clust_dict['articles'] = articles_arr[c]
            clust_dict['vectors'] = article_vecs[c]
            all_dates = []
            for a in articles_arr[c]:
                all_dates.append(a.time.date())
                for s in a.sentences:
                    if s.get_date():
                        all_dates.append(s.get_date())
            most_common = collections.Counter(all_dates).most_common(1)[0]
            clust_dict['date'] = most_common[0]
            clust_dict['date_count'] = most_common[1]
            formated_clusts.append(clust_dict)
        formated_clusts = sorted(formated_clusts,
                                 key=lambda c: (c['date_count'],
                                                len(c['articles'])),
                                 reverse=True)
    elif self.cluster_ranking == 'size':
        articles_arr = np.asarray(articles)
        for c in ordered_clusters:
            clust_dict = dict()
            clust_dict['articles'] = articles_arr[c]
            clust_dict['vectors'] = article_vecs[c]
            clust_dict['date'] = None
            clust_dict['date_count'] = None
            formated_clusts.append(clust_dict)
    else:
        raise ValueError("invalid cluster_ranking option")

    print('summarization...')
    sys_l = 0
    sys_m = 0
    ref_m = max_dates * max_summary_sents

    date_to_summary = collections.defaultdict(list)
    for c in formated_clusts:
        if c['date']:
            print('\n\tcommunity with {} articles and {} date count'.format(
                len(c['articles']), c['date_count']))
            core_doc_vecs = c['vectors'][:self.similarity_num_articles]
            candidate_sents = []
            date_docs = []
            for a in c['articles']:
                start_ind = 0
                article_added = False
                if a.time.date() == c['date']:
                    date_docs.append(a)
                    article_added = True
                    start_ind = self.candidate_sents_per
                    for s in a.sentences[:start_ind]:
                        candidate_sents.append(s)
                for s in a.sentences[start_ind:]:
                    if s.get_date() and s.get_date() == c['date']:
                        if not article_added:
                            date_docs.append(a)
                            article_added = True
                        candidate_sents.append(s)
            if len(candidate_sents) == 0:
                print("no date linked candidate sentences")
                continue

            print("...encoding candidate sentences...")
            candidate_sents_text = [s.raw for s in candidate_sents]
            candidate_sents_vecs = self.sbert.encode(
                candidate_sents_text,
                batch_size=self.batch_size,
                show_progress_bar=True,
                device=self.device,
                num_workers=24)

            if self.summary_criteria == 'centroid':
                doc_compare_vecs = np.mean(core_doc_vecs, axis=0)
                assert len(doc_compare_vecs) == len(core_doc_vecs[0])
                sent_compare_vecs = np.mean(candidate_sents_vecs, axis=0)
                assert len(doc_compare_vecs) == len(sent_compare_vecs)
            else:
                doc_compare_vecs = core_doc_vecs
                sent_compare_vecs = candidate_sents_vecs

            if self.compare_with == 'both':
                doc_sim = np.mean(util.pytorch_cos_sim(
                    candidate_sents_vecs,
                    torch.from_numpy(doc_compare_vecs).float()).numpy(),
                    axis=1)
                sent_sim = np.mean(util.pytorch_cos_sim(
                    candidate_sents_vecs,
                    torch.from_numpy(sent_compare_vecs).float()).numpy(),
                    axis=1)
                sent_scores = np.mean(np.stack((doc_sim, sent_sim)), axis=0)
            else:
                raise NotImplementedError

            top_sent_inds = np.argsort(-sent_scores)[:max_summary_sents]
            event_summary = ''
            date = c['date']
            for ind in top_sent_inds:
                event_summary += candidate_sents_text[ind] + ' '

            if not date:
                print('\tNo date for event found')
                continue
            if self.unique_dates and date in date_to_summary:
                print('\tSkipping repeat date')
                continue
            date_to_summary[date] += [event_summary]
            print('\t\t{}\t{}'.format(date, event_summary))

            sys_m += max_summary_sents
            if self.unique_dates:
                sys_l += 1
            if sys_m >= ref_m or sys_l >= max_dates:
                break
        else:
            print('\n\tcommunity with {} articles'.format(len(c['articles'])))
            core_doc_vecs = c['vectors'][:self.similarity_num_articles]
            core_articles = c['articles'][:self.candidate_articles_per]
            candidate_sents = [
                s for a in core_articles
                for s in a.sentences[:self.candidate_sents_per]
            ]
            candidate_sents_text = [s.raw for s in candidate_sents]
            candidate_sents_vecs = self.sbert.encode(
                candidate_sents_text,
                batch_size=self.batch_size,
                show_progress_bar=True,
                device=self.device,
                num_workers=24)

            if self.summary_criteria == 'centroid':
                doc_compare_vecs = np.mean(core_doc_vecs, axis=0)
                assert len(doc_compare_vecs) == len(core_doc_vecs[0])
                sent_compare_vecs = np.mean(candidate_sents_vecs, axis=0)
                assert len(doc_compare_vecs) == len(sent_compare_vecs)
            else:
                doc_compare_vecs = core_doc_vecs
                sent_compare_vecs = candidate_sents_vecs

            if self.compare_with == 'both':
                doc_sim = np.mean(util.pytorch_cos_sim(
                    candidate_sents_vecs,
                    torch.from_numpy(doc_compare_vecs).float()).numpy(),
                    axis=1)
                sent_sim = np.mean(util.pytorch_cos_sim(
                    candidate_sents_vecs,
                    torch.from_numpy(sent_compare_vecs).float()).numpy(),
                    axis=1)
                sent_scores = np.mean(np.stack((doc_sim, sent_sim)), axis=0)

            top_sent_inds = np.argsort(-sent_scores)[:max_summary_sents]
            event_summary = ''
            date = None
            for ind in top_sent_inds:
                event_summary += candidate_sents_text[ind] + ' '
                if not date:
                    if candidate_sents[ind].get_date():
                        date = candidate_sents[ind].get_date()
                    else:
                        date = candidate_sents[ind].pub_time.date()

            if not date:
                print('\tNo date for event found')
                continue
            if self.unique_dates and date in date_to_summary:
                print('\tSkipping repeat date')
                continue
            date_to_summary[date] += [event_summary]
            print('\t\t{}\t{}'.format(date, event_summary))

            sys_m += max_summary_sents
            if self.unique_dates:
                sys_l += 1
            if sys_m >= ref_m or sys_l >= max_dates:
                break

    timeline = []
    for d, summary in date_to_summary.items():
        t = datetime.datetime(d.year, d.month, d.day)
        timeline.append((t, summary))
    timeline.sort(key=lambda x: x[0])

    return data.Timeline(timeline)

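# The community-detection variant above scores each candidate sentence by
# averaging its cosine similarity to the core-document centroid and to the
# candidate-sentence centroid (the compare_with == 'both' branch). A minimal
# numpy-only sketch of that scoring step, with random vectors standing in for
# the SBERT embeddings it actually uses:

import numpy as np

def rank_candidates(candidate_vecs, doc_centroid, sent_centroid, k=1):
    """Return indices of the top-k candidates by averaged cosine similarity."""
    def cos(mat, vec):
        mat = mat / np.linalg.norm(mat, axis=-1, keepdims=True)
        vec = vec / np.linalg.norm(vec)
        return mat @ vec

    doc_sim = cos(candidate_vecs, doc_centroid)    # similarity to document centroid
    sent_sim = cos(candidate_vecs, sent_centroid)  # similarity to sentence centroid
    scores = np.mean(np.stack((doc_sim, sent_sim)), axis=0)
    return np.argsort(-scores)[:k]

if __name__ == '__main__':
    rng = np.random.default_rng(0)
    cand = rng.normal(size=(20, 768))       # 20 candidate sentence vectors
    doc_centroid = cand[:5].mean(axis=0)    # centroid of the "core" documents
    sent_centroid = cand.mean(axis=0)       # centroid of all candidate sentences
    print(rank_candidates(cand, doc_centroid, sent_centroid, k=2))
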
def predict(self, collection, max_dates=10, max_summary_sents=1, ref_tl=None,
            input_titles=False, output_titles=False, output_body_sents=True):

    print('clustering articles...')
    doc_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
    clusters = self.clusterer.cluster(collection, doc_vectorizer)

    print('assigning cluster times...')
    for c in clusters:
        c.time = c.most_mentioned_time()
        if c.time is None:
            c.time = c.earliest_pub_time()

    print('ranking clusters...')
    ranked_clusters = self.cluster_ranker.rank(clusters, collection)

    print('vectorizing sentences...')
    raw_sents = [
        s.raw for a in collection.articles()
        for s in a.sentences[:self.clip_sents]
    ]
    vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
    vectorizer.fit(raw_sents)

    def sent_filter(sent):
        """
        Returns True if sentence is allowed to be in a summary.
        """
        lower = sent.raw.lower()
        if not any([kw in lower for kw in collection.keywords]):
            return False
        elif not output_titles and sent.is_title:
            return False
        elif not output_body_sents and not sent.is_sent:
            return False
        else:
            return True

    print('summarization...')
    sys_l = 0
    sys_m = 0
    ref_m = max_dates * max_summary_sents

    date_to_summary = collections.defaultdict(list)
    for c in ranked_clusters:
        date = c.time.date()
        c_sents = self._select_sents_from_cluster(c)
        #print("C", date, len(c_sents), "M", sys_m, "L", sys_l)
        summary = self.summarizer.summarize(c_sents,
                                            k=max_summary_sents,
                                            vectorizer=vectorizer,
                                            filter=sent_filter)
        if summary:
            if self.unique_dates and date in date_to_summary:
                continue
            date_to_summary[date] += summary
            sys_m += len(summary)
            if self.unique_dates:
                sys_l += 1

        if sys_m >= ref_m or sys_l >= max_dates:
            break

    timeline = []
    for d, summary in date_to_summary.items():
        t = datetime.datetime(d.year, d.month, d.day)
        timeline.append((t, summary))
    timeline.sort(key=lambda x: x[0])

    return data.Timeline(timeline)

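# Several of the cluster-based variants call self._select_sents_from_cluster(c),
# which is not shown in this section. A hypothetical minimal implementation,
# assuming a cluster exposes an .articles list and that only the first
# clip_sents sentences of each article are considered (mirroring how the
# vectorizer is fitted above); the project's actual helper may differ:

def select_sents_from_cluster(cluster, clip_sents=5):
    """Collect the leading sentences of every article in a cluster."""
    sents = []
    for article in cluster.articles:
        sents.extend(article.sentences[:clip_sents])
    return sents
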
def predict(
    self,
    collection,
    max_dates=10,
    max_summary_sents=1,
    ref_tl=None,
    input_titles=False,
    output_titles=False,
    output_body_sents=True,
):
    print("vectorizer...")
    vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
    vectorizer.fit(
        [s.raw for a in collection.articles() for s in a.sentences])

    print("date ranking...")
    ranked_dates = self.date_ranker.rank_dates(collection, plug=self.plug_page)

    start = collection.start.date()
    end = collection.end.date()
    ranked_dates = [d for d in ranked_dates if start <= d <= end]

    print("candidates & summarization...")
    dates_with_sents = self.sent_collector.collect_sents(
        ranked_dates,
        collection,
        vectorizer,
        include_titles=input_titles,
    )

    def sent_filter(sent):
        """
        Returns True if sentence is allowed to be in a summary.
        """
        lower = sent.raw.lower()
        if not any([kw in lower for kw in collection.keywords]):
            return False
        elif not output_titles and sent.is_title:
            return False
        elif not output_body_sents and not sent.is_sent:
            return False
        else:
            return True

    timeline = []
    l = 0
    for i, (d, d_sents) in enumerate(dates_with_sents):
        if l >= max_dates:
            break
        summary = self.summarizer.summarize(d_sents,
                                            k=max_summary_sents,
                                            vectorizer=vectorizer,
                                            filter=sent_filter)
        if len(summary) == 0:
            summary = [""]
            sent_id = None
            sent_page = None
            sent_taxo = None
        else:
            idx = [sent.raw for sent in d_sents].index(summary[0])
            sent_id = d_sents[idx].article_id
            sent_page = d_sents[idx].article_page
            sent_taxo = d_sents[idx].article_taxo

        if summary:
            time = datetime.datetime(d.year, d.month, d.day)
            timeline.append((
                time,
                [
                    "%s : %s : %s : " % (
                        sent_id,
                        sent_taxo,
                        sent_page,
                    ) + summary[0]
                ],
            ))
            l += 1

    timeline.sort(key=lambda x: x[0])

    if self.plug_taxo:
        distances = plugin.taxostat_distance(timeline, 4)
        timeline = [
            timeline[i] for i, dist in enumerate(distances)
            if dist <= self.plug_taxo
        ]
    return data.Timeline(timeline)

def predict(
    self,
    collection,
    max_dates=10,
    max_summary_sents=1,
    ref_tl=None,
    input_titles=False,
    output_titles=False,
    output_body_sents=True,
):
    print("clustering articles...")
    doc_vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
    clusters = self.clusterer.cluster(collection, doc_vectorizer)

    print("assigning cluster times...")
    for c in clusters:
        c.time = c.most_mentioned_time()
        if c.time is None:
            c.time = c.earliest_pub_time()

    print("ranking clusters...")
    ranked_clusters = self.cluster_ranker.rank(
        clusters, collection, plug=self.plug_page
    )

    print("vectorizing sentences...")
    raw_sents = [
        s.raw for a in collection.articles()
        for s in a.sentences[:self.clip_sents]
    ]
    vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
    vectorizer.fit(raw_sents)

    def sent_filter(sent):
        """
        Returns True if sentence is allowed to be in a summary.
        """
        lower = sent.raw.lower()
        if not any([kw in lower for kw in collection.keywords]):
            return False
        elif not output_titles and sent.is_title:
            return False
        elif not output_body_sents and not sent.is_sent:
            return False
        else:
            return True

    print("summarization...")
    sys_l = 0
    sys_m = 0
    ref_m = max_dates * max_summary_sents

    date_to_summary = collections.defaultdict(list)
    for c in ranked_clusters:
        date = c.time.date()
        c_sents = self._select_sents_from_cluster(c)
        summary = self.summarizer.summarize(
            c_sents, k=max_summary_sents, vectorizer=vectorizer, filter=sent_filter
        )
        if summary:
            c_sents_raw = [s.raw for s in c_sents]
            idx = c_sents_raw.index(summary[0])
            if self.unique_dates and date in date_to_summary:
                continue
            date_to_summary[date] += [
                "%s : %s : %s : " % (
                    c_sents[idx].article_id,
                    c_sents[idx].article_taxo,
                    c_sents[idx].article_page,
                ) + summary[0]
            ]
            sys_m += len(summary)
            if self.unique_dates:
                sys_l += 1

        if sys_m >= ref_m or sys_l >= max_dates:
            break

    timeline = []
    for d, summary in date_to_summary.items():
        t = datetime.datetime(d.year, d.month, d.day)
        timeline.append((t, summary))
    timeline.sort(key=lambda x: x[0])

    if self.plug_taxo:
        distances = plugin.taxostat_distance(timeline, 4)
        timeline = [
            timeline[i] for i, dist in enumerate(distances)
            if dist <= self.plug_taxo
        ]
    return data.Timeline(timeline)

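# A hedged end-to-end usage sketch for the predict methods above. The loader,
# generator constructor, and Timeline iteration below are assumptions made for
# illustration, not the project's confirmed API:

if __name__ == '__main__':
    collection = load_collection('datasets/t17/bpoil')           # hypothetical loader
    generator = ClusteringTimelineGenerator(unique_dates=True)   # hypothetical constructor args
    timeline = generator.predict(collection,
                                 max_dates=10,
                                 max_summary_sents=1)
    # Assumes the returned data.Timeline iterates as (datetime, sentences) pairs.
    for date, sents in timeline:
        print(date.date(), ' '.join(sents))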