Example #1
    def predict(self,
                collection,
                max_dates=10,
                max_summary_sents=1,
                ref_tl=None,
                input_titles=False,
                output_titles=False,
                output_body_sents=True):
        print('vectorizer...')
        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
        vectorizer.fit([s.raw for a in collection.articles() for s in a.sentences])

        print('date ranking...')
        ranked_dates = self.date_ranker.rank_dates(collection)

        start = collection.start.date()
        end = collection.end.date()
        ranked_dates = [d for d in ranked_dates if start <= d <= end]

        print('candidates & summarization...')
        dates_with_sents = self.sent_collector.collect_sents(
            ranked_dates,
            collection,
            vectorizer,
            include_titles=input_titles,
        )

        def sent_filter(sent):
            """
            Returns True if sentence is allowed to be in a summary.
            """
            lower = sent.raw.lower()
            if not any([kw in lower for kw in collection.keywords]):
                return False
            elif not output_titles and sent.is_title:
                return False
            elif not output_body_sents and not sent.is_sent:
                return False
            else:
                return True

        timeline = []
        l = 0
        for i, (d, d_sents) in enumerate(dates_with_sents):
            if l >= max_dates:
                break

            summary = self.summarizer.summarize(
                d_sents,
                k=max_summary_sents,
                vectorizer=vectorizer,
                filter=sent_filter
            )
            if summary:
                time = datetime.datetime(d.year, d.month, d.day)
                timeline.append((time, summary))
                l += 1

        timeline.sort(key=lambda x: x[0])
        return data.Timeline(timeline)
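The core of this variant is a TF-IDF vectorizer fitted over every raw sentence in the collection plus a keyword gate on candidate sentences. Below is a minimal, self-contained sketch of that pattern using only scikit-learn; `sentences` and `keywords` are illustrative stand-ins, not objects from the project.

from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [
    "The flood hit the coastal city on Monday.",
    "Officials announced new evacuation routes.",
    "Local sports results were also reported.",
]
keywords = ["flood", "evacuation"]

vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
vectorizer.fit(sentences)  # learn vocabulary and IDF weights from all raw sentences

# Keep only sentences that mention at least one collection keyword,
# mirroring the sent_filter() check above.
allowed = [s for s in sentences if any(kw in s.lower() for kw in keywords)]
print(allowed)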
Example #2
    def predict(self,
                collection,
                max_dates=10,
                max_summary_sents=1,
                ref_tl=None,
                input_titles=False,
                output_titles=False,
                output_body_sents=True):

        print('date ranking...')

        # Rank top dates to be included in the timeline.

        ranked_dates = self.date_ranker.rank_dates(collection)

        start = collection.start.date()
        end = collection.end.date()
        ranked_dates = [d for d in ranked_dates if start <= d <= end]

        vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
        vectorizer.fit(
            [s.raw for a in collection.articles() for s in a.sentences])

        print('candidates & summarization...')

        # Select sentences on each date

        dates_with_sents = self.sent_collector.collect_sents(
            ranked_dates,
            collection,
            vectorizer,
            include_titles=False,
        )

        timeline = []
        l = 0

        # Summarize the sentences on each date with BART

        for i, (d, d_sents) in enumerate(dates_with_sents):

            if l >= max_dates:
                break

            summary = self.summarizer.summarize(d_sents)
            if summary:
                time = datetime.datetime(d.year, d.month, d.day)
                timeline.append((time, summary))
                l += 1

        timeline.sort(key=lambda x: x[0])
        return data.Timeline(timeline)
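The final assembly step is shared by all of these variants: each selected date is paired with its summary, converted to a datetime, and the list is sorted chronologically before being wrapped in the project's Timeline object. A tiny sketch with placeholder data:

import datetime

selected = [
    (datetime.date(2021, 3, 5), ["Second event summary."]),
    (datetime.date(2021, 1, 12), ["First event summary."]),
]

timeline = [(datetime.datetime(d.year, d.month, d.day), summary)
            for d, summary in selected]
timeline.sort(key=lambda x: x[0])  # chronological order
for t, summary in timeline:
    print(t.date(), summary)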
Example #3
    def predict(self,
                j,
                cluster_dir,
                collection,
                max_dates=10,
                max_summary_sents=1,
                ref_tl=None,
                input_titles=False,
                output_titles=False,
                output_body_sents=True):

        # word embedding & cluster
        vectorizer = None
        embedder = SentenceTransformer('paraphrase-distilroberta-base-v1')
        #embedder = SentenceTransformer('paraphrase-distilroberta-base-v2')
        clusters = self.clusterer.cluster(collection, None, embedder)

        #doc_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
        #clusters = self.clusterer.cluster(collection, doc_vectorizer, None)
        clusters_num = len(clusters)

        print(f'ref_tl={ref_tl}')

        centroid_list = [c.centroid for c in clusters]

        # assign dates
        print('assigning cluster times...')
        for c in clusters:
            c.time = c.most_mentioned_time()
            if c.time is None:
                c.time = c.earliest_pub_time()

        print('ranking clusters...')
        ranked_clusters = self.cluster_ranker.rank(clusters, collection)
        batch = {
            'cluster': ranked_clusters,
            'ref': ref_tl
        }
        with open(cluster_dir/f'{collection.name}_{j}.pkl', 'wb') as f:
            pickle.dump(batch, file=f)

        print('vectorizing sentences...')

        def sent_filter(sent):
            return True

        print('summarization...')
        sys_l = 0
        sys_m = 0
        ref_m = max_dates * max_summary_sents

        date_to_summary = collections.defaultdict(list)
        for c in ranked_clusters:

            date = c.time.date()
            c_sents = self._select_sents_from_cluster(c)
            #print("C", date, len(c_sents), "M", sys_m, "L", sys_l)
            summary = self.summarizer.summarize(c_sents)

            if summary:
                if self.unique_dates and date in date_to_summary:
                    continue
                date_to_summary[date] += summary
                sys_m += len(summary)
                if self.unique_dates:
                    sys_l += 1

            if sys_m >= ref_m or sys_l >= max_dates:
                break

        timeline = []
        for d, summary in date_to_summary.items():
            t = datetime.datetime(d.year, d.month, d.day)
            timeline.append((t, summary))
        timeline.sort(key=lambda x: x[0])

        return data.Timeline(timeline), clusters_num
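This variant also dumps the ranked clusters and the reference timeline to a per-collection pickle file. A small sketch of that step with stand-in values (the file name mirrors the `{collection.name}_{j}.pkl` pattern above, but is hypothetical here):

import pickle
from pathlib import Path

cluster_dir = Path("clusters")
cluster_dir.mkdir(exist_ok=True)

batch = {
    "cluster": ["cluster-0", "cluster-1"],  # stand-in for the ranked cluster objects
    "ref": None,                            # stand-in for the reference timeline
}
with open(cluster_dir / "example_collection_0.pkl", "wb") as f:
    pickle.dump(batch, file=f)

with open(cluster_dir / "example_collection_0.pkl", "rb") as f:
    print(pickle.load(f))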
Example #4
    def predict(self,
                collection,
                max_dates=10,
                max_summary_sents=1,
                ref_tl=None,
                input_titles=False,
                output_titles=False,
                output_body_sents=True):

        print('clustering articles...')
        if self.clustering_rep == 'tfidf':
            print("\tusing tfidf")
            doc_vectorizer = TfidfVectorizer(lowercase=True,
                                             stop_words='english')
            clusters = self.clusterer.cluster(collection, doc_vectorizer)
        # use sentence transformer
        elif self.sbert_sequence_len:
            print("\tusing {} with {} max tokens".format(
                self.clustering_rep, self.sbert_sequence_len))
            sbert_model = SentenceTransformer(self.clustering_rep)
            sbert_model.max_seq_length = self.sbert_sequence_len - 3
            clusters = self.clusterer.cluster(collection,
                                              sbert_model,
                                              sbert=True)
        else:
            raise NotImplementedError(
                "invalid clustering_rep and sbert_sequence_len combination")

        print('assigning cluster times...')
        for c in clusters:
            c.time = c.most_mentioned_time()
            if c.time is None:
                c.time = c.earliest_pub_time()

        print('ranking clusters...')
        ranked_clusters = self.cluster_ranker.rank(clusters, collection)

        if self.sbert_summarizer:
            print('using a SBERTSummarizer')
        elif self.summarizer_rep == 'tfidf':
            print('tfidf vectorizing sentences...')
            raw_sents = [
                s.raw for a in collection.articles()
                for s in a.sentences[:self.clip_sents]
            ]
            vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
            vectorizer.fit(raw_sents)

            using_sbert = False
        elif self.summarizer_rep == 'same':
            print("\reusing clustering sbert model")
            vectorizer = sbert_model
            using_sbert = True

        def sent_filter(sent):
            """
            Returns True if sentence is allowed to be in a summary.
            """
            lower = sent.raw.lower()
            if not any([kw in lower for kw in collection.keywords]):
                return False
            elif not output_titles and sent.is_title:
                return False
            elif not output_body_sents and not sent.is_sent:
                return False
            else:
                return True

        print('summarization...')
        sys_l = 0
        sys_m = 0
        ref_m = max_dates * max_summary_sents

        date_to_summary = collections.defaultdict(list)
        for c in ranked_clusters:
            date = c.time.date()
            if self.sbert_summarizer:
                summary = self.summarizer.summarize(c.articles,
                                                    k=max_summary_sents,
                                                    date=date)
            else:
                c_sents = self._select_sents_from_cluster(c)
                #print("C", date, len(c_sents), "M", sys_m, "L", sys_l)
                summary = self.summarizer.summarize(c_sents,
                                                    k=max_summary_sents,
                                                    vectorizer=vectorizer,
                                                    filter=sent_filter,
                                                    sbert=using_sbert)

            if summary:
                if self.unique_dates and date in date_to_summary:
                    continue
                date_to_summary[date] += summary
                sys_m += len(summary)
                if self.unique_dates:
                    sys_l += 1

            if sys_m >= ref_m or sys_l >= max_dates:
                break

        timeline = []
        for d, summary in date_to_summary.items():
            t = datetime.datetime(d.year, d.month, d.day)
            timeline.append((t, summary))
        timeline.sort(key=lambda x: x[0])

        return data.Timeline(timeline)
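The SBERT branch above caps the model's input length before clustering. A short sketch of that setup, using the checkpoint name from Example #3 and an assumed `sbert_sequence_len` of 128; any sentence-transformers checkpoint behaves the same way:

from sentence_transformers import SentenceTransformer

sbert_sequence_len = 128  # illustrative value
sbert_model = SentenceTransformer("paraphrase-distilroberta-base-v1")
sbert_model.max_seq_length = sbert_sequence_len - 3  # leave room for special tokens

embeddings = sbert_model.encode(["A short test sentence.",
                                 "Another sentence to embed."])
print(embeddings.shape)  # (2, embedding_dim)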
Example #5
    def predict(self,
                collection,
                max_dates=10,
                max_summary_sents=1,
                ref_tl=None):
        '''
        Predict timeline for given collection

        Args:
            collection
            max_dates: max number of timeline events
            max_summary_sents: max sentences per timeline event
        '''
        articles = list(collection.articles())
        print("Getting all {} document embeddings...".format(len(articles)))
        full_texts = ['{}. {}'.format(a.title, a.text) for a in articles]
        article_vecs = self.sbert.encode(full_texts,
                                         batch_size=self.batch_size,
                                         show_progress_bar=True,
                                         device=self.device,
                                         num_workers=24)

        params = list(itertools.product(self.cd_thresholds,
                                        self.cd_n_articles))
        print("Detecting communities with {} param options...".format(
            len(params)))
        n = 0  # for printing n detection runs

        clust_large_enough = []
        clust_too_small = []

        for thresh, min_n in params:
            n += 1
            avg_percentile = np.mean([
                stats.percentileofscore(self.cd_thresholds, thresh),
                stats.percentileofscore(self.cd_n_articles, min_n)
            ])
            ordered_clusters = self._community_detection(
                article_vecs, thresh, min_n,
                min(len(article_vecs), self.cd_init_max_size))

            n_clusts = len(ordered_clusters)

            if n_clusts >= (max_dates * self.min_comm_mult):
                print(
                    "\tdetecting communities [n={},\tthresh={},\tmin={}\tap={}]"
                    .format(n, thresh, min_n, avg_percentile))
                print("\t\t{} communities (enough)".format(n_clusts))
                clust_large_enough.append((avg_percentile, ordered_clusters))
            else:
                # print("\t\t{} communities (not enough)".format(n_clusts))
                clust_too_small.append(
                    (n_clusts, avg_percentile, ordered_clusters))

        if len(clust_large_enough) > 0:
            best = sorted(clust_large_enough,
                          key=lambda element: element[0],
                          reverse=True)[0]
            ordered_clusters = best[1]
            print("\nUsing {} communities ({} percentile params)\n".format(
                len(ordered_clusters), best[0]))
        else:
            best = sorted(clust_too_small,
                          key=lambda element: (element[0], element[1]),
                          reverse=True)[0]
            ordered_clusters = best[2]
            print(
                "\nNone with enough communities. Using {} communities ({} percentile params)/n"
                .format(best[0], best[1]))

        formated_clusts = []

        if self.cluster_ranking == 'date_mention':
            articles_arr = np.asarray(articles)

            for c in ordered_clusters:
                clust_dict = dict()
                clust_dict['articles'] = articles_arr[c]
                clust_dict['vectors'] = article_vecs[c]

                all_dates = []
                for a in articles_arr[c]:
                    all_dates.append(a.time.date())
                    for s in a.sentences:
                        if s.get_date():
                            all_dates.append(s.get_date())

                most_common = collections.Counter(all_dates).most_common(1)[0]
                clust_dict['date'] = most_common[0]
                clust_dict['date_count'] = most_common[1]
                formated_clusts.append(clust_dict)

            formated_clusts = sorted(formated_clusts,
                                     key=lambda c:
                                     (c['date_count'], len(c['articles'])),
                                     reverse=True)

        elif self.cluster_ranking == 'size':
            articles_arr = np.asarray(articles)
            for c in ordered_clusters:
                clust_dict = dict()
                clust_dict['articles'] = articles_arr[c]
                clust_dict['vectors'] = article_vecs[c]
                clust_dict['date'] = None
                clust_dict['date_count'] = None
                formated_clusts.append(clust_dict)
        else:
            raise ValueError("invalid cluster_ranking option")

        print('summarization...')
        sys_l = 0
        sys_m = 0
        ref_m = max_dates * max_summary_sents

        date_to_summary = collections.defaultdict(list)

        for c in formated_clusts:
            if c['date']:
                print(
                    '\n\tcommunity with {} articles and {} date count'.format(
                        len(c['articles']), c['date_count']))
                core_doc_vecs = c['vectors'][:self.similarity_num_articles]
                candidate_sents = []
                date_docs = []

                for a in c['articles']:
                    start_ind = 0
                    article_added = False

                    if a.time.date() == c['date']:
                        date_docs.append(a)
                        article_added = True
                        start_ind = self.candidate_sents_per
                        for s in a.sentences[:start_ind]:
                            candidate_sents.append(s)

                    for s in a.sentences[start_ind:]:
                        if s.get_date() and s.get_date() == c['date']:
                            if not article_added:
                                date_docs.append(a)
                                article_added = True
                            candidate_sents.append(s)

                if len(candidate_sents) == 0:
                    print("no date linked candidate sentences")
                    continue

                print("...encoding candidate sentences...")
                candidate_sents_text = [s.raw for s in candidate_sents]
                candidate_sents_vecs = self.sbert.encode(
                    candidate_sents_text,
                    batch_size=self.batch_size,
                    show_progress_bar=True,
                    device=self.device,
                    num_workers=24)

                if self.summary_criteria == 'centroid':
                    doc_compare_vecs = np.mean(core_doc_vecs, axis=0)
                    assert len(doc_compare_vecs) == len(core_doc_vecs[0])

                    sent_compare_vecs = np.mean(candidate_sents_vecs, axis=0)
                    assert len(doc_compare_vecs) == len(sent_compare_vecs)
                else:
                    doc_compare_vecs = core_doc_vecs
                    sent_compare_vecs = candidate_sents_vecs

                if self.compare_with == 'both':
                    doc_sim = np.mean(util.pytorch_cos_sim(
                        candidate_sents_vecs,
                        torch.from_numpy(doc_compare_vecs).float()).numpy(),
                                      axis=1)
                    sent_sim = np.mean(util.pytorch_cos_sim(
                        candidate_sents_vecs,
                        torch.from_numpy(sent_compare_vecs).float()).numpy(),
                                       axis=1)

                    sent_scores = np.mean(np.stack((doc_sim, sent_sim)),
                                          axis=0)
                else:
                    raise NotImplementedError

                top_sent_inds = np.argsort(-sent_scores)[:max_summary_sents]

                event_summary = ''
                date = c['date']
                for ind in top_sent_inds:
                    event_summary += candidate_sents_text[ind] + ' '
                if not date:
                    print('\tNo date for event found')
                    continue
                if self.unique_dates and date in date_to_summary:
                    print('\tSkipping repeat date')
                    continue

                date_to_summary[date] += [event_summary]
                print('\t\t{}\t{}'.format(date, event_summary))

                sys_m += max_summary_sents
                if self.unique_dates:
                    sys_l += 1

                if sys_m >= ref_m or sys_l >= max_dates:
                    break

            else:
                print('\n\tcommunity with {} articles'.format(
                    len(c['articles'])))
                core_doc_vecs = c['vectors'][:self.similarity_num_articles]
                core_articles = c['articles'][:self.candidate_articles_per]
                candidate_sents = [
                    s for a in core_articles
                    for s in a.sentences[:self.candidate_sents_per]
                ]
                candidate_sents_text = [s.raw for s in candidate_sents]
                candidate_sents_vecs = self.sbert.encode(
                    candidate_sents_text,
                    batch_size=self.batch_size,
                    show_progress_bar=True,
                    device=self.device,
                    num_workers=24)

                if self.summary_criteria == 'centroid':
                    doc_compare_vecs = np.mean(core_doc_vecs, axis=0)
                    assert len(doc_compare_vecs) == len(core_doc_vecs[0])

                    sent_compare_vecs = np.mean(candidate_sents_vecs, axis=0)
                    assert len(doc_compare_vecs) == len(sent_compare_vecs)
                else:
                    doc_compare_vecs = core_doc_vecs
                    sent_compare_vecs = candidate_sents_vecs

                if self.compare_with == 'both':
                    doc_sim = np.mean(util.pytorch_cos_sim(
                        candidate_sents_vecs,
                        torch.from_numpy(doc_compare_vecs).float()).numpy(),
                                      axis=1)
                    sent_sim = np.mean(util.pytorch_cos_sim(
                        candidate_sents_vecs,
                        torch.from_numpy(sent_compare_vecs).float()).numpy(),
                                       axis=1)

                    sent_scores = np.mean(np.stack((doc_sim, sent_sim)),
                                          axis=0)
                else:
                    # mirror the first branch above and fail loudly instead of
                    # hitting an undefined sent_scores below
                    raise NotImplementedError

                top_sent_inds = np.argsort(-sent_scores)[:max_summary_sents]

                event_summary = ''
                date = None
                for ind in top_sent_inds:
                    event_summary += candidate_sents_text[ind] + ' '
                    if not date:
                        if candidate_sents[ind].get_date():
                            date = candidate_sents[ind].get_date()
                        else:
                            date = candidate_sents[ind].pub_time.date()
                if not date:
                    print('\tNo date for event found')
                    continue
                if self.unique_dates and date in date_to_summary:
                    print('\tSkipping repeat date')
                    continue

                date_to_summary[date] += [event_summary]
                print('\t\t{}\t{}'.format(date, event_summary))

                sys_m += max_summary_sents
                if self.unique_dates:
                    sys_l += 1

                if sys_m >= ref_m or sys_l >= max_dates:
                    break

        timeline = []
        for d, summary in date_to_summary.items():
            t = datetime.datetime(d.year, d.month, d.day)
            timeline.append((t, summary))
        timeline.sort(key=lambda x: x[0])

        return data.Timeline(timeline)
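The scoring step in this example averages two cosine similarities per candidate sentence: against a document centroid and against the sentence centroid. The sketch below reproduces that arithmetic with random stand-in vectors instead of SBERT embeddings:

import numpy as np
import torch
from sentence_transformers import util

rng = np.random.default_rng(0)
candidate_sents_vecs = rng.normal(size=(5, 8)).astype(np.float32)   # 5 candidate sentences
doc_compare_vecs = rng.normal(size=(8,)).astype(np.float32)         # document centroid
sent_compare_vecs = candidate_sents_vecs.mean(axis=0)               # sentence centroid

doc_sim = np.mean(util.pytorch_cos_sim(
    candidate_sents_vecs,
    torch.from_numpy(doc_compare_vecs).float()).numpy(), axis=1)
sent_sim = np.mean(util.pytorch_cos_sim(
    candidate_sents_vecs,
    torch.from_numpy(sent_compare_vecs).float()).numpy(), axis=1)

sent_scores = np.mean(np.stack((doc_sim, sent_sim)), axis=0)
max_summary_sents = 2
top_sent_inds = np.argsort(-sent_scores)[:max_summary_sents]
print(top_sent_inds)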
Example #6
    def predict(self,
                collection,
                max_dates=10,
                max_summary_sents=1,
                ref_tl=None,
                input_titles=False,
                output_titles=False,
                output_body_sents=True):

        print('clustering articles...')
        doc_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
        clusters = self.clusterer.cluster(collection, doc_vectorizer)

        print('assigning cluster times...')
        for c in clusters:
            c.time = c.most_mentioned_time()
            if c.time is None:
                c.time = c.earliest_pub_time()

        print('ranking clusters...')
        ranked_clusters = self.cluster_ranker.rank(clusters, collection)

        print('vectorizing sentences...')
        raw_sents = [
            s.raw for a in collection.articles()
            for s in a.sentences[:self.clip_sents]
        ]
        vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
        vectorizer.fit(raw_sents)

        def sent_filter(sent):
            """
            Returns True if sentence is allowed to be in a summary.
            """
            lower = sent.raw.lower()
            if not any([kw in lower for kw in collection.keywords]):
                return False
            elif not output_titles and sent.is_title:
                return False
            elif not output_body_sents and not sent.is_sent:
                return False
            else:
                return True

        print('summarization...')
        sys_l = 0
        sys_m = 0
        ref_m = max_dates * max_summary_sents

        date_to_summary = collections.defaultdict(list)
        for c in ranked_clusters:

            date = c.time.date()
            c_sents = self._select_sents_from_cluster(c)
            #print("C", date, len(c_sents), "M", sys_m, "L", sys_l)
            summary = self.summarizer.summarize(c_sents,
                                                k=max_summary_sents,
                                                vectorizer=vectorizer,
                                                filter=sent_filter)

            if summary:
                if self.unique_dates and date in date_to_summary:
                    continue
                date_to_summary[date] += summary
                sys_m += len(summary)
                if self.unique_dates:
                    sys_l += 1

            if sys_m >= ref_m or sys_l >= max_dates:
                break

        timeline = []
        for d, summary in date_to_summary.items():
            t = datetime.datetime(d.year, d.month, d.day)
            timeline.append((t, summary))
        timeline.sort(key=lambda x: x[0])

        return data.Timeline(timeline)
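All of the cluster-based variants share the same budget bookkeeping: summaries are accumulated per date in a defaultdict, repeat dates are skipped when `unique_dates` is set, and the loop stops once the sentence budget (`max_dates * max_summary_sents`) or the date budget is used up. A compact sketch with placeholder clusters:

import collections

max_dates, max_summary_sents, unique_dates = 2, 1, True
ref_m = max_dates * max_summary_sents

ranked = [("2021-01-12", ["Event A."]),
          ("2021-01-12", ["Event A again."]),  # repeat date, skipped if unique_dates
          ("2021-03-05", ["Event B."])]

date_to_summary = collections.defaultdict(list)
sys_l = sys_m = 0
for date, summary in ranked:
    if summary:
        if unique_dates and date in date_to_summary:
            continue
        date_to_summary[date] += summary
        sys_m += len(summary)
        if unique_dates:
            sys_l += 1
    if sys_m >= ref_m or sys_l >= max_dates:
        break

print(dict(date_to_summary))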
Example #7
    def predict(
        self,
        collection,
        max_dates=10,
        max_summary_sents=1,
        ref_tl=None,
        input_titles=False,
        output_titles=False,
        output_body_sents=True,
    ):
        print("vectorizer...")
        vectorizer = TfidfVectorizer(stop_words="english", lowercase=True)
        vectorizer.fit(
            [s.raw for a in collection.articles() for s in a.sentences])

        print("date ranking...")
        ranked_dates = self.date_ranker.rank_dates(collection,
                                                   plug=self.plug_page)

        start = collection.start.date()
        end = collection.end.date()
        ranked_dates = [d for d in ranked_dates if start <= d <= end]

        print("candidates & summarization...")
        dates_with_sents = self.sent_collector.collect_sents(
            ranked_dates,
            collection,
            vectorizer,
            include_titles=input_titles,
        )

        def sent_filter(sent):
            """
            Returns True if sentence is allowed to be in a summary.
            """
            lower = sent.raw.lower()
            if not any([kw in lower for kw in collection.keywords]):
                return False
            elif not output_titles and sent.is_title:
                return False
            elif not output_body_sents and not sent.is_sent:
                return False
            else:
                return True

        timeline = []
        l = 0
        for i, (d, d_sents) in enumerate(dates_with_sents):
            if l >= max_dates:
                break

            summary = self.summarizer.summarize(d_sents,
                                                k=max_summary_sents,
                                                vectorizer=vectorizer,
                                                filter=sent_filter)
            if len(summary) == 0:
                summary = [""]
                sent_id = None
                sent_page = None
                sent_taxo = None
            else:
                idx = [sent.raw for sent in d_sents].index(summary[0])
                sent_id = d_sents[idx].article_id
                sent_page = d_sents[idx].article_page
                sent_taxo = d_sents[idx].article_taxo

            if summary:
                time = datetime.datetime(d.year, d.month, d.day)
                timeline.append((
                    time,
                    [
                        "%s : %s : %s : " % (
                            sent_id,
                            sent_taxo,
                            sent_page,
                        ) + summary[0]
                    ],
                ))
                l += 1

        timeline.sort(key=lambda x: x[0])
        if self.plug_taxo:
            distances = plugin.taxostat_distance(timeline, 4)
            timeline = [
                timeline[i] for i, dist in enumerate(distances)
                if dist <= self.plug_taxo
            ]
        return data.Timeline(timeline)
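Before building the timeline entry, this variant matches the chosen summary sentence back to its source sentence by raw text so that the article id, taxonomy, and page can be prefixed. A sketch with a stand-in sentence class (the real project's sentence objects are assumed to expose these attributes):

from dataclasses import dataclass

@dataclass
class Sent:
    raw: str
    article_id: int
    article_page: int
    article_taxo: str

d_sents = [Sent("Officials announced new routes.", 7, 3, "news/local"),
           Sent("The flood hit the city on Monday.", 9, 1, "news/weather")]
summary = ["The flood hit the city on Monday."]

idx = [sent.raw for sent in d_sents].index(summary[0])
label = "%s : %s : %s : " % (d_sents[idx].article_id,
                             d_sents[idx].article_taxo,
                             d_sents[idx].article_page)
print(label + summary[0])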
Example #8
    def predict(
        self,
        collection,
        max_dates=10,
        max_summary_sents=1,
        ref_tl=None,
        input_titles=False,
        output_titles=False,
        output_body_sents=True,
    ):

        print("clustering articles...")
        doc_vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
        clusters = self.clusterer.cluster(collection, doc_vectorizer)

        print("assigning cluster times...")
        for c in clusters:
            c.time = c.most_mentioned_time()
            if c.time is None:
                c.time = c.earliest_pub_time()

        print("ranking clusters...")
        ranked_clusters = self.cluster_ranker.rank(
            clusters, collection, plug=self.plug_page
        )

        print("vectorizing sentences...")
        raw_sents = [
            s.raw for a in collection.articles() for s in a.sentences[: self.clip_sents]
        ]
        vectorizer = TfidfVectorizer(lowercase=True, stop_words="english")
        vectorizer.fit(raw_sents)

        def sent_filter(sent):
            """
            Returns True if sentence is allowed to be in a summary.
            """
            lower = sent.raw.lower()
            if not any([kw in lower for kw in collection.keywords]):
                return False
            elif not output_titles and sent.is_title:
                return False
            elif not output_body_sents and not sent.is_sent:
                return False
            else:
                return True

        print("summarization...")
        sys_l = 0
        sys_m = 0
        ref_m = max_dates * max_summary_sents

        date_to_summary = collections.defaultdict(list)

        for c in ranked_clusters:

            date = c.time.date()
            c_sents = self._select_sents_from_cluster(c)

            summary = self.summarizer.summarize(
                c_sents, k=max_summary_sents, vectorizer=vectorizer, filter=sent_filter
            )

            if summary:
                c_sents_raw = [s.raw for s in c_sents]
                idx = c_sents_raw.index(summary[0])
                if self.unique_dates and date in date_to_summary:
                    continue
                date_to_summary[date] += [
                    "%s : %s : %s : "
                    % (
                        c_sents[idx].article_id,
                        c_sents[idx].article_taxo,
                        c_sents[idx].article_page,
                    )
                    + summary[0]
                ]
                sys_m += len(summary)
                if self.unique_dates:
                    sys_l += 1

            if sys_m >= ref_m or sys_l >= max_dates:
                break

        timeline = []
        for d, summary in date_to_summary.items():
            t = datetime.datetime(d.year, d.month, d.day)
            timeline.append((t, summary))
        timeline.sort(key=lambda x: x[0])
        if self.plug_taxo:
            distances = plugin.taxostat_distance(timeline, 4)
            timeline = [
                timeline[i]
                for i, dist in enumerate(distances)
                if dist <= self.plug_taxo
            ]

        return data.Timeline(timeline)
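The final filter in this variant drops timeline entries whose taxonomy distance exceeds `plug_taxo`. The sketch below mirrors that list comprehension with a stand-in distance list in place of `plugin.taxostat_distance`, whose internals are not shown here:

plug_taxo = 2
timeline = [("2021-01-12", ["Event A."]), ("2021-03-05", ["Event B."])]
distances = [1, 3]  # stand-in for plugin.taxostat_distance(timeline, 4)

timeline = [timeline[i] for i, dist in enumerate(distances) if dist <= plug_taxo]
print(timeline)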