Example #1
0
def main(args):
    """Build the timeline-generation system selected by ``args.method``
    ('datewise' or 'clust') and evaluate it on the given dataset.

    Raises:
        FileNotFoundError: if ``args.dataset`` does not exist.
        ValueError: if ``args.method`` is not a known method name.
    """
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name  # last component of the path

    if args.method == 'datewise':
        resources = Path(args.resources)
        models_path = resources / 'supervised_date_ranker.{}.pkl'.format(
            dataset_name)
        # load regression models for date ranking
        key_to_model = utils.load_pkl(models_path)
        date_ranker = datewise.SupervisedDateRanker(method='regression')
        sent_collector = datewise.PM_Mean_SentenceCollector(clip_sents=5,
                                                            pub_end=2)
        summarizer = summarizers.CentroidOpt()
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizer,
            sent_collector=sent_collector,
            key_to_model=key_to_model)

    elif args.method == 'clust':
        cluster_ranker = clust.ClusterDateMentionCountRanker()
        clusterer = clust.TemporalMarkovClusterer()
        summarizer = summarizers.Pegasus()
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=cluster_ranker,
            clusterer=clusterer,
            summarizer=summarizer,
            clip_sents=5,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset needs its ground-truth timelines truncated and
    # the evaluation time span widened; all other datasets use the defaults.
    trunc = dataset_name == 'entities'
    evaluate(system,
             dataset,
             args.output,
             trunc_timelines=trunc,
             time_span_extension=7 if trunc else 0)
Example #2
0
def main(args):
    """Build the timeline-generation system selected by ``args.method``
    ('datewise' or 'clust') and run it on the given dataset.

    Raises:
        FileNotFoundError: if ``args.dataset`` does not exist.
        ValueError: if ``args.method`` is not a known method name.
    """
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)

    if args.method == 'datewise':
        # load regression models for date ranking
        key_to_model = utils.load_pkl(args.model)
        models = list(key_to_model.values())
        date_ranker = datewise.SupervisedDateRanker(method='regression')
        # there are multiple models (for cross-validation);
        # we just use an arbitrary one, the first
        date_ranker.model = models[0]
        sent_collector = datewise.PM_Mean_SentenceCollector(
            clip_sents=2, pub_end=2)
        summarizer = summarizers.CentroidOpt()
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizer,
            sent_collector=sent_collector,
            key_to_model=key_to_model)

    elif args.method == 'clust':
        cluster_ranker = clust.ClusterDateMentionCountRanker()
        clusterer = clust.TemporalMarkovClusterer()
        summarizer = summarizers.CentroidOpt()
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=cluster_ranker,
            clusterer=clusterer,
            summarizer=summarizer,
            clip_sents=2,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    run(system, dataset, args.output)
Example #3
0
def main(args):
    """Build the timeline-generation system selected by ``args.method`` and
    evaluate it on the given dataset.

    Supported methods: 'datewise' (supervised date ranking, flavour chosen by
    ``args.model``), 'clust', and several SBERT-based clustering variants.

    Raises:
        FileNotFoundError: if ``args.dataset`` does not exist.
        ValueError: if ``args.method`` is not a known method name.
    """
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name  # last component of the path

    if args.method == 'datewise':
        resources = Path(args.resources)

        # load date-ranking models; the ranker flavour depends on args.model
        if args.model == 'new_lr':
            method = 'log_regression'
            model_path = resources / 'date_ranker_new_lr.all.pkl'
            key_to_model = utils.load_pkl(model_path)
            model = key_to_model[dataset_name]
        elif is_neural_net(args.model):  # fcn, deep_fcn, cnn, wide_fcn
            model, model_path = model_selector(args.model)
            model.load_state_dict(torch.load(model_path)[dataset_name])
            method = 'neural_net'
            key_to_model = None
        else:
            method = 'linear_regression'
            model_path = resources / 'date_ranker_orig.{}.pkl'.format(
                dataset_name)
            key_to_model = utils.load_pkl(model_path)
            model = None
        date_ranker = datewise.SupervisedDateRanker(model, method=method)
        sent_collector = datewise.PM_Mean_SentenceCollector(clip_sents=5,
                                                            pub_end=2)
        summarizer = summarizers.CentroidOpt()
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizer,
            sent_collector=sent_collector,
            key_to_model=key_to_model,
            method=method)

    elif args.method == 'clust':
        cluster_ranker = clust.ClusterDateMentionCountRanker()
        clusterer = clust.TemporalMarkovClusterer(max_days=1)
        summarizer = summarizers.CentroidOpt()
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=cluster_ranker,
            clusterer=clusterer,
            summarizer=summarizer,
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbertsum':
        cluster_ranker = clust.ClusterDateMentionCountRanker()
        clusterer = clust.TemporalMarkovClusterer()
        summarizer = summarizers.SBERTSummarizer(date_only=False,
                                                 summary_criteria='similarity',
                                                 candidate_sents_per=5,
                                                 compare_with='both')
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=cluster_ranker,
            clusterer=clusterer,
            summarizer=summarizer,
            clip_sents=5,
            unique_dates=True,
            sbert_summarizer=True)
    elif args.method == 'clust_sbert':
        cluster_ranker = clust.ClusterDateMentionCountRanker()
        clusterer = clust.TemporalMarkovClusterer(max_days=365)
        summarizer = summarizers.CentroidOpt()
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=cluster_ranker,
            clusterer=clusterer,
            summarizer=summarizer,
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbert_sbertsum':
        cluster_ranker = clust.ClusterDateMentionCountRanker()
        clusterer = clust.TemporalMarkovClusterer(max_days=7)
        summarizer = summarizers.SBERTSummarizer(date_only=True,
                                                 summary_criteria='similarity',
                                                 candidate_sents_per=5,
                                                 compare_with='both')
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=cluster_ranker,
            clusterer=clusterer,
            summarizer=summarizer,
            sbert_summarizer=True,
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbert_sbert_old':
        cluster_ranker = clust.ClusterDateMentionCountRanker()
        clusterer = clust.TemporalMarkovClusterer(max_days=7)
        summarizer = summarizers.SubmodularSummarizer()
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=cluster_ranker,
            clusterer=clusterer,
            summarizer=summarizer,
            summarizer_rep='same',
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'sbert':
        system = clust.SBERTTimelineGenerator(
            model_name='distilroberta-base-paraphrase-v1',
            cd_n_articles=list(range(5, 20, 1)) + list(range(20, 100, 5)),
            cd_thresholds=np.linspace(.25, .95, 25).tolist(),
            cd_init_max_size=500,
            min_comm_mult=1.5,
            cluster_ranking='date_mention',
            candidate_sents_per=5,
            candidate_articles_per=10,
            similarity_num_articles=10,
            summary_criteria='similarity',
            compare_with='both',
            unique_dates=True)
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset needs its ground-truth timelines truncated and
    # the evaluation time span widened; all other datasets use the defaults.
    trunc = dataset_name == 'entities'
    evaluate(system,
             dataset,
             args.output,
             trunc_timelines=trunc,
             time_span_extension=7 if trunc else 0,
             word_mover_stop_words='nltk')
Example #4
0
 def load_model_orig(self, model_path, topic):
     """Load the per-topic date-ranking model from a pickled dict.

     The pickle at ``model_path`` maps topic -> {'model': ...}; the model
     for ``topic`` is stored on ``self.model``.
     """
     # NOTE(review): pickle is only safe for trusted model files.
     model_dict = utils.load_pkl(model_path)
     self.model = model_dict[topic]['model']
Example #5
0
 def load_model_lr(self, model_path, dataset_name):
     """Load the dataset-specific regression model from a pickled dict.

     The pickle at ``model_path`` maps dataset name -> model; the entry for
     ``dataset_name`` is stored on ``self.model``.
     """
     key_to_model = utils.load_pkl(model_path)
     self.model = key_to_model[dataset_name]
Example #6
0
def main(args):
    """Build the timeline-generation system selected by ``args.method``
    ('datewise' or 'clust') and evaluate it on the given dataset.

    For 'datewise', the date-ranker flavour is chosen by ``args.model``
    ('new_lr', a neural-net name, or anything else for the original
    linear-regression models).

    Raises:
        FileNotFoundError: if ``args.dataset`` does not exist.
        ValueError: if ``args.method`` is not a known method name.
    """
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name  # last component of the path

    if args.method == 'datewise':
        resources = Path(args.resources)

        # load date-ranking models; the ranker flavour depends on args.model
        if args.model == 'new_lr':
            method = 'log_regression'
            model_path = resources / 'date_ranker_new_lr.all.pkl'
            key_to_model = utils.load_pkl(model_path)
            model = key_to_model[dataset_name]
        elif is_neural_net(args.model):  # fcn, deep_fcn, cnn, wide_fcn
            model, model_path = model_selector(args.model)
            model.load_state_dict(torch.load(model_path)[dataset_name])
            method = 'neural_net'
            key_to_model = None
        else:
            method = 'linear_regression'
            model_path = resources / 'date_ranker_orig.{}.pkl'.format(
                dataset_name)
            key_to_model = utils.load_pkl(model_path)
            model = None
        date_ranker = datewise.SupervisedDateRanker(model, method=method)
        sent_collector = datewise.PM_Mean_SentenceCollector(clip_sents=5,
                                                            pub_end=2)
        summarizer = summarizers.CentroidOpt()
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizer,
            sent_collector=sent_collector,
            key_to_model=key_to_model,
            method=method)

    elif args.method == 'clust':
        cluster_ranker = clust.ClusterDateMentionCountRanker()
        clusterer = clust.TemporalMarkovClusterer()
        summarizer = summarizers.CentroidOpt()
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=cluster_ranker,
            clusterer=clusterer,
            summarizer=summarizer,
            clip_sents=5,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset needs its ground-truth timelines truncated and
    # the evaluation time span widened; all other datasets use the defaults.
    trunc = dataset_name == 'entities'
    evaluate(system,
             dataset,
             args.output,
             trunc_timelines=trunc,
             time_span_extension=7 if trunc else 0)