def main(args):
    """Build the timeline system selected by ``args.method`` and evaluate it.

    Supported methods: 'datewise' (supervised date ranking + centroid
    summaries) and 'clust' (temporal Markov clustering + Pegasus summaries).
    """
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name  # last component of the dataset path

    if args.method == 'datewise':
        resources = Path(args.resources)
        models_path = resources / 'supervised_date_ranker.{}.pkl'.format(
            dataset_name)
        # Pre-trained regression models used to rank candidate dates.
        key_to_model = utils.load_pkl(models_path)
        date_ranker = datewise.SupervisedDateRanker(method='regression')
        sent_collector = datewise.PM_Mean_SentenceCollector(
            clip_sents=5, pub_end=2)
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizers.CentroidOpt(),
            sent_collector=sent_collector,
            key_to_model=key_to_model)
    elif args.method == 'clust':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(),
            summarizer=summarizers.Pegasus(),
            clip_sents=5,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset is evaluated on truncated timelines with an
    # extended time span; all other datasets use the defaults.
    if dataset_name == 'entities':
        evaluate(system, dataset, args.output,
                 trunc_timelines=True, time_span_extension=7)
    else:
        evaluate(system, dataset, args.output,
                 trunc_timelines=False, time_span_extension=0)
def main(args):
    """Run the timeline system named by ``args.method`` over ``args.dataset``."""
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name

    if args.method == 'datewise':
        # Regression models for date ranking, loaded from the pickle given
        # on the command line.
        key_to_model = utils.load_pkl(args.model)
        models = list(key_to_model.values())
        date_ranker = datewise.SupervisedDateRanker(method='regression')
        # Multiple models exist (one per cross-validation fold); we pick an
        # arbitrary one — the first.
        date_ranker.model = models[0]
        sent_collector = datewise.PM_Mean_SentenceCollector(
            clip_sents=2, pub_end=2)
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizers.CentroidOpt(),
            sent_collector=sent_collector,
            key_to_model=key_to_model)
    elif args.method == 'clust':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=2,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    run(system, dataset, args.output)
def main(args):
    """Build the timeline-generation system selected by ``args.method``,
    then evaluate it on ``args.dataset``.

    Methods: 'datewise' (supervised date ranking with a model variant
    chosen via ``args.model``), several clustering variants ('clust',
    'clust_sbertsum', 'clust_sbert', 'clust_sbert_sbertsum',
    'clust_sbert_sbert_old'), and the pure SBERT pipeline ('sbert').
    """
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name

    if args.method == 'datewise':
        resources = Path(args.resources)
        # Select the date-ranking model variant requested via args.model.
        if args.model == 'new_lr':
            method = 'log_regression'
            model_path = resources / 'date_ranker_new_lr.all.pkl'
            key_to_model = utils.load_pkl(model_path)
            model = key_to_model[dataset_name]
        elif is_neural_net(args.model):  # fcn, deep_fcn, cnn, wide_fcn
            model, model_path = model_selector(args.model)
            # The checkpoint stores one state dict per dataset.
            model.load_state_dict(torch.load(model_path)[dataset_name])
            method = 'neural_net'
            key_to_model = None
        else:
            method = 'linear_regression'
            model_path = resources / f'date_ranker_orig.{dataset_name}.pkl'
            key_to_model = utils.load_pkl(model_path)
            model = None
        date_ranker = datewise.SupervisedDateRanker(model, method=method)
        sent_collector = datewise.PM_Mean_SentenceCollector(
            clip_sents=5, pub_end=2)
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizers.CentroidOpt(),
            sent_collector=sent_collector,
            key_to_model=key_to_model,
            method=method)
    elif args.method == 'clust':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(max_days=1),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbertsum':
        sbert_summarizer = summarizers.SBERTSummarizer(
            date_only=False,
            summary_criteria='similarity',
            candidate_sents_per=5,
            compare_with='both')
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(),
            summarizer=sbert_summarizer,
            clip_sents=5,
            unique_dates=True,
            sbert_summarizer=True)
    elif args.method == 'clust_sbert':
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(max_days=365),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbert_sbertsum':
        sbert_summarizer = summarizers.SBERTSummarizer(
            date_only=True,
            summary_criteria='similarity',
            candidate_sents_per=5,
            compare_with='both')
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(max_days=7),
            summarizer=sbert_summarizer,
            sbert_summarizer=True,
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbert_sbert_old':
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(max_days=7),
            summarizer=summarizers.SubmodularSummarizer(),
            summarizer_rep='same',
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'sbert':
        system = clust.SBERTTimelineGenerator(
            model_name='distilroberta-base-paraphrase-v1',
            cd_n_articles=list(range(5, 20, 1)) + list(range(20, 100, 5)),
            cd_thresholds=np.linspace(.25, .95, 25).tolist(),
            cd_init_max_size=500,
            min_comm_mult=1.5,
            cluster_ranking='date_mention',
            candidate_sents_per=5,
            candidate_articles_per=10,
            similarity_num_articles=10,
            summary_criteria='similarity',
            compare_with='both',
            unique_dates=True)
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset is evaluated on truncated timelines with an
    # extended time span; all other datasets use the defaults.
    if dataset_name == 'entities':
        evaluate(system, dataset, args.output,
                 trunc_timelines=True, time_span_extension=7,
                 word_mover_stop_words='nltk')
    else:
        evaluate(system, dataset, args.output,
                 trunc_timelines=False, time_span_extension=0,
                 word_mover_stop_words='nltk')
def load_model_orig(self, model_path, topic):
    """Load the date-ranking model for *topic* from a pickled model dict.

    Args:
        model_path: Path to a pickle mapping topic keys to dicts that
            contain a 'model' entry.
        topic: Key selecting which topic's model to assign to self.model.
    """
    # Fix: dropped a stray debug `print()` left at the end of the method.
    model_dict = utils.load_pkl(model_path)
    self.model = model_dict[topic]['model']
def load_model_lr(self, model_path, dataset_name):
    """Load the date-ranking model keyed by *dataset_name* from a pickle.

    Args:
        model_path: Path to a pickle mapping dataset names to models.
        dataset_name: Key selecting which model to assign to self.model.
    """
    self.model = utils.load_pkl(model_path)[dataset_name]
def main(args):
    """Assemble the timeline system chosen via ``args.method`` and evaluate it.

    'datewise' picks a date-ranking model variant from ``args.model``
    (logistic regression, a neural net, or the original linear regression);
    'clust' uses temporal Markov clustering with centroid summaries.
    """
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name

    if args.method == 'datewise':
        resources = Path(args.resources)
        # Select the date-ranking model variant requested via args.model.
        if args.model == 'new_lr':
            method = 'log_regression'
            model_path = resources / 'date_ranker_new_lr.all.pkl'
            key_to_model = utils.load_pkl(model_path)
            model = key_to_model[dataset_name]
        elif is_neural_net(args.model):  # fcn, deep_fcn, cnn, wide_fcn
            model, model_path = model_selector(args.model)
            # The checkpoint stores one state dict per dataset.
            model.load_state_dict(torch.load(model_path)[dataset_name])
            method = 'neural_net'
            key_to_model = None
        else:
            method = 'linear_regression'
            model_path = resources / f'date_ranker_orig.{dataset_name}.pkl'
            key_to_model = utils.load_pkl(model_path)
            model = None
        date_ranker = datewise.SupervisedDateRanker(model, method=method)
        sent_collector = datewise.PM_Mean_SentenceCollector(
            clip_sents=5, pub_end=2)
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizers.CentroidOpt(),
            sent_collector=sent_collector,
            key_to_model=key_to_model,
            method=method)
    elif args.method == 'clust':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=5,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset is evaluated on truncated timelines with an
    # extended time span; all other datasets use the defaults.
    if dataset_name == 'entities':
        evaluate(system, dataset, args.output,
                 trunc_timelines=True, time_span_extension=7)
    else:
        evaluate(system, dataset, args.output,
                 trunc_timelines=False, time_span_extension=0)