def main(args):
    """Build the requested timeline system ('datewise' or 'clust') and
    evaluate it on the dataset at args.dataset, writing to args.output."""
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name

    if args.method == 'datewise':
        resources = Path(args.resources)
        models_path = resources / 'supervised_date_ranker.{}.pkl'.format(
            dataset_name)
        # Pickled per-dataset regression models for date ranking.
        key_to_model = utils.load_pkl(models_path)
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=datewise.SupervisedDateRanker(method='regression'),
            summarizer=summarizers.CentroidOpt(),
            sent_collector=datewise.PM_Mean_SentenceCollector(
                clip_sents=5, pub_end=2),
            key_to_model=key_to_model)
    elif args.method == 'clust':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=5,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset is evaluated with truncated timelines and a
    # 7-day time-span extension; all other datasets use neither.
    is_entities = dataset_name == 'entities'
    evaluate(system, dataset, args.output,
             trunc_timelines=is_entities,
             time_span_extension=7 if is_entities else 0)
def main(args):
    """Build the requested timeline system ('datewise' or 'clust') and
    run it on the dataset at args.dataset, writing to args.output."""
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name  # kept for parity with sibling scripts

    if args.method == 'datewise':
        # Pickled regression models for date ranking, keyed per
        # cross-validation fold.
        key_to_model = utils.load_pkl(args.model)
        date_ranker = datewise.SupervisedDateRanker(method='regression')
        # Several models exist (one per fold); any will do, so take the
        # first one.
        date_ranker.model = list(key_to_model.values())[0]
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=date_ranker,
            summarizer=summarizers.CentroidOpt(),
            sent_collector=datewise.PM_Mean_SentenceCollector(
                clip_sents=2, pub_end=2),
            key_to_model=key_to_model,
        )
    elif args.method == 'clust':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=2,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    run(system, dataset, args.output)
def __init__(self, date_ranker=None, summarizer=None, sent_collector=None,
             clip_sents=5, pub_end=2, key_to_model=None):
    """Wire up the datewise timeline pipeline.

    Components left unset fall back to defaults: MentionCountDateRanker,
    PM_Mean_SentenceCollector(clip_sents, pub_end) and CentroidOpt.
    clip_sents/pub_end only matter when sent_collector is not supplied.
    key_to_model optionally maps dataset keys to pre-trained date-ranking
    models.
    """
    # `or` (not an `is None` check) so any falsy component is replaced,
    # matching the established contract of this constructor.
    self.date_ranker = date_ranker or MentionCountDateRanker()
    self.sent_collector = (sent_collector
                           or PM_Mean_SentenceCollector(clip_sents, pub_end))
    self.summarizer = summarizer or summarizers.CentroidOpt()
    self.key_to_model = key_to_model
def __init__(self, date_ranker=None, summarizer=None, sent_collector=None,
             clip_sents=5, pub_end=2, key_to_model=None,
             plug_page=False, plug_taxo=False):
    """Wire up the datewise timeline pipeline with optional plug signals.

    plug_page is forwarded to the default CentroidOpt summarizer;
    plug_taxo is only stored here. Components left unset fall back to
    defaults (MentionCountDateRanker, PM_Mean_SentenceCollector built from
    clip_sents/pub_end, CentroidOpt). key_to_model optionally maps dataset
    keys to pre-trained date-ranking models.
    """
    self.plug_page = plug_page
    self.plug_taxo = plug_taxo
    # `or` so any falsy component is replaced by its default.
    self.date_ranker = date_ranker or MentionCountDateRanker()
    self.sent_collector = (sent_collector
                           or PM_Mean_SentenceCollector(clip_sents, pub_end))
    self.summarizer = (summarizer
                       or summarizers.CentroidOpt(plug=self.plug_page))
    self.key_to_model = key_to_model
def main(args):
    """Construct the timeline system named by args.method and evaluate it
    on the dataset at args.dataset, writing results to args.output."""
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name

    if args.method == 'datewise':
        resources = Path(args.resources)
        # Pick the date-ranking model according to args.model.
        if args.model == 'new_lr':
            method = 'log_regression'
            model_path = resources / 'date_ranker_new_lr.all.pkl'
            key_to_model = utils.load_pkl(model_path)
            model = key_to_model[dataset_name]
        elif is_neural_net(args.model):  # fcn, deep_fcn, cnn, wide_fcn
            model, model_path = model_selector(args.model)
            # Checkpoint holds one state dict per dataset.
            model.load_state_dict(torch.load(model_path)[dataset_name])
            method = 'neural_net'
            key_to_model = None
        else:
            method = 'linear_regression'
            model_path = resources / 'date_ranker_orig.{}.pkl'.format(
                dataset_name)
            key_to_model = utils.load_pkl(model_path)
            model = None
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=datewise.SupervisedDateRanker(model, method=method),
            summarizer=summarizers.CentroidOpt(),
            sent_collector=datewise.PM_Mean_SentenceCollector(
                clip_sents=5, pub_end=2),
            key_to_model=key_to_model,
            method=method)
    elif args.method == 'clust':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(max_days=1),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbertsum':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(),
            summarizer=summarizers.SBERTSummarizer(
                date_only=False,
                summary_criteria='similarity',
                candidate_sents_per=5,
                compare_with='both'),
            clip_sents=5,
            unique_dates=True,
            sbert_summarizer=True)
    elif args.method == 'clust_sbert':
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(max_days=365),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbert_sbertsum':
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(max_days=7),
            summarizer=summarizers.SBERTSummarizer(
                date_only=True,
                summary_criteria='similarity',
                candidate_sents_per=5,
                compare_with='both'),
            sbert_summarizer=True,
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'clust_sbert_sbert_old':
        system = clust.ClusteringTimelineGenerator(
            clustering_rep='distilroberta-base-paraphrase-v1',
            sbert_sequence_len=512,
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(max_days=7),
            summarizer=summarizers.SubmodularSummarizer(),
            summarizer_rep='same',
            clip_sents=5,
            unique_dates=True,
        )
    elif args.method == 'sbert':
        system = clust.SBERTTimelineGenerator(
            model_name='distilroberta-base-paraphrase-v1',
            cd_n_articles=list(range(5, 20, 1)) + list(range(20, 100, 5)),
            cd_thresholds=np.linspace(.25, .95, 25).tolist(),
            cd_init_max_size=500,
            min_comm_mult=1.5,
            cluster_ranking='date_mention',
            candidate_sents_per=5,
            candidate_articles_per=10,
            similarity_num_articles=10,
            summary_criteria='similarity',
            compare_with='both',
            unique_dates=True)
    # elif args.method == 'network':
    #     summarizer = summarizers.CentroidOpt()
    #     system = tr_network.NetworkTimelineGenerator()
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset is evaluated with truncated timelines and a
    # 7-day time-span extension; all other datasets use neither.
    is_entities = dataset_name == 'entities'
    evaluate(system, dataset, args.output,
             trunc_timelines=is_entities,
             time_span_extension=7 if is_entities else 0,
             word_mover_stop_words='nltk')
def main(args):
    """Construct the timeline system named by args.method ('datewise' or
    'clust') and evaluate it on the dataset at args.dataset."""
    dataset_path = Path(args.dataset)
    if not dataset_path.exists():
        raise FileNotFoundError(f'Dataset not found: {args.dataset}')
    dataset = data.Dataset(dataset_path)
    dataset_name = dataset_path.name

    if args.method == 'datewise':
        resources = Path(args.resources)
        # Pick the date-ranking model according to args.model.
        if args.model == 'new_lr':
            method = 'log_regression'
            model_path = resources / 'date_ranker_new_lr.all.pkl'
            key_to_model = utils.load_pkl(model_path)
            model = key_to_model[dataset_name]
        elif is_neural_net(args.model):  # fcn, deep_fcn, cnn, wide_fcn
            model, model_path = model_selector(args.model)
            # Checkpoint holds one state dict per dataset.
            model.load_state_dict(torch.load(model_path)[dataset_name])
            method = 'neural_net'
            key_to_model = None
        else:
            method = 'linear_regression'
            model_path = resources / 'date_ranker_orig.{}.pkl'.format(
                dataset_name)
            key_to_model = utils.load_pkl(model_path)
            model = None
        system = datewise.DatewiseTimelineGenerator(
            date_ranker=datewise.SupervisedDateRanker(model, method=method),
            summarizer=summarizers.CentroidOpt(),
            sent_collector=datewise.PM_Mean_SentenceCollector(
                clip_sents=5, pub_end=2),
            key_to_model=key_to_model,
            method=method)
    elif args.method == 'clust':
        system = clust.ClusteringTimelineGenerator(
            cluster_ranker=clust.ClusterDateMentionCountRanker(),
            clusterer=clust.TemporalMarkovClusterer(),
            summarizer=summarizers.CentroidOpt(),
            clip_sents=5,
            unique_dates=True,
        )
    else:
        raise ValueError(f'Method not found: {args.method}')

    # The 'entities' dataset is evaluated with truncated timelines and a
    # 7-day time-span extension; all other datasets use neither.
    is_entities = dataset_name == 'entities'
    evaluate(system, dataset, args.output,
             trunc_timelines=is_entities,
             time_span_extension=7 if is_entities else 0)