def make_apsaltr_summaries(
        apsal_sal_cutoff, apsal_sim_cutoff, model_summaries,
        max_samples=1000):
    """Write a ROUGE config file for dev-set APSal+TRank summaries.

    NOTE(review): this name is redefined later in this file; the later
    definition shadows this one at import time -- confirm which variant
    is meant to be live.

    NOTE(review): the lines that sample and write summary files are
    commented out below, so the config written here references sample
    paths this function never creates.
    """
    # All inputs/outputs live under $TREC_DATA/rouge (cwd if unset).
    data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    config_path = os.path.join(
        data_dir, "apsaltr_sal_{}_sim_{}_config".format(
            apsal_sal_cutoff, apsal_sim_cutoff))
    config_paths = []
    for job in jobs.event_cross_validation_jobs("crossval"):
        # Dev events only -- the eval-set variant appears later in the file.
        for event, corpus in job.dev_events():
            model_summary = model_summaries[event.fs_name()]
            model_path = os.path.join(
                data_dir, "model_{}".format(event.fs_name()))
            # Presumably the length budget for sampled summaries -- unused
            # while sampling is disabled.
            max_len = len(model_summary)
            df = APSalTRankSalThreshFilteredSummary().get_dataframe(
                event, job.key, job.feature_set,
                apsal_sal_cutoff, apsal_sim_cutoff)
            # Decoded update texts -- also unused while sampling is disabled.
            updates = [update.decode('utf-8')
                       for update in df['text'].tolist()]
            for n_sample in xrange(max_samples):
                #summary_text = random_summary(updates, max_len)
                sum_path = os.path.join(
                    data_dir, "apsaltr_sal_{}_sim_{}_sample_{}_{}".format(
                        apsal_sal_cutoff, apsal_sim_cutoff, n_sample,
                        event.fs_name()))
                #with open(sum_path, 'w') as f:
                #    f.write(summary_text.encode('utf-8'))
                # Each config line pairs a sample path with its model path.
                config_paths.append('{} {}'.format(sum_path, model_path))
    with open(config_path, 'w') as f:
        f.write('\n'.join(config_paths))
    return config_path
def make_ap_summaries(ap_sim_cutoff, model_summaries, max_samples=1000):
    """Write a ROUGE config file for AP-filtered dev-set summaries.

    For every dev event, emits one config line per sample index pairing
    the (expected) sample path with the event's model-summary path, and
    writes all lines to a single config file whose path is returned.

    Note: the sample files themselves are not written here -- only the
    config lines that reference them.
    """
    rouge_data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    config_path = os.path.join(
        rouge_data_dir, "ap_sim_{}_config".format(ap_sim_cutoff))
    config_lines = []
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, corpus in job.dev_events():
            event_name = event.fs_name()
            gold_summary = model_summaries[event_name]
            # Length budget (unused here; kept for parity with the
            # sampling variants of this function).
            max_len = len(gold_summary)
            model_path = os.path.join(
                rouge_data_dir, "model_{}".format(event_name))
            df = APFilteredSummary().get_dataframe(event, ap_sim_cutoff)
            # Decoded update texts (unused here, as above).
            updates = [text.decode('utf-8') for text in df['text'].tolist()]
            for sample_idx in xrange(max_samples):
                sample_path = os.path.join(
                    rouge_data_dir,
                    "ap_sim_{}_sample_{}_{}".format(
                        ap_sim_cutoff, sample_idx, event_name))
                config_lines.append('{} {}'.format(sample_path, model_path))
    with open(config_path, 'w') as f:
        f.write('\n'.join(config_lines))
    return config_path
def make_apsaltr_summaries(apsal_sal_cutoff, apsal_sim_cutoff, model_summaries, max_samples=1000): data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge") rouge_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge", "apsaltr") config_path = os.path.join( data_dir, "apsaltr_sal_{}_sim_{}_config".format(apsal_sal_cutoff, apsal_sim_cutoff)) config_paths = defaultdict(list) for job in jobs.event_cross_validation_jobs("crossval"): for event, corpus in job.eval_events(): model_summary = model_summaries[event.fs_name()] model_path = os.path.join(rouge_dir, "model_{}".format(event.fs_name())) max_len = len(model_summary) event_hours = event.list_event_hours() df = APSalTRankSalThreshFilteredSummary().get_dataframe( event, job.key, job.feature_set, apsal_sal_cutoff, apsal_sim_cutoff) n_hours = len(event_hours) for t, h in enumerate(xrange(12, n_hours, 12), 1): print "\t", t, h timestamp = int((event_hours[h] - \ datetime(1970,1,1)).total_seconds()) df_t = df[df['timestamp'] < timestamp] updates = [ update.decode('utf-8') for update in df_t['text'].tolist() ] for n_sample in xrange(max_samples): summary_text = random_summary(updates, max_len) sum_path = os.path.join( data_dir, "apsaltr_sal_{}_sim_{}_sample_{}_t{}_{}".format( apsal_sal_cutoff, apsal_sim_cutoff, n_sample, t, event.fs_name())) with open(sum_path, 'w') as f: f.write(summary_text.encode('utf-8')) config_paths[t].append('{} {}'.format( sum_path, model_path)) all_config_paths = [] for t in sorted(config_paths.keys()): config_path_t = config_path + "_t{}".format(t) print config_path_t with open(config_path_t, 'w') as f: f.write('\n'.join(config_paths[t])) all_config_paths.append(config_path_t) return all_config_paths
def make_apsaltr_summaries(
        apsal_sal_cutoff, apsal_sim_cutoff, model_summaries,
        max_samples=1000):
    """Sample random summaries per 12-hour checkpoint of each eval event
    and write one ROUGE config file per checkpoint index.

    NOTE(review): this is the last of several redefinitions of
    ``make_apsaltr_summaries`` in this file and is the one that wins at
    import time; the earlier near-identical copies appear to be stale.

    Returns the list of per-checkpoint config file paths.
    """
    data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    rouge_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge", "apsaltr")
    config_path = os.path.join(
        data_dir, "apsaltr_sal_{}_sim_{}_config".format(
            apsal_sal_cutoff, apsal_sim_cutoff))
    # Config lines grouped by checkpoint index t.
    config_paths = defaultdict(list)
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, corpus in job.eval_events():
            model_summary = model_summaries[event.fs_name()]
            model_path = os.path.join(
                rouge_dir, "model_{}".format(event.fs_name()))
            # Length budget handed to random_summary (semantics defined
            # elsewhere -- assumed to cap the sample size).
            max_len = len(model_summary)
            event_hours = event.list_event_hours()
            df = APSalTRankSalThreshFilteredSummary().get_dataframe(
                event, job.key, job.feature_set,
                apsal_sal_cutoff, apsal_sim_cutoff)
            n_hours = len(event_hours)
            # Checkpoints every 12 hours: t = 1, 2, ... for h = 12, 24, ...
            for t, h in enumerate(xrange(12, n_hours, 12), 1):
                print "\t",t, h
                # Unix timestamp of the checkpoint hour.
                timestamp = int((event_hours[h] - \
                    datetime(1970,1,1)).total_seconds())
                # Only updates strictly before the checkpoint are visible.
                df_t = df[df['timestamp'] < timestamp]
                updates = [update.decode('utf-8')
                           for update in df_t['text'].tolist()]
                for n_sample in xrange(max_samples):
                    summary_text = random_summary(updates, max_len)
                    sum_path = os.path.join(
                        data_dir,
                        "apsaltr_sal_{}_sim_{}_sample_{}_t{}_{}".format(
                            apsal_sal_cutoff, apsal_sim_cutoff, n_sample,
                            t, event.fs_name()))
                    with open(sum_path, 'w') as f:
                        f.write(summary_text.encode('utf-8'))
                    config_paths[t].append(
                        '{} {}'.format(sum_path, model_path))
    all_config_paths = []
    for t in sorted(config_paths.keys()):
        config_path_t = config_path + "_t{}".format(t)
        print config_path_t
        with open(config_path_t, 'w') as f:
            f.write('\n'.join(config_paths[t]))
        all_config_paths.append(config_path_t)
    return all_config_paths
def run_summarizer_jobs(feature_ablation=False, cross_fold=False, **kwargs): import cuttsum.pipeline.jobs as jobs from cuttsum.pipeline.salience import SalienceModels if feature_ablation: for jnum, job in enumerate( jobs.feature_ablation_jobs(u'feature-ablation')): print job job.start(**kwargs) if cross_fold: for job in jobs.event_cross_validation_jobs("crossval"): print job job.start(**kwargs)
def make_rank_summaries(rank_sal_cutoff, rank_sim_cutoff, model_summaries, max_samples=1000): rouge_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge") data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge", "rank") if not os.path.exists(data_dir): os.makedirs(data_dir) config_path = os.path.join( data_dir, "rank_sal_{}_sim_{}_config".format(rank_sal_cutoff, rank_sim_cutoff)) config_paths = [] for job in jobs.event_cross_validation_jobs("crossval"): for event, corpus in job.eval_events(): model_summary = model_summaries[event.fs_name()] model_path = os.path.join(rouge_dir, "model_{}".format(event.fs_name())) max_len = len(model_summary) df = RankedSalienceFilteredSummary().get_dataframe( event, rank_sal_cutoff, rank_sim_cutoff) print event.fs_name() if df is None: print "?" import sys sys.exit() updates = [ update.decode('utf-8') for update in df['text'].tolist() ] for n_sample in xrange(max_samples): summary_text = random_summary(updates, max_len) sum_path = os.path.join( data_dir, "rank_sal_{}_sim_{}_sample_{}_{}".format( rank_sal_cutoff, rank_sim_cutoff, n_sample, event.fs_name())) with open(sum_path, 'w') as f: f.write(summary_text.encode('utf-8')) config_paths.append('{} {}'.format(sum_path, model_path)) with open(config_path, 'w') as f: f.write('\n'.join(config_paths)) return config_path
def make_rank_summaries(
        rank_sal_cutoff, rank_sim_cutoff, model_summaries,
        max_samples=1000):
    """Sample random summaries from ranked-salience filtered updates and
    write a ROUGE config file pairing each sample with its model summary.

    NOTE(review): duplicate of the ``make_rank_summaries`` defined just
    above; this later definition shadows the earlier one at import time.

    Returns the path of the written config file.
    """
    rouge_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge", "rank")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    config_path = os.path.join(
        data_dir, "rank_sal_{}_sim_{}_config".format(
            rank_sal_cutoff, rank_sim_cutoff))
    config_paths = []
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, corpus in job.eval_events():
            model_summary = model_summaries[event.fs_name()]
            model_path = os.path.join(
                rouge_dir, "model_{}".format(event.fs_name()))
            # Length budget passed to random_summary (defined elsewhere).
            max_len = len(model_summary)
            df = RankedSalienceFilteredSummary().get_dataframe(
                event, rank_sal_cutoff, rank_sim_cutoff)
            print event.fs_name()
            if df is None:
                # Debug bail-out. NOTE(review): a bare sys.exit() exits
                # with status 0, signalling success on this error path --
                # likely should be a non-zero status.
                print "?"
                import sys
                sys.exit()
            updates = [update.decode('utf-8')
                       for update in df['text'].tolist()]
            for n_sample in xrange(max_samples):
                summary_text = random_summary(updates, max_len)
                sum_path = os.path.join(
                    data_dir, "rank_sal_{}_sim_{}_sample_{}_{}".format(
                        rank_sal_cutoff, rank_sim_cutoff, n_sample,
                        event.fs_name()))
                with open(sum_path, 'w') as f:
                    f.write(summary_text.encode('utf-8'))
                config_paths.append('{} {}'.format(sum_path, model_path))
    with open(config_path, 'w') as f:
        f.write('\n'.join(config_paths))
    return config_path
shell=True) with open(rpath, 'w') as f: f.write(o) recall = float(re.search('X ROUGE-2 Average_R: ([^ ]+)', o).group(1)) prec = float(re.search('X ROUGE-2 Average_P: ([^ ]+)', o).group(1)) f1 = float(re.search('X ROUGE-2 Average_F: ([^ ]+)', o).group(1)) return config_path, recall, prec, f1 model_summaries = {} nuggets = pd.concat((cj.get_2013_nuggets(), cj.get_2014_nuggets())) data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge") if not os.path.exists(data_dir): os.makedirs(data_dir) for job in jobs.event_cross_validation_jobs("crossval"): for event, corpus in job.eval_events(): nugget_text = '\n'.join( nuggets[nuggets['query id'] == event.query_id]['text'].tolist()) model_summaries[event.fs_name()] = nugget_text.decode('utf-8') model_path = os.path.join(data_dir, "model_{}".format(event.fs_name())) with open(model_path, 'w') as f: f.write(nugget_text) from cuttsum.misc import ProgressBar apsal_sal_cutoffs = [.6] apsal_sim_cutoffs = [.65] apsaltr_sal_cutoffs = [.6] apsaltr_sim_cutoffs = [.7]
shell=True) recall = float(re.search('X ROUGE-2 Average_R: ([^ ]+)', o).group(1)) prec = float(re.search('X ROUGE-2 Average_P: ([^ ]+)', o).group(1)) f1 = float(re.search('X ROUGE-2 Average_F: ([^ ]+)', o).group(1)) return config_path, recall, prec, f1 hac_cutoffs, ap_cutoffs, apsal_cutoffs, rank_cutoffs = generate_dev_sweep() model_summaries = {} nuggets = pd.concat((cj.get_2013_nuggets(), cj.get_2014_nuggets())) data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge") if not os.path.exists(data_dir): os.makedirs(data_dir) for job in jobs.event_cross_validation_jobs("crossval"): for event, corpus in job.dev_events(): nugget_text = '\n'.join( nuggets[nuggets['query id'] == event.query_id]['text'].tolist()) model_summaries[event.fs_name()] = nugget_text.decode('utf-8') model_path = os.path.join( data_dir, "model_{}".format(event.fs_name())) with open(model_path, 'w') as f: f.write(nugget_text) from cuttsum.misc import ProgressBar rank_sal_cutoffs, rank_sim_cutoffs = rank_cutoffs apsaltr_sal_cutoffs, apsaltr_sim_cutoffs = apsal_cutoffs apsal_sal_cutoffs, apsal_sim_cutoffs = apsal_cutoffs ap_sim_cutoffs = ap_cutoffs hac_dist_cutoffs, hac_sim_cutoffs = hac_cutoffs