# Standard-library / third-party imports used in this section. Project-internal
# classes (APSummarizer, APSalienceSummarizer, HACSummarizer, the
# *FilteredSummary summarizers, SentenceLatentVectorsResource) are assumed to
# be imported from elsewhere in the package.
import os
import signal
import Queue
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


def filter_worker(job_queue, result_queue, **kwargs):
    """Worker process: drain filtering jobs from job_queue, skipping any job
    whose output TSV already exists or whose required clustering TSV has not
    been produced yet."""
    # Ignore SIGINT so only the parent process handles interrupts.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    import time
    time.sleep(1)  # brief pause, presumably to let the parent finish queueing
    while not job_queue.empty():
        try:
            summarizer, sum_args = job_queue.get(block=False)
            if isinstance(summarizer, APFilteredSummary):
                event, sim_cutoff = sum_args
                ap = APSummarizer()
                ap_tsv_path = ap.get_tsv_path(event)
                apf_tsv_path = summarizer.get_tsv_path(event, sim_cutoff)
                if os.path.exists(ap_tsv_path) and \
                        not os.path.exists(apf_tsv_path):
                    summarizer.make(event, sim_threshold=sim_cutoff)
            elif isinstance(summarizer, HACFilteredSummary):
                event, prefix, feature_set, dist_cutoff, sim_cutoff = sum_args
                hac = HACSummarizer()
                hac_tsv_path = hac.get_tsv_path(event, dist_cutoff)
                hacf_tsv_path = summarizer.get_tsv_path(
                    event, dist_cutoff, sim_cutoff)
                if os.path.exists(hac_tsv_path) and \
                        not os.path.exists(hacf_tsv_path):
                    try:
                        summarizer.make(event, prefix, feature_set,
                                        dist_cutoff=dist_cutoff,
                                        sim_threshold=sim_cutoff)
                    except Exception, e:
                        print e
                        print hac_tsv_path
            elif isinstance(summarizer, APSalienceFilteredSummary) or \
                    isinstance(summarizer, APSalTRankSalThreshFilteredSummary):
                event, prefix, feature_set, sal_cutoff, sim_cutoff = sum_args
                aps = APSalienceSummarizer()
                aps_tsv_path = aps.get_tsv_path(event, prefix, feature_set)
                apsf_tsv_path = summarizer.get_tsv_path(
                    event, prefix, feature_set, sal_cutoff, sim_cutoff)
                if os.path.exists(aps_tsv_path) and \
                        not os.path.exists(apsf_tsv_path):
                    summarizer.make(event, prefix, feature_set,
                                    min_cluster_size=2,
                                    center_threshold=sal_cutoff,
                                    sim_threshold=sim_cutoff)
            elif isinstance(summarizer, RankedSalienceFilteredSummary):
                event, prefix, feature_set, sal_cutoff, sim_cutoff = sum_args
                rsfs_tsv_path = summarizer.get_tsv_path(
                    event, sal_cutoff, sim_cutoff)
                if not os.path.exists(rsfs_tsv_path):
                    summarizer.make(event, prefix, feature_set,
                                    sal_cutoff, sim_cutoff)
        except Queue.Empty:
            # The excerpt showed no handler for the outer try; job_queue.get
            # with block=False raises Queue.Empty when the queue drains, so it
            # is assumed to be swallowed here.
            pass
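
### Illustrative sketch (not part of the original code) ###
# cluster_worker and filter_worker both drain a shared job queue and are meant
# to be launched by the do_work(...) driver referenced below, which is defined
# elsewhere in this codebase and not shown here. The function below is a
# minimal, hypothetical stand-in (its name and the n_procs argument are
# assumptions) showing how such a driver could fan jobs out to worker
# processes with the standard multiprocessing API.
import multiprocessing


def do_work_sketch(worker, jobs, n_procs=4, **kwargs):
    # Fill the job queue before starting the workers.
    job_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    for job in jobs:
        job_queue.put(job)
    # Spawn n_procs workers; each drains the queue until it is empty.
    procs = []
    for _ in xrange(n_procs):
        p = multiprocessing.Process(
            target=worker, args=(job_queue, result_queue), kwargs=kwargs)
        p.start()
        procs.append(p)
    for p in procs:
        p.join()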
def make_summaries(self, eval_data, prefix, feature_set,
                   hac_dist=1.35, hac_sim=.7, ap_sim=.7,
                   apsal_sal=.4, apsal_sim=.7,
                   apsal_tr_sal=.6, apsal_tr_sim=.6,
                   sal_rank_sal=1.8, sal_rank_sim=.4, **kwargs):
    """Run clustering and then redundancy filtering for every event in
    eval_data, using the fixed thresholds given as defaults."""
    ap = APSummarizer()
    apsal = APSalienceSummarizer()
    hac = HACSummarizer()
    print "Running with optimal params on dev data."

    ### Run clustering ###
    print "Generating AP Cluster\n\t(no params)"
    print "Generating AP+Salience Cluster\n\t(no params)"
    print "Generating HAC Cluster\n\tdist-thresh: {}".format(hac_dist)

    jobs = []
    for event, corpus in eval_data:
        print event.fs_name()
        apsal_tsv_dir = apsal.get_tsv_dir(prefix, feature_set)
        if not os.path.exists(apsal_tsv_dir):
            os.makedirs(apsal_tsv_dir)
        if not os.path.exists(ap.dir_):
            os.makedirs(ap.dir_)
        if not os.path.exists(hac.dir_):
            os.makedirs(hac.dir_)
        jobs.append((event, corpus, prefix, feature_set, hac, hac_dist))
        jobs.append((event, corpus, prefix, feature_set, ap))
        jobs.append((event, corpus, prefix, feature_set, apsal))
    self.do_work(cluster_worker, jobs, **kwargs)

    ### Run filtering ###
    print
    print "Generating AP Summary"
    print "\tSim Threshold: {}".format(ap_sim)
    print "Generating AP+Salience Summary"
    print "\tSal Threshold: {}".format(apsal_sal)
    print "\tSim Threshold: {}".format(apsal_sim)
    print "Generating HAC Summary"
    print "\tDist Threshold: {}".format(hac_dist)
    print "\tSim Threshold: {}".format(hac_sim)
    print "Generating AP+Salience Time Ranked"
    print "\tSal Threshold: {}".format(apsal_tr_sal)
    print "\tSim Threshold: {}".format(apsal_tr_sim)
    print "Generating Salience Ranked Summary"
    print "\tSal Threshold: {}".format(sal_rank_sal)
    print "\tSim Threshold: {}".format(sal_rank_sim)

    jobs = []
    for event, corpus in eval_data:
        jobs.append((HACFilteredSummary(),
                     (event, prefix, feature_set, hac_dist, hac_sim)))
        jobs.append((APFilteredSummary(), (event, ap_sim)))
        jobs.append((APSalienceFilteredSummary(),
                     (event, prefix, feature_set, apsal_sal, apsal_sim)))
        jobs.append((APSalTRankSalThreshFilteredSummary(),
                     (event, prefix, feature_set, apsal_tr_sal, apsal_tr_sim)))
        jobs.append((RankedSalienceFilteredSummary(),
                     (event, prefix, feature_set, sal_rank_sal, sal_rank_sim)))
    self.do_work(filter_worker, jobs, **kwargs)
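
### Illustrative usage (hypothetical names) ###
# make_summaries is meant to be called once per evaluation run with the
# thresholds selected by tune() below. The commented snippet is illustrative
# only: the Pipeline class name, load_events(), and the n_procs keyword are
# assumptions, not names defined in this codebase.
#
#   pipeline = Pipeline()
#   eval_data = load_events("eval")   # list of (event, corpus) pairs
#   pipeline.make_summaries(eval_data, prefix="crossval",
#                           feature_set="all", n_procs=8)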
def tune(self, dev_data, prefix, feature_set,
         hac_dist_min=.9, hac_dist_max=5.05, hac_dist_step=.05,
         sal_min=-2.0, sal_max=2.0, sal_step=.1,
         sem_sim_min=.2, sem_sim_max=.7, sem_sim_step=.05,
         rank_sim_min=.2, rank_sim_max=.4, rank_sim_step=.05,
         **kwargs):
    """Grid-search clustering and redundancy-filtering thresholds on
    dev_data, writing one candidate summary per parameter setting."""
    ap = APSummarizer()
    apsal = APSalienceSummarizer()
    hac = HACSummarizer()

    hac_dist_cutoffs = np.arange(
        hac_dist_min, hac_dist_max + hac_dist_step, hac_dist_step)
    sal_cutoffs = np.arange(sal_min, sal_max + sal_step, sal_step)
    sem_sim_cutoffs = np.arange(
        sem_sim_min, sem_sim_max + sem_sim_step, sem_sim_step)
    rank_sim_cutoffs = np.arange(
        rank_sim_min, rank_sim_max + rank_sim_step, rank_sim_step)

    print "Tuning on dev data."

    ### Run clustering ###
    print "Generating AP Cluster\n\t(no params)"
    print "Generating AP+Salience Cluster\n\t(no params)"
    print "Generating HAC Cluster"
    print "\tDist Threshold ({}, {}), step={} {} jobs/event".format(
        hac_dist_min, hac_dist_max, hac_dist_step,
        hac_dist_cutoffs.shape[0])

    jobs = []
    for event, corpus in dev_data:
        print event.fs_name()
        apsal_tsv_dir = apsal.get_tsv_dir(prefix, feature_set)
        if not os.path.exists(apsal_tsv_dir):
            os.makedirs(apsal_tsv_dir)
        if not os.path.exists(ap.dir_):
            os.makedirs(ap.dir_)
        if not os.path.exists(hac.dir_):
            os.makedirs(hac.dir_)
        for cutoff in hac_dist_cutoffs:
            jobs.append((event, corpus, prefix, feature_set, hac, cutoff))
        jobs.append((event, corpus, prefix, feature_set, ap))
        jobs.append((event, corpus, prefix, feature_set, apsal))
    self.do_work(cluster_worker, jobs, **kwargs)

    ### Run filtering ###
    print
    print "Generating AP Summary"
    print "\tSim Threshold ({}, {}), step={}".format(
        sem_sim_min, sem_sim_max, sem_sim_step)
    print "\t{} jobs/event".format(sem_sim_cutoffs.shape[0])
    print "Generating AP+Salience Summary"
    print "\tSal Threshold ({}, {}), step={}".format(
        sal_min, sal_max, sal_step)
    print "\tSim Threshold ({}, {}), step={}".format(
        sem_sim_min, sem_sim_max, sem_sim_step)
    print "\t{} jobs/event".format(
        sal_cutoffs.shape[0] * sem_sim_cutoffs.shape[0])
    print "Generating HAC Summary"
    print "\tDist Threshold ({}, {}), step={}".format(
        hac_dist_min, hac_dist_max, hac_dist_step)
    print "\tSim Threshold ({}, {}), step={}".format(
        sem_sim_min, sem_sim_max, sem_sim_step)
    print "\t{} jobs/event".format(
        hac_dist_cutoffs.shape[0] * sem_sim_cutoffs.shape[0])

    rsfs = RankedSalienceFilteredSummary()
    if not os.path.exists(rsfs.dir_):
        os.makedirs(rsfs.dir_)

    jobs = []
    for event, corpus in dev_data:
        for sem_sim_cutoff in sem_sim_cutoffs:
            for dist_cutoff in hac_dist_cutoffs:
                jobs.append((HACFilteredSummary(),
                             (event, prefix, feature_set,
                              dist_cutoff, sem_sim_cutoff)))
            jobs.append((APFilteredSummary(), (event, sem_sim_cutoff)))
            for sal_cutoff in sal_cutoffs:
                jobs.append((APSalienceFilteredSummary(),
                             (event, prefix, feature_set,
                              sal_cutoff, sem_sim_cutoff)))
                jobs.append((APSalTRankSalThreshFilteredSummary(),
                             (event, prefix, feature_set,
                              sal_cutoff, sem_sim_cutoff)))
        for rank_sim_cutoff in rank_sim_cutoffs:
            for sal_cutoff in sal_cutoffs:
                jobs.append((RankedSalienceFilteredSummary(),
                             (event, prefix, feature_set,
                              sal_cutoff, rank_sim_cutoff)))
    self.do_work(filter_worker, jobs, **kwargs)
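
### Illustrative sketch (standalone, not from the original code) ###
# The tuning sweep above is a plain grid search: per event there is one HAC
# filter job per (dist, sim) pair, one AP filter job per sim value, one job
# per (sal, sim) pair for each of the two AP+salience variants, and one job
# per (sal, rank-sim) pair for the ranked-salience summarizer. The snippet
# below just reproduces that arithmetic with the default ranges so the size
# of the sweep is explicit; it only assumes the loop structure as written
# above and uses numpy.
import numpy as np

hac_dist = np.arange(.9, 5.05 + .05, .05)
sal = np.arange(-2.0, 2.0 + .1, .1)
sem_sim = np.arange(.2, .7 + .05, .05)
rank_sim = np.arange(.2, .4 + .05, .05)

jobs_per_event = (
    hac_dist.shape[0] * sem_sim.shape[0]    # HACFilteredSummary
    + sem_sim.shape[0]                      # APFilteredSummary
    + 2 * sal.shape[0] * sem_sim.shape[0]   # two AP+salience variants
    + sal.shape[0] * rank_sim.shape[0])     # RankedSalienceFilteredSummary
print "filter jobs per event:", jobs_per_event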
def make(self, event, prefix, feature_set, min_cluster_size=2,
         sim_threshold=.2264, dist_cutoff=1.35):
    """Write the HAC-filtered summary TSV for an event: walk the HAC cluster
    exemplars hour by hour, drop clusters smaller than min_cluster_size, and
    emit a sentence only if its maximum cosine similarity to previously
    emitted sentences is below sim_threshold."""
    tsv_path = self.get_tsv_path(event, dist_cutoff, sim_threshold)
    lvecs = SentenceLatentVectorsResource()
    #spa = SaliencePredictionAggregator()
    hac = HACSummarizer()
    cluster_df = hac.get_dataframe(event, dist_cutoff)
    #for _, row in cluster_df.iterrows():
    #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])
    updates = []
    Xcache = None  # latent vectors of the sentences emitted so far
    timestamps = sorted(list(cluster_df['timestamp'].unique()))
    for timestamp in timestamps:
        hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
        lvec_df = lvecs.get_dataframe(event, hour)
        lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)
        #sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
        #sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

        clusters = cluster_df[cluster_df['timestamp'] == timestamp].copy()
        clusters.sort(['stream id', 'sentence id'], inplace=True)

        # Disabled salience re-ranking of the hour's clusters:
        #salience = []
        #for _, row in clusters.iterrows():
        #    sal_pred = sal_df.loc[
        #        (sal_df['stream id'] == row['stream id']) & \
        #        (sal_df['sentence id'] == row['sentence id'])
        #    ].as_matrix()[:,2:].astype(np.float64).mean()
        #    salience.append(sal_pred)
        #clusters['salience'] = salience
        #clusters.sort(['salience'], ascending=False, inplace=True)
        #sal_mean = np.mean(salience)
        #sal_std = np.std(salience)

        for _, row in clusters.iterrows():
            if row['cluster size'] < min_cluster_size:
                continue
            vec = lvec_df.loc[
                (lvec_df['stream id'] == row['stream id']) & \
                (lvec_df['sentence id'] == row['sentence id'])
            ].as_matrix()[:, 2:].astype(np.float64)
            #sal_norm = (row['salience'] - sal_mean) / sal_std
            #if sal_norm < center_threshold:
            #    continue
            if Xcache is None:
                Xcache = vec
            else:
                if np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
                    # Too similar to an already emitted sentence; skip it.
                    continue
                else:
                    Xcache = np.vstack((Xcache, vec))
            updates.append({
                'query id': event.query_id[5:],
                'system id': 'cunlp',
                'run id': 'hac-dist_{}-sim_{}'.format(
                    dist_cutoff, sim_threshold),
                'stream id': row['stream id'],
                'sentence id': row['sentence id'],
                'timestamp': timestamp,
                'conf': 1.0,
                'string': row['string']
            })
    df = pd.DataFrame(updates,
                      columns=["query id", "system id", "run id",
                               "stream id", "sentence id", "timestamp",
                               "conf", "string"])
    with open(tsv_path, u'w') as f:
        df.to_csv(f, sep='\t', index=False, index_label=False, header=False)
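
### Illustrative sketch (standalone, not from the original code) ###
# The core of make() above is a greedy novelty filter: a sentence's latent
# vector is emitted only if its maximum cosine similarity to every previously
# emitted vector stays below sim_threshold (and the first candidate is always
# kept). The function below isolates that rule with plain numpy/sklearn; the
# function name and inputs are illustrative, not the project's API.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def greedy_novelty_filter(vectors, sim_threshold=.2264):
    """Return the indices of rows of `vectors` (an (n, d) array) that survive
    the greedy max-cosine rule used in make() above."""
    kept = []
    Xcache = None
    for i, vec in enumerate(vectors):
        vec = vec.reshape(1, -1)
        if Xcache is None:
            Xcache = vec
        elif np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
            continue  # too similar to an already selected sentence
        else:
            Xcache = np.vstack((Xcache, vec))
        kept.append(i)
    return kept

# Example: with a near-duplicate pair of rows, only one of them survives.
X = np.array([[1., 0.], [0.99, 0.01], [0., 1.]])
print greedy_novelty_filter(X, sim_threshold=.9)   # -> [0, 2]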