Example #1
File: jobs.py Project: kedz/cuttsum

import os
import signal
import Queue  # Python 2 stdlib; renamed to "queue" in Python 3
# APFilteredSummary, HACFilteredSummary, APSummarizer, APSalienceSummarizer,
# and the other summarizer classes are imported from the cuttsum project.


def filter_worker(job_queue, result_queue, **kwargs):
    # Ignore SIGINT in workers so Ctrl-C is handled by the parent process.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    import time
    time.sleep(1)
    # Drain the shared job queue until it is empty.
    while not job_queue.empty():
        try:
            summarizer, sum_args = job_queue.get(block=False)

            if isinstance(summarizer, APFilteredSummary):
                event, sim_cutoff = sum_args
                ap = APSummarizer()
                ap_tsv_path = ap.get_tsv_path(event)
                apf_tsv_path = summarizer.get_tsv_path(event, sim_cutoff)
                if os.path.exists(ap_tsv_path) and \
                    not os.path.exists(apf_tsv_path):
                    summarizer.make(event, sim_threshold=sim_cutoff)

            elif isinstance(summarizer, HACFilteredSummary):
                event, prefix, feature_set, dist_cutoff, sim_cutoff = sum_args
                hac = HACSummarizer()
                hac_tsv_path = hac.get_tsv_path(event, dist_cutoff)
                hacf_tsv_path = summarizer.get_tsv_path(
                    event, dist_cutoff, sim_cutoff)
                if os.path.exists(hac_tsv_path) and \
                    not os.path.exists(hacf_tsv_path):

                    try:
                        summarizer.make(event,
                                        prefix,
                                        feature_set,
                                        dist_cutoff=dist_cutoff,
                                        sim_threshold=sim_cutoff)
                    except Exception, e:
                        print e
                        print hac_tsv_path

            elif isinstance(summarizer, APSalienceFilteredSummary) or \
                isinstance(summarizer, APSalTRankSalThreshFilteredSummary):
                event, prefix, feature_set, sal_cutoff, sim_cutoff = sum_args
                aps = APSalienceSummarizer()
                aps_tsv_path = aps.get_tsv_path(event, prefix, feature_set)
                apsf_tsv_path = summarizer.get_tsv_path(
                    event, prefix, feature_set, sal_cutoff, sim_cutoff)
                if os.path.exists(aps_tsv_path) and \
                    not os.path.exists(apsf_tsv_path):
                    summarizer.make(event,
                                    prefix,
                                    feature_set,
                                    min_cluster_size=2,
                                    center_threshold=sal_cutoff,
                                    sim_threshold=sim_cutoff)
            elif isinstance(summarizer, RankedSalienceFilteredSummary):
                event, prefix, feature_set, sal_cutoff, sim_cutoff = sum_args
                rsfs_tsv_path = summarizer.get_tsv_path(
                    event, sal_cutoff, sim_cutoff)
                if not os.path.exists(rsfs_tsv_path):
                    summarizer.make(event, prefix, feature_set, sal_cutoff,
                                    sim_cutoff)
        # The snippet is truncated here; the enclosing try is assumed to end
        # with the usual empty-queue guard for this polling pattern:
        except Queue.Empty:
            pass
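filter_worker is written to be driven by a pool of processes that drain a shared job queue (the self.do_work calls in the later examples). Below is a minimal, self-contained sketch of that driving pattern; the do_work name, the toy worker, and the toy jobs are stand-ins for illustration, not the project's implementation.

import multiprocessing
import signal
try:
    import queue           # Python 3
except ImportError:
    import Queue as queue  # Python 2


def toy_worker(job_queue, result_queue, **kwargs):
    # Same skeleton as filter_worker: ignore SIGINT so Ctrl-C is handled by
    # the parent, then drain the shared queue until it is empty.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    while not job_queue.empty():
        try:
            job = job_queue.get(block=False)
            result_queue.put(job * job)
        except queue.Empty:
            pass


def do_work(worker, jobs, n_procs=4, **kwargs):
    # Manager queues can be shared with child processes and read back by the
    # parent after the children exit.
    manager = multiprocessing.Manager()
    job_queue = manager.Queue()
    result_queue = manager.Queue()
    for job in jobs:
        job_queue.put(job)

    procs = [multiprocessing.Process(target=worker,
                                     args=(job_queue, result_queue),
                                     kwargs=kwargs)
             for _ in range(n_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

    results = []
    while not result_queue.empty():
        results.append(result_queue.get())
    return results


if __name__ == "__main__":
    print(do_work(toy_worker, range(10)))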
Example #2
    def make_summaries(self,
                       eval_data,
                       prefix,
                       feature_set,
                       hac_dist=1.35,
                       hac_sim=.7,
                       ap_sim=.7,
                       apsal_sal=.4,
                       apsal_sim=.7,
                       apsal_tr_sal=.6,
                       apsal_tr_sim=.6,
                       sal_rank_sal=1.8,
                       sal_rank_sim=.4,
                       **kwargs):

        ap = APSummarizer()
        apsal = APSalienceSummarizer()
        hac = HACSummarizer()

        print "Running with optimal params on dev data."

        ### Run clustering ###
        print "Generating AP Cluster\n\t(no params)"
        print "Generating AP+Salience Cluster\n\t(no params)"
        print "Generating HAC Cluster\n\tdist-thresh: {}".format(hac_dist)

        jobs = []
        for event, corpus in eval_data:
            print event.fs_name()
            apsal_tsv_dir = apsal.get_tsv_dir(prefix, feature_set)
            if not os.path.exists(apsal_tsv_dir):
                os.makedirs(apsal_tsv_dir)
            if not os.path.exists(ap.dir_):
                os.makedirs(ap.dir_)
            if not os.path.exists(hac.dir_):
                os.makedirs(hac.dir_)

            jobs.append((event, corpus, prefix, feature_set, hac, hac_dist))
            jobs.append((event, corpus, prefix, feature_set, ap))
            jobs.append((event, corpus, prefix, feature_set, apsal))

        self.do_work(cluster_worker, jobs, **kwargs)

        ### Run filtering ###
        print
        print "Generating AP Summary"
        print "\tSim Threshold: {}".format(ap_sim)
        print "Generating AP+Salience Summary"
        print "\tSal Threshold: {}".format(apsal_sal)
        print "\tSim Threshold: {}".format(apsal_sim)
        print "Generating HAC Summary"
        print "\tDist Threshold: {}".format(hac_dist)
        print "\tSim Threshold: {}".format(hac_sim)
        print "Generating AP+Salience Time Ranked"
        print "\tSal Threshold: {}".format(apsal_tr_sal)
        print "\tSim Threshold: {}".format(apsal_tr_sim)
        print "Generating Salience Ranked Summary"
        print "\tSal Threshold: {}".format(sal_rank_sal)
        print "\tSim Threshold: {}".format(sal_rank_sim)

        jobs = []
        for event, corpus in eval_data:
            jobs.append((HACFilteredSummary(), (event, prefix, feature_set,
                                                hac_dist, hac_sim)))

            jobs.append((APFilteredSummary(), (event, ap_sim)))
            jobs.append((APSalienceFilteredSummary(),
                         (event, prefix, feature_set, apsal_sal, apsal_sim)))
            jobs.append(
                (APSalTRankSalThreshFilteredSummary(),
                 (event, prefix, feature_set, apsal_tr_sal, apsal_tr_sim)))
            jobs.append(
                (RankedSalienceFilteredSummary(),
                 (event, prefix, feature_set, sal_rank_sal, sal_rank_sim)))

        self.do_work(filter_worker, jobs, **kwargs)
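The two self.do_work calls hand different job shapes to the workers: the clustering jobs are flat tuples consumed by cluster_worker, while the filtering jobs are (summarizer instance, argument tuple) pairs that filter_worker (Example #1) unpacks and then dispatches on by type. A small stand-in illustration of the filtering job format; the strings below are placeholders for the real cuttsum summarizer, event, prefix, and feature-set objects, and the cutoff numbers are this method's defaults.

# Placeholder jobs mirroring the (summarizer, args) pairs built above.
jobs = [
    ("APFilteredSummary", ("event-1", 0.7)),
    ("HACFilteredSummary", ("event-1", "prefix", "features", 1.35, 0.7)),
    ("APSalienceFilteredSummary", ("event-1", "prefix", "features", 0.4, 0.7)),
    ("APSalTRankSalThreshFilteredSummary",
     ("event-1", "prefix", "features", 0.6, 0.6)),
    ("RankedSalienceFilteredSummary",
     ("event-1", "prefix", "features", 1.8, 0.4)),
]

for summarizer, sum_args in jobs:
    # filter_worker performs exactly this unpacking, then branches on the
    # summarizer's type to decide which cutoffs it was handed.
    print("{}: {} args {}".format(summarizer, len(sum_args), sum_args))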
Example #3
    def tune(self,
             dev_data,
             prefix,
             feature_set,
             hac_dist_min=.9,
             hac_dist_max=5.05,
             hac_dist_step=.05,
             sal_min=-2.0,
             sal_max=2.0,
             sal_step=.1,
             sem_sim_min=.2,
             sem_sim_max=.7,
             sem_sim_step=.05,
             rank_sim_min=.2,
             rank_sim_max=.4,
             rank_sim_step=.05,
             **kwargs):

        ap = APSummarizer()
        apsal = APSalienceSummarizer()
        hac = HACSummarizer()

        hac_dist_cutoffs = np.arange(hac_dist_min,
                                     hac_dist_max + hac_dist_step,
                                     hac_dist_step)
        sal_cutoffs = np.arange(sal_min, sal_max + sal_step, sal_step)
        sem_sim_cutoffs = np.arange(sem_sim_min, sem_sim_max + sem_sim_step,
                                    sem_sim_step)
        rank_sim_cutoffs = np.arange(rank_sim_min,
                                     rank_sim_max + rank_sim_step,
                                     rank_sim_step)

        print "Tuning on dev data."

        ### Run clustering ###
        print "Generating AP Cluster\n\t(no params)"
        print "Generating AP+Salience Cluster\n\t(no params)"
        print "Generating HAC Cluster"
        print "\tDist Threshold ({}, {}), step={} {} jobs/event".format(
            hac_dist_min, hac_dist_max, hac_dist_step,
            hac_dist_cutoffs.shape[0])

        jobs = []
        for event, corpus in dev_data:
            print event.fs_name()
            apsal_tsv_dir = apsal.get_tsv_dir(prefix, feature_set)
            if not os.path.exists(apsal_tsv_dir):
                os.makedirs(apsal_tsv_dir)
            if not os.path.exists(ap.dir_):
                os.makedirs(ap.dir_)
            if not os.path.exists(hac.dir_):
                os.makedirs(hac.dir_)

            for cutoff in hac_dist_cutoffs:
                jobs.append((event, corpus, prefix, feature_set, hac, cutoff))
            jobs.append((event, corpus, prefix, feature_set, ap))
            jobs.append((event, corpus, prefix, feature_set, apsal))

        self.do_work(cluster_worker, jobs, **kwargs)

        ### Run filtering ###
        print
        print "Generating AP Summary"
        print "\tSim Threshold ({}, {}), step={}".format(
            sem_sim_min, sem_sim_max, sem_sim_step)
        print "\t{} jobs/event".format(sem_sim_cutoffs.shape[0])
        print "Generating AP+Salience Summary"
        print "\tSal Threshold ({}, {}), step={}".format(
            sal_min, sal_max, sal_step)
        print "\tSim Threshold ({}, {}), step={}".format(
            sem_sim_min, sem_sim_max, sem_sim_step)
        print "\t{} jobs/event".format(sal_cutoffs.shape[0] *
                                       sem_sim_cutoffs.shape[0])
        print "Generating HAC Summary"
        print "\tDist Threshold ({}, {}), step={}".format(
            hac_dist_min, hac_dist_max, hac_dist_step)
        print "\tSim Threshold ({}, {}), step={}".format(
            sem_sim_min, sem_sim_max, sem_sim_step)
        print "\t{} jobs/event".format(hac_dist_cutoffs.shape[0] *
                                       sem_sim_cutoffs.shape[0])

        rsfs = RankedSalienceFilteredSummary()
        if not os.path.exists(rsfs.dir_):
            os.makedirs(rsfs.dir_)
        jobs = []
        for event, corpus in dev_data:

            for sem_sim_cutoff in sem_sim_cutoffs:

                for dist_cutoff in hac_dist_cutoffs:
                    jobs.append(
                        (HACFilteredSummary(), (event, prefix, feature_set,
                                                dist_cutoff, sem_sim_cutoff)))

                jobs.append((APFilteredSummary(), (event, sem_sim_cutoff)))
                for sal_cutoff in sal_cutoffs:
                    jobs.append((APSalienceFilteredSummary(),
                                 (event, prefix, feature_set, sal_cutoff,
                                  sem_sim_cutoff)))
                    jobs.append((APSalTRankSalThreshFilteredSummary(),
                                 (event, prefix, feature_set, sal_cutoff,
                                  sem_sim_cutoff)))
            for rank_sim_cutoff in rank_sim_cutoffs:
                for sal_cutoff in sal_cutoffs:
                    jobs.append((RankedSalienceFilteredSummary(),
                                 (event, prefix, feature_set, sal_cutoff,
                                  rank_sim_cutoff)))
        self.do_work(filter_worker, jobs, **kwargs)
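The job counts printed above fall straight out of the grid shapes. Here is a standalone sketch of the same grid construction using this method's default ranges; np.arange excludes its stop value, which is why one extra step is added, and with float steps the exact endpoint can still be affected by rounding, so np.linspace with an explicit count is a common alternative.

import numpy as np

# Default sweep ranges from tune() above.
hac_dist_min, hac_dist_max, hac_dist_step = 0.9, 5.05, 0.05
sal_min, sal_max, sal_step = -2.0, 2.0, 0.1
sem_sim_min, sem_sim_max, sem_sim_step = 0.2, 0.7, 0.05

# np.arange(start, stop, step) is half-open, so the stop is pushed out by one
# step to keep the nominal maximum inside the sweep.
hac_dist_cutoffs = np.arange(hac_dist_min, hac_dist_max + hac_dist_step,
                             hac_dist_step)
sal_cutoffs = np.arange(sal_min, sal_max + sal_step, sal_step)
sem_sim_cutoffs = np.arange(sem_sim_min, sem_sim_max + sem_sim_step,
                            sem_sim_step)

print("HAC clustering jobs/event: {}".format(hac_dist_cutoffs.shape[0]))
print("HAC filtering jobs/event: {}".format(
    hac_dist_cutoffs.shape[0] * sem_sim_cutoffs.shape[0]))
print("AP+Salience filtering jobs/event: {}".format(
    sal_cutoffs.shape[0] * sem_sim_cutoffs.shape[0]))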
Example #4
    def make(self,
             event,
             prefix,
             feature_set,
             min_cluster_size=2,
             sim_threshold=.2264,
             dist_cutoff=1.35):
        tsv_path = self.get_tsv_path(event, dist_cutoff, sim_threshold)
        lvecs = SentenceLatentVectorsResource()
        #spa = SaliencePredictionAggregator()
        hac = HACSummarizer()
        cluster_df = hac.get_dataframe(event, dist_cutoff)
        #for _, row in cluster_df.iterrows():
        #    print row['hour'], datetime.utcfromtimestamp(row['timestamp'])
        updates = []
        # Xcache holds the latent vectors of sentences already added to the
        # summary; it is the memory for the redundancy filter below.
        Xcache = None
        timestamps = sorted(list(cluster_df['timestamp'].unique()))
        for timestamp in timestamps:
            hour = datetime.utcfromtimestamp(timestamp) - timedelta(hours=1)
            lvec_df = lvecs.get_dataframe(event, hour)
            lvec_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

            #            sal_df = spa.get_dataframe(event, hour, prefix, feature_set)
            #            sal_df.drop_duplicates(['stream id', 'sentence id'], inplace=True)

            clusters = cluster_df[cluster_df['timestamp'] == timestamp].copy()
            clusters.sort(['stream id', 'sentence id'], inplace=True)

            #            salience = []
            #            for _, row in clusters.iterrows():
            #                sal_pred = sal_df.loc[
            #                    (sal_df['stream id'] == row['stream id']) & \
            #                    (sal_df['sentence id'] == row['sentence id'])
            #                    ].as_matrix()[:,2:].astype(np.float64).mean()
            #                salience.append(sal_pred)
            #            clusters['salience'] = salience
            #            clusters.sort(['salience'], ascending=False, inplace=True)

            #            sal_mean = np.mean(salience)
            #            sal_std = np.std(salience)

            # Walk this hour's clusters, skip the small ones, and look up
            # each remaining sentence's latent vector.
            for _, row in clusters.iterrows():
                if row['cluster size'] < min_cluster_size:
                    continue
                vec = lvec_df.loc[
                    (lvec_df['stream id'] == row['stream id']) & \
                    (lvec_df['sentence id'] == row['sentence id'])
                    ].as_matrix()[:,2:].astype(np.float64)

                #sal_norm = (row['salience'] - sal_mean) / sal_std
                #if sal_norm < center_threshold:
                #    continue

                # Redundancy filter: admit the sentence only if its maximum
                # cosine similarity to everything already selected is below
                # sim_threshold; otherwise skip it.
                if Xcache is None:
                    Xcache = vec
                else:
                    if np.max(cosine_similarity(Xcache, vec)) >= sim_threshold:
                        continue
                    else:
                        Xcache = np.vstack((Xcache, vec))
                updates.append({
                    'query id': event.query_id[5:],
                    'system id': 'cunlp',
                    'run id': 'hac-dist_{}-sim_{}'.format(
                        dist_cutoff, sim_threshold),
                    'stream id': row['stream id'],
                    'sentence id': row['sentence id'],
                    'timestamp': timestamp,
                    'conf': 1.0,
                    'string': row['string']
                })

        df = pd.DataFrame(updates,
                          columns=[
                              "query id", "system id", "run id", "stream id",
                              "sentence id", "timestamp", "conf", "string"
                          ])
        with open(tsv_path, u'w') as f:
            df.to_csv(f,
                      sep='\t',
                      index=False,
                      index_label=False,
                      header=False)
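The core of make() is the Xcache loop: a greedy novelty filter that keeps a sentence only if its latent vector is not too similar to anything already selected. The same idea in isolation, on toy vectors; the function name, threshold for the demo call, and random data are illustrative only.

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def novelty_filter(vectors, sim_threshold=0.2264):
    # Greedily keep a vector only if its maximum cosine similarity to the
    # vectors kept so far is below sim_threshold (the Xcache loop above).
    kept, kept_idx = None, []
    for i, vec in enumerate(vectors):
        vec = vec.reshape(1, -1)
        if kept is None or np.max(cosine_similarity(kept, vec)) < sim_threshold:
            kept = vec if kept is None else np.vstack((kept, vec))
            kept_idx.append(i)
    return kept_idx


X = np.random.RandomState(0).rand(10, 5)
print(novelty_filter(X, sim_threshold=0.95))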