Esempio n. 1
0
def make_apsaltr_summaries(apsal_sal_cutoff, apsal_sim_cutoff,
                           model_summaries, max_samples=1000):
    """Write the ROUGE config listing sample/model path pairs for dev events.

    For every dev event of every "crossval" job, ``max_samples`` lines of
    the form ``<sample summary path> <model summary path>`` are collected
    and written, newline-separated, to one config file under
    ``$TREC_DATA/rouge`` (``./rouge`` when TREC_DATA is unset).

    The sample summaries themselves are NOT generated by this function;
    only the paths where they are expected are recorded.

    Args:
        apsal_sal_cutoff: cutoff forwarded to ``get_dataframe`` and
            embedded in the generated file names.
        apsal_sim_cutoff: second cutoff, used the same way.
        model_summaries: mapping indexed by ``event.fs_name()``.
        max_samples: number of config entries emitted per event.

    Returns:
        The path of the config file that was written.
    """
    rouge_root = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    config_path = os.path.join(
        rouge_root,
        "apsaltr_sal_{}_sim_{}_config".format(
            apsal_sal_cutoff, apsal_sim_cutoff))

    entries = []
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, _corpus in job.dev_events():
            event_name = event.fs_name()
            reference = model_summaries[event_name]
            model_path = os.path.join(
                rouge_root, "model_{}".format(event_name))

            # The values below only feed the (currently disabled) random
            # sampling step. They are still computed so that a missing
            # dataframe or undecodable text fails here, as it always has.
            max_len = len(reference)
            frame = APSalTRankSalThreshFilteredSummary().get_dataframe(
                event, job.key, job.feature_set,
                apsal_sal_cutoff, apsal_sim_cutoff)
            updates = [text.decode('utf-8')
                       for text in frame['text'].tolist()]

            # One config line per sample index; the sample files are
            # expected to exist already at these paths.
            entries.extend(
                '{} {}'.format(
                    os.path.join(
                        rouge_root,
                        "apsaltr_sal_{}_sim_{}_sample_{}_{}".format(
                            apsal_sal_cutoff, apsal_sim_cutoff, n_sample,
                            event_name)),
                    model_path)
                for n_sample in xrange(max_samples))

    with open(config_path, 'w') as out:
        out.write('\n'.join(entries))

    return config_path
Esempio n. 2
0
def make_ap_summaries(ap_sim_cutoff, model_summaries, max_samples=1000):
    """Write the ROUGE config listing AP sample/model path pairs.

    Iterates over the dev events of every "crossval" job and records
    ``max_samples`` lines of the form ``<sample path> <model path>`` per
    event. The lines are written, newline-separated, to a single config
    file under ``$TREC_DATA/rouge`` (``./rouge`` when TREC_DATA is unset).

    No sample summary is generated here; only the expected paths are
    listed.

    Args:
        ap_sim_cutoff: cutoff forwarded to ``get_dataframe`` and embedded
            in the generated file names.
        model_summaries: mapping indexed by ``event.fs_name()``.
        max_samples: number of config entries emitted per event.

    Returns:
        The path of the config file that was written.
    """
    rouge_root = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    config_name = "ap_sim_{}_config".format(ap_sim_cutoff)
    config_path = os.path.join(rouge_root, config_name)

    lines = []
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, _corpus in job.dev_events():
            event_name = event.fs_name()
            reference = model_summaries[event_name]
            model_path = os.path.join(
                rouge_root, "model_{}".format(event_name))

            # Inputs of the disabled sampling step; still evaluated so the
            # same lookups and decoding errors surface as before.
            max_len = len(reference)
            frame = APFilteredSummary().get_dataframe(event, ap_sim_cutoff)
            updates = [text.decode('utf-8')
                       for text in frame['text'].tolist()]

            for n_sample in xrange(max_samples):
                sample_name = "ap_sim_{}_sample_{}_{}".format(
                    ap_sim_cutoff, n_sample, event_name)
                sum_path = os.path.join(rouge_root, sample_name)
                lines.append('{} {}'.format(sum_path, model_path))

    with open(config_path, 'w') as out:
        out.write('\n'.join(lines))

    return config_path
Esempio n. 3
0
def make_apsaltr_summaries(apsal_sal_cutoff,
                           apsal_sim_cutoff,
                           model_summaries,
                           max_samples=1000):

    data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    rouge_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge", "apsaltr")
    config_path = os.path.join(
        data_dir,
        "apsaltr_sal_{}_sim_{}_config".format(apsal_sal_cutoff,
                                              apsal_sim_cutoff))
    config_paths = defaultdict(list)
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, corpus in job.eval_events():
            model_summary = model_summaries[event.fs_name()]
            model_path = os.path.join(rouge_dir,
                                      "model_{}".format(event.fs_name()))
            max_len = len(model_summary)
            event_hours = event.list_event_hours()
            df = APSalTRankSalThreshFilteredSummary().get_dataframe(
                event, job.key, job.feature_set, apsal_sal_cutoff,
                apsal_sim_cutoff)

            n_hours = len(event_hours)
            for t, h in enumerate(xrange(12, n_hours, 12), 1):
                print "\t", t, h
                timestamp = int((event_hours[h] - \
                    datetime(1970,1,1)).total_seconds())
                df_t = df[df['timestamp'] < timestamp]

                updates = [
                    update.decode('utf-8') for update in df_t['text'].tolist()
                ]

                for n_sample in xrange(max_samples):
                    summary_text = random_summary(updates, max_len)
                    sum_path = os.path.join(
                        data_dir,
                        "apsaltr_sal_{}_sim_{}_sample_{}_t{}_{}".format(
                            apsal_sal_cutoff, apsal_sim_cutoff, n_sample, t,
                            event.fs_name()))
                    with open(sum_path, 'w') as f:
                        f.write(summary_text.encode('utf-8'))
                    config_paths[t].append('{} {}'.format(
                        sum_path, model_path))

    all_config_paths = []
    for t in sorted(config_paths.keys()):
        config_path_t = config_path + "_t{}".format(t)
        print config_path_t
        with open(config_path_t, 'w') as f:
            f.write('\n'.join(config_paths[t]))
        all_config_paths.append(config_path_t)

    return all_config_paths
Esempio n. 4
0
def make_apsaltr_summaries(
    apsal_sal_cutoff, apsal_sim_cutoff, model_summaries, max_samples=1000):
    
    data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    rouge_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge", "apsaltr")
    config_path = os.path.join(
        data_dir, "apsaltr_sal_{}_sim_{}_config".format(
            apsal_sal_cutoff, apsal_sim_cutoff))
    config_paths = defaultdict(list)
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, corpus in job.eval_events():
            model_summary = model_summaries[event.fs_name()]
            model_path = os.path.join(
                rouge_dir, "model_{}".format(event.fs_name()))
            max_len = len(model_summary)
            event_hours = event.list_event_hours()
            df = APSalTRankSalThreshFilteredSummary().get_dataframe(
                event, job.key, job.feature_set, 
                apsal_sal_cutoff, apsal_sim_cutoff)  
            
            n_hours = len(event_hours)
            for t, h in enumerate(xrange(12, n_hours, 12), 1):
                print "\t",t, h
                timestamp = int((event_hours[h] - \
                    datetime(1970,1,1)).total_seconds())
                df_t = df[df['timestamp'] < timestamp]
                
                updates = [update.decode('utf-8') 
                           for update in df_t['text'].tolist()]

                for n_sample in xrange(max_samples):
                    summary_text = random_summary(updates, max_len)      
                    sum_path = os.path.join(
                        data_dir, 
                        "apsaltr_sal_{}_sim_{}_sample_{}_t{}_{}".format(
                            apsal_sal_cutoff, apsal_sim_cutoff, n_sample, t,
                            event.fs_name()))
                    with open(sum_path, 'w') as f:
                        f.write(summary_text.encode('utf-8'))
                    config_paths[t].append(
                        '{} {}'.format(sum_path, model_path))

    all_config_paths = []
    for t in sorted(config_paths.keys()):
        config_path_t = config_path + "_t{}".format(t)
        print config_path_t
        with open(config_path_t, 'w') as f:
            f.write('\n'.join(config_paths[t]))   
        all_config_paths.append(config_path_t)             

    return all_config_paths
Esempio n. 5
0
def run_summarizer_jobs(feature_ablation=False, cross_fold=False, **kwargs):
    import cuttsum.pipeline.jobs as jobs

    from cuttsum.pipeline.salience import SalienceModels
    if feature_ablation:
        for jnum, job in enumerate(
            jobs.feature_ablation_jobs(u'feature-ablation')):

            print job
            job.start(**kwargs)

    if cross_fold:
        for job in jobs.event_cross_validation_jobs("crossval"):
            print job
            job.start(**kwargs)
Esempio n. 6
0
def run_summarizer_jobs(feature_ablation=False, cross_fold=False, **kwargs):
    import cuttsum.pipeline.jobs as jobs

    from cuttsum.pipeline.salience import SalienceModels
    if feature_ablation:
        for jnum, job in enumerate(
                jobs.feature_ablation_jobs(u'feature-ablation')):

            print job
            job.start(**kwargs)

    if cross_fold:
        for job in jobs.event_cross_validation_jobs("crossval"):
            print job
            job.start(**kwargs)
Esempio n. 7
0
def make_rank_summaries(rank_sal_cutoff,
                        rank_sim_cutoff,
                        model_summaries,
                        max_samples=1000):

    rouge_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge", "rank")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    config_path = os.path.join(
        data_dir, "rank_sal_{}_sim_{}_config".format(rank_sal_cutoff,
                                                     rank_sim_cutoff))
    config_paths = []
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, corpus in job.eval_events():
            model_summary = model_summaries[event.fs_name()]
            model_path = os.path.join(rouge_dir,
                                      "model_{}".format(event.fs_name()))
            max_len = len(model_summary)
            df = RankedSalienceFilteredSummary().get_dataframe(
                event, rank_sal_cutoff, rank_sim_cutoff)
            print event.fs_name()
            if df is None:
                print "?"
                import sys
                sys.exit()
            updates = [
                update.decode('utf-8') for update in df['text'].tolist()
            ]

            for n_sample in xrange(max_samples):
                summary_text = random_summary(updates, max_len)
                sum_path = os.path.join(
                    data_dir, "rank_sal_{}_sim_{}_sample_{}_{}".format(
                        rank_sal_cutoff, rank_sim_cutoff, n_sample,
                        event.fs_name()))
                with open(sum_path, 'w') as f:
                    f.write(summary_text.encode('utf-8'))
                config_paths.append('{} {}'.format(sum_path, model_path))

    with open(config_path, 'w') as f:
        f.write('\n'.join(config_paths))

    return config_path
Esempio n. 8
0
def make_rank_summaries(
    rank_sal_cutoff, rank_sim_cutoff, model_summaries, max_samples=1000):
    
    rouge_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
    data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge", "rank")
    if not os.path.exists(data_dir):
        os.makedirs(data_dir)
    config_path = os.path.join(
        data_dir, "rank_sal_{}_sim_{}_config".format(
            rank_sal_cutoff, rank_sim_cutoff))
    config_paths = []
    for job in jobs.event_cross_validation_jobs("crossval"):
        for event, corpus in job.eval_events():
            model_summary = model_summaries[event.fs_name()]
            model_path = os.path.join(
                rouge_dir, "model_{}".format(event.fs_name()))
            max_len = len(model_summary)
            df = RankedSalienceFilteredSummary().get_dataframe(
                event,  
                rank_sal_cutoff, rank_sim_cutoff)  
            print event.fs_name()
            if df is None:
                print "?"
                import sys
                sys.exit()
            updates = [update.decode('utf-8') 
                       for update in df['text'].tolist()]
            
            for n_sample in xrange(max_samples):
                summary_text = random_summary(updates, max_len)      
                sum_path = os.path.join(
                    data_dir, "rank_sal_{}_sim_{}_sample_{}_{}".format(
                        rank_sal_cutoff, rank_sim_cutoff, n_sample,
                        event.fs_name()))
                with open(sum_path, 'w') as f:
                    f.write(summary_text.encode('utf-8'))
                config_paths.append('{} {}'.format(sum_path, model_path))

    with open(config_path, 'w') as f:
        f.write('\n'.join(config_paths))                

    return config_path
Esempio n. 9
0
        shell=True)
    with open(rpath, 'w') as f:
        f.write(o)
    recall = float(re.search('X ROUGE-2 Average_R: ([^ ]+)', o).group(1))
    prec = float(re.search('X ROUGE-2 Average_P: ([^ ]+)', o).group(1))
    f1 = float(re.search('X ROUGE-2 Average_F: ([^ ]+)', o).group(1))
    return config_path, recall, prec, f1


# Model (reference) summaries keyed by event.fs_name(); filled in by the
# loop below with the UTF-8-decoded nugget text of each eval event.
model_summaries = {}
# Results of cj.get_2013_nuggets() and cj.get_2014_nuggets(), concatenated.
nuggets = pd.concat((cj.get_2013_nuggets(), cj.get_2014_nuggets()))

# Model files live in $TREC_DATA/rouge ("./rouge" when TREC_DATA is unset);
# the directory is created on first use.
data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
for job in jobs.event_cross_validation_jobs("crossval"):
    for event, corpus in job.eval_events():
        # One nugget text per line, restricted to this event's query id.
        nugget_text = '\n'.join(
            nuggets[nuggets['query id'] == event.query_id]['text'].tolist())
        # The in-memory copy is decoded; the file below receives the
        # undecoded text as-is.
        model_summaries[event.fs_name()] = nugget_text.decode('utf-8')
        model_path = os.path.join(data_dir, "model_{}".format(event.fs_name()))
        with open(model_path, 'w') as f:
            f.write(nugget_text)

from cuttsum.misc import ProgressBar

# Cutoff values to evaluate for the "apsal" system (one value each here).
apsal_sal_cutoffs = [.6]
apsal_sim_cutoffs = [.65]

# Cutoff values to evaluate for the "apsaltr" system (one value each here).
apsaltr_sal_cutoffs = [.6]
apsaltr_sim_cutoffs = [.7]
Esempio n. 10
0
        shell=True)
    recall = float(re.search('X ROUGE-2 Average_R: ([^ ]+)', o).group(1))
    prec = float(re.search('X ROUGE-2 Average_P: ([^ ]+)', o).group(1))
    f1 = float(re.search('X ROUGE-2 Average_F: ([^ ]+)', o).group(1))
    return config_path, recall, prec, f1

# generate_dev_sweep() yields four cutoff groups, unpacked further below.
hac_cutoffs, ap_cutoffs, apsal_cutoffs, rank_cutoffs = generate_dev_sweep()

# Model (reference) summaries keyed by event.fs_name(); filled in by the
# loop below with the UTF-8-decoded nugget text of each dev event.
model_summaries = {}
# Results of cj.get_2013_nuggets() and cj.get_2014_nuggets(), concatenated.
nuggets = pd.concat((cj.get_2013_nuggets(),
                     cj.get_2014_nuggets()))

# Model files live in $TREC_DATA/rouge ("./rouge" when TREC_DATA is unset);
# the directory is created on first use.
data_dir = os.path.join(os.getenv("TREC_DATA", "."), "rouge")
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
for job in jobs.event_cross_validation_jobs("crossval"):
    for event, corpus in job.dev_events():
        # One nugget text per line, restricted to this event's query id.
        nugget_text = '\n'.join(
            nuggets[nuggets['query id'] == event.query_id]['text'].tolist())
        # The in-memory copy is decoded; the file below receives the
        # undecoded text as-is.
        model_summaries[event.fs_name()] = nugget_text.decode('utf-8')
        model_path = os.path.join(
            data_dir, "model_{}".format(event.fs_name()))
        with open(model_path, 'w') as f:
            f.write(nugget_text)

from cuttsum.misc import ProgressBar
# Split each sweep group into its per-parameter cutoff sequences.
rank_sal_cutoffs, rank_sim_cutoffs = rank_cutoffs
# NOTE(review): the apsaltr cutoffs are unpacked from apsal_cutoffs, i.e.
# apsaltr shares the apsal sweep -- confirm this is intended.
apsaltr_sal_cutoffs, apsaltr_sim_cutoffs = apsal_cutoffs
apsal_sal_cutoffs, apsal_sim_cutoffs = apsal_cutoffs
ap_sim_cutoffs = ap_cutoffs
hac_dist_cutoffs, hac_sim_cutoffs = hac_cutoffs