Code Example #1
File: rouge_sampled_eval.py  Project: kedz/cuttsum
import multiprocessing

from cuttsum.misc import ProgressBar


def print_results(configs):

    n_jobs = len(configs)
    pb = ProgressBar(n_jobs)
    results = []
    # Score each config in a pool of 5 worker processes; imap_unordered
    # yields results as they finish, so the progress bar advances in real time.
    for result in multiprocessing.Pool(5).imap_unordered(rouge, configs):
        pb.update()
        results.append(result)
    # Sort by the fourth tuple element (the F1 score) and print the top 10.
    results.sort(key=lambda x: x[3], reverse=True)
    for result in results[:10]:
        print result
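
print_results assumes a rouge callable defined elsewhere in the same file; examples #3 and #4 below unpack each result as (path, r, p, f1), which is why key=lambda x: x[3] sorts by F1 score. A hypothetical stand-in with that shape, for illustration only:

def rouge(config):
    # Stand-in for the project's real scorer: evaluate one config and
    # return (path, recall, precision, f1). Placeholder values only.
    path = config
    r, p, f1 = 0.0, 0.0, 0.0
    return (path, r, p, f1)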
Code Example #2
def print_results(configs):

    n_jobs = len(configs)
    pb = ProgressBar(n_jobs)
    results = []
    for result in multiprocessing.Pool(20).imap_unordered(rouge, configs):
        pb.update()
        results.append(result)
    results.sort(key=lambda x: x[3], reverse=True)
    for result in results[:10]:
        print result
Code Example #3
File: rouge_sampled_dev.py  Project: kedz/cuttsum
def print_results(configs):

    n_jobs = len(configs)
    pb = ProgressBar(n_jobs)
    results = []
    for result in multiprocessing.Pool(24).imap_unordered(rouge, configs):
        pb.update()
        results.append(result)
    results.sort(key=lambda x: x[3], reverse=True)
    for i, result in enumerate(results[:10], 1):
        path, r, p, f1 = result
        print i, path
        print  "R: {}".format(r), "P: {}".format(p), "F1: {}".format(f1)
Code Example #4
def print_results(configs):

    n_jobs = len(configs)
    pb = ProgressBar(n_jobs)
    results = []
    for result in multiprocessing.Pool(24).imap_unordered(rouge, configs):
        pb.update()
        results.append(result)
    results.sort(key=lambda x: x[3], reverse=True)
    for i, result in enumerate(results[:10], 1):
        path, r, p, f1 = result
        print i, path
        print "R: {}".format(r), "P: {}".format(p), "F1: {}".format(f1)
Code Example #5
    def get(self,
            event,
            corpus,
            feature_set,
            prefix,
            model_events,
            n_samples=10,
            **kwargs):

        feats = get_resource_manager(u'SentenceFeaturesResource')

        # Gather the trained salience model paths for every model event.
        sm = SalienceModels()
        sp = SaliencePredictions()
        model_paths = []
        for model_event in model_events:
            model_paths.extend(
                sm.get_model_paths(model_event, feature_set, prefix,
                                   n_samples))

        hours = event.list_event_hours()
        n_hours = len(hours)
        pb = ProgressBar(n_hours)
        for hour in hours:
            pb.update()
            # Collect this hour's per-model prediction files, skipping
            # models that have not produced output yet.
            tsv_paths = \
                [sp.get_tsv_path(event, hour, prefix, feature_set, model_path)
                 for model_path in model_paths]
            tsv_paths = [path for path in tsv_paths if os.path.exists(path)]
            if len(tsv_paths) == 0:
                continue
            data = []
            for tsv_path in tsv_paths:
                with gzip.open(tsv_path, u'r') as f:
                    df = pd.io.parsers.read_csv(f,
                                                sep='\t',
                                                quoting=3,
                                                header=0)
                    df.set_index([u'stream id', u'sentence id'], inplace=True)
                    data.append(df)
            # Align the per-model frames on (stream id, sentence id) and
            # write the merged table out as a gzipped TSV.
            df = pd.concat(data, axis=1)
            agg_path = self.get_tsv_path(event, hour, prefix, feature_set)
            agg_dir = os.path.dirname(agg_path)
            if not os.path.exists(agg_dir):
                os.makedirs(agg_dir)

            df.columns = sorted(df.columns)
            with gzip.open(agg_path, u'w') as f:
                df.to_csv(f, sep='\t')
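
The heart of this method is pd.concat(data, axis=1): every per-model frame is indexed by (stream id, sentence id), so a column-wise concat aligns rows across models and yields one score column per model. A self-contained toy illustration of that alignment (made-up data, not project output):

import pandas as pd

a = pd.DataFrame({u'stream id': [1, 1], u'sentence id': [0, 1],
                  u'model-a': [0.9, 0.1]})
b = pd.DataFrame({u'stream id': [1, 1], u'sentence id': [0, 1],
                  u'model-b': [0.4, 0.7]})
a.set_index([u'stream id', u'sentence id'], inplace=True)
b.set_index([u'stream id', u'sentence id'], inplace=True)
merged = pd.concat([a, b], axis=1)  # one row per sentence, one column per model
print merged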
Code Example #6
File: salience.py  Project: kedz/cuttsum
    def get(self, event, corpus, feature_set,
            prefix, model_events, n_samples=10, **kwargs):

        feats = get_resource_manager(u'SentenceFeaturesResource')
        
        sm = SalienceModels()
        sp = SaliencePredictions()
        model_paths = []
        for model_event in model_events:
            model_paths.extend(
                sm.get_model_paths(
                    model_event, feature_set, prefix, n_samples))

        hours = event.list_event_hours()
        n_hours = len(hours)
        pb = ProgressBar(n_hours)
        for hour in hours:
            pb.update()
            tsv_paths = \
                [sp.get_tsv_path(event, hour, prefix, feature_set, model_path)
                 for model_path in model_paths]
            tsv_paths = [path for path in tsv_paths if os.path.exists(path)]
            if len(tsv_paths) == 0:
                continue
            data = []
            for tsv_path in tsv_paths:
                with gzip.open(tsv_path, u'r') as f:
                    df = pd.io.parsers.read_csv(
                        f, sep='\t', quoting=3, header=0)
                    df.set_index([u'stream id', u'sentence id'], inplace=True)
                    data.append(df)
            df = pd.concat(data, axis=1)
            agg_path = self.get_tsv_path(event, hour, prefix, feature_set)
            agg_dir = os.path.dirname(agg_path)
            if not os.path.exists(agg_dir):
                os.makedirs(agg_dir)

            df.columns = sorted(df.columns)
            with gzip.open(agg_path, u'w') as f:
                df.to_csv(f, sep='\t')
Code Example #7
import cuttsum.sc
from cuttsum.misc import ProgressBar
import os
from multiprocessing import Pool
import streamcorpus as sc
import subprocess


def worker(path):
    # The expected md5 is embedded in the chunk filename: the stem's
    # last hyphen-separated token, e.g. ...-<md5>.sc.xz.
    basename = os.path.basename(path)
    checksum = basename.split('.')[0]
    checksum = checksum.split('-')[-1]
    # Recompute the checksum of the decompressed chunk and compare.
    actual_checksum = subprocess.check_output(
        "xzcat {} | md5sum".format(path), shell=True).split(" ")[0]
    if checksum != actual_checksum:
        print "Bad path:", path

# Collect every chunk file under the stream corpus chunk store.
paths = []
for path, dirs, files in os.walk(cuttsum.sc.SCChunkResource().dir_):
    for fname in files:
        paths.append(os.path.join(path, fname))

pool = Pool(10)
pb = ProgressBar(len(paths))

for result in pool.imap_unordered(worker, paths):
    pb.update()
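
ProgressBar comes from cuttsum.misc and its source is not shown on this page. From the way every example uses it (constructed with the total job count, update() called once per completed job), a minimal sketch could look like the following; this is an assumed interface, not the project's actual implementation:

import sys


class ProgressBar(object):
    # Assumed interface: ProgressBar(n_jobs), then update() once per job.

    def __init__(self, n_jobs):
        self.n_jobs = n_jobs
        self.n_done = 0

    def update(self):
        # Rewrite the status line in place as jobs complete.
        self.n_done += 1
        sys.stderr.write('\r{}/{} jobs done'.format(self.n_done, self.n_jobs))
        sys.stderr.flush()
        if self.n_done == self.n_jobs:
            sys.stderr.write('\n')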