import multiprocessing

from cuttsum.misc import ProgressBar


def print_results(configs):
    # `rouge` is defined elsewhere in this script; it scores one config
    # and returns a (path, recall, precision, f1) tuple.
    n_jobs = len(configs)
    pb = ProgressBar(n_jobs)
    results = []
    pool = multiprocessing.Pool(24)
    for result in pool.imap_unordered(rouge, configs):
        pb.update()
        results.append(result)
    pool.close()
    pool.join()

    # Rank configs by F1 (tuple index 3) and show the 10 best.
    results.sort(key=lambda x: x[3], reverse=True)
    for i, result in enumerate(results[:10], 1):
        path, r, p, f1 = result
        print i, path
        print "R: {}".format(r), "P: {}".format(p), "F1: {}".format(f1)
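A minimal sketch of how print_results might be driven; the summary paths and (path, budget) config tuples below are hypothetical stand-ins for whatever `rouge` actually expects, not part of the repo:

if __name__ == '__main__':
    # Hypothetical configs: one entry per (summary file, length budget).
    summary_paths = [u'sys1.summaries.tsv', u'sys2.summaries.tsv']
    configs = [(path, budget)
               for path in summary_paths
               for budget in (100, 200, 400)]
    print_results(configs)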
def get(self, event, corpus, feature_set, prefix, model_events,
        n_samples=10, **kwargs):
    feats = get_resource_manager(u'SentenceFeaturesResource')  # unused below
    sm = SalienceModels()
    sp = SaliencePredictions()

    # Collect the paths of every salience model trained on the other events.
    model_paths = []
    for model_event in model_events:
        model_paths.extend(
            sm.get_model_paths(model_event, feature_set, prefix, n_samples))

    hours = event.list_event_hours()
    n_hours = len(hours)
    pb = ProgressBar(n_hours)
    for hour in hours:
        pb.update()

        # Keep only the per-model prediction files that actually exist
        # for this hour.
        tsv_paths = [sp.get_tsv_path(event, hour, prefix, feature_set,
                                     model_path)
                     for model_path in model_paths]
        tsv_paths = [path for path in tsv_paths if os.path.exists(path)]
        if len(tsv_paths) == 0:
            continue

        # Read each model's predictions, indexed by (stream id, sentence id),
        # then join them column-wise into a single frame.
        data = []
        for tsv_path in tsv_paths:
            with gzip.open(tsv_path, u'r') as f:
                df = pd.io.parsers.read_csv(f, sep='\t', quoting=3, header=0)
                df.set_index([u'stream id', u'sentence id'], inplace=True)
                data.append(df)
        df = pd.concat(data, axis=1)

        # Write the aggregated predictions, creating the directory if needed.
        agg_path = self.get_tsv_path(event, hour, prefix, feature_set)
        agg_dir = os.path.dirname(agg_path)
        if not os.path.exists(agg_dir):
            os.makedirs(agg_dir)
        df.columns = sorted(df.columns)
        with gzip.open(agg_path, u'w') as f:
            df.to_csv(f, sep='\t')
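The column-wise merge above relies on pandas aligning rows by the shared (stream id, sentence id) index, so each model contributes one prediction column per sentence. A self-contained illustration of that behavior with toy frames (not the repo's real prediction files):

import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [(u'stream-1', 0), (u'stream-1', 1)],
    names=[u'stream id', u'sentence id'])
model_a = pd.DataFrame({u'model_a': [0.1, 0.9]}, index=idx)
model_b = pd.DataFrame({u'model_b': [0.4, 0.2]}, index=idx)

# axis=1 joins on the index, giving one column per model's predictions.
agg = pd.concat([model_a, model_b], axis=1)
print agg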
import os
import subprocess
from multiprocessing import Pool

import cuttsum.sc
from cuttsum.misc import ProgressBar
import streamcorpus as sc


def worker(path):
    # The expected md5 is embedded in the chunk filename: take the last
    # '-'-separated token before the first '.' as the checksum.
    basename = os.path.basename(path)
    checksum = basename.split('.')[0]
    checksum = checksum.split('-')[-1]
    # md5sum prints "<hash>  -", so the hash is the first token.
    actual_checksum = subprocess.check_output(
        "xzcat {} | md5sum".format(path), shell=True).split(" ")[0]
    if checksum != actual_checksum:
        print "Bad path:", path


# Walk the chunk directory and collect every file to verify.
paths = []
for path, dirs, files in os.walk(cuttsum.sc.SCChunkResource().dir_):
    for fname in files:
        paths.append(os.path.join(path, fname))

pool = Pool(10)
pb = ProgressBar(len(paths))
for result in pool.imap_unordered(worker, paths):
    pb.update()
pool.close()
pool.join()
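The worker above interpolates the path into a shell pipeline, which breaks on filenames containing spaces or shell metacharacters. A sketch of an equivalent check that streams xzcat's output through hashlib instead; md5_of_xz is illustrative, not part of the repo:

import hashlib
import subprocess

def md5_of_xz(path, chunk_size=1 << 20):
    # Decompress with xzcat (no shell involved) and hash the stream
    # in-process, reading 1 MiB blocks to keep memory bounded.
    proc = subprocess.Popen(['xzcat', path], stdout=subprocess.PIPE)
    md5 = hashlib.md5()
    for block in iter(lambda: proc.stdout.read(chunk_size), b''):
        md5.update(block)
    proc.stdout.close()
    proc.wait()
    return md5.hexdigest()

worker could then compare checksum != md5_of_xz(path) with the same effect.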