def __init__(self, pool: multiprocessing.Pool,
             post_files: typing.Iterable[pathlib.Path],
             base_url: str=None, feed_url: str=None,
             outdate_epoch: typing.Optional[datetime.datetime]=None):
    """Load all posts, sort them by publication date and build the Jinja2 env.

    :param pool: pool used to evaluate per-post metadata up front.
    :param post_files: paths, each of which is wrapped in a ``Post``.
    :param base_url: stored as ``self.base_url``.
    :param feed_url: stored as ``self._feed_url``.
    :param outdate_epoch: stored on the blog and exposed to templates.
    """
    log = logging.getLogger('Blog')
    self.logger = log
    self.pool = pool
    self.base_url = base_url
    self._feed_url = feed_url

    log.info('Loading posts...')
    self.posts = [Post(post_file) for post_file in post_files]

    # Loading published dates: the iterator is drained only so that every
    # task has finished before we go on; the values themselves are unused.
    for _ in pool.imap_unordered(self._get_published_at, self.posts):
        pass
    # Loading titles (same drain-and-discard approach).
    for _ in pool.imap_unordered(operator.attrgetter('title'), self.posts):
        pass

    self.posts.sort(key=self._get_published_at)
    self.canon_posts = [post for post in self.posts if post.canon]
    log.info('Total %d posts are loaded.', len(self.posts))

    self.current_base_path = './'
    self.outdate_epoch = outdate_epoch

    env = Environment(
        loader=FileSystemLoader('templates'),
        extensions=['jinja2.ext.with_'],
        autoescape=True,
    )
    self.jinja2_env = env
    # Names made available to every template rendered through this env.
    env.globals.update(
        blog=self,
        href_for=self.resolve_relative_url,
        outdate_epoch=self.outdate_epoch,
    )
def update(self, date):
    """Update index components (and weight) for the **same** day before market open.

    Executes ``sql.CMD1`` (components) and ``sql.CMD2`` (weights) for
    ``date``, groups the components by prefixed index name and hands one
    ``(date, dname, components, weights)`` task per group to ``worker``
    in a process pool.

    Returns ``None``; when the component query yields no rows a warning is
    logged and nothing is dispatched.
    """
    def _label(frame, columns):
        # Name the columns, prefix ``dname`` with the exchange code
        # (market 83 -> 'SH', anything else -> 'SZ') and index by ``sid``.
        frame.columns = columns
        frame.dname = ['SH' + dname if mkt == 83 else 'SZ' + dname
                       for mkt, dname in zip(frame.market, frame.dname)]
        frame.index = frame.sid
        return frame

    CMD = sql.CMD1.format(date=date)
    self.logger.debug('Executing command:\n{}', CMD)
    self.cursor.execute(CMD)
    df1 = pd.DataFrame(list(self.cursor))
    if len(df1) == 0:
        self.logger.warning('No records found for {} on {}',
                            self.db.index_components.name, date)
        return
    df1 = _label(df1, ['dname', 'market', 'sid'])

    CMD = sql.CMD2.format(date=date)
    self.logger.debug('Executing command:\n{}', CMD)
    self.cursor.execute(CMD)
    try:
        df2 = _label(pd.DataFrame(list(self.cursor)),
                     ['dname', 'market', 'sid', 'weight'])
    except Exception:
        # Weights are best-effort: workers receive ``None`` when the weight
        # frame cannot be built (e.g. the query returned no rows).
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt/SystemExit still propagate.
        df2 = None

    grouped = df1.groupby('dname')
    tasks = [(date, dname, _df1, df2) for dname, _df1 in grouped]
    pool = Pool(self.threads)
    try:
        # The result iterator is intentionally not consumed, so exceptions
        # raised inside ``worker`` are not re-raised here.
        pool.imap_unordered(worker, tasks, self.threads)
    finally:
        # Always release the worker processes, even if dispatch failed.
        pool.close()
        pool.join()
    self.logger.info('UPSERT documents for {} indice into (c: [{}]) of (d: [{}]) on {}',
                     len(grouped), COLLECTION.name, self.db.name, date)
def make_epoch(n, train_true, train_false, val_true, val_false):
    """Assemble and load the training/validation data for one epoch.

    ``n`` is indexable; its first element is the epoch number used in the
    progress messages.  All positives are kept and twice as many shuffled
    negatives are appended before the file tuples are passed through
    ``combine_tups`` and loaded with ``load_data`` in a 12-process pool.

    Returns ``(train_epoch_data, val_epoch_data)``; only the training data
    is shuffled after loading.
    """
    n = n[0]
    train_false = list(train_false)
    val_false = list(val_false)
    np.random.shuffle(train_false)
    np.random.shuffle(val_false)

    # *2 to account for 1 flip directions
    train_epoch = combine_tups(train_true + train_false[:len(train_true) * 2])
    val_epoch = combine_tups(val_true + val_false[:len(val_true) * 2])
    print("Epoch {0} n files {1}&{2}".format(n, len(train_epoch), len(val_epoch)))

    pool = Pool(processes=12)
    try:
        train_epoch_data = list(itertools.chain.from_iterable(
            pool.imap_unordered(load_data, train_epoch)))
        print("Epoch {0} done loading train".format(n))
        val_epoch_data = list(itertools.chain.from_iterable(
            pool.imap_unordered(load_data, val_epoch)))
        print("Epoch {0} done loading validation".format(n))
        pool.close()
    except BaseException:
        # Do not leave 12 orphaned workers behind when loading fails.
        pool.terminate()
        raise
    finally:
        pool.join()

    np.random.shuffle(train_epoch_data)
    return train_epoch_data, val_epoch_data
def main():
    """Compute group sums and over-represented words for an experiment set.

    Stages: (1) foreground group sums in a process pool, (2) background
    sums derived from the cached foreground sums, (3) persist all sums in
    ZODB under ``group_sums[feature_name]``, (4) compute over-representation
    per experiment in the pool and store the rows in the results database.

    A Ctrl-C during stage 1 terminates the pool and aborts: the later
    stages need the complete cache and a running pool.  A Ctrl-C during
    stage 4 terminates the pool and commits the rows collected so far.
    """
    args = docopt(__doc__)
    feature_name = args['<feature_name>']
    assert feature_name == 'words'
    assert args['<experimentset_name>'] in EXPERIMENT_SETS, \
        '<experimentset_name> must be one of %s' % str(EXPERIMENT_SETS.keys())
    c = get_config()
    experiment_set = EXPERIMENT_SETS[args['<experimentset_name>']](
        feature_name=feature_name)

    print("Computing foreground group sums using %d cores..." % c.num_cores)
    pool = Pool(c.num_cores, init_worker)
    fg_groups = experiment_set.list_foreground_groups()
    cache = {}
    try:
        for group_name, sum_vector in progress.bar(
                pool.imap_unordered(
                    ComputeForegroundGroupSumCallable(experiment_set),
                    fg_groups),
                label="Progress ", expected_size=len(fg_groups)):
            cache[group_name] = sum_vector
    except KeyboardInterrupt:
        print("Terminating pool..")
        pool.terminate()
        pool.join()
        # The pool is gone and the cache is partial; nothing below can work.
        return

    print("Computing background sums...")
    bg_groups = experiment_set.list_background_groups()
    for g in bg_groups:
        cache[g] = experiment_set.compute_background_group_sum(g, cache)

    print("Saving sums to ZODB...")
    zodb_root = open_zodb(read_only=False)
    if getattr(zodb_root, 'group_sums', None) is None:
        zodb_root.group_sums = BTrees.OOBTree.OOBTree()
        transaction.commit()
    if feature_name not in zodb_root.group_sums:
        zodb_root.group_sums[feature_name] = BTrees.OOBTree.OOBTree()
        transaction.commit()
    feature_sums = zodb_root.group_sums[feature_name]
    for k, v in cache.items():
        feature_sums[k] = v
    transaction.commit()

    print("Creating output db tables...")
    create_db(c.resultsdb_url)
    session_out = open_db(c.resultsdb_url)

    print("Computing overrepresentation using %d cores..." % c.num_cores)
    exps = experiment_set.list_experiments()
    cls = experiment_set.result_table_class()
    try:
        for fg, bg, results in progress.bar(
                pool.imap_unordered(
                    ComputeOverrepresentedWordsCallable(experiment_set),
                    exps),
                label="Progress ", expected_size=len(exps)):
            for w, odds, pval in results:
                # Named ``row`` so the config object ``c`` is not shadowed.
                row = cls(foreground_group_name=fg, background_group_name=bg,
                          word=w, odds=odds, pval=pval)
                session_out.add(row)
        pool.close()
    except KeyboardInterrupt:
        print("Terminating pool..")
        pool.terminate()
    pool.join()

    print("Committing...")
    session_out.commit()
    print("Done")
def main() :
    """Queue one ``multi_run_wrapper`` task per process name on a 4-worker pool.

    Each task is submitted through its own ``imap_unordered`` call and the
    results are never read; the function only waits for the pool to drain.
    """
    print("Title here")
    multiprocessing.freeze_support()

    worker_count = 4
    print('\r\n\tCreating pool with:\t%d processes' % worker_count)
    pool = Pool(worker_count)
    print('\tNo. of cpu\'s present:\t%d cores' % cpu_count())

    names = ("process1", "process2", "process3",
             "process4", "process5", "process6")
    for name in names:
        task = (name, variable1, variable2, variable3,
                variable4, variable5, variable6)
        pool.imap_unordered(multi_run_wrapper, [task])

    pool.close()
    pool.join()
def handle(self, *args, **options):
    """Run ``run_worker`` for user ids ``1..options['count']`` in a pool.

    The ids are cut into consecutive slices of ``job_size`` and each slice
    becomes one task; every task also receives the list of enabled
    versions.  Task results are not read.
    """
    user_count = options['count'] + 1
    users = range(1, user_count)
    versions = list(
        Version.objects.select_related('app', 'platform').filter_by_enabled())

    # Aim for two jobs per CPU.  The original ``cpu_count() or 1 * 2``
    # parsed as ``cpu_count() or 2`` because ``*`` binds tighter than ``or``.
    job_size = user_count // ((cpu_count() or 1) * 2) or 1
    job_data = [users[i:i + job_size] for i in range(0, len(users), job_size)]

    pool = Pool()
    try:
        pool.imap_unordered(partial(run_worker, versions=versions), job_data)
    finally:
        # Release the workers even if dispatching the jobs failed.
        pool.close()
        pool.join()
def _go(self, num_procs, chunk_size = None):
    '''
    Drive the whole pipeline: item generator -> mapper(s) -> reducer.

    With ``num_procs`` greater than one, items are mapped in a process
    pool (optionally with the given ``chunk_size``); otherwise they are
    mapped in this process.  ``self.finalize()`` runs once everything has
    been reduced, and the pool (if any) is always closed and joined.
    '''
    pool = None
    try:
        print('Initiating...', file=sys.stderr)
        items = self.item_generator()
        reduce_one = self.reducer()
        map_one = self.mapper()

        if (num_procs > 1):
            print('Using %d processes' %num_procs, file=sys.stderr)
            # The mapper travels with every task so each worker process
            # receives it together with the item.
            tasks = ((map_one, key, item) for key, item in items)
            pool = Pool(num_procs)
            # Only pass a chunk size when a truthy one was supplied.
            extra = (chunk_size,) if chunk_size else ()
            mapped = pool.imap_unordered(_processor_helper, tasks, *extra)
            for key, value in mapped:
                reduce_one(key, value)
        else:
            print('Using one mapper only', file = sys.stderr)
            for key, item in items:
                reduce_one(key, map_one(key, item))

        self.finalize()
        print('Done.', file = sys.stderr)
    finally:
        if pool is not None:
            pool.close()
            pool.join()
def main():
    """Run ``worker`` over every job and stream the results into a pickle file.

    The output file starts with the job count, followed by one pickled
    result per job in completion order.  Progress is written to stdout.
    On Ctrl-C the pool is terminated and the process exits.
    """
    import sys

    idir, ofile, dffile = _parse_cmdline()
    print(u'Loading doc-freqs file {}...'.format(dffile))
    # NOTE: pickle.load executes arbitrary code; only use trusted files.
    with open(dffile, 'rb') as f:
        df = pickle.load(f)
    print(u'Reading input directory: {}'.format(idir))
    jobs = _load_jobs(idir, df)

    # Do the work.
    pool = Pool(4)
    njobs = len(jobs)
    # Defined up front so the interrupt handler can always report it, even
    # when Ctrl-C arrives before the first result.
    per = 0.0
    try:
        with codecs.open(ofile, 'wb') as pf:
            pickle.dump(njobs, pf)
            results = pool.imap_unordered(worker, jobs)
            for i, result in enumerate(results, 1):
                pickle.dump(result, pf)
                per = 100 * (float(i) / njobs)
                sys.stdout.write(u'\rPercent Complete: {:2.3f}%'.format(per))
                sys.stdout.flush()
        sys.stdout.write(u'\rPercent Complete: 100% \n')
        sys.stdout.flush()
    except KeyboardInterrupt:
        # Stop the workers instead of leaving them running behind us.
        pool.terminate()
        pool.join()
        sys.stdout.write(u'\rPercent Complete: {:2.3f}% \n'.format(per))
        sys.stdout.write(u'Shutting down.\n')
        sys.stdout.flush()
        sys.exit()
    pool.close()
    pool.join()
    print(u'Complete!')
def store_contents(data_path, save_path, preprocess, num_workers=None):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of
          files containing json encoded documents (must have `id` and `text`
          fields).
        save_path: Path to output sqlite db.
        preprocess: Path to file defining a custom `preprocess` function.
          Takes in and outputs a structured doc.
        num_workers: Number of parallel processes to use when reading docs.

    Raises:
        RuntimeError: if `save_path` already exists.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)

    logger.info('Reading into database...')
    conn = sqlite3.connect(save_path)
    try:
        c = conn.cursor()
        c.execute("CREATE TABLE documents (id PRIMARY KEY, text);")

        workers = ProcessPool(num_workers, initializer=init,
                              initargs=(preprocess,))
        files = list(iter_files(data_path))
        count = 0
        try:
            # One progress bar, advanced once per processed file.  (The
            # iterator used to be wrapped in a second tqdm as well, which
            # drew a duplicate bar without a total.)
            with tqdm(total=len(files)) as pbar:
                for pairs in workers.imap_unordered(get_contents, files):
                    count += len(pairs)
                    c.executemany("INSERT INTO documents VALUES (?,?)", pairs)
                    pbar.update()
            workers.close()
        except BaseException:
            workers.terminate()
            raise
        finally:
            workers.join()

        logger.info('Read %d docs.' % count)
        logger.info('Committing...')
        conn.commit()
    finally:
        # Close the connection on every path, including failures.
        conn.close()
def compress_file(self,corpus, np=4,separator=None):
    """
    construct WLZW pattern out of a corpus, parallelism is an option
    @param corpus - string, file path of the corpus
    @param np - number of processes, if np = 1 the algorithm is run in serial
    @param separator - the separator string to separate doc id and document.
                       pass None if no doc id is given
    @return set, the final set containing all frequent patterns
    """
    # if only one process, no need for parallelization
    if np == 1:
        return set(_compress_file((corpus, 0, np, separator)))

    # One task per partition index; each task knows the partition count.
    tasks = [(corpus, i, np, separator) for i in range(np)]
    p = Pool(processes=np)
    try:
        # ``np`` is never 1 here, so the partial results are always merged.
        final_set = _union(p.imap_unordered(_compress_file, tasks, 1))
        p.close()
    except BaseException:
        p.terminate()
        raise
    finally:
        p.join()
    return final_set
def imap_unordered(self, func, iterable, chunksize=1):
    '''Same as SGEPool.imap, except that the results are unordered.

    Rather than blocking to ensure the correct order, all jobs are polled
    and results are returned as soon as they are done.

    Without grid engine (``self.use_grid_engine`` false) the work is run
    in a local ``multiprocessing`` pool and nothing is submitted to the
    grid.  Otherwise jobs are submitted with ``self._submit_jobs`` and the
    data read from each finished job's output file is yielded; the job's
    input and output files are removed afterwards.
    '''
    if not self.use_grid_engine:
        workerPool = Pool(initializer=self.initializer, initargs=self.initargs)
        try:
            for val in workerPool.imap_unordered(func, iterable, chunksize):
                yield val
            workerPool.close()
        except BaseException:
            # Includes GeneratorExit when the consumer stops early.
            workerPool.terminate()
            raise
        finally:
            workerPool.join()
        # Local execution is complete; do not fall through and submit the
        # same work to the grid engine as well.
        return

    iterable = iter(iterable)
    allJobs = self._submit_jobs(func, iterable, 'map', chunksize)
    interval = 3
    while len(allJobs) > 0:
        doneJobs = []
        for job in allJobs:
            if job.isFinished():
                doneJobs.append(job)
                try:
                    for data in self._getData(job.outputFile):
                        yield data
                finally:
                    # Remove this job's files even if reading them failed or
                    # the consumer abandoned the generator.  Files of jobs
                    # that were not reached yet are still left behind.
                    os.remove(job.inputFile)
                    os.remove(job.outputFile)
        for job in doneJobs:
            allJobs.remove(job)
        if len(doneJobs) == 0:
            # no jobs are done yet-- wait for a while for them to finish
            time.sleep(interval)
            # NOTE(review): ``min`` with .001 drops the interval to 1 ms
            # after the first wait; this looks like it was meant to be a
            # capped back-off -- confirm the intended cap before changing.
            interval = min(2 * interval, .001)
def main():
    """Analyse every sdist under the mirror root and print the results.

    ``analyse_sdist`` is run in a process pool over ``yield_packages(root)``
    and the ``path -> result`` mapping is pretty-printed.  With ``--json``
    the mapping is also written to that file as JSON.
    """
    parser = argparse.ArgumentParser(
        description='Analyze a bandersnatch mirror.')
    parser.add_argument('--json', help='save raw data to a json file',
                        default=None)
    args = parser.parse_args()

    root = "/var/spool/pypi/web/packages/source/"
    p = Pool()
    results = {}
    try:
        for path, result in p.imap_unordered(analyse_sdist,
                                             yield_packages(root)):
            results[path] = result
        p.close()
    except BaseException:
        # Stop outstanding work on any failure (including Ctrl-C).
        p.terminate()
        raise
    finally:
        p.join()

    if args.json:
        # Text mode: ``json`` produces ``str``, which cannot be written to
        # a binary handle on Python 3.
        with open(args.json, 'w') as f:
            json.dump(results, f)
    pprint.pprint(results)
def run_committee(graph, eweights, signs, tree_kind='rst',
                  train_vertices=.1, size=13, degree_function=None,
                  threshold_function=None):
    """Run ``predict`` on ``size`` trees in parallel and majority-vote them.

    The inputs are published through the module globals ``GRAPH``,
    ``EWEIGHTS``, ``SIGNS`` and ``VTRAIN`` before the pool is created
    (presumably so worker processes can read them -- confirm against
    ``predict``).

    :param train_vertices: either the vertices to reveal, or a float giving
        the fraction of ``graph`` to sample at random.
    :param tree_kind: ``'rst'``, ``'bfs'`` or ``'stg'`` (case-insensitive).
        For ``'bfs'`` the roots are the ``size`` highest-degree nodes.
    :param size: number of committee members.
    :return: ``(gold, majority_vote(preds))`` where ``gold`` comes from the
        first result received.
    """
    global GRAPH, EWEIGHTS, SIGNS, VTRAIN
    GRAPH, EWEIGHTS, SIGNS = graph, eweights, signs
    if isinstance(train_vertices, float):
        num_revealed = int(train_vertices*len(graph))
        train_vertices = random.sample(list(graph.keys()), num_revealed)
    VTRAIN = train_vertices

    tree_kind = tree_kind.lower()
    assert tree_kind in ['rst', 'bfs', 'stg'], tree_kind
    if tree_kind == 'rst':
        args = size*[(get_rst, {'fake': None}), ]
    elif tree_kind == 'bfs':
        degrees = sorted(((node, len(adj)) for node, adj in graph.items()),
                         key=lambda x: x[1])
        args = [(get_bfs, {'root': node}) for node, _ in degrees[-size:]]
    else:  # 'stg'
        func_dict = {'degree_function': degree_function,
                     'threshold_function': threshold_function}
        args = size*[(get_stg, func_dict), ]

    num_threads = min(6, size)
    pool = Pool(num_threads)
    try:
        res = list(pool.imap_unordered(predict, args,
                                       chunksize=size//num_threads))
        pool.close()
    except BaseException:
        pool.terminate()
        raise
    finally:
        # Previously the pool was never closed, leaking its workers.
        pool.join()

    preds, gold = [r[1] for r in res], res[0][0]
    return gold, majority_vote(preds)
def create_database(f, db):
    """Create the ``polygons`` table in ``db`` and fill it from ``f``.

    Rows are produced by ``process_data`` running in a process pool over
    ``raw_data(f)`` (chunks of 100) and inserted as they arrive.  Also
    resets the module-level ``t0`` to the current ``walltime()``.
    """
    global t0
    t0 = walltime()
    P = Pool(NUMPROCESSES)
    try:
        it = P.imap_unordered(process_data, raw_data(f), chunksize=100)
        con = connect(db)
        try:
            con.execute(''' CREATE TABLE polygons( rowid INTEGER PRIMARY KEY, vertices TEXT, num_vertices INTEGER, volume REAL, num_points INTEGER, num_interior INTEGER, num_border INTEGER, width INTEGER, length INTEGER, symm INTEGER) ''')
            con.executemany(
                'INSERT INTO polygons VALUES (?,?,?,?,?,?,?,?,?,?)', it)
            con.commit()
        finally:
            # Close the connection even when an insert fails.
            con.close()
        P.close()
    except BaseException:
        # Do not leave worker processes running after a failure.
        P.terminate()
        raise
    finally:
        P.join()
def all_links(root='http://ailev.livejournal.com/', nb=10, path='post-list.txt'):
    """Collect the post links of every calendar day and save them to ``path``.

    Days come from ``list_days``; ``list_posts`` is run for each day on a
    pool of ``nb`` processes.  The de-duplicated, sorted links are written
    one per line and returned.
    """
    print('Fetch calendar entries')
    days = list_days(root_url=root)
    print('There are {} days with entries'.format(len(days)))

    started = time()
    pool = Pool(processes=nb)
    per_day = list(tqdm(pool.imap_unordered(list_posts, days), total=len(days)))
    pool.close()
    pool.join()

    # there may be duplicates, don't know why, so fast walkaround
    unique = set()
    for found in per_day:
        if found:
            unique.update(found)
    links = sorted(unique)

    with open(path, 'w') as fout:
        for link in links:
            fout.write(link + '\n')

    finished = time()
    print('Done for {}s'.format(finished - started))
    return links
def normalize_all_words(l):
    """Map every word found in the values of ``l`` to its normalised form.

    ``normalize_word`` is applied to each element of each value of ``l``
    in a process pool; every call yields a ``(word, normalised)`` pair.

    :param l: mapping whose values are iterables of words.
    :return: dict ``word -> normalised word``.
    """
    r = {}
    processes = Pool(max(1, cpu_count()-1))
    try:
        for s in l.values():
            r.update(processes.imap_unordered(normalize_word, s))
        processes.close()
    except BaseException:
        processes.terminate()
        raise
    finally:
        # The pool used to be left open, leaking its worker processes.
        processes.join()
    return r
def build_level_database(input_path):
    """Scan all level files below ``input_path`` and pickle the database.

    Each path returned by ``find`` is processed with ``process_level`` in a
    4-process pool; falsy results are dropped.  The database (levels plus
    ``DATABASE_VERSION``) is written to ``level_db.pickle``.
    """
    print("Generating or regenerating level database.")
    print("This may take a long time if a lot of files need to be scanned.")
    paths = [p.strip() for p in find(input_path)]
    levels = []

    if paths:  # avoids dividing by zero when nothing was found
        ratio = 100.0 / len(paths)
        last_percent = 0
        pool = Pool(processes=4)
        try:
            results = pool.imap_unordered(process_level, paths)
            for processed, data in enumerate(results, 1):
                percent = int(processed * ratio)
                if percent > last_percent:
                    last_percent = percent
                    print("... {}%".format(percent))
                if data:
                    levels.append(data)
            pool.close()
        except BaseException:
            pool.terminate()
            raise
        finally:
            pool.join()

    db = {
        "levels" : levels,
        "version" : DATABASE_VERSION,
    }
    # Pickles are binary data; text mode corrupts them on some platforms
    # and is rejected outright on Python 3.
    with open("level_db.pickle", "wb") as outfile:
        pickle.dump(db, outfile)
def _read_multi(self, graphs, n_jobs, batch_size):
    """
    like read_single but with multiple processes

    The argument batches built by ``self._multi_process_argbuilder`` are
    passed to ``extract_cips`` in a process pool, and every cip found in
    the results is stored with ``self._add_core_interface_data``.

    :param graphs: forwarded to ``self._multi_process_argbuilder``.
    :param n_jobs: number of worker processes; for values of 1 or less the
        pool is created with its default size.
    :param batch_size: forwarded to ``self._multi_process_argbuilder``.
    """
    if n_jobs > 1:
        pool = Pool(processes=n_jobs)
    else:
        pool = Pool()
    # extract_c_and_i = lambda batch,args: [ extract_cores_and_interfaces( [y]+args ) for y in batch ]
    results = pool.imap_unordered(extract_cips, self._multi_process_argbuilder(graphs, batch_size=batch_size))
    # the resulting cips can now be put into the grammar
    jobs_done = 0
    for batch in results:
        for exci in batch:
            if exci:  # exci might be None because the grouper fills up with empty problems
                for exci_result_per_node in exci:
                    for cip in exci_result_per_node:
                        self._add_core_interface_data(cip)
            # NOTE(review): counted once per problem in a batch (empty ones
            # included); the nesting of these lines was reconstructed from
            # flattened source -- confirm against the original.
            jobs_done += 1
            # Stop the workers as soon as the expected number of jobs has
            # been seen (only when ``self.mp_prepared`` is set).
            if jobs_done == self.multiprocess_jobcount and self.mp_prepared:
                pool.terminate()
    pool.close()
    pool.join()
def parallel_iter(processes, f, inputs):
    """
    Return a parallel iterator.

    INPUT:

    - ``processes`` -- integer; ``0`` means use ``ncpus.ncpus()`` processes
    - ``f`` -- function
    - ``inputs`` -- an iterable of pairs (args, kwds)

    OUTPUT:

    - iterator over values of ``f`` at ``args,kwds`` in some random order.

    The worker pool is shut down when the iterator is exhausted, and
    terminated if the iterator is abandoned or an error occurs.

    EXAMPLES::

        sage: def f(x): return x+x
        sage: import sage.parallel.multiprocessing_sage
        sage: v = list(sage.parallel.multiprocessing_sage.parallel_iter(2, f, [((2,), {}), ((3,),{})]))
        sage: v.sort(); v
        [(((2,), {}), 4), (((3,), {}), 6)]
    """
    from twisted.internet import reactor   # do not delete this (!)  -- see trac 8785

    if processes == 0:
        processes = ncpus.ncpus()
    p = Pool(processes)
    try:
        fp = pickle_function(f)
        result = p.imap_unordered(call_pickled_function,
                                  [(fp, t) for t in inputs])
        for res in result:
            yield res
        p.close()
    except BaseException:
        # Covers GeneratorExit too: a consumer that stops early must not
        # leave the worker processes running.
        p.terminate()
        raise
    finally:
        p.join()
def main():
    """Fetch data for every line of ``sys.argv[1]`` and write TSV rows to ``sys.argv[2]``.

    Each input line is processed by ``getData`` in a 200-process pool that
    shares a proxy list and a lock through a ``Manager``.  A result of -2
    or -1 produces a ``missed2`` / ``missed1`` row; otherwise the price,
    shipping and output fields are written.  If a row cannot be formatted
    the input line is written back unchanged.
    """
    p = Pool(200)
    mgr = Manager()
    # NOTE(security): proxy credentials are hard-coded in this literal;
    # they should be moved to configuration outside the source tree.
    with open('ip1024', 'r') as f:
        proxyDictList = ['proxy_user:wYFzbwTfpR@'+line.strip()+':17102'
                         for line in f.readlines()]
    http = httpUtil(iplist=mgr.list(proxyDictList), key1='Price + Shipping')
    partial_merge = functools.partial(getData, http=http, mutex=mgr.Lock())

    try:
        # The input is only read, so it is opened read-only and closed
        # deterministically (it used to be opened "r+" and never closed).
        with open(sys.argv[1], 'r') as file_in, \
                open(sys.argv[2], 'w') as file_out:
            for output, line in p.imap_unordered(partial_merge, file_in):
                if output == -2:
                    missed = 'missed2'
                elif output == -1:
                    missed = 'missed1'
                else:
                    missed = None
                if missed is not None:
                    file_out.write("%s\t%s\t%s\t%s\t%s\n" % (
                        line.rstrip('\r\n'), missed, 'N', 'N', '-4'))
                    continue
                try:
                    s = str("%s\t%s\t%s\t%s\t%s \n" % (
                        line.rstrip('\r\n'), output['price1'],
                        output['price2'], output['shipping'],
                        output['output']))
                    file_out.write(s)
                except Exception:
                    # Best effort: keep the raw line when the result does
                    # not have the expected fields.
                    file_out.write(line)
        p.close()
    except BaseException:
        p.terminate()
        raise
    finally:
        p.join()
def run(asmb_fn, options):
    """Build one ``Fitter`` per component of the assembly and run them all.

    When ``multiprocessing`` is available (``multiproc_exception`` is None)
    and ``options.cpus > 1`` the fitters are run through ``do_work`` in a
    process pool; otherwise each fitter is run in this process.  If several
    CPUs were requested but ``multiprocessing`` is missing, a warning is
    written to stderr once and ``options.cpus`` is reset to 1.
    """
    # ``options.cpus`` is only modified below when multiprocessing is
    # unavailable, so this decision stays valid for the whole call.
    use_pool = multiproc_exception is None and options.cpus > 1
    if use_pool:
        work_units = []

    asmb_input = IMP.multifit.read_settings(asmb_fn)
    asmb_input.set_was_used(True)
    em_map = asmb_input.get_assembly_header().get_dens_fn()
    resolution = asmb_input.get_assembly_header().get_resolution()
    spacing = asmb_input.get_assembly_header().get_spacing()
    origin = asmb_input.get_assembly_header().get_origin()

    for i in range(asmb_input.get_number_of_component_headers()):
        fits_fn = asmb_input.get_component_header(i).get_transformations_fn()
        pdb_fn = asmb_input.get_component_header(i).get_filename()
        f = Fitter(em_map, spacing, resolution, origin,
                   asmb_input.get_assembly_header().get_threshold(),
                   pdb_fn, fits_fn, options.angle, options.num,
                   options.angle_voxel)
        if use_pool:
            work_units.append(f)
        else:
            if options.cpus > 1:
                options.cpus = 1
                sys.stderr.write(""" The Python 'multiprocessing' module (available in Python 2.6 and later) is needed to run on multiple CPUs, and could not be found (Python error: '%s'). Running on a single processor.""" % multiproc_exception + "\n")
            f.run()

    if use_pool:
        # No point in spawning more processes than components
        nproc = min(options.cpus,
                    asmb_input.get_number_of_component_headers())
        p = Pool(processes=nproc)
        try:
            # Results are not used; the list() only waits for completion.
            list(p.imap_unordered(do_work, work_units))
            p.close()
        except BaseException:
            p.terminate()
            raise
        finally:
            p.join()
def main():
    """Apply ``process_distance`` to every input record and write a CSV.

    Command line: ``<shape_file> <shape_polygon_field> <input_csv_file>
    <output_csv_file>``.  Progress is printed every two seconds while the
    pool is working; ``None`` results are dropped before writing.
    """
    total = len(sys.argv)
    # Four arguments are read below (argv[1]..argv[4]); the old check
    # ``total < 3`` let incomplete command lines through to an IndexError.
    if total < 5:
        print("Utilization: python apply_distance.py <shape_file> <shape_polygon_field> <input_csv_file> <output_csv_file>")
        exit(0)

    pool = Pool(processes=cpu_count())
    idata = read_input(str(sys.argv[1]), str(sys.argv[2]), str(sys.argv[3]))
    num_tasks = len(idata)

    # imap
    responses = pool.imap_unordered(process_distance, idata)
    while True:
        # NOTE: ``_index`` is a private attribute of the result iterator
        # (number of results received so far); it is polled so progress can
        # be shown without consuming the results.
        completed = responses._index
        if completed == num_tasks:
            break
        percent = (float(completed)/float(num_tasks))*100
        print(" ".join(["%.3f" % percent, " % complete. ", "Waiting for",
                        str(num_tasks-completed), "tasks to complete..."]))
        time.sleep(2)
    pool.close()
    pool.join()

    responses = [x for x in responses if x is not None]
    idata = write_to_csv(str(sys.argv[4]), responses)
def process_sessions_real(coordinators, updates_directory, index_filename,
                          pickle_root, result_pickle_root, num_workers=None):
    """Process every session in the index and merge the resulting contexts.

    With ``num_workers == 0`` sessions are processed one by one in this
    process; any other value runs them in a pool of that many workers.
    Each processed session yields a pickle path whose context is loaded
    and merged into a ``GlobalContext``, which is finally handed to every
    coordinator's ``finished_processing``.
    """
    parallel = num_workers != 0
    if parallel:
        pool = Pool(processes=num_workers)

    session_context_manager = SessionContextManager()
    session_context_manager.declare_persistent_state(
        'filenames_processed', set, None)
    session_context_manager.declare_persistent_state(
        'last_sequence_number_processed', return_negative_one, None)
    for coordinator in coordinators:
        for name, funcs in coordinator.persistent_state.iteritems():
            init_func, merge_func = funcs
            session_context_manager.declare_persistent_state(
                name, init_func, merge_func)
        for name, funcs in coordinator.ephemeral_state.iteritems():
            init_func, merge_func = funcs
            session_context_manager.declare_ephemeral_state(
                name, init_func, merge_func)

    print('Preparing processors')
    process_args = []
    index = UpdatesIndex(index_filename)
    for session in index.sessions:
        processors = [coordinator.create_processor(session)
                      for coordinator in coordinators]
        update_files = index.session_data(session)
        process_args.append((session, session_context_manager, pickle_root,
                             result_pickle_root, processors, update_files,
                             updates_directory))

    print('Processing sessions')
    global_context = GlobalContext()
    if parallel:
        pickle_paths = pool.imap_unordered(process_session_wrapper,
                                           process_args)
    else:
        # Lazy, so each session is still processed and merged in turn.
        pickle_paths = (process_session_wrapper(job) for job in process_args)

    for pickle_path in pickle_paths:
        session_context = session_context_manager.load_context(pickle_path)
        session_context_manager.merge_contexts(session_context,
                                               global_context)
        del session_context

    if parallel:
        pool.close()
        pool.join()

    print('Post-processing')
    for coordinator in coordinators:
        coordinator.finished_processing(global_context)
def get_kmer_counts(input, output, k, ns, nprocs, verbose):
    """Analyse kmers. Multiprocessing enabled

    Reads FASTQ records from ``input``, converts each to k-mer ids with
    ``seq2mers`` in a process pool and counts the ids.  A histogram
    "count -> number of k-mers with that count" (counts starting at 1, at
    least 100 bins) is written to ``output`` as tab-separated lines and
    returned as a list.  ``verbose`` is accepted but not used here.
    """
    #define base2digit dict for 4-char seq
    base2digit = {"A": "0", "C": "1", "G": "2", "T": "3"}
    if ns:
        #change to 5-char seq if Ns in seq
        base2digit = {"A": "0", "C": "1", "G": "2", "N": "3", "T": "4"}
    #init mer counts #255 for uint8 #65,535 for uint16 or #4,294,967,295 for uint32
    # Floor division keeps the size an integer on Python 3 as well.
    merCounts = np.zeros(len(base2digit)**k//2, dtype='uint16')

    #start pool #maxtasksperchild=1000)
    p = Pool(nprocs, initializer=init_args, initargs=(k, ns, base2digit))
    i = 0  # stays 0 when the input holds no reads
    try:
        #process reads
        for i, ids in enumerate(p.imap_unordered(seq2mers,
                                                 SeqIO.parse(input, 'fastq'),
                                                 chunksize=100), 1):
            if not i % 10000:
                sys.stderr.write(" %s [%s Mb]\r"%(i, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024))
            for mid in ids:
                merCounts[mid] += 1
        p.close()
    except BaseException:
        p.terminate()
        raise
    finally:
        p.join()
    sys.stderr.write(" %s [%s Mb]\n"%(i, resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024))

    #get mer freq
    maxCount = max(int(merCounts.max()), 100)
    # bincount()[c] is the number of k-mers seen exactly c times.  Bin 0
    # (k-mers never seen) is dropped: it used to be added to the *last* bin
    # through the index ``0 - 1``.
    histogram = np.bincount(merCounts, minlength=maxCount + 1)
    occurencies = histogram[1:maxCount + 1].tolist()

    #write to file
    output.write("\n".join("%s\t%s"%xy for xy in enumerate(occurencies,1))+"\n")
    return occurencies
def initJobsDirs(jobs_dir, include_wt, displacement, debug):
    """ initializes the jobDirs variable

    Walks ``jobs_dir`` (following links) and runs ``checkJobsDir`` for every
    directory in a pool with one process per CPU, showing a progress bar on
    stderr.  The results of ``checkJobsDir`` are not used here; presumably
    it records its findings itself -- verify against its definition.
    """
    # file name of each pdb path -> protein
    pdbToProtein = {
        pdbPath.split('/')[-1]: protein
        for protein, pdbPath in pdbs.iteritems()
    }
    jobs = [
        (pdbToProtein, root, files, include_wt, displacement, debug)
        for root, dirs, files in os.walk(jobs_dir, followlinks=True)
    ]

    doers = Pool(cpu_count())
    try:
        print("Loading {} evaluators...{}".format(len(jobs), datetime.datetime.now()))
        sys.stdout.flush()
        with click.progressbar(doers.imap_unordered(checkJobsDir, jobs),
                               length=len(jobs), label='Running',
                               file=sys.stderr) as progbar:
            for j in progbar:
                pass
        doers.close()
    except BaseException:
        doers.terminate()
        raise
    finally:
        # The pool was previously never closed, leaking its workers.
        doers.join()
    print("Done...{}".format(datetime.datetime.now()))
def main(argv):
    """Go Main Go.

    ``argv[1]`` is the scenario id; the dates are derived from ``argv`` by
    ``determine_dates``.  One ``do_huc12`` job per HUC12 is run in a process
    pool, the insert/skip/delete counts are summed and printed, and the
    scenario metadata is updated.
    """
    scenario = int(argv[1])
    lengths = load_lengths(scenario)
    # Use the arguments we were given rather than the global ``sys.argv``.
    dates = determine_dates(argv)
    huc12s = find_huc12s(scenario)
    precip = load_precip(dates)
    jobs = [[scenario, huc12, lengths[huc12], dates, precip[huc12]]
            for huc12 in huc12s]

    # Begin the processing work now!
    # NB: Usage of a ThreadPool here ended in tears (so slow)
    pool = Pool()
    totalinserts = 0
    totalskipped = 0
    totaldeleted = 0
    try:
        for huc12, inserts, skipped, deleted in tqdm(
                pool.imap_unordered(do_huc12, jobs), total=len(jobs),
                disable=(not sys.stdout.isatty())):
            if inserts is None:
                print("ERROR: huc12 %s returned 0 data" % (huc12,))
                continue
            totalinserts += inserts
            totalskipped += skipped
            totaldeleted += deleted
        pool.close()
    except BaseException:
        pool.terminate()
        raise
    finally:
        pool.join()

    print("env2database.py inserts: %s skips: %s deleted: %s" % (totalinserts,
                                                                 totalskipped,
                                                                 totaldeleted))
    update_metadata(scenario, dates)
def subconfigure(args):
    """Run the requested subconfigures in parallel.

    Subconfigures come from the positional arguments plus the lines of the
    ``--list`` file; entries found in the ``--skip`` file are removed.
    Output of each run is printed as it completes.  Returns the highest
    return code seen (0 when there is nothing to run); results stop being
    read after the first non-zero code.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--list', type=str,
                        help='File containing a list of subconfigures to run')
    parser.add_argument('--skip', type=str,
                        help='File containing a list of Subconfigures to skip')
    parser.add_argument('subconfigures', type=str, nargs='*',
                        help='Subconfigures to run if no list file is given')
    args, others = parser.parse_known_args(args)

    subconfigures = args.subconfigures
    if args.list:
        # ``with`` closes the handle; it used to be left to the GC.
        with open(args.list, 'rb') as fh:
            subconfigures.extend(fh.read().splitlines())
    if args.skip:
        with open(args.skip, 'rb') as fh:
            skips = set(fh.read().splitlines())
        subconfigures = [s for s in subconfigures if s not in skips]

    if not subconfigures:
        return 0

    ret = 0
    # One would think using a ThreadPool would be faster, considering
    # everything happens in subprocesses anyways, but no, it's actually
    # slower on Windows. (20s difference overall!)
    pool = Pool(min(len(subconfigures), cpu_count()))
    for relobjdir, returncode, output in \
            pool.imap_unordered(run, subconfigures):
        print(prefix_lines(output, relobjdir))
        sys.stdout.flush()
        ret = max(returncode, ret)
        if ret:
            break
    # close()/join() still waits for subconfigures that are already queued.
    pool.close()
    pool.join()
    return ret
def func(n_cores = 1, **kwargs):
    """
    Function for command line action

    Blocks produced by the block generator are executed (in this process,
    or through ``pool_director`` in a process pool) and passed to the block
    accumulator, which is finally handed to a ``Block_Acceptor``.

    **Arguments:**
      :*n_cores*: Number of cores to use
    """
    from multiprocessing import Pool

    generator = block_generator_class(**kwargs)
    accumulator = block_accumulator_class(
        preexisting_slice = generator.preexisting_slice,
        incoming_slice = generator.incoming_slice,
        outputs = generator.outputs,
        **kwargs)

    if n_cores != 1:
        # Parallel (processes)
        pool = Pool(n_cores)
        for finished in pool.imap_unordered(pool_director, generator):
            accumulator.send(finished)
        pool.close()
        pool.join()
    else:
        # Serial
        for pending in generator:
            pending()
            accumulator.send(pending)
    accumulator.close()

    acceptor = Block_Acceptor(outputs = accumulator.outputs, **kwargs)
    acceptor.send(accumulator)
    acceptor.close()
def __iter__(self):
    '''Return OcgCollection objects from the cache or directly from
    source data.

    In serial mode ``get_collection`` is applied lazily to each argument
    set; otherwise it runs in a pool of ``self.nprocs`` processes and the
    collections are yielded in completion order.

    yields OcgCollection'''
    ## simple iterator for serial operations
    if self.serial:
        for coll in itertools.imap(get_collection, self._iter_proc_args_()):
            yield coll
        return

    ## use a multiprocessing pool returning unordered geometries
    ## for the parallel case
    pool = Pool(processes=self.nprocs)
    try:
        for coll in pool.imap_unordered(get_collection,
                                        self._iter_proc_args_()):
            yield coll
        pool.close()
    except BaseException:
        ## also reached when the consumer stops iterating early
        pool.terminate()
        raise
    finally:
        ## the pool used to be abandoned without close/join
        pool.join()
def cluster_estimator_similarity(_c_trace):
    """Sum ``pararell_calc_ne`` over consecutive slices of ``_c_trace``.

    The trace is cut at multiples of the module-level
    ``pairwise_number_in_pack`` up to ``number_of_samples``; the slices are
    evaluated in a 6-process pool, the elapsed time is printed and the
    summed matrix is returned.
    """
    from multiprocessing import Pool
    pool = Pool(processes=6)

    bins = range(0, number_of_samples, pairwise_number_in_pack)
    if bins[-1] != number_of_samples:
        # make sure the last (shorter) slice is covered as well
        bins.append(number_of_samples)
    a = [_c_trace[lower:upper] for lower, upper in zip(bins[:-1], bins[1:])]

    import pararell_methods
    start = time.time()
    partial_matrices = pool.imap_unordered(pararell_methods.pararell_calc_ne, a)
    total_matrix = sum(partial_matrices)
    pool.close()
    pool.join()
    end = time.time()
    print(end - start)
    return total_matrix
def calculateCorrelationforOntology(aspect, matrix_type):
    """Correlate real protein similarities with embedding distances.

    Loads the similarity matrix selected by ``matrix_type`` ("All", "500",
    "Sparse" or "200") for ``aspect``, builds one task per protein pair
    (the listed coordinates for "Sparse", the upper triangle otherwise)
    and runs ``parallelSimilarity`` on them in a process pool.

    Returns the Spearman correlations of the real values against the
    cosine, Manhattan and Euclidean distances, in that order.
    """
    print("\n\nSemantic similarity correlation calculation for aspect:" +
          aspect + " using matrix:" + matrix_type + " ...\n")

    #Clear lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []

    similarityMatrixNameDict = {
        "All": "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix.csv",
        "500": "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv",
        "Sparse": "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv",
        "200": "../data/preprocess/human_" + aspect + "_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv",
    }
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]

    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(
        human_proteinSimilarityMatrix.columns, inplace=True)
    proteinList = human_proteinSimilarityMatrix.columns

    #proteinListNew is referenced using Manager
    for prot in proteinList:
        proteinListNew.append(prot)

    sparse = matrix_type == "Sparse"
    if sparse:
        protParamList = np.load(
            "../data/auxilary_input/SparsifiedSimilarityCoordinates_" +
            aspect + "_for_highest_500.npy")
    else:
        indices = range(len(proteinList))
        protParamList = list(itertools.product(indices, indices))

    # Prepare parameters for parallel processing; these parameters will be
    # used concurrently by different processes.
    protParamListNew = []
    for tup in tqdm(protParamList):
        i = tup[0]
        j = tup[1]
        # Sparse: every listed coordinate; otherwise upper triangle only.
        if sparse or j > i:
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1, protein2]
            protParamListNew.append((tup[0], tup[1], aspect, real))

    total_task_num = len(protParamListNew)
    pool = Pool()
    # Only the value of the last finished task is kept after this loop.
    similarity_listRet = []
    for similarity_listRet in tqdm(
            pool.imap_unordered(parallelSimilarity, protParamListNew),
            total=total_task_num, position=0, leave=True):
        pass
    pool.close()
    pool.join()

    # Columns of each entry: real, cosine, manhattan, euclidian.
    distance_lists = [[value[column] for value in similarity_listRet]
                      for column in range(4)]
    (real_distance_list, cosine_distance_list,
     manhattan_distance_list, euclidian_distance_list) = distance_lists

    if detailed_output:
        report_detailed_distance_scores(representation_name, matrix_type,
                                        aspect, distance_lists)

    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    return (cosineCorr, manhattanCorr, euclidianCorr)
'ngram_range': [(1, 2)] } models = {} if args.nproc == 1: for idx, ambig_terms_batch in \ enumerate(batch_iter(all_ambigs_pmids, 10)): pickle_name = 'gilda_ambiguities_hgnc_mesh_%d.pkl' % idx models = learn_batch(ambig_terms_batch) with open(pickle_name, 'wb') as fh: pickle.dump(models, fh) else: pool = Pool(args.nproc) fun = functools.partial(learn_model, params=param_grid) pkl_idx = 0 models = {} for count, model in enumerate( pool.imap_unordered(fun, all_ambigs_pmids, chunksize=10)): print('#### %d ####' % count) if model is None: print('Model is None, skipping') else: models[model['ambig'][0].text] = model if (count + 1) % 100 == 0: pickle_name = 'gilda_ambiguities_hgnc_mesh_%d.pkl' % pkl_idx with open(pickle_name, 'wb') as fh: pickle.dump(models, fh) pkl_idx += 1 models = {} pool.close() pool.join()
def main():
    """Basecall the selected fast5 reads and write them as FASTA or FASTQ.

    Arguments are taken from ``get_parser()``.  Reads are dispatched to a
    pool of ``args.jobs`` worker processes (each initialised through
    ``worker_init``) and the calls are written to ``args.output`` in the
    order in which the workers finish.  Progress and throughput statistics
    are reported on stderr.
    """
    args = get_parser().parse_args()

    # TODO convert to logging
    sys.stderr.write("* Initializing reads file search.\n")
    fast5_reads = fast5utils.iterate_fast5_reads(
        args.input_folder, limit=args.limit,
        strand_list=args.input_strand_list, recursive=args.recursive)

    if args.scaling is not None:
        sys.stderr.write("* Loading read scaling parameters from {}.\n".format(
            args.scaling))
        all_read_params = get_per_read_params_dict_from_tsv(args.scaling)
        # The reads are traversed twice below (once to collect ids, once to
        # filter).  Materialise them first so that a one-shot iterator is
        # not exhausted by the first pass, which would drop every read.
        fast5_reads = list(fast5_reads)
        input_read_ids = frozenset(rec[1] for rec in fast5_reads)
        scaling_read_ids = frozenset(all_read_params.keys())
        sys.stderr.write("* {} / {} reads have scaling information.\n".format(
            len(input_read_ids & scaling_read_ids), len(input_read_ids)))
        fast5_reads = [
            rec for rec in fast5_reads if rec[1] in scaling_read_ids
        ]
    else:
        all_read_params = {}

    sys.stderr.write("* Calling reads.\n")
    nbase, ncalled, nread, nsample = 0, 0, 0, 0
    t0 = time.time()
    progress = Progress(quiet=args.quiet)
    startcharacter = '@' if args.fastq else '>'
    initargs = [
        args.device, args.model, args.chunk_size, args.overlap,
        all_read_params, args.alphabet, args.max_concurrent_chunks,
        args.fastq, args.qscore_scale, args.qscore_offset, args.beam,
        args.posterior, args.temperature
    ]
    pool = Pool(args.jobs, initializer=worker_init, initargs=initargs)
    try:
        with open_file_or_stdout(args.output) as fh:
            for read_id, basecall, qstring, read_nsample in \
                    pool.imap_unordered(worker, fast5_reads):
                if basecall is not None and len(basecall) > 0:
                    fh.write("{}{}\n{}\n".format(
                        startcharacter, read_id,
                        basecall[::-1] if args.reverse else basecall))
                    nbase += len(basecall)
                    ncalled += 1
                    if args.fastq:
                        fh.write("+\n{}\n".format(
                            qstring[::-1] if args.reverse else qstring))
                # Every read counts towards the totals, called or not.
                nread += 1
                nsample += read_nsample
                progress.step()
        pool.close()
        pool.join()
    finally:
        # Reap the workers even when reading or writing failed part-way.
        pool.terminate()

    total_time = time.time() - t0

    # Report the real elapsed time; truncating it to an int made the
    # two-decimal format meaningless.
    sys.stderr.write("* Called {} reads in {:.2f}s\n".format(
        nread, total_time))
    sys.stderr.write("* {:7.2f} kbase / s\n".format(
        nbase / total_time / 1000.0))
    sys.stderr.write("* {:7.2f} ksample / s\n".format(
        nsample / total_time / 1000.0))
    sys.stderr.write("* {} reads failed.\n".format(nread - ncalled))
def main():
    """Score prediction files with ROUGE and optionally record the best run.

    Prediction files are selected by the glob ``args.pred``; ``.post`` and
    ``.rouge`` side files are ignored.  Each remaining file is scored by
    ``process_eval`` in a worker pool.  In the non-perl mode the scores are
    printed and cached next to the prediction as ``<file>.rouge``.  With
    ``args.save_best`` the best ROUGE-2 F score per directory is written to
    ``save_best.<split>`` when it beats the stored value.
    """
    if args.perl:
        eval_fn_list = list(glob.glob(args.pred))
    else:
        # With lazy evaluation, files that already have a ".rouge" result
        # next to them are skipped.
        eval_fn_list = [
            eval_fn for eval_fn in glob.glob(args.pred)
            if not (args.lazy_eval and Path(eval_fn + ".rouge").exists())
        ]
    eval_fn_list = [
        fn for fn in eval_fn_list if not fn.endswith(('.post', '.rouge'))
    ]

    if args.only_eval_best:
        # Keep only the file whose epoch matches the one recorded on the
        # second line of each directory's save_best.dev.
        best_epoch_dict = {}
        for dir_path in set(Path(fn).parent for fn in eval_fn_list):
            fn_save = os.path.join(dir_path, 'save_best.dev')
            if Path(fn_save).exists():
                with open(fn_save, 'r') as f_in:
                    __, o_name, __ = f_in.read().strip().split('\n')
                    epoch = o_name.split('.')[1]
                    best_epoch_dict[dir_path] = epoch
        new_eval_fn_list = []
        for fn in eval_fn_list:
            dir_path = Path(fn).parent
            if dir_path in best_epoch_dict:
                if Path(fn).name.split('.')[1] == best_epoch_dict[dir_path]:
                    new_eval_fn_list.append(fn)
        eval_fn_list = new_eval_fn_list

    logger.info("***** Evaluation: %s *****", ','.join(eval_fn_list))
    num_pool = max(1, min(args.processes, len(eval_fn_list)))
    # The message must be a format string; passing the bare numbers made
    # logging emit a formatting error instead of the values.
    logger.info("processes: %s, files: %s, pool size: %s",
                args.processes, len(eval_fn_list), num_pool)

    p = Pool(num_pool)
    try:
        # Results arrive in completion order; sort by file name so that the
        # report is deterministic.
        r_list = sorted(p.imap_unordered(process_eval, eval_fn_list),
                        key=lambda x: x[0])
        p.close()
        p.join()
    finally:
        # Make sure the workers are reaped even if scoring failed.
        p.terminate()

    rg2_dict = {}
    for fn, scores in r_list:
        logger.info(fn)
        if args.perl:
            print(rouge_results_to_str(scores))
        else:
            rg2_dict[fn] = scores['rouge-2']['f']
            print("ROUGE-1: {}\tROUGE-2: {}\tROUGE-L: {}\n".format(
                scores['rouge-1']['f'], scores['rouge-2']['f'],
                scores['rouge-l']['f']))
            with open(fn + ".rouge", 'w') as f_out:
                f_out.write(
                    json.dumps({
                        'rg1': scores['rouge-1']['f'],
                        'rg2': scores['rouge-2']['f']
                    }))

    if args.save_best:
        # find best results
        group_dict = {}
        for k, v in rg2_dict.items():
            d_name, o_name = Path(k).parent, Path(k).name
            if (d_name not in group_dict) or (v > group_dict[d_name][1]):
                group_dict[d_name] = (o_name, v)
        # compare and save the best result
        for k, v in group_dict.items():
            fn = os.path.join(k, 'save_best.' + args.split)
            o_name_s, rst_s = v
            should_save = True
            if Path(fn).exists():
                with open(fn, 'r') as f_in:
                    # The stored score is the last line of the file.
                    rst_f = float(f_in.read().strip().split('\n')[-1])
                if rst_s <= rst_f:
                    should_save = False
            if should_save:
                with open(fn, 'w') as f_out:
                    f_out.write('{0}\n{1}\n{2}\n'.format(k, o_name_s, rst_s))
def update(self, tree, parallel=True):
    # type: (Iterable[Tuple[Text, Optional[Text], bool]], bool) -> bool
    """Update the manifest given an iterable of items that make up the
    updated manifest.

    Each item of ``tree`` is a ``(path, file_hash, updated)`` tuple.  When
    ``updated`` is true the path is (re)examined: a SourceFile is built for
    it and the manifest items are recomputed if the path is new or its hash
    changed.  When ``updated`` is false the existing entry is kept as is.
    This unusual API is designed as an optimisation meaning that SourceFile
    items need not be constructed in the case we are not updating a path,
    but the absence of an item from the iterator may be used to remove
    defunct entries from the manifest.

    :param tree: iterable of ``(path, file_hash, updated)`` tuples.
    :param parallel: allow a process pool for large updates.
    :return: True if the manifest data changed.
    :raises InvalidCacheError: if a path flagged as not updated is missing
        from the manifest.
    """
    logger = get_logger()

    changed = False

    # Create local variable references to these dicts so we avoid the
    # attribute access in the hot loop below
    data = self._data

    types = data.type_by_path()
    remaining_manifest_paths = set(types)

    to_update = []

    for path, file_hash, updated in tree:
        path_parts = tuple(path.split(os.path.sep))
        is_new = path_parts not in remaining_manifest_paths

        if not updated and is_new:
            # This is kind of a bandaid; if we ended up here the cache
            # was invalid but we've been using it anyway. That's obviously
            # bad; we should fix the underlying issue that we sometimes
            # use an invalid cache. But at least this fixes the immediate
            # problem
            raise InvalidCacheError

        if not updated:
            remaining_manifest_paths.remove(path_parts)
        else:
            assert self.tests_root is not None
            source_file = SourceFile(self.tests_root,
                                     path,
                                     self.url_base,
                                     file_hash)

            hash_changed = False  # type: bool

            if not is_new:
                if file_hash is None:
                    file_hash = source_file.hash
                remaining_manifest_paths.remove(path_parts)
                old_type = types[path_parts]
                old_hash = data[old_type].hashes[path_parts]
                if old_hash != file_hash:
                    hash_changed = True
                    del data[old_type][path_parts]

            if is_new or hash_changed:
                to_update.append(source_file)

    if to_update:
        logger.debug("Computing manifest update for %s items" % len(to_update))
        changed = True

    # 25 items was derived experimentally (2020-01) to be approximately the
    # point at which it is quicker to create a Pool and parallelize update.
    pool = None
    try:
        if parallel and len(to_update) > 25 and cpu_count() > 1:
            # On Python 3 on Windows, using >= MAXIMUM_WAIT_OBJECTS processes
            # causes a crash in the multiprocessing module. Whilst this enum
            # can technically have any value, it is usually 64. For safety,
            # restrict manifest regeneration to 48 processes on Windows.
            #
            # See https://bugs.python.org/issue26903 and https://bugs.python.org/issue40263
            processes = cpu_count()
            if sys.platform == "win32" and processes > 48:
                processes = 48
            pool = Pool(processes)

            # chunksize set > 1 when more than 10000 tests, because
            # chunking is a net-gain once we get to very large numbers
            # of items (again, experimentally, 2020-01)
            chunksize = max(1, len(to_update) // 10000)
            logger.debug("Doing a multiprocessed update. CPU count: %s, "
                         "processes: %s, chunksize: %s" % (cpu_count(), processes, chunksize))
            results = pool.imap_unordered(compute_manifest_items,
                                          to_update,
                                          chunksize=chunksize
                                          )  # type: Iterator[Tuple[Tuple[Text, ...], Text, Set[ManifestItem], Text]]
        else:
            results = map(compute_manifest_items, to_update)

        for result in results:
            rel_path_parts, new_type, manifest_items, file_hash = result
            data[new_type][rel_path_parts] = manifest_items
            data[new_type].hashes[rel_path_parts] = file_hash
    finally:
        # Make sure to terminate the Pool, to avoid hangs on Python 3.
        # Doing it in a finally clause also reaps the workers when a result
        # could not be computed or stored.
        # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool
        if pool is not None:
            pool.terminate()

    if remaining_manifest_paths:
        # Whatever was not seen in the tree is defunct: drop it everywhere.
        changed = True
        for rel_path_parts in remaining_manifest_paths:
            for test_data in data.values():
                if rel_path_parts in test_data:
                    del test_data[rel_path_parts]

    return changed
class BatchRunnerMP(BatchRunner):
    """ Child class of BatchRunner, extended with multiprocessing support. """

    def __init__(self, model_cls, nr_processes=None, **kwargs):
        """ Create a new BatchRunnerMP for a given model with the given
        parameters.

        model_cls: The class of model to batch-run.
        nr_processes: int
                      the number of separate processes the BatchRunner
                      should start, all running in parallel.  When None,
                      the value returned by cpu_count() is used.
        kwargs: the kwargs required for the parent BatchRunner class
        """
        if nr_processes is None:
            # identify the number of processors available on users machine
            available_processors = cpu_count()
            self.processes = available_processors
            print("BatchRunner MP will use {} processors.".format(self.processes))
        else:
            self.processes = nr_processes

        super().__init__(model_cls, **kwargs)
        self.pool = Pool(self.processes)

    def _make_model_args_mp(self):
        """Prepare all combinations of parameter values for `run_all`

        Due to multiprocessing requirements of @StaticMethod takes different
        input, hence the similar function.

        Returns:
            Tuple ``(all_kwargs, total_iterations)`` where ``all_kwargs`` is
            a list of lists with the form:
            [[model_object, dictionary_of_kwargs, max_steps, iterations]]
        """
        total_iterations = self.iterations
        all_kwargs = []

        count = len(self.parameters_list)
        if count:
            for params in self.parameters_list:
                kwargs = params.copy()
                kwargs.update(self.fixed_parameters)
                # run each iterations specific number of times
                for iteration in range(self.iterations):
                    kwargs_repeated = kwargs.copy()
                    all_kwargs.append(
                        [self.model_cls, kwargs_repeated, self.max_steps, iteration]
                    )

        elif len(self.fixed_parameters):
            count = 1
            # Only fixed parameters: still emit one full work item per
            # iteration.  A bare kwargs dict cannot be unpacked by
            # `_run_wrappermp`, which indexes positions 0..3.
            for iteration in range(self.iterations):
                all_kwargs.append(
                    [self.model_cls, self.fixed_parameters.copy(),
                     self.max_steps, iteration]
                )

        total_iterations *= count

        return all_kwargs, total_iterations

    @staticmethod
    def _run_wrappermp(iter_args):
        """
        Based on requirement of Python multiprocessing requires @staticmethod
        decorator; this is primarily to ensure functionality on Windows OS
        and does not impact MAC or Linux distros

        :param iter_args: List of arguments for model run
            iter_args[0] = model object
            iter_args[1] = key word arguments needed for model object
            iter_args[2] = maximum number of steps for model
            iter_args[3] = number of time to run model for stochastic/random
                           variation with same parameters
        :return:
            tuple of param values which serves as a unique key for model
            results, and the model object
        """
        model_i, kwargs, max_steps, iteration = iter_args[:4]

        # instantiate version of model with correct parameters
        model = model_i(**kwargs)
        while model.running and model.schedule.steps < max_steps:
            model.step()

        # add iteration number to dictionary to make unique_key
        kwargs["iteration"] = iteration

        # convert kwargs dict to tuple to make consistent
        param_values = tuple(kwargs.values())

        return param_values, model

    def _result_prep_mp(self, results):
        """
        Helper Function

        :param results: Takes results dictionary from Processpool and single
                        processor debug run and fixes format to make
                        compatible with BatchRunner Output
        :updates model_vars and agents_vars so consistent across all
                 batchrunner
        """
        # Take results and convert to dictionary so dataframe can be called
        for model_key, model in results.items():
            if self.model_reporters:
                self.model_vars[model_key] = self.collect_model_vars(model)
            if self.agent_reporters:
                agent_vars = self.collect_agent_vars(model)
                for agent_id, reports in agent_vars.items():
                    agent_key = model_key + (agent_id,)
                    self.agent_vars[agent_key] = reports
            if hasattr(model, "datacollector"):
                if model.datacollector.model_reporters is not None:
                    self.datacollector_model_reporters[model_key] = \
                        model.datacollector.get_model_vars_dataframe()
                if model.datacollector.agent_reporters is not None:
                    self.datacollector_agent_reporters[model_key] = \
                        model.datacollector.get_agent_vars_dataframe()

        # Make results consistent
        if len(self.datacollector_model_reporters.keys()) == 0:
            self.datacollector_model_reporters = None
        if len(self.datacollector_agent_reporters.keys()) == 0:
            self.datacollector_agent_reporters = None

    def run_all(self):
        """
        Run the model at all parameter combinations and store results,
        overrides run_all from BatchRunner.

        The pool is closed at the end, so this method is meant to be called
        once per BatchRunnerMP instance.
        """
        run_iter_args, total_iterations = self._make_model_args_mp()
        # register the process pool and init a queue
        # store results in ordered dictionary
        results = {}

        if self.processes > 1:
            # `total` has to be passed by keyword: tqdm's first positional
            # parameter is the iterable, not the expected count.
            with tqdm(total=total_iterations,
                      disable=not self.display_progress) as pbar:
                for params, model in self.pool.imap_unordered(
                        self._run_wrappermp, run_iter_args):
                    results[params] = model
                    pbar.update()

                self._result_prep_mp(results)
        # For debugging model due to difficulty of getting errors during
        # multiprocessing
        else:
            for run in run_iter_args:
                params, model_data = self._run_wrappermp(run)
                results[params] = model_data

            self._result_prep_mp(results)

        # Close multi-processing
        self.pool.close()

        return (getattr(self, "model_vars", None),
                getattr(self, "agent_vars", None),
                getattr(self, "datacollector_model_reporters", None),
                getattr(self, "datacollector_agent_reporters", None))
def _compute_ricci_curvature_edges(G: nx.Graph, weight="weight", edge_list=[], alpha=0.5, method="OTD",
                                   base=math.e, exp_power=2, proc=cpu_count(), chunksize=None,
                                   cache_maxsize=1000000):
    """
    Compute Ricci curvature for edges in given edge lists.

    :param G: A NetworkX graph.
    :param weight: The edge weight used to compute Ricci curvature. Default: "weight".
    :param edge_list: The list of edges to compute Ricci curvature, set to [] to run for all edges in G.
        Default: []. (The list is only read, never modified.)
    :param alpha: The parameter for the discrete Ricci curvature, range from 0 ~ 1.
        It means the share of mass to leave on the original node.
        E.g. x -> y, alpha = 0.4 means 0.4 for x, 0.6 to evenly spread to x's nbr. Default: 0.5.
    :param method: Transportation method, "OTD" for Optimal Transportation Distance (Default),
        "ATD" for Average Transportation Distance. "Sinkhorn" for OTD approximated Sinkhorn distance.
    :param base: Base variable for weight distribution. Default: math.e.
    :param exp_power: Exponential power for weight distribution. Default: 2.
    :param proc: Number of processor used for multiprocessing.
    :param chunksize: Chunk size for multiprocessing, set None for auto decide. Default: None.
    :param cache_maxsize: Max size for LRU cache for pairwise shortest path computation.
        Set this to None for unlimited cache. Default: 1000000.
    :return: output: A dictionary of edge Ricci curvature. E.g.: {(node1, node2): ricciCurvature}.
    """
    if not nx.get_edge_attributes(G, weight):
        print('Edge weight not detected in graph, use "weight" as default edge weight.')
        for (v1, v2) in G.edges():
            G[v1][v2][weight] = 1.0

    # ---set to global variable for multiprocessing used.---
    global _Gk
    global _alpha
    global _weight
    global _method
    global _base
    global _exp_power
    global _proc
    global _cache_maxsize
    # -------------------------------------------------------

    _Gk = nk.nxadapter.nx2nk(G, weightAttr=weight)
    _alpha = alpha
    _weight = weight
    _method = method
    _base = base
    _exp_power = exp_power
    _proc = proc
    _cache_maxsize = cache_maxsize

    # Construct nx to nk dictionary
    nx2nk_ndict, nk2nx_ndict = {}, {}
    for idx, n in enumerate(G.nodes()):
        nx2nk_ndict[n] = idx
        nk2nx_ndict[idx] = n

    # An empty edge_list means "every edge of G".
    edges = edge_list if edge_list else G.edges()
    args = [(nx2nk_ndict[source], nx2nk_ndict[target]) for source, target in edges]

    # Start compute edge Ricci curvature
    t0 = time.time()

    # Decide chunksize following method in map_async
    if chunksize is None:
        chunksize, extra = divmod(len(args), proc * 4)
        if extra:
            chunksize += 1
        # With no edges the division yields 0, which imap_unordered rejects;
        # the smallest valid chunk size is 1.
        chunksize = max(1, chunksize)

    # Compute Ricci curvature for edges
    p = Pool(processes=_proc)
    try:
        result = list(p.imap_unordered(_wrap_compute_single_edge, args, chunksize=chunksize))
        p.close()
        p.join()
    finally:
        # Reap the workers even if one of the edge computations failed.
        p.terminate()

    # Convert edge index from nk back to nx for final output
    output = {}
    for rc in result:
        for (src, dst), curvature in rc.items():
            output[(nk2nx_ndict[src], nk2nx_ndict[dst])] = curvature

    logger.info("%8f secs for Ricci curvature computation." % (time.time() - t0))

    return output
def downloadReport(i, remove=False):
    '''
    Download the 2017 corporate social responsibility report as a PDF.

    i: position of the company row in ``dfDownload``.
    remove: when the target file already exists, delete it and download it
            again if True; otherwise leave it alone and return None.

    Returns the response object obtained from ``getWeb``, or None when an
    existing file was kept.
    '''
    code = dfDownload["code"].iloc[i]
    title = dfDownload["title"].iloc[i]
    filename = r"./pdf/{}{}.pdf".format(code, title)

    if os.path.exists(filename):
        if not remove:
            print("{} exists".format(filename))
            return None
        os.remove(filename)
        print("{} removed".format(filename))

    url = dfDownload["reportDownload"].iloc[i]
    response = getWeb(url, proxies=proxies, Return="", sleep=True,
                      sleepMultiply=3)
    with open(filename, "wb") as f:
        f.write(response.content)
    print("{} 成功获取".format(filename))
    return response


from multiprocessing import Pool

# Download every row of dfDownload in parallel and collect the responses in
# whatever order the workers finish.
pool = Pool()
res = pool.imap_unordered(downloadReport, range(dfDownload.shape[0]))
resultPDF = list(res)
pool.close()
def db_MapDB(params): params = utils.load_paramDict(params) params['dbtype'] = params.get('dbtype', 'bowtie2') db_columns = [ c for c in params['db_columns'] + params['metadata_columns'] + params['taxa_columns'] if c not in ('sha256') ] assert params.get('seqlist', None) is not None, 'seqlist is required. ' data = utils.load_database(**params) if params['seqlist'] in ('stdin', '-', ''): fin = sys.stdin else: fin = open(params['seqlist']) glist = pd.read_csv(fin, delimiter='\t', dtype='str') fin.close() mapdb = params['MapDB'] mapdb = os.path.join(params['bowtie_db'], mapdb) start_id = 0 indices = {i: 1 for i in glist['index'].tolist()} if len(glob.glob(mapdb + '.*')) > 0: assert params.get('mode', '') in ( 'overwrite', 'append' ), 'Old database with same name present. You have to use a new name with "MapDB=", or choose between "mode=overwrite" and "mode=append".' if params.get('mode', '') == 'overwrite': for fname in glob.glob(mapdb + '.*'): os.unlink(fname) elif params.get('mode', '') == 'append': for fname in glob.glob(mapdb + '.*.taxa.gz'): i = int(fname.rsplit('.', 3)[1]) if i >= start_id: start_id = i + 1 with gzip.open(fname) as fin: for line in fin: indices[line.strip().split()[1]] = 2 data = data.set_index('index', drop=False) data['size'] = data['size'].astype(int) data = data.loc[[i for i, t in indices.iteritems() if t == 1]].sort_values(by=['size'], ascending=[False]) min_file_num = int(np.ceil( np.sum(data['size']).astype(float) / 3800000000)) buckets = [[0, []] for n in xrange(min_file_num)] id = -1 for index, size, file_path, url_path in data[[ 'index', 'size', 'file_path', 'url_path' ]].as_matrix(): size, done = int(size), 0 for id in range(id + 1, len(buckets)) + range(id + 1): b = buckets[id] if b[0] + size <= 3800000000: b[0] += size b[1].append([index, size, file_path, url_path]) done = 1 break if done == 0: buckets.append([size, [[index, size, file_path, url_path]]]) if params['dbtype'] == 'bowtie2': pool = Pool(min(params['n_thread'], 
len(buckets))) result = pool.imap_unordered(create_db, [[ params['bowtie2_build'], mapdb, start_id + id, bucket[1], params['dbtype'] ] for id, bucket in enumerate(buckets)]) else: result = map(create_db, [[ params['malt_build'], mapdb, start_id + id, bucket[1], params['dbtype'] ] for id, bucket in enumerate(buckets)]) for r in result: if r[2] != 0: print 'Database {0}.{1} FAILED with code {2}!'.format(*r) with open(mapdb + '.info', 'w') as fout: for id, bucket in enumerate(buckets): for b, _, _, _ in bucket[1]: fout.write('{0}\t{1}\n'.format(b, id + start_id)) print 'Done' if __name__ == '__main__': db_MapDB( dict([[k.strip() for k in arg.split('=', 1)] for arg in sys.argv[1:]]))
temp = temp[temp.end_sta_lon.apply(lambda x: npy.ceil(x) in [-72,-73,-74,-75])] temp.bike_id.fillna('0') temp.user = temp.user.fillna(method='bfill') #Missing birth fields with 0 temp.birth = temp.birth.fillna(0) temp.gender = temp.gender.fillna(method='ffill') temp.dropna(axis=0) #Write output to a file file_name = str(file).split('.')[0] temp.to_csv(path+file_name+'_cleaned.csv',encoding=ftype,header=True,index=False) if __name__ == '__main__': f = open('C:/Users/Naveen/Downloads/Springboard/GitHub/new_york_citibikes/data/rides/extracted.json','r') file_names = json.load(f) f.close() names = list(file_names.keys()) #Multiprocessing t = time.time() p = Pool() #Display progress for i, _ in enumerate(p.imap_unordered(clean, names), 1): sys.stderr.write('\rdone {0:%}'.format(i/len(names))) p.close() p.join() print("Completed in.....", time.time()-t)
def preprocess():
    """Run preprocessing process and compute statistics for normalizing.

    The dataset named in the configuration is split into train and valid
    parts, features are extracted for both in a process pool, and the
    mel/energy/f0 scalers are fitted incrementally on the training part.
    Training utterances whose extraction failed, or whose energy or f0 is
    entirely zero, are removed from the saved list of training ids.
    """
    config = parse_and_config()

    dataset_processor = {
        "ljspeech": LJSpeechProcessor,
        "kss": KSSProcessor,
        "libritts": LibriTTSProcessor,
        "baker": BakerProcessor,
        "thorsten": ThorstenProcessor,
    }

    dataset_symbol = {
        "ljspeech": LJSPEECH_SYMBOLS,
        "kss": KSS_SYMBOLS,
        "libritts": LIBRITTS_SYMBOLS,
        "baker": BAKER_SYMBOLS,
        "thorsten": THORSTEN_SYMBOLS,
    }

    dataset_cleaner = {
        "ljspeech": "english_cleaners",
        "kss": "korean_cleaners",
        "libritts": None,
        "baker": None,
        "thorsten": "german_cleaners",
    }

    logging.info(f"Selected '{config['dataset']}' processor.")
    processor = dataset_processor[config["dataset"]](
        config["rootdir"],
        symbols=dataset_symbol[config["dataset"]],
        cleaner_names=dataset_cleaner[config["dataset"]],
    )

    # check output directories
    for split in ("train", "valid"):
        for sub in ["raw-feats", "wavs", "ids", "raw-f0", "raw-energies"]:
            os.makedirs(os.path.join(config["outdir"], split, sub), exist_ok=True)

    # save pretrained-processor to feature dir
    processor._save_mapper(
        os.path.join(config["outdir"], f"{config['dataset']}_mapper.json"),
        extra_attrs_to_save={"pinyin_dict": processor.pinyin_dict}
        if config["dataset"] == "baker"
        else {},
    )

    # build train test split
    if config["dataset"] == "libritts":
        # The last field of each item is split alongside the items
        # themselves; the extra outputs are not needed.
        train_split, valid_split, _, _ = train_test_split(
            processor.items,
            [i[-1] for i in processor.items],
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    else:
        train_split, valid_split = train_test_split(
            processor.items,
            test_size=config["test_size"],
            random_state=42,
            shuffle=True,
        )
    logging.info(f"Training items: {len(train_split)}")
    logging.info(f"Validation items: {len(valid_split)}")

    def get_utt_id(item):
        # File name of the item's second field, up to the first dot.
        return os.path.split(item[1])[-1].split(".")[0]

    train_utt_ids = [get_utt_id(x) for x in train_split]
    valid_utt_ids = [get_utt_id(x) for x in valid_split]

    # save train and valid utt_ids to track later
    np.save(os.path.join(config["outdir"], "train_utt_ids.npy"), train_utt_ids)
    np.save(os.path.join(config["outdir"], "valid_utt_ids.npy"), valid_utt_ids)

    # define map iterator
    def iterator_data(items_list):
        for item in items_list:
            yield processor.get_one_sample(item)

    train_iterator_data = iterator_data(train_split)
    valid_iterator_data = iterator_data(valid_split)

    partial_fn = partial(gen_audio_features, config=config)

    p = Pool(config["n_cpus"])
    try:
        # preprocess train files and get statistics for normalizing
        train_map = p.imap_unordered(
            partial_fn,
            tqdm(train_iterator_data, total=len(train_split), desc="[Preprocessing train]"),
            chunksize=10,
        )
        # init scaler for multiple features
        scaler_mel = StandardScaler(copy=False)
        scaler_energy = StandardScaler(copy=False)
        scaler_f0 = StandardScaler(copy=False)

        id_to_remove = []
        for result, mel, energy, f0, features in train_map:
            if not result:
                id_to_remove.append(features["utt_id"])
                continue
            save_features_to_file(features, "train", config)
            # partial fitting of scalers; utterances without any non-zero
            # energy or f0 value cannot contribute and are dropped
            if len(energy[energy != 0]) == 0 or len(f0[f0 != 0]) == 0:
                id_to_remove.append(features["utt_id"])
                continue
            scaler_mel.partial_fit(mel)
            scaler_energy.partial_fit(energy[energy != 0].reshape(-1, 1))
            scaler_f0.partial_fit(f0[f0 != 0].reshape(-1, 1))

        if len(id_to_remove) > 0:
            # Set membership keeps the filtering linear in the number of ids.
            removed = set(id_to_remove)
            np.save(
                os.path.join(config["outdir"], "train_utt_ids.npy"),
                [i for i in train_utt_ids if i not in removed],
            )
            logging.info(
                f"removed {len(id_to_remove)} cause of too many outliers or bad mfa extraction"
            )

        # save statistics to file
        logging.info("Saving computed statistics.")
        scaler_list = [(scaler_mel, ""), (scaler_energy, "_energy"), (scaler_f0, "_f0")]
        save_statistics_to_file(scaler_list, config)

        # preprocess valid files
        valid_map = p.imap_unordered(
            partial_fn,
            tqdm(valid_iterator_data, total=len(valid_split), desc="[Preprocessing valid]"),
            chunksize=10,
        )
        for *_, features in valid_map:
            save_features_to_file(features, "valid", config)

        p.close()
        p.join()
    finally:
        # Reap the workers, including when feature extraction failed.
        p.terminate()
def create_length_distributions(db_path, cores, qnames=False, region=False):
    '''
    Sum the fragment length distributions over one or more databases.

    input:
        db_path - path to a database from hoobari's patch, or to a directory
            holding several '.db' files (one per region);
        cores - number of cores to use for multiprocessing;
        qnames - when true, also collect the fetal and shared query names
            and write them to 'fetal_qnames_list.txt' and
            'shared_qnames_list.txt' in the current directory;
        region - must be set when db_path is a directory.
    output:
        a tuple (shared_lengths, fetal_lengths) with the fragment length
        distributions which were calculated using *all* the databases
    '''
    db_path = os.path.abspath(db_path)
    if os.path.isfile(db_path):
        db_files = [db_path]
    elif os.path.isdir(db_path):
        if region:
            db_files = [
                os.path.join(db_path, f) for f in os.listdir(db_path)
                if f.endswith('.db')
            ]
        else:
            sys.exit(
                'If no database file is specified, a region must be specified using hoobari -r chrN:NNN-NNN'
            )
    else:
        sys.exit(
            'Please specify a database file or the dir or using hoobari -d LOCATION '
            'a region using hoobari -r chrN:NNN-NNN')

    if not db_files:
        # Fail with a readable message instead of an IndexError below.
        sys.exit('No .db files found in ' + str(db_path))

    get_qnames_and_alleles_with_args = partial(get_fetal_and_shared_lengths,
                                               qnames=qnames)

    # The first database is read in this process and seeds the totals; the
    # remaining ones are processed by the pool and added to them.
    con = db.Variants(db_files[0], probe=False)
    try:
        shared_lengths = con.getSharedLengths()
        fetal_lengths = con.getFetalLengths()
    except Exception:
        sys.exit(str(db_path))
    if qnames:
        fetal_qnames_set, shared_qnames_set = con.getFetalSharedQnames()

    remaining = db_files[1:]
    if remaining:
        pool = Pool(int(cores))
        try:
            # One database per task: a large chunksize would hand (nearly)
            # all databases to a single worker and serialise the work.
            for tup in pool.imap_unordered(get_qnames_and_alleles_with_args,
                                           remaining):
                shared_lengths = shared_lengths.add(tup[0], fill_value=0)
                fetal_lengths = fetal_lengths.add(tup[1], fill_value=0)
                if qnames:
                    # NOTE(review): positions 2 and 3 are taken to be the
                    # shared and fetal qnames respectively - confirm against
                    # get_fetal_and_shared_lengths.
                    fetal_qnames_set.update(tup[3])
                    shared_qnames_set.update(tup[2])
            pool.close()
            pool.join()
        finally:
            # Reap the workers even if one of the databases failed.
            pool.terminate()

    if qnames:
        with open('shared_qnames_list.txt', 'w') as f:
            for q in shared_qnames_set:
                print(q, file=f)
        with open('fetal_qnames_list.txt', 'w') as f:
            for q in fetal_qnames_set:
                print(q, file=f)

    return (shared_lengths, fetal_lengths)
def main():
    """Command line entry point: compare all entries of the input fasta files using MASH.

    Steps: parse the arguments, build the output directory tree and the master fasta,
    sketch the references, split the master fasta into one file per genome, run
    ``multiprocess_mash`` for every genome on a process pool, build the distance matrix
    and finally (optionally) remove temporary files and plot histograms.

    Exits through ``parser.error`` when none of the input files has a fasta extension.
    """
    parser = argparse.ArgumentParser(description='Compares all entries in a '
                                                 'fasta file using MASH')

    main_options = parser.add_argument_group('Main options')
    main_options.add_argument('-i', '--input_references', dest='inputfile',
                              nargs='+', required=True,
                              help='Provide the input fasta files to parse.')
    main_options.add_argument('-o', '--output', dest='output_tag',
                              required=True, help='Provide an output tag.')
    main_options.add_argument('-t', '--threads', dest='threads', default="1",
                              help='Provide the number of threads to be used. '
                                   'Default: 1.')

    mash_options = parser.add_argument_group('MASH related options')
    mash_options.add_argument('-k', '--kmers', dest='kmer_size', default="21",
                              help='Provide the number of k-mers to be provided to mash '
                                   'sketch. Default: 21.')
    mash_options.add_argument('-p', '--pvalue', dest='pvalue', default="0.05",
                              help='Provide the p-value to consider a distance '
                                   'significant. Default: 0.05.')
    mash_options.add_argument('-md', '--mashdist', dest='mashdistance',
                              default="0.1",
                              help='Provide the maximum mash distance to be parsed to '
                                   'the matrix. Default: 0.1.')

    other_options = parser.add_argument_group('Other options')
    other_options.add_argument('-rm', '--remove', dest='remove',
                               action='store_true',
                               help='Remove any temporary files and folders not '
                                    'needed (not present in results subdirectory).')
    other_options.add_argument('-hist', '--histograms', dest='histograms',
                               action='store_true',
                               help='Checks the distribution of distances values '
                                    'plotting histograms')

    args = parser.parse_args()

    threads = args.threads
    kmer_size = args.kmer_size
    pvalue = args.pvalue
    mashdist = args.mashdistance

    ## lists all fastas given to argparser
    fastas = [
        f for f in args.inputfile
        if f.endswith((".fas", ".fasta", ".fna", ".fsa", ".fa"))
    ]
    if not fastas:
        # Without this guard the script died with an IndexError on fastas[0].
        parser.error("none of the input files has a fasta extension "
                     "(.fas, .fasta, .fna, .fsa, .fa)")

    ## creates output directory tree
    ## if the user gives an input tag that is already a folder, drop the slashes
    output_tag = args.output_tag.replace("/", "")
    mother_directory = output_tree(fastas[0], output_tag)

    ## checks if multiple fastas are provided or not avoiding master_fasta
    # function
    print("***********************************")
    print("Creating main database...\n")
    main_fasta, sequence_info = master_fasta(fastas, output_tag,
                                             mother_directory)

    ## runs mash related functions
    print("***********************************")
    print("Sketching reference...\n")
    ref_sketch = sketch_references(main_fasta, output_tag, threads, kmer_size,
                                   mother_directory)

    ## breaks master fasta into multiple fastas with one genome each
    print("***********************************")
    print("Making temporary files for each genome in fasta...\n")
    genomes = genomes_parser(main_fasta, output_tag, mother_directory)

    ## This must be multiprocessed since it is extremely fast to do mash
    # against one plasmid sequence
    print("***********************************")
    print("Sketching genomes and running mash distances...\n")

    pool = Pool(int(threads))  # Create a multiprocessing Pool
    try:
        # process genomes iterable with pool
        mp = pool.imap_unordered(
            partial(multiprocess_mash, ref_sketch, main_fasta, output_tag,
                    kmer_size, mother_directory), genomes)
        ## wrap the results in a nice progress bar when tqdm is usable; only
        # the missing-tqdm case is tolerated, worker errors are not hidden
        try:
            progress = tqdm.tqdm(mp, total=len(genomes))
        except (NameError, AttributeError):
            print("progress will not be tracked because of 'reasons'... check if "
                  "you have tqdm package installed.")
            progress = mp
        for _ in progress:
            pass
    finally:
        ## needed in order for the process to end before the remaining options
        # are triggered
        pool.close()
        pool.join()

    ## Makes distances matrix csv file
    lists_traces = mash_distance_matrix(mother_directory, sequence_info,
                                        pvalue, mashdist, threads)

    ## remove master_fasta
    if args.remove:
        os.remove(main_fasta)
        for d in os.listdir(mother_directory):
            if d != "results":
                shutil.rmtree(os.path.join(mother_directory, d))

    ## Histograms
    if args.histograms:
        plot_histogram(lists_traces, output_tag, mother_directory)
dict(type=decoder_to_pipeline_prefix[args.decoder] + 'Decode') ] dataset = build_dataset(cfg.data[args.split], dict(test_mode=(args.split != 'train'))) # prepare for checking if os.path.exists(args.output_file): # remove exsiting output file os.remove(args.output_file) pool = Pool(args.num_processes) lock = Manager().Lock() worker_fn = partial(_do_check_videos, lock, dataset, args.output_file) ids = range(len(dataset)) # start checking for _ in tqdm(pool.imap_unordered(worker_fn, ids), total=len(ids)): pass pool.join() # print results and release resources pool.close() with open(args.output_file, 'r') as f: print(f'Checked {len(dataset)} videos, ' f'{len(f)} is/are corrupted/missing.') if args.remove_corrupted_videos: print('Start deleting corrupted videos') cnt = 0 with open(args.output_file, 'r') as f: for line in f: if os.path.exists(line.strip()):
if len(times[x]) == runs and x not in printed_res: human_r = str( datetime.timedelta(seconds=sum(times[x]) / len(times[x]))) print( str(x).ljust(padding), str(round(statistics.mean(times[x]), 5)).ljust(padding), str(round(statistics.stdev(times[x]), 5)).ljust(padding), human_r.ljust(padding)) printed_res.add(x) sys.stdout.flush() args = parse_args() times = defaultdict(list) graphs = defaultdict(list) printed_res = set() arg_list = [] for x in range(args.s, args.e + 1, args.d): arg_list += [x] * args.r pool = Pool(args.p) print_res(times, args.r) for x, g, t in pool.imap_unordered(gen_graph, arg_list): times[x].append(t) graphs[x].append(g) print_res(times, args.r) if args.S: nx.write_graphml( g, "%s/internet-AS-graph-%d-%d.graphml" % (args.S, x, len(graphs[x])))
pblh[ lista ], num_pbl[ lista ] ) ) ) elif DAY and (not explicitAerosol): namelist_iter = iter( np.column_stack( ( pres0[ lista ], level[ lista ], case[ lista ], nzp[ lista ], dz[ lista ], q_inv[ lista ], tpot_inv[ lista ], lwp[ lista ], tpot_pbl[ lista ], \ pblh[ lista ], num_pbl[ lista ], cntlat[ lista ] ) ) ) elif (not DAY) and explicitAerosol: namelist_iter = iter( np.column_stack( ( pres0[ lista ], level[ lista ], case[ lista ], nzp[ lista ], dz[ lista ], q_inv[ lista ], tpot_inv[ lista ], lwp[ lista ], tpot_pbl[ lista ], \ pblh[ lista ], num_ks[ lista ], num_as[ lista ], num_cs[ lista ], dpg_as[ lista ] ) )) elif DAY and explicitAerosol: namelist_iter = iter( np.column_stack( ( pres0[ lista ], level[ lista ], case[ lista ], nzp[ lista ], dz[ lista ], q_inv[ lista ], tpot_inv[ lista ], lwp[ lista ], tpot_pbl[ lista ], \ pblh[ lista ], num_ks[ lista ], num_as[ lista ], num_cs[ lista ], dpg_as[ lista ], cntlat[ lista ] ) ) ) sound_in_iter = iter( np.column_stack( ( pres0[ lista ], windprofile[ lista ], case[ lista ], q_inv[ lista ], tpot_inv[ lista ], q_pbl[ lista ], tpot_pbl[ lista ], pblh[ lista ], \ dz[ lista ], nzp[ lista ] ) ) ) # run as unordered parallel processes for k in pool.imap_unordered(write_namelist, namelist_iter): pass for i in pool.imap_unordered(write_sound_in, sound_in_iter): pass #def dycoms(): # call(['rm','-rf', rootfolder+'*']) # case = 'dycoms' # q_inv = 4.45 # tpot_inv = 6.7 # q_pbl = 9.45 # tpot_pbl = 288.3 # pblh = 795. # write_sound_in( case, q_inv, tpot_inv, q_pbl, tpot_pbl, pblh) # write_namelist( case, 20., 660. )
def _extract_features_parallel_per_kind(kind_to_df_map,
                                        column_id, column_value,
                                        default_fc_parameters,
                                        kind_to_fc_parameters=None,
                                        chunksize=defaults.CHUNKSIZE,
                                        n_processes=defaults.N_PROCESSES,
                                        show_warnings=defaults.SHOW_WARNINGS,
                                        disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                                        impute_function=defaults.IMPUTE_FUNCTION):
    """
    Run the feature extraction with one task per kind on a process pool.

    Every ``(kind, dataframe)`` item of ``kind_to_df_map`` is handed to
    ``_extract_features_for_one_time_series``; the partial results are concatenated
    column-wise (outer join) and cast to ``float64``.

    :param kind_to_df_map: The time series to compute the features for in our internal format
    :type kind_to_df_map: dict of pandas.DataFrame

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param chunksize: The size of one chunk for the parallelisation. A falsy value lets
            ``_calculate_best_chunksize`` pick one.
    :type chunksize: None or int

    :param n_processes: The number of processes to use for parallelisation.
    :type n_processes: int

    :param: show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen or the function to call for imputing.
            It is called on the result and its return value is ignored.
    :type impute_function: None or function

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    extract_one = partial(_extract_features_for_one_time_series,
                          column_id=column_id,
                          column_value=column_value,
                          default_fc_parameters=default_fc_parameters,
                          kind_to_fc_parameters=kind_to_fc_parameters,
                          show_warnings=show_warnings)

    pool = Pool(n_processes)
    chunksize = chunksize or _calculate_best_chunksize(kind_to_df_map, n_processes)

    expected_results = len(kind_to_df_map)
    lazy_results = pool.imap_unordered(extract_one, kind_to_df_map.items(), chunksize=chunksize)
    extracted_features = tqdm(lazy_results,
                              total=expected_results,
                              desc="Feature Extraction",
                              disable=disable_progressbar)
    # No further tasks will be submitted; the results are pulled lazily below.
    pool.close()

    # Consuming the progress-wrapped iterator drives the extraction to completion.
    result = pd.concat(extracted_features, axis=1, join='outer').astype(np.float64)

    if impute_function is not None:
        impute_function(result)

    pool.join()
    return result
ph_sub['n_cases'], ph_sub['n_controls']) # bake in globals ldsc_h2_map = partial(ldsc_h2_part, wd=wd, ld_ref_panel=ld_ref_panel, ld_w_panel=ld_w_panel, ld_frq_panel=ld_frq_panel, ss_bucket=ss_bucket, sex_group=str(sex_group)) # dispatch print "Starting ldsc..." print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now())) pool = Pool(num_proc) results = pool.imap_unordered(ldsc_h2_map, iter_args) pool.close() pool.join() #### # Load output to dataframe print "Processing results..." print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now())) #### dat = pd.concat(results) # dat = pd.DataFrame(index=ph_list, columns=col_ord) # for res in results: # dat.update(pd.DataFrame(data=res, index=pd.Series(res[0][0]), columns=col_ord)) #### # write results to file
def main():
    """
        Description: Main function

        Parses the command line, makes sure the output directory exists, builds the
        list of strings to render (from Wikipedia, an input file, random sequences or
        the language dictionary), renders every string with
        FakeTextDataGenerator.generate_from_tuple on a pool of args.thread_count
        workers and, when args.name_format is 2, writes a labels.txt file that maps
        each generated file name to its text.
    """

    # Argument parsing
    args = parse_arguments()

    # Create the directory if it does not exist.
    os.makedirs(args.output_dir, exist_ok=True)

    # Creating word list
    lang_dict = load_dict(args.language)

    # Create font (path) list; only needed when no explicit font was requested
    use_random_fonts = len(args.font_type) == 0
    if use_random_fonts:
        fonts = load_fonts(args.language)

    # Creating synthetic sentences (or word)
    strings = []

    if args.use_wikipedia:
        strings = create_strings_from_wikipedia(args.length, args.count, args.language)
    elif args.input_file != '':
        strings = create_strings_from_file(args.input_file, args.count)
    elif args.random_sequences:
        strings = create_strings_randomly(args.length, args.random, args.count,
                                          args.include_letters, args.include_numbers,
                                          args.include_symbols, args.language)
        # Set a name format compatible with special characters automatically if they are used
        if args.include_symbols or True not in (args.include_letters, args.include_numbers, args.include_symbols):
            args.name_format = 2
    else:
        strings = create_strings_from_dict(args.length, args.random, args.count, lang_dict)

    string_count = len(strings)

    # One font per string: a random pick from the language fonts, or the requested font.
    if use_random_fonts:
        chosen_fonts = [fonts[random.randrange(0, len(fonts))] for _ in range(string_count)]
    else:
        chosen_fonts = [args.font_type] * string_count

    # Settings shared by every job, in the positional order generate_from_tuple receives them.
    shared_settings = (
        args.output_dir,
        args.format,
        args.extension,
        args.skew_angle,
        args.random_skew,
        args.blur,
        args.random_blur,
        args.background,
        args.distorsion,
        args.distorsion_orientation,
        args.handwritten,
        args.name_format,
        args.width,
        args.alignment,
        args.text_color,
    )
    jobs = [
        (index, text, font) + shared_settings
        for index, (text, font) in enumerate(zip(strings, chosen_fonts))
    ]

    p = Pool(args.thread_count)
    try:
        # The total is the number of strings actually produced, which can differ
        # from args.count (e.g. a short input file).
        for _ in tqdm(p.imap_unordered(FakeTextDataGenerator.generate_from_tuple, jobs),
                      total=string_count):
            pass
    finally:
        p.terminate()

    if args.name_format == 2:
        # Create file with filename-to-label connections
        with open(os.path.join(args.output_dir, "labels.txt"), 'w', encoding="utf8") as f:
            for i, text in enumerate(strings):
                file_name = str(i) + "." + args.extension
                f.write("{} {}\n".format(file_name, text))
def testmain(**argdict):
    """
    Map the fastq file of every sample with bowtie2 and write the mapping summaries.

    Keyword arguments read (absent ones are treated as None): samplefile, trnafile,
    logfile, mapfile, bowtiedb, lazy, minnontrnasize, bamdir, trnamapfile, cores.

    The function aborts with exit status 1 when the bowtie2 database file
    (bowtiedb + ".fa") is missing, when an existing bam file does not match its
    fastq file, when leftover temporary bam files are found in the temp directory,
    when a fastq file is missing, or when a mapping run fails.

    In lazy mode samples that already have a bam file are skipped and neither the
    map summary nor the tRNA map summary is written.
    """
    argdict = defaultdict(lambda: None, argdict)
    scriptdir = os.path.dirname(os.path.realpath(sys.argv[0]))+"/"
    samplefilename = argdict["samplefile"]
    sampledata = samplefile(argdict["samplefile"])
    trnafile = argdict["trnafile"]
    logfile = argdict["logfile"]
    mapfile = argdict["mapfile"]
    bowtiedb = argdict["bowtiedb"]
    lazycreate = argdict["lazy"]
    minnontrnasize = argdict["minnontrnasize"]
    bamdir = argdict["bamdir"]
    trnamapfile = argdict["trnamapfile"]
    if bamdir is None:
        bamdir = "./"
    # An explicit cores=None (e.g. an unset command line option) is treated like
    # an absent key instead of crashing in int().
    if argdict["cores"] is not None:
        cores = int(argdict["cores"])
    else:
        cores = min(8,cpu_count())
    print >>sys.stderr,"cores: "+str(cores)
    workingdir = bamdir
    samples = sampledata.getsamples()

    # Without a logfile the log goes to stderr; say so instead of failing on str + None.
    print >>sys.stderr, "logging to "+(logfile if logfile else "stderr")
    if logfile and lazycreate:
        logfile = open(logfile,'a')
        print >>logfile, "New mapping"
    elif logfile:
        logfile = open(logfile,'w')
    else:
        logfile = sys.stderr

    if not os.path.isfile(bowtiedb+".fa"):
        print >>sys.stderr, "No bowtie2 database "+bowtiedb
        sys.exit(1)

    # Any bam file that already exists must have headers matching its fastq file.
    badsamples = list()
    for samplename in samples:
        bamfile = workingdir+samplename
        if os.path.isfile(bamfile+".bam"):
            if not checkheaders(bamfile+".bam", sampledata.getfastq(samplename)):
                badsamples.append(bamfile+".bam")

    if len(badsamples) > 0:
        print >>sys.stderr, "Bam files "+",".join(badsamples)+" does not match fq files"
        print >>sys.stderr, "Aborting"
        sys.exit(1)

    # Leftover "<sample>temp*" files in the temp directory and missing fastq files
    # both prevent the mapping from starting.
    tempfilesover = list()
    missingfqfiles = list()
    tempnames = os.listdir(tempfile.gettempdir())
    for samplename in samples:
        for currfile in tempnames:
            if currfile.startswith(samplename+'temp'):
                tempfilesover.append(currfile)
        fqfile = sampledata.getfastq(samplename)
        if not os.path.isfile(fqfile):
            missingfqfiles.append(fqfile)

    if len(tempfilesover) > 0:
        for currfile in tempfilesover:
            print >>sys.stderr, tempfile.gettempdir() +"/"+ currfile + " temp bam files exists"
        print >>sys.stderr, "these files must be deleted to proceed"
        sys.exit(1)
    if len(missingfqfiles) > 0:
        print >>sys.stderr, ",".join(missingfqfiles) + " fastq files missing"
        sys.exit(1)

    mapresults = dict()
    multithreaded = True
    if multithreaded:
        mapargs = list()
        print >>sys.stderr, cores
        for samplename in samples:
            bamfile = workingdir+samplename
            if lazycreate and os.path.isfile(bamfile+".bam"):
                print >>sys.stderr, "Skipping "+samplename
            else:
                mapargs.append(compressargs(bowtiedb, sampledata.getfastq(samplename),bamfile,scriptdir, trnafile, expname = samplefilename, samplename = samplename, minnontrnasize = minnontrnasize))

        mappool = Pool(processes=cores)
        for currresult in mappool.imap_unordered(mapreadspool, mapargs):
            if currresult.failedrun == True:
                print >>sys.stderr, "Failure to Bowtie2 map"
                currresult.printbowtie(logfile)
                # Stop the remaining mapping jobs before aborting.
                mappool.terminate()
                sys.exit(1)
            mapresults[currresult.samplename] = currresult
            currresult.printbowtie(logfile)
        mappool.close()
        mappool.join()
    else:
        for samplename in samples:
            bamfile = workingdir+samplename
            if lazycreate and os.path.isfile(bamfile+".bam"):
                print >>sys.stderr, "Skipping "+samplename
            else:
                mapresults[samplename] = mapreads(bowtiedb, sampledata.getfastq(samplename),bamfile,scriptdir, trnafile, logfile=logfile, expname = samplefilename, minnontrnasize = minnontrnasize)

    if lazycreate:
        #here is where I might add stuff to read old files in lazy mode
        pass

    if mapfile is not None and not lazycreate:
        with open(mapfile,'w') as mapinfo:
            print >>mapinfo, "\t".join(samples)
            print >>mapinfo, "unmap\t"+"\t".join(str(mapresults[currsample].unmaps) for currsample in samples)
            print >>mapinfo, "single\t"+"\t".join(str(mapresults[currsample].singlemaps) for currsample in samples)
            print >>mapinfo, "multi\t"+"\t".join(str(mapresults[currsample].multimaps) for currsample in samples)

    if trnamapfile is not None and not lazycreate:
        with open(trnamapfile,'w') as trnamapinfo:
            print >>trnamapinfo, "\t".join(samples)
            print >>trnamapinfo, "multi_nontRNA\t"+"\t".join(str(mapresults[currsample].trnamapinfo.multiplenon) for currsample in samples)
            print >>trnamapinfo, "unique_nontRNA\t"+"\t".join(str(mapresults[currsample].trnamapinfo.singlenon) for currsample in samples)
            print >>trnamapinfo, "multi_amino\t"+"\t".join(str(mapresults[currsample].trnamapinfo.multamino) for currsample in samples)
            # NOTE(review): the next two row labels say "unique_" but read the
            # multac / multtrans attributes - confirm which side is intended.
            print >>trnamapinfo, "unique_amino\t"+"\t".join(str(mapresults[currsample].trnamapinfo.multac) for currsample in samples)
            print >>trnamapinfo, "unique_anticodon\t"+"\t".join(str(mapresults[currsample].trnamapinfo.multtrans) for currsample in samples)
            print >>trnamapinfo, "unique_tRNA\t"+"\t".join(str(mapresults[currsample].trnamapinfo.singletrna) for currsample in samples)

    # Only close a log file this function opened; never close stderr.
    if logfile is not sys.stderr:
        logfile.close()
def _maybe_convert_sets(target_dir, extracted_data):
    """Convert the extracted corpus into train/dev/test CSV files, unless they already exist.

    Every ``metadata.csv`` found below the extracted directory (except paths that
    contain an entry of ``SKIP_LIST``) is read as ``|``-separated records: the first
    field names the wav file inside the sibling ``wavs`` directory and the third
    field is the transcript. Each sample is processed by ``one_sample`` on a
    process pool, and the resulting rows are split 80/10/10 into the train, dev
    and test CSV files next to the archive.

    :param target_dir: directory that holds the archive directory and the extracted data
    :param extracted_data: name of the extracted data directory inside ``target_dir``
    """
    extracted_dir = os.path.join(target_dir, extracted_data)
    # override existing CSV with normalized one
    target_csv_template = os.path.join(target_dir, ARCHIVE_DIR_NAME, ARCHIVE_NAME.replace(".tgz", "_{}.csv"))
    # The template still contains the "{}" placeholder, so the concrete per-split
    # files have to be checked; skip only when all of them are already there.
    if all(os.path.isfile(target_csv_template.format(split)) for split in ("train", "dev", "test")):
        return

    # Get audiofile path and transcript for each sentence in the metadata files
    samples = []
    glob_dir = os.path.join(extracted_dir, "**/metadata.csv")
    for record in glob(glob_dir, recursive=True):
        if any(skip in record for skip in SKIP_LIST):
            continue
        wav_dir = os.path.join(os.path.dirname(record), "wavs")
        with open(record, "r") as rec:
            for line in rec:
                fields = line.strip().split("|")
                audio = os.path.join(wav_dir, fields[0] + ".wav")
                transcript = fields[2]
                samples.append((audio, transcript))

    counter = get_counter()
    num_samples = len(samples)
    rows = []

    print("Importing WAV files...")
    pool = Pool()
    bar = progressbar.ProgressBar(max_value=num_samples, widgets=SIMPLE_BAR)
    for done, processed in enumerate(pool.imap_unordered(one_sample, samples), start=1):
        counter += processed[0]
        rows += processed[1]
        bar.update(done)
    bar.update(num_samples)
    pool.close()
    pool.join()

    with open(target_csv_template.format("train"), "w", encoding="utf-8", newline="") as train_csv_file, \
            open(target_csv_template.format("dev"), "w", encoding="utf-8", newline="") as dev_csv_file, \
            open(target_csv_template.format("test"), "w", encoding="utf-8", newline="") as test_csv_file:
        train_writer = csv.DictWriter(train_csv_file, fieldnames=FIELDNAMES)  # 80%
        train_writer.writeheader()
        dev_writer = csv.DictWriter(dev_csv_file, fieldnames=FIELDNAMES)  # 10%
        dev_writer.writeheader()
        test_writer = csv.DictWriter(test_csv_file, fieldnames=FIELDNAMES)  # 10%
        test_writer.writeheader()

        for i, item in enumerate(rows):
            transcript = validate_label(item[2])
            if not transcript:
                continue
            wav_filename = item[0]
            # The split is decided by the row position: every 10th row goes to
            # test, the one after it to dev, all others to train.
            i_mod = i % 10
            if i_mod == 0:
                writer = test_writer
            elif i_mod == 1:
                writer = dev_writer
            else:
                writer = train_writer
            writer.writerow(
                dict(
                    wav_filename=os.path.relpath(wav_filename, extracted_dir),
                    wav_filesize=os.path.getsize(wav_filename),
                    transcript=transcript,
                ))

    imported_samples = get_imported_samples(counter)
    assert counter["all"] == num_samples
    assert len(rows) == imported_samples

    print_import_report(counter, SAMPLE_RATE, MAX_SECS)
def parallel(r, c):
    """Apply ``f`` to ``range(r)`` on a pool of 8 processes and print every result.

    ``c`` is forwarded to ``imap_unordered`` as the chunk size, so the results are
    printed in the order in which the workers deliver them.
    """
    workers = Pool(processes=8)
    results = workers.imap_unordered(f, range(r), c)
    for value in results:
        print(value)
    workers.close()
    workers.join()
def filter_by_metadata(ab):
    """Collect the raw lines of gzip bundle ``ab`` whose metadata passes ``article_allowed``.

    Each line of the bundle is parsed as one JSON document.

    :param ab: path of a gzip-compressed metadata bundle
    :return: list of ``(ab, raw_line)`` tuples, one per allowed article; empty when
        no article of the bundle is allowed
    """
    venues = []
    with gzip.open(ab) as f:
        for l in f:
            metadata = json.loads(l.strip())
            if article_allowed(metadata):
                venues.append((ab, l))
    return venues


if __name__ == "__main__":
    # Get all of the statistics for venues, also time how long it takes to iterate through all the data
    start = time.time()
    (data_loc / "filtered_metadata").mkdir(exist_ok=True)
    article_bundles = list(data_loc.glob("metadata/*.gz"))
    pool = Pool(8)
    venue_frequencies = defaultdict(int)
    for vf in tqdm(pool.imap_unordered(filter_by_metadata, article_bundles), total=len(article_bundles)):
        if not vf:
            # Nothing in this bundle was allowed; there is no entry to take the
            # bundle name from, so no filtered file is written for it.
            continue
        with gzip.open(f"{data_loc}/filtered_metadata/{vf[0][0].name}", 'w') as f:
            for l in vf:
                f.write(l[1])
    pool.close()
    pool.join()
def download_license(threads=os.cpu_count(), force=False):
    """
    Downloads license data from spdx.org.

    Lists data from https://spdx.org/licenses/licenses.json,
    https://spdx.org/licenses/exceptions.json and check if the version is
    already loaded. If the data already exists, simply skip else create a new
    CSV. CSV file names are created as <releaseDate>_<version>.csv. For each
    license, shortname, fullname, text, url, deprecated, osi_approved are
    collected.

    :param threads: Number of CPU to be used for downloading. This is done to
        speed up the process. The value is capped at twice the CPU count.
    :param force: Bool value if licenses needs to be downloaded forcefully.
        When true an existing CSV of the same release is deleted and rebuilt.
    :return: File path if success, None otherwise.
    """
    with request.urlopen('https://spdx.org/licenses/licenses.json') as response:
        jsonData = json.loads(response.read().decode('utf-8'))
    licenses = jsonData.get('licenses')

    with request.urlopen('https://spdx.org/licenses/exceptions.json') as response:
        jsonData_exceptions = json.loads(response.read().decode('utf-8'))
    # A document without the key is treated as "no exceptions".
    license_exceptions = jsonData_exceptions.get('exceptions') or []

    version = jsonData.get('licenseListVersion').replace(".", "_")
    releaseDate = jsonData.get('releaseDate')
    if licenses is None:
        return None

    fileName = releaseDate + '_' + version + '.csv'
    moduleDir = os.path.dirname(os.path.abspath(__file__))
    licenseDir = os.path.abspath(moduleDir + "/../../licenses")
    Path(licenseDir).mkdir(exist_ok=True)
    filePath = Path(os.path.abspath(licenseDir + "/" + fileName))
    if filePath.is_file():
        if not force:
            # This release was already downloaded.
            return str(filePath)
        filePath.unlink()

    cpuCount = os.cpu_count() or 1
    if threads is None:
        threads = cpuCount
    threads = min(threads, cpuCount * 2)

    # Collect every downloaded frame and concatenate once at the end instead of
    # re-copying the growing frame for each row.
    frames = [pd.DataFrame(columns=csvColumns)]
    pool = ThreadPool(threads)
    try:
        frames.extend(tqdm(
            pool.imap_unordered(LicenseDownloader.fetch_exceptional_license,
                                license_exceptions),
            desc="Exceptions processed", total=len(license_exceptions),
            unit="exception"))
        frames.extend(tqdm(
            pool.imap_unordered(LicenseDownloader.fetch_license, licenses),
            desc="Licenses processed", total=len(licenses), unit="license"))
    finally:
        pool.close()
        pool.join()

    licenseDataFrame = pd.concat(frames, sort=False, ignore_index=True)
    licenseDataFrame = licenseDataFrame.drop_duplicates(subset='shortname')
    # Sorting by 'deprecated' first keeps the non-deprecated entry when two
    # licenses share a full name.
    licenseDataFrame = licenseDataFrame.sort_values(
        'deprecated').drop_duplicates(subset='fullname', keep='first')
    licenseDataFrame = licenseDataFrame.sort_values(
        'shortname').reset_index(drop=True)
    licenseDataFrame.to_csv(str(filePath), index=False, encoding='utf-8')
    return str(filePath)
def run(parser=None):
    """Command line "run" interface of an operations module.

    Calling this function from a module starts a command line interface that
    executes the operations defined in that same module. The operation name
    given on the command line is looked up as an attribute of the *calling*
    module, and the available choices come from ``_get_operations()``.

    Example module:

    .. code-block:: python

        # operations.py

        def hello(job):
            print('hello', job)

        if __name__ == '__main__':
            import flow
            flow.run()

    The ``hello`` operation is then executed for all jobs with:

    .. code-block:: bash

        $ python operations.py hello

    .. note::

        The degree of parallelization is controlled with the ``--np``
        argument; see ``python operations.py --help`` for all options.

    :param parser: An optional argument parser to extend; a new
        ``argparse.ArgumentParser`` is created when omitted.
    """
    if parser is None:
        parser = argparse.ArgumentParser()

    parser.add_argument(
        'operation',
        type=str,
        choices=list(_get_operations()),
        help="The operation to execute.")
    parser.add_argument(
        'jobid',
        type=str,
        nargs='*',
        help="The job ids, as registered in the signac project. "
             "Omit to default to all statepoints.")
    parser.add_argument(
        '--np',
        type=int,
        default=1,
        help="Specify the number of cores to parallelize to (default=1) or 0 "
             "to parallelize on as many cores as there are available.")
    parser.add_argument(
        '-t', '--timeout',
        type=int,
        help="A timeout in seconds after which the parallel execution "
             "of operations is canceled.")
    parser.add_argument(
        '--progress',
        action='store_true',
        help="Display a progress bar during execution.")
    args = parser.parse_args()

    project = get_project()

    def _lookup_job(job_id):
        # Re-raise lookup failures with a message that names the offending id.
        try:
            return project.open_job(id=job_id)
        except KeyError:
            msg = "Did not find job corresponding to id '{}'.".format(job_id)
            raise KeyError(msg)
        except LookupError:
            raise LookupError("Multiple matches for id '{}'.".format(job_id))

    if not len(args.jobid):
        # No explicit ids: operate on the whole project.
        jobs = project
    else:
        try:
            jobs = [_lookup_job(jid) for jid in args.jobid]
        except (KeyError, LookupError) as e:
            print(e, file=sys.stderr)
            sys.exit(1)

    # The operations live in the module that called run(), i.e. one frame up.
    module = inspect.getmodule(inspect.currentframe().f_back)

    try:
        operation_func = getattr(module, args.operation)
    except AttributeError:
        raise KeyError("Unknown operation '{}'.".format(args.operation))

    if not getattr(operation_func, '_flow_cmd', False):
        operation = operation_func
    else:
        # Command operations return a command template that is formatted with
        # the job and executed through fork().
        def operation(job):
            cmd = operation_func(job).format(job=job)
            fork(cmd=cmd, timeout=args.timeout)

    serial = args.np == 1 or len(jobs) < 2

    if serial:
        # Serial execution
        if args.timeout is not None:
            logger.warning("A timeout has no effect in serial execution!")
        job_iter = tqdm(jobs) if args.progress else jobs
        for job in job_iter:
            operation(job)
    elif six.PY2:
        # Parallel execution
        # Due to Python 2.7 issue #8296 (http://bugs.python.org/issue8296) we
        # always need to provide a timeout to avoid issues with "hanging"
        # processing pools.
        timeout = sys.maxint if args.timeout is None else args.timeout
        pool = Pool(args.np)
        result = pool.imap_unordered(operation, jobs)
        ticks = tqdm(jobs) if args.progress else jobs
        for _ in ticks:
            result.next(timeout)
    else:
        # Parallel execution
        with Pool(args.np) as pool:
            result = pool.imap_unordered(operation, jobs)
            ticks = tqdm(jobs) if args.progress else jobs
            for _ in ticks:
                result.next(args.timeout)
return port, True except (socket.timeout, socket.error): return port, False if __name__ == '__main__': if not len(sys.argv): print("Usage: scanner.py <target> <maxport>") if len(sys.argv) == 3: maxport = int(sys.argv[2]) else: maxport = 1025 target = sys.argv[1] # Resolve Host to IP, if necessary. if not target.replace(".", "").isdigit(): target = host_to_ip(target) print("[+] Scanning", target) ports = range(1, maxport + 1) scanlist = [(target, port) for port in ports] # Use 512 workers. Not sure how insane that is but it seems to work fine. pool = Pool(512) for port, status in pool.imap_unordered(scan, scanlist): if status: print("[!]", port, "is open") print("[+] Finished scanning", target)
text_dict = ljspeech(path) if args.dataset == 'databaker': text_dict = databaker(path) n_workers = max(1, args.num_workers) simple_table([('Sample Rate', hp.sample_rate), ('Bit Depth', hp.bits), ('Mu Law', hp.mu_law), ('Hop Length', hp.hop_length), ('CPU Usage', f'{n_workers}/{cpu_count()}'), ('Num Validation', hp.n_val)]) pool = Pool(processes=n_workers) dataset = [] cleaned_texts = [] for i, (item_id, length, cleaned_text) in enumerate( pool.imap_unordered(process_wav, wav_files), 1): if item_id in text_dict: dataset += [(item_id, length)] cleaned_texts += [(item_id, cleaned_text)] bar = progbar(i, len(wav_files)) message = f'{bar} {i}/{len(wav_files)} ' stream(message) random = Random(hp.seed) random.shuffle(dataset) train_dataset = dataset[hp.n_val:] val_dataset = dataset[:hp.n_val] # sort val dataset longest to shortest val_dataset.sort(key=lambda d: -d[1]) for id, text in cleaned_texts:
from multiprocessing import Pool, TimeoutError import time import os def f(x): return x*x if __name__ == '__main__': pool = Pool(processes=4) # start 4 worker processes # print "[0, 1, 4,..., 81]" print pool.map(f, range(10)) # print same numbers in arbitrary order for i in pool.imap_unordered(f, range(10)): print i, print # evaluate "f(20)" asynchronously res = pool.apply_async(f, (20,)) # runs in *only* one process print res.get(timeout=1) # prints "400" # evaluate "os.getpid()" asynchronously res = pool.apply_async(os.getpid, ()) # runs in *only* one process print res.get(timeout=1) # prints the PID of that process # launching multiple evaluations asynchronously *may* use more processes multiple_results = [pool.apply_async(os.getpid, ()) for i in range(4)] print [res.get(timeout=1) for res in multiple_results] # make a single worker sleep for 10 secs
background.save(f'{savedir}/{i}.png') pool = Pool(args.thread_count) count = len(font_list) * args.images font_names = [] font_vers = [] for key, value in font_list.items(): font_names.extend([key]*args.images) key_len = len(value) font_vers.extend(value * (args.images // key_len)) font_vers.extend(value[:args.images % key_len]) for _ in tqdm( pool.imap_unordered( generate_from_tuple, zip( [ i for i in range(count) ], [ args.font_folder ] * count, font_names, font_vers, [ random_string_from_dict() for _ in range(count) ], [ args.output_dir ] * count ) ), total=count ): pass pool.terminate()
def _extract_features_parallel_per_sample(kind_to_df_map, column_id, column_value,
                                          default_fc_parameters,
                                          kind_to_fc_parameters=None,
                                          chunksize=defaults.CHUNKSIZE,
                                          n_processes=defaults.N_PROCESSES,
                                          show_warnings=defaults.SHOW_WARNINGS,
                                          disable_progressbar=defaults.DISABLE_PROGRESSBAR,
                                          impute_function=defaults.IMPUTE_FUNCTION):
    """
    Parallelize the feature extraction per kind and per sample.

    As the splitting of the dataframes per kind along column_id is quite costly,
    we settled for an async map in this function. The result iterators are kept
    in a list, from which they are consumed in order of submission.

    :param kind_to_df_map: The time series to compute the features for in our internal format
    :type kind_to_df_map: dict of pandas.DataFrame

    :param column_id: The name of the id column to group by.
    :type column_id: str

    :param column_value: The name for the column keeping the value itself.
    :type column_value: str

    :param default_fc_parameters: mapping from feature calculator names to parameters. Only those names
           which are keys in this dict will be calculated. See the class:`ComprehensiveFCParameters` for
           more information.
    :type default_fc_parameters: dict

    :param kind_to_fc_parameters: mapping from kind names to objects of the same type as the ones for
            default_fc_parameters. If you put a kind as a key here, the fc_parameters
            object (which is the value), will be used instead of the default_fc_parameters.
    :type kind_to_fc_parameters: dict

    :param chunksize: The size of one chunk for the parallelisation. If falsy, a chunk size
            is calculated separately for every kind.
    :type chunksize: None or int

    :param n_processes: The number of processes to use for parallelisation.
    :type n_processes: int

    :param show_warnings: Show warnings during the feature extraction (needed for debugging of calculators).
    :type show_warnings: bool

    :param disable_progressbar: Do not show a progressbar while doing the calculation.
    :type disable_progressbar: bool

    :param impute_function: None, if no imputing should happen or the function to call for imputing.
    :type impute_function: None or function

    :return: The (maybe imputed) DataFrame containing extracted features.
    :rtype: pandas.DataFrame
    """
    partial_extract_features_for_one_time_series = partial(_extract_features_for_one_time_series,
                                                           column_id=column_id,
                                                           column_value=column_value,
                                                           default_fc_parameters=default_fc_parameters,
                                                           kind_to_fc_parameters=kind_to_fc_parameters,
                                                           show_warnings=show_warnings)

    def iterable_with_tqdm_update(iterable, progress_bar):
        """Yield the elements of *iterable*, ticking the progress bar once per element."""
        for element in iterable:
            progress_bar.update(1)
            yield element

    pool = Pool(n_processes)
    try:
        total_number_of_expected_results = 0

        # Submit map jobs per kind per sample; the list keeps submission order.
        pending_results = []
        for kind, df_kind in kind_to_df_map.items():
            df_grouped_by_id = df_kind.groupby(column_id)
            total_number_of_expected_results += len(df_grouped_by_id)

            # Use a per-kind local instead of overwriting ``chunksize``: otherwise the
            # value calculated for the first kind would be reused for all later kinds.
            kind_chunksize = chunksize or _calculate_best_chunksize(df_grouped_by_id, n_processes)

            pending_results.append(
                pool.imap_unordered(
                    partial_extract_features_for_one_time_series,
                    [(kind, df_group) for _, df_group in df_grouped_by_id],
                    chunksize=kind_chunksize
                )
            )

        # No more jobs will be submitted.
        pool.close()

        # Wait for the jobs to complete and concatenate the partial results,
        # doing this all with a progress bar.
        result = pd.DataFrame()
        with tqdm(total=total_number_of_expected_results, desc="Feature Extraction",
                  disable=disable_progressbar) as progress_bar:
            for map_result in pending_results:
                # Wrap the map result so the progress bar advances on every new result.
                dfs_kind = iterable_with_tqdm_update(map_result, progress_bar)
                df_tmp = pd.concat(dfs_kind, axis=0).astype(np.float64)

                # Impute the result if requested
                if impute_function is not None:
                    impute_function(df_tmp)

                result = pd.concat([result, df_tmp], axis=1).astype(np.float64)
    except BaseException:
        # Do not leave worker processes running behind a failed extraction.
        pool.terminate()
        raise
    finally:
        # Safe here: the pool has been either closed or terminated at this point.
        pool.join()

    return result