def multi_validate_rows(rows, col_size):
    n_cores = 4
    print('N_CORES', n_cores)
    pool = Pool(n_cores)
    chunks = ((rows[i::n_cores], col_size) for i in range(n_cores))
    pool.imap(validate_rows, chunks)
    pool.close()
    pool.join()
def repackage_revisions(revisions, revision_map, verify_run, staging_dir,
                        context, quit_event=None, progress_event=None):
    """Repackages all Chrome builds listed in revisions.

    This function calls 'repackage_single_revision' with multithreading pool.
    """
    p = Pool(3)
    func = partial(repackage_single_revision, revision_map, verify_run,
                   staging_dir, context)
    p.imap(func, revisions)
    p.close()
    p.join()
def send(accounts, message):
    num = 0
    pool = Pool(processes=cpu_count() * 2)
    for data in accounts:
        email, password = data['email'], data['pass']
        proxy, num = proxies[num], num + 1
        cookie = "cookies/" + str(data['email']) + "_cookie"
        pool.imap(do_send, [(email, password, message, proxy, cookie)])
    pool.close()
def build_condensed_matrix(seqs, mode=2):
    result = np.array([], dtype=default_dtype)
    p = Pool(processes=cpu_count())
    if mode == 1:
        n = len(seqs)
        #chunksize = 500000
        chunksize = int(n * (n - 1) / 2 / cpu_count() / 2)
        result_one = p.imap(get_score, make_iter(seqs, mode=1),
                            chunksize=chunksize)
        result = np.array(list(result_one), dtype=default_dtype)
    else:
        result_one_row = p.imap(get_scores_one_row, make_iter(seqs, mode=2),
                                chunksize=100)
        result = np.concatenate(list(result_one_row))
    #p.close()
    #p.join()
    return result
def translate_concurrent(input_object, url, weights=None, num_processes=8):
    pool = Pool(processes=num_processes)
    text_args = [(line, weights, url) for line in input_object]
    for translated_line in pool.imap(translate_single_line, text_args):
        print translated_line
def main(data, total):
    global all_headlines
    global inv
    with open(INV_DOC_COUNTS) as inf:
        inv = json.load(inf)
    all_headlines = get_headlines(data)
    pool = Pool(1)
    counter = 0
    out_data = []
    for article, possible_headlines in pool.imap(assign_headline_tfidf_total, data[:3]):
        print counter, article["headline"], possible_headlines[0]
        counter += 1
        article["top_tfidf"] = possible_headlines
        out_data += [article]
    with open(RESULT_FILE, 'w') as outf:
        json.dump(out_data, outf)
    num_correct = 0
    num_incorrect = 0
    incrt = []
    for article in out_data:
        if article["headline"] == article["tf_idf_prediction"][0][0]:
            num_correct += 1
        else:
            num_incorrect += 1
            incrt += [article]
    print "Num correct: %i" % num_correct
    print "Num incorrect: %i" % num_incorrect
    with open(OUT_INCORRECT, 'w') as outf:
        json.dump(incrt, outf)
def main(opts): """The main loop of the module, do the renaming in parallel etc.""" log = logging.getLogger("exif2timestream") setup_logs(opts) # beginneth the actual main loop start_time = time() cameras = parse_camera_config_csv(opts["-c"]) n_images = 0 for camera in cameras: msg = "Processing experiment {}, location {}\n".format( camera[FIELDS["expt"]], camera[FIELDS["location"]], ) msg += "Images are coming from {}, being put in {}".format( camera[FIELDS["source"]], camera[FIELDS["destination"]], ) print(msg) log.info(msg) for ext, images in find_image_files(camera).iteritems(): images = sorted(images) n_cam_images = len(images) print("{0} {1} images from this camera".format(n_cam_images, ext)) log.info("Have {0} {1} images from this camera".format( n_cam_images, ext)) n_images += n_cam_images last_date = None subsec = 0 count = 0 # TODO: sort out the whole subsecond clusterfuck if "-1" in opts and opts["-1"]: log.info("Using 1 process (What is this? F*****g 1990?)") for image in images: count += 1 print("Processed {: 5d} Images".format(count), end='\r') process_image((image, camera, ext)) else: from multiprocessing import Pool, cpu_count if "-t" in opts and opts["-t"] is not None: try: threads = int(opts["-t"]) except ValueError: threads = cpu_count() - 1 else: threads = cpu_count() - 1 # Ensure that we're using at least one thread threads = max(threads, 1) log.info("Using {0:d} processes".format(threads)) # set the function's camera-wide arguments args = zip(images, cycle([camera]), cycle([ext])) pool = Pool(threads) for _ in pool.imap(process_image, args): count += 1 print("Processed {: 5d} Images".format(count), end='\r') pool.close() pool.join() print("Processed {: 5d} Images. Finished this cam!".format(count)) secs_taken = time() - start_time print("\nProcessed a total of {0} images in {1:.2f} seconds".format( n_images, secs_taken))
class SimStream(object): def __init__(self, target_dir, configs, super_seed, copy_op=move, num_workers=1): self.template = get_config_template() self.work_dir = tempfile.mkdtemp(prefix='craysim') self.seed_stream = seed_stream(super_seed) self.pool = Pool(num_workers, maxtasksperchild=1) config_stream = izip( configs, repeat(self.work_dir), self.seed_stream, repeat(self.template) ) self.result_stream = self.pool.imap(sim_worker, config_stream, chunksize=1) self.copy_op = copy_op self.target_dir = target_dir def stream(self): for config, stdout, stderr, workspace, output_path in self.result_stream: if output_path is None: raise Exception(stdout + '\n\n' + stderr) try: yield self.copy_op(self.target_dir, config, stdout, stderr) except Exception as e: import traceback import warnings warnings.warn(str(config)) traceback.print_exc() def clean(self): import shutil as sh sh.rmtree(self.work_dir)
def main(out):
    out.write(('P4\n%d %d\n' % (size, size)).encode('ASCII'))
    pool = Pool()
    step = 2.0j / size
    for row in pool.imap(do_row, (step * y - (1.5 + 1j) for y in range(size))):
        out.write(row)
def main():
    parser = ArgumentParser(description="Speed up your SHA. A different hash style.")
    parser.add_argument("-1", "--sha1", action="store_true")
    parser.add_argument("-2", "--sha224", action="store_true")
    parser.add_argument("-3", "--sha256", action="store_true")
    parser.add_argument("-4", "--sha384", action="store_true")
    parser.add_argument("-5", "--sha512", action="store_true")
    parser.add_argument("-f", "--file", type=str, help="The path to the file")
    if len(sys.argv) == 1:
        parser.print_help()
        return
    global args
    args = parser.parse_args()
    hashtree = ""
    big_file = open(args.file, "rb")
    pool = Pool(multiprocessing.cpu_count())
    for chunk_hash in pool.imap(hashing, chunks(big_file)):
        hashtree = hashtree + chunk_hash
    pool.terminate()
    if os.path.getsize(args.file) < 20971520:
        print(hashtree)
    else:
        print(str(hashing(hashtree)))
def main_internal(args, name='mxsniff'): """ Console script >>> main_internal(['*****@*****.**']) [email protected]: google-gmail """ import argparse import json from multiprocessing import Pool parser = argparse.ArgumentParser( prog=name, description='Identify email service providers given an email address, URL or domain name', fromfile_prefix_chars='@') parser.add_argument('names', metavar='email_or_url', nargs='+', help="email or URL to look up; use @filename to load from a file") parser.add_argument('-v', '--verbose', action='store_true', help="show both provider name and mail server names") parser.add_argument('-i', '--ignore-errors', action='store_true', help="ignore DNS lookup errors and continue with next item") args = parser.parse_args(args) pool = Pool(processes=10) it = pool.imap(multiprocess_mxsniff, args.names, 10) try: for result in it: if args.verbose: print(json.dumps(result)) + ',' else: print("{item}: {provider}".format(item=result['query'], provider=', '.join(result['match']))) except KeyboardInterrupt: pool.terminate()
def process(self): try: urls = redis_one.hkeys(self.sitemap_prefix) ofh = open('test_urls.txt', 'w+') urls.sort() ofh.write(('\n'.join(urls)).encode('utf8', 'ignore')) logger.error('total urls len %s' % len(urls)) dict_res = defaultdict(int) i = 0 while i <= len(urls): pool = Pool(processes=15) q = Queue() dict_subres = defaultdict(int) list_urls = [urls[i + j * 10000:i+(j+1)*10000] for j in range(15)] #list_dict_res = list(pool.map_async(parse_content, list_urls)) for d in pool.imap(parse_content, list_urls): for k, v in d.iteritems(): dict_res[k] += v logger.error('Parser %s %s' % (len(list_urls), len(dict_res))) i += 10000 * 15 sorted_dict_res = sorted(dict_res.iteritems(), key = lambda s: s[1], reverse=True) ofh = open('./test_sitemap_keywords', 'w+') ofh.write('\n'.join(['%s\t%s' % (k,v) for (k,v) in sorted_dict_res if v>=3]).encode('utf8', 'ignore')) ofh.close() except: logger.error(traceback.format_exc())
def y():
    pool = Pool(2)
    x, y = ({}, {})
    x = numpy.array([2, 3])
    y = numpy.array([-1, 3])
    for a in pool.imap(change, [conv_str(x), conv_str(y)]):
        print a
def main():
    seq = stdin.read()
    ilen = len(seq)
    seq = sub('>.*\n|\n', '', seq)
    clen = len(seq)
    pool = Pool(initializer=init, initargs=(seq,))
    variants = (
        'agggtaaa|tttaccct',
        '[cgt]gggtaaa|tttaccc[acg]',
        'a[act]ggtaaa|tttacc[agt]t',
        'ag[act]gtaaa|tttac[agt]ct',
        'agg[act]taaa|ttta[agt]cct',
        'aggg[acg]aaa|ttt[cgt]ccct',
        'agggt[cgt]aa|tt[acg]accct',
        'agggta[cgt]a|t[acg]taccct',
        'agggtaa[cgt]|[acg]ttaccct')
    for f in zip(variants, pool.imap(var_find, variants)):
        print(f[0], f[1])

    subst = {
        'B': '(c|g|t)', 'D': '(a|g|t)', 'H': '(a|c|t)', 'K': '(g|t)',
        'M': '(a|c)', 'N': '(a|c|g|t)', 'R': '(a|g)', 'S': '(c|g)',
        'V': '(a|c|g)', 'W': '(a|t)', 'Y': '(c|t)'}
    for f, r in list(subst.items()):
        seq = sub(f, r, seq)

    print()
    print(ilen)
    print(clen)
    print(len(seq))
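# A minimal sketch of the initializer/initargs pattern used in the snippet
# above: each worker process stores the shared, read-only payload in a module
# global exactly once, so it is not re-pickled for every imap task. All names
# here (init, count_occurrences, the toy text) are illustrative, not taken
# from the original program.
from multiprocessing import Pool

_shared = None


def init(payload):
    global _shared
    _shared = payload


def count_occurrences(pattern):
    # runs in a worker process; _shared was set by init()
    return pattern, _shared.count(pattern)


if __name__ == '__main__':
    text = "abracadabra" * 1000
    with Pool(initializer=init, initargs=(text,)) as pool:
        for pattern, n in pool.imap(count_occurrences, ['ab', 'ra', 'cad']):
            print(pattern, n)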
def parmap(f,problems,leavefree=1,debug=False,verbose=False): global mypool problems = list(problems) njobs = len(problems) if njobs==0: if verbose: print('NOTHING TO DO?') return [] if not debug and (not 'mypool' in globals() or mypool is None): if verbose: print('NO POOL FOUND. RESTARTING.') mypool = Pool(cpu_count()-leavefree) enumerator = map(f,problems) if debug else mypool.imap(f,problems) results = {} sys.stdout.write('\n') for i,result in enumerator: sys.stdout.write('\rdone %0.1f%% '%((i+1)*100./njobs)) sys.stdout.flush() if isinstance(result,tuple) and len(result)==1: result=result[0] results[i]=result if verbose and type(result) is RuntimeError: print('ERROR PROCESSING',problems[i]) sys.stdout.write('\r \r') return [results[i] if i in results else None \ for i,k in enumerate(problems)]
def parmap_dict(f,problems,leavefree=1,debug=False,verbose=False): global mypool problems = list(problems) njobs = len(problems) if njobs==0: if verbose: print('NOTHING TO DO?') return [] if not debug and (not 'mypool' in globals() or mypool is None): if verbose: print('NO POOL FOUND. RESTARTING.') mypool = Pool(cpu_count()-leavefree) enumerator = map(f,problems) if debug else mypool.imap(f,problems) results = {} sys.stdout.write('\n') for key,result in enumerator: if isinstance(result,tuple) and len(result)==1: result=result[0] results[key]=result if verbose and type(result) is RuntimeError: print('ERROR PROCESSING',problems[i]) sys.stdout.write('\r \r') results = {key:results[key] for key in problems if key in results and not results[key] is None} return results
def newest_snapshot(project_id, hosts=None, timeout=20):
    """
    Return most recent snapshot or empty string if none.

    If host is a single ip address, return newest snapshot on that host.

    If hosts is a list of ip addresses (or hostnames), returns a dictionary
    with keys the entries in hosts and the values the names of the newest
    snapshots.  Hosts that don't respond within timeout seconds are ignored.
    """
    if not isinstance(hosts, list):
        return _newest_snapshot(project_id, hosts)

    pool = Pool(processes=len(hosts))
    start = time.time()
    x = pool.imap(mp_newest_snapshot, [(project_id, dest) for dest in hosts])
    result = []
    while True:
        try:
            # time remaining before the overall timeout expires
            t = timeout - (time.time() - start)
            if t > 0:
                result.append(x.next(t))
            else:
                raise TimeoutError
        except TimeoutError, mesg:
            log.info("timed out connecting to some destination -- %s", mesg)
            pool.terminate()
            break
        except StopIteration:
            break
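# Sketch of the per-result timeout idiom above: IMapIterator.next() accepts a
# timeout and raises multiprocessing.TimeoutError, which lets the caller give
# up on slow hosts instead of blocking forever. slow_echo and the sleep times
# are made up for illustration.
import time
from multiprocessing import Pool, TimeoutError


def slow_echo(delay):
    time.sleep(delay)
    return delay


if __name__ == '__main__':
    pool = Pool(4)
    it = pool.imap(slow_echo, [0.1, 5.0, 0.2])
    collected = []
    while True:
        try:
            collected.append(it.next(timeout=1.0))
        except TimeoutError:
            pool.terminate()  # drop the stragglers
            break
        except StopIteration:
            break
    print(collected)  # [0.1] -- the 5-second item timed out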
def process(dataset, result):
    """
    For each unknown_author, calculate the distance between his AR and the IR
    of each known_author. Then find the true place of unknown_author.
    Save the true place and distance value into @result.
    """
    global AR_TYPE
    if AR_TYPE.startswith('fixed'):
        ar_authors = loader.get_fixed_authors()
    else:
        ar_authors = dataset.authors[0:40]
    tups = []
    for unknown in ar_authors:
        tups.append((unknown, dataset))
    pool = Pool(processes=NUMBER_OF_CORES)
    it = pool.imap(process_distance_unknown, tups)
    pool.close()
    pool.join()
    for unknown in ar_authors:
        distance_results = it.next()
        for distance_result in distance_results:
            [ar_size, position, distance] = distance_result
            result.add(ar_size, unknown, position, distance)
    return
def subsample(cache_dir, image_sets, ipython_profile): parameters = [(cache_dir, images) for images in image_sets] if ipython_profile: from IPython.parallel import Client, LoadBalancedView client = Client(profile='lsf') lview = client.load_balanced_view() generator = lview.imap(_compute_group_subsample, parameters) elif ipython_profile == False: generator = (_compute_group_subsample(p) for p in parameters) else: from multiprocessing import Pool lview = Pool() generator = lview.imap(_compute_group_subsample, parameters) progress = progressbar.ProgressBar(widgets=['Subsampling:', progressbar.Percentage(), ' ', progressbar.Bar(), ' ', progressbar.Counter(), '/', str(len(parameters)), ' ', progressbar.ETA()], maxval=len(parameters)) results = list(generator) subsample = [] for i, (p, r) in enumerate(zip(parameters, results)): if r is None: print >>sys.stderr, '#### There was an error, recomputing locally: %s' % parameters[i][1] results[i] = _compute_group_subsample(p) # just to see throw the exception subsample.extend(r) print "the subsampling set contains %d items" % len(subsample) return subsample
def run():
    H = np.random.rand(2, 300)
    W = np.random.rand(3000, 300)
    t = time()
    C1 = cdist(H, W, "sqeuclidean")
    print time() - t
    print "done 1"
    t = time()
    k = cpu_count()
    N = H.shape[0]
    idxs = np.array_split(np.arange(N), k * 10)
    jobs = [(H[ix], W, ix) for ix in idxs]
    p = Pool(k)
    C1p = np.empty((N, W.shape[0]))
    # for h, w, ix in jobs:
    #     C1p[ix] = cdist(h, w, "sqeuclidean")
    t2 = time()
    for h0, ix in p.imap(f, jobs):
        C1p[ix] = h0
    print time() - t2
    p.close()
    print time() - t
    assert np.allclose(C1, C1p)
def find_words(self): """ Run all words through find_word using Pool.map """ if not all((self.number, self.wordlist, self.combos)): raise ValueError('Must have a number, a wordlist, and combos!') # TODO: Reduce memory footprint and waste on this whole operation. def format_results(resultsets): """ format final results """ if resultsets: resultsfmt = {} for resultset in resultsets: if resultset: resultsfmt.update(resultset) return resultsfmt return {} # setup a pool of processes/workers. pool = Pool(processes=self.processes) # map find_word to the wordlist, and format final results. rawresult = pool.imap( self.find_word, self.wordlist, chunksize=self.chunksize) results = format_results(rawresult) return results, self.totallen
def main():
    starttime = datetime.now()
    concatenate = False
    parser = argparse.ArgumentParser(description="This program will run KaKs_Calculator on a directory.")
    parser.add_argument("-i", help="Path to input file.")
    parser.add_argument("-o", help="Path to output file.")
    parser.add_argument("-m", default="NG", help="Method for calculating Ka/Ks.")
    parser.add_argument("-t", type=int, default=1, help="Number of threads.")
    # Parse arguments and assign to variables
    args = parser.parse_args()
    indir = args.i
    if indir[-1] != "/":
        indir += "/"
    outdir = args.o
    if outdir[-1] != "/":
        outdir += "/"
    method = args.m
    cpu = args.t
    if cpu > MAXCPU:
        cpu = MAXCPU
    # Call Ka/Ks_Calculator in parallel.
    genes = glob(indir + "*.axt")
    l = int(len(genes))
    pool = Pool(processes=cpu)
    func = partial(calculateKaKs, indir, outdir, method)
    print("\tRunning KaKs_Calculator with", str(cpu), "threads....")
    rcml = pool.imap(func, genes)
    pool.close()
    pool.join()
    # Compile output
    compileKsKs(outdir)
    print("\tKaKs_Calculator runtime: ", datetime.now() - starttime)
def main(formula_list): formulas = open(formula_list).read().split("\n")[:MAX_NUMBER] try: os.mkdir(IMAGE_DIR) except OSError as e: pass # except because throws OSError if dir exists print("Turning formulas into images...") pool = Pool(THREADS) names = list(pool.imap(formula_to_image, formulas)) zipped = list(zip(formulas, names)) new_dataset_lines = [] new_formulas = [] ctr = 0 for formula in zipped: if formula[1] is None: continue for rendering_setup in formula[1]: new_dataset_lines.append(str(ctr) + " " + " ".join(rendering_setup)) new_formulas.append(formula[0]) ctr += 1 with open(NEW_FORMULA_FILE, "w") as f: f.write("\n".join(new_formulas)) with open(DATASET_FILE, "w") as f: f.write("\n".join(new_dataset_lines))
def extract_new_dataframes(dirs):
    pool = Pool(8)
    pbar = tqdm.tqdm(total=len(dirs))
    for job in pool.imap(extract_dataframe_subdir, dirs):
        pbar.update(1)
    pbar.close()
    pool.close()
def process_fasta_file(fasta_file, out_file, num_processes): pool = Pool(processes = num_processes) outgen = pool.imap(SubmitELMServer, fasta_iter(fasta_file), chunksize=2*num_processes) with open(out_file, 'w') as handle: writer = csv.writer(handle, delimiter = '\t') writer.writerow(['Header', 'ELM', 'Start', 'End', 'Match']) for name, html in outgen: if html: try: out = ReadData(html) except: continue logging.warning('%s had %i matches' % (name, len(out))) for elm, pos in out: try: outrow = [name, elm] + extract_numbers(pos[0]) + [pos[1]] except: continue writer.writerow(outrow) else: logging.warning('%s had no ELMs' % name)
def calc_mv_classifier(clf, scorer, regions=None, processes=7, method='sequential'): import os.path as path from tempfile import mkdtemp n_regions = clf.data.shape[0] if regions is None: regions = range(0, n_regions) if processes != 1: from multiprocessing import Pool pool = Pool(processes=processes) else: pool = itertools pb = tools.ProgressBar(len(regions), start=True) filename = path.join(mkdtemp(), 'data.dat') data = np.memmap(filename, dtype='object', mode='w+', shape=clf.comp_dims) data[:] = clf.data[:] overall_results = [] for result in pool.imap(calc_mv_parallel_classifier, itertools.izip(itertools.repeat((filename, clf.classifier, scorer, clf.comp_dims, clf.feature_importances, np.array(clf.feature_names), method)), regions)): pb.next() for row in result: overall_results.append(row) overall_results = pd.DataFrame( overall_results, columns=['score', 'num_features', 'region', 'feature']) overall_results.region += 1 return overall_results
def fit_parallel(self, X, num_workers=4): import gc gc.collect() pool = Pool(num_workers, maxtasksperchild=2) share = min(int(2e5), math.ceil(len(X)/num_workers)) # share = int(1e5) tagger = GrammarTagger() num_parts = math.ceil(len(X)/share) x_gen = (X[i*share:i*share+share] for i in range(num_parts)) # delegate work to all available processes i = 0 for result in pool.imap(Processor(tagger, self.tagtype), x_gen): tag_results, base_struct_results = result for base_struct, count in base_struct_results.items(): self.base_structures[base_struct] += count self.counter += count for tag, terminals in tag_results.items(): for string, count in terminals.items(): self.tag_dicts[tag][string] += count i += 1 log.info("Processed {}/{} result batches...".format(i, num_parts)) log.info("Fitting completed.")
def get_valid_fragments(G, stoich_rank):
    #reactions, complexes = bipartite.sets(G)
    complexes, reactions = bipartite.sets(G)
    complexes = list(complexes)
    reactions = list(reactions)

    if 'w1' not in complexes and 'w1' not in reactions:
        raise Exception('my hack to resolve this unexpected behavior shown by bipartite.sets assumes that reaction nodes are named \'w1\', \'w2\', ...')
    if 'w1' in complexes:
        complexes, reactions = reactions, complexes
    if not ('w1' in reactions and 's1' in complexes):
        raise Exception('Something went wrong generating the lists of complexes of reactions.')

    complex_perms = list(it.combinations(complexes, stoich_rank))
    reaction_perms = list(it.combinations_with_replacement(reactions, stoich_rank))
    fragments = list(it.product(complex_perms, reaction_perms))

    valid_fragments = []
    pool = Pool()
    chunksize = 100
    myval = functools.partial(validate_fragments, G, stoich_rank)
    fragment_list = pool.imap(myval, fragments, chunksize)
    valid_fragments = [f for f in fragment_list if f is not None]

    return get_unique_fragments(valid_fragments)
def main():
    parser = ArgumentParser(description="Speed up your SHA. A different hash style.")
    parser.add_argument('-1', '--sha1', action='store_true')
    parser.add_argument('-2', '--sha224', action='store_true')
    parser.add_argument('-3', '--sha256', action='store_true')
    parser.add_argument('-4', '--sha384', action='store_true')
    parser.add_argument('-5', '--sha512', action='store_true')
    parser.add_argument('-f', '--file', type=str, help="The path to the file")
    if len(sys.argv) == 1:
        parser.print_help()
        return
    global args
    args = parser.parse_args()
    hashtree = ''
    big_file = open(args.file, 'rb')
    pool = Pool(multiprocessing.cpu_count())
    for chunk_hash in pool.imap(hashing, chunks(big_file)):
        hashtree += chunk_hash + ":hash"
    pool.terminate()
    print(str(hashing(hashtree.encode('ascii'))))
def save_make_pseudo_data(
        pred_data_dir='/data/pneumo_log/val_1/2019_0815_1742/submission/snapshot_model_2/',
        zero_max=0.005,
        one_min=0.8,
        cpu_num=16,
        test_base_path='/data/pneumo/dicom-images-test/',
        test_data=True):
    '''
    save pseudo label as dictionary {'img':, 'mask'} under pred_data_dir+'/pseudo/'
    This can be applied to train data (fold) too. set test_data=False
    '''
    if test_data:
        save_path = pred_data_dir + '/pseudo/'
    else:
        save_path = pred_data_dir + '/pseudo_train_fold/'
    data_prep._make_dir(save_path)
    print('start to make pseudo label under {}'.format(save_path))
    pred_data_path_list = glob(pred_data_dir + '/*.npy')
    p = Pool(processes=cpu_num)
    job_args = [(pred_data_path, save_path, zero_max, one_min,
                 test_base_path, test_data)
                for pred_data_path in pred_data_path_list]
    list(tqdm(p.imap(_wrap_save_pseudo_label, job_args), total=len(job_args)))
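# The list(tqdm(p.imap(...), total=...)) idiom above is a common way to put a
# progress bar over a lazily evaluated imap. A self-contained sketch with a
# made-up worker:
from multiprocessing import Pool

from tqdm import tqdm


def square(x):
    return x * x


if __name__ == '__main__':
    with Pool(4) as pool:
        results = list(tqdm(pool.imap(square, range(1000)), total=1000))
    print(sum(results))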
def parse_nl_data(path, outpath): Path(outpath).mkdir(parents=True, exist_ok=True) pool = Pool(cpu_count()) total_files = sum(1 for _ in glob.glob("{}/*.json".format(path))) for part in range(total_files): with open('{}/split-{:03d}.json'.format(path, part), 'r', encoding='utf-8') as f: data = [json.loads(line.strip()) for line in f] results = [] with tqdm(total=len(data), desc='Processing') as pbar: for i, ex in enumerate(pool.imap(process_chunk, data, 1000)): pbar.update() tokens = ex.split() if len(tokens) > 10: results.append(' '.join(tokens)) if part == total_files - 1: with open('{}/test.description.txt'.format(outpath), 'w', encoding='utf-8') as fw: fw.write('\n'.join(results[:10000])) with open('{}/valid.description.txt'.format(outpath), 'w', encoding='utf-8') as fw: fw.write('\n'.join(results[10000:20000])) with open('{}/train.{}.description.txt'.format(outpath, part), 'w', encoding='utf-8') as fw: fw.write('\n'.join(results[20000:])) else: with open('{}/train.{}.description.txt'.format(outpath, part), 'w', encoding='utf-8') as fw: fw.write('\n'.join(results))
def main(data_path: str = None): ''' Download quarterly and base data from https://finance.yahoo.com Parameters ---------- data_path: path to folder in which downloaded data will be stored. OR ``None`` (downloading path will be as ``yahoo_data_path`` from `~/.ml_investment/config.json` ''' if data_path is None: config = load_config() data_path = config['yahoo_data_path'] global _data_path _data_path = data_path tickers = load_tickers()['base_us_stocks'] os.makedirs('{}/quarterly'.format(data_path), exist_ok=True) os.makedirs('{}/base'.format(data_path), exist_ok=True) p = Pool(12) for _ in tqdm(p.imap(_single_ticker_download, tickers)): None
def cross_validate(answersets, labels, cfier_factory, num_rounds=num_rounds):
    """ Perform num_rounds-fold cross-validation of the model, returning
    the list of test scores in each fold. """
    # Do not pass cv_data as parameters as that'll create a separate copy
    # for each sub-process, dramatically increasing memory requirements;
    # 16GB RAM is not enough for 8-thread cross-validation on large2180.
    global _g_cv_data
    _g_cv_data = (answersets, labels, cfier_factory)

    processes = os.environ.get('ANSWERTRAIN_N_THREADS',
                               os.environ.get('YODAQA_N_THREADS', None))
    if processes is not None:
        processes = int(processes)
    pool = Pool(processes=processes)
    scores = []
    for res in pool.imap(cross_validate_one, range(num_rounds)):
        print('// (test) ' + test_msg(*res))
        scores.append(list(res))
    pool.close()
    return np.array(scores)
def _f(self, w): # it turned out that it doesn't pay off to evaluate the function # in separate processes, so we turn it off if False: # self.multicore: likelihood = 0 pool = Pool() try: for i, (f_, d_) in enumerate( pool.imap( with_tracing(_methodcaller('_f', sideeffects=True)), [(l, w) for l in self.learners])): self.learners[i].__dict__ = d_ likelihood += f_ except Exception as e: logger.error('Error in child process. Terminating pool...') pool.close() raise e finally: pool.terminate() pool.join() return likelihood else: return sum([l._f(w) for l in self.learners])
def calculate(self, data_loader, info_df: pd.DataFrame) -> pd.DataFrame: ''' Interface to calculate targets for dates and tickers in info_df based on data from data_loader Parameters ---------- data_loader: class implements load_quarterly_data(tickers: List[str]) -> pd.DataFrame interface info_df: pd.DataFrame containing information of tickers and dates to calculate targets for. Should have columns: ["ticker", "date"]. Returns ------- pd.DataFrame with targets having 'y' column ''' self._data_loader = data_loader grouped = info_df.groupby('ticker')['date'].apply( lambda x: x.tolist()).reset_index() params = [(ticker, dates) for ticker, dates in grouped.values] n_jobs = 10 p = Pool(n_jobs) result = [] for ticker_result in tqdm(p.imap(self._single_ticker_target, params)): result.append(ticker_result) result = pd.concat(result, axis=0) result = result.drop_duplicates(['ticker', 'date']) result = pd.merge(info_df, result, on=['ticker', 'date'], how='left') result = result.set_index(['ticker', 'date']) result = result.infer_objects() return result
def create_image_thumbs(self): ''' Create output thumbs in 32px, 64px, and 128px ''' print(' * creating image thumbs') resize_args = [] n_thumbs = len(self.image_files) for c, j in enumerate(self.image_files): sizes = [] out_paths = [] for i in sorted(self.sizes, key=int, reverse=True): out_dir = join(self.output_dir, 'thumbs', str(i) + 'px') out_path = join( out_dir, get_filename(j) + '.png' ) if os.path.exists(out_path) and not self.rewrite_image_thumbs: continue sizes.append(i) out_paths.append(out_path) if len(sizes) > 0: resize_args.append([j, c, n_thumbs, sizes, out_paths]) pool = Pool() for result in pool.imap(resize_thumb, resize_args): if result: self.errored_images.add( get_filename(result) )
def remove_incomplete_flows(self): print('\nStarted removing incomplete flows.') pool = Pool(self.num_worker) num_cases = np.max(self.csv_file['Case_ID']) + 1 print('\tFound number of cases.') case_ids = range(num_cases) cases = [] for i in case_ids: cases.append( self.csv_file.loc[self.csv_file['Case_ID'] == i].values) print('\tSeparated cases.') sub_cases = [] k, m = divmod(num_cases, self.num_worker) for i in range(self.num_worker): head = i * k tail = head + k if i == self.num_worker - 1: tail = num_cases sub_cases.append(cases[head:tail]) processed = [] print('\tBuilt subsets.') for chunk in pool.imap(Functions.fun_remove_incomplete, sub_cases): processed.append(np.concatenate(chunk, axis=0)) processed = np.concatenate(processed, axis=0) self.csv_file.loc[:, :] = processed self.csv_file = self.csv_file[self.csv_file['Flags'] != 'Bad'] print('Removed incomplete flows.')
def _preprocess_docs_odin(self, texts, vocabulary, keep_order):
    # ====== main processing ====== #
    def initializer(filters, preprocessors, lang, lemma, charlevel, stopwords):
        globals()['__preprocessors'] = preprocessors
        globals()['__filters'] = filters
        globals()['__lang'] = lang
        globals()['__lemma'] = lemma
        globals()['__charlevel'] = charlevel
        globals()['__stopwords'] = stopwords

    # add the index for ordering
    nb_docs = 0
    pool = Pool(processes=self.nb_processors,
                initializer=initializer,
                initargs=(self.filters, self.preprocessors, self.language,
                          self.lemmatization, self.char_level, self.stopwords))
    # return the tokenized documents as original order.
    if keep_order:
        it = pool.imap(func=_preprocess_func, iterable=texts,
                       chunksize=self.batch_size)
    # don't care about the order, often used for fitting
    else:
        it = pool.imap_unordered(func=_preprocess_func, iterable=texts,
                                 chunksize=self.batch_size)
    # iterate over each return document
    for doc in it:
        nb_docs += 1
        if vocabulary is not None:
            doc = [token for token in doc if token in vocabulary]
        yield nb_docs, doc
    pool.close()
    pool.join()
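# Sketch of the ordered vs. unordered choice made above: imap yields results in
# input order, while imap_unordered yields them as they complete, which is
# usually faster when per-item runtimes vary. The toy worker is for
# illustration only.
import random
import time
from multiprocessing import Pool


def tag(x):
    time.sleep(random.random() * 0.01)
    return x


if __name__ == '__main__':
    with Pool(4) as pool:
        ordered = list(pool.imap(tag, range(8)))              # always [0..7]
        unordered = list(pool.imap_unordered(tag, range(8)))  # completion order
    print(ordered)
    print(unordered)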
def main(args): import sys import os sys.path.append( os.path.normpath( os.path.join(os.path.dirname(__file__), '..', 'helpers'))) # how to search for all ground truth searchFine = os.path.join(args.datadir, "gtFine", "*", "*", "*_gt*_polygons.json") # search files filesFine = glob.glob(searchFine) filesFine.sort() files = filesFine if not files: tqdm.writeError("Did not find any files. Please consult the README.") # a bit verbose tqdm.write("Processing {} annotation files".format(len(files))) # iterate through files progress = 0 tqdm.write("Progress: {:>3} %".format(progress * 100 / len(files)), end=' ') from multiprocessing import Pool import time pool = Pool(args.num_workers) # results = pool.map(process_pred_gt_pair, pairs) results = list(tqdm(pool.imap(process_folder, files), total=len(files))) pool.close() pool.join()
def eval_on_dev(self):
    t = time.time()
    print >> logs, "garbage collection...",
    gc.collect()
    print >> logs, "took %.1f seconds" % (time.time() - t)

    Parser.debuglevel = 0
    if FLAGS.multi != 1:
        ncpus = FLAGS.multi
        print >> logs, "using %d CPUs for eval... chunksize=%d" % (
            ncpus, len(self.devchunks[0]))
        tot = self.decoder.evalclass()
        pool = Pool(processes=ncpus)
        for sub in pool.imap(self.eval_worker, self.devchunks, chunksize=1):
            tot += sub
    else:
        tot = self.eval_worker(self.devlines)
    Parser.debuglevel = FLAGS.debuglevel  # restore
    return tot
def main(formula_list): formulas = open(formula_list).read().split("\n")[:MAX_NUMBER] try: os.mkdir(IMAGE_DIR) except OSError as e: pass #except because throws OSError if dir exists print("Turning formulas into images...") # Running a thread pool masks debug output. Uncomment command below to run # formulas over images sequentially to see debug errors more clearly # names = [formula_to_image(formula) for formula in formulas] # Also remember to comment threaded version if you use sequential: pool = Pool(THREADS) names = list(pool.imap(formula_to_image, formulas)) zipped = list(zip(formulas, names)) new_dataset_lines = [] new_formulas = [] ctr = 0 for formula in zipped: if formula[1] is None: continue for rendering_setup in formula[1]: new_dataset_lines.append( str(ctr) + " " + " ".join(rendering_setup)) new_formulas.append(formula[0]) ctr += 1 with open(NEW_FORMULA_FILE, "w") as f: f.write("\n".join(new_formulas)) with open(DATASET_FILE, "w") as f: f.write("\n".join(new_dataset_lines))
def main(): n_proc = 50 #blensor_result_path = args.br_path root_path = 'data/scannet' gt_dir = "gtFine" img_dir = "leftImg8bit" imglists = "imglists" dirs = [gt_dir, img_dir, imglists] splits = [0.5, 0.3, 0.2] # train/val/test splits = np.cumsum(splits) splits_dict = {0: "train", 1: "val", 2: "test"} check_mkdir(root_path) for i in range(len(dirs)): d = os.path.join(root_path, dirs[i]) check_mkdir(d) for spl in splits_dict.values(): check_mkdir(os.path.join(d, spl)) ids_ = os.listdir(blensor_result_path) # eg. scene0041_01 l = multiprocessing.Lock() pool = Pool(n_proc, initializer=init_pool, initargs=(l, )) files = {} for spl in splits_dict.values(): files[spl] = open("%s/%s/%s.lst" % (root_path, imglists, spl), 'w') len_ids = len(ids_) workers = {} for spl in splits_dict.values(): workers['train'] = lambda i: worker(i, spl, img_dir, gt_dir, files) pool.imap(worker, ids_[:splits[0] * len_ids]) pool.imap(worker, ids_[splits[0] * len_ids:(splits[0] + splits[1]) * len_ids]) pool.imap(worker, ids_[(splits[0] + splits[1]) * len_ids:]) pool.close() pool.join()
def represent(self, molecules): """ provides coulomb matrix representation for input molecules. Parameters ---------- molecules : chemml.chem.Molecule object or array If list, it must be a list of chemml.chem.Molecule objects, otherwise we raise a ValueError. In addition, all the molecule objects must provide the XYZ information. Please make sure the XYZ geometry has been stored or optimized in advance. Returns ------- features : Pandas DataFrame A data frame with same number of rows as number of molecules will be returned. The exact shape of the dataframe depends on the type of CM as follows: - shape of Unsorted_Matrix (UM): (n_molecules, max_n_atoms**2) - shape of Unsorted_Triangular (UT): (n_molecules, max_n_atoms*(max_n_atoms+1)/2) - shape of eigenspectrums (E): (n_molecules, max_n_atoms) - shape of Sorted_Coulomb (SC): (n_molecules, max_n_atoms*(max_n_atoms+1)/2) - shape of Random_Coulomb (RC): (n_molecules, nPerm * max_n_atoms * (max_n_atoms+1)/2) """ # check input molecules if isinstance(molecules, (list, np.ndarray)): molecules = np.array(molecules) elif isinstance(molecules, Molecule): molecules = np.array([molecules]) else: msg = "The molecule must be a chemml.chem.Molecule object or a list of objects." raise ValueError(msg) if molecules.ndim > 1: msg = "The molecule must be a chemml.chem.Molecule object or a list of objects." raise ValueError(msg) self.n_molecules_ = molecules.shape[0] # max number of atoms based on the list of molecules if self.max_n_atoms_ == 'auto': try: self.max_n_atoms_ = max( [m.xyz.atomic_numbers.shape[0] for m in molecules]) except: msg = "The xyz representation of molecules is not available." raise ValueError(msg) # pool of processes if self.n_jobs == -1: self.n_jobs = cpu_count() pool = Pool(processes=self.n_jobs) # Create an iterator # http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): yield l[i:i + n] # find size of each batch batch_size = int(len(molecules) / self.n_jobs) if batch_size == 0: batch_size = 1 molecule_chunks = chunks(molecules, batch_size) # MAP: CM in parallel map_function = partial(self._represent) if self.verbose: print('featurizing molecules in batches of %i ...' % batch_size) pbar = Progbar(len(molecules), width=50) tensor_list = [] for tensors in pool.imap(map_function, molecule_chunks): pbar.add(len(tensors[0])) tensor_list.append(tensors) print('Merging batch features ... ', end='') else: tensor_list = pool.map(map_function, molecule_chunks) if self.verbose: print('[DONE]') # REDUCE: Concatenate the obtained tensors pool.close() pool.join() return pd.concat(tensor_list, axis=0, ignore_index=True)
import subprocess
from multiprocessing import Pool


def runACO(runid):
    result = subprocess.check_output(['python3', 'RunACOExperiments.py'])
    return str(runid) + "," + result.decode('utf-8')


if __name__ == "__main__":
    runs = 25
    p = Pool(runs)
    outputs = p.imap(runACO, range(runs))
    for output in outputs:
        print(output)
print("Loading model...") model = VGG16(weights=None, pooling=pool, include_top=False) model.load_weights( '../pretrained/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5') n_items = Value('i', -1) # Async number of items sparse_features = [] # items_ids = [] pool = Pool(2) bar = None X_batch = [] try: # Threaded generator is usful for both parallel blocking read and to limit # items buffered by pool.imap (may cause OOM) generator = ThreadedGenerator(generate_files(n_items), 50) for item_id, im in pool.imap(im_decode_resize, generator): if bar is None: bar = tqdm(total=n_items.value, mininterval=bar_iterval, unit_scale=True) # Replace None with empty image if im is None: im = empty_im X_batch.append(im) # items_ids.append(item_id) del im if len(X_batch) == batch_size: sparse_features.append(predict_batch(model, X_batch))
def rerank_by_m2(): data_dir = ASSESS_DIR + "data/" k_best_dir = data_dir + "K-best/" system_file = k_best_dir + "conll14st.output.1.best100" reference_dir = data_dir + "references/" first_nucle = reference_dir + "NUCLEA.m2" combined_nucle = reference_dir + "NUCLE.m2" BN = reference_dir + "BN.m2" ALL = reference_dir + "ALL.m2" gold_files = [first_nucle, combined_nucle, BN, ALL] (path, dirs, files) = next(os.walk(reference_dir)) for fl in files: if "subset" in fl: gold_files.append(path + fl) calculations_dir = "calculations_data/" output_file = "first_rank_results" for gold_file in gold_files: out_text_file = calculations_dir + \ output_file + name_extension(gold_file)[0] out_res_file = calculations_dir + "prf_" + \ output_file + name_extension(gold_file)[0] if os.path.isfile(out_text_file): print("file already found", out_text_file) else: print("processing " + gold_file) source_sentences, gold_edits = m2scorer.load_annotation(gold_file) # load system hypotheses fin = m2scorer.smart_open(system_file, 'r') system_sentences = [line.strip() for line in fin.readlines()] fin.close() # pack and parse RoRo's k-best packed_system_sentences = get_roro_packed(system_sentences) # candidate_num = 0 # for sentence_num, (source, this_edits) in enumerate(zip(source_sentences, gold_edits)): # curr_sentences = [] # # keep packing until reached another sentence, assumes k-best are consequetive # while (candidate_num < len(system_sentences) and # system_sentences[candidate_num].split()[0] == str(sentence_num)): # sentence = re.sub("\|\d+-\d+\| ","",system_sentences[candidate_num].split("|||")[1][1:]) # candidate_num += 1 # curr_sentences.append(sentence) # packed_system_sentences.append(curr_sentences) # print(len(packed_system_sentences), len(gold_edits), # len(source_sentences)) # find top ranking pool = Pool(POOL_SIZE) assert (len(packed_system_sentences) == len(gold_edits) and len(gold_edits) == len(source_sentences)) results = pool.imap( M2SCORER_oracle, zip(source_sentences, gold_edits, packed_system_sentences)) pool.close() pool.join() results = list(results) sentences = "\n".join(list(zip(*results))[0]) results = list(zip(*results))[1] results = "\n".join([str(x) for x in results]) print("writing to " + out_text_file) with codecs.open(out_text_file, "w+", "utf-8") as fl: fl.write(sentences) with open(out_res_file, "w+") as fl: fl.write(results)
from multiprocessing import Pool


def md5_file(filename):
    with open(filename) as f:
        return (hashlib.md5(f.read()).hexdigest(), filename)


directories = [
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "p", "q", "r", "s", "t", "the", "u", "v", "w", "x", "y"
]

try:
    base_directory = sys.argv[1]
    pool = Pool(8)
    with open("md5sums.txt", "w") as f:
        writer = csv.writer(f)
        for d in directories:
            print "Calculating hashes for the {} directory.".format(d)
            image_files = glob.iglob("{}/{}/*".format(base_directory, d))
            for hash_and_name in pool.imap(md5_file, image_files):
                writer.writerow(hash_and_name)
except IndexError:
    print "{0}: Syntax: {0} <album covers base directory>".format(sys.argv[0])
    sys.exit(0)
def _run(self): """ verbose: whether to print results (or anything at all, in fact) details: (given that verbose is true) whether to output additional status information debug: (given that verbose is true) if true, outputs debug information, in particular the distribution over possible worlds debugLevel: level of detail for debug mode """ # check consistency with hard constraints: self._watch.tag('check hard constraints', verbose=self.verbose) hcgrounder = FastConjunctionGrounding(self.mrf, simplify=False, unsatfailure=True, formulas=[f for f in self.mrf.formulas if f.weight == HARD], **(self._params + {'multicore': False, 'verbose': False})) for gf in hcgrounder.itergroundings(): if isinstance(gf, Logic.TrueFalse) and gf.truth() == .0: raise SatisfiabilityException('MLN is unsatisfiable due to hard constraint violation by evidence: {} ({})'.format(str(gf), str(self.mln.formula(gf.idx)))) self._watch.finish('check hard constraints') # compute number of possible worlds worlds = 1 for variable in self.mrf.variables: values = variable.valuecount(self.mrf.evidence) worlds *= values numerators = [0.0 for i in range(len(self.queries))] denominator = 0. # start summing logger.debug("Summing over %d possible worlds..." % worlds) if worlds > 500000 and self.verbose: print colorize('!!! %d WORLDS WILL BE ENUMERATED !!!' % worlds, (None, 'red', True), True) k = 0 self._watch.tag('enumerating worlds', verbose=self.verbose) global global_enumAsk global_enumAsk = self bar = None if self.verbose: bar = ProgressBar(width=100, steps=worlds, color='green') if self.multicore: pool = Pool() logger.debug('Using multiprocessing on {} core(s)...'.format(pool._processes)) try: for num, denum in pool.imap(with_tracing(eval_queries), self.mrf.worlds()): denominator += denum k += 1 for i, v in enumerate(num): numerators[i] += v if self.verbose: bar.inc() except Exception as e: logger.error('Error in child process. Terminating pool...') pool.close() raise e finally: pool.terminate() pool.join() else: # do it single core for world in self.mrf.worlds(): # compute exp. sum of weights for this world num, denom = eval_queries(world) denominator += denom for i, _ in enumerate(self.queries): numerators[i] += num[i] k += 1 if self.verbose: bar.update(float(k) / worlds) logger.debug("%d worlds enumerated" % k) self._watch.finish('enumerating worlds') if 'grounding' in self.grounder.watch.tags: self._watch.tags['grounding'] = self.grounder.watch['grounding'] if denominator == 0: raise SatisfiabilityException( 'MLN is unsatisfiable. All probability masses returned 0.') # normalize answers dist = map(lambda x: float(x) / denominator, numerators) result = {} for q, p in zip(self.queries, dist): result[str(q)] = p return result
def represent(self, molecules): """ provides bag of bonds representation for input molecules. Parameters ---------- molecules : chemml.chem.Molecule object or array If list, it must be a list of chemml.chem.Molecule objects, otherwise we raise a ValueError. In addition, all the molecule objects must provide the XYZ information. Please make sure the XYZ geometry has been stored or optimized in advance. Returns ------- features : pandas data frame, shape: (n_molecules, max_length_of_combinations) The bag of bond features. """ if isinstance(molecules, (list, np.ndarray)): molecules = np.array(molecules) elif isinstance(molecules, Molecule): molecules = np.array([molecules]) else: msg = "The input molecules must be a chemml.chem.Molecule object or a list of objects." raise ValueError(msg) if molecules.ndim > 1: msg = "The molecule must be a chemml.chem.Molecule object or a list of objects." raise ValueError(msg) # pool of processes if self.n_jobs == -1: self.n_jobs = cpu_count() pool = Pool(processes=self.n_jobs) # Create an iterator # http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): yield l[i:i + n] # find size of each batch batch_size = int(len(molecules) / self.n_jobs) if batch_size == 0: batch_size = 1 molecule_chunks = chunks(molecules, batch_size) # MAP: CM in parallel map_function = partial(self._represent) if self.verbose: print('featurizing molecules in batches of %i ...' % batch_size) pbar = Progbar(len(molecules), width=50) bbs_info = [] for tensors in pool.imap(map_function, molecule_chunks): pbar.add(len(tensors[0])) bbs_info.append(tensors) print('Merging batch features ... ', end='') else: bbs_info = pool.map(map_function, molecule_chunks) if self.verbose: print('[DONE]') # REDUCE: Concatenate the obtained tensors pool.close() pool.join() return self.concat_mol_features(bbs_info)
"noretrmedian": noretrmedian, "noretrmin": noretrmin, "noretrmax": noretrmax, "neverretrmedian": neverretrmedian, "neverretrmin": neverretrmin, "neverretrmax": neverretrmax, } for val in [ noretrmedian, noretrmin, noretrmax, neverretrmedian, neverretrmin, neverretrmax ]: results[val] = {} print "Calculating VICBF parameters..." pool = Pool(maxtasksperchild=1) resiter = pool.imap(find_params, results.keys()) pbar = ProgressBar(maxval=len(results.keys())) pbar.start() c = 0 for i in results.keys(): n = resiter.next() c += 1 pbar.update(c) results[i] = { "hash_functions": str(n[0]), "slots": str(n[1]), "probability": str(n[2]), "len_uncompressed": str(n[1] + 10), "len_compressed": str(int(round((n[1] + 10) * 0.52))) }
def main(): """ Helper script to encode raw text with the GPT-2 BPE using multiple processes. The encoder.json and vocab.bpe files can be obtained here: - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.zbpe """ parser = argparse.ArgumentParser() parser.add_argument( "--model-file", help='path to encoder.json', ) parser.add_argument( "--inputs", nargs="+", default=['-'], help="input files to filter/encode", ) parser.add_argument( "--outputs", nargs="+", default=['-'], help="path to save encoded outputs", ) parser.add_argument( "--keep-empty", action="store_true", help="keep empty lines", ) parser.add_argument("--max_len", type=int, default=510) parser.add_argument("--workers", type=int, default=20) args = parser.parse_args() assert len(args.inputs) == len(args.outputs), \ "number of input and output paths should match" with contextlib.ExitStack() as stack: inputs = [ stack.enter_context(open(input, "r", encoding="utf-8")) if input != "-" else sys.stdin for input in args.inputs ] outputs = [ stack.enter_context(open(output, "w", encoding="utf-8")) if output != "-" else sys.stdout for output in args.outputs ] encoder = MultiprocessingEncoder(args) pool = Pool(args.workers, initializer=encoder.initializer) encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100) stats = Counter() for i, (filt, enc_lines) in enumerate(encoded_lines, start=1): if filt == "PASS": for enc_line, output_h in zip(enc_lines, outputs): print(enc_line, file=output_h) else: stats["num_filtered_" + filt] += 1 if i % 10000 == 0: print("processed {} lines".format(i), file=sys.stderr) for k, v in stats.most_common(): print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
class Simpletxt2Json(): def __init__(self, dst_version, city, sub_imageset_folds, multi_processing=False, num_processor=16): self.splitted_image_dir = './data/buildchange/{}/{}/images'.format( dst_version, city) self.splitted_label_dir = './data/buildchange/{}/{}/labels'.format( dst_version, city) self.json_dir = './data/buildchange/v2/{}/labels_json'.format(city) self.wrong_shp_file_dict = dict() for sub_fold in sub_imageset_folds[city]: wrong_file = './data/buildchange/v0/{}/{}/wrongShpFile.txt'.format( city, sub_fold) ori_filenames = self.read_wrong_file(wrong_file) self.wrong_shp_file_dict[sub_fold] = ori_filenames wwtool.mkdir_or_exist(self.json_dir) self.city = city self.multi_processing = multi_processing self.pool = Pool(num_processor) def read_wrong_file(self, wrong_file): ori_filenames = [] with open(wrong_file, 'r') as f: lines = f.readlines() for line in lines: ori_filename = line.strip('\n').split('/')[-1].split('.csv')[0] ori_filenames.append(ori_filename) return ori_filenames def simpletxt_parse(self, label_file): """parse simpletxt style dataset label file Arguments: label_file {str} -- label file path Returns: dict, {'bbox': [...], 'label': class_name} -- objects' location and class """ with open(label_file, 'r') as f: lines = f.readlines() objects = [] basic_label_str = " " for line in lines: object_struct = dict() line = line.rstrip().split(' ') label = basic_label_str.join(line[-1]) polygon = [float(_) for _ in line[0:-1]] object_struct['polygon'] = polygon object_struct['label'] = label objects.append(object_struct) return objects def get_footprint(self, mask, coordinate, roof_polygons, roof_properties): # print(mask, coordinate, roof_polygon, roof_property) transform_matrix = [1, 0, 0, 1, coordinate[0], coordinate[1]] roi_mask = affinity.affine_transform(mask, transform_matrix) # print("move: ", mask, moved_mask, coordinate) for idx, roof_polygon in enumerate(roof_polygons): if roof_polygon.equals(roi_mask): xoffset = roof_properties[idx].to_dict()['xoffset'] yoffset = roof_properties[idx].to_dict()['yoffset'] break else: xoffset, yoffset = 0, 0 transform_matrix = [1, 0, 0, 1, -coordinate[0], -coordinate[1]] split_mask = affinity.affine_transform(roi_mask, transform_matrix) transform_matrix = [1, 0, 0, 1, -xoffset, -yoffset] footprint_polygon = affinity.affine_transform(split_mask, transform_matrix) return footprint_polygon, xoffset, yoffset def simpletxt2json(self, image_fn): # 1. 
open the ignore file and get the polygons base_name = wwtool.get_basename(image_fn) sub_fold = base_name.split("__")[0].split('_')[0] ori_image_fn = "_".join(base_name.split("__")[0].split('_')[1:]) # if ori_image_fn in self.wrong_shp_file_dict[sub_fold]: # print("Skip this wrong shape file") # return coord_x, coord_y = base_name.split("__")[1].split( '_') # top left corner coord_x, coord_y = int(coord_x), int(coord_y) print( f"splitted items: {self.city}, {sub_fold}, {ori_image_fn}, {(coord_x, coord_y)}" ) ignore_file = './data/buildchange/{}/{}/{}/pixel_anno_v2/{}'.format( src_version, self.city, sub_fold, ori_image_fn + '.png') # print("ignore file name: ", ignore_file) roof_shp_file = './data/buildchange/{}/{}/{}/roof_shp_4326/{}'.format( src_version, self.city, sub_fold, ori_image_fn + '.shp') geo_info_file = './data/buildchange/{}/{}/{}/geo_info/{}'.format( src_version, self.city, sub_fold, ori_image_fn + '.png') objects = shp_parser(roof_shp_file, geo_info_file) roof_polygon_4326 = [obj['converted_polygon'] for obj in objects] roof_property = [obj['converted_property'] for obj in objects] pixel_anno = cv2.imread(ignore_file) if pixel_anno is None: return objects = mask_parser(pixel_anno[coord_y:coord_y + sub_img_h, coord_x:coord_x + sub_img_w, :], category=255) if objects == []: return ignore_polygons = [obj['polygon'] for obj in objects] # print("ignore polygon: ", ignore_polygons) # 2. read the simpletxt file and convert to polygons objects = self.simpletxt_parse( os.path.join(self.splitted_label_dir, base_name + '.txt')) roof_polygons = [ wwtool.mask2polygon(obj['polygon']) for obj in objects ] # print("roof polygon: ", roof_polygons) _, ignore_indexes = wwtool.cleaning_polygon_by_polygon( roof_polygons[:], ignore_polygons, show=False) ignore_list = len(roof_polygons) * [0] for ignore_index in ignore_indexes: ignore_list[ignore_index] = 1 new_anno_objects = [] for idx, roof_polygon in enumerate(roof_polygons): footprint_polygon, xoffset, yoffset = self.get_footprint( roof_polygon, [coord_x, coord_y], roof_polygon_4326, roof_property) object_struct = dict() ignore_flag = ignore_list[idx] object_struct['roof'] = wwtool.polygon2mask(roof_polygon) object_struct['footprint'] = wwtool.polygon2mask(footprint_polygon) object_struct['offset'] = [xoffset, yoffset] object_struct['ignore'] = ignore_flag new_anno_objects.append(object_struct) image_info = { "ori_filename": ori_image_fn + '.jpg', "subimage_filename": image_fn, "width": 1024, "height": 1024, "city": self.city, "sub_fold": sub_fold, "coordinate": [coord_x, coord_y] } json_data = {"image": image_info, "annotations": new_anno_objects} json_file = os.path.join(self.json_dir, f'{base_name}.json') with open(json_file, "w") as jsonfile: json.dump(json_data, jsonfile, indent=4) def core(self): if self.multi_processing: image_fn_list = os.listdir(self.splitted_image_dir) num_image = len(image_fn_list) worker = partial(self.simpletxt2json) # self.pool.map(worker, image_fn_list) ret = list( tqdm.tqdm(self.pool.imap(worker, image_fn_list), total=num_image)) self.pool.close() self.pool.join() else: image_fn_list = os.listdir(self.splitted_image_dir) progress_bar = mmcv.ProgressBar(len(image_fn_list)) for _, image_fn in enumerate(image_fn_list): self.simpletxt2json(image_fn) progress_bar.update() def __getstate__(self): self_dict = self.__dict__.copy() del self_dict['pool'] return self_dict def __setstate__(self, state): self.__dict__.update(state)
def main(): parser = argparse.ArgumentParser( prog='PrePARE', description='Validate CMIP6 file for ESGF publication.') parser.add_argument( '-l', '--log', metavar='CWD', type=str, const='{}/logs'.format(os.getcwd()), nargs='?', help='Logfile directory. Default is the working directory.\n' 'If not, standard output is used. Only available in multiprocessing mode.' ) parser.add_argument('--variable', help='Specify geophysical variable name.\n' 'If not variable is deduced from filename.') parser.add_argument( '--table-path', action=DIRECTORYAction, default=os.environ['CMIP6_CMOR_TABLES'] if 'CMIP6_CMOR_TABLES' in list(os.environ.keys()) else './Tables', help='Specify the CMIP6 CMOR tables path (JSON file).\n' 'If not submitted read the CMIP6_CMOR_TABLES environment variable if exists.\n' 'If a directory is submitted table is deduced from filename (default is "./Tables").' ) parser.add_argument( '--max-processes', metavar='4', type=processes_validator, default=4, help= 'Number of maximal processes to simultaneously treat several files.\n' 'Set to one seems sequential processing (default). Set to "-1" seems\n' 'all available resources as returned by "multiprocessing.cpu_count()".' ) parser.add_argument( '--all', action='store_true', default=False, help= 'Show all results. Default only shows error(s) (i.e., file(s) not compliant)' ) parser.add_argument( '--ignore-dir', metavar="PYTHON_REGEX", type=str, default='^.*/\.[\w]*$', help='Filter directories NON-matching the regular expression.\n' 'Default ignores paths with folder name(s) starting with "."') parser.add_argument('--include-file', metavar='PYTHON_REGEX', type=regex_validator, action='append', help='Filter files matching the regular expression.\n' 'Duplicate the flag to set several filters.\n' 'Default only include NetCDF files.') parser.add_argument( '--exclude-file', metavar='PYTHON_REGEX', type=regex_validator, action='append', help='Filter files NON-matching the regular expression.\n' 'Duplicate the flag to set several filters.\n' 'Default only exclude hidden files (with names not\n' 'starting with ".").') parser.add_argument( 'input', nargs='+', action=INPUTAction, help= 'Input CMIP6 netCDF data to validate (ex: clisccp_cfMon_DcppC22_NICAM_gn_200001-200001.nc).\n' 'If a directory is submitted all netCDF recursively found will be validate independently.' 
) # Check command-line error try: args = parser.parse_args() except argparse.ArgumentTypeError as errmsg: print(str(errmsg), file=sys.stderr) return 1 except SystemExit: return 1 # Get log logname = 'PrePARE-{}.log'.format(datetime.now().strftime("%Y%m%d-%H%M%S")) log = None if args.log: if not os.path.isdir(args.log): os.makedirs(args.log) log = os.path.join(args.log, logname) # Collects netCDF files for process sources = Collector(args.input) # Set scan filters file_filters = list() if args.include_file: file_filters.extend([(f, True) for f in args.include_file]) else: # Default includes netCDF only file_filters.append(('^.*\.nc$', True)) if args.exclude_file: # Default exclude hidden files file_filters.extend([(f, False) for f in args.exclude_file]) else: file_filters.append(('^\..*$', False)) # Init collector file filter for regex, inclusive in file_filters: sources.FileFilter.add(regex=regex, inclusive=inclusive) # Init collector dir filter sources.PathFilter.add(regex=args.ignore_dir, inclusive=False) nb_sources = len(sources) errors = 0 # Init process context cctx = dict() cctx['table_path'] = args.table_path cctx['variable'] = args.variable cctx['all'] = args.all # Separate sequential process and multiprocessing if args.max_processes != 1: # Create pool of processes pool = Pool(processes=args.max_processes, initializer=initializer, initargs=(list(cctx.keys()), list(cctx.values()))) # Run processes logfiles = list() progress = 0 for logfile, rc in pool.imap(process, sources): progress += 1 percentage = int(progress * 100 / nb_sources) msg = BCOLORS.OKGREEN + '\rCheck netCDF file(s): ' + BCOLORS.ENDC msg += '{}% | {}/{} files'.format(percentage, progress, nb_sources) sys.stdout.write(msg) sys.stdout.flush() logfiles.append(logfile) errors += rc sys.stdout.write('\r\033[K') sys.stdout.flush() # Print results from logfiles and remove them for logfile in set(logfiles): if not os.stat(logfile).st_size == 0: with open(logfile, 'r') as f: if log: with open(log, 'a+') as r: r.write(f.read()) else: sys.stdout.write(f.read()) sys.stdout.flush() os.remove(logfile) # Close pool of processes pool.close() pool.join() else: print('Checking data, please wait...') initializer(list(cctx.keys()), list(cctx.values())) for source in sources: errors += sequential_process(source) # Print results summary msg = BCOLORS.HEADER + '\nNumber of files scanned: {}'.format( nb_sources) + BCOLORS.ENDC if errors: msg += BCOLORS.FAIL else: msg += BCOLORS.OKGREEN msg += '\nNumber of file with error(s): {}'.format(errors) + BCOLORS.ENDC if log: with open(log, 'a+') as r: r.write(msg) print(msg) # Evaluate errors and exit with appropriate return code if errors != 0: if errors == nb_sources: # All files has error(s). Error code = -1 sys.exit(-1) else: # Some files (at least one) has error(s). Error code = nb files with error(s) sys.exit(errors) else: # No errors. Error code = 0 sys.exit(0)
with open(args.rid_phase_map) as f: for row in f: row = row.strip().split() arid2phase[row[0]] = (row[1], row[2], row[3] ) #ctg_id, phase_blk_id, phase_id exe_pool = Pool(args.n_core) file_list = open(args.fofn).read().split("\n") inputs = [] for fn in file_list: if len(fn) != 0: inputs.append((db_fn, fn, max_diff, max_cov, min_cov, min_len)) ignore_all = [] for res in exe_pool.imap(filter_stage1, inputs): ignore_all.extend(res[1]) inputs = [] ignore_all = set(ignore_all) for fn in file_list: if len(fn) != 0: inputs.append( (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all)) contained = set() for res in exe_pool.imap(filter_stage2, inputs): contained.update(res[1]) #print res[0], len(res[1]), len(contained) #print "all", len(contained) inputs = []
if not os.path.isdir(os.path.split(path)[0]): os.makedirs(os.path.split(path)[0]) imwrite(path, img_crop) tformed_landmarks.shape = -1 name_landmark_str = ('%s' + ' %.1f' * n_landmark * 2) % ( (name, ) + tuple(tformed_landmarks)) succeed = True break except: succeed = False if succeed: return name_landmark_str else: print('%s fails!' % img_names[i]) if __name__ == '__main__': pool = Pool(args.n_worker) name_landmark_strs = list( tqdm.tqdm(pool.imap(work, range(len(img_names))), total=len(img_names))) pool.close() pool.join() landmarks_path = os.path.join(save_dir, 'landmark.txt') with open(landmarks_path, 'w') as f: for name_landmark_str in name_landmark_strs: if name_landmark_str: f.write(name_landmark_str + '\n')
def to_nx_OLD( self, graph_id, directed=False, parallel_processing=True, n_jobs=multiprocessing.cpu_count(), progress=True, chunksize=100, ): """Convert the graph specified by its graph_id to networkx graph""" if ( graph_id in self.G_nx.keys() ): # if self.G_nx[graph_id] already exists, just return it, otherwise evaluate it return self.G_nx[graph_id] else: print("Converting the EPGM graph {} to NetworkX graph...".format( graph_id)) if not any([graph_id in g["id"] for g in self.G["graphs"]]): raise Exception( "Graph with id {} does not exist".format(graph_id)) # List relevant nodes and edges: print("...extracting relevant nodes...", end="") nodes = [ v["id"] for v in self.G["vertices"] if graph_id in v["meta"]["graphs"] ] print(" ...{} nodes extracted...".format(len(nodes))) print("...extracting relevant edges...", end="") edges = [(e["source"], e["target"]) for e in self.G["edges"] if graph_id in e["meta"]["graphs"]] print(" ...{} edges extracted...".format(len(edges))) # TODO: implement the case of weighted edges # create a graph as dict of lists in the format (node_id: [neighbour nodes]) print("...building the graph as dict of lists...") print("...[parallel_processing: {}, n_jobs: {}, progress_bar: {}]". format(parallel_processing, n_jobs, progress)) if parallel_processing: # parallel execution pool = Pool(processes=n_jobs) if progress: n = len(nodes) self.G_nx[graph_id] = [] # pbar = ProgressBar( # widgets=[ # SimpleProgress( # format="%(value_s)s of %(max_value_s)s nodes processed (%(percentage)3d%%)" # ) # ], # maxval=n, # ).start() # _ = [pool.apply_async(partial(node_neighbours, edges=edges), args=(v,), # callback=self.G_nx[graph_id].append) for v in nodes] # it seems that appending results using callback works much slower than either pool.map_async or pool.map # while len(self.G_nx[graph_id]) != n: # pbar.update(len(self.G_nx[graph_id])) # sleep(1) graph = pool.imap(partial(node_neighbours, edges=edges), nodes, chunksize) # lazy map # evaluate batches of imap, as the progress bar is being updated: while len(self.G_nx[graph_id]) != n: self.G_nx[graph_id].append(next(graph)) # pbar.update(len(self.G_nx[graph_id])) # pbar.finish() self.G_nx[graph_id] = dict(self.G_nx[graph_id]) else: self.G_nx[graph_id] = dict( pool.map(partial(node_neighbours, edges=edges), nodes)) pool.close() pool.join() else: # sequential execution self.G_nx[graph_id] = { v: [e[1] for e in edges if e[0] == v] for v in nodes } # this works ~2.5x faster (for cora dataset) than the above for loop print("...converting the graph to nx format...") self.G_nx[graph_id] = nx.from_dict_of_lists(self.G_nx[graph_id]) if directed: self.G_nx[graph_id] = self.G_nx[graph_id].to_directed() else: self.G_nx[graph_id] = self.G_nx[graph_id].to_undirected() return self.G_nx[graph_id]
def multiprocess_get_flag(beg, end, n_processes):
    from multiprocessing import Pool
    pool = Pool(processes=n_processes)
    return ''.join(pool.imap(get_char, range(beg, end)))
from keras.datasets import cifar10
from multiprocessing import Pool, cpu_count
import cv2
import numpy as np
from tqdm import tqdm


def save_image(n):
    array = x_test[n]
    # array = array.transpose(1,2,0)
    array = cv2.cvtColor(array, cv2.COLOR_RGB2BGR)
    return cv2.imwrite("cifar10/image" + str(n) + ".png", array)


(x_train, y_train), (x_test, y_test) = cifar10.load_data()
pool = Pool(cpu_count())
images = list(
    tqdm(pool.imap(save_image, range(len(x_test))), total=len(x_test)))
pool.close()
pool.join()
default=1.5) parser.add_argument('--perfect', action='store', dest='perfect', type=bool, help='Do you want a perfect signal and sequence', default=False) parser.add_argument('--perflen', action='store', dest='perflen', type=int, help='repeat length for perfect mode', default=1) #---------- input list ---------------# arg = parser.parse_args() seq_list = get_seq_list(arg.input) id_list = get_id_list(arg.input) in_list = zip(seq_list, id_list) #---------- load pore model ----------# kmer_poremodel=load_official_poremodel(arg.poremodel) #---------- partial function ---------# func=partial(sequence_to_true_signal, \ kmer_poremodel=kmer_poremodel, perfect=arg.perfect, p_len=arg.perflen, \ event_std=arg.event_std, filter_freq=arg.filter_freq, noise_std=arg.noise_std, \ repeat_alpha=arg.alpha, sigroot=arg.output, aliroot=arg.alignment) #---------- multi process ------------# p = Pool(arg.threads) list(tqdm(p.imap(func, in_list),total=len(in_list))) p.close() p.join()
def update_vertex_positions_mt(self):
    num_procs = 16
    pool = Pool(processes=num_procs)
    self.blobj = self.get_blender_object().data.vertices
    pool.imap(update_one_vertex_no_matrix, self._vertices,
              len(self._vertices) // 8)
    pool.close()
    pool.join()
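# The snippet above never consumes the iterator returned by imap and appears to
# rely on workers mutating the vertex objects in place; with separate processes
# those mutations stay in the children. A hedged sketch of the usual fix: have
# the worker return new values and write them back in the parent while draining
# the imap iterator. scaled() and the vertex tuples are made up for
# illustration.
from multiprocessing import Pool


def scaled(position):
    x, y, z = position
    return (x * 2.0, y * 2.0, z * 2.0)


if __name__ == '__main__':
    vertices = [(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)]
    with Pool(2) as pool:
        # consuming imap is what actually retrieves the computed values
        vertices = list(pool.imap(scaled, vertices, chunksize=64))
    print(vertices)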