def process(self):
    try:
        urls = redis_one.hkeys(self.sitemap_prefix)
        ofh = open('test_urls.txt', 'w+')
        urls.sort()
        ofh.write(('\n'.join(urls)).encode('utf8', 'ignore'))
        logger.error('total urls len %s' % len(urls))
        dict_res = defaultdict(int)
        i = 0
        while i <= len(urls):
            pool = Pool(processes=15)
            q = Queue()
            dict_subres = defaultdict(int)
            list_urls = [urls[i + j * 10000:i + (j + 1) * 10000] for j in range(15)]
            #list_dict_res = list(pool.map_async(parse_content, list_urls))
            for d in pool.imap(parse_content, list_urls):
                for k, v in d.iteritems():
                    dict_res[k] += v
            logger.error('Parser %s %s' % (len(list_urls), len(dict_res)))
            i += 10000 * 15
        sorted_dict_res = sorted(dict_res.iteritems(), key=lambda s: s[1], reverse=True)
        ofh = open('./test_sitemap_keywords', 'w+')
        ofh.write('\n'.join(['%s\t%s' % (k, v) for (k, v) in sorted_dict_res if v >= 3]).encode('utf8', 'ignore'))
        ofh.close()
    except:
        logger.error(traceback.format_exc())
def main():
    idir, ofile, dffile = _parse_cmdline()

    print u'Loading doc-freqs file {}...'.format(dffile)
    with open(dffile, 'rb') as f:
        df = pickle.load(f)

    print u'Reading input directory: {}'.format(idir)
    jobs = _load_jobs(idir, df)

    # Do the work.
    pool = Pool(4)
    njobs = len(jobs)

    try:
        import sys
        with codecs.open(ofile, 'wb') as pf:
            pickle.dump(njobs, pf)
            results = pool.imap_unordered(worker, jobs)
            for i, result in enumerate(results, 1):
                pickle.dump(result, pf)
                per = 100 * (float(i) / njobs)
                sys.stdout.write(u'\rPercent Complete: {:2.3f}%'.format(per))
                sys.stdout.flush()
            sys.stdout.write(u'\rPercent Complete: 100%    \n')
            sys.stdout.flush()
    except KeyboardInterrupt:
        sys.stdout.write(u'\rPercent Complete: {:2.3f}%    \n'.format(per))
        sys.stdout.write(u'Shutting down.\n')
        sys.stdout.flush()
        sys.exit()

    print u'Complete!'
class withPool:
    def __init__(self, procs):
        self.p = Pool(procs, init_func)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.p.close()
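# Hedged usage sketch for the withPool context manager above (not from the
# original source). The no-op init_func and the _square worker are assumptions
# added so the example can run; both must live at module level so the pool can
# pickle them. Note that __exit__ only calls close(), so any outstanding results
# should be consumed (e.g. via map) before leaving the block.
def init_func():
    pass  # assumed no-op per-worker initializer


def _square(x):
    return x * x


if __name__ == '__main__':
    with withPool(4) as wp:
        print(wp.p.map(_square, range(10)))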
def parse_genetrees(args):
    """parse a set of genetrees in serial or parallel fashion and run through PHYBASE"""
    is_nexus = False
    if args.input_file.endswith('.nex') or args.input_file.endswith('.nexus'):
        is_nexus = True
    chunks = get_genetree_chunks(args, is_nexus)
    print "Cleaning genetrees"
    if args.cores > 1:
        p = Pool(args.cores)
        trees = p.map(clean_genetree_worker, chunks)
        # only close the pool when one was actually created
        p.close()
    else:
        trees = map(clean_genetree_worker, chunks)
    # get taxa from first tree
    taxa = getTaxa(trees[0])
    # instantiate Phybase instance and analyse trees
    phybase = Phybase()
    star_tree, steac_tree = phybase.run(trees, args.outgroup, taxa)
    template = """#NEXUS\nbegin trees;\ntree 'STAR' = %s\ntree 'STEAC' = %s\nend;""" % (star_tree, steac_tree)
    print template
    star_steac_out = os.path.splitext(args.input_file)[0]
    star_steac_out += '.star_steac.trees'
    star_steac_out = open(star_steac_out, 'w')
    star_steac_out.write(template)
    star_steac_out.close()
def crawl_recursive_threaded(dirpath, ext):
    from database import indexer
    from database import utils
    from multiprocessing import Pool

    # convert to our infos
    cdir = indexer.DirInfo(dirpath, ext)
    cInfos = indexer.dirs_to_info(cdir.subfolders(), ext)

    # comment if you want a silent indexing
    print(cdir.to_string())

    # recursive pooled call
    # NOTE: child calls must not be pooled
    p = Pool(utils.Settings.config['processes'])
    infos = p.map(crawl_recursive, cInfos)
    p.close()

    # remove hierarchy
    dirInfos = [d for sublist in infos for d in sublist]
    dirInfos.append(cdir)

    print('I was crawling with %d processes' % utils.Settings.config['processes'])

    return dirInfos
def __decrypt_file(self, private_d, public_n, keys, path_to_file, CRT, k):
    if CRT:
        pool = Pool(processes=k)
        promises = []
    decrypted_data = ''
    with open(path_to_file, 'r') as f:
        encrypted_data = f.read()
        encrypted_data_chunks = list(map(''.join, zip(*[iter(encrypted_data)] * len(str(public_n)))))
        for i in range(len(encrypted_data_chunks)):
            stripped = encrypted_data_chunks[i].lstrip('0')
            if CRT:
                promise = pool.apply_async(self.compute_part_of_message, args=(stripped, keys, i))
                promises.append(promise)
            else:
                decrypted_data += chr(self.__decrypt_message(stripped, private_d, public_n))
    if CRT:
        results = [promise.get() for promise in promises]
        decrypted_sorted = sorted(results, key=lambda x: x[1])
        for data in decrypted_sorted:
            decrypted_data += chr(data[0])
    if CRT:
        pool.close()
    with open(path_to_file + '.dec', 'w') as f:
        f.write(decrypted_data)
    return decrypted_data
def start_crawlers(connector_class, num_processes=1):
    """
    Starts a spider process for each spider class in the project

    :param num_processes: the number of simultaneous crawling processes
    :param connector_class: the connector class that should be used by the spiders
    """
    spider_classes = pyjobs_crawlers.tools.get_spiders_classes()

    if num_processes == 0:
        connector = connector_class()
        with _get_lock('ALL') as acquired:
            if acquired:
                crawl(spider_classes, connector)
            else:
                print("Crawl process of 'ALL' already running")
        return

    # Splits the spider_classes list in x lists of size num_processes
    spider_classes_chunks = list()
    for x in range(0, len(spider_classes), num_processes):
        spider_classes_chunks.append(spider_classes[x:x + num_processes])

    # Start num_processes number of crawling processes
    for spider_classes_chunk in spider_classes_chunks:
        process_params_chunk = [(spider_class, connector_class) for spider_class in spider_classes_chunk]
        p = Pool(len(process_params_chunk))
        p.map(start_crawl_process, process_params_chunk)
def main(world_folder, replacement_file_name):
    global replacements
    world = nbt.world.WorldFolder(world_folder)
    logger = configure_logging()
    logger.info("Starting processing of %s", world_folder)
    if not isinstance(world, nbt.world.AnvilWorldFolder):
        logger.error("%s is not an Anvil world" % (world_folder))
        return 65  # EX_DATAERR
    if replacement_file_name != None:
        logger.info("Using Replacements file: %s", replacement_file_name)
        with open(replacement_file_name, 'r') as replacement_file:
            replacements = json.load(replacement_file)
    # get list of region files, going to pass this into function to process region
    region_files = world.get_regionfiles()

    # Parallel
    q = Queue()
    lp = threading.Thread(target=logger_thread, args=[q])
    lp.start()
    p = Pool(initializer=process_init, initargs=[q, replacements], maxtasksperchild=1)
    region_data = p.map(process_region, region_files)
    # Map has finished up, lets close the logging QUEUE
    q.put(None)
    lp.join()

    # Not Parallel
    # region_data = map(process_region, region_files)

    # Write output data
    write_block_data(region_data, "output.txt")
    return 0
def k_rbm(infile, outfile):
    # dataset
    data = sio.loadmat(infile)['data']

    # reconstruction cost
    cost_dict = {}

    p = Pool(5)
    first_arg = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
    second_arg = data
    a, b, c, d, e = p.map(rbm_star, itertools.izip(first_arg, itertools.repeat(second_arg)))
    # p.map(rbm_star, itertools.izip(first_arg, itertools.repeat(second_arg)))

    # get the costs from the tuples
    cost_1 = a[0]
    cost_2 = b[1]
    cost_3 = c[2]
    cost_4 = d[3]
    cost_5 = e[4]

    # find the cluster assignments
    for i in xrange(len(cost_1)):
        mincost = min(cost_1[i], cost_2[i], cost_3[i], cost_4[i], cost_5[i])
        if mincost == cost_1[i]:
            cost_dict[i + 1] = 1
        elif mincost == cost_2[i]:
            cost_dict[i + 1] = 2
        elif mincost == cost_3[i]:
            cost_dict[i + 1] = 3
        elif mincost == cost_4[i]:
            cost_dict[i + 1] = 4
        else:
            cost_dict[i + 1] = 5

    # store results
    json.dump(cost_dict, open(outfile, 'w'))
def downloadImages(self, dirName, urlData):
    child_folder = 'pictures'
    failures = 0
    dirName = os.path.join(dirName, child_folder)
    process_pool = Pool(processes=self._pool_size)
    results = []

    for ud in urlData:
        abs_img = os.path.join(dirName, urlparse(ud).path.strip('/'))
        try:
            os.makedirs(dirname(abs_img))
        except:
            pass
        results.append(process_pool.apply_async(urllib.urlretrieve, [ud, abs_img]))

    self.initialize_bar(max=len(results))
    for result in results:
        try:
            result.get(self._timeout)
        except Exception:
            failures += 1
        else:
            self.update_bar()

    self.finish_bar()
    if failures:
        print(" Completed with errors: Downloaded {0}/{1}".format(len(results) - failures, len(results)))
    self.finish_bar()
def rc(rf, alphabet, numOfThreads):
    tryn = 0
    counterTmp = 0
    printCounter = 1000
    listBasic = []
    if rf.endswith('.rar'):
        funcChosen = unrar
    elif rf.endswith('.zip') or rf.endswith('.7z'):
        funcChosen = zipFileUnzip
    for a in range(1, len(alphabet) + 1):
        for b in itertools.product(alphabet, repeat=a):
            k = "".join(b)
            k = re.escape(k)
            listBasic.append(k)
            tryn += 1
            if len(listBasic) == numOfThreads:
                pool = Pool(numOfThreads)
                pool.map_async(funcChosen, listBasic, callback=exitPass)
                pool.close()
                if resultPass:
                    timeWasted = time.time() - start
                    print 'Found! Password is ' + resultPass
                    print "It took " + str(round(time.time() - start, 3)) + " seconds"
                    print "Speed: " + str(round(tryn / float(timeWasted), 2)) + " passwords/sec"
                    print "Tried " + str(tryn) + " passwords"
                    exit()
                listBasic = []
            counterTmp += 1
            if counterTmp >= printCounter:
                print 'Trying combination number ' + str(tryn) + ':' + str(k)
                timeWasted = round(time.time() - start, 2)
                if timeWasted > 0:
                    print "It took already " + str(timeWasted) + " seconds. Speed: " + str(round(tryn / float(timeWasted), 2)) + " passwords/sec"
                counterTmp = 0
def fetch_imagery(image_locations, local_dir):
    pool = Pool(cpu_count())
    tupled = [(loc[0], loc[1], local_dir) for loc in image_locations]
    try:
        pool.map(fetch_imagery_uncurried, tupled)
    finally:
        pool.close()
def compress_file(self, corpus, np=4, separator=None):
    """
    construct WLZW pattern out of a corpus, parallelism is an option
    @param corpus - string, file path of the corpus
    @param np - number of processes, if np = 1 the algorithm is run in serial
    @param separator - the separator string to separate doc id and document. pass None if no doc id is given
    @return set, the final set containing all frequent patterns
    """
    # if only one process, no need for parallelization
    if np == 1:
        return set(_compress_file((corpus, 0, np, separator)))

    p = Pool(processes=np)
    l = []
    for i in range(0, np):
        l.append((corpus, i, np, separator))
    result = p.imap_unordered(_compress_file, l, 1)

    if np == 1:
        final_set = result.next()
    else:
        final_set = _union(result)

    return final_set
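# Hedged usage sketch for compress_file above (not from the original source).
# The owning class name WLZWCompressor and the corpus path are assumptions;
# the arguments mirror the docstring: a corpus file, 4 worker processes, and a
# tab separator between doc id and document text.
if __name__ == '__main__':
    compressor = WLZWCompressor()  # hypothetical owning class
    patterns = compressor.compress_file('corpus.txt', np=4, separator='\t')
    print(len(patterns), 'frequent patterns found')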
def main():
    """
    ---------------------------------------------------------------------------
    AUTHOR: Kyle Hernandez
    EMAIL: [email protected]

    Calculate the distribution of polymorphic RAD loci across site classes.
    ---------------------------------------------------------------------------

    USAGE: python snp_locations.py gmatrix.tab file.gff out.tab n_threads

    ARGUMENTS:
        gmatrix.tab - Tab-delimited genotype matrix file of variant sites
        file.gff    - GFF file
        out.tab     - Output file of counts
        n_threads   - The number of threads to run
    """
    # Load the GFF and SNP positions into dictionaries
    load_gff()
    intergenic = process_matrix()

    # Map:
    # Create a pool of n_threads workers and use them to process
    # scaffolds separately
    ch_vals = sorted(gff_dict.keys())
    sys.stdout.write("Counting features...\n")
    pool = Pool(processes=n_threads)
    ct_list = pool.map(process_dicts, ch_vals)

    # Reduce:
    # Process the list of dicts
    print_counts(intergenic, ct_list)
def spawn_runpy(cp, wait=60, cb=check_rst):
    "as decorator to run job"
    global WAITQ, RUNQ, CFG
    pool = Pool(processes=CFG['MAXJOBS'])
    while len(WAITQ) > 0 or len(RUNQ) > 0:
        if len(RUNQ) <= CFG['MAXJOBS'] and len(WAITQ) > 0:
            path, test = WAITQ.pop()
            rst = pool.apply_async(call_runpy, (cp, path, test,))
            RUNQ.append((rst, test, timeit.default_timer()))
        else:
            for r in RUNQ:
                usec = float("%.2f" % (timeit.default_timer() - r[2]))
                # successful is a method; it must be called, and only once the
                # result is ready
                if r[0].ready() and r[0].successful():
                    print "[{0}] success used {1} usec".format(r[1], usec)
                    RUNQ.remove(r)
                    if cb:
                        cb(r[1], 'pass', usec)
                else:
                    if usec > CFG['TIMEOUT']:
                        print "[{0}] unsuccess used timeout {1} usec".format(r[1], usec)
                        r[0].terminate()
                        if cb:
                            cb(r[1], 'fail', usec)
        time.sleep(float(wait))
def mass_tri_plot(data, savedir, name='plot', Type='speed', Map=False):
    """
    Plots all time series. Makes use of multiprocessing for speed.
    """
    trigrid = data['trigrid']
    # get the data to plot
    try:
        toPlot = data[Type]
    except KeyError:
        print Type + " is not an element of data. Please calculate it."
        raise Exception("Invalid dictionary entry")
    # set the variable as a global variable
    global plotvar
    plotvar = toPlot
    global saveDir
    saveDir = savedir
    global grid
    grid = trigrid
    # see if the save directory exists, or make it
    if not os.path.exists(savedir):
        os.makedirs(savedir)
    l = toPlot.shape[0]
    p = Pool(4)
    plt.gca().set_aspect('equal')
    p.map(save_plot, range(50))
    clearall()
def score_all_genes(self, graph, num_procs=1):
    partial_score_gene = partial(score_gene, graph=graph, top_genes=self.top_genes)
    p = Pool(num_procs)
    result = p.map(partial_score_gene, list(self.vd.gene_names()))
    p.close()

    # convert them all to percentiles
    cent_hist = numpy.array([x[1] for x in result if x[1] != -1])
    nn_hist = numpy.array([x[2] for x in result if x[2] != -1])

    batch = []
    for gene, cent_score, nn_score in result:
        # edge case: gene is a top gene
        if gene in self.top_genes:
            cent_perc = 1
            nn_perc = 1
        # edge case: gene isn't in network
        elif cent_score == -1 or \
             nn_score == -1:
            cent_perc = 0
            nn_perc = 0
        else:
            cent_perc = scipy.stats.percentileofscore(cent_hist, cent_score) / 100.0
            nn_perc = 1 - scipy.stats.percentileofscore(nn_hist, nn_score) / 100.0

            print "gene: %s\n c: %s\n c_p: %s\n n: %s\n n_p: %s" % \
                (gene, cent_score, cent_perc, nn_score, nn_perc)

        batch.append((cent_score, cent_perc, nn_score, nn_perc, gene))

    self.vd._c.executemany("UPDATE genes SET cent_score = ?, cent_perc = ?, " \
                           "nn_score = ?, nn_perc = ? WHERE name = ?", batch)
    self.vd._conn.commit()
def updateTranslation(args):
    # Get map that contains (besides other stuff)
    # the crowdin ID for a given file
    translationFilemap = getTranslationFilemapCache(args.language, args.force_filemap_update)

    # Collect valid downloadable files for parallel processing
    fileinfos = []
    for filename, fileinfo in translationFilemap.items():
        filepath = os.path.join("cache", args.language, fileinfo["path"])
        # Create dir if not exists
        try:
            os.makedirs(os.path.dirname(filepath))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        fileid = fileinfo["id"]
        fileinfos.append((fileid, filepath))
    # Curry the function with the language
    performDownload = functools.partial(performPOTDownload, args.language)
    # Perform parallel download
    if args.num_processes > 1:
        pool = Pool(args.num_processes)
        pool.map(performDownload, fileinfos)
    else:
        for t in fileinfos:
            performDownload(t)
    # Set download timestamp
    timestamp = datetime.datetime.now().strftime("%y-%m-%d %H:%M:%S")
    with open("lastdownload.txt", "w") as outfile:
        outfile.write(timestamp)
def multi_mode(start, stop):
    print "going multi"
    from multiprocessing import Pool
    pool = Pool(processes=4)
    result = pool.map(factorize, xrange(start, stop + 1), chunksize=100)
    print uniq_counter(result)
def compute_tdbf():
    conn = db_conn('bnc')
    cur = conn.cursor()
    # select keys and parsed from table
    sql = 'SELECT xmlID, divIndex, globalID, parsed FROM entropy_DEM100'
    cur.execute(sql)
    data = cur.fetchall()
    # initialize
    pool = Pool(multiprocessing.cpu_count())
    manager = Manager()
    queue = manager.Queue()
    # mp
    args = [(d, queue) for d in data]
    result = pool.map_async(compute_tdbf_worker, args, chunksize=5000)
    # manager loop
    while True:
        if result.ready():
            print('\n all rows processed')
            break
        else:
            sys.stdout.write('\r{}/{} processed'.format(queue.qsize(), len(args)))
            sys.stdout.flush()
            time.sleep(1)
    # update
    processed_results = result.get()
    for i, res in enumerate(processed_results):
        xml_id, div_idx, g_id, sub_tree, td, bf = res
        sql = 'UPDATE entropy_DEM100 SET parsedSimple = %s, td = %s, bf = %s WHERE xmlID = %s AND divIndex = %s AND globalID = %s'
        cur.execute(sql, (sub_tree, td, bf, xml_id, div_idx, g_id))
        if i % 999 == 0 and i > 0:
            sys.stdout.write('\r{}/{} updated'.format(i + 1, len(processed_results)))
            sys.stdout.flush()
    conn.commit()
def get_needle_tips(images):
    """Get sample tips from images."""
    tips = []
    results = []

    # Do not make more processes than needed for the number of images.
    if len(images) > multiprocessing.cpu_count():
        proc_count = multiprocessing.cpu_count()
    else:
        proc_count = len(images)

    pool = Pool(processes=proc_count)

    for image in images:
        results.append(pool.apply_async(_get_ellipse_point, args=(image,)))

    for result in results:
        tip = result.get()
        if tip is not None:
            tips.append(tip)

    if len(tips) == 0:
        raise ValueError("No sample tip points found.")

    return tips
def main():
    parser = ArgumentParser(description="Speed up your SHA. A different hash style.")
    parser.add_argument('-1', '--sha1', action='store_true')
    parser.add_argument('-2', '--sha224', action='store_true')
    parser.add_argument('-3', '--sha256', action='store_true')
    parser.add_argument('-4', '--sha384', action='store_true')
    parser.add_argument('-5', '--sha512', action='store_true')
    parser.add_argument('-f', '--file', type=str, help="The path to the file")

    if len(sys.argv) == 1:
        parser.print_help()
        return

    global args
    args = parser.parse_args()

    hashtree = ''

    big_file = open(args.file, 'rb')
    pool = Pool(multiprocessing.cpu_count())

    for chunk_hash in pool.imap(hashing, chunks(big_file)):
        hashtree += chunk_hash + ":hash"

    pool.terminate()

    print(str(hashing(hashtree.encode('ascii'))))
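# The chunks() generator used above is not shown in this snippet. Below is a
# minimal sketch, assuming hashing() accepts fixed-size binary blocks read from
# the open file object; the 1 MiB block size is an assumption, not taken from
# the original source.
def chunks(file_object, block_size=1024 * 1024):
    """Yield successive binary blocks from an open file object."""
    while True:
        block = file_object.read(block_size)
        if not block:
            break
        yield block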
def get(self):
    mode = toAlpha3Code(self.get_argument('lang'))
    text = self.get_argument('q')
    if not text:
        self.send_error(400, explanation='Missing q argument')
        return

    def handleCoverage(coverage):
        if coverage is None:
            self.send_error(408, explanation='Request timed out')
        else:
            self.sendResponse([coverage])

    if mode in self.analyzers:
        pool = Pool(processes=1)
        result = pool.apply_async(getCoverage, [text, self.analyzers[mode][0], self.analyzers[mode][1]])
        pool.close()

        @run_async_thread
        def worker(callback):
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                pool.terminate()
                callback(None)

        coverage = yield tornado.gen.Task(worker)
        handleCoverage(coverage)
    else:
        self.send_error(400, explanation='That mode is not installed')
class JobPool(object):
    """
    Pool container.
    """
    pool = None
    message_queue = None

    def __init__(self, max_instances=4):
        self.message_queue = Queue()
        self.pool = Pool(max_instances, execute_task, (self.message_queue,))
        atexit.register(self.clear)

    def add_analysis(self, analysis):
        """
        Add analysis to the pool.
        """
        analysis.set_started()
        self.message_queue.put(analysis)

    def clear(self):
        """
        Pool cleanup.
        """
        self.pool.terminate()
        self.pool.join()
class YaraJobPool(object):
    """
    Yara pool container.
    """
    pool = None
    message_queue = None

    def __init__(self, max_instances=3):
        self.message_queue = Queue()
        self.pool = Pool(max_instances, execute_yara_task, (self.message_queue,))
        atexit.register(self.clear)

    def add_yara_task(self, yara_task):
        """
        Adds the yara task.
        """
        self.message_queue.put(yara_task)

    def clear(self):
        """
        Pool cleanup.
        """
        self.pool.terminate()
        self.pool.join()
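# Hedged usage sketch for the two pool containers above (not from the original
# source). The Analysis and YaraTask objects are assumptions inferred from how
# add_analysis() and add_yara_task() use them; the execute_task and
# execute_yara_task workers are expected to consume items from the shared
# message queue in the background.
if __name__ == '__main__':
    job_pool = JobPool(max_instances=4)
    yara_pool = YaraJobPool(max_instances=3)

    analysis = Analysis()   # hypothetical task object exposing set_started()
    job_pool.add_analysis(analysis)

    yara_task = YaraTask()  # hypothetical yara task object
    yara_pool.add_yara_task(yara_task)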
def get_fractional_errors(R_star, L_star, P_c, T_c):
    """
    Pass in "guess" conditions.
    Will then calculate inward and outward errors,

    Returns:
        [Data array]
        dY - over/undershoots (+/-, going outward)
             [dx handled outside this]
    """
    # R_star, L_star, P_c, T_c = x

    P_c_0 = modelparameters.P_c  # core pressure, [dyne cm^-2]
    T_c_0 = modelparameters.T_c  # core temperature, [K]
    R_star_0 = modelparameters.R_star
    L_star_0 = modelparameters.L_star

    print ""
    print "R: " + str(R_star / R_star_0)
    print "L: " + str(L_star / L_star_0)
    print "P: " + str(P_c / P_c_0)
    print "T: " + str(T_c / T_c_0)

    X = modelparameters.X
    Y = modelparameters.Y
    Z = modelparameters.Z
    mu = modelparameters.mu
    params = (X, Y, Z, mu)

    M_star = modelparameters.M_star
    m_fitting_point = modelparameters.m_fitting_point

    pool = Pool(2)
    outward_results = pool.apply_async(integrate.integrate_outwards,
                                       [M_star, m_fitting_point, P_c, T_c, mu, X, Y, Z])
    inward_results = pool.apply_async(integrate.integrate_inwards,
                                      [M_star, m_fitting_point, R_star, L_star, mu, X, Y, Z])

    m_outward, y_outward, infodict_outward = outward_results.get()
    m_inward, y_inward, infodict_inward = inward_results.get()

    dr = y_inward[-1, 0] - y_outward[-1, 0]
    dl = y_inward[-1, 1] - y_outward[-1, 1]
    dP = y_inward[-1, 2] - y_outward[-1, 2]
    dT = y_inward[-1, 3] - y_outward[-1, 3]

    dY = np.array([dr, dl, dP, dT])

    print ''
    print 'fractional errors:'
    print "dR: " + str(dr / y_inward[-1, 0])
    print "dL: " + str(dl / y_inward[-1, 1])
    print "dP: " + str(dP / y_inward[-1, 2])
    print "dT: " + str(dT / y_inward[-1, 3])

    return dY
def get_location(self):
    """
    Extracts the location of each pixel in the satellite image
    """
    self.ncols = self.satellite_gdal.RasterXSize / 2
    self.nrows = self.satellite_gdal.RasterYSize / 2
    self.length_df = self.nrows * self.ncols
    print 'Columns, rows', self.ncols, self.nrows
    cols_grid, rows_grid = np.meshgrid(
        range(0, self.ncols),
        range(0, self.nrows))
    self.cols_grid = cols_grid.flatten()
    self.rows_grid = rows_grid.flatten()
    print 'Checking the meshgrid procedure works'
    # getting a series of lat lon points for each pixel
    self.geotransform = self.satellite_gdal.GetGeoTransform()
    print 'Getting locations'
    self.location_series = np.array(parmap.starmap(
        pixel_to_coordinates,
        zip(self.cols_grid, self.rows_grid),
        self.geotransform,
        processes=self.processes))
    print 'Converting to Points'
    pool = Pool(self.processes)
    self.location_series = pool.map(
        point_wrapper,
        self.location_series)
def main(path, out, cores):
    """
    Compute contact energies for each pdb in path and write results to 'out'.
    :param path: str
    :param out: str
    :param cores: int
    :return: None
    """
    # Find all pdbs in path
    workload = []
    for file in os.listdir(path):
        if os.path.splitext(file)[1].lower() == ".pdb":
            workload.append(file)
    # Print few newlines to prevent progressbar from messing up the shell
    print("\n\n")
    # Compute energies
    pool = Pool(processes=cores)
    results = []
    for (nr, pdb) in enumerate(workload):
        updateprogress(pdb, nr / len(workload))
        e = computecontactenergy(os.path.join(path, pdb), pool)
        results.append((pdb, e))
    pool.close()
    # Make 100% to appear
    updateprogress("Finished", 1)
    # Store output
    with open(out, "w") as handler:
        handler.write("PDB,Energy in kcal/mol\n")
        for pair in results:
            handler.write("{},{}\n".format(*pair))
def process_articles(entity_type=Entity, output_filename='output-all.txt', corpus_root='corpus/'):
    terms = select_terms(entity_type)

    Session.expunge_all()
    Session.close()

    articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir != None)
    articles = articles.filter(Entity.sep_dir != '')
    articles = articles.distinct().all()
    articles = [a[0] for a in articles]

    # parallel processing of articles
    p = Pool()
    args = [(title, terms, entity_type, None, corpus_root) for title in articles]
    doc_lines = p.map(process_wrapper, args)
    p.close()

    # serial processing for tests
    '''
    doc_lines = []
    for title in articles:
        lines = process_article(title, terms, entity_type, None, corpus_root)
        doc_lines.append(lines)
    '''

    # write graph output to file
    print output_filename
    with open(output_filename, 'w') as f:
        for lines in doc_lines:
            f.writelines(lines)
def matrix_vector_iteration_by_processes(A, x, k):
    # create a temporary directory to store the matrix and the vectors
    tmpdir = tempfile.mkdtemp()

    nvec = get_nvec(x)
    y = x.copy()

    save_matrix(tmpdir, A)
    for i in xrange(nvec):
        save_x(tmpdir, x, i)

    # start processes
    pool = Pool(processes=min(nvec, 6))
    processes = []

    for i in xrange(nvec):
        processes.append(pool.apply_async(matrix_vector_iteration_process, (tmpdir, i, k)))

    # fetch results (vector/matrix shape version)
    if x.ndim == 1:
        processes[0].get()
        y = load_x(tmpdir, 0)
    else:
        for i in xrange(nvec):
            processes[i].get()
            y[:, i] = load_x(tmpdir, i)

    pool.close()

    # remove temporary directory (with all it contains)
    shutil.rmtree(tmpdir)

    return y
def learn_failures((X, Y, L)):
    def func(failures):
        ret = -maximum_likelihood_helpers.func(X, Y, failures)
        return ret

    def grad(failures):
        ret = -maximum_likelihood_helpers.grad(X, Y, failures)
        return ret

    X0 = np.log(np.ones(L + 1, dtype=float) * (1 - 1e-6))  # initialize a small distance away from the bound
    bounds = [(None, 0) for _ in xrange(L + 1)]
    bounds[-1] = (None, np.log(1 - 1e-6))  # never allow the leak to go too close to 0
    failures, _, _ = opt.fmin_l_bfgs_b(func, X0, grad, bounds=bounds, disp=0)
    failures = np.exp(failures)  # optimization was in log space. Exponentiate to get values.
    return failures


if __name__ == '__main__':
    pool = Pool(20)

    L = 50    # latent variables
    M = 300   # observations
    N = 1000  # patients

    X_matrix = np.array(np.random.rand(N, M) > 0.7, dtype=int)
    Y_matrix = np.array(np.random.rand(N, L) > 0.8, dtype=int)
    Y = [np.nonzero(y)[0].tolist() for y in Y_matrix]

    targets = ((X_matrix[:, j].tolist(), Y, L) for j in xrange(M))
    failures = np.array(pool.map(learn_failures, targets))
    print failures.shape
def parse(country, cc):
    print(f'Start: {country}')
    path = f'data/{country}'
    emissions = parse_xs(f'{path}/{os.listdir(path)[-1]}')
    energy = parse_energy(f'{cc}')
    print(f'Done: {country}')
    return country, [[u'CO₂e'] + emissions] \
        + list(zip(energy_params, *energy)) \
        + [['years'] + list(range(1990, 2016))]


if __name__ == '__main__':
    if sys.version_info < (3, 6):
        sys.exit('Python 3.6 or later is required.\n')
    with open('countries.json') as f:
        countries = json.load(f)
    with Pool(processes=20) as pool:
        res = pool.starmap(parse, [(c, countries[c]) for c in awesome + f****d])
    data = {c: v for c, v in res}
    data['Kazakhstan'] = data.pop('Kazakhstan2')
    with io.open(f'renewable_to_emissions.min.json', 'w', encoding='utf8') as f:
        json.dump(data, f, ensure_ascii=False)
class Validation: def __init__(self, in_file, fp_db_name, output, categories, top, blacklist, malware_ctx_file, proc_list): if output == sys.stdout: self.out_file_pointer = sys.stdout else: self.out_file_pointer = open(output, 'w') self.categories = categories self.top = top self.blacklist = blacklist self.malware_ctx = None if malware_ctx_file != None: self.malware_ctx = {} for line in gzip.open(malware_ctx_file): fp_ = json.loads(line) self.malware_ctx[fp_['str_repr']] = fp_ self.proc_list = None if proc_list != None: self.proc_list = [] t_ = proc_list.split(';') for s in t_: if s != '': tmp_proc_list = s.split(',') self.categories.append(tmp_proc_list[0]) self.proc_list.append(tmp_proc_list) # read in application categories app_cat_file = 'application_categories.json.gz' with gzip.open(app_cat_file,'r') as fp: self.app_cat_data = json.loads(fp.read()) self.mt_pool = Pool(32) self.input_file = in_file if in_file.endswith('.csv.gz'): self.data = self.read_file_csv(in_file) elif in_file.endswith('.json.gz') and 'dmz' in in_file: self.data = self.read_file_dmz_json(in_file) elif in_file.endswith('.json.gz'): self.data = self.read_file_json(in_file) else: print('error: file format not supported') sys.exit(-1) def validate_process_identification(self): results = [] unknown_fp = 0 unknown_s = 0 if self.top: results = self.mt_pool.map(get_results_top, [self.data[k] for k in self.data]) elif self.blacklist: results = self.mt_pool.map(get_results_blacklist, [self.data[k] for k in self.data]) else: results = self.mt_pool.map(get_results, [self.data[k] for k in self.data]) # for k in self.data: # results.append(get_results(self.data[k])) self.data = None self.analyze_results(results) def analyze_results(self, results): r_tmp_ = self.mt_pool.map(process_result, [(sl, self.categories) for sl in results]) r_tmp_ = [x for sl in r_tmp_ for x in sl] r_ = [sum([row[i] for row in r_tmp_]) for i in range(0,len(r_tmp_[0][:-1]))] print('FILE: %s' % self.input_file) print('\tTotal:\t\t\t\t % 8i' % r_[0]) print('\t :\t top-1 top-2 top-3 top-4 top-5') print('\tProcess Name Category Accuracy:\t %0.6f %0.6f %0.6f %0.6f %0.6f' % (r_[2]/r_[0], (r_[2]+r_[5])/r_[0], (r_[2]+r_[5]+r_[7])/r_[0], (r_[2]+r_[5]+r_[7]+r_[9])/r_[0], (r_[2]+r_[5]+r_[7]+r_[9]+r_[11])/r_[0])) print('\tProcess Name Accuracy:\t\t %0.6f %0.6f %0.6f %0.6f %0.6f' % (r_[1]/r_[0], (r_[1]+r_[4])/r_[0], (r_[1]+r_[4]+r_[6])/r_[0], (r_[1]+r_[4]+r_[6]+r_[8])/r_[0], (r_[1]+r_[4]+r_[6]+r_[8]+r_[9])/r_[0])) # print('\tSHA-256 Accuracy:\t\t %0.6f' % (r_[3]/r_[0])) r_c = [row[-1] for row in r_tmp_] idx = 0 for c in self.categories: if c == '': continue r_ = [sum([row[idx][i] for row in r_c]) for i in range(0,len(r_c[0][0]))] print('\n\t%s Accuracy:\t\t %0.6f' % (c, (r_[1]/r_[0]))) print('\t%s Confusion Matrix:' % c) print('\t\t\t Positive Negative') print('\t\tPositive:% 9i\t% 9i' % (r_[2], r_[5])) print('\t\tNegative:% 9i\t% 9i' % (r_[4], r_[3])) if r_[2]+r_[5] > 0: print('\t\tRecall: %0.6f' % (r_[2]/(r_[2]+r_[5]))) else: print('\t\tRecall: %0.6f' % (0.0)) if r_[2]+r_[4] > 0: print('\t\tPrecision: %0.6f' % (r_[2]/(r_[2]+r_[4]))) else: print('\t\tPrecision: %0.6f' % (0.0)) idx += 1 def read_file_csv(self, f): data = {} max_lines = 30000000 cur_line = 0 start = time.time() for line in os.popen('zcat %s' % (f)): cur_line += 1 if cur_line > max_lines: break # if '(0000)' not in line: # continue t_ = line.strip().split(',') src = t_[0] proc = t_[3] sha256 = t_[4] type_ = t_[5] fp_str = t_[6].replace('()','') dst_x = t_[7].split(')') os_ = clean_os_str(t_[8]) if os_ 
== None: continue dst_ip = dst_x[0][1:] dst_port = int(dst_x[1][1:]) server_name = dst_x[2][1:] src_port = int(t_[9].split(')')[1][1:]) av_hits = 0 if len(t_) > 10: av_hits = int(t_[10]) proc = clean_proc_name(proc) if proc in uninformative_proc_names: continue fp_malware_ = False if self.malware_ctx != None: if fp_str in self.malware_ctx: fp_malware_ = is_fp_malware(self.malware_ctx[fp_str]) else: continue app_cat = None if proc in self.app_cat_data: app_cat = self.app_cat_data[proc] malware = is_proc_malware({'process':proc}, fp_malware_, av_hits) domain = server_name sni_split = server_name.split('.') if len(sni_split) > 1: domain = sni_split[-2] + '.' + sni_split[-1] if server_name in sni_whitelist or domain in domain_whitelist: malware = False app_cats = {} app_cats['malware'] = malware for c in self.categories: if c == 'malware': app_cats[c] = malware else: app_cats[c] = False if c == app_cat: app_cats[c] = True if os_ == None: continue if src not in data: data[src] = [] data[src].append((src,src_port,proc,sha256,type_,fp_str,dst_ip,dst_port,server_name,1,os_,app_cats, self.proc_list)) print('time to read data:\t%0.2f' % (time.time()-start)) return data def read_file_json(self, f): data = {} start = time.time() key_ = 0 data[key_] = [] for line in os.popen('zcat %s' % (f)): fp_ = json.loads(line) if 'str_repr' in fp_: fp_str = fp_['str_repr'] else: fp_str = fp_['md5'] if 'process_info' in fp_: new_procs = [] fp_malware_ = is_fp_malware(fp_) for p_ in fp_['process_info']: if 'process' not in p_: p_['process'] = p_['filename'] p_['process'] = clean_proc_name(p_['process']) if is_proc_malware(p_, fp_malware_): new_procs.extend(clean_malware_proc(p_)) else: new_procs.append(p_) fp_['process_info'] = new_procs for p_ in fp_['process_info']: proc = p_['process'] sha256 = p_['sha256'] if p_['process'] in uninformative_proc_names: continue # uncomment to classify non-top processes # pn = proc # pn = app_families[pn] if pn in app_families else pn # if pn in ['Chromium','Firefox','Safari','Internet Explorer','Adobe Tools', # 'Microsoft Office','Cisco Webex','Cisco AMP','iCloud','Box']: # continue app_cat = None if proc in self.app_cat_data: app_cat = self.app_cat_data[proc] malware = is_proc_malware(p_, False) app_cats = {} app_cats['malware'] = malware for c in self.categories: if c == 'malware': app_cats[c] = malware else: app_cats[c] = False if c == app_cat: app_cats[c] = True for x_ in p_['dst_info']: dst_x = x_['dst'].split(')') dst_ip = dst_x[0][1:] dst_port = int(dst_x[1][1:]) server_name = dst_x[2][1:] data[key_].append((None,None,proc,sha256,'tls',fp_str,dst_ip,dst_port, server_name,x_['count'],None,app_cats,self.proc_list)) if len(data[key_]) > 5000: key_ += 1 data[key_] = [] print('time to read data:\t%0.2f' % (time.time()-start)) return data def read_file_dmz_json(self, f): data = {} key_ = 0 data[key_] = [] start = time.time() for line in os.popen('zcat %s' % (f)): fp_ = json.loads(line) if 'str_repr' in fp_: fp_str = fp_['str_repr'] else: fp_str = fp_['md5'] if fp_str in schannel_fps: fp_str = 'schannel' proc = 'dmz_process' sha256 = 'dmz_process' app_cats = {} app_cats['malware'] = False # if fp_str not in data: # data[fp_str] = [] dst_info_key = 'dmz_dst_info' if dst_info_key not in fp_: dst_info_key = 'dst_info' for x_ in fp_[dst_info_key]: dst_x = x_['dst'].split(')') dst_ip = dst_x[0][1:] dst_port = int(dst_x[1][1:]) server_name = dst_x[2][1:] # data[fp_str].append((None,None,proc,sha256,'tls',fp_str,dst_ip,dst_port, # server_name,x_['count'],None,app_cats)) 
data[key_].append((None,None,proc,sha256,'tls',fp_str,dst_ip,dst_port, server_name,x_['count'],None,app_cats,self.proc_list)) if len(data[key_]) > 5000: key_ += 1 data[key_] = [] print('time to read data:\t%0.2f' % (time.time()-start)) return data
    items = re.findall(pattern, html)
    itemdict = {}
    for item in items:
        itemdict['index'] = item[0]
        itemdict['name'] = item[1].strip()[3:]
        itemdict['score'] = item[2] + '.' + item[3]
        yield itemdict


def write_to_file(content):
    with codecs.open('maoyan.txt', 'a', 'utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
        f.close()


def main(offset):
    url = 'http://maoyan.com/board/4?offset={}'.format(offset)
    html = get_one_page(url)
    for dictitem in parse_one_page(html):
        print dictitem
        write_to_file(dictitem)


if __name__ == '__main__':
    # for i in range(0, 100, 10):
    #     main(i)
    pool = Pool()  # the pool must be created before pool.map is called
    pool.map(main, range(0, 100, 10))
    # map(main, range(0, 100, 10))
    webpage = urlopen(req).read()
    html = bs(webpage, 'html.parser')
    return html


# get all club links under a url
def get_club_links(url):
    html = get_html(url)
    club_links = {elem.text: 'https://footballdatabase.com' + elem['href']
                  for elem in html.find_all('a', {'class': 'sm_logo-name clubbrowser-club'})}
    return club_links


# urls to club lists for all clubs starting with each letter
club_letter_urls = ['https://footballdatabase.com/clubs-list-letter/' + letter
                    for letter in ascii_uppercase]

if __name__ == '__main__':
    pool = Pool(6)
    # get all club links for all club letter urls
    club_link_dicts = pool.imap(get_club_links, club_letter_urls)
    all_club_links = {}
    for club_link_dict in club_link_dicts:
        all_club_links.update(club_link_dict)
    # iterate over a snapshot of the keys, since new entries are added below
    for club in list(all_club_links):
        # add the lowercase name to the dictionary
        all_club_links[club.lower()] = all_club_links[club]
        # look for FC at the end or start of each club and add the name without FC
        if re.search('^FC\s+|\s+FC$', club):
            club_no_FC = re.sub('^FC\s+|\s+FC$', '', club)
            all_club_links[club_no_FC] = all_club_links[club]
            all_club_links[club_no_FC.lower()] = all_club_links[club]
    pool.close()
    pool.join()
        cslsdistance = []
        for layer in [0, 1, 2]:
            rightside = batchvectors[tree_index - 1][layer][keys[3]]
            distance.append(csd(leftside[layer], rightside))
            rneigh_ind = knn_from_tree(neigh_tree[layer], tree_index, args.k)
            rneigh = [
                batchvectors[i][layer][keys[3]] for i in rneigh_ind
                if i < len(batchvectors)
            ]
            cslsdistance.append(
                csls(leftside[layer], rightside, lneigh[layer], rneigh))
        return (vocab[tree_index - 1], distance, cslsdistance)

    with Pool(8) as p:
        distances = p.map(calc_distance, candidates)
    for element in distances:
        scores[element[0]] = element[1]
        cslsscores[element[0]] = element[2]
    is_better = np.array([0, 0, 0])
    csls_better = np.array([0, 0, 0])
    if correct in scores:
        for pick in scores:
            is_better += [
                scores[pick][i] <= scores[correct][i] for i in range(3)
            ]
            csls_better += [
                cslsscores[pick][i] <= cslsscores[correct][i] for i in range(3)
            ]
    print(
        f'Loaded expression matrix of {ex_matrix.shape[0]} cells and {ex_matrix.shape[1]} genes in {end_time - start_time} seconds...',
        file=sys.stdout)

    tf_names = load_tf_names(args.tfs_fname.name)
    print(f'Loaded {len(tf_names)} TFs...', file=sys.stdout)

    ex_matrix, gene_names, tf_names = _prepare_input(ex_matrix, gene_names, tf_names)
    tf_matrix, tf_matrix_gene_names = to_tf_matrix(ex_matrix, gene_names, tf_names)

    print(f'starting {args.method} using {args.num_workers} processes...', file=sys.stdout)
    start_time = time.time()

    with Pool(args.num_workers) as p:
        adjs = list(
            tqdm.tqdm(p.imap(run_infer_partial_network,
                             target_gene_indices(gene_names, target_genes='all'),
                             chunksize=1),
                      total=len(gene_names)))

    adj = pd.concat(adjs).sort_values(by='importance', ascending=False)

    end_time = time.time()
    print(f'Done in {end_time - start_time} seconds.', file=sys.stdout)

    adj.to_csv(args.output, index=False, sep="\t")
        venue_id = sys.argv[2]
        tips = save_tips(venue_id)
        print("Got %s tips for venue %s" % (len(tips), venue_id))
    elif argument == 'listen':
        # Set up the logging
        logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%m/%d/%Y %I:%M:%S %p',
                            level=logging.INFO)

        # Set up the RabbitMQ channel
        # Tutorial: https://www.rabbitmq.com/tutorials/tutorial-six-python.html
        connection = pika.BlockingConnection(pika.ConnectionParameters(host=RABBITMQ_HOST))
        channel = connection.channel()
        channel.queue_declare(queue=RABBITMQ_QUEUE)
        channel.basic_qos(prefetch_count=1)
        channel.basic_consume(on_request, queue=RABBITMQ_QUEUE)

        ts = int(time.time())
        pool = Pool(10)

        logging.info(' [x] Waiting for RPC requests...')
        channel.start_consuming()
    else:
        print("unknown argument - %s" % argument)
        sys.exit(0)
x = Variable()
objective = Minimize(quad_form(x, 1) + 1)
constraint = [quad_form(x, 1) - 6 * x + 8 <= u]
p = Problem(objective, constraint)

# Assign a value to gamma and find the optimal x.
def get_x(u_value):
    u.value = u_value
    result = p.solve()
    return x.value

u_values = np.linspace(-0.9, 10, num=50)

# Serial computation.
x_values = [get_x(value) for value in u_values]

# Parallel computation.
pool = Pool(processes=4)
x_values = pool.map(get_x, u_values)

# Plot the tradeoff curve
plot(u_values, x_values)
# label
title('Sensitivity Analysis: p*(u) vs u')
xlabel('u')
ylabel('p*(u)')
axis([-2, 10, -1, 3])
show()
from multiprocessing import Pool

def f(x):
    return x*x

if __name__ == '__main__':
    pool = Pool(processes=40)
    result = pool.apply_async(f, [10])
    print result.get(timeout=1)
    print pool.map(f, range(10))
    if '-c' in sys.argv:
        categories = GetCategories(sys.argv[sys.argv.index('-c') + 1])
    if '-topk' in sys.argv:
        topk = int(sys.argv[sys.argv.index('-topk') + 1])
    if '-d' in sys.argv:
        abs_distance = float(sys.argv[sys.argv.index('-d') + 1])
    if '-i' in sys.argv:
        unIoU = 1 - float(sys.argv[sys.argv.index('-i') + 1])

    real_file = GetFileList(sys.argv[1], [])
    predicted_file = GetFileList(sys.argv[2], [])

    real_txt_list = []
    predicted_txt_list = []
    for pth in real_file:
        real_txt_list += GetFileList(pth + '/txt', ['.txt'])
    for pth in predicted_file:
        predicted_txt_list += GetFileList(pth, ['.txt'])

    pool = Pool(int(cpu_count() * 7 / 8))
    results = pool.map(Experiment, zip(real_txt_list, predicted_txt_list))

    precision, recall, F_score, error_rate, missing_rate, over_detected_rate = PerformanceToCsv(results)
    print("************************************************************************************************************")
    print("precision:{:.3f}; recall:{:.3f}; F_score:{:.3f}; error_rate:{:.3f}; missing_rate:{:.3f}; over_detected_rate:{:.3f}"\
        .format(precision, recall, F_score, error_rate, missing_rate, over_detected_rate))
    print("************************************************************************************************************")
    toc = time.clock()
    print('running time: {:.3f} seconds'.format(toc - tic))
# pool = multiprocessing.Pool(4)
# out1, out2, out3 = zip(*pool.map(calc_stuff, range(0, 10 * offset, offset)))

from multiprocessing import Pool

def f(x):
    return x*x

if __name__ == '__main__':
    with Pool(5) as p:
        print(p.map(f, [1, 2, 3]))
from multiprocessing import Pool

def f(x):
    return x*x

# the pool is used for parallelism when multiple inputs are provided to the
# function, processing all of the inputs at the same time
if __name__ == '__main__':
    p = Pool(5)
    print(p.map(f, [1, 2, 3]))
    try:
        resp = re.search(r'resp=(.*),cost', lines[0])
    except IndexError as e:
        pass
    else:
        response = resp.group(1)
        if resp:
            ret = urllib.parse.unquote(response)
            return_rule = get_return_rule(json.loads(ret)['response'])
            result_queue.put_nowait(return_rule)
        else:
            print('no response found')


if __name__ == '__main__':
    pool = Pool(4)
    qid_queue = Manager().Queue()
    # result_queue = Manager().Queue()
    pool.apply_async(find_log, args=('query', 0, qid_queue,), callback=call_back)
    print('one worker has gone to look for qids')
    while isinstance(qid_queue.get(), int):
        pool.apply_async(find_log, args=('responseServer', qid_queue.get_nowait(), qid_queue,), callback=call_back)
    pool.close()
    pool.join()
    index = 1
    while qid_queue.qsize() > 0:
        if not isinstance(qid_queue.get_nowait(), int) and index < 30:
            csv_file = open('travco_return_rule.csv', 'a', encoding='utf8')
            writer = csv.writer(csv_file)
            writer.writerow([qid_queue.get()])
            csv_file.close()
    train_ind_dict[i] = 1
    valid_ind_dict[j] = 1
    test_ind_dict[k] = 1
    with open(file_name, 'r') as ref:
        for ind, line in enumerate(ref):
            tmpp = line.strip().split(',')[0]
            if ind in train_ind_dict.keys():
                train_set.write(tmpp + '\n')
            elif ind in valid_ind_dict.keys():
                valid_set.write(tmpp + '\n')
            elif ind in test_ind_dict.keys():
                test_set.write(tmpp + '\n')
    train_set.close()
    valid_set.close()
    test_set.close()


if __name__ == '__main__':
    print(file_path, protein, data_directory)
    try:
        os.mkdir(file_path + '/' + protein + "/iteration_" + str(n_it))
    except:
        pass
    f_names = []
    for f in glob.glob(data_directory + '/*.txt'):
        f_names.append(f)
    t = time.time()
    with closing(Pool(np.min([tot_process, len(f_names)]))) as pool:
        pool.map(train_valid_test, f_names)
    print(time.time() - t)
if __name__ == '__main__':
    argss = [
        {
            'prefix': 'hist_',
            'key': 'card_id',
            'num_aggregations': {
                'category_2_1': stats,
                'category_2_2': stats,
                'category_2_3': stats,
                'category_2_4': stats,
                'category_2_5': stats,
                'category_3_0': stats,
                'category_3_1': stats,
                'category_3_2': stats,
            }
        }
    ]

    pool = Pool(NTHREAD)
    callback = pool.map(aggregate, argss)
    pool.close()

    #==============================================================================
    utils.end(__file__)
def PTRC(init_code, p_error, p_sampling=None, droplets=4, Nc=None, steps=20000, conv_mult=2.0): p_sampling = p_sampling or p_error iters = 10 if type(init_code) == list: # this is either 4 or 16, depending on what type of code is used. nbr_eq_classes = init_code[0].nbr_eq_classes # make sure one init code is provided for each class assert len(init_code) == nbr_eq_classes, 'if init_code is a list, it has to contain one code for each class' # store system_size for brevity size = init_code[0].system_size # if Nc is not provided, use code system_size Nc = Nc or size # initiate class ladders eq_ladders = [Ladder(p_sampling, eq_code, Nc) for eq_code in init_code] else: # this is either 4 or 16, depending on what type of code is used. nbr_eq_classes = init_code.nbr_eq_classes # store system_size for brevity size = init_code[0].system_size # if Nc is not provided, use code system_size Nc = Nc or size # convert init_code to every class and initiate ladders eq_ladders = [None] * nbr_eq_classes for eq in range(nbr_eq_classes): eq_code = copy.deepcopy(init_code) eq_code.qubit_matrix = eq_code.to_class(eq) eq_ladders[eq] = Ladder(p_sampling, eq_code, Nc) # reduce number of steps to account for parallel markov chains steps = steps // Nc # this is where we save all samples in a dict, to find the unique ones. qubitlist = {} # keep track of convergence conv_step = np.zeros(nbr_eq_classes) # keep track of shortest observed chains shortest = np.ones(nbr_eq_classes) * (2 * size ** 2) # keep track of when to stop if using convergence criteria stop = steps # inverse temperature when writing probability in exponential form beta_error = -log((p_error / 3) / (1 - p_error)) # array of betas correspoding to ladder temperatures beta_ladder = -np.log((eq_ladders[0].p_ladder[:-1] / 3) / (1 - eq_ladders[0].p_ladder[:-1])) d_beta = beta_ladder - beta_error # Array to hold the boltzmann factors for every class Z_arr = np.zeros(nbr_eq_classes) # initiate worker pool if droplets > 1: pool = Pool(droplets) # do mcmc sampling, one class at a time for eq in range(nbr_eq_classes): if droplets == 1: unique_lengths_ladder, len_counts_ladder = PTRC_droplet(eq_ladders[eq], steps, iters, conv_mult) else: # ladder of lengths of qubit matrices unique_lengths_ladder = [{} for _ in range(Nc)] # ladder of observations of qubit matrix lengths len_counts_ladder = [{} for _ in range(Nc)] args = [(copy.deepcopy(eq_ladders[eq]), steps, iters, conv_mult) for _ in range(droplets)] output = pool.starmap_async(PTRC_droplet, args).get() # combine outputs for res in output: # iterate ladder for i in range(Nc): unique_lengths_ladder[i].update(res[0][i]) # iterate output unique_lengths dictionary for length, counts in res[1][i].items(): # aggregate or init length counter if length in len_counts_ladder[i]: # aggregate N(n) len_counts_ladder[i][length][0] += counts[0] # aggregate m(n) len_counts_ladder[i][length][1] += counts[1] else: len_counts_ladder[i][length] = counts # iterate through all but top chain in ladder for i in range(Nc - 1): # sort len_counts by length sorted_counts = sorted(len_counts_ladder[i].items(), key=itemgetter(0)) # make length and count array from sorted list lengths, counts = [np.array(lst) for lst in zip(*sorted_counts)] ## calculate C estimate for each length, count pair #C_ests = counts[:, 0] / counts[:, 1] * np.exp(-beta_ladder[i] * (lengths - lengths[0])) ## remove outlier estimates #tmp = C_ests[C_ests * 2 > C_ests[0]] ## calculate final estimate #C_mean = np.sqrt(np.mean(np.square(tmp))) # Root mean square so the 
average is "top-weighted" C_mean = np.mean(counts[:2, 0] / counts[:2, 1] * np.exp(-beta_ladder[i] * (lengths[:2] - lengths[0]))) # calculate boltzmann factor from C estimate Z_est = C_mean * (counts[:, 1] * np.exp(lengths * d_beta[i] - beta_ladder[i] * lengths[0])).sum() # Accumulate boltzmann factor for equivalence class Z_arr[eq] += Z_est # Retrun normalized eq_distr return (Z_arr / np.sum(Z_arr) * 100).astype(np.uint8)#, conv_step
    'sun3d-mit_76_studyroom-76-1studyroom2',
    'sun3d-mit_lab_hj-lab_hj_tea_nov_2_2012_scan1_erika'
]

# will evaluate the descriptor in `{desc_name}_{timestr}` folder.
desc_name = sys.argv[1]
timestr = sys.argv[2]
# inlier_ratio = float(sys.argv[3])
# distance_threshold = float(sys.argv[4])
inlier_ratio = 0.05  # 5%
distance_threshold = 0.10  # 10cm

# multiprocessing to register each pair in each scene.
# this part is time-consuming
from multiprocessing import Pool

pool = Pool(len(scene_list))
func = partial(deal_with_one_scene, inlier_ratio, distance_threshold)
pool.map(func, scene_list)
pool.close()
pool.join()

# collect all the data and print the results.
inliers_list = []
recall_list = []
inliers_ratio_list = []
pred_match = 0
gt_match = 0
for scene in scene_list:
    # evaluate
    pcdpath = f"../data/3DMatch/fragments/{scene}/"
    resultpath = f"pred_result/{scene}/{desc_name}_result_{timestr}"
def read(path) -> DataFrame: def _clean(row): text = URL_REGEX.sub('', row.contents) if row.is_forward and '//@' in text: # 如果是转发的且格式正确 if text.startswith('//@'): # 如果单纯转发,则内容设置为最原始微博的内容 try: text = FORWARD_CONTENT.findall(text)[-1] i = FORWARD_SPLIT.match(text).regs[0][1] text = text[i:] except IndexError: text = text.replace('//@', '') # TODO 可以用weibo的API处理 else: # 否则截取新内容 text = text[:text.find('//@')] return text temp_name = os.path.basename(path).replace('.xlsx', '') if os.path.isfile(cache_path(temp_name)): data, texts = load_cache(temp_name) else: output(f"===> Reading from <{path}>.") data: DataFrame = read_excel(path) # .iloc[:280] # 只保留想要的4列,并去除空值,截取日期 data = data[['contents', 'time', 'id', 'is_forward']].dropna().reset_index() data['date'] = data['time'].apply(lambda s: s[:10]) data['contents'] = data['contents'].astype(str) # 预处理文本 texts = data.apply(_clean, axis=1).to_list() dump_cache((data, texts), temp_name) output(f"===> got {len(data)} rows from <{path}>.") # 解析GPU ID ltp_ids = [i.strip() for i in _ARGS.ltpIDS.split(',')] skep_ids = [i.strip() for i in _ARGS.skepIDS.split(',')] # 初始化进程池,管理器,数据队列 pool = Pool(1 + len(ltp_ids) + len(skep_ids)) # 分别分词、获取skep输入、skep运算 manager = Manager() feqture_queue = manager.Queue(16 * len(skep_ids)) result_queue = manager.Queue(16 * len(skep_ids)) # 异步任务启动 pool.apply_async(skep_producer, (feqture_queue, texts, 16, len(skep_ids))) tokens = dict() for i, (s, p) in zip(ltp_ids, generate_batch(texts, len(texts) // len(ltp_ids) + 1)): tokens[(s.start, s.stop)] = pool.apply_async(ltp_tokenzier, (p, 192, i)) for i in skep_ids: pool.apply_async(skep_consumer, (feqture_queue, result_queue, i)) # 接收结果 scores, counter = zeros(len(texts)), 1 while True: _slice, array = result_queue.get() # print(_slice) if array is None: if counter < len(skep_ids): counter += 1 else: break else: scores[_slice] = array data['tokens'] = None for s, t in tokens.items(): data['tokens'].update(Series(t.get(), range(*s))) data['sentiment_score'] = scores pool.close() pool.join() return data[['date', 'tokens', 'id', 'sentiment_score']]
def STRC(init_code, p_error, p_sampling=None, droplets=10, steps=20000, conv_mult=0): # set p_sampling equal to p_error by default p_sampling = p_sampling or p_error if type(init_code) == list: # this is either 4 or 16, depending on what type of code is used. nbr_eq_classes = init_code[0].nbr_eq_classes # make sure one init code is provided for each class assert len(init_code) == nbr_eq_classes, 'if init_code is a list, it has to contain one code for each class' # Create chains with p_sampling, this is allowed since N(n) is independet of p. eq_chains = [Chain(p_sampling, copy.deepcopy(code)) for code in init_code] # don't apply uniform stabilizers if low energy inits are provided randomize = False else: # this is either 4 or 16, depending on what type of code is used. nbr_eq_classes = init_code.nbr_eq_classes # Create chains with p_sampling, this is allowed since N(n) is independet of p. eq_chains = [None] * nbr_eq_classes for eq in range(nbr_eq_classes): eq_chains[eq] = Chain(p_sampling, copy.deepcopy(init_code)) eq_chains[eq].code.qubit_matrix = eq_chains[eq].code.to_class(eq) # apply uniform stabilizers, i.e. rain randomize = True # error model beta_error = -log((p_error / 3) / (1 - p_error)) beta_sampling = -log((p_sampling / 3) / (1 - p_sampling)) d_beta = beta_sampling - beta_error # Array to hold the boltzmann factors for every class Z_arr = np.zeros(nbr_eq_classes) # Largest possible chain length max_length = 2 * eq_chains[0].code.system_size ** 2 if droplets > 1: pool = Pool(droplets) # Iterate through equivalence classes for eq in range(nbr_eq_classes): chain = eq_chains[eq] # Start parallel processes with droplets. if droplets == 1: unique_lengths, len_counts, short_unique = STRC_droplet(copy.deepcopy(chain), steps, max_length, eq, randomize, conv_mult) shortest = next(iter(short_unique[0].values())) next_shortest = next(iter(short_unique[1].values())) else: args = [(copy.deepcopy(chain), steps, max_length, eq, randomize, conv_mult) for _ in range(droplets)] output = pool.starmap_async(STRC_droplet, args).get() # We need to combine the results from all raindrops unique_lengths = {} len_counts = {} short_unique = [{} for _ in range(2)] shortest = max_length next_shortest = max_length # Find shortest and next shortest length found by any chain for i in range(droplets): _,_,data = output[i] if next(iter(data[0].values())) < shortest: next_shortest = shortest shortest = next(iter(data[0].values())) if next(iter(data[1].values())) < next_shortest: next_shortest = next(iter(data[1].values())) # Add data from each droplet to the combined dataset for i in range(droplets): # Unpack results unique_lengths_i, len_counts_i, short_unique_i = output[i] # Combine unique lengths ( not really needed? 
) unique_lengths.update(unique_lengths_i) # Combine len_counts for key in len_counts_i: if key in len_counts: len_counts[key] += len_counts_i[key] else: len_counts[key] = len_counts_i[key] # Combine the sets of shortest and next shortest chains shortest_i = next(iter(short_unique_i[0].values())) next_shortest_i = next(iter(short_unique_i[1].values())) if shortest_i == shortest: short_unique[0].update(short_unique_i[0]) if shortest_i == next_shortest: short_unique[1].update(short_unique_i[0]) if next_shortest_i == next_shortest: short_unique[1].update(short_unique_i[1]) # Partial result needed for boltzmann factor shortest_count = len(short_unique[0]) shortest_fraction = shortest_count / len_counts[shortest] next_shortest_count = len(short_unique[1]) # Handle rare cases where only one chain length is observed if next_shortest != max_length: next_shortest_fraction = next_shortest_count / len_counts[next_shortest] mean_fraction = 0.5 * (shortest_fraction + next_shortest_fraction * exp(-beta_sampling * (next_shortest - shortest))) else: mean_fraction = shortest_fraction # Calculate boltzmann factor from observed chain lengths Z_e = sum([m * exp(-beta_sampling * shortest + d_beta * l) for l, m in len_counts.items()]) * mean_fraction Z_arr[eq] = Z_e # Use boltzmann factors as relative probabilities and normalize distribution return (Z_arr / np.sum(Z_arr) * 100)
def map_all_address_to_exchange(self):
    pool = Pool(WORKERS)  # , self.init_worker)
    pool.map(self.map_address_to_exchange, exchangesPages)
def PTDC(init_code, p_error, p_sampling=None, droplets=4, Nc=None, steps=20000, conv_mult=0): p_sampling = p_sampling or p_error iters = 10 if type(init_code) == list: # this is either 4 or 16, depending on what type of code is used. nbr_eq_classes = init_code[0].nbr_eq_classes # make sure one init code is provided for each class assert len(init_code) == nbr_eq_classes, 'if init_code is a list, it has to contain one code for each class' # store system_size for brevity size = init_code[0].system_size # if Nc is not provided, use code system_size Nc = Nc or size # initiate class ladders eq_ladders = [Ladder(p_sampling, eq_code, Nc) for eq_code in init_code] else: # this is either 4 or 16, depending on what type of code is used. nbr_eq_classes = init_code.nbr_eq_classes # store system_size for brevity size = init_code.system_size # if Nc is not provided, use code system_size Nc = Nc or size # convert init_code to ecery class and initiate ladders eq_ladders = [None] * nbr_eq_classes for eq in range(nbr_eq_classes): eq_code = copy.deepcopy(init_code) eq_code.qubit_matrix = eq_code.to_class(eq) eq_ladders[eq] = Ladder(p_sampling, eq_code, Nc) # reduce number of steps to account for parallel markov chains steps = steps // Nc # this is where we save all samples in a dict, to find the unique ones. qubitlist = {} # Z_E will be saved in eqdistr eqdistr = np.zeros(nbr_eq_classes) # keep track of convergence conv_step = np.zeros(nbr_eq_classes) # keep track of shortest observed chains shortest = np.ones(nbr_eq_classes) * (2 * size ** 2) # keep track of when to stop if using convergence criteria stop = steps # error-model beta = -log((p_error / 3) / (1 - p_error)) # initiate worker pool if droplets > 1: pool = Pool(droplets) # Do mcmc sampling, one class at a time for eq in range(nbr_eq_classes): if droplets == 1: qubitlist = PTDC_droplet(eq_ladders[eq], steps, iters, conv_mult) else: args = [(copy.deepcopy(eq_ladders[eq]), steps, iters, conv_mult) for _ in range(droplets)] output = pool.starmap_async(PTDC_droplet, args).get() for res in output: qubitlist.update(res) # mcmc sampling for class is finished. calculate boltzmann factor for key in qubitlist: eqdistr[eq] += exp(-beta * qubitlist[key]) qubitlist.clear() # Retrun normalized eq_distr return (np.divide(eqdistr, sum(eqdistr)) * 100).astype(np.uint8)#, conv_step
def build_save_dataset(corpus_type, fields, src_reader, cue_reader, tgt_reader, opt): assert corpus_type in ['train', 'valid'] if corpus_type == 'train': counters = defaultdict(Counter) srcs = opt.train_src cues = opt.train_cue tgts = opt.train_tgt ids = opt.train_ids elif corpus_type == 'valid': counters = None srcs = [opt.valid_src] cues = [opt.valid_cue] tgts = [opt.valid_tgt] ids = [None] src_vocab, tgt_vocab, existing_fields = maybe_load_vocab( corpus_type, counters, opt) existing_shards = check_existing_pt_files(opt, corpus_type, ids, existing_fields) # every corpus has shards, no new one if existing_shards == ids and not opt.overwrite: return def shard_iterator(srcs, cues, tgts, ids, existing_shards, existing_fields, corpus_type, opt): """ Builds a single iterator yielding every shard of every corpus. """ for src, tgt, cue, maybe_id in zip(srcs, tgts, cues, ids): if maybe_id in existing_shards: if opt.overwrite: logger.warning( "Overwrite shards for corpus {}".format(maybe_id)) else: if corpus_type == "train": assert existing_fields is not None,\ ("A 'vocab.pt' file should be passed to " "`-src_vocab` when adding a corpus to " "a set of already existing shards.") logger.warning("Ignore corpus {} because " "shards already exist".format(maybe_id)) continue if ((corpus_type == "train" or opt.filter_valid) and tgt is not None): filter_pred = partial(inputters.filter_example, use_src_len=opt.data_type == "text", max_src_len=opt.src_seq_length, max_tgt_len=opt.tgt_seq_length) else: filter_pred = None src_shards = split_corpus(src, opt.shard_size) cue_shards = split_corpus(cue, opt.shard_size) tgt_shards = split_corpus(tgt, opt.shard_size) for i, (ss, cs, ts) in enumerate(zip(src_shards, cue_shards, tgt_shards)): yield (i, (ss, cs, ts, maybe_id, filter_pred)) shard_iter = shard_iterator(srcs, cues, tgts, ids, existing_shards, existing_fields, corpus_type, opt) with Pool(opt.num_threads) as p: dataset_params = (corpus_type, fields, src_reader, cue_reader, tgt_reader, opt, existing_fields, src_vocab, tgt_vocab) func = partial(process_one_shard, dataset_params) for sub_counter in p.imap(func, shard_iter): if sub_counter is not None: for key, value in sub_counter.items(): counters[key].update(value) if corpus_type == "train": vocab_path = opt.save_data + '.vocab.pt' if existing_fields is None: fields = _build_fields_vocab( fields, counters, opt.data_type, opt.share_vocab, opt.vocab_size_multiple, opt.src_vocab_size, opt.src_words_min_frequency, opt.cue_vocab_size, opt.cue_words_min_frequency, opt.tgt_vocab_size, opt.tgt_words_min_frequency) else: fields = existing_fields torch.save(fields, vocab_path)
def run_program(self, dataset, parameters):
    self.logger.info("Starting run\nParameters:\n{}".format("\n".join(
        ["\t{}: {}".format(k, v) for k, v in parameters.items()])))
    self.logger.info(
        "Distributing load over {} cores".format(NUM_OF_WORKERS))

    kg_i, kg_s = dataset

    # fit model
    t0 = timer()

    # MP manager
    manager = Manager()

    # generate semantic item sets from sampled graph
    si_sets = manager.dict(generate_semantic_item_sets(kg_i))

    # generate common behaviour sets
    work = manager.Queue()
    keys = list(si_sets.keys())
    slices = self.diagonal_matrix_slicer(keys)

    cbs_sets = manager.list()
    pool = []
    for i in range(NUM_OF_WORKERS):
        p = Process(target=generate_common_behaviour_sets,
                    args=(si_sets, cbs_sets, work,
                          parameters["similarity_threshold"]))
        p.daemon = True
        p.start()
        pool.append(p)

    for slce in slices:
        work.put(slce)
    for p in pool:
        work.put(None)

    # join shared variables
    for p in pool:
        p.join()

    # extend common behaviour sets
    cbs_size = 2
    cbs_sets_extended = manager.list(cbs_sets)
    while cbs_size < parameters["max_cbs_size"]:
        func = partial(extend_common_behaviour_sets, cbs_sets_extended,
                       parameters["similarity_threshold"])
        slices = self.diagonal_matrix_slicer(cbs_sets_extended)

        cbs_sets_extention = manager.list()
        with Pool(processes=NUM_OF_WORKERS) as pool:
            it = pool.imap_unordered(func=func, iterable=slices)
            while True:
                try:
                    cbs_subset = next(it)
                    cbs_sets_extention.extend(cbs_subset)
                except StopIteration:
                    break

        cbs_sets.extend(cbs_sets_extention)
        cbs_sets_extended = cbs_sets_extention
        cbs_size *= 2

    # generate semantic association rules
    rules = manager.list()
    work = manager.Queue()
    size = max(1, floor(len(cbs_sets) / NUM_OF_WORKERS))
    slices = [slice(i, i + size) for i in range(0, len(cbs_sets), size)]

    pool = []
    for i in range(NUM_OF_WORKERS):
        p = Process(target=generate_semantic_association_rules,
                    args=(kg_i, kg_s, cbs_sets, work, rules,
                          parameters["minimal_local_support"]))
        p.daemon = True
        p.start()
        pool.append(p)

    for slce in slices:
        work.put(slce)
    for p in pool:
        work.put(None)

    # join shared variables
    for p in pool:
        p.join()

    # calculate support and confidence, skip those not meeting minimum requirements
    final_rule_set = manager.list()
    work = manager.Queue()
    size = max(1, floor(len(rules) / NUM_OF_WORKERS))
    slices = [slice(i, i + size) for i in range(0, len(rules), size)]

    pool = []
    for i in range(NUM_OF_WORKERS):
        p = Process(target=evaluate_rules,
                    args=(kg_i, rules, work, final_rule_set,
                          parameters["minimal_support"],
                          parameters["minimal_confidence"]))
        p.daemon = True
        p.start()
        pool.append(p)

    for slce in slices:
        work.put(slce)
    for p in pool:
        work.put(None)

    # join shared variables
    for p in pool:
        p.join()

    # sort rules on both support and confidence
    final_rule_set.sort(key=itemgetter(2, 1), reverse=True)

    # time taken
    t1 = timer()
    dt = t1 - t0
    print("  Program completed in {:.3f} ms".format(dt))
    print("  Found {} rules".format(len(final_rule_set)))

    return final_rule_set
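The three worker stages above all reuse the same queue-and-sentinel pattern: each Process consumes slices from a shared queue until it reads None, appending results to a managed list. A stand-alone sketch of that pattern with illustrative names (worker and the integer chunks are not from the original code):

from multiprocessing import Manager, Process

def worker(work, results):
    while True:
        slce = work.get()
        if slce is None:  # sentinel: no more work
            break
        results.append(sum(slce))

if __name__ == '__main__':
    manager = Manager()
    work = manager.Queue()
    results = manager.list()

    n_workers = 4
    pool = [Process(target=worker, args=(work, results), daemon=True)
            for _ in range(n_workers)]
    for p in pool:
        p.start()

    for chunk in ([1, 2], [3, 4], [5, 6]):
        work.put(chunk)
    for _ in pool:          # one sentinel per worker
        work.put(None)
    for p in pool:
        p.join()
    print(list(results))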
    patterns = [
        ''.join([random.choice("CD") for _ in lookup_table_keys])
        for i in range(number)
    ]
    # zip together the keys and the patterns to give a table
    tables = [dict(zip(lookup_table_keys, pattern)) for pattern in patterns]
    return tables


if __name__ == '__main__':
    arguments = docopt(__doc__, version='Lookup Evolver 0.1')

    # set up the process pool
    pool = Pool(processes=int(arguments['-i']))

    # vars for the genetic algorithm
    starting_pop = int(arguments['-k'])
    mutation_rate = float(arguments['-u'])
    generations = int(arguments['-g'])
    bottleneck = int(arguments['-b'])
    plys = int(arguments['-p'])
    start_plys = int(arguments['-s'])

    # generate a starting population of tables and score them
    # these will start off the first generation
    starting_tables = get_random_tables(plys, start_plys, starting_pop)
    real_starting_tables = axelrod_utils.score_tables(starting_tables, pool)

    # kick off the evolve function
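score_tables receives the already-created pool as an argument rather than building its own. A small sketch of that style of helper; axelrod_utils.score_tables itself is not shown above, so the scorer body here is purely an assumption:

from multiprocessing import Pool

def score_table(table):
    # hypothetical scorer: count cooperations in the lookup table
    return sum(1 for v in table.values() if v == "C")

def score_tables(tables, pool):
    # the caller owns the pool; this helper only uses it
    scores = pool.map(score_table, tables)
    return list(zip(scores, tables))

if __name__ == '__main__':
    pool = Pool(processes=2)
    tables = [{"CC": "C", "CD": "D"}, {"CC": "D", "CD": "D"}]
    print(score_tables(tables, pool))
    pool.close()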
        'score': item[5] + item[6]
    }
    '''
    print '\n'
    for i in item:
        print i
    '''


def write_to_file(content):
    # the with-block already closes the file; no explicit close needed
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        # print('writing' + item + '\n\n')
        write_to_file(item)
        # for i in item:
        #     print i, ':', item[i].strip()


if __name__ == '__main__':
    # for i in range(10):
    #     main(i * 10)
    pool = Pool()
    pool.map(main, [i * 10 for i in range(10)])
def pool_handler():
    p = Pool(2)
    p.map(work_log, work)
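work_log and work are not defined in this fragment. An assumed, self-contained version that makes the two-process pool runnable might look like this (the task list and the sleep-based worker are illustrative, not the original definitions):

from multiprocessing import Pool
import time

work = [("A", 5), ("B", 2), ("C", 1), ("D", 3)]

def work_log(task):
    name, seconds = task
    print("Task {} sleeping {} seconds".format(name, seconds))
    time.sleep(seconds)
    print("Task {} finished".format(name))

def pool_handler():
    # two worker processes share the four tasks
    p = Pool(2)
    p.map(work_log, work)

if __name__ == '__main__':
    pool_handler()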
    monconn_jobs_local_cur = monconn_jobs_local.getCursor()
    monconn_jobs_local.dropTable()
    print 'Connecting to Mongodb...finished'
    del(monconn_jobs_local)

    ######################################################################################
    # ----------------- Initiating Multiprocessing and extracting Jobs
    # ----------------- Set flag pprocessing = 1 for multiprocessing (avoid)
    ######################################################################################
    numChunks = 100
    chunkIDs = range(0, numChunks)
    print chunkIDs

    pprocessing = 0
    if pprocessing == 0:
        preProcessChunk(1)
        # for chunkID in chunkIDs:
        #     preProcessChunk(chunkID)
        pass
    else:
        numConcurrentThreads = 5
        pool = Pool(numConcurrentThreads)
        pool.map(preProcessChunk, chunkIDs)
except:
    send_email(['*****@*****.**', '*****@*****.**', '*****@*****.**'],
               "Midout Mailers -Urgent!!!",
               "Jobs Processing from SQL Failed!!!!!\nCall Akash (+91-8527716555) or Kanika (+91-9560649296) asap.")
def main():
    with Pool(processes=1, maxtasksperchild=2) as p:
        print(p.starmap_async(myTask, [(4, 3), (2, 1), (3, 2), (5, 1)]).get())
        print(p.starmap_async(myTask, [(4, 3), (2, 1), (3, 2), (2, 3)]).get())
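A runnable variant that makes the effect of maxtasksperchild=2 visible: the single worker process is typically replaced after every two tasks, which printing os.getpid() from the worker shows. myTask is a placeholder multiplication, since its definition is not part of the snippet above:

import os
from multiprocessing import Pool

def myTask(x, y):
    # return the result together with the pid of the worker that computed it
    return x * y, os.getpid()

if __name__ == '__main__':
    with Pool(processes=1, maxtasksperchild=2) as p:
        results = p.starmap_async(myTask, [(4, 3), (2, 1), (3, 2), (5, 1)]).get()
        for value, pid in results:
            print(value, 'computed by worker', pid)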
p1.join()
p2.join()
print(p1)
print(p2)
print("Done")

# using map with a process pool
from multiprocessing import Pool

def f(n):
    return n * n

p = Pool(processes=3)
result = p.map(f, [1, 2, 3, 4, 5])
for n in result:
    print(n)

# using multithreading
import threading

t = time.time()
t1 = threading.Thread(target=calculate_square, args=(arrs,))  # target=function, args=arguments of the function
t2 = threading.Thread(target=calculate_cube, args=(arrs,))    # same thing, but for cube
t1.start()  # start thread 1
t2.start()  # start thread 2
t1.join()   # wait for thread 1 to finish
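calculate_square, calculate_cube and arrs are not defined in the fragment above. An assumed, self-contained version of the threading comparison, with simple sleep-based workers standing in for the originals:

import threading
import time

arrs = [2, 3, 8, 9]

def calculate_square(numbers):
    for n in numbers:
        time.sleep(0.2)          # simulate some work per item
        print('square:', n * n)

def calculate_cube(numbers):
    for n in numbers:
        time.sleep(0.2)
        print('cube:', n * n * n)

if __name__ == '__main__':
    t = time.time()
    t1 = threading.Thread(target=calculate_square, args=(arrs,))
    t2 = threading.Thread(target=calculate_cube, args=(arrs,))
    t1.start()
    t2.start()
    t1.join()
    t2.join()
    print('done in', round(time.time() - t, 2), 'seconds')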