def process_articles(entity_type=Entity, output_filename='output-all.txt',
                     corpus_root='corpus/'):
    terms = select_terms(entity_type)

    Session.expunge_all()
    Session.close()

    articles = Session.query(Entity.sep_dir).filter(Entity.sep_dir != None)
    articles = articles.filter(Entity.sep_dir != '')
    articles = articles.distinct().all()
    articles = [a[0] for a in articles]

    # parallel processing of articles
    p = Pool()
    args = [(title, terms, entity_type, None, corpus_root)
            for title in articles]
    doc_lines = p.map(process_wrapper, args)
    p.close()

    # serial processing for tests
    '''
    doc_lines = []
    for title in articles:
        lines = process_article(title, terms, entity_type, None, corpus_root)
        doc_lines.append(lines)
    '''

    # write graph output to file
    print output_filename
    with open(output_filename, 'w') as f:
        for lines in doc_lines:
            f.writelines(lines)
def main():
    parser = argparse.ArgumentParser(description='Analyze a bandersnatch mirror.')
    parser.add_argument('--json', help='save raw data to a json file',
                        default=None)
    args = parser.parse_args()
    concurrency = 8
    root = "/var/spool/pypi/web/packages/source/"
    p = Pool()
    results = {}
    try:
        try:
            for path, result in \
                    p.imap_unordered(analyse_sdist, yield_packages(root)):
                results[path] = result
            p.close()
        except:
            p.terminate()
            raise
    finally:
        p.join()
    if args.json:
        with open(args.json, 'wb') as f:
            f.write(json.dumps(results))
    pprint.pprint(results)
class withPool:
    def __init__(self, procs):
        self.p = Pool(procs, init_func)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.p.close()
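# Usage sketch for the withPool context manager above; hedged: assumes
# init_func and the worker function are defined at module level so they
# pickle. Note that __exit__ only calls close(); a caller that must block
# until all workers have exited can still call wp.p.join() afterwards.
def _square(x):
    return x * x

def _demo_with_pool():
    with withPool(4) as wp:
        results = wp.p.map(_square, range(10))
    return results  # [0, 1, 4, 9, ...]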
def enumerate_all_subgraphs_upto_size_k_parallel(document_graph, k,
                                                 num_of_workers=4):
    """
    returns all subgraphs of a DiscourseDocumentGraph (i.e. a MultiDiGraph)
    with up to k nodes. This is a trivially parallelized version of
    enumerate_all_subgraphs_upto_size_k()
    """
    document_nodes = len(document_graph)
    if k > document_nodes:
        k = document_nodes

    int_graph = nx.convert_node_labels_to_integers(
        nx.DiGraph(document_graph), first_label=1, label_attribute='node_id')

    pool = Pool(processes=num_of_workers)  # number of CPUs
    results = [pool.apply_async(enumerate_all_size_k_subgraphs,
                                args=(int_graph, i))
               for i in xrange(1, k + 1)]
    pool.close()
    pool.join()

    subgraphs = []
    for result in results:
        tmp_result = result.get()
        if isinstance(tmp_result, list):
            subgraphs.extend(tmp_result)
        else:
            subgraphs.append(tmp_result)
    return subgraphs
def crawl_recursive_threaded(dirpath, ext):
    from database import indexer
    from database import utils
    from multiprocessing import Pool

    # convert to our infos
    cdir = indexer.DirInfo(dirpath, ext)
    cInfos = indexer.dirs_to_info(cdir.subfolders(), ext)

    # comment if you want a silent indexing
    print(cdir.to_string())

    # recursive pooled call
    # NOTE: child calls must not be pooled
    p = Pool(utils.Settings.config['processes'])
    infos = p.map(crawl_recursive, cInfos)
    p.close()

    # remove hierarchy
    dirInfos = [d for sublist in infos for d in sublist]
    dirInfos.append(cdir)

    print('I was crawling with %d processes' %
          utils.Settings.config['processes'])

    return dirInfos
def main():
    global output_doc_path
    if output_doc_path == '':
        output_doc_path = os.path.join(os.path.split(input_doc_path)[0],
                                       'outputTinypng')
    if not os.path.exists(output_doc_path):
        os.mkdir(output_doc_path)

    # os.walk yields three values: 1. the parent directory,
    # 2. all folder names (without path), 3. all file names
    for parent, dirnames, filenames in os.walk(input_doc_path):
        for dirname in dirnames:  # folder info
            # print("parent is:" + parent)
            # print("dirname is" + dirname)
            outDir = os.path.join(
                output_doc_path,
                os.path.relpath(os.path.join(parent, dirname), input_doc_path))
            if not os.path.exists(outDir):
                os.mkdir(outDir)
        for filename in filenames:  # file info
            # print("parent is:" + parent)
            # print("filename is:" + filename)
            filePaths.append(os.path.join(parent, filename))

    pngFilePaths = filter(
        lambda x: os.path.splitext(x)[1] == '.png'
        or os.path.splitext(x)[1] == '.jpg', filePaths)
    print('Parent process %s.' % os.getpid())
    p = Pool(poolLimite)
    for fileName in pngFilePaths:
        p.apply_async(getTinyPng, args=(fileName,))
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')
def __decrypt_file(self, private_d, public_n, keys, path_to_file, CRT, k):
    if CRT:
        pool = Pool(processes=k)
        promises = []
    decrypted_data = ''
    with open(path_to_file, 'r') as f:
        encrypted_data = f.read()
        encrypted_data_chunks = list(
            map(''.join, zip(*[iter(encrypted_data)] * len(str(public_n)))))
        for i in range(len(encrypted_data_chunks)):
            stripped = encrypted_data_chunks[i].lstrip('0')
            if CRT:
                promise = pool.apply_async(self.compute_part_of_message,
                                           args=(stripped, keys, i))
                promises.append(promise)
            else:
                decrypted_data += chr(
                    self.__decrypt_message(stripped, private_d, public_n))
        if CRT:
            results = [promise.get() for promise in promises]
            decrypted_sorted = sorted(results, key=lambda x: x[1])
            for data in decrypted_sorted:
                decrypted_data += chr(data[0])
    if CRT:
        pool.close()
    with open(path_to_file + '.dec', 'w') as f:
        f.write(decrypted_data)
    return decrypted_data
def rc(rf, alphabet, numOfThreads):
    tryn = 0
    counterTmp = 0
    printCounter = 1000
    listBasic = []
    if rf.endswith('.rar'):
        funcChosen = unrar
    elif rf.endswith('.zip') or rf.endswith('.7z'):
        funcChosen = zipFileUnzip
    for a in range(1, len(alphabet) + 1):
        for b in itertools.product(alphabet, repeat=a):
            k = "".join(b)
            k = re.escape(k)
            listBasic.append(k)
            tryn += 1
            if len(listBasic) == numOfThreads:
                pool = Pool(numOfThreads)
                pool.map_async(funcChosen, listBasic, callback=exitPass)
                pool.close()
                if resultPass:
                    timeWasted = time.time() - start
                    print 'Found! Password is ' + resultPass
                    print "It took " + str(round(time.time() - start, 3)) + " seconds"
                    print "Speed: " + str(round(tryn / float(timeWasted), 2)) + " passwords/sec"
                    print "Tried " + str(tryn) + " passwords"
                    exit()
                listBasic = []
            counterTmp += 1
            if counterTmp >= printCounter:
                print 'Trying combination number ' + str(tryn) + ':' + str(k)
                timeWasted = round(time.time() - start, 2)
                if timeWasted > 0:
                    print "It took already " + str(timeWasted) + " seconds. Speed: " + str(round(tryn / float(timeWasted), 2)) + " passwords/sec"
                counterTmp = 0
def build_from_queries(queries):
    p = Pool(5)
    query_results = p.map(q_exec, queries)
    p.close()
    p.join()
    # process the query_results
    return query_results
def calcSynScores(scoresO, aabrhRawScoreSummmaryD, geneNames, geneOrderT,
                  synWSize, numSynToTake, numThreads):
    '''Calculate the synteny score between two genes and add to edge
    attributes of scoresO. We only bother making synteny scores for those
    genes that have an edge in scoresO.
    '''
    neighborTL = createNeighborL(geneNames, geneOrderT, synWSize)

    # make list of groups of arguments to be passed to p.map. There
    # should be numThreads groups.
    argumentL = [([], neighborTL, numSynToTake, geneNames,
                  aabrhRawScoreSummmaryD, scoresO)
                 for i in range(numThreads)]

    i = 0
    for gn1, gn2 in scoresO.iterateEdgesByEndNodes():
        argumentL[i % numThreads][0].append((gn1, gn2))
        i += 1

    p = Pool(numThreads)  # num threads
    synScoresLL = p.map(synScoreGroup, argumentL)
    p.close()
    p.join()

    # add to scores object
    for synScoresL in synScoresLL:
        for gn1, gn2, sc in synScoresL:
            scoresO.addScoreByEndNodes(gn1, gn2, sc, 'synSc')
    return scoresO
def expand_all_commits(code_dir, target_dir, only_year=None):
    print code_dir
    uname_lookup_by_year_q = load_uname_lookup_by_year_q()
    latest_submissions = get_latest_submissions(code_dir)
    num_students = len(latest_submissions)

    def get_commit_args(args):
        i, student = args
        latest_submit = latest_submissions[student]
        student_dir = os.path.join(code_dir, latest_submit)
        year_q = get_submit_time(student_dir)
        if (not year_q) or only_year != year_q:
            return (-1, '', '', -1, '', -1)
        year_target_dir = os.path.join(target_dir, year_q)
        if year_q not in uname_lookup_by_year_q or \
                latest_submit not in uname_lookup_by_year_q[year_q]:
            add_uname_to_lookup(latest_submit, year_q, uname_lookup_by_year_q)
        student_id = uname_lookup_by_year_q[year_q][latest_submit]
        #if student_id != '2012010247': return (-1,'','',-1,'',-1)
        return i, student, student_dir, student_id, year_target_dir, num_students

    students = sorted(latest_submissions.keys())
    zipped_args = map(get_commit_args, enumerate(students))
    non_students = [student for i, student in enumerate(students)
                    if zipped_args[i][0] == -1]
    #print "unsuccessful"
    #print '\n'.join([latest_submissions[student] for student in non_students])

    pool = ThreadPool(8)
    results = pool.map(thread_process_commit, zipped_args)
    pool.close()
    pool.join()

    export_uname_lookup_by_year_q(uname_lookup_by_year_q)
def import_images(folder, par=True, ttime=True):
    """
    This function loads images from a folder as PIL Image files and
    thresholds them, creating a list of z-slices to be turned into a matrix.
    This version is not currently used.
    """
    fils = os.listdir(folder)  # was wrapped in an extra list, breaking keep_tifs

    def keep_tifs(rawlist):
        tiflist = []
        for f in rawlist:
            if len(f.split('.')) > 1:
                if f.split('.')[1] == 'tif':
                    tiflist.append(f)
        return tiflist

    tiflist = keep_tifs(fils)
    # sorted() returns the alphabetized list; list.sort() returns None
    newtiflist = sorted([folder + f for f in tiflist])
    tifobjs = [load_img_array(f) for f in newtiflist]  # use full paths

    # here start parallel stuff; the original elif was unreachable when
    # ttime was True, so both branches now run when timing is requested
    if par or ttime:
        start_time_par = timer()
        pool = Pool(8)
        results_par = pool.map(show_at_thresh, tifobjs)
        pool.close()
        pool.join()
        total_time_par = timer() - start_time_par
    # or non-parallel stuff
    if not par or ttime:
        start_time_nopar = timer()
        results_nopar = [show_at_thresh(f) for f in newtiflist]
        total_time_nopar = timer() - start_time_nopar

    print('Time for parallel: %.2f seconds' % total_time_par)
    print('Time for non-parallel: %.2f seconds' % total_time_nopar)

    return results_par, results_nopar
def calcRawScores(fastaFilePath, numThreads, geneNames, gapOpen, gapExtend,
                  matrix, scoresO):
    '''Get a global alignment based raw score for every edge in scoresO.'''

    # load sequences
    protFnL = glob.glob(fastaFilePath)
    seqD = genomes.loadProt(protFnL)

    # make list of sets of arguments to be passed to p.map. There
    # should be numThreads sets.
    argumentL = [([], seqD, gapOpen, gapExtend, matrix)
                 for i in range(numThreads)]

    i = 0
    for g1, g2 in scoresO.iterateEdgesByEndNodes():
        edgeNum = scoresO.endNodesToEdge(g1, g2)
        edgeT = edgeNum, geneNames.numToName(g1), geneNames.numToName(g2)
        argumentL[i % numThreads][0].append(edgeT)
        i += 1

    # run
    p = Pool(numThreads)
    scoresLL = p.map(rawScoreGroup, argumentL)
    p.close()
    p.join()

    # store in scoresO
    for scoresL in scoresLL:
        for edgeNum, sc in scoresL:
            scoresO.addScoreByEdge(edgeNum, sc, 'rawSc')
    return scoresO
def getData():
    if os.path.isfile("chat_urls.p"):
        chat_urls = pickle.load(open("chat_urls.p", "rb"))
    else:
        chat_urls = {}
        for user in users:
            chat_urls[user] = get_urls(user)
        teams_url = "http://espn.go.com/mlb/teams"
        pickle.dump(chat_urls, open("chat_urls.p", "wb"))

    # for user in chat_urls:
    #     urls = chat_urls[user]
    #     for url in urls:
    #         getLog(url)

    logDB = {}
    for user in chat_urls:
        logDB[user] = {}
    p = Pool(20)
    i = 0
    manager = Manager()
    db = manager.dict()
    for user in chat_urls:
        for url in chat_urls[user]:
            i += 1
            p.apply_async(addLogData, args=(url, db))
    p.close()
    p.join()
    out = db._getvalue()
    outfile = open("rawChat.txt", "wb")
    for url in out:
        outfile.write(out[url] + "\n")
def fetch_imagery(image_locations, local_dir):
    pool = Pool(cpu_count())
    tupled = [(loc[0], loc[1], local_dir) for loc in image_locations]
    try:
        pool.map(fetch_imagery_uncurried, tupled)
    finally:
        pool.close()
def correction_terms_threaded(self):
    '''Finds the correction terms associated to the quadratic form,
    for each of the equivalence classes it finds the maximum by
    iterating through the relation vectors of the group.

    Uses multiprocessing.'''
    print 'Using multiprocessing'
    pool = Pool()  # default: processes=None => uses cpu_count()
    manager = Manager()
    start_time = time.time()
    coef_lists = lrange(self.group.structure)
    # representatives = elements of C_1(V) (np.matrix)
    representatives = map(lambda l: self.find_rep(l), coef_lists)
    # list of maxes
    lst = manager.list([None for i in xrange(len(representatives))])
    alphalist = list(self.get_alpha())  # cannot pickle generators
    pool.map_async(functools.partial(process_alpha_outside, self,
                                     representatives, lst),
                   alphalist)
    pool.close()
    pool.join()  # wait for pool to finish
    # get corrterms via (|alpha|^2+b)/4
    print 'Computed from quadratic form in %g seconds' \
        % (time.time() - start_time)
    return [Fraction(Fraction(alpha, self.int_inverse[1]) + self.b, 4)
            for alpha in lst]
def main():
    if MAIL_TO:
        signal.signal(signal.SIGALRM, send_email_by_alarm)
        signal.alarm(TIME_NOTIFICATION_BY_EMAIL)
        send_email_start()
    start_time = int(time.time())
    manager = Manager()
    queue = manager.Queue()
    pool = Pool(PROCESS_NUMBER + 1)
    jobs = []
    pool.apply_async(listener, args=(queue,))
    for config_file in FINAL_CONFIG_TO_SCRAPE:
        job = pool.apply_async(scraper, (config_file, queue))
        jobs.append(job)
    for i, job in enumerate(jobs):
        job.get()
    # although all jobs have finished, for unknown reasons some providers
    # may still be running
    time.sleep(10)  # allow extra time to make sure no provider is running
    for i in range(1000):
        if len(get_summary().provider_running) > 0:
            time.sleep(500)
        else:
            break
    print "Run all has finished"
    queue.put(LISTENER_KILL_SIGNAL)
    pool.close()
    if MAIL_TO:
        send_email_end()
def dirImgProcess(path):
    global workerPool, workerOutput, theGreatIndex
    workerPool = Pool()
    workerOutput = []
    work = []
    theGreatIndex = {}
    pagenumber = 0
    for (dirpath, dirnames, filenames) in os.walk(path):
        for afile in filenames:
            if getImageFileName(afile) is not None:
                pagenumber += 1
                work.append([afile, dirpath, options])
    if GUI:
        GUI.progressBarTick.emit(str(pagenumber))
    if len(work) > 0:
        for i in work:
            workerPool.apply_async(func=fileImgProcess, args=(i,),
                                   callback=fileImgProcess_tick)
        workerPool.close()
        workerPool.join()
        if GUI and not GUI.conversionAlive:
            rmtree(os.path.join(path, '..', '..'), True)
            raise UserWarning("Conversion interrupted.")
        if len(workerOutput) > 0:
            rmtree(os.path.join(path, '..', '..'), True)
            raise RuntimeError("One of workers crashed. Cause: " +
                               workerOutput[0])
    else:
        rmtree(os.path.join(path, '..', '..'), True)
        raise UserWarning("Source directory is empty.")
def score_all_genes(self, graph, num_procs=1):
    partial_score_gene = partial(score_gene, graph=graph,
                                 top_genes=self.top_genes)
    p = Pool(num_procs)
    result = p.map(partial_score_gene, list(self.vd.gene_names()))
    p.close()

    # convert them all to percentiles
    cent_hist = numpy.array([x[1] for x in result if x[1] != -1])
    nn_hist = numpy.array([x[2] for x in result if x[2] != -1])

    batch = []
    for gene, cent_score, nn_score in result:
        # edge case: gene is a top gene
        if gene in self.top_genes:
            cent_perc = 1
            nn_perc = 1
        # edge case: gene isn't in network
        elif cent_score == -1 or \
                nn_score == -1:
            cent_perc = 0
            nn_perc = 0
        else:
            cent_perc = scipy.stats.percentileofscore(
                cent_hist, cent_score) / 100.0
            nn_perc = 1 - scipy.stats.percentileofscore(
                nn_hist, nn_score) / 100.0
            print "gene: %s\n  c: %s\n  c_p: %s\n  n: %s\n  n_p: %s" % \
                (gene, cent_score, cent_perc, nn_score, nn_perc)
        batch.append((cent_score, cent_perc, nn_score, nn_perc, gene))

    self.vd._c.executemany("UPDATE genes SET cent_score = ?, cent_perc = ?, "
                           "nn_score = ?, nn_perc = ? WHERE name = ?", batch)
    self.vd._conn.commit()
def train_word2id():
    """Convert every word in the training set to its corresponding id."""
    time0 = time.time()
    print('Processing train data.')
    df_train = pd.read_csv('../raw_data/question_train_set.txt', sep='\t',
                           usecols=[0, 2, 4],
                           names=['question_id', 'word_title', 'word_content'],
                           dtype={'question_id': object})
    print('training question number %d ' % len(df_train))
    # questions without content: substitute the title for the content
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_train))):
        word_content = df_train.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d train questions without content.'
          % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_train.at[na_index, 'word_content'] = df_train.at[na_index, 'word_title']
    # questions without a title are dropped
    na_title_indexs = list()
    for i in xrange(len(df_train)):
        word_title = df_train.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d train questions without title.' % len(na_title_indexs))
    df_train = df_train.drop(na_title_indexs)
    print('After dropping, training question number(should be 2999952) = %d'
          % len(df_train))
    # convert words to ids
    p = Pool()
    train_title = np.asarray(p.map(get_id4words, df_train.word_title.values))
    np.save('../data/wd_train_title.npy', train_title)
    train_content = np.asarray(p.map(get_id4words, df_train.word_content.values))
    np.save('../data/wd_train_content.npy', train_content)
    p.close()
    p.join()
    print('Finished changing the training words to ids. Cost time %g s'
          % (time.time() - time0))
def multiprocessing_method(n_of_darts, n_of_pools=10):
    '''Uses 10 processes to split the work of dart_algorithm. Since the
    processes are independent, I split the number of darts over the number
    of processes, and used the pool function. Instead of looping over the
    number of processes, I wrote them explicitly.'''
    pool = Pool(processes=n_of_pools)
    darts_per_pool = n_of_darts / n_of_pools

    start = time()
    result1 = pool.map_async(dart_algorithm, [darts_per_pool])
    result2 = pool.map_async(dart_algorithm, [darts_per_pool])
    result3 = pool.map_async(dart_algorithm, [darts_per_pool])
    result4 = pool.map_async(dart_algorithm, [darts_per_pool])
    result5 = pool.map_async(dart_algorithm, [darts_per_pool])
    result6 = pool.map_async(dart_algorithm, [darts_per_pool])
    result7 = pool.map_async(dart_algorithm, [darts_per_pool])
    result8 = pool.map_async(dart_algorithm, [darts_per_pool])
    result9 = pool.map_async(dart_algorithm, [darts_per_pool])
    result10 = pool.map_async(dart_algorithm, [darts_per_pool])

    Pool_darts_in_circle = result1.get()[0] + result2.get()[0] + \
        result3.get()[0] + result4.get()[0] + result5.get()[0] + \
        result6.get()[0] + result7.get()[0] + result8.get()[0] + \
        result9.get()[0] + result10.get()[0]
    # get() blocks until the workers finish, so take the end time after
    # collecting the results rather than right after dispatch
    end = time()
    multiprocessing_time = end - start

    pool.close()
    pool.join()

    pi = 4 * Pool_darts_in_circle / float(n_of_darts)
    # print "multiprocessing pi approximation= ", pi
    return multiprocessing_time
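# Hedged aside on multiprocessing_method above: the ten explicit map_async
# calls can be collapsed into a list comprehension with no measurable
# dispatch overhead. This sketch assumes the same dart_algorithm worker and
# the same Pool/time imports as the original.
def multiprocessing_method_loop(n_of_darts, n_of_pools=10):
    pool = Pool(processes=n_of_pools)
    darts_per_pool = n_of_darts / n_of_pools
    start = time()
    results = [pool.map_async(dart_algorithm, [darts_per_pool])
               for _ in range(n_of_pools)]
    # get() blocks, so the elapsed time covers the actual work
    darts_in_circle = sum(r.get()[0] for r in results)
    end = time()
    pool.close()
    pool.join()
    return end - start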
def multi_remote_exec_cmd(hosts, username, cmd):
    pool = Pool(processes=5)
    for host in hosts:
        username, password, ip, port = get_connect_item(username, host)
        pool.apply_async(remote_exec_cmd, (ip, port, username, password, cmd))
    pool.close()
    pool.join()
def get(self, tag="貓咪", max_tag_id=None):
    if tag == "":
        tag = "貓咪"
    p = Pool(10)
    if self.prefix == "ajax":
        medias, next_ = util.search_by_tag(tag, 3, max_tag_id)
    else:
        medias, next_ = util.search_by_tag(tag, 5, max_tag_id)
    fs = p.map(util.features, medias)
    p_label, _, _ = libsvm.svm_predict([1] * len(fs), fs, model)

    for (m, f) in zip(medias, fs):
        print(m["caption"]["text"])
        print(f)

    if self.prefix == "ajax":
        medias = map(lambda (m, l): Media(m, l).__dict__, zip(medias, p_label))
        self.write(json.dumps({
            "max_tag_id": next_,
            "medias": medias
        }))
    else:
        medias = map(lambda (m, l): Media(m, l), zip(medias, p_label))
        if self.prefix == "demo1":
            self.render("demo1.html", medias=medias, tag_name=tag,
                        max_tag_id=next_)
        elif self.prefix == "demo2":
            self.render("demo2.html", medias=medias, tag_name=tag,
                        max_tag_id=next_)
        else:
            self.render("main.html", medias=medias, tag_name=tag,
                        max_tag_id=next_)
    p.close()
    p.join()
def matrix_vector_iteration_by_processes(A, x, k):
    # create a temporary directory to store the matrix and the vectors
    tmpdir = tempfile.mkdtemp()

    nvec = get_nvec(x)
    y = x.copy()

    save_matrix(tmpdir, A)
    for i in xrange(nvec):
        save_x(tmpdir, x, i)

    # start processes
    pool = Pool(processes=min(nvec, 6))
    processes = []

    for i in xrange(nvec):
        processes.append(pool.apply_async(matrix_vector_iteration_process,
                                          (tmpdir, i, k)))

    # fetch results (vector/matrix shape version)
    if x.ndim == 1:
        processes[0].get()
        y = load_x(tmpdir, 0)
    else:
        for i in xrange(nvec):
            processes[i].get()
            y[:, i] = load_x(tmpdir, i)

    pool.close()

    # remove temporary directory (with all it contains)
    shutil.rmtree(tmpdir)

    return y
def run_train_models(processes, model_library, **kwargs):
    """Train many supervised learning problems in parallel.

    model_library = a list specifying the model library for the dataset,
    in the format needed for TrainModelCV
    **kwargs: all the rest of the input to TrainModelCV"""
    # sample input for model_library:
    # [[LogisticRegression, classification_error, 'parameters.json', (), {'lam':0.5}],
    #  [LogisticRegression, auc_wmw_fast, None, (), {'C':50}]]

    # use a process pool to execute all the training jobs,
    # then collect the results and combine to return
    from multiprocessing import Pool

    p = Pool(processes)

    #ret = {}
    #for model in model_library:
    #    p.apply_async(_pool_helper, (model_library, ), kwargs, callback=ret.update)
    results = []
    for model in model_library:
        results.append(p.apply_async(_pool_helper, (model, ), kwargs))

    # wait on the pool to finish
    p.close()
    p.join()

    # collect the results
    ret = {}
    for result in results:
        ret.update(result.get())

    return ret
def get(self):
    mode = toAlpha3Code(self.get_argument('lang'))
    text = self.get_argument('q')
    if not text:
        self.send_error(400, explanation='Missing q argument')
        return

    def handleCoverage(coverage):
        if coverage is None:
            self.send_error(408, explanation='Request timed out')
        else:
            self.sendResponse([coverage])

    if mode in self.analyzers:
        pool = Pool(processes=1)
        result = pool.apply_async(getCoverage,
                                  [text, self.analyzers[mode][0],
                                   self.analyzers[mode][1]])
        pool.close()

        @run_async_thread
        def worker(callback):
            try:
                callback(result.get(timeout=self.timeout))
            except TimeoutError:
                pool.terminate()
                callback(None)

        coverage = yield tornado.gen.Task(worker)
        handleCoverage(coverage)
    else:
        self.send_error(400, explanation='That mode is not installed')
def main(path, out, cores):
    """
    Compute contact energies for each pdb in path and write results to 'out'.
    :param path: str
    :param out: str
    :param cores: int
    :return: None
    """
    # Find all pdbs in path
    workload = []
    for file in os.listdir(path):
        if os.path.splitext(file)[1].lower() == ".pdb":
            workload.append(file)
    # Print a few newlines to prevent the progressbar from messing up the shell
    print("\n\n")
    # Compute energies
    pool = Pool(processes=cores)
    results = []
    for (nr, pdb) in enumerate(workload):
        updateprogress(pdb, nr / len(workload))
        e = computecontactenergy(os.path.join(path, pdb), pool)
        results.append((pdb, e))
    pool.close()
    # Make 100% appear
    updateprogress("Finished", 1)
    # Store output
    with open(out, "w") as handler:
        handler.write("PDB,Energy in kcal/mol\n")
        for pair in results:
            handler.write("{},{}\n".format(*pair))
def _run(self, source, destination_format, clear_source=False, workers=-1):
    """
    Parallel version of the `convert` method.
    :param source: (rdf) files to convert (source path)
    :param destination_format: the destination format
    :param clear_source: if set, delete the source files. Default = False
    :return: None
    """
    files = []
    src = os.path.abspath(source)
    if os.path.isdir(src):
        files = [os.path.join(src, f) for f in os.listdir(src)
                 if to_process(f, destination_format)]
    elif os.path.exists(src):
        files = [src]

    self._log.info('to process: {0}'.format(files))
    if clear_source:
        self._log.warn('will remove original files after conversion')

    def job_finished(res):
        print '.',
        sys.stdout.flush()

    num_cpus = cpu_count()
    num_workers = workers if 0 < workers < num_cpus else num_cpus

    pool = Pool(processes=num_workers)

    for src in files:
        dst = dest_file_name(src, destination_format)
        if dst:
            pool.apply_async(convert_file, (src, dst, clear_source),
                             callback=job_finished)

    pool.close()
    pool.join()
def run_make_submission(settings, targets_and_pipelines, split_ratio):
    pool = Pool(settings.N_jobs)
    for i, (target, pipeline, feature_masks, classifier,
            classifier_name) in enumerate(targets_and_pipelines):
        for j, feature_mask in enumerate(feature_masks):
            progress_str = 'T=%d/%d M=%d/%d' % (i + 1, len(targets_and_pipelines),
                                                j + 1, len(feature_masks))
            pool.apply_async(make_submission_predictions,
                             [settings, target, pipeline, classifier,
                              classifier_name],
                             {'feature_mask': feature_mask,
                              'progress_str': progress_str,
                              'quiet': True})
    pool.close()
    pool.join()

    guesses = ['clip,preictal']
    num_masks = None
    classifier_names = []
    for target, pipeline, feature_masks, classifier, classifier_name in targets_and_pipelines:
        classifier_names.append(classifier_name)
        if num_masks is None:
            num_masks = len(feature_masks)
        else:
            assert num_masks == len(feature_masks)

        test_predictions = []
        for feature_mask in feature_masks:
            data = make_submission_predictions(settings, target, pipeline,
                                               classifier, classifier_name,
                                               feature_mask=feature_mask)
            test_predictions.append(data.mean_predictions)

        predictions = np.mean(test_predictions, axis=0)
        guesses += make_csv_for_target_predictions(target, predictions)

    output = '\n'.join(guesses)
    write_submission_file(settings, output,
                          'ensemble n=%d split_ratio=%s' % (num_masks, split_ratio),
                          None, str(classifier_names), targets_and_pipelines)
def test_word2id():
    """Convert every word in the test set to its corresponding id."""
    time0 = time.time()
    print('Processing eval data.')
    df_eval = pd.read_csv('../raw_data/question_eval_set.txt', sep='\t',
                          usecols=[0, 2, 4],
                          names=['question_id', 'word_title', 'word_content'],
                          dtype={'question_id': object})
    print('test question number %d' % len(df_eval))
    # questions without a title: substitute the content for the title
    na_title_indexs = list()
    for i in xrange(len(df_eval)):
        word_title = df_eval.word_title.values[i]
        if type(word_title) is float:
            na_title_indexs.append(i)
    print('There are %d test questions without title.' % len(na_title_indexs))
    for na_index in na_title_indexs:
        df_eval.at[na_index, 'word_title'] = df_eval.at[na_index, 'word_content']
    # questions without content: substitute the title for the content
    na_content_indexs = list()
    for i in tqdm(xrange(len(df_eval))):
        word_content = df_eval.word_content.values[i]
        if type(word_content) is float:
            na_content_indexs.append(i)
    print('There are %d test questions without content.'
          % len(na_content_indexs))
    for na_index in tqdm(na_content_indexs):
        df_eval.at[na_index, 'word_content'] = df_eval.at[na_index, 'word_title']
    # convert words to ids
    p = Pool()
    eval_title = np.asarray(p.map(get_id4words, df_eval.word_title.values))
    np.save('../data/wd_eval_title.npy', eval_title)
    eval_content = np.asarray(p.map(get_id4words, df_eval.word_content.values))
    np.save('../data/wd_eval_content.npy', eval_content)
    p.close()
    p.join()
    print('Finished changing the eval words to ids. Cost time %g s'
          % (time.time() - time0))
def main():
    # set arguments
    # arguments are passed to classes
    parser = argparse.ArgumentParser(
        description="Evaluate completeness and contamination of a MAG."
    )
    parser.add_argument(
        "genomes", type=str, help="Find marker for these genomes", nargs="+"
    )
    parser.add_argument(
        "--out",
        "-o",
        type=str,
        required=False,
        help="Path to output folder (Default: .)",
        default=".",
    )
    parser.add_argument("--db", type=str, default=None, help="Path to EukCC DB")
    parser.add_argument(
        "--threads", type=int, help="Number of threads to use (Default: 1)", default=1
    )
    parser.add_argument(
        "--tree",
        type=int,
        help="Number of profiles to use at target for tree profiles (default: 30)",
        default=30,
    )
    parser.add_argument(
        "--clade",
        default="base",
        type=str,
        help="Define clade as base, fungi, protozoa or plants (Default: base)",
    )
    parser.add_argument(
        "--quiet",
        "-q",
        dest="quiet",
        action="store_true",
        default=False,
        help="Silence most output",
    )
    parser.add_argument(
        "--debug",
        "-d",
        action="store_true",
        default=False,
        help="Debug and thus ignore safety",
    )
    parser.add_argument(
        "-v",
        "--version",
        action="version",
        version="EukCC version {}".format(version.__version__),
    )
    args = parser.parse_args()
    state = eukcc_state(
        workdir=os.path.join(args.out, "refine_workdir"), options=vars(args)
    )
    file.isdir(state["workdir"])

    # define logging
    logLevel = logging.INFO
    if state["quiet"]:
        logLevel = logging.WARNING
    elif state["debug"]:
        logLevel = logging.DEBUG
    logging.basicConfig(
        format="%(asctime)s %(message)s",
        datefmt="%d-%m-%Y %H:%M:%S: ",
        level=logLevel,
    )

    # if db is not set, we check for the env variable
    if state["db"] is None:
        if os.environ.get("EUKCC2_DB") is not None:
            state["db"] = os.environ.get("EUKCC2_DB")
            logging.debug(
                "Defined db via env variable EUKCC2_DB as '{}'".format(state["db"])
            )
        else:
            logging.error("No database was provided via --db or EUKCC2_DB env variable")
            exit(202)

    logging.info("EukCC version {}".format(version.__version__))
    logging.info(
        "Looking for shared markers across {} genomes".format(len(state["genomes"]))
    )

    # using more threads for hmmer makes no sense, so we parallelize
    # across genomes
    n_per_worker = 4
    if state["threads"] > (2 * n_per_worker):
        # multiprocessing pool
        n_processes = math.floor(state["threads"] / n_per_worker)
        logging.info(
            "Launching {} workers with {} threads each".format(
                n_processes, n_per_worker
            )
        )
        pool = Pool(processes=n_processes)
        # change the thread count passed to each worker
        opt = {k: v for k, v in state.opt.items()}
        opt["threads"] = n_per_worker
        search_genome_p = partial(search_genome, state=opt)
        data = pool.map(search_genome_p, state["genomes"])
        pool.close()
        pool.join()
    else:
        data = []
        for genome in state["genomes"]:
            data.append(search_genome(genome, state))

    tree_profiles = define_tree_set(data, n_target=args.tree)
    result = find_intersection(data, missing=3)

    outfile = os.path.join(state["out"], "profiles.txt")
    with open(outfile, "w") as fout:
        for key, profiles in result.items():
            for profile in profiles:
                fout.write("{}\t{}\n".format(key, profile))
        for profile in tree_profiles:
            fout.write("{}\t{}\n".format("tree", profile))
    logging.info("wrote profiles to {}".format(outfile))
    Params.append(YearFrom)
    Params.append(MonthFrom)
    Params.append(DayFrom)
    Params.append(YearTo)
    Params.append(MonthTo)
    Params.append(DayTo)
    Params.append(CityCoordinates)
    Params.append(args.ephem)
    Params.append(stdmag)
    Params.append(CityElevation)
    paramToFunction.append(Params)

if len(paramToFunction) > 0:
    pool = Pool()
    function_output = pool.imap_unordered(calculatePasses, paramToFunction)
    pool.close()
    output = []
    errors = []
    for _ in tqdm.tqdm(function_output, total=len(paramToFunction)):
        if isinstance(_, str):
            errors.append(_)
        else:
            output.append(_)
    pool.join()
    print("--- %s seconds calculating orbits ---" % (time.time() - start_time))
    if errors:  # errors is always a list here, so a truthiness check suffices
        with open(ErrorFile, 'w') as f:
            for item in errors:
                f.write("%s\n" % item)
def main(argv=None):
    """The main entry-point to salvo."""
    if argv is None:
        argv = sys.argv[1:]

    parser = argparse.ArgumentParser(description='Provision a new salvo.')
    parser.add_argument('config', type=argparse.FileType('r'),
                        help='salvo configuration file to run')
    parser.add_argument('--playbook', '-p', type=argparse.FileType('r'),
                        default='./deploy/playbook.yml',
                        help='directory where playbooks reside')
    parser.add_argument('--wait', '-w', default=False, action='store_true',
                        help='wait for [Enter] before cleaning up')
    parser.add_argument('--deployment', '-d', type=str, default='salvo',
                        help='deployment name for this salvo')
    parser.add_argument('--set', '-s', nargs='*', type=str,
                        help='key:value pair to set for this salvo execution')
    parser.add_argument('--dry-run', '-n', action='store_true', default=False,
                        help='only print what actions would be taken')

    args = parser.parse_args(argv)
    args.set = dict(item.split(":", maxsplit=1)
                    for item in args.set) if args.set is not None else {}

    topology = Topology.load_file(args.config, args.set)
    hq = Cluster('hq', {
        'expose': [22],
    }, {})
    topology.clusters = [hq] + topology.clusters

    agenda.section("Set up network")
    client = boto3.client('ec2')
    ec2 = boto3.resource('ec2')

    # Set up VPC
    agenda.task("Create VPC")
    vpc = client.create_vpc(DryRun=args.dry_run, CidrBlock='10.0.0.0/16')
    vpc = ec2.Vpc(vpc['Vpc']['VpcId'])

    agenda.task("Attach VPC internet gateway")
    gateway = client.create_internet_gateway(DryRun=args.dry_run)
    gateway = ec2.InternetGateway(
        gateway['InternetGateway']['InternetGatewayId'])
    gateway.attach_to_vpc(DryRun=args.dry_run, VpcId=vpc.id)

    agenda.task("Create internet-enabled route table")
    iroutable = vpc.create_route_table(DryRun=args.dry_run)
    iroutable.create_route(DryRun=args.dry_run,
                           DestinationCidrBlock='0.0.0.0/0',
                           GatewayId=gateway.id)

    subnets = []
    secs = []
    for i, c in enumerate(topology.clusters):
        agenda.task("Allocate subnet #{}".format(i + 1))
        subnet = vpc.create_subnet(DryRun=args.dry_run,
                                   CidrBlock='10.0.{}.0/24'.format(i))
        if c.internet:
            agenda.subtask("Hook in internet-enabled route table")
            iroutable.associate_with_subnet(DryRun=args.dry_run,
                                            SubnetId=subnet.id)

        # set up security groups
        agenda.subtask("Create network security group")
        sec = vpc.create_security_group(
            DryRun=args.dry_run,
            GroupName='{}-cluster-{}'.format(args.deployment, i + 1),
            Description='Ingress rules for cluster {}-{}'.format(
                args.deployment, c.name))
        # allow all internal traffic
        sec.authorize_ingress(DryRun=args.dry_run, IpProtocol='tcp',
                              FromPort=1, ToPort=65535, CidrIp='10.0.0.0/16')
        if c.expose is not False:
            for p in c.expose:
                agenda.subtask("Allow ingress traffic on port {}".format(p))
                sec.authorize_ingress(DryRun=args.dry_run, IpProtocol='tcp',
                                      FromPort=p, ToPort=p,
                                      CidrIp='0.0.0.0/0')
        secs.append(sec)
        subnets.append(subnet)

    # Tag all our VPC resources
    agenda.task("Tag all VPC resources")
    ec2.create_tags(DryRun=args.dry_run,
                    Resources=[
                        vpc.id,
                        gateway.id,
                        iroutable.id,
                    ] + [sn.id for sn in subnets] + [sg.id for sg in secs],
                    Tags=[{
                        'Key': 'salvo',
                        'Value': args.deployment,
                    }])

    # Create access keys
    agenda.task("Generate VPC key pair")
    try:
        keys = client.create_key_pair(DryRun=args.dry_run,
                                      KeyName=args.deployment)
    except botocore.exceptions.ClientError:
        # Key probably already exists. Delete and re-create.
agenda.subfailure("Could not create key pair") agenda.subtask("Attempting to delete old key pair") client.delete_key_pair(DryRun=args.dry_run, KeyName=args.deployment) agenda.subtask("Attempting to generate new key pair") keys = client.create_key_pair(DryRun=args.dry_run, KeyName=args.deployment) keymat = keys['KeyMaterial'] keys = ec2.KeyPair(keys['KeyName']) agenda.section("Launch instances") # Launch instances clusters = [] for i, c in enumerate(topology.clusters): nics = [{ "DeviceIndex": 0, "Groups": [secs[i].id], "SubnetId": subnets[i].id, "DeleteOnTermination": True, "AssociatePublicIpAddress": c.internet, }] agenda.task("Launching {} instances in cluster {}".format( c.attrs['count'], c.name)) clusters.append( list( map(lambda x: ec2.Instance(x), [ instance['InstanceId'] for instance in client.run_instances( DryRun=args.dry_run, KeyName=keys.name, NetworkInterfaces=nics, ImageId=c.attrs['image'], MinCount=c.attrs['count'], MaxCount=c.attrs['count'], InstanceType=c.attrs['itype'], InstanceInitiatedShutdownBehavior='terminate') ['Instances'] ]))) exit = 1 try: agenda.task("Wait for HQ to start running") hq = clusters[0][0] while hq.state['Name'] == 'pending': agenda.subtask("Still in 'pending' state") sleep(3) hq.load() if hq.state['Name'] != 'running': agenda.failure(hq.state_reason['Message']) raise ChildProcessError(hq.state_reason['Message']) def prepare(ci, instance): global hq print("instance {} in {} now available through {}", instance.private_ip_address, topology.clusters[ci].name, hq.public_ip_address) agenda.task("Wait for workers to reach 'running' state") done = [] p = Pool(5) pending = True while pending: pending = False for i, cluster in enumerate(clusters): for ii, instance in enumerate(cluster): if instance.state['Name'] == 'pending': agenda.subtask( "Instance {}.{} is still pending".format( i + 1, ii + 1)) pending = True instance.load() break elif instance.state['Name'] != 'running': agenda.subfailure("Instance {}.{} failed: {}".format( i + 1, ii + 1, instance.state_reason['Message'])) raise ChildProcessError( instance.state_reason['Message']) else: # State is now 'running' tag = (i, ii) if tag not in done: # State hasn't been 'running' before done.append(tag) p.apply_async(prepare, [i, instance]) if pending: break sleep(3) p.close() p.join() agenda.task("Wait for HQ to become pingable") # Wait for hq to be pingable deployment = Deployer(args.playbook.name, topology, keymat, clusters) while not deployment.test(hq.public_ip_address): sleep(1) agenda.task("Wait for workers to become pingable") # Wait for workers to be pingable for i, cluster in enumerate(clusters): for ii, instance in enumerate(cluster): while not deployment.test(instance.private_ip_address): sleep(1) # Deploy! 
agenda.section("Deploy application") exit = deployment.deploy() except: import traceback traceback.print_exc() finally: agenda.section("Clean up VPC") if args.wait: agenda.prompt("Press [Enter] when you are ready to clean") input() # Terminate instances and delete VPC resources agenda.task("Terminate all instances") instances = list(vpc.instances.all()) vpc.instances.terminate(DryRun=args.dry_run) still_running = True while still_running: still_running = False for i in instances: i.load() if i.state['Name'] != 'terminated': agenda.subtask("At least one instance still shutting down") still_running = True sleep(3) break agenda.task("Delete network resources") agenda.subtask("key pair") keys.delete(DryRun=args.dry_run) agenda.subtask("internet-enabled route associations") for r in iroutable.associations.all(): r.delete(DryRun=args.dry_run) agenda.subtask("internet-enabled route table") iroutable.delete(DryRun=args.dry_run) agenda.subtask("internet gateway") gateway.detach_from_vpc(DryRun=args.dry_run, VpcId=vpc.id) gateway.delete(DryRun=args.dry_run) agenda.subtask("subnets") try: for sn in subnets: sn.delete(DryRun=args.dry_run) except: agenda.subfailure("failed to delete subnet:") import traceback traceback.print_exc() agenda.subtask("security groups") for sg in secs: sg.delete() agenda.subtask("network interfaces") for i in vpc.network_interfaces.all(): i.delete(DryRun=args.dry_run) agenda.task("Delete the VPC") vpc.delete(DryRun=args.dry_run) return exit
from IGP_scene_prediction import navigation, data_clean, overall_plot
from multiprocessing import Pool
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import math

if __name__ == '__main__':
    agent, observation, time_span = data_clean(300, 540, 12)
    # result, mod = navigation(observation, time_span, 96, 0.01, 100)
    # overall_plot(agent, result)
    p = Pool(4)
    arguments = [(observation, time_span, 12, 0.01, 100),
                 (observation, time_span, 24, 0.01, 100),
                 (observation, time_span, 48, 0.01, 100),
                 (observation, time_span, 96, 0.01, 100)]
    result = p.starmap(navigation, arguments)
    p.close()
    p.join()
def runADMM(G1, sizeOptVar, sizeData, lamb, rho, numiters, x, u, z, a,
            edgeWeights, useConvex, epsilon, mu):
    # print("a:", a)
    nodes = G1.GetNodes()
    edges = G1.GetEdges()
    maxNonConvexIters = 6 * numiters

    # Find max degree of graph; hash the nodes
    (maxdeg, counter) = (0, 0)
    node2mat = TIntIntH()
    for NI in G1.Nodes():
        maxdeg = np.maximum(maxdeg, NI.GetDeg())
        node2mat.AddDat(NI.GetId(), counter)
        counter = counter + 1

    # Stopping criteria
    eabs = math.pow(10, -2)
    erel = math.pow(10, -3)
    (r, s, epri, edual, counter) = (1, 1, 0, 0, 0)
    A = np.zeros((2 * edges, nodes))
    for EI in G1.Edges():
        A[2 * counter, node2mat.GetDat(EI.GetSrcNId())] = 1
        A[2 * counter + 1, node2mat.GetDat(EI.GetDstNId())] = 1
        counter = counter + 1
    (sqn, sqp) = (math.sqrt(nodes * sizeOptVar),
                  math.sqrt(2 * sizeOptVar * edges))

    # Non-convex case - keeping track of best point so far
    bestx = x
    bestu = u
    bestz = z
    bestObj = 0
    cvxObj = 10000000 * np.ones((1, nodes))
    if useConvex != 1:
        # Calculate objective
        for i in range(G1.GetNodes()):
            bestObj = bestObj + cvxObj[0, i]
        for EI in G1.Edges():
            weight = edgeWeights.GetDat(TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
            edgeDiff = LA.norm(x[:, node2mat.GetDat(EI.GetSrcNId())] -
                               x[:, node2mat.GetDat(EI.GetDstNId())])
            bestObj = bestObj + lamb * weight * \
                math.log(1 + edgeDiff / epsilon)
        initObj = bestObj

    # Run ADMM
    iters = 0
    maxProcesses = 80
    pool = Pool(processes=np.minimum(np.maximum(nodes, edges), maxProcesses))
    # while(iters < numiters and (r > epri or s > edual or iters < 1)):
    dt = []
    obj = []
    tevolution = []
    while iters < numiters:
        sys.stdout.write('\r' + 'network_lasso_cvx_status:' +
                         str(int(100 * iters / numiters)) + '%')
        # print("iters:", iters)

        # x-update
        neighs = np.zeros(((2 * sizeOptVar + 1) * maxdeg, nodes))
        edgenum = 0
        numSoFar = TIntIntH()
        t0 = time.time()
        for EI in G1.Edges():
            if not numSoFar.IsKey(EI.GetSrcNId()):
                numSoFar.AddDat(EI.GetSrcNId(), 0)
            counter = node2mat.GetDat(EI.GetSrcNId())
            counter2 = numSoFar.GetDat(EI.GetSrcNId())
            neighs[counter2 * (2 * sizeOptVar + 1), counter] = \
                edgeWeights.GetDat(TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
            neighs[counter2 * (2 * sizeOptVar + 1) + 1:
                   counter2 * (2 * sizeOptVar + 1) + (sizeOptVar + 1),
                   counter] = u[:, 2 * edgenum]
            neighs[counter2 * (2 * sizeOptVar + 1) + (sizeOptVar + 1):
                   (counter2 + 1) * (2 * sizeOptVar + 1),
                   counter] = z[:, 2 * edgenum]
            numSoFar.AddDat(EI.GetSrcNId(), counter2 + 1)

            if not numSoFar.IsKey(EI.GetDstNId()):
                numSoFar.AddDat(EI.GetDstNId(), 0)
            counter = node2mat.GetDat(EI.GetDstNId())
            counter2 = numSoFar.GetDat(EI.GetDstNId())
            neighs[counter2 * (2 * sizeOptVar + 1), counter] = \
                edgeWeights.GetDat(TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
            neighs[counter2 * (2 * sizeOptVar + 1) + 1:
                   counter2 * (2 * sizeOptVar + 1) + (sizeOptVar + 1),
                   counter] = u[:, 2 * edgenum + 1]
            neighs[counter2 * (2 * sizeOptVar + 1) + (sizeOptVar + 1):
                   (counter2 + 1) * (2 * sizeOptVar + 1),
                   counter] = z[:, 2 * edgenum + 1]
            numSoFar.AddDat(EI.GetDstNId(), counter2 + 1)

            edgenum = edgenum + 1

        temp = np.concatenate((x, a, neighs,
                               np.tile([mu, sizeData, rho, lamb, sizeOptVar],
                                       (nodes, 1)).transpose()), axis=0)
        values = pool.map(solveX, temp.transpose())
        newx = np.array(values)[:, 0].tolist()
        newcvxObj = np.array(values)[:, 1].tolist()
        # print("newcvxObj:", newcvxObj)
        # x = np.array(newx).transpose()[0]
        # print("newx:", newx)
        x = np.array(newx).transpose()
        # print("Size of x:", x.shape, "x:", x)
        # cvxObj = np.reshape(np.array(newcvxObj), (-1, nodes))
        # print("cvxObj:", cvxObj)

        # z-update
        ztemp = z.reshape(2 * sizeOptVar, edges, order='F')
        utemp = u.reshape(2 * sizeOptVar, edges, order='F')
        xtemp = np.zeros((sizeOptVar, 2 * edges))
        counter = 0
        weightsList = np.zeros((1, edges))
        for EI in G1.Edges():
            xtemp[:, 2 * counter] = np.array(
                x[:, node2mat.GetDat(EI.GetSrcNId())])
            xtemp[:, 2 * counter + 1] = x[:, node2mat.GetDat(EI.GetDstNId())]
            weightsList[0, counter] = edgeWeights.GetDat(
                TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
            counter = counter + 1
        xtemp = xtemp.reshape(2 * sizeOptVar, edges, order='F')
        temp = np.concatenate((xtemp, utemp, ztemp,
                               np.reshape(weightsList, (-1, edges)),
                               np.tile([epsilon, useConvex, rho, lamb,
                                        sizeOptVar],
                                       (edges, 1)).transpose()), axis=0)
        newz = pool.map(solveZ, temp.transpose())
        ztemp = np.array(newz).transpose()[0]
        ztemp = ztemp.reshape(sizeOptVar, 2 * edges, order='F')
        # For dual residual
        s = LA.norm(rho * np.dot(A.transpose(), (ztemp - z).transpose()))
        z = ztemp

        # u-update
        (xtemp, counter) = (np.zeros((sizeOptVar, 2 * edges)), 0)
        for EI in G1.Edges():
            xtemp[:, 2 * counter] = np.array(
                x[:, node2mat.GetDat(EI.GetSrcNId())])
            xtemp[:, 2 * counter + 1] = x[:, node2mat.GetDat(EI.GetDstNId())]
            counter = counter + 1
        temp = np.concatenate((u, xtemp, z, np.tile(rho, (1, 2 * edges))),
                              axis=0)
        newu = pool.map(solveU, temp.transpose())
        u = np.array(newu).transpose()

        # Update best objective (for non-convex)
        if useConvex != 1:
            tempObj = 0
            # Calculate objective
            for i in range(G1.GetNodes()):
                tempObj = tempObj + cvxObj[0, i]
            initTemp = tempObj
            for EI in G1.Edges():
                weight = edgeWeights.GetDat(
                    TIntPr(EI.GetSrcNId(), EI.GetDstNId()))
                edgeDiff = LA.norm(x[:, node2mat.GetDat(EI.GetSrcNId())] -
                                   x[:, node2mat.GetDat(EI.GetDstNId())])
                tempObj = tempObj + lamb * weight * \
                    math.log(1 + edgeDiff / epsilon)
            # Update best variables
            if tempObj <= bestObj:
                bestx = x
                bestu = u
                bestz = z
                bestObj = tempObj
                print("Iteration ", iters, "; Obj = ", tempObj,
                      "; Initial = ", initTemp)
            if iters == numiters - 1 and numiters < maxNonConvexIters:
                if bestObj == initObj:
                    numiters = numiters + 1

        # Stopping criterion - p19 of ADMM paper
        epri = sqp * eabs + erel * \
            np.maximum(LA.norm(np.dot(A, x.transpose()), 'fro'),
                       LA.norm(z, 'fro'))
        edual = sqn * eabs + erel * \
            LA.norm(np.dot(A.transpose(), u.transpose()), 'fro')
        r = LA.norm(np.dot(A, x.transpose()) - z.transpose(), 'fro')
        s = s
        # print r, epri, s, edual

        t1 = time.time() - t0
        dt.append(t1)
        # objtemp = (LA.norm(x-a))**2 + LA.norm(x)
        objtemp = (LA.norm(x - a))**2
        for edge in G1.Edges():
            node1 = edge.GetSrcNId()
            node2 = edge.GetDstNId()
            objtemp = objtemp + lamb * LA.norm(x[:, node1] - x[:, node2])
        obj.append(objtemp)
        iters = iters + 1

    pool.close()
    pool.join()

    objerror = []
    temp = 0
    for k in range(numiters):
        temp = temp + dt[k]
        tevolution.append(temp)
    for k in range(numiters):
        objerror.append(np.absolute(obj[k] - obj[-1]))

    return x, tevolution, obj, objerror
def pmap(f, col):
    pool = Pool(5)
    result = pool.map(f, col)
    pool.close()
    return result
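# Usage sketch for the pmap helper above, assuming the mapped function is
# picklable (defined at module level); the hypothetical _double is for
# illustration only.
def _double(x):
    return 2 * x

if __name__ == '__main__':
    print(pmap(_double, [1, 2, 3]))  # -> [2, 4, 6]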
""" LoadModel(ifhdnn=False) print("New ESOINN model has %d clusters" % GPARAMS.Esoinn_setting.Model.class_id) os.system("cp *.ESOINN Sfactor.in ./networks") Dataer_Process = Process(target=dataer, args=(DataQueue, )) Dataer_Process.start() TrainerPool = Pool(len(GPARAMS.Compute_setting.Gpulist)) Resultlist = [] for i in range( max(GPARAMS.Esoinn_setting.Model.class_id, GPARAMS.Train_setting.Modelnumperpoint)): print("Create HDNN subnet for class %d" % i) result = TrainerPool.apply_async(trainer, (DataQueue, GPUQueue)) Resultlist.append(result) TrainerPool.close() for i in range( max(GPARAMS.Esoinn_setting.Model.class_id, GPARAMS.Train_setting.Modelnumperpoint)): tmp = Resultlist[i].get() print(tmp) TrainerPool.terminate() TrainerPool.join() Dataer_Process.join() """ if os.path.exists(GPARAMS.Compute_setting.Traininglevel): os.system("mkdir %s/Stage%d"%(GPARAMS.Compute_setting.Traininglevel,GPARAMS.Train_setting.Trainstage)) os.system("mv %s/*.record %s/Stage%d"%(GPARAMS.Compute_setting.Traininglevel,\ GPARAMS.Compute_setting.Traininglevel,\ GPARAMS.Train_setting.Trainstage)) for i in range(len(GPARAMS.System_setting)):
def main():
    global src_port_ids_global
    global dst_port_ids_global
    global port_map_global
    global port_reverse_map_global
    global ntf_global
    global ttf_global
    global DATABASE_FILE

    parser = ArgumentParser(description="Generate Test Packets for Internet2")
    parser.add_argument("-p", dest="percentage", type=int, default="100",
                        help="Percentage of test terminals")
    parser.add_argument("-f", dest="filename", default="internet2.sqlite",
                        help="Filename of the database")
    parser.add_argument("-e", action="store_true", default=False,
                        help="Edge port only")
    args = parser.parse_args()

    DATABASE_FILE = "work/%s" % args.filename

    cs = juniperRouter(1)
    output_port_addition = cs.PORT_TYPE_MULTIPLIER * cs.OUTPUT_PORT_TYPE_CONST

    # Load .tf files
    ntf_global = load_internet2_backbone_ntf()
    ttf_global = load_internet2_backbone_ttf()
    (port_map_global, port_reverse_map_global) = \
        load_internet2_backbone_port_to_id_map()

    # Initialize the database
    if os.access(DATABASE_FILE, os.F_OK):
        os.remove(DATABASE_FILE)
    conn = sqlite3.connect(DATABASE_FILE)
    conn.execute('CREATE TABLE %s (rule TEXT, input_port TEXT, output_port TEXT, action TEXT, file TEXT, line TEXT)' % TABLE_NETWORK_RULES)
    conn.execute('CREATE TABLE %s (rule TEXT, input_port TEXT, output_port TEXT)' % TABLE_TOPOLOGY_RULES)
    conn.execute('CREATE TABLE %s (header TEXT, input_port INTEGER, output_port INTEGER, ports TEXT, no_of_ports INTEGER, rules TEXT, no_of_rules INTEGER)' % TABLE_TEST_PACKETS)
    conn.execute('CREATE TABLE %s (header TEXT, input_port INTEGER, output_port INTEGER, ports TEXT, no_of_ports INTEGER, rules TEXT, no_of_rules INTEGER)' % TABLE_TEST_PACKETS_LOCALLY_COMPRESSED)
    conn.execute('CREATE TABLE %s (rules TEXT, no_of_rules INTEGER)' % TABLE_TEST_PACKETS_GLOBALLY_COMPRESSED)
    conn.execute('CREATE TABLE %s (rule TEXT)' % TABLE_RESULT_RULES)

    rule_count = 0
    for tf in ntf_global.tf_list:
        rule_count += len(tf.rules)
        for rule in tf.rules:
            # print '-------------'
            # print tf.rules[4]
            # exit(1)
            query = "INSERT INTO %s VALUES (?, ?, ?, ?, ?, ?)" % TABLE_NETWORK_RULES
            conn.execute(query, (rule['id'],
                                 ' '.join(map(str, rule['in_ports'])),
                                 ' '.join(map(str, rule['out_ports'])),
                                 rule['action'], rule["file"],
                                 ' '.join(map(str, rule["line"]))))
    print "Total Rules: %d" % rule_count
    conn.commit()

    rule_count = len(ttf_global.rules)
    for rule in ttf_global.rules:
        query = "INSERT INTO %s VALUES (?, ?, ?)" % TABLE_TOPOLOGY_RULES
        conn.execute(query, (rule['id'],
                             ' '.join(map(str, rule['in_ports'])),
                             ' '.join(map(str, rule['out_ports']))))
    print "Total Links: %d" % rule_count

    # Generate all ports
    for rtr in port_map_global.keys():
        src_port_ids_global |= set(port_map_global[rtr].values())
    # debug leftovers, disabled: an active exit(1) here would make the
    # rest of main() unreachable
    # print '-------------'
    # print port_map_global
    # exit(1)
    total_length = len(src_port_ids_global)
    if args.e == True:
        for rule in ttf_global.rules:
            if rule['out_ports'][0] in src_port_ids_global:
                src_port_ids_global.remove(rule['out_ports'][0])
    new_length = len(src_port_ids_global) * args.percentage / 100
    src_port_ids_global = random.sample(src_port_ids_global, new_length)
    print "Total Length: %d" % total_length
    print "New Length: %d" % new_length

    for port in src_port_ids_global:
        port += output_port_addition
        dst_port_ids_global.add(port)

    #src_port_ids_global = [300013]
    #dst_port_ids_global = [320010]

    conn.commit()
    conn.close()

    # Run reachability
    start_time = time.time()
    pool = Pool()
    result = pool.map_async(find_test_packets, src_port_ids_global)

    # Close
    pool.close()
    pool.join()
    end_time = time.time()

    test_packet_count = result.get()
    total_paths = sum(test_packet_count)

    print "========== Before Compression ========="
    print "Total Paths = %d" % total_paths
    print "Average packets per port = %f" % (float(total_paths) /
                                             len(src_port_ids_global))
    print "Total Time = %fs" % (end_time - start_time)

    # Global compression
    start_time = time.time()
    conn = sqlite3.connect(DATABASE_FILE, 6000)
    result_rule_lists = []
    query = "SELECT rules FROM %s" % TABLE_TEST_PACKETS_LOCALLY_COMPRESSED
    rows = conn.execute(query)
    for row in rows:
        result_rule_lists.append(row[0].split())
    conn.close()

    chunk_size = 80000
    while True:
        print "Start a new round!"
        conn = sqlite3.connect(DATABASE_FILE, 6000)
        conn.execute('DROP TABLE IF EXISTS %s' % TABLE_SCRATCHPAD)
        conn.execute('CREATE TABLE %s (rules TEXT, no_of_rules INTEGER)' % TABLE_SCRATCHPAD)
        conn.commit()
        conn.close()
        start_len = len(result_rule_lists)
        print start_len
        pool = Pool()
        no_of_chunks = len(result_rule_lists) / chunk_size + 1
        rule_list_chunks = chunks(result_rule_lists, no_of_chunks)
        result = pool.map_async(rule_lists_compress, rule_list_chunks)

        # Close
        pool.close()
        pool.join()
        result.get()
        print "End of this round."

        result_rule_lists = read_rule_lists_from_database(TABLE_SCRATCHPAD)
        end_len = len(result_rule_lists)
        if float(end_len) / float(start_len) > 0.99:
            break
    end_time = time.time()

    query = "INSERT INTO %s VALUES (?, ?)" % TABLE_TEST_PACKETS_GLOBALLY_COMPRESSED
    query2 = "INSERT INTO %s VALUES (?)" % TABLE_RESULT_RULES
    total_paths = len(result_rule_lists)
    total_length = 0
    conn = sqlite3.connect(DATABASE_FILE, 6000)
    conn.execute('DROP TABLE IF EXISTS %s' % TABLE_TEST_PACKETS_GLOBALLY_COMPRESSED)
    conn.execute('CREATE TABLE %s (rules TEXT, no_of_rules INTEGER)' % TABLE_TEST_PACKETS_GLOBALLY_COMPRESSED)
    for rule_list in result_rule_lists:
        total_length += len(rule_list)
        conn.execute(query, (" ".join(rule_list), len(rule_list)))
        for rule in rule_list:
            conn.execute(query2, (rule,))
    conn.commit()
    conn.close()

    print "========== After Compression ========="
    print "Total Paths = %d" % total_paths
    print "Average packets per port = %f" % (float(total_paths) /
                                             len(src_port_ids_global))
    print "Average length of rule list = %f" % (float(total_length) /
                                                total_paths)
    print "Total Time = %fs" % (end_time - start_time)
def map_model_reactions(model1, model2, cpd_pred, nproc=1, outpath='.',
                        log=False, gene=False, compartment_map={}):
    """Map reactions of two models."""
    # Mapping of reactions
    reaction_pairs = len(model1.reactions) * len(model2.reactions)

    # Reaction prior
    # For the prior, use a guesstimate that 95%
    # of the smaller model can be mapped.
    reaction_prior = (0.95 * min(len(model1.reactions),
                                 len(model2.reactions))) / reaction_pairs

    # Initialize parallel pool of workers
    chunksize = reaction_pairs // nproc
    pool = Pool(nproc)

    # Reaction ID
    # Marginal probability of observing two reactions with the same ids.
    tasks = ((util.id_equals, (r1.id, r2.id)) for r1, r2 in product(
        itervalues(model1.reactions), itervalues(model2.reactions)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    reaction_id_equal_marg = sum(result) / float(reaction_pairs)
    # Marginal probability of observing two reactions with different ids.
    reaction_id_not_equal_marg = 1.0 - reaction_id_equal_marg

    print('Calculating reaction ID likelihoods...')
    sys.stdout.flush()
    reaction_id_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.reactions, model2.reactions,
        reaction_id_likelihood,
        (reaction_prior, reaction_id_equal_marg, reaction_id_not_equal_marg))

    # Reaction name
    # Marginal probability of observing two reactions with the same name.
    tasks = ((util.name_equals, (r1.name, r2.name)) for r1, r2 in product(
        itervalues(model1.reactions), itervalues(model2.reactions)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    reaction_name_equal_marg = sum(result) / float(reaction_pairs)

    print('Calculating reaction name likelihoods...')
    sys.stdout.flush()
    reaction_name_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.reactions, model2.reactions,
        reaction_name_likelihood,
        (reaction_prior, reaction_name_equal_marg))

    # Reaction equation
    print('Calculating reaction equation likelihoods...')
    sys.stdout.flush()
    reaction_equation_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.reactions, model2.reactions,
        reaction_equation_compound_mapping_likelihood,
        (cpd_pred, compartment_map))

    # Reaction genes
    # For each gene, the marginal probability of observing that gene
    # in each model. We use this as an approximation of the probability of
    # observing a pair of genes in two reactions given that the reactions
    # do _not_ match.
    if gene:
        print('Calculating reaction genes likelihoods...')
        sys.stdout.flush()
        reaction_genes_likelihoods = pairwise_likelihood(
            pool, chunksize, model1.reactions, model2.reactions,
            reaction_genes_likelihood, ())
    else:
        reaction_genes_likelihoods = pairwise_likelihood(
            pool, chunksize, model1.reactions, model2.reactions,
            fake_likelihood, ())

    pool.close()
    pool.join()

    if log:
        merge_result = pd.merge(reaction_id_likelihoods,
                                reaction_name_likelihoods,
                                left_index=True, right_index=True,
                                suffixes=('_id', '_name'))
        merge_result = pd.merge(merge_result, reaction_equation_likelihoods,
                                left_index=True, right_index=True,
                                suffixes=('_name', '_equation'))
        merge_result = pd.merge(merge_result, reaction_genes_likelihoods,
                                left_index=True, right_index=True,
                                suffixes=('_equation', '_genes'))
        merge_result.to_csv(outpath + '/reaction_log.tsv', sep='\t')

    all_likelihoods = [
        reaction_id_likelihoods,
        reaction_name_likelihoods,
        reaction_equation_likelihoods,
        reaction_genes_likelihoods
    ]

    return (bayes_posterior(reaction_prior,
                            likelihood_products(all_likelihoods)),
            bayes_posterior(reaction_prior, reaction_id_likelihoods),
            bayes_posterior(reaction_prior, reaction_name_likelihoods),
            bayes_posterior(reaction_prior, reaction_equation_likelihoods),
            bayes_posterior(reaction_prior, reaction_genes_likelihoods))
def readMSMSData(self, msFile, targets, tic_cutoff, frag_cutoff):
    # make DecoID object
    # write temporary peak file
    targets.to_csv(self.uid + ".csv", index=False)

    # read in file and save all spectra
    self.decID.readData(msFile, self.ms2_resolution, True, True, self.ppm,
                        peakDefinitions=self.uid + ".csv",
                        tic_cutoff=tic_cutoff, frag_cutoff=frag_cutoff)

    # structure to hold spectra
    output_dict = {}
    polarity = 0

    # process spectra for each CE
    if len(self.decID.samples) > 0:
        # get charge
        polarity = self.decID.samples[0]["mode"]
        switcher = {"Positive": 1, "Negative": -1}
        polarity = switcher[polarity]

        self.decID.samples = [
            x for x in self.decID.samples
            if targets.at[targets.index.values[x["group"]], "Charge"] == polarity]
        samplesAll = deepcopy(self.decID.samples)

        # get unique CEs
        ces = list(set([x["CE"] for x in samplesAll]))
        ces.sort()

        # iterate over CEs
        ceList = []
        args = []
        gs = []
        for ce in ces:
            # parse relevant samples
            self.decID.samples = [x for x in samplesAll if x["CE"] == ce]
            self.decID.label = str(ce)
            groups = list(set([x["group"] for x in self.decID.samples]))
            for g in groups:
                specs = [x["spectra"] for x in self.decID.samples
                         if x["group"] == g]
                args.append([specs, self.ms2_resolution])
                ceList.append(ce)
                gs.append(g)

        if len(args) > 0:
            p = Pool(min([self.numCores, len(args)]))
            results = p.starmap(sumSpectra, args,
                                chunksize=int(len(args) /
                                              min([self.numCores, len(args)])))
            p.close()
            p.join()

            spectra = {(g, ce): spec
                       for g, spec, ce in zip(gs, results, ceList)}

            names = []
            ceList = []
            args = []
            for (ind, ce), spectrum in spectra.items():
                row = targets.iloc[ind, :]
                rts = [x["rt"] for x in self.decID.samples
                       if x["group"] == ind]
                args.append([spectrum, rts, self.decID.ms1, row["mz"],
                             row["rt_start"], row["rt_end"], self.ppm])
                names.append(row["Name"])
                ceList.append(ce)

            p = Pool(min([self.numCores, len(args)]))
            results = p.starmap(normalizeSpectrum, args,
                                chunksize=int(len(args) /
                                              min([self.numCores, len(args)])))
            p.close()
            p.join()

            for name, ce, spectrum in zip(names, ceList, results):
                if name not in output_dict:
                    output_dict[name] = {}
                output_dict[name][ce] = spectrum

    os.remove(self.uid + ".csv")

    return output_dict, polarity
# data = random_class_selection(data)
Y = data["Y"]
X = data["X"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X.values.astype('U')).toarray()
x_train = X
y_train = Y
# kmeans = KMeans(n_clusters=4, n_jobs=-1).fit(x_train)
# print('Score:', metrics.adjusted_rand_score(y_train, kmeans.predict(x_train)))
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(x_train.T, 4, 12,
                                                 error=0.00001,
                                                 maxiter=100000, init=None)
print('Score:', metrics.adjusted_rand_score(y_train, np.argmax(u, axis=0)))

# assign each document to its most likely cluster
docinc = [[] for x in range(4)]
for k in range(x_train.shape[0]):
    # For KMeans clustering, use instead:
    # docinc[kmeans.predict(x_train[k:k + 1])[0]].append(k)
    docinc[np.argmax(u[:, k])].append(k)

clus = [0, 1, 2, 3, 4, 5, 6, 7]
mpool = Pool(4)
mapped_data = mpool.map(generate_cf_matrix, clus)
mpool.close()
from multiprocessing import Process, Pool
import time


def func(num):
    num += 1
    # print(num)


if __name__ == '__main__':
    p = Pool(5)
    start = time.time()
    p.map(func, [i for i in range(1000)])
    p.close()  # no more tasks may be submitted to the pool
    p.join()   # wait for every task in the pool to finish
    print(time.time() - start)

    p_l = []
    start = time.time()
    for i in range(1000):
        p = Process(target=func, args=(i,))
        p.start()
        p_l.append(p)
    [i.join() for i in p_l]
    print(time.time() - start)
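# --- Added note (assumed, not from the original benchmark) ---
# The timing gap above comes from process startup cost: Pool(5) forks five
# workers once and reuses them, while the Process loop pays fork/teardown
# 1000 times. The context-manager form below is equivalent to the Pool half
# of the benchmark; a larger chunksize further cuts IPC overhead for tiny
# tasks.
from multiprocessing import Pool
import time


def inc(num):
    return num + 1


if __name__ == '__main__':
    start = time.time()
    with Pool(5) as pool:  # __exit__ terminates the pool; map has returned
        pool.map(inc, range(1000), chunksize=100)
    print(time.time() - start)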
def extract_from_shp(lyr, df_xy, id_field=None, n_jobs=0):
    ''' Extract the id of the polygon that each point in df_xy is within'''
    '''ds = ogr.Open(shp)
    lyr = ds.GetLayer()'''
    id_field_given = True
    if not id_field:
        id_field = 'name'
        id_field_given = False

    if n_jobs:
        t1 = time.time()
        args = []
        n_features = lyr.GetFeatureCount()
        for i in xrange(n_features):
            feature = lyr.GetFeature(i)
            geometry = feature.GetGeometryRef()
            geom_coords = stem_conus.get_coords_from_geometry(
                geometry, multipart='split')
            # Initially select only samples that fit within the bounds of
            # this feature
            min_x, max_x, min_y, max_y = geometry.GetEnvelope()
            xy_temp = df_xy[(df_xy.x >= min_x) & (df_xy.x < max_x) &
                            (df_xy.y > min_y) & (df_xy.y <= max_y)]
            #if id_field_given:
            #    feature_id = feature.GetField(id_field)
            #else:
            feature_id = feature.GetFID()
            args.append(
                [geom_coords, xy_temp, id_field, feature_id, i, n_features])
            feature.Destroy()
            sys.stdout.write(
                '\rInitial filter of points for (%%%.1f) of features'
                % (float(i) / n_features * 100))
            sys.stdout.flush()
        print '\nTime for getting args: %.1f seconds\n' % (time.time() - t1)

        # Predict in parallel
        t1 = time.time()
        pool = Pool(n_jobs)
        points = pool.map(par_within, args, 1)
        pool.close()
        pool.join()
        print '\nTime for extraction: %.1f minutes\n' % (
            (time.time() - t1) / 60)

        t1 = time.time()
        for i, p in points:
            df_xy.ix[p, 'tile_fid'] = i
        print 'Time for adding to df: %.1f seconds\n' % ((time.time() - t1))

    else:
        for i in xrange(lyr.GetFeatureCount()):
            feature = lyr.GetFeature(i)
            geometry = feature.GetGeometryRef()
            #geom_coords = stem_conus.get_coords_from_geometry(geometry)
            min_x, max_x, min_y, max_y = geometry.GetEnvelope()
            # Initially select only samples that fit within the bounds of
            # this feature
            xy_temp = df_xy[(df_xy.x >= min_x) & (df_xy.x < max_x) &
                            (df_xy.y > min_y) & (df_xy.y <= max_y)]
            # With square tiles, this is not necessary
            #points = [i for i, (x, y) in xy_temp[['x','y']].iterrows() if within(x, y, geometry)]
            if id_field_given:
                feature_id = feature.GetField(id_field)
            else:
                feature_id = feature.GetFID()
            df_xy.ix[xy_temp.index, 'tile_fid'] = feature_id
            feature.Destroy()

    return df_xy
def map_model_compounds(model1, model2, nproc=1, outpath='.', log=False,
                        kegg=False):
    """Map compounds of two models."""
    compound_pairs = len(model1.compounds) * len(model2.compounds)

    # Compound prior
    # For the prior, use a guesstimate that 95% of the
    # smaller model can be mapped.
    compound_prior = (0.95 * min(len(model1.compounds),
                                 len(model2.compounds))) / compound_pairs

    # Initialize parallel pool of workers
    chunksize = compound_pairs // nproc
    pool = Pool(nproc)

    # Compound ID
    # Marginal probability of observing two equal compound IDs
    tasks = ((util.id_equals, (c1.id, c2.id)) for c1, c2 in product(
        itervalues(model1.compounds), itervalues(model2.compounds)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    compound_id_marg = sum(result) / float(compound_pairs)

    print('Calculating compound ID likelihoods...')
    sys.stdout.flush()
    compound_id_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.compounds, model2.compounds,
        compound_id_likelihood, (compound_prior, compound_id_marg))

    # Compound name
    # Marginal probability of observing two similar names
    tasks = ((util.name_equals, (c1.name, c2.name)) for c1, c2 in product(
        itervalues(model1.compounds), itervalues(model2.compounds)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    compound_name_marg = sum(result) / float(compound_pairs)

    print('Calculating compound name likelihoods...')
    sys.stdout.flush()
    compound_name_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.compounds, model2.compounds,
        compound_name_likelihood, (compound_prior, compound_name_marg))

    # Compound charge
    # Marginal probability of observing two compounds with the same charge.
    # Divide by float(compound_pairs), as above, to avoid integer division.
    compound_charge_equal_marg = sum(
        c1.charge is not None and c2.charge is not None and
        c1.charge == c2.charge
        for c1, c2 in product(itervalues(model1.compounds),
                              itervalues(model2.compounds))) / float(
                                  compound_pairs)
    # Marginal probability of observing two compounds with different charge
    compound_charge_not_equal_marg = sum(
        c1.charge is not None and c2.charge is not None and
        c1.charge != c2.charge
        for c1, c2 in product(itervalues(model1.compounds),
                              itervalues(model2.compounds))) / float(
                                  compound_pairs)

    print('Calculating compound charge likelihoods...')
    sys.stdout.flush()
    compound_charge_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.compounds, model2.compounds,
        compound_charge_likelihood,
        (compound_prior, compound_charge_equal_marg,
         compound_charge_not_equal_marg))

    # Compound formula
    # Marginal probability of observing two compounds with the same formula
    tasks = ((util.formula_equals, (c1.formula, c2.formula, c1.charge,
                                    c2.charge))
             for c1, c2 in product(itervalues(model1.compounds),
                                   itervalues(model2.compounds)))
    result = pool.map(parallel_equel, tasks, chunksize=chunksize)
    compound_formula_equal_marg = sum(result) / float(compound_pairs)
    # Marginal probability of observing two compounds with different formula
    compound_formula_not_equal_marg = 1.0 - compound_formula_equal_marg - (
        sum(c1.formula is None or c2.formula is None
            for c1, c2 in product(itervalues(model1.compounds),
                                  itervalues(model2.compounds))) / float(
                                      compound_pairs))

    print('Calculating compound formula likelihoods...')
    sys.stdout.flush()
    compound_formula_likelihoods = pairwise_likelihood(
        pool, chunksize, model1.compounds, model2.compounds,
        compound_formula_likelihood,
        (compound_prior, compound_formula_equal_marg,
         compound_formula_not_equal_marg))

    # Compound KEGG id
    if kegg:  # run KEGG id mapping
        # Marginal probability of observing two compounds
        # where KEGG ids are equal
        compound_kegg_equal_marg = sum(
            c1.kegg is not None and c2.kegg is not None and
            c1.kegg == c2.kegg
            for c1, c2 in product(itervalues(model1.compounds),
                                  itervalues(model2.compounds))) / float(
                                      compound_pairs)
        # Marginal probability of observing two compounds
        # where KEGG ids are different
        compound_kegg_not_equal_marg = sum(
            c1.kegg is not None and c2.kegg is not None and
            c1.kegg != c2.kegg
            for c1, c2 in product(itervalues(model1.compounds),
                                  itervalues(model2.compounds))) / float(
                                      compound_pairs)

        print('Calculating compound KEGG ID likelihoods...')
        sys.stdout.flush()
        compound_kegg_likelihoods = pairwise_likelihood(
            pool, chunksize, model1.compounds, model2.compounds,
            compound_kegg_likelihood,
            (compound_prior, compound_kegg_equal_marg,
             compound_kegg_not_equal_marg))
    else:  # run fake mapping
        compound_kegg_likelihoods = pairwise_likelihood(
            pool, chunksize, model1.compounds, model2.compounds,
            fake_likelihood, ())

    pool.close()
    pool.join()

    if log:
        merge_result = pd.merge(compound_id_likelihoods,
                                compound_name_likelihoods,
                                left_index=True, right_index=True,
                                suffixes=('_id', '_name'))
        merge_result = pd.merge(merge_result, compound_charge_likelihoods,
                                left_index=True, right_index=True,
                                suffixes=('_name', '_charge'))
        merge_result = pd.merge(merge_result, compound_formula_likelihoods,
                                left_index=True, right_index=True,
                                suffixes=('_charge', '_formula'))
        merge_result = pd.merge(merge_result, compound_kegg_likelihoods,
                                left_index=True, right_index=True,
                                suffixes=('_formula', '_kegg'))
        merge_result.to_csv(outpath + '/compound_log.tsv', sep='\t')

    all_likelihoods = [
        compound_id_likelihoods, compound_name_likelihoods,
        compound_charge_likelihoods, compound_formula_likelihoods,
        compound_kegg_likelihoods
    ]
    return (bayes_posterior(compound_prior,
                            likelihood_products(all_likelihoods)),
            bayes_posterior(compound_prior, compound_id_likelihoods),
            bayes_posterior(compound_prior, compound_name_likelihoods),
            bayes_posterior(compound_prior, compound_charge_likelihoods),
            bayes_posterior(compound_prior, compound_formula_likelihoods),
            bayes_posterior(compound_prior, compound_kegg_likelihoods))
def aovhm_periodfind( times, mags, errs, nharmonics=6, magsarefluxes=False, autofreq=True, startp=None, endp=None, normalize=True, stepsize=1.0e-4, nbestpeaks=5, periodepsilon=0.1, # 0.1 sigclip=10.0, nworkers=None, verbose=True): '''This runs a parallel AoV period search. NOTE: normalize = True here as recommended by Schwarzenberg-Czerny 1996, i.e. mags will be normalized to zero and rescaled so their variance = 1.0 ''' # get rid of nans first and sigclip stimes, smags, serrs = sigclip_magseries(times, mags, errs, magsarefluxes=magsarefluxes, sigclip=sigclip) # make sure there are enough points to calculate a spectrum if len(stimes) > 9 and len(smags) > 9 and len(serrs) > 9: # get the frequencies to use if startp: endf = 1.0 / startp else: # default start period is 0.1 day endf = 1.0 / 0.1 if endp: startf = 1.0 / endp else: # default end period is length of time series startf = 1.0 / (stimes.max() - stimes.min()) # if we're not using autofreq, then use the provided frequencies if not autofreq: frequencies = np.arange(startf, endf, stepsize) if verbose: LOGINFO( 'using %s frequency points, start P = %.3f, end P = %.3f' % (frequencies.size, 1.0 / endf, 1.0 / startf)) else: # this gets an automatic grid of frequencies to use frequencies = get_frequency_grid(stimes, minfreq=startf, maxfreq=endf) if verbose: LOGINFO('using autofreq with %s frequency points, ' 'start P = %.3f, end P = %.3f' % (frequencies.size, 1.0 / frequencies.max(), 1.0 / frequencies.min())) # map to parallel workers if (not nworkers) or (nworkers > NCPUS): nworkers = NCPUS if verbose: LOGINFO('using %s workers...' % nworkers) pool = Pool(nworkers) # renormalize the working mags to zero and scale them so that the # variance = 1 for use with our LSP functions if normalize: nmags = (smags - npmedian(smags)) / npstd(smags) else: nmags = smags # figure out the weighted variance # www.itl.nist.gov/div898/software/dataplot/refman2/ch2/weighvar.pdf magvariance_top = npsum(nmags / (serrs * serrs)) magvariance_bot = (nmags.size - 1) * npsum( 1.0 / (serrs * serrs)) / nmags.size magvariance = magvariance_top / magvariance_bot tasks = [(stimes, nmags, serrs, x, nharmonics, magvariance) for x in frequencies] lsp = pool.map(aovhm_theta_worker, tasks) pool.close() pool.join() del pool lsp = nparray(lsp) periods = 1.0 / frequencies # find the nbestpeaks for the periodogram: 1. sort the lsp array by # highest value first 2. 
go down the values until we find five # values that are separated by at least periodepsilon in period # make sure to filter out non-finite values finitepeakind = npisfinite(lsp) finlsp = lsp[finitepeakind] finperiods = periods[finitepeakind] # make sure that finlsp has finite values before we work on it try: bestperiodind = npargmax(finlsp) except ValueError: LOGERROR('no finite periodogram values ' 'for this mag series, skipping...') return { 'bestperiod': npnan, 'bestlspval': npnan, 'nbestpeaks': nbestpeaks, 'nbestlspvals': None, 'nbestperiods': None, 'lspvals': None, 'periods': None, 'method': 'mav', 'kwargs': { 'startp': startp, 'endp': endp, 'stepsize': stepsize, 'normalize': normalize, 'nharmonics': nharmonics, 'autofreq': autofreq, 'periodepsilon': periodepsilon, 'nbestpeaks': nbestpeaks, 'sigclip': sigclip } } sortedlspind = np.argsort(finlsp)[::-1] sortedlspperiods = finperiods[sortedlspind] sortedlspvals = finlsp[sortedlspind] prevbestlspval = sortedlspvals[0] # now get the nbestpeaks nbestperiods, nbestlspvals, peakcount = ([finperiods[bestperiodind]], [finlsp[bestperiodind]], 1) prevperiod = sortedlspperiods[0] # find the best nbestpeaks in the lsp and their periods for period, lspval in zip(sortedlspperiods, sortedlspvals): if peakcount == nbestpeaks: break perioddiff = abs(period - prevperiod) bestperiodsdiff = [abs(period - x) for x in nbestperiods] # print('prevperiod = %s, thisperiod = %s, ' # 'perioddiff = %s, peakcount = %s' % # (prevperiod, period, perioddiff, peakcount)) # this ensures that this period is different from the last # period and from all the other existing best periods by # periodepsilon to make sure we jump to an entire different peak # in the periodogram if (perioddiff > (periodepsilon * prevperiod) and all(x > (periodepsilon * prevperiod) for x in bestperiodsdiff)): nbestperiods.append(period) nbestlspvals.append(lspval) peakcount = peakcount + 1 prevperiod = period return { 'bestperiod': finperiods[bestperiodind], 'bestlspval': finlsp[bestperiodind], 'nbestpeaks': nbestpeaks, 'nbestlspvals': nbestlspvals, 'nbestperiods': nbestperiods, 'lspvals': lsp, 'periods': periods, 'method': 'mav', 'kwargs': { 'startp': startp, 'endp': endp, 'stepsize': stepsize, 'normalize': normalize, 'nharmonics': nharmonics, 'autofreq': autofreq, 'periodepsilon': periodepsilon, 'nbestpeaks': nbestpeaks, 'sigclip': sigclip } } else: LOGERROR('no good detections for these times and mags, skipping...') return { 'bestperiod': npnan, 'bestlspval': npnan, 'nbestpeaks': nbestpeaks, 'nbestlspvals': None, 'nbestperiods': None, 'lspvals': None, 'periods': None, 'method': 'mav', 'kwargs': { 'startp': startp, 'endp': endp, 'stepsize': stepsize, 'normalize': normalize, 'nharmonics': nharmonics, 'autofreq': autofreq, 'periodepsilon': periodepsilon, 'nbestpeaks': nbestpeaks, 'sigclip': sigclip } }
def extract_var(year, var_name, by_tile, data_band, data_type, df_tile,
                df_xy, basepath, search_str, path_filter, mosaic_tx,
                file_count, n_files, nodata=None, kernel=False):
    ''' Return a dataframe of extracted raster values (and kernel stats)
    for each point, along with the updated file count'''
    t0 = time.time()
    dfs = []  # For storing kernel and stats
    var_col = var_name  # + str(year)
    file_col = 'file_' + var_col
    #file_count = last_file

    # Store the filepath for each tile
    if by_tile:
        df_tile[file_col] = [
            find_file(basepath, search_str.format(year), tile, path_filter)
            for tile in df_tile.tile_id
        ]
    else:
        df_tile[file_col] = find_file(basepath, search_str.format(year),
                                      path_filter=path_filter)

    # Handle any rows for which the file is null
    if df_tile[file_col].isnull().any():
        df_null = df_tile[df_tile[file_col].isnull()]
        print 'Tiles excluded from extractions for %s from %s:' % (var_name, year)
        for ind, row in df_null.iterrows():
            print row['tile_str']
        print ''
        n_null = len(df_null)
        # Make the file name a unique integer so that it can be
        # distinguished from real files and from other null files
        df_tile.loc[df_null.index, file_col] = range(n_null)

    # Get the file string for each xy for each year
    df_xy[file_col] = ''  # Creates the column but keeps it empty
    for tile in df_xy.tile_id.unique():
        try:
            df_xy.loc[df_xy['tile_id'] == tile, file_col] = df_tile.loc[
                df_tile['tile_id'] == tile, file_col].values[0]
        except:
            import pdb
            pdb.set_trace()

    # For each file, get the dataset as an array and extract all values
    # at each row col
    val_cols = ['%s_%s' % (var_col, i) for i in range(1, 10)]
    '''for f in df_tile[file_col].unique():
        print 'Extracting for array %s of approximately %s from:\n%s\n'\
        % (last_file, n_files, f)
        dfs.append(extract_by_rowcol(df_xy, f, file_col, var_col, data_band,
                                     mosaic_tx, val_cols, data_type, nodata,
                                     kernel))
        file_count += 1'''
    args = []
    for i, f in enumerate(df_tile[file_col].unique()):
        args.append([
            df_xy, f, file_col, var_col, data_band, mosaic_tx, val_cols,
            data_type, nodata, kernel, i + 1 + file_count
        ])
    n_jobs = 10
    pool = Pool(n_jobs)
    this_count = len(args)
    print 'Extracting from %s-%s files of %s...' % (file_count,
                                                    file_count + this_count,
                                                    n_files)
    dfs = pool.map(par_extract_by_rowcol, args, 1)
    pool.close()
    pool.join()
    print '\nTime for this variable: %.1f minutes\n' % (
        (time.time() - t0) / 60)
    file_count += this_count

    # Combine all the pieces for this year
    df_var = pd.concat(dfs)

    return df_var, file_count
def apply_by_multiprocessing_list_to_list(df, func, **kwargs): workers = kwargs.pop('workers') pool = Pool(processes=workers) result = pool.map(apply_list, [(d, func, kwargs) for d in np.array_split(df, workers)]) pool.close() return result
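# --- Added usage sketch (hypothetical; apply_list is assumed to unpack a
# (chunk, func, kwargs) tuple, so a minimal stand-in is defined here) ---
import numpy as np
import pandas as pd


def apply_list(packed):
    chunk, func, kwargs = packed
    return [func(row, **kwargs) for _, row in chunk.iterrows()]


def scale_a(row, factor=1):
    return row['a'] * factor


if __name__ == '__main__':
    df = pd.DataFrame({'a': range(8)})
    chunks = apply_by_multiprocessing_list_to_list(df, scale_a,
                                                   workers=2, factor=10)
    # one result list per worker chunk; flatten to a single list
    print([x for chunk in chunks for x in chunk])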
def run(**kwargs): if len(kwargs.get("pipeline")) == 0: raise TLException("pipeline command must be given.") parallel_count = int(kwargs['parallel_count']) input_files = kwargs["input"] running_configs = [] gpu_resources = kwargs.get("gpu_resources") # setup the running config pipeline_cleaned = kwargs['pipeline'] for i, each in enumerate(input_files): each_config = { "input": each, "command": pipeline_cleaned, "output_folder": kwargs.get("output_folder"), "output_name": kwargs.get("output_name"), "ground_truth_pattern": kwargs.get("ground_truth_pattern"), "ground_truth_directory": kwargs.get("ground_truth_directory", ""), "score_column": kwargs.get("score_column"), "debug": kwargs.get("debug", False) } running_configs.append(each_config) if len(gpu_resources) > 0: each_config["gpu_id"] = gpu_resources[i % len(gpu_resources)] else: each_config["gpu_id"] = None # start running try: from multiprocessing import Pool from tqdm import tqdm import time import pandas as pd from io import StringIO from tl.utility.run_pipelines_utility import PipelineUtility if parallel_count == 1: results = [] for each in tqdm(running_configs): results.append(PipelineUtility.run_one_pipeline(each)) else: from multiprocessing import set_start_method set_start_method("spawn") # use multiprocess pool function to run in parallel mode p = Pool(parallel_count) result = p.map_async(PipelineUtility.run_one_pipeline, running_configs) pbar = tqdm(total=len(running_configs)) previous_remain = len(running_configs) while not result.ready(): remain_job = result._number_left if remain_job != previous_remain: pbar.update(previous_remain - remain_job) previous_remain = remain_job time.sleep(2) pbar.close() results = result.get() p.close() p.join() PipelineUtility.print_pipeline_running_results( results, omit_header=kwargs['omit_headers'], input_files=input_files, tag=kwargs.get('tag')) except: message = 'Command: run-pipeline\n' message += 'Error Message: {}\n'.format(traceback.format_exc()) raise TLException(message)
class MapWrapper: """ Parallelisation wrapper for working with map-like callables, such as `multiprocessing.Pool.map`. Parameters ---------- pool : int or map-like callable If `pool` is an integer, then it specifies the number of threads to use for parallelization. If ``int(pool) == 1``, then no parallel processing is used and the map builtin is used. If ``pool == -1``, then the pool will utilize all available CPUs. If `pool` is a map-like callable that follows the same calling sequence as the built-in map function, then this callable is used for parallelization. """ def __init__(self, pool=1): self.pool = None self._mapfunc = map self._own_pool = False if callable(pool): self.pool = pool self._mapfunc = self.pool else: from multiprocessing import Pool # user supplies a number if int(pool) == -1: # use as many processors as possible self.pool = Pool() self._mapfunc = self.pool.map self._own_pool = True elif int(pool) == 1: pass elif int(pool) > 1: # use the number of processors requested self.pool = Pool(processes=int(pool)) self._mapfunc = self.pool.map self._own_pool = True else: raise RuntimeError("Number of workers specified must be -1," " an int >= 1, or an object with a 'map' " "method") def __enter__(self): return self def terminate(self): if self._own_pool: self.pool.terminate() def join(self): if self._own_pool: self.pool.join() def close(self): if self._own_pool: self.pool.close() def __exit__(self, exc_type, exc_value, traceback): if self._own_pool: self.pool.close() self.pool.terminate() def __call__(self, func, iterable): # only accept one iterable because that's all Pool.map accepts try: return self._mapfunc(func, iterable) except TypeError as e: # wrong number of arguments raise TypeError("The map-like callable must be of the" " form f(func, iterable)") from e
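# --- Added usage example (a sketch mirroring the docstring above) ---
def square(x):
    return x * x


if __name__ == '__main__':
    # pool=2 owns a two-process Pool; the context manager guarantees
    # close/terminate on exit
    with MapWrapper(pool=2) as mapper:
        print(list(mapper(square, range(10))))
    # pool=1 falls back to the serial built-in map, same call shape
    with MapWrapper(pool=1) as mapper:
        print(list(mapper(square, range(10))))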
def create_threads(): pool = Pool() results = pool.map(get_rosters, get_roster_links()) pool.close() pool.join() return results
def download_video_page_async_multi_process(self, process_num=10): pool = Pool(processes=process_num) for line in range(process_num): pool.apply_async(self.download_video_page_async_single_process) pool.close() pool.join()
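# --- Added note (assumed refinement, not from the original class) ---
# apply_async above discards its AsyncResult handles, so a worker that
# raises fails silently. Keeping the handles and calling get() surfaces
# errors; fetch_page is a hypothetical stand-in for the download method.
from multiprocessing import Pool


def fetch_page(i):
    return 'page %d' % i


if __name__ == '__main__':
    with Pool(processes=4) as pool:
        handles = [pool.apply_async(fetch_page, (i,)) for i in range(10)]
        # get() re-raises any exception from the worker in the parent
        print([h.get(timeout=30) for h in handles])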
def gen_html_report(param_dct, output_df, lipid_info_img_lst):
    usr_vendor = param_dct['vendor']
    output_folder = param_dct['img_output_folder_str']
    usr_ms1_ppm = param_dct['ms_ppm']
    usr_ms2_ppm = param_dct['ms2_ppm']
    usr_ms1_precision = usr_ms1_ppm * 1e-6
    usr_ms2_precision = usr_ms2_ppm * 1e-6
    usr_core_num = param_dct['core_number']
    usr_dpi = param_dct['img_dpi']
    usr_img_type = param_dct['img_type']
    hunter_start_time_str = param_dct['hunter_start_time']

    # stay in the current working directory
    current_path = os.getcwd()
    if os.path.isdir(output_folder):
        os.chdir(output_folder)
        if os.path.isdir('LipidHunter_Results_Figures_%s'
                         % hunter_start_time_str):
            print('[INFO] --> Output folder already exists...')
        else:
            os.mkdir('LipidHunter_Results_Figures_%s' % hunter_start_time_str)
            print('[INFO] --> Output folder created...')
    else:
        os.mkdir(output_folder)
        os.chdir(output_folder)
        os.mkdir('LipidHunter_Results_Figures_%s' % hunter_start_time_str)
        print('[INFO] --> Output folder created...')
    os.chdir(current_path)

    # generate html files
    log_pager = LogPageCreator(output_folder, hunter_start_time_str,
                               param_dct)
    log_pager.add_all_info(output_df)
    log_pager.close_page()
    # del log_pager

    print('[STATUS] >>> start to generate images: image count %i'
          % len(lipid_info_img_lst))

    if usr_core_num > 1:
        parallel_pool = Pool(usr_core_num)
        img_num = len(lipid_info_img_lst)
        img_sub_len = int(math.ceil(img_num / usr_core_num))
        img_sub_key_lst = [
            lipid_info_img_lst[k:k + img_sub_len]
            for k in range(0, img_num, img_sub_len)
        ]
        worker_count = 1
        for img_sub_lst in img_sub_key_lst:
            if isinstance(img_sub_lst, (tuple, list)):
                if None in img_sub_lst:
                    img_sub_lst = [x for x in img_sub_lst if x is not None]
                # img_params_dct = {'lipid_info_img_lst': img_sub_lst,
                #                   'usr_core_num': usr_core_num,
                #                   'usr_img_type': usr_img_type,
                #                   'usr_dpi': usr_dpi,
                #                   'usr_vendor': usr_vendor,
                #                   'usr_ms1_precision': usr_ms1_precision,
                #                   'worker_count': worker_count}
                if len(img_sub_lst) > 0:
                    print('[STATUS] >>> Core #%i ==> Generating output '
                          'images ... image count: %i'
                          % (worker_count, len(img_sub_lst)))
                    if 'debug_mode' in list(param_dct.keys()):
                        if param_dct['debug_mode'] == 'ON':
                            for img_param_dct in img_sub_lst:
                                print(img_param_dct['save_img_as'])
                    parallel_pool.apply_async(
                        gen_plot,
                        args=(img_sub_lst, worker_count, usr_img_type,
                              usr_dpi, usr_vendor, usr_ms1_precision))
                    worker_count += 1
        # del img_sub_key_lst
        # del img_sub_lst
        parallel_pool.close()
        parallel_pool.join()
    else:
        worker_count = 1
        print('[INFO] --> Using single core mode...')
        if isinstance(lipid_info_img_lst, (tuple, list)):
            if None in lipid_info_img_lst:
                lipid_info_img_lst = [
                    x for x in lipid_info_img_lst if x is not None
                ]
            if len(lipid_info_img_lst) > 0:
                gen_plot(lipid_info_img_lst, worker_count, usr_img_type,
                         usr_dpi, usr_vendor, usr_ms1_precision)
def analyse(config, prog, argv): parser = argparse.ArgumentParser(prog=prog, description='Analyse patch stacks') # thresholds parser.add_argument('-th', dest='thres_heading', metavar='threshold', default=config.thresholds.heading, type=float, help='Minimum diff hunk section heading similarity ' '(default: %(default)s)') parser.add_argument('-tf', dest='thres_filename', metavar='threshold', default=config.thresholds.filename, type=float, help='Minimum filename similarity ' '(default: %(default)s)') parser.add_argument( '-dlr', dest='thres_diff_lines', metavar='threshold', type=float, default=config.thresholds.diff_lines_ratio, help='Diff lines ratio threshold (default: %(default)s)') parser.add_argument('-adi', dest='thres_adi', metavar='days', type=int, default=config.thresholds.author_date_interval, help='Author date interval (default: %(default)s)') parser.add_argument('-er', dest='er_filename', metavar='filename', default=config.f_evaluation_result, help='Evaluation result PKL filename') parser.add_argument('-cpu', dest='cpu_factor', metavar='cpu', type=float, default=1.0, help='CPU factor for parallelisation ' '(default: %(default)s)') parser.add_argument('-linux', dest='linux', action='store_true', default=False, help='Make a Linux kernel specific analysis') # choose analysis mode parser.add_argument('mode', default='succ', choices=['succ', 'rep', 'upstream'], help='rep: ' 'compare representatives of the stack - ' 'succ: ' 'compare successive versions of the stacks - ' 'upstream: ' 'compare representatives against upstream - ' '(default: %(default)s)') parser.add_argument('-upstream', dest='upstream_range', metavar='<revision range>', default=None, help='Specify upstream revision range, ' 'e.g.: v0.1..v0.2 (default: %s)' % config.upstream_range) parser.add_argument('-differential', dest='differential', action='store_true', default=False, help='Perform a differential analysis') args = parser.parse_args(argv) config.thresholds.heading = args.thres_heading config.thresholds.filename = args.thres_filename config.thresholds.diff_lines_ratio = args.thres_diff_lines config.thresholds.author_date_interval = args.thres_adi repo = config.repo mbox = config.mode == Config.Mode.MBOX mode = args.mode if mbox and mode == 'succ': log.error('Analysis mode succ is not available in mailbox mode!') return -1 if not mbox and args.differential: log.error( 'Differential analysis can only be performed in mailbox mode') f_cluster, cluster = config.load_cluster(must_exist=False) def fill_result(hashes, tag): for hash in hashes: cluster.insert_element(hash) if tag: cluster.mark_upstream(hash, True) # intermediate persistence cluster.to_file(f_cluster) if mbox: log.info('Regarding mails in time window %s--%s' % (format_date_ymd( config.mbox_mindate), format_date_ymd(config.mbox_maxdate))) # load mbox ccache very early, because we need it in any case if it # exists. config.load_ccache_mbox() new_patches = set() if mode == 'rep': victims = repo.mbox.get_ids(config.mbox_time_window) # we have to temporarily cache those commits to filter out invalid # emails. Commit cache is already loaded, so evict everything except # victims and then cache all victims. repo.cache_evict_except(victims) repo.cache_commits(victims) # we might have loaded invalid emails, so reload the victim list once # more. 
# This time, include all patches from the pre-existing (partial)
            # result, and check if all patches are reachable
            victims = repo.mbox.get_ids(config.mbox_time_window) | \
                cluster.get_downstream()

            # in case of an mbox analysis, we will definitely need all
            # untagged commit hashes as we need to determine the
            # representative system for both modes, rep and upstream.
            available = repo.cache_commits(victims)
            if available != victims:
                missing = victims - available
                log.warning('MAILBOX RESULT CONTAINS %d MESSAGES THAT ARE '
                            'NOT REACHABLE BY THE MAILBOX CONFIGURATION'
                            % len(missing))
                log.warning('Those messages will be removed from the result')
                log.warning('Waiting 5 seconds before starting. Press '
                            'Ctrl-C to abort.')
                sleep(5)
                for miss in missing:
                    cluster.remove_element(miss)
                cluster.optimize()
                victims = available

            if args.linux:
                if config.mbox_use_patchwork_id:
                    log.error('Doesn\'t work with USE_PATCHWORK_ID = true')
                    return -1
                log.info('Searching for non-Linux patches...')
                repo.mbox.load_threads()
                characteristic = load_linux_mail_characteristics(repo,
                                                                 victims)
                linux_patches = {
                    victim for victim in victims
                    if characteristic[victim].patches_linux
                }
                log.info('Will consider only %u/%u patches (%0.3f%%) as '
                         'Linux patches'
                         % (len(linux_patches), len(victims),
                            len(linux_patches) * 100.0 / len(victims)))
                victims = linux_patches
                repo.cache_evict_except(victims)

            # get new downstream patches since previous analysis
            new_patches = victims - cluster.get_downstream()
            log.info('Cached %d relevant mails' % len(available))
            fill_result(victims, False)

    cherries = EvaluationResult()

    if mode == 'succ':
        victims = config.psd.commits_on_stacks
        fill_result(victims, False)
        num_cpus = int(cpu_count() * args.cpu_factor)
        psd = config.psd
        global _repo
        repo = config.repo
        _repo = repo
        config.load_ccache_stack()

        evaluation_list = []
        for patch_stack in psd:
            successor = psd.get_successor(patch_stack)
            if successor is None:
                break
            log.info('Queueing %s <-> %s' % (patch_stack.stack_version,
                                             successor.stack_version))
            evaluation_list.append(
                (patch_stack.commit_hashes, successor.commit_hashes))

        # cache missing commits
        repo.cache_commits(psd.commits_on_stacks)

        cherries = find_cherries(repo, psd.commits_on_stacks,
                                 psd.commits_on_stacks)

        f = partial(_evaluate_patch_list_wrapper, config.thresholds)
        log.info('Starting evaluation.')
        pool = Pool(num_cpus, maxtasksperchild=1)
        results = pool.map(f, evaluation_list, chunksize=5)
        pool.close()
        pool.join()
        log.info(' ↪ done.')
        _repo = None

        evaluation_result = EvaluationResult(False,
                                             EvaluationType.PatchStack)
        for result in results:
            evaluation_result.merge(result)
    else:  # mode is rep or upstream
        # iterate over similar patch list and get latest commit of patches
        log.info('Determining patch stack representative system')

        # Get the complete representative system
        # The lambda compares two patches of an equivalence class and
        # chooses the one with the later release version
        if mbox:
            representatives = cluster.get_representative_system(
                lambda x, y: repo.get_commit(x).author.date >
                repo.get_commit(y).author.date)
        else:
            representatives = cluster.get_representative_system(
                lambda x, y: config.psd.is_stack_version_greater(
                    config.psd.get_stack_of_commit(x),
                    config.psd.get_stack_of_commit(y)))
        log.info(' ↪ done')

        if mode == 'upstream':
            if args.upstream_range is not None:
                candidates = set(repo.get_commithash_range(
                    args.upstream_range))
            else:
                candidates = set(config.upstream_hashes)

            # get new upstream patches since last analysis
            new_patches |= candidates - cluster.get_upstream()
            fill_result(candidates, True)
config.load_ccache_upstream() # cache missing commits repo.cache_commits(representatives | candidates) repo.cache_evict_except(representatives | candidates) cherries = find_cherries(repo, representatives, candidates) type = EvaluationType.Upstream elif mode == 'rep': repo.cache_commits(representatives) candidates = representatives if not mbox: cherries = find_cherries(repo, representatives, config.psd.commits_on_stacks) type = EvaluationType.PatchStack if args.differential: representatives = representatives | new_patches log.info('Starting differential evaluation of %u new patches' % len(new_patches)) differential_evaluation = evaluate_commit_list( repo, config.thresholds, mbox, type, representatives, new_patches, parallelise=True, verbose=True, cpu_factor=args.cpu_factor) evaluation_result = EvaluationResult.from_file( config.f_evaluation_result, config.d_false_positives) if evaluation_result: evaluation_result.merge(differential_evaluation) else: evaluation_result = differential_evaluation else: log.info('Starting evaluation') evaluation_result = evaluate_commit_list( repo, config.thresholds, mbox, type, representatives, candidates, parallelise=True, verbose=True, cpu_factor=args.cpu_factor) log.info(' ↪ done.') evaluation_result.merge(cherries) evaluation_result.to_file(args.er_filename)
def update_book_info(isbnfile, shelf_file, info_file):
    '''Update half.com prices based on ISBN'''
    global result_file          # results file
    global onshelf_file         # listing-info results file
    global success_isbn_file
    global offshelf_isbn_file
    global not_crawl_file
    global not_list_file
    # global bu_onshelf_file    # records whose condition fails requirements
    # global bu_half_file       #----
    global delete_file          # records ISBNs with unacceptable weight and
                                # books whose purchase price is below 1
    # global all_fail_file
    global success_sku_file
    global lock
    lock = Lock()
    result_file = open(info_file, "w")
    onshelf_file = open(shelf_file, "w")
    # bu_onshelf_file = open('./update/info/onshelf_condition_fail.csv', "w")
    # bu_half_file = open('./update/info/half_condition_fail.csv', "w")
    success_isbn_file = open("./update/info/success_isbn.txt", "w")
    offshelf_isbn_file = open("./update/info/offshelf_isbn.txt", "w")
    not_crawl_file = open("./update/info/not_crawl.txt", "w")
    not_list_file = open("./update/info/not_found.txt", "w")
    delete_file = open("./update/info/delete_isbn.txt", "w")
    success_sku_file = open("./update/isbn/success_sku.txt", 'w')
    # all_fail_file = open("./update/isbn/fail_file.txt", 'w')
    titles = [
        'ISBN', 'ISBN13', 'weight', 'auther', 'condition', 'price',
        'sec_price'
    ]
    shelf_titles = [
        'sku', 'price', 'minimum-seller-allowed-price', 'auther',
        'maximum-seller-allowed-price', 'quantity', 'leadtime-to-ship'
    ]
    create_titles(info_file, titles)
    create_titles(shelf_file, shelf_titles)

    # fetch the ISBN list
    # path = "./update/isbn"
    # isbns = get_isbns(path)
    isbn_file = open(isbnfile)
    isbns = isbn_file.readlines()
    #isbns = isbns[:100]
    # #print isbns
    pool = Pool(20)
    pool.map(get_book_info, isbns)
    pool.close()
    pool.join()
    result_file.close()
    onshelf_file.close()
    success_isbn_file.close()
    success_sku_file.close()
    offshelf_isbn_file.close()
    not_list_file.close()
    # bu_onshelf_file.close()
    # bu_half_file.close()
    not_crawl_file.close()
    delete_file.close()
def initialize_topics():
    """Reinitialize topics:
    1. Set each topic's status to "initializing".
    2. Set the topic's start time.
    3. Clear the subeventid, weight, duplicate and same_from fields of
       every news item in the news table.
    4. Clear the topic's related data from the subevent table.
    5. Clear the topic's related data from the subevent feature-word table.
    """
    topic_start = []
    """
    topicname = u'APEC2014'
    start_datetime = "2014-11-04 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'手术台自拍'
    start_datetime = "2014-12-23 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'呼格案'
    start_datetime = "2014-12-14 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'复旦投毒案'
    start_datetime = "2014-02-18 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'非法占中'
    start_datetime = "2014-09-30 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'马航失联'
    start_datetime = "2014-03-10 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'博鳌论坛'
    start_datetime = "2014-04-03 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'昆明火车站暴恐案'
    start_datetime = "2014-03-03 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'乌鲁木齐火车站暴恐'
    start_datetime = "2014-05-01 00:00:00"
    topic_start.append((topicname, start_datetime))
    topicname = u'全军政治工作会议'
    start_datetime = "2014-11-04 00:00:00"
    topic_start.append((topicname, start_datetime))
    """
    topicname = u'高校思想宣传工作'
    start_datetime = "2015-01-31 00:00:00"
    """
    topicname = u'张灵甫遗骨被埋羊圈'
    start_datetime = "2015-01-31 00:00:00"
    """
    topic_start.append((topicname, start_datetime))

    pool = Pool()
    pool.map(one_topic_clear, topic_start)
    pool.close()
    pool.join()
class MultiProcPlugin(DistributedPluginBase): """Execute workflow with multiprocessing, not sending more jobs at once than the system can support. The plugin_args input to run can be used to control the multiprocessing execution and defining the maximum amount of memory and threads that should be used. When those parameters are not specified, the number of threads and memory of the system is used. System consuming nodes should be tagged: memory_consuming_node.interface.estimated_memory_gb = 8 thread_consuming_node.interface.num_threads = 16 The default number of threads and memory for a node is 1. Currently supported options are: - non_daemon : boolean flag to execute as non-daemon processes - n_procs: maximum number of threads to be executed in parallel - memory_gb: maximum memory (in GB) that can be used at once. """ def __init__(self, plugin_args=None): # Init variables and instance attributes super(MultiProcPlugin, self).__init__(plugin_args=plugin_args) self._taskresult = {} self._task_obj = {} self._taskid = 0 non_daemon = True self.plugin_args = plugin_args self.processors = cpu_count() self.memory_gb = get_system_total_memory_gb( ) * 0.9 # 90% of system memory self._timeout = 2.0 self._event = threading.Event() # Check plugin args if self.plugin_args: if 'non_daemon' in self.plugin_args: non_daemon = plugin_args['non_daemon'] if 'n_procs' in self.plugin_args: self.processors = self.plugin_args['n_procs'] if 'memory_gb' in self.plugin_args: self.memory_gb = self.plugin_args['memory_gb'] logger.debug("MultiProcPlugin starting %d threads in pool" % (self.processors)) # Instantiate different thread pools for non-daemon processes if non_daemon: # run the execution using the non-daemon pool subclass self.pool = NonDaemonPool(processes=self.processors) else: self.pool = Pool(processes=self.processors) def _wait(self): if len(self.pending_tasks) > 0: if self._config['execution']['poll_sleep_duration']: self._timeout = float( self._config['execution']['poll_sleep_duration']) sig_received = self._event.wait(self._timeout) if not sig_received: logger.debug( 'MultiProcPlugin timeout before signal received. Deadlock averted??' ) self._event.clear() def _async_callback(self, args): self._taskresult[args['taskid']] = args self._event.set() def _get_result(self, taskid): if taskid not in self._taskresult: result = None else: result = self._taskresult[taskid] return result def _report_crash(self, node, result=None): if result and result['traceback']: node._result = result['result'] node._traceback = result['traceback'] return report_crash(node, traceback=result['traceback']) else: return report_crash(node) def _clear_task(self, taskid): del self._task_obj[taskid] def _submit_job(self, node, updatehash=False): self._taskid += 1 if hasattr(node.inputs, 'terminal_output'): if node.inputs.terminal_output == 'stream': node.inputs.terminal_output = 'allatonce' self._task_obj[self._taskid] = \ self.pool.apply_async(run_node, (node, updatehash, self._taskid), callback=self._async_callback) return self._taskid def _close(self): self.pool.close() return True def _send_procs_to_workers(self, updatehash=False, graph=None): """ Sends jobs to workers when system resources are available. Check memory (gb) and cores usage before running jobs. 
""" executing_now = [] # Check to see if a job is available currently_running_jobids = np.flatnonzero((self.proc_pending == True) & \ (self.depidx.sum(axis=0) == 0).__array__()) # Check available system resources by summing all threads and memory used busy_memory_gb = 0 busy_processors = 0 for jobid in currently_running_jobids: if self.procs[jobid]._interface.estimated_memory_gb <= self.memory_gb and \ self.procs[jobid]._interface.num_threads <= self.processors: busy_memory_gb += self.procs[ jobid]._interface.estimated_memory_gb busy_processors += self.procs[jobid]._interface.num_threads else: raise ValueError( "Resources required by jobid %d (%f GB, %d threads)" "exceed what is available on the system (%f GB, %d threads)" % (jobid, self.procs[jobid].__interface.estimated_memory_gb, self.procs[jobid].__interface.num_threads, self.memory_gb, self.processors)) free_memory_gb = self.memory_gb - busy_memory_gb free_processors = self.processors - busy_processors # Check all jobs without dependency not run jobids = np.flatnonzero((self.proc_done == False) & \ (self.depidx.sum(axis=0) == 0).__array__()) # Sort jobs ready to run first by memory and then by number of threads # The most resource consuming jobs run first jobids = sorted(jobids, key=lambda item: (self.procs[item]._interface.estimated_memory_gb, self. procs[item]._interface.num_threads)) if str2bool(config.get('execution', 'profile_runtime')): logger.debug('Free memory (GB): %d, Free processors: %d', free_memory_gb, free_processors) # While have enough memory and processors for first job # Submit first job on the list for jobid in jobids: if str2bool(config.get('execution', 'profile_runtime')): logger.debug('Next Job: %d, memory (GB): %d, threads: %d' \ % (jobid, self.procs[jobid]._interface.estimated_memory_gb, self.procs[jobid]._interface.num_threads)) if self.procs[jobid]._interface.estimated_memory_gb <= free_memory_gb and \ self.procs[jobid]._interface.num_threads <= free_processors: logger.info('Executing: %s ID: %d' % (self.procs[jobid]._id, jobid)) executing_now.append(self.procs[jobid]) if isinstance(self.procs[jobid], MapNode): try: num_subnodes = self.procs[jobid].num_subnodes() except Exception: etype, eval, etr = sys.exc_info() traceback = format_exception(etype, eval, etr) report_crash(self.procs[jobid], traceback=traceback) self._clean_queue(jobid, graph) self.proc_pending[jobid] = False continue if num_subnodes > 1: submit = self._submit_mapnode(jobid) if not submit: continue # change job status in appropriate queues self.proc_done[jobid] = True self.proc_pending[jobid] = True free_memory_gb -= self.procs[ jobid]._interface.estimated_memory_gb free_processors -= self.procs[jobid]._interface.num_threads # Send job to task manager and add to pending tasks if self._status_callback: self._status_callback(self.procs[jobid], 'start') if str2bool(self.procs[jobid].config['execution'] ['local_hash_check']): logger.debug('checking hash locally') try: hash_exists, _, _, _ = self.procs[jobid].hash_exists() logger.debug('Hash exists %s' % str(hash_exists)) if (hash_exists and (self.procs[jobid].overwrite == False or (self.procs[jobid].overwrite == None and not self.procs[jobid]._interface.always_run))): self._task_finished_cb(jobid) self._remove_node_dirs() continue except Exception: etype, eval, etr = sys.exc_info() traceback = format_exception(etype, eval, etr) report_crash(self.procs[jobid], traceback=traceback) self._clean_queue(jobid, graph) self.proc_pending[jobid] = False continue logger.debug('Finished checking hash') if 
self.procs[jobid].run_without_submitting: logger.debug('Running node %s on master thread' \ % self.procs[jobid]) try: self.procs[jobid].run() except Exception: etype, eval, etr = sys.exc_info() traceback = format_exception(etype, eval, etr) report_crash(self.procs[jobid], traceback=traceback) self._task_finished_cb(jobid) self._remove_node_dirs() else: logger.debug('MultiProcPlugin submitting %s' % str(jobid)) tid = self._submit_job(deepcopy(self.procs[jobid]), updatehash=updatehash) if tid is None: self.proc_done[jobid] = False self.proc_pending[jobid] = False else: self.pending_tasks.insert(0, (tid, jobid)) else: break
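# --- Added reference sketch (not part of this snippet) ---
# NonDaemonPool is referenced above but defined elsewhere in nipype. The
# usual recipe, assuming multiprocessing internals up to Python 3.7 where
# Pool instantiates its `Process` class attribute, overrides the daemon
# flag so pool workers may spawn their own children:
import multiprocessing
import multiprocessing.pool


class NonDaemonProcess(multiprocessing.Process):
    @property
    def daemon(self):
        # report workers as non-daemonic so they may fork children
        return False

    @daemon.setter
    def daemon(self, value):
        pass  # ignore the pool's attempt to daemonize the worker


class NonDaemonPool(multiprocessing.pool.Pool):
    Process = NonDaemonProcess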
class CeleryExecutor(BaseExecutor):
    """
    CeleryExecutor is recommended for production use of Airflow. It allows
    distributing the execution of task instances to multiple worker nodes.

    Celery is a simple, flexible and reliable distributed system to process
    vast amounts of messages, while providing operations with the tools
    required to maintain such a system.
    """

    def __init__(self):
        super().__init__()

        # Celery doesn't support querying the state of multiple tasks in
        # parallel (which can become a bottleneck on bigger clusters) so we
        # use a multiprocessing pool to speed this up.
        # How many worker processes are created for checking celery task
        # state.
        self._sync_parallelism = conf.getint('celery', 'SYNC_PARALLELISM')
        if self._sync_parallelism == 0:
            self._sync_parallelism = max(1, cpu_count() - 1)

        self._sync_pool = None
        self.tasks = {}
        self.last_state = {}

    def start(self) -> None:
        self.log.debug('Starting Celery Executor using %s processes for '
                       'syncing', self._sync_parallelism)

    def _num_tasks_per_send_process(self, to_send_count: int) -> int:
        """
        How many Celery tasks should each worker process send.

        :return: Number of tasks that should be sent per process
        :rtype: int
        """
        return max(
            1, int(math.ceil(1.0 * to_send_count / self._sync_parallelism)))

    def _num_tasks_per_fetch_process(self) -> int:
        """
        How many Celery task states should be fetched by each worker
        process.

        :return: Number of tasks that should be used per process
        :rtype: int
        """
        return max(
            1, int(math.ceil(1.0 * len(self.tasks) / self._sync_parallelism)))

    def trigger_tasks(self, open_slots: int) -> None:
        """
        Overwrite trigger_tasks function from BaseExecutor

        :param open_slots: Number of open slots
        :return:
        """
        sorted_queue = self.order_queued_tasks_by_priority()

        task_tuples_to_send: List[TaskInstanceInCelery] = []

        for _ in range(min((open_slots, len(self.queued_tasks)))):
            key, (command, _, queue, simple_ti) = sorted_queue.pop(0)
            task_tuples_to_send.append(
                (key, simple_ti, command, queue, execute_command))

        cached_celery_backend = None
        if task_tuples_to_send:
            tasks = [t[4] for t in task_tuples_to_send]

            # Celery state queries will get stuck if we do not use one same
            # backend for all tasks.
            cached_celery_backend = tasks[0].backend

        if task_tuples_to_send:
            # Use chunks instead of a work queue to reduce context switching
            # since tasks are roughly uniform in size
            chunksize = self._num_tasks_per_send_process(
                len(task_tuples_to_send))
            num_processes = min(len(task_tuples_to_send),
                                self._sync_parallelism)

            send_pool = Pool(processes=num_processes)
            key_and_async_results = send_pool.map(send_task_to_executor,
                                                  task_tuples_to_send,
                                                  chunksize=chunksize)

            send_pool.close()
            send_pool.join()
            self.log.debug('Sent all tasks.')

            for key, command, result in key_and_async_results:
                if isinstance(result, ExceptionWithTraceback):
                    self.log.error(  # pylint: disable=logging-not-lazy
                        CELERY_SEND_ERR_MSG_HEADER + ":%s\n%s\n",
                        result.exception, result.traceback)
                elif result is not None:
                    # Only pops when enqueued successfully, otherwise keep it
                    # and expect scheduler loop to deal with it.
self.queued_tasks.pop(key) result.backend = cached_celery_backend self.running.add(key) self.tasks[key] = result self.last_state[key] = celery_states.PENDING def sync(self) -> None: num_processes = min(len(self.tasks), self._sync_parallelism) if num_processes == 0: self.log.debug("No task to query celery, skipping sync") return self.log.debug("Inquiring about %s celery task(s) using %s processes", len(self.tasks), num_processes) # Recreate the process pool each sync in case processes in the pool die self._sync_pool = Pool(processes=num_processes) # Use chunks instead of a work queue to reduce context switching since tasks are # roughly uniform in size chunksize = self._num_tasks_per_fetch_process() self.log.debug("Waiting for inquiries to complete...") task_keys_to_states = self._sync_pool.map(fetch_celery_task_state, self.tasks.items(), chunksize=chunksize) self._sync_pool.close() self._sync_pool.join() self.log.debug("Inquiries completed.") self.update_task_states(task_keys_to_states) def update_task_states( self, task_keys_to_states: List[Union[TaskInstanceStateType, ExceptionWithTraceback]] ) -> None: """Updates states of the tasks.""" for key_and_state in task_keys_to_states: if isinstance(key_and_state, ExceptionWithTraceback): self.log.error( # pylint: disable=logging-not-lazy CELERY_FETCH_ERR_MSG_HEADER + ", ignoring it:%s\n%s\n", repr(key_and_state.exception), key_and_state.traceback) continue key, state = key_and_state self.update_task_state(key, state) def update_task_state(self, key: TaskInstanceKeyType, state: str) -> None: """Updates state of a single task.""" # noinspection PyBroadException try: if self.last_state[key] != state: if state == celery_states.SUCCESS: self.success(key) del self.tasks[key] del self.last_state[key] elif state == celery_states.FAILURE: self.fail(key) del self.tasks[key] del self.last_state[key] elif state == celery_states.REVOKED: self.fail(key) del self.tasks[key] del self.last_state[key] else: self.log.info("Unexpected state: %s", state) self.last_state[key] = state except Exception: # pylint: disable=broad-except self.log.exception( "Error syncing the Celery executor, ignoring it.") def end(self, synchronous: bool = False) -> None: if synchronous: while any([ task.state not in celery_states.READY_STATES for task in self.tasks.values() ]): time.sleep(5) self.sync() def execute_async(self, key: TaskInstanceKeyType, command: CommandType, queue: Optional[str] = None, executor_config: Optional[Any] = None): """Do not allow async execution for Celery executor.""" raise AirflowException("No Async execution for Celery executor.") def terminate(self): pass
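# --- Added arithmetic check (assumed example, not from Airflow) ---
# The chunking helpers above divide work evenly across the sync processes:
# ceil(task_count / sync_parallelism), never below one task per chunk.
import math


def tasks_per_process(task_count, sync_parallelism):
    return max(1, int(math.ceil(1.0 * task_count / sync_parallelism)))


assert tasks_per_process(100, 8) == 13  # 8 processes, 13-task chunks
assert tasks_per_process(3, 8) == 1     # small queues still send one each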
def produce_bam_custom(kmers_trie, name, label, guides_filename, args,
                       offdist, maxoffcount, processes, n, parts):
    """Produce BAM file with guideRNA database.

    Run after all files and trie were generated by
    kmers.extract_process_kmers() and guides.analyze_guides()

    Produce files:
    sorted BAM file with off-target info:
        <name>/<name>_guides_<label>.bam
    index for the BAM file with off-target info:
        <name>/<name>_guides_<label>.bam.bai

    Args:
    kmers_trie: trie.trie object with all guideRNAs as produced by
                guides.analyze_guides()
    name: project name, used to get project args and in all output
    label: str, add it to file name of output database for this run
    guides_filename: name of file with all k-mers that are considered good
                     candidate guideRNAs, one per line; if file name ends
                     with .gz assume file is gzipped;
    args: arguments of the project, used to print some info in SAM header
    offdist: maximum Hamming distance to consider from guideRNA to its
             off-target; use -1 for omitting any off-target info in
             resulting BAM (works much faster); running time increases
             somewhat exponentially as this value increases; offdist=4 may
             be infeasible when running genome-wide analysis on mammalian
             genome
    maxoffcount: store at most this many off-targets for a guideRNA;
                 ignore if offdist is -1
    processes: int, how many processes to use in parallel; do not specify
               more than available in the system
    """
    guidesfiles = []
    # parts = 256
    tempdir = '%s%s' % (name, '/classifiedfiles/tempfiles')
    util.print_log('produce SAM files...')
    samfiles = ['%s/%s.sam' % (tempdir, i) for i in range(parts)]
    # samfiles = [tempfile.NamedTemporaryFile(dir=name, suffix='.sam%s' % i)
    #             for i in xrange(parts)]
    # util.print_log('store SAM in these files (gzipped): %s'
    #                % (', '.join([basename(f.name) for f in samfiles])))
    if isinstance(guides_filename, str):
        util.print_log('split %s in %s parts...' % (guides_filename, parts))
        guidesfiles = [tempfile.NamedTemporaryFile(dir=name,
                                                   suffix='.guides%s' % i)
                       for i in range(parts)]
        util.print_log('store guides in these files: %s'
                       % (', '.join([basename(f.name)
                                     for f in guidesfiles])))
        guidesfile = gzip.open(guides_filename) \
            if guides_filename.endswith('.gz') \
            else open(guides_filename)
        index_num = 0
        guidecount = 0
        for line in guidesfile:
            kmer1 = line.split()[0][0:n]
            index_num = guides.get_num(kmer1, n)
            guidesfiles[index_num].write(line)
            guidecount += 1
        guidesfile.close()
        for f in guidesfiles:
            f.flush()
        util.print_log('%s guideRNAs to process' % guidecount)
        util.print_log('done')

        process_list = []
        all_task = Queue()
        for i in range(parts):
            # the guides were split into guidesfiles above; samfiles hold
            # plain path strings, so no .name attribute is needed
            task = (guidesfiles[i].name, samfiles[i], i)
            all_task.put(task)
        for i in range(processes):
            p = Process(target=process_pool,
                        args=(all_task, kmers_trie, args, offdist,
                              maxoffcount, i, n, parts))
            p.start()
            process_list.append(p)
        for p in process_list:
            p.join()
        for i in range(parts):
            guidesfiles[i].close()
    else:
        process_list = []
        all_task = Queue()
        for i in range(parts):
            task = (guides_filename[i], samfiles[i], i)
            all_task.put(task)
        for i in range(processes):
            p = Process(target=process_pool,
                        args=(all_task, kmers_trie, args, offdist,
                              maxoffcount, i, n, parts))
            p.start()
            process_list.append(p)
        for p in process_list:
            p.join()

    util.print_log('produce sorted BAM files...')
    bamfiles = ['%s/%s.bam' % (tempdir, i) for i in range(parts)]
    # bamfiles = [tempfile.NamedTemporaryFile(dir=name, suffix='.bam%s' % i)
    #             for i in xrange(parts)]
    # util.print_log('store BAM in these files: %s'
    #                % (', '.join([basename(f.name) for f in bamfiles])))
    pool = Pool(processes)
    util.print_log('poolSize %s...' % processes)
    index = False
    for i in range(parts):
        pool.apply_async(sam_to_bam, (samfiles[i], bamfiles[i], index,))
    util.print_log('Waiting for all subprocesses done...')
    pool.close()
    pool.join()
    # for i in range(parts):
    #     samfiles[i].close()
    util.print_log('merge into one BAM file...')
    bamfile = '%s/%s_guides%s.bam' % (name, name,
                                      '_%s' % label if label else '')
    util.print_log('store in %s' % bamfile)
    util.warn_file_exists(bamfile)
    if parts > 1000:
        mid = parts // 2
        bamfiles_temp = [tempfile.NamedTemporaryFile(dir=name,
                                                     suffix='.bam%s' % i)
                         for i in range(2)]
        samtools_command1 = 'samtools merge -f %s %s' \
            % (bamfiles_temp[0].name, ' '.join(bamfiles[0:mid]))
        os.system(samtools_command1)
        samtools_command2 = 'samtools merge -f %s %s' \
            % (bamfiles_temp[1].name, ' '.join(bamfiles[mid:parts]))
        os.system(samtools_command2)
        samtools_command = 'samtools merge -f %s %s' \
            % (bamfile, ' '.join([f.name for f in bamfiles_temp]))
        os.system(samtools_command)
        for f in bamfiles_temp:
            f.close()
    else:
        samtools_command = 'samtools merge -f %s %s' \
            % (bamfile, ' '.join(bamfiles))
        # print(samtools_command)
        os.system(samtools_command)
    samtools_index_command = 'samtools index %s' % bamfile
    # print(samtools_index_command)
    os.system(samtools_index_command)
    util.print_log('done')
    # for i in range(parts):
    #     bamfiles[i].close()
    for i in range(parts):
        if os.path.exists(samfiles[i]):
            os.remove(samfiles[i])
        if os.path.exists(bamfiles[i]):
            os.remove(bamfiles[i])
    util.print_log('samtools version')
    samtools_version_command = 'samtools --version'
    print(samtools_version_command)
    os.system(samtools_version_command)
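# --- Added pattern sketch (hypothetical names, not from this module) ---
# produce_bam_custom() mixes two fan-out styles: a hand-rolled set of
# Process workers fed from a shared Queue (for SAM production), then a Pool
# of apply_async calls (for SAM->BAM conversion). A minimal version of the
# Queue-fed style:
from multiprocessing import Process, Queue
from queue import Empty


def worker(tasks):
    while True:
        try:
            item = tasks.get_nowait()
        except Empty:
            break
        print('processed part %s' % item)


if __name__ == '__main__':
    tasks = Queue()
    for i in range(8):
        tasks.put(i)
    procs = [Process(target=worker, args=(tasks,)) for _ in range(3)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()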
def eval_map(det_results,
             annotations,
             scale_ranges=None,
             iou_thr=0.5,
             dataset=None,
             logger=None,
             tpfp_fn=None,
             nproc=4):
    """Evaluate mAP of a dataset.

    Args:
        det_results (list[list]): [[cls1_det, cls2_det, ...], ...].
            The outer list indicates images, and the inner list indicates
            per-class detected bboxes.
        annotations (list[dict]): Ground truth annotations where each item of
            the list indicates an image. Keys of annotations are:

            - `bboxes`: numpy array of shape (n, 4)
            - `labels`: numpy array of shape (n, )
            - `bboxes_ignore` (optional): numpy array of shape (k, 4)
            - `labels_ignore` (optional): numpy array of shape (k, )
        scale_ranges (list[tuple] | None): Range of scales to be evaluated,
            in the format [(min1, max1), (min2, max2), ...]. A range of
            (32, 64) means the area range between (32**2, 64**2).
            Default: None.
        iou_thr (float): IoU threshold to be considered as matched.
            Default: 0.5.
        dataset (list[str] | str | None): Dataset name or dataset classes,
            there are minor differences in metrics for different datasets,
            e.g. "voc07", "imagenet_det", etc. Default: None.
        logger (logging.Logger | str | None): The way to print the mAP
            summary. See `mmdet.utils.print_log()` for details.
            Default: None.
        tpfp_fn (callable | None): The function used to determine true/
            false positives. If None, :func:`tpfp_default` is used as default
            unless dataset is 'det' or 'vid' (:func:`tpfp_imagenet` in this
            case). If it is given as a function, then this function is used
            to evaluate tp & fp. Default None.
        nproc (int): Processes used for computing TP and FP.
            Default: 4.

    Returns:
        tuple: (mAP, [dict, dict, ...])
    """
    assert len(det_results) == len(annotations)

    num_imgs = len(det_results)
    num_scales = len(scale_ranges) if scale_ranges is not None else 1
    num_classes = len(det_results[0])  # positive class num
    area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges]
                   if scale_ranges is not None else None)

    pool = Pool(nproc)
    eval_results = []
    for i in range(num_classes):
        # get gt and det bboxes of this class
        cls_dets, cls_gts, cls_gts_ignore = get_cls_results(
            det_results, annotations, i)
        # choose proper function according to datasets to compute tp and fp
        if tpfp_fn is None:
            if dataset in ['det', 'vid']:
                tpfp_fn = tpfp_imagenet
            else:
                tpfp_fn = tpfp_default
        if not callable(tpfp_fn):
            raise ValueError(
                f'tpfp_fn has to be a function or None, but got {tpfp_fn}')
        # compute tp and fp for each image with multiple processes
        tpfp = pool.starmap(
            tpfp_fn,
            zip(cls_dets, cls_gts, cls_gts_ignore,
                [iou_thr for _ in range(num_imgs)],
                [area_ranges for _ in range(num_imgs)]))
        tp, fp = tuple(zip(*tpfp))
        # calculate gt number of each scale
        # ignored gts or gts beyond the specific scale are not counted
        num_gts = np.zeros(num_scales, dtype=int)
        for j, bbox in enumerate(cls_gts):
            if area_ranges is None:
                num_gts[0] += bbox.shape[0]
            else:
                gt_areas = (bbox[:, 2] - bbox[:, 0]) * (
                    bbox[:, 3] - bbox[:, 1])
                for k, (min_area, max_area) in enumerate(area_ranges):
                    num_gts[k] += np.sum((gt_areas >= min_area)
                                         & (gt_areas < max_area))
        # sort all det bboxes by score, also sort tp and fp
        cls_dets = np.vstack(cls_dets)
        num_dets = cls_dets.shape[0]
        sort_inds = np.argsort(-cls_dets[:, -1])
        tp = np.hstack(tp)[:, sort_inds]
        fp = np.hstack(fp)[:, sort_inds]
        # calculate recall and precision with tp and fp
        tp = np.cumsum(tp, axis=1)
        fp = np.cumsum(fp, axis=1)
        eps = np.finfo(np.float32).eps
        recalls = tp / np.maximum(num_gts[:, np.newaxis], eps)
        precisions = tp / np.maximum((tp + fp), eps)
        # calculate AP
        if scale_ranges is None:
            recalls = recalls[0, :]
            precisions = precisions[0, :]
            num_gts = num_gts.item()
        mode = 'area' if dataset != 'voc07' else '11points'
        ap = average_precision(recalls, precisions, mode)
        eval_results.append({
            'num_gts': num_gts,
            'num_dets': num_dets,
            'recall': recalls,
            'precision': precisions,
            'ap': ap
        })
    pool.close()
    if scale_ranges is not None:
        # shape (num_classes, num_scales)
        all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results])
        all_num_gts = np.vstack(
            [cls_result['num_gts'] for cls_result in eval_results])
        mean_ap = []
        for i in range(num_scales):
            if np.any(all_num_gts[:, i] > 0):
                mean_ap.append(all_ap[all_num_gts[:, i] > 0, i].mean())
            else:
                mean_ap.append(0.0)
    else:
        aps = []
        for cls_result in eval_results:
            if cls_result['num_gts'] > 0:
                aps.append(cls_result['ap'])
        mean_ap = np.array(aps).mean().item() if aps else 0.0

    print_map_summary(
        mean_ap, eval_results, dataset, area_ranges, logger=logger)

    return mean_ap, eval_results
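# --- Added reduced sketch (hypothetical, not the mmdet API) ---
# The per-class starmap pattern above zips per-image inputs with shared
# parameters broadcast as repeated lists; each worker returns one image's
# (tp, fp) pair, which zip(*...) then transposes into two tuples.
import numpy as np
from multiprocessing import Pool


def toy_tpfp(dets, gts, iou_thr):
    # stand-in scoring: a detection "matches" when it clears the threshold
    tp = int(np.sum(dets > iou_thr))
    return tp, dets.size - tp


if __name__ == '__main__':
    num_imgs = 4
    cls_dets = [np.random.rand(5) for _ in range(num_imgs)]
    cls_gts = [np.random.rand(3) for _ in range(num_imgs)]
    with Pool(2) as pool:
        tpfp = pool.starmap(
            toy_tpfp,
            zip(cls_dets, cls_gts, [0.5 for _ in range(num_imgs)]))
    tp, fp = tuple(zip(*tpfp))
    print(tp, fp)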
q = Manager().Queue()
po = Pool()
lock = Manager().Lock()
args = parser.parse_args()
dir_list = get_img_dir_list(args.imgDir)
length = len(dir_list)
# split the directory list in half between the two GPUs
if 0 == args.gpus:
    start = 0
    end = length / 2
else:
    start = length / 2 - 1
    end = length
for i in range(start, end):
    q.put(dir_list[i])
print 'q.qsize=', q.qsize()
for i in range(0, 5):
    po.apply_async(task, args=(args, q, lock))
po.close()
po.join()
# NOTE: freeze_support() only matters in frozen Windows executables and is
# normally called first under __main__; placed here it is effectively a no-op.
multiprocessing.freeze_support()
print "Exiting Main Thread"