def process_folder(self):
    if self.in_folder.endswith(".txt"):
        paths_list = read_file(self.in_folder)
    else:
        paths_list, _ = traverse_dir_files(self.in_folder)
    print('[Info] sample count: {}'.format(len(paths_list)))
    random.seed(47)
    random.shuffle(paths_list)
    if len(paths_list) > self.max_num:
        paths_list = paths_list[:self.max_num]
    print('[Info] sample count: {}'.format(len(paths_list)))
    time_str = get_current_time_str()
    out_file = os.path.join(self.out_folder, "val_{}.txt".format(time_str))
    out_html = os.path.join(self.out_folder, "val_{}.html".format(time_str))
    pool = Pool(processes=100)
    for img_idx, img_path in enumerate(paths_list):
        if "rotation_datasets_hardcase" in img_path:  # filter out hardcases
            continue
        # ServiceTester.process_img_path(img_idx, img_path, self.service, out_file)
        pool.apply_async(ServiceTester.process_img_path, (img_idx, img_path, self.service, out_file))
    pool.close()
    pool.join()
    print('[Info] finished processing: {}'.format(out_file))
    data_lines = read_file(out_file)
    print('[Info] accuracy: {}'.format(
        safe_div(len(paths_list) - len(data_lines), len(paths_list))))
    out_list = []
    for data_line in data_lines:
        items = data_line.split("\t")
        out_list.append(items)
    make_html_page(out_html, out_list)
    print('[Info] finished processing: {}'.format(out_html))
def filter_checked_urls(self):
    in_dir = os.path.join(DATA_DIR, 'datasets_v4_checked_r_txt')
    out_dir = os.path.join(
        DATA_DIR, 'datasets_v4_checked_r_urls_out_{}'.format(get_current_time_str()))
    mkdir_if_not_exist(out_dir)
    paths_list, names_list = traverse_dir_files(in_dir)
    pool = Pool(processes=80)
    idx = 0
    for in_path, in_name in zip(paths_list, names_list):
        out_error_path = os.path.join(out_dir, '{}.error.txt'.format(in_name))
        out_right_path = os.path.join(out_dir, '{}.right.txt'.format(in_name))
        print('[Info] out_file: {} - {}'.format(out_error_path, out_right_path))
        data_lines = read_file(in_path)
        print('[Info] line count: {}'.format(len(data_lines)))
        for data_line in data_lines:
            url = data_line
            pool.apply_async(DatasetFilter.check_url, (idx, url, out_error_path, out_right_path))
            idx += 1
            if idx % 1000 == 0:
                print('[Info] idx: {}'.format(idx))
    pool.close()
    pool.join()
    print('[Info] finished processing: {}'.format(out_dir))
def run(num, method, args):
    p = Pool(num)
    for arg in args:
        p.apply_async(method, args=(arg, ))
    p.close()
    p.join()
    p.terminate()
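# A minimal, hypothetical usage sketch for the run() helper above. The square
# worker, the argument values, and the __main__ guard are illustrative
# assumptions, not part of the original: the worker must be a picklable,
# module-level callable, and the guard matters on platforms that spawn rather
# than fork (Windows, recent macOS).
def square(x):
    # Hypothetical worker body.
    print(x * x)


if __name__ == '__main__':
    # run() as defined above: 4 worker processes, one task per argument.
    run(4, square, range(10))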
class Mobile(Device):

    def __init__(self, device_id, address):
        super().__init__(device_id)
        self.address = address
        self.p = Pool(6)
        self.capture = None
        self.pre_time = time.time() * 1000

    def tap_button(self, button):
        cmd = "adb -s {:s} shell input tap {:d} {:d}".format(
            self.device_id, button[0], button[1])
        # args must be a tuple, not a set, so the argument order is defined
        self.p.apply_async(execute_cmd, args=(cmd,))

    def swipe(self, action):
        cmd = "adb -s {:s} shell input swipe {:d} {:d} {:d} {:d} 300".format(
            self.device_id, action[0], action[1], action[2], action[3])
        self.p.apply_async(execute_cmd, args=(cmd,))

    def get_frame(self):
        if self.capture is None:
            self.capture = cv2.VideoCapture(self.address)
        state, img = self.capture.read()
        if state:
            time_mill = time.time() * 1000
            if time_mill - self.pre_time >= 500:
                self.pre_time = time_mill
                return [cv2.resize(img, (540, 960)), 0]
            else:
                return [None, 0]
        else:
            self.capture.release()
            self.capture = None
            time.sleep(10)
            return [None, -1]
def process(self):
    data_dir = os.path.join(DATA_DIR, 'labeled_data')
    print('[Info] data folder: {}'.format(data_dir))
    out_dir = os.path.join(
        DATA_DIR, 'labeled_data_out_{}'.format(get_current_time_str()))
    print('[Info] output folder: {}'.format(out_dir))
    mkdir_if_not_exist(out_dir)
    paths_list, names_list = traverse_dir_files(data_dir)
    print('[Info] file count: {}'.format(len(paths_list)))
    out_file_format = os.path.join(out_dir, 'labeled_data_imgs_{}.txt')
    pool = Pool(processes=80)
    for path, name in zip(paths_list, names_list):
        name = name.split(".")[0]
        out_file = out_file_format.format(name)
        print('[Info] output file: {}'.format(out_file))
        data_lines = read_file(path)
        for idx, data_line in enumerate(data_lines):
            if idx == 0:
                continue
            # DataPrelabeled.process_line(out_file, idx, data_line)
            pool.apply_async(DataPrelabeled.process_line, (out_file, idx, data_line))
    pool.close()
    pool.join()
    print('[Info] finished processing: {}'.format(out_dir))
def get_img_href():
    """
    Ask the user where to save the files, then download the images with a process pool.
    :return: image download status
    """
    url = "http://www.cct58.com/mneinv/1.html"
    # head = main_head()
    save_path = input("Save files to: ")
    # Size the pool by CPU count if desired: os.cpu_count()
    # pool = Pool(os.cpu_count())
    pool = Pool(10)
    # Iterate over the URL of every page
    for href in get_every_page(url, main_head(), proxy):
        try:
            for person_mx27, title in get_every_person(href, main_head(), proxy):
                # print(person_mx27, title)  e.g. http://www.cct58.com/mneinv/19497/mx27/ 夏夏
                # downloads(person_mx27, head, proxy, save_path)
                pool.apply_async(downloads, args=(person_mx27, main_head(), proxy, save_path))
        except Exception:
            continue
    pool.close()
    pool.join()
def main(original_dir, output_dir, crf, thread_count=None, overwrite=False):
    """
    Compresses every MP4 video in a directory using the H.264 codec.

    Args:
        original_dir: Directory containing original videos.
        output_dir: Directory to output new videos to. Will be created if it
            does not exist.
        crf: Constant rate factor, i.e. the amount of compression to perform.
            A higher number means higher compression.
        thread_count: Number of worker processes to use. Set to None to choose
            automatically.
        overwrite: Passed to get_videos(); controls whether existing outputs
            are re-compressed.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    videos = get_videos(original_dir, output_dir, overwrite)
    print('Compressing {} videos...'.format(len(videos)))
    pool = Pool(thread_count)
    for video, output in videos:
        pool.apply_async(compress_worker, (video, output, crf))
    pool.close()
    pool.join()
    print('Done')
def check_all(self):
    func = self.check
    pool = Pool()
    for token in self.all():
        pool.apply_async(func, args=(token, ))
    # close() must come before join(); calling join() on a running pool raises ValueError
    pool.close()
    pool.join()
def process_folder(self, in_dir, out_dir):
    """
    Process a folder of images.
    """
    print('[Info] in_dir: {}'.format(in_dir))
    print('[Info] out_dir: {}'.format(out_dir))
    mkdir_if_not_exist(out_dir)
    paths_list, names_list = traverse_dir_files(in_dir)
    print('[Info] files to process: {}'.format(len(paths_list)))
    random.seed(47)
    paths_list, names_list = shuffle_two_list(paths_list, names_list)
    n_prc = 40
    pool = Pool(processes=n_prc)  # multi-process download
    for idx, (path, name) in enumerate(zip(paths_list, names_list)):
        pool.apply_async(DataProcessor.process_img, args=(path, name, out_dir))
        # DataProcessor.process_img(path, name, out_dir)
        if (idx + 1) % 1000 == 0:
            print('[Info] num: {}'.format(idx + 1))
    # multi-process bookkeeping
    pool.close()
    pool.join()
    print('[Info] done! {}'.format(out_dir))
    return
class QuantDslApplicationWithMultiprocessing(QuantDslApplication):

    def __init__(self, num_workers=None, call_evaluation_queue=None, **kwargs):
        if num_workers is not None:
            assert call_evaluation_queue is None
            # Parent.
            self.pool = Pool(processes=num_workers)
            self.manager = Manager()
            self.call_evaluation_queue = self.manager.Queue()
        else:
            # Child.
            self.pool = None

        super(QuantDslApplicationWithMultiprocessing, self).__init__(
            call_evaluation_queue=call_evaluation_queue, **kwargs)

        if self.pool:
            # Start worker pool.
            app_kwargs = self.get_subprocess_application_args()
            args = (self.manager.Lock(), self.__class__, app_kwargs)
            for i in range(num_workers):
                self.pool.apply_async(loop_on_evaluation_queue, args)

    def get_subprocess_application_args(self):
        app_kwargs = dict(
            call_evaluation_queue=self.call_evaluation_queue,
        )
        return app_kwargs

    def close(self):
        super(QuantDslApplicationWithMultiprocessing, self).close()
        if self.pool:
            self.pool.terminate()
def download_right_angle(self):
    files_dir = os.path.join(ROOT_DIR, '..', 'datasets', '2020_11_26_vpf_right')
    paths_list, names_list = traverse_dir_files(files_dir)
    pool = Pool(processes=80)
    for path, name in zip(paths_list, names_list):
        name_x = name.split('.')[0]
        urls_file = os.path.join(ROOT_DIR, '..', 'datasets', '2020_11_26_vpf_right',
                                 '{}.txt'.format(name_x))  # input
        out_dir = os.path.join(ROOT_DIR, '..', 'datasets', 'datasets_v4_checked',
                               'vpf_right', name_x)  # output
        mkdir_if_not_exist(out_dir)
        data_lines = read_file(urls_file)
        for idx, data_line in enumerate(data_lines):
            url, angle = data_line.split(',')
            pool.apply_async(DatasetFilter.process_img_angle, (idx, url, angle, out_dir))
    pool.close()
    pool.join()
    print('[Info] finished processing: {}'.format(files_dir))
def work(host, port, processes, threads, times):
    pool = Pool(processes,
                lambda: signal.signal(signal.SIGINT, signal.SIG_IGN))
    p = Process(target=progress)
    p.daemon = True
    start = time.time()
    try:
        for chunk in divide(times, processes):
            pool.apply_async(thread, (host, port, threads, chunk))
        p.start()
        pool.close()
        pool.join()
        p.terminate()
        p.join()
    except KeyboardInterrupt:
        pool.terminate()
        p.terminate()
        p.join()
        pool.join()
    return time.time() - start
def process(self):
    name = "nat_main_all_20211020"
    file_path = os.path.join(DATA_DIR, "{}.txt".format(name))
    print("[Info] input file: {}".format(file_path))
    out_file_path = os.path.join(
        DATA_DIR, "{}.out.{}.txt".format(name, get_current_time_str()))
    out_html_path = os.path.join(DATA_DIR, "{}.out.html".format(name))
    data_lines = read_file(file_path)
    random.seed(47)
    random.shuffle(data_lines)
    # data_lines = data_lines[:1000]
    print('[Info] sample count: {}'.format(len(data_lines)))
    pool = Pool(processes=20)
    for data_idx, data_line in enumerate(data_lines):
        # RoiChecker.process_line_roi(data_idx, data_line, out_file_path)
        pool.apply_async(RoiChecker.process_line_roi, (data_idx, data_line, out_file_path))
    pool.close()
    pool.join()
    print('[Info] finished writing: {}'.format(out_file_path))
    data_lines = read_file(out_file_path)
    items_list = []
    for data_line in data_lines:
        items_list.append(data_line.split("\t"))
    make_html_page(out_html_path, items_list)
    print('[Info] finished writing: {}'.format(out_html_path))
class _MultiExecutor(_Executor):
    """Execute functions async in a process pool"""

    def __init__(self):
        super(_MultiExecutor, self).__init__()
        self._children = 0
        self.pool = Pool()

    def _collector(self, result):
        super(_MultiExecutor, self)._collector(result)
        self._children -= 1

    def execute(self, func, args):
        self._children += 1
        self.pool.apply_async(func, args, callback=self._collector)

    def wait_for_results(self):
        self.pool.close()
        # One would have hoped joining the pool would take care of this, but
        # apparently you need to first make sure that all your launched tasks
        # have returned their results properly before calling join, or you
        # risk a deadlock.
        while self._children > 0:
            time.sleep(0.001)
        self.pool.join()
def run():
    p = Pool(5)  # at most 5 worker processes run at a time
    for i in range(10000):
        p.apply_async(fun, args=(i,))
    p.close()
    p.join()  # wait for all child processes to finish before continuing
    print("end")
def process_v2(self):
    val_folder = os.path.join(ROOT_DIR, '..', 'datasets', 'datasets_val')
    data_lines, _ = traverse_dir_files(val_folder)
    print('[Info] sample count: {}'.format(len(data_lines)))
    type_name = "val"
    folder_name = "dataset_{}_{}".format(type_name, len(data_lines))
    dataset_folder = os.path.join(self.out_ds_folder, folder_name)
    mkdir_if_not_exist(dataset_folder)
    print('[Info] output folder path: {}'.format(dataset_folder))
    mkdir_if_not_exist(self.out_files_folder)
    out_path_file = os.path.join(self.out_files_folder, "{}.txt".format(folder_name))
    print('[Info] output file path: {}'.format(out_path_file))
    pool = Pool(processes=100)
    for data_idx, data_line in enumerate(data_lines):
        pool.apply_async(
            DatasetReorder.copy_line_mul,
            (data_idx, data_line, type_name, dataset_folder, out_path_file))
    pool.close()
    pool.join()
    path_list = read_file(out_path_file)
    print('[Info] output paths: {}, sample count: {}'.format(len(path_list), len(data_lines)))
    print('[Info] finished processing: {}'.format(out_path_file))
def process(self):
    data_dir = os.path.join(DATA_DIR, '2020_12_23')
    print('[Info] data folder: {}'.format(data_dir))
    out_file = os.path.join(DATA_DIR, '2020_12_23.txt')
    print('[Info] output file: {}'.format(out_file))
    paths_list, names_list = traverse_dir_files(data_dir)
    print('[Info] file count: {}'.format(len(paths_list)))
    data_lines = []
    for path, name in zip(paths_list, names_list):
        sub_lines = read_file(path)
        data_lines += sub_lines
    print('[Info] line count: {}'.format(len(data_lines)))
    random.seed(47)
    random.shuffle(data_lines)
    pool = Pool(processes=80)
    for idx, data_line in enumerate(data_lines):
        # DataPreprocess.process_line(idx, data_line, out_file)
        pool.apply_async(DataPreprocess.process_line, (idx, data_line, out_file))
    pool.close()
    pool.join()
    print('[Info] finished processing: {}'.format(out_file))
def write_sub_bam(chrom_list, used_bam_file_tmp, exclude_bam_file_tmp, out_dir,
                  total_modify_reads_file, total_delete_reads_file,
                  total_add_reads_file, process):
    write_bam_pool = Pool(int(process))
    exclude_bam_list = [exclude_bam_file_tmp]
    usedBamList = []
    for chrom in chrom_list:
        excludeBam_chr = "%s/exclude_%s.bam" % (out_dir, chrom)
        exclude_bam_list.append(excludeBam_chr)
        usedBam_chr = "%s/used_%s.bam" % (out_dir, chrom)
        usedBamList.append(usedBam_chr)
        write_bam_pool.apply_async(
            write_bam_byChr,
            args=(used_bam_file_tmp, chrom, excludeBam_chr, usedBam_chr,
                  total_modify_reads_file, total_delete_reads_file,
                  total_add_reads_file))
    write_bam_pool.close()
    write_bam_pool.join()

    exclude_bam_file = os.path.join(out_dir, "exclude.bam")
    bamMerge(exclude_bam_list, exclude_bam_file)

    used_bam_file = os.path.join(out_dir, "used.bam")
    if len(usedBamList) != 1:
        bamMerge(usedBamList, used_bam_file)
    else:
        used_bam_file = usedBamList[0]
    bamSort(used_bam_file, os.path.join(out_dir, "used.sort"))
    used_sort_bam_file = os.path.join(out_dir, "used.sort.bam")
    bamIndex(used_sort_bam_file)
    return used_sort_bam_file, exclude_bam_file
def insert_from_log(retry=3, docs=None):
    """
    retry: number of extra passes. If any task fails during a pass, another
    pass is started and the counter is decremented until it reaches 0.
    A negative value means retry until every task succeeds.
    """
    from multiprocessing.pool import Pool
    import multiprocessing

    retry -= 1
    if docs is None:
        docs = find()
    pool = Pool(None, initializer, ({name: globals()[name] for name in VARIABLES},))
    for doc in docs:
        pool.apply_async(handle_doc, kwds=doc, error_callback=on_error)
    try:
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        pool.terminate()
        return
    docs = list(find())
    logging.warning("Unfilled: %s", len(docs))
    if retry:
        logging.warning("retry | %s", retry)
        if docs:
            insert_from_log(retry, docs)
def read_group_posts(history_files, group_dict):
    """
    Each user is represented by a list of strings which are their posts.
    The posts go through preprocessing.
    :param history_files: a list of paths to group Twitter history files
    :param group_dict: group dictionary to insert the preprocessed posts into
    :return: List of lists representing a list of user posts
    """
    posts = manager.list()

    def update(*args):
        pbar.update()

    last_index = 0
    for hist_file in history_files:
        with open(hist_file) as f:
            hist = json.load(f)
        pool = Pool(processes=6, initializer=init, initargs=(posts, group_dict))
        indices = range(last_index, last_index + len(hist))
        pbar = tqdm(zip(indices, hist.items()), total=len(hist.items()),
                    desc='Reading {}'.format(hist_file))
        for item in zip(indices, hist.items()):
            pool.apply_async(get_posts, args=(item, ), callback=update)
            last_index = item[0] + 1
        pool.close()
        pool.join()
        pbar.close()
    return posts
def main():
    args = dict(enumerate(sys.argv))
    day = args.get(1) or datetime.now().strftime("%Y%m%d")
    if len(day) != 8:
        raise Exception
    print('date={0}'.format(day))
    print('Start dumper...')
    goods = 'goods_{0}'.format(day)
    new = 'new_goods_{0}'.format(day)
    hot = 'hot_goods_{0}'.format(day)
    shop = 'shop_statistics_{0}'.format(day)
    detail = 'd_{0}'.format(day)
    monitor = 'm_{0}'.format(day)
    # todo make the output directory configurable
    # todo make the output directory work on both Linux and Windows
    pool = Pool()
    pool.apply_async(func=dump_mongodb, args=(detail, ))
    pool.apply_async(func=dump_mongodb, args=(monitor, ))
    pool.apply_async(func=dump_mysql, args=(goods, ))
    pool.apply_async(func=dump_mysql, args=(new, ))
    pool.apply_async(func=dump_mysql, args=(hot, ))
    pool.apply_async(func=dump_mysql, args=(shop, ))
    pool.close()
    pool.join()
class RVWorkerPool(object):

    def __init__(self, processes=None):
        self.pool = Pool(processes)
        self.pipe_target = os.cpu_count()
        self.in_pipe = 0
        self.jobs = []

    def registerJob(self, func, args, ref=None, priority=1):
        job = JobRef(func, args, ref, priority)
        # Replace duplicate-source unstarted jobs
        if ref is not None:
            self.jobs = [j for j in self.jobs if j.ref != ref or j.started()]
        self.jobs.append(job)
        if self.in_pipe < self.pipe_target:
            job.jobStarted(self.pool.apply_async(func, args))
            self.in_pipe += 1
        return job

    def poll(self):
        done = []
        sorted_jobs = sorted(self.jobs, key=lambda j: j.priority)
        for job in sorted_jobs:
            if job.started():
                # Check if it's finished
                if job.ready():
                    self.jobs.remove(job)
                    self.in_pipe -= 1
            else:
                if self.in_pipe < self.pipe_target:
                    job.jobStarted(self.pool.apply_async(job.func, job.args))
                    self.in_pipe += 1

    def terminate(self):
        self.pool.terminate()
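# The JobRef class used by RVWorkerPool is not shown above. A minimal,
# hypothetical sketch consistent with how registerJob() and poll() use it
# (func/args/ref/priority attributes plus started(), ready(), jobStarted())
# might look like this; it is an assumption, not the original implementation.
class JobRef(object):
    """Hypothetical job handle wrapping a Pool AsyncResult."""

    def __init__(self, func, args, ref=None, priority=1):
        self.func = func
        self.args = args
        self.ref = ref
        self.priority = priority
        self._async_result = None  # set once the job is submitted to the pool

    def jobStarted(self, async_result):
        # Record the AsyncResult returned by Pool.apply_async().
        self._async_result = async_result

    def started(self):
        return self._async_result is not None

    def ready(self):
        # Only meaningful after the job has started.
        return self._async_result is not None and self._async_result.ready()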
def diff_pro(dbname, tablename):
    "difference between slave and master"
    if is_sigint_up:
        os._exit(0)
    pool = Pool(processes=2)
    result = {}
    result['slave'] = pool.apply_async(table_count, args=(myhost, myport, myuser, mypassword, dbname, tablename))
    result['master'] = pool.apply_async(table_count, args=(mahost, maport, mauser, mapassword, dbname, tablename))
    pool.close()
    pool.join()
    slcount = result['slave'].get()['count']
    slstart = result['slave'].get()['start']
    slend = result['slave'].get()['end']
    slengine = result['slave'].get()['engine']
    sldelay = result['slave'].get()['delay']
    macount = result['master'].get()['count']
    mastart = result['master'].get()['start']
    maend = result['master'].get()['end']
    maengine = result['master'].get()['engine']
    dbtable = '%s.%s' % (dbname, tablename)
    if slcount == -2 or macount == -2:
        mark = 'error'
    elif slcount == -1 or macount == -1:
        mark = '*'
    else:
        mark = macount - slcount
    msg = "%-40s |%-9s %-9s|%-8s %-8s |%-8s %-8s |%-7s|%-15s %-15s |%-10s" % (
        dbtable, maengine, slengine, mastart, slstart, maend, slend, sldelay,
        macount, slcount, mark)
    print msg
    result.clear()
def compress_batch(input_folder, output_folder, target_compression,
                   recursive=True, threads=1, follow_symlinks=True):
    # We require an absolute input path so we can replicate the data structure relative to it later on
    input_folder = os.path.abspath(input_folder)

    file_list = get_fast5_file_list(input_folder, recursive, follow_symlinks=follow_symlinks)
    if len(file_list) == 0:
        raise ValueError("No input fast5 files found in '{}'. Recursive={}".format(input_folder, recursive))

    # Set up the process pool and the progressbar
    pool = Pool(min(threads, len(file_list)))
    pbar = get_progress_bar(len(file_list))

    def update(result):
        pbar.update(pbar.currval + 1)

    for input_file in file_list:
        input_path = os.path.join(input_folder, input_file)
        output_path = os.path.join(output_folder, os.path.relpath(input_path, input_folder))
        pool.apply_async(func=compress_file,
                         args=(input_path, output_path, target_compression),
                         callback=update)

    # Tear down the process pool and pbar. We can't use contextmanagers since we need to close() then join()
    pool.close()
    pool.join()
    pbar.finish()
def test_queue_and_thread():
    strategy_name = parse_args()
    manager = Manager()
    in_queue = manager.Queue()
    out_queue = manager.Queue()
    # monitor([in_queue, out_queue])
    pool = Pool(processes=3)
    results = [
        pool.apply_async(start_engine, (in_queue, out_queue, strategy_name)),
        pool.apply_async(start_feed, (in_queue, )),
        pool.apply_async(start_executor, (out_queue, ))
    ]
    try:
        for r in results:
            r.get()
    except KeyboardInterrupt:
        print("Keyboard interruption in main thread")
    finally:
        print("Cleaning main thread")
def fill_views(self):
    client = Client('hscic')
    pool = Pool(processes=len(self.view_paths))
    tables = []

    prescribing_date = ImportLog.objects.latest_in_category(
        'prescribing').current_at.strftime('%Y-%m-%d')

    for path in self.view_paths:
        table_name = "vw__%s" % os.path.basename(path).replace('.sql', '')
        table = client.get_table(table_name)
        tables.append(table)

        with open(path) as f:
            sql = f.read()

        substitutions = {'this_month': prescribing_date}
        args = [table.name, sql, substitutions]
        pool.apply_async(query_and_export, args)

    pool.close()
    pool.join()  # wait for all worker processes to exit

    for table in tables:
        self.download_and_import(table)
        self.log("-------------")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--boosted', action='store_true')
    args = parser.parse_args()
    boosted = args.boosted

    if boosted:
        pool = Pool(16)
        session, old_ip = renew_connection('')
        old_ip = ''
    else:
        session = new_session()

    req = Request('GET', CURLIE)
    prepared_req = session.prepare_request(req)
    resp = session.send(prepared_req, timeout=12)
    soup = BeautifulSoup(resp.text, 'html.parser')

    print('id,parent_id,name')
    cats = soup.select('.top-cat > a')
    for cat in cats:
        print('"' + cat['href'] + '","","' + cat.text + '"')
        if boosted:
            session, old_ip = renew_connection(old_ip)
            pool.apply_async(get_nodes, args=[cat['href'], '- ', session])
        else:
            get_nodes(cat['href'], session)
            time.sleep(1)

    if boosted:
        pool.close()
        pool.join()
def download_imgs_for_mp(img_file, out_folder, prefix=None, n_prc=10):
    """
    Multi-process download.
    :param img_file: file listing the image URLs
    :param out_folder: output folder
    :param prefix: image filename prefix
    :param n_prc: number of processes, 10 by default
    :return: None
    """
    print_info('process count: %s' % n_prc)
    pool = Pool(processes=n_prc)  # multi-process download

    paths_list = read_file(img_file)
    print_info('file count: %s' % len(paths_list))

    _, imgs_names = traverse_dir_files(out_folder)

    for (index, path) in enumerate(paths_list):
        if prefix:
            pool.apply_async(download_img, (path, out_folder, imgs_names, prefix + '_' + str(index) + '.jpg'))
        else:
            pool.apply_async(download_img, (path, out_folder, imgs_names))

    pool.close()
    pool.join()

    # _, imgs_names = traverse_dir_files(out_folder)
    # print_info('image count: %s' % len(imgs_names))
    print_info('all downloads finished')
def merge_hardcase(self, file_list, type_name):
    data_lines = []
    for file_name in file_list:
        file_path = os.path.join(self.folder, file_name)
        sub_lines = read_file(file_path)
        data_lines += sub_lines
    print('[Info] line count: {}'.format(len(data_lines)))
    folder_name = "dataset_{}_{}".format(type_name, len(data_lines))
    dataset_folder = os.path.join(self.out_ds_folder, folder_name)
    mkdir_if_not_exist(dataset_folder)
    print('[Info] output folder path: {}'.format(dataset_folder))
    mkdir_if_not_exist(self.out_files_folder)
    out_path_file = os.path.join(self.out_files_folder, "{}.txt".format(folder_name))
    print('[Info] output file path: {}'.format(out_path_file))
    pool = Pool(processes=100)
    for data_idx, data_line in enumerate(data_lines):
        pool.apply_async(
            DatasetReorder.copy_line_mul,
            (data_idx, data_line, type_name, dataset_folder, out_path_file))
    pool.close()
    pool.join()
    path_list = read_file(out_path_file)
    print('[Info] output paths: {}, sample count: {}'.format(len(path_list), len(data_lines)))
    print('[Info] finished processing: {}'.format(out_path_file))
def process_pool():
    pool = Pool(processes=3)
    for i in range(6):
        pool.apply_async(func)
    pool.close()
    pool.join()
def fill_views(self):
    client = Client('hscic')
    pool = Pool(processes=len(self.view_paths))
    tables = []

    prescribing_date = ImportLog.objects.latest_in_category(
        'prescribing').current_at.strftime('%Y-%m-%d')

    for path in self.view_paths:
        table_name = "vw__%s" % os.path.basename(path).replace('.sql', '')
        table = client.get_table(table_name)
        tables.append(table)

        with open(path) as f:
            sql = f.read()

        substitutions = {'this_month': prescribing_date}
        args = [table.table_id, sql, substitutions]
        pool.apply_async(query_and_export, args)

    pool.close()
    pool.join()  # wait for all worker processes to exit

    for table in tables:
        self.download_and_import(table)
        self.log("-------------")

    with connection.cursor() as cursor:
        for view_id in self.materialized_views:
            # This is quite slow! up to 10 mins.
            cursor.execute("REFRESH MATERIALIZED VIEW %s" % view_id)
def main():
    print('Process (%s) start...' % os.getpid())
    p = Pool()
    for i in range(4):
        p.apply_async(long_time_task, args=(i,))
    print('Waiting for all subprocesses done...')
    p.close()
    p.join()
    print('All subprocesses done.')
class TcpController(object):

    def __init__(self, handlers):
        self.handlers = handlers
        self.workers = Pool(MAX_PROCESS_POOL_SIZE)

    def process(self, header, body):
        self.workers.apply_async(wrap, (self.handlers[header](), body,))

    def destroy(self):
        self.handlers = None
        self.workers.close()
def _get(self, args):
    draft_id = args[0]
    id = args[1] if len(args) > 1 else None
    q = self.db.query(Player)

    if id is not None:
        player = q.filter(Player.id == int(id)).first()
        team = self.db.query(Team).filter(and_(Team.is_owner == True,
                                               Team.draft_id == draft_id)).first()
        available_players = self.db.query(Player).join(Player.core).filter(
            and_(PlayerCore.rank != None,
                 PlayerCore.target_price != None,
                 PlayerCore.points > 0,
                 Player.draft_id == draft_id,
                 Player.team_id == None,
                 Player.id != player.id)).order_by(PlayerCore.rank).all()

        min_price = 1
        max_price = min(player.core.target_price + 21, team.money)

        manager = Manager()
        max_starters_points = manager.dict()
        max_bench_points = manager.dict()
        pool = Pool(processes=8)

        starters, bench = get_starters_and_bench(self.db, team.id)
        max_starters_points[0] = optimizer.optimize_roster(
            starters, available_players,
            team.money - (constants.BENCH_SIZE - len(bench)))[1]

        for m in range(min_price, 10):
            pool.apply_async(wrap_optimizer,
                             args=(starters, available_players,
                                   team.money - m - (constants.BENCH_SIZE - len(bench)) + 1,
                                   max_bench_points, m))

        full_starters = True
        for s in starters:
            if s is None:
                full_starters = False
        if not full_starters:
            starters_clone = list(starters)
            bench_clone = list(bench)
            place_player(player, starters_clone, bench_clone)
            for m in range(min_price, max_price):
                pool.apply_async(wrap_optimizer,
                                 args=(starters_clone, available_players,
                                       team.money - m - (constants.BENCH_SIZE - len(bench_clone)),
                                       max_starters_points, m))

        pool.close()
        pool.join()

        ret = player.to_dict(['core'])
        ret['max_starters_points'] = dict(max_starters_points)
        ret['max_bench_points'] = dict(max_bench_points)
        return ret
    else:
        players = q.join(PlayerCore).filter(
            and_(Player.draft_id == int(draft_id),
                 PlayerCore.rank != None,
                 PlayerCore.target_price != None)).all()
        return {'players': [p.to_dict(['core']) for p in players]}
def main():
    """
    Build all the models. Spin off a new process for each participant because
    the ANN library is not multithreaded. Process is used instead of thread to
    leverage multiple cores.
    """
    parser = ArgumentParser()
    parser.add_argument("inputFilename")
    parser.add_argument("outputDirectory")

    args = parser.parse_args()
    inputFilename = args.inputFilename
    outputDirectory = args.outputDirectory

    data = pickle.load(open(inputFilename, 'rb'))

    tasks = ['matb', 'rantask']
    participantIds = ['001', '002', '003', '004', '005', '006', '007']

    # Cut off first row header for each data set
    for task in tasks:
        for participantId in participantIds:
            data[participantId][task] = data[participantId][task][1:]

    splits = performSplit(data)

    # Record start time so that the elapsed time can be determined
    start_time = time.time()

    # Create a multicore processing pool with 7 processes (7 so that one core
    # stays free for system processes)
    pool = Pool(processes=7)

    # Build models for participants in a task
    for task in tasks:
        for participantId in participantIds:
            outputFilename = path.join(outputDirectory,
                                       'testingOn-' + participantId + '-' + task + '.txt')
            # Spin off a process for the building
            pool.apply_async(tuneANN, (splits[participantId][task], outputFilename))

    # Close down the pool so that we can wait on all the processes
    pool.close()
    pool.join()

    # Calculate and print the elapsed time
    elapsed_time = time.time() - start_time
    print("Elapsed time: " + str(elapsed_time))
def stat_volume(stime, etime):
    tgsinfo = read_tgs_info()

    # from multiprocessing.dummy import Pool as ThreadPool
    from multiprocessing.pool import Pool
    pool = Pool()
    volume = [pool.apply_async(stat_tgs_volume, args=(stime, etime, int(cid)))
              for cid in tgsinfo.keys()]
    pool.close()

    print 'waiting to join....'
    pool.join()
    print 'start writing to file...'

    volume0 = []
    for i, elem in enumerate(volume):
        volume0.append((tgsinfo.keys()[i], elem.get()))
    volume0.sort(key=lambda x: x[1], reverse=True)

    total = 0
    with open(os.path.join(root_dir, "result", "volume.txt"), "w") as f:
        for i, elem in enumerate(volume0):
            # cid = tgsinfo.keys()[i]
            # vol = elem.get()
            total += elem[1]
            line = "%5s,%s: %d\n" % (elem[0], tgsinfo[elem[0]]['kkmc'], elem[1])
            f.write(line)

    print 'totally %d records.' % (total)
def manager_process(dir_queue, file_queue, out_queue):
    """Dispatches and manages path and scanning workers."""
    pool = Pool(options.num_threads)
    atexit.register(at_exit_manager, pool)
    logging.info('Gathering Files...')
    pool.apply(explore_path, (dir_queue, file_queue))
    logging.info('Files gathered. Scanning %s files...', file_queue.qsize())
    logging.info('Starting %s scan processes...', options.num_threads)
    print '~' * 80
    thread.start_new_thread(print_status, (file_queue,))
    for _ in range(options.num_threads):
        pool.apply_async(parallel_scan, (file_queue, out_queue))
    pool.close()
    pool.join()
    out_queue.put(StopIteration)
def run_jar_new_thread(config_path, **kwargs):
    """
    kwargs - arguments dictionary of run_jar()
    """
    pool = Pool(processes=1)
    # __import__("ipdb").set_trace()  # leftover debugging breakpoint, disabled
    result = pool.apply_async(_run_jar_with_config, [config_path], kwargs['kwargs'])
    job_id = result.get()
    return job_id
def run(self):
    if self.fileName[0] is not None and self.fileName[0] != "":
        # p = Process(target=self.creationPDF, args=(self.fileName[0],))
        # self.creationPDF(self.fileName[0])
        # p.start()
        pool = Pool(processes=4)  # start 4 worker processes
        result = pool.apply_async(self.creationPDF, [self.fileName[0]])
    else:
        print("Save cancelled")
def multiprocess_all_chromosomes(func, cls, *args, **kwargs):
    '''
    Convenience method for splitting up queries based on tag id.
    '''
    processes = current_settings.ALLOWED_PROCESSES

    set_chromosome_lists(cls, use_table=kwargs.get('use_table', None))
    p = Pool(processes)

    try:
        for chr_list in current_settings.CHR_LISTS:
            p.apply_async(func, args=[cls, chr_list, ] + list(args))
        p.close()
        p.join()
    except Exception as e:
        print('Terminating pool.')
        p.terminate()
        raise e
def create_execution_pool():
    global execution_pool

    pool_size = engine.app.config['POOL_SIZE']
    execution_pool = Pool(pool_size, initializer=initialize_worker)

    futures = []
    for i in xrange(pool_size * 2):
        futures.append(execution_pool.apply_async(connect_worker))

    [f.get() for f in futures]
def main():
    queue_logger = setup_redirection()
    queue_logger.write("ABCDEF\n")
    try:
        p = Pool(10)
        results = [p.apply_async(some_process_body) for i in xrange(20)]
        [result.get() for result in results]
        p.close()
    finally:
        queue_logger.stop()
def _transport_backup_parallel(data, filename, aws_key, aws_secret, bucketname):
    """
    Parallel multipart upload.
    """
    headers = {}
    _logger.info('Backing up via S3 parallel multipart upload agent')
    keyname = filename

    tempInFile = NamedTemporaryFile(suffix='.zip', prefix='db-backup-', delete=False)
    tempInFile.write(data)
    tempInFile.close()
    source_path = tempInFile.name
    source_size = os.stat(source_path).st_size
    parallel_processes = (multiprocessing.cpu_count() * 2) + 1

    conn = boto.connect_s3(aws_key, aws_secret)
    bucket = conn.get_bucket(bucketname)

    mtype = 'application/zip, application/octet-stream'
    headers.update({'Content-Type': mtype})

    mp = bucket.initiate_multipart_upload(keyname, headers=headers)

    bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)), 5242880)
    chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))

    pool = Pool(processes=parallel_processes)
    for i in range(chunk_amount):
        offset = i * bytes_per_chunk
        remaining_bytes = source_size - offset
        bytes = min([bytes_per_chunk, remaining_bytes])
        part_num = i + 1
        pool.apply_async(_upload_part, [bucketname, aws_key, aws_secret, mp.id,
                                        part_num, source_path, offset, bytes])
    pool.close()
    pool.join()

    if len(mp.get_all_parts()) == chunk_amount:
        mp.complete_upload()
    else:
        mp.cancel_upload()

    os.unlink(tempInFile.name)
    _logger.info('Data successfully backed up to s3')
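# The _upload_part worker is not shown above. In the widely circulated boto2
# parallel-multipart recipe that this snippet appears to follow, the worker
# usually looks roughly like the sketch below. This is an assumption, not the
# original implementation, and it assumes the third-party filechunkio package.
import boto
from filechunkio import FileChunkIO  # assumed dependency, not shown in the original


def _upload_part(bucketname, aws_key, aws_secret, multipart_id, part_num,
                 source_path, offset, num_bytes):
    """Hypothetical worker: upload one chunk of the source file as one part."""
    conn = boto.connect_s3(aws_key, aws_secret)
    bucket = conn.get_bucket(bucketname)
    # Find the in-progress multipart upload by its id.
    for mp in bucket.get_all_multipart_uploads():
        if mp.id == multipart_id:
            with FileChunkIO(source_path, 'r', offset=offset, bytes=num_bytes) as fp:
                mp.upload_part_from_file(fp=fp, part_num=part_num)
            break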
def process(configuration: LogParserConfig, db: DatabaseConnection, process_count: int):
    influxdb_client = get_client(configuration)
    pool = Pool(process_count)
    currently_processing = defaultdict(list)
    dl_dir = os.path.join(db.root_dir, 'data')
    while True:
        logging.info('Checking for new files to process')
        settings = db.get_settings()
        new_files = get_new_files_to_process(configuration.buckets, settings)
        settings.files.extend(new_files)
        timestamp = int(time.time())
        completed_files = defaultdict(list)

        # Empty queue of finished work and create lists of all completed files per bucket
        while True:
            if finished_queue.empty():
                break
            success, bucket, filename = finished_queue.get()
            currently_processing[bucket].remove(filename)
            if success:
                completed_files[bucket].append(filename)

        # Set processed timestamp on processed files
        for file in settings.files:
            if file.name in completed_files[file.bucket]:
                file.processed_timestamp = timestamp

        logging.info('%d files completed processing since last loop',
                     sum(len(l) for l in completed_files.values()))
        completed_files.clear()
        db.save_settings(settings)

        added = 0
        for file in settings.files:
            if file.processed_timestamp is None and file.name not in currently_processing[file.bucket]:
                currently_processing[file.bucket].append(file.name)
                pool.apply_async(process_file,
                                 (file.bucket, file.name, dl_dir, influxdb_client),
                                 {}, after_processed, after_error)
                added += 1

        processing_count = sum(len(l) for l in currently_processing.values())
        if added:
            logging.info('Added %s files to pool, %s files currently processing.',
                         added, processing_count)
        else:
            logging.info('Nothing new to process, sleeping %s seconds. '
                         '%s files currently in queue to be processed.',
                         configuration.interval, processing_count)
            time.sleep(configuration.interval)
            continue
def start(self):
    print 'Start Speech Download Service...'
    language = self._language
    download_dir = self.setup_download_dir()

    with open(self._text_file_path, 'r') as f:
        # python 2.7 does not support using Pool() as a context manager
        pool = Pool(processes=8)
        progress = Progress(len(f.readlines()))
        f.seek(0)
        results = []

        def update_progress(result):
            results.append(result)
            progress.update(len(results))

        for text in f:
            text = text.rstrip('\n')
            encoded_args = urlencode({
                'hl': language,
                'src': text,
                'key': config.API_KEY
            })
            url = config.SPEECH_URL + encoded_args
            download_path = os.path.join(download_dir, text + config.DOWNLOAD_FILE_TYPE)
            downloader = SpeechDownloader(url, download_path)
            # Pool().apply() blocks until the process is finished
            # pool.apply(downloader.download)
            pool.apply_async(downloader.download, callback=update_progress)

        # Prevents any more tasks from being submitted to the pool
        pool.close()
        # Wait for the worker processes to exit
        pool.join()

    print 'Done, Downloaded %d Speeches' % len(results)
def create_dictionary(filelist, output, tagmap, delimeter='/'):
    """
    Create a dictionary out of slashtag-based files.

    :param filelist: List of file paths
    :type filelist: list[str]
    :param output: output file path
    :type output: str
    :param tagmap: Optional
    :type tagmap: TagMap
    """
    c = POSEvalDict()
    counts = {'tokens': 0, 'lines': 0}

    def merge_counts(result):
        d, cur_tokencount, cur_linecount = result
        c.combine(d)
        counts['tokens'] += cur_tokencount
        counts['lines'] += cur_linecount

    tm = None
    if tagmap:
        tm = TagMap(tagmap)

    # Initialize multiprocessing...
    p = Pool(cpu_count())

    for path in filelist:
        p.apply_async(process_file, args=[path, tm, delimeter], callback=merge_counts)
        # result = p.apply(process_file, args=[path, tm, delimeter])
        # merge_counts(result)

    p.close()
    p.join()

    # Now, dump the pickled POSEvalDict.
    print("Writing out dictionary...", end=' ')
    pickle.dump(c, open(output, 'wb'))
    print("Done.")

    print("{} tokens processed, {} sentences.".format(counts['tokens'], counts['lines']))
class MultiProcPlugin(DistributedPluginBase):
    """Execute workflow with multiprocessing

    The plugin_args input to run can be used to control the multiprocessing
    execution. Currently supported options are:

    - n_procs : number of processes to use
    - non_daemon : boolean flag to execute as non-daemon processes
    """

    def __init__(self, plugin_args=None):
        super(MultiProcPlugin, self).__init__(plugin_args=plugin_args)
        self._taskresult = {}
        self._taskid = 0
        n_procs = 1
        non_daemon = False
        if plugin_args:
            if 'n_procs' in plugin_args:
                n_procs = plugin_args['n_procs']
            if 'non_daemon' in plugin_args:
                non_daemon = plugin_args['non_daemon']
        if non_daemon:
            # run the execution using the non-daemon pool subclass
            self.pool = NonDaemonPool(processes=n_procs)
        else:
            self.pool = Pool(processes=n_procs)

    def _get_result(self, taskid):
        if taskid not in self._taskresult:
            raise RuntimeError('Multiproc task %d not found' % taskid)
        if not self._taskresult[taskid].ready():
            return None
        return self._taskresult[taskid].get()

    def _submit_job(self, node, updatehash=False):
        self._taskid += 1
        self._taskresult[self._taskid] = self.pool.apply_async(run_node,
                                                               (node, updatehash,))
        return self._taskid

    def _report_crash(self, node, result=None):
        if result and result['traceback']:
            node._result = result['result']
            node._traceback = result['traceback']
            return report_crash(node, traceback=result['traceback'])
        else:
            return report_crash(node)

    def _clear_task(self, taskid):
        del self._taskresult[taskid]
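# NonDaemonPool is referenced above but not defined in this snippet. The
# commonly used recipe (daemon workers cannot spawn child processes, so the
# pool's worker class is overridden) looks roughly like the sketch below on
# older Python versions; newer versions route worker creation through a
# multiprocessing context instead, so treat this as an assumption rather than
# the plugin's actual code.
import multiprocessing
import multiprocessing.pool


class NonDaemonProcess(multiprocessing.Process):
    """Process whose daemon flag is always False, so it may spawn children."""

    @property
    def daemon(self):
        return False

    @daemon.setter
    def daemon(self, value):
        pass  # ignore attempts by Pool to mark the worker as a daemon


class NonDaemonPool(multiprocessing.pool.Pool):
    # Pool creates its workers via this attribute on older Pythons.
    Process = NonDaemonProcess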
def fetch_all(page_type, ids, output):
    # Python multithreading mess incoming
    start = time()
    pages = []
    links = build(page_type, ids)

    # Number of worker processes to start
    num_of_proc = 8
    pool = Pool(processes=num_of_proc)  # Builds the process pool

    results = [pool.apply_async(fetch, (url, page_type[1], output,)) for url in links]
    # Proceeds to run the processes
    for result in results:
        result.get()

    print("Fetched all in %s" % (time() - start))
def fetch_all(page_type, ids, output):
    # Python multithreading mess incoming
    start = time()
    pages = []
    links = build(page_type, ids)

    # Number of worker processes to start
    num_of_proc = 8
    pool = Pool(processes=num_of_proc)

    # Fetches pages asynchronously
    results = [pool.apply_async(fetch, (url, page_type[1], output,)) for url in links]

    # Appends the fetched pages into the pages list
    for result in results:
        result.get()

    print("Fetched all in %s" % (time() - start))
def _save_content(self, key, content, headers):
    print "key.name", key.name
    source_size = os.stat(content.file.name).st_size
    bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)), 5242880)
    chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))
    mp = self.bucket.initiate_multipart_upload(key.name, headers=headers)
    pool = Pool(processes=1)
    for i in range(chunk_amount):
        offset = i * bytes_per_chunk
        remaining_bytes = source_size - offset
        bytes = min([bytes_per_chunk, remaining_bytes])
        part_num = i + 1
        pool.apply_async(_upload_part, [self.bucket_name, self.access_key, self.secret_key,
                                        mp.id, part_num, content.file.name, offset, bytes])
    pool.close()
    pool.join()
    if len(mp.get_all_parts()) == chunk_amount:
        mp.complete_upload()
        key = self.bucket.get_key(key.name)
    else:
        mp.cancel_upload()
def load_from_web():
    print "Loading from Web"
    movies = common.read_json(JSON_IN_FILE)
    pool = Pool(5)
    worker = [pool.apply_async(process_movie, [m]) for m in movies]

    imdb_movies = []
    for w in worker:
        w.wait()
        result = w.get()
        if result is not None:
            imdb_movies.append(w.get())

    common.write_json(JSON_OUT_FILE, imdb_movies)
def multi_proc5(self, batch):
    start_time = datetime.datetime.now()

    sql = "select count(id) from records"
    count_result = db_connection.execute(sql)
    for row in count_result:
        count = row[0]
        break

    sql = "select id from records"
    result = db_connection.execute(sql)
    record_ids = []
    for idx, row in enumerate(result):
        if (idx % int(count / 4) == 0) or (idx == count - 1):  # 4 because that is how many workers we have
            if idx == 0:
                some_records = []
            else:
                record_ids.append(some_records)
                some_records = []
        some_records.append(row[0])

    input_pool = Pool(4)
    # Add id messages to input queue
    input_pool.map(partial(add_batch_ids_to_queue, batch_size=int(batch)), record_ids)
    input_pool.close()
    input_pool.join()

    output_pool = Pool(4)
    # Read ids from input_queue, read message from DB and write it to output_queue
    worker_results = []
    for i in range(4):
        worker_results.append(output_pool.apply_async(read_id_from_queue, ()))
    output_pool.close()
    for r in worker_results:
        r.get()  # This reports results, including errors, of workers
    output_pool.join()  # This blocks until all the processes have finished

    end_time = datetime.datetime.now()
    time_taken = (end_time - start_time).total_seconds()
    return time_taken
def create_training_parallel(count):
    pool_size = 8
    batch_count = pool_size * 5
    pool = Pool(pool_size)
    print("generating")
    results = []
    for i in range(batch_count):
        results.append(pool.apply_async(create_training_data, (count / batch_count,)))
    pool.close()
    pool.join()
    print("concatenating")
    output = []
    for r in results:
        output.extend(r.get(1000))
    return output
def multi_proc3(self, batch):
    start_time = datetime.datetime.now()

    sql = "select count(id) from records"
    count_result = db_connection.execute(sql)
    for row in count_result:
        count = row[0]
        break

    sql = "select id from records"
    result = db_connection.execute(sql)
    record_ids = []
    for idx, row in enumerate(result):
        if (idx % int(batch) == 0) or (idx == count - 1):
            if idx == 0:
                some_records = []
            else:
                record_ids.append(some_records)
                some_records = []
        some_records.append(row[0])

    # Add id messages to input queue
    msg_handler = MessageHandler()
    for records in record_ids:
        msg_handler.add_message(json.dumps({"ids": records}), "input_queue")

    worker_results = []
    p = Pool(4)
    for i in range(4):
        worker_results.append(p.apply_async(read_id_from_queue, ()))
    p.close()
    for r in worker_results:
        r.get()
    p.join()  # This blocks until all the processes have finished

    end_time = datetime.datetime.now()
    time_taken = (end_time - start_time).total_seconds()
    return time_taken
def multi_proc4(self, batch):
    start_time = datetime.datetime.now()

    sql = "select count(id) from records"
    count_result = db_connection.execute(sql)
    for row in count_result:
        count = row[0]
        break

    sql = "select id from records"
    result = db_connection.execute(sql)
    record_ids = []
    for idx, row in enumerate(result):
        if (idx % int(batch) == 0) or (idx == count - 1):
            if idx == 0:
                some_records = []
            else:
                record_ids.append(some_records)
                some_records = []
        some_records.append(row[0])

    p = Pool(4)
    # Add id messages to input queue
    p.map(add_ids_to_queue, record_ids)

    # Read ids from input_queue, read message from DB and write it to output_queue
    worker_results = []
    p = Pool(4)
    for i in range(4):
        worker_results.append(p.apply_async(read_id_from_queue, ()))
    p.close()
    for r in worker_results:
        r.get()
    p.join()  # This blocks until all the processes have finished

    end_time = datetime.datetime.now()
    time_taken = (end_time - start_time).total_seconds()
    return time_taken
def render(self, scene, bucket_order_info: BucketOrderInfo, multiThread: bool=True):
    self.scene = scene

    if self.surface_integrator is not None:
        self.surface_integrator.Preprocess(scene, self.camera, self)

    if self.volume_integrator is not None:
        self.volume_integrator.Preprocess(scene, self.camera, self)

    sample = Sample(self.main_sampler, self.surface_integrator, self.volume_integrator, scene)

    if multiThread:
        my_bucket_orders = BucketOrder.create(bucket_order_info.width, bucket_order_info.height,
                                              bucket_order_info.bucket_order_type)

        pool = Pool(processes=multiprocessing.cpu_count())
        # pool = Pool(processes=1)
        pool._wrap_exception = False

        results = []
        for i in range(bucket_order_info.width * bucket_order_info.height):
            # for i in range(55, 56):
            a = pool.apply_async(self.render_task,
                                 args=(i, my_bucket_orders.buckets_orders[i],
                                       bucket_order_info, sample, self),
                                 callback=self.draw)
            results.append(a)

        for r in results:
            r.wait()
    else:
        bucketOrderInfo = BucketOrderInfo(BucketOrderSortType.Random, 1, 1)
        self.render_task(0, 0, bucketOrderInfo, sample, self)

    print("Render end")

    data = write_png(self.camera.film.data, self.camera.film.width, self.camera.film.height)
    with open("my_image.png", 'wb') as fd:
        fd.write(data)
def fill_views(self):
    paths = []
    if self.view:
        for path in self.view_paths:
            if self.view in path:
                paths.append(path)
                break
    else:
        paths = self.view_paths

    pool = Pool(processes=len(paths))
    pool_results = []
    prescribing_date = ImportLog.objects.latest_in_category(
        'prescribing').current_at.strftime('%Y-%m-%d')

    for view in paths:
        if self.view and self.view not in view:
            continue
        # Perform bigquery parts of operation in parallel
        result = pool.apply_async(
            query_and_export, [self.dataset, view, prescribing_date])
        pool_results.append(result)

    pool.close()
    pool.join()  # wait for all worker processes to exit

    for result in pool_results:
        tablename, gcs_uri = result.get()
        f = download_and_unzip(gcs_uri)
        copy_str = "COPY %s(%s) FROM STDIN "
        copy_str += "WITH (FORMAT CSV)"
        fieldnames = f.readline().split(',')
        with connection.cursor() as cursor:
            with utils.constraint_and_index_reconstructor(tablename):
                self.log("Deleting from table...")
                cursor.execute("DELETE FROM %s" % tablename)
                self.log("Copying CSV to postgres...")
                try:
                    cursor.copy_expert(copy_str % (
                        tablename, ','.join(fieldnames)), f)
                except Exception:
                    import shutil
                    shutil.copyfile(f.name, "/tmp/error")
                    raise
        f.close()
        self.log("-------------")
def stat_first_tgs(stime, etime):
    begtime = long(time.mktime(stime.timetuple()) * 1000)
    endtime = long(time.mktime(etime.timetuple()) * 1000)

    tgsinfo = read_tgs_info()
    vehicles = {}

    test = random.sample(tgsinfo.keys(), 100)

    from multiprocessing.pool import Pool
    pool = Pool()
    result = [pool.apply_async(_stat_first_tgs_single, args=(int(cid), begtime, endtime))
              for cid in tgsinfo.keys()]
    pool.close()
    pool.join()

    result1 = [elem.get() for elem in result]
    print 'joining....'
    result2 = reduce(_combine, result1)
    print 'totally %d vehicles. ' % (len(result2))
    # print type(result2)

    c = Counter()
    for veh, info in result2.iteritems():
        c[info[1]] += 1

    print 'writing result into file...'
    with open(os.path.join(root_dir, "result", "first_tgs.txt"), "w") as f:
        c1 = c.most_common()
        # print c1[0]
        # for cid,count in c1.iteritems():
        for elem in c1:
            line = "%5d,%6d\n" % (elem[0], elem[1])
            f.write(line)

    print 'finished.'
def __test_multi_processed(self, test_set, method, model, threshold, trees, all_node_ids):
    """
    Create a process pool to distribute the prediction.
    """
    # process_count = multiprocessing.cpu_count()
    process_count = 4
    pool = Pool(processes=process_count)

    step = int(math.ceil(float(len(test_set)) / process_count))
    results = []
    for j in range(0, len(test_set), step):
        meme_ids = test_set[j: j + step]
        res = pool.apply_async(test_meme,
                               (meme_ids, method, model, threshold, trees, all_node_ids,
                                self.user_ids, self.users_map, self.verbosity))
        results.append(res)

    pool.close()
    pool.join()

    prp1_list = []
    prp2_list = []
    precisions = []
    recalls = []
    fprs = []
    f1s = []

    # Collect results of the processes.
    for res in results:
        r1, r2, r3, r4, r5, r6 = res.get()
        precisions.extend(r1)
        recalls.extend(r2)
        fprs.extend(r3)
        # Assuming r4 holds the F1 scores; the original extended recalls a
        # second time here and left f1s empty.
        f1s.extend(r4)
        prp1_list.extend(r5)
        prp2_list.extend(r6)

    return precisions, recalls, f1s, fprs, prp1_list, prp2_list