def get_taotu_pages(category_url):
    """Download every paginated listing page of one category.

    Fetches the category index, builds the URLs ``list_2.html`` through
    ``list_<last>.html`` and hands them to ``download_by_page`` on a
    4-thread pool.  If anything fails, the pool is torn down and the whole
    URL list is retried after a delay that grows by 20 s per attempt and is
    capped at 120 s.  There is no retry limit; Ctrl-C stops immediately.

    :param category_url: base URL of the category.
    """
    print('process category: {0}'.format(category_url))
    soup = commons.soup(category_url, encoding='utf8')
    print('process index: {0}'.format(soup.title))
    last_no = get_last_page_no(soup)
    # Page numbering starts at 2 here; presumably page 1 is the index itself.
    urls = ['{0}/list_{1}.html'.format(category_url, i)
            for i in range(2, last_no + 1)]
    retry = 0
    while True:
        pool = ThreadPool(4)
        try:
            pool.map(download_by_page, urls)
            pool.close()
            pool.join()
            print('all images downloaded completely.')
            break
        except KeyboardInterrupt as e:
            print('download terminated by user, quit now.', e)
            pool.terminate()
            pool.join()
            break
        except Exception as e:
            pool.terminate()
            pool.join()
            retry += 1
            traceback.print_exc()
            # Capped back-off.  The previous ``retry * 20 % 120`` wrapped
            # around to a 0 second delay on every sixth attempt.
            delay = min(retry * 20, 120)
            try:
                print('download error: {0}, {1} retry in {2}s'.format(
                    e, retry, delay))
            except Exception:
                # Formatting the error may itself fail; the retry matters
                # more than the message.
                pass
            time.sleep(delay)
def dispatch(objects: list, method: str, parentId: int):
    """Run ``runner`` over ``objects`` either serially or on a thread pool.

    Args:
        objects: items passed one by one to ``runner``.
        method: ``"serial"`` or ``"parallel"`` (case-insensitive).
        parentId: identifier used only in the error log message.

    Raises:
        AttributeError: if ``method`` is neither supported value.
        Whatever ``clsDelayedException.re_raise()`` raises when a runner
        fails with that exception.
    """
    mode = method.lower()
    if mode == "serial":
        for item in objects:
            try:
                runner(item)
            except clsDelayedException as tb:
                tb.re_raise()
    elif mode == "parallel":
        pool = ThreadPool(processes=5)
        try:
            # imap_unordered yields results as soon as they are available;
            # list() drains it so worker exceptions surface here.
            list(pool.imap_unordered(runner, objects))
        except clsDelayedException as tb:
            logging.error(
                f"Terminating thread pool of parent with id {parentId}")
            pool.terminate()
            tb.re_raise()
        except BaseException:
            # Any other failure must not leave the pool threads running.
            pool.terminate()
            raise
        else:
            pool.close()
            pool.join()
    else:
        raise AttributeError(
            "Only 'serial' or 'parallel' supported for dispatch method")
def _get_available_endpoint(self):
    """
    Private method that filters endpoints of the right size, asks all of
    them in parallel whether they are available, and returns the first one
    that reports in through ``self._results_queue``.

    :return: an endpoint that a scan can be run on
    :raises NoServersAvailableError: if no endpoint has the right size or
        none answers within ``self.timeout`` seconds
    """
    Logger.app.debug("Searching for appropriately sized servers")
    correct_sized_endpoints = self._get_endpoints_of_the_right_size()
    Logger.app.debug("correct_sized_endpoints: {}".format(correct_sized_endpoints))

    if not correct_sized_endpoints:
        Logger.app.error("No servers of that size are available.")
        raise NoServersAvailableError

    pool = ThreadPool(len(correct_sized_endpoints))
    try:
        # Start one probe per endpoint; the probes are expected to push
        # available endpoints onto the results queue, and the first entry
        # is what we return.
        pool.imap_unordered(self._is_endpoint_available, correct_sized_endpoints)
        try:
            correct_sized_endpoint = self._results_queue.get(
                block=True, timeout=self.timeout)
        except queue.Empty:
            # Queue still empty after the timeout period.
            Logger.app.error(
                "The search has timed out after {} seconds.".format(self.timeout))
            raise NoServersAvailableError
        Logger.app.debug("We have an endpoint: {}".format(correct_sized_endpoint))
        return correct_sized_endpoint
    finally:
        # Stop the pool on every exit path, including the timeout.
        pool.terminate()
def extract_features(extract_fn, image_path, ofile, params=None):
    """Extract features from one image or a directory tree and save them.

    Each valid image is passed to ``extract_fn(path, **params)`` on a thread
    pool.  The resulting vector is appended as one row to ``ofile`` and the
    image's base name to ``ofile + ".label"``.  Rows are appended as workers
    finish, so their order is not guaranteed to match the sorted file list.

    Args:
        extract_fn: callable returning an array with a ``shape``/``reshape``.
        image_path: an image file or a directory walked recursively.
        ofile: output file; removed first if it already exists.
        params: optional keyword arguments forwarded to ``extract_fn``.
    """
    params = {} if params is None else params
    cwd = os.getcwd()
    image_path = utils.absolute_path(image_path)

    # Read filenames
    files = []
    if os.path.isdir(image_path):
        for dirpath, _, filenames in os.walk(image_path):
            for f in filenames:
                path = os.path.abspath(os.path.join(dirpath, f))
                if not utils.is_valid_image(path):
                    print("Warning, please provide a valid image: ", f)
                else:
                    files.append(path)
    else:
        files = [image_path]

    files.sort(key=utils.natural_sort_key)

    output_file = utils.absolute_path(ofile)
    if os.path.isdir(output_file):
        print("The provided file is a directory:", output_file)
        sys.exit(0)

    if os.path.exists(output_file):
        os.remove(output_file)

    def extract_and_save(path):
        try:
            X = extract_fn(path, **params)
        except Exception as e:
            print("Cannot extract feactures from", path)
            print(str(e))
            return

        X = X.reshape((1, X.shape[0]))
        lock.acquire()
        try:
            with open(output_file, 'a+') as f_handle:
                with open(output_file + ".label", 'a+') as f_handle_label:
                    numpy.savetxt(f_handle, X)
                    f_handle_label.write(os.path.basename(path) + "\n")
        finally:
            # Always release, otherwise one failed write blocks every
            # other worker forever.
            lock.release()

    pool = ThreadPool(cpu_count())
    try:
        pool.map(extract_and_save, files)
        pool.close()
    finally:
        pool.terminate()
        pool.join()

    os.chdir(cwd)
def fetch_or_load_urls(filename):
    """Return the full list of URL items, using ``filename`` as a JSON cache.

    If the cache file exists it is loaded and returned.  Otherwise pages
    1-50 are scanned on a 4-thread pool, the module-level ``urls``
    collection (presumably filled by ``find_all_tugua_urls``) is sorted
    with ``url_cmp``, converted with ``url_to_item`` and written to
    ``filename`` as UTF-8 JSON.

    :param filename: path of the JSON cache file.
    :return: list of items (possibly partial if interrupted with Ctrl-C).
    """
    from functools import cmp_to_key

    # Check the same file we later write; the check previously used a
    # different name (``jsonfile``) than the parameter.
    if os.path.exists(filename):
        print('found url json file cache {0}'.format(filename))
        with codecs.open(filename, 'r', 'utf8') as fp:
            return json.load(fp)
    pool = ThreadPool(4)
    try:
        pool.map(find_all_tugua_urls, range(1, 51))
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print('terminated by user.')
        pool.terminate()
        pool.join()
    items = [url_to_item(url)
             for url in sorted(urls, key=cmp_to_key(url_cmp))]
    if items:
        # ensure_ascii=False writes readable text instead of \uXXXX escapes.
        with codecs.open(filename, 'w', 'utf8') as fp:
            json.dump(items, fp, ensure_ascii=False, indent=2)
        print('saved url json file cache to {0}'.format(filename))
    else:
        print('no url items found, maybe something wrong!')
    return items
def ComputeTimelineBasedMetrics(self):
  """Computes timeline based metrics for all eligible runs on a thread pool.

  Each result produced by _ComputeMetricsInPool is passed to
  self._AddPageResults as soon as it is available. Does nothing when there
  are no runs with timeline based metrics.
  """
  assert not self._current_page_run, 'Cannot compute metrics while running.'

  def _GetCpuCount():
    try:
      return multiprocessing.cpu_count()
    except NotImplementedError:
      # Some platforms can raise a NotImplementedError from cpu_count()
      logging.warning('cpu_count() not implemented.')
      return 8

  runs_and_values = self._FindRunsAndValuesWithTimelineBasedMetrics()
  if not runs_and_values:
    return

  # Note that this is speculatively halved as an attempt to fix
  # crbug.com/953365. Floor division keeps the count an integer on
  # Python 3, where "/" would hand ThreadPool a float.
  threads_count = min(_GetCpuCount() // 2 or 1, len(runs_and_values))
  pool = ThreadPool(threads_count)
  try:
    for result in pool.imap_unordered(_ComputeMetricsInPool,
                                      runs_and_values):
      self._AddPageResults(result)
  finally:
    pool.terminate()
    pool.join()
def SerializeHtmlTraces(results):
  """Creates html trace files for each story run, if necessary.

  For each story run, takes all trace files from individual trace agents
  and runs trace2html on them. This is done only once, subsequent calls to
  this function will not do anything.

  TODO(crbug.com/981349): Remove this function entirely when trace
  serialization has been handed over to results processor.
  """
  assert not results.current_story_run, 'Cannot serialize traces while running.'

  def _GetCpuCount():
    try:
      return multiprocessing.cpu_count()
    except NotImplementedError:
      # Some platforms can raise a NotImplementedError from cpu_count()
      logging.warning('cpu_count() not implemented.')
      return 8

  available_runs = list(results.IterRunsWithTraces())
  if not available_runs:
    return

  # Note that this is speculatively halved as an attempt to fix
  # crbug.com/953365. Floor division keeps the count an integer on
  # Python 3, where "/" would hand ThreadPool a float.
  threads_count = min(_GetCpuCount() // 2 or 1, len(available_runs))
  pool = ThreadPool(threads_count)
  try:
    # Drain the iterator so that worker exceptions are raised here.
    for _ in pool.imap_unordered(_SerializeHtmlTraceInPool, available_runs):
      pass
  finally:
    pool.terminate()
    pool.join()
def run():
    """Create all configured entities, one entity type after another.

    For every entity type, ``settings[entity]`` tasks are run on a pool of
    ``settings['num_threads']`` threads and progress is logged in 5% steps.
    Any exception is printed, the pool is terminated and the process exits
    with status 1.
    """
    steps = [
        ('users', User().create),
        ('forums', Forum().create),
        ('threads', Thread().create),
        ('posts', Post().create),
        ("followers", User().follow),
        ("subscribptions", Thread().subscribe),
    ]

    for entity, factory in steps:
        num_tasks = int(settings[entity])
        # The factories receive a dummy argument; only the count matters.
        entities = [True] * num_tasks
        pool = ThreadPool(int(settings['num_threads']))
        try:
            pending = set(range(5, 105, 5))
            # Count completed tasks from 1 so the final 100% is reported.
            for done, _ in enumerate(pool.imap(factory, entities), 1):
                perc = done * 100 // num_tasks
                if perc in pending:
                    log.print_out('Creating %s: %d%% done' % (entity, perc))
                    pending.discard(perc)
            pool.close()
            pool.join()
        except Exception as e:
            print(e)
            pool.terminate()
            sys.exit(1)
def processJobs(jobs, concurrentTasks, sortOutput=False):
    """Run ``worker`` over ``jobs`` on a pool of ``concurrentTasks`` workers.

    Jobs are shuffled in place first when ``RANDOMIZE_JOBS`` is set.  A
    progress bar is shown and the number of remaining jobs is logged after
    each completion.  On Ctrl-C the pool is terminated, ``cleanup()`` runs
    and the process exits with status 1; a ``SystemExit`` also terminates
    the pool and exits with status 1.  With ``sortOutput`` set,
    ``cleanup()`` is called after a normal finish.
    """
    total_jobs = len(jobs)
    logging.info("Processing {} job(s) with a concurrency of {}".format(
        total_jobs, concurrentTasks))

    if RANDOMIZE_JOBS:
        shuffle(jobs)

    pool = Pool(concurrentTasks)
    try:
        finished = tqdm(pool.imap_unordered(worker, jobs), total=len(jobs))
        for done, _ in enumerate(finished, start=1):
            logging.info("{} out of {} staged jobs remaining".format(
                total_jobs - done, total_jobs))
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        printAndLog(
            "\nReceived keyboard interrupt. Cleaning up and exiting...")
        pool.terminate()
        cleanup()
        sys.exit(1)
    except SystemExit:
        pool.terminate()
        sys.exit(1)

    if sortOutput:
        cleanup()
    print("\n")
def tests():
    """Manual smoke test: run ``print_time`` in the background for 2 seconds.

    ``print_time`` is submitted to a one-thread pool, the caller sleeps for
    two seconds and the pool is then terminated without being joined.
    """
    print('\nStart test')
    timer_pool = ThreadPool(1)
    timer_pool.apply_async(print_time)
    time.sleep(2)
    timer_pool.terminate()
def dowload_all(by_page=False):
    """Download the images for every tag (or every listing page) of the site.

    Work items are processed on a 4-thread pool.  If anything fails, the
    pool is torn down and the whole item list is retried after a delay that
    grows by 20 s per attempt and is capped at 120 s.  There is no retry
    limit; Ctrl-C stops immediately.

    :param by_page: when true, iterate listing pages 1-144 with
        ``download_by_page``; otherwise iterate ``get_all_tags()`` with
        ``download_by_tag``.
    """
    items = range(1, 145) if by_page else get_all_tags()
    task = download_by_page if by_page else download_by_tag
    retry = 0
    while True:
        pool = ThreadPool(4)
        try:
            pool.map(task, items)
            pool.close()
            pool.join()
            print('all images are downloaded completely.')
            break
        except KeyboardInterrupt as e:
            print('download terminated by user, quit now.', e)
            pool.terminate()
            pool.join()
            break
        except Exception as e:
            pool.terminate()
            pool.join()
            retry += 1
            traceback.print_exc()
            # Capped back-off.  The previous ``retry * 20 % 120`` wrapped
            # around to a 0 second delay on every sixth attempt.
            delay = min(retry * 20, 120)
            try:
                print('download error: {0}, {1} retry in {2}s'.format(
                    e, retry, delay))
            except Exception:
                # Formatting the error may itself fail; the retry matters
                # more than the message.
                pass
            time.sleep(delay)
def put_from_manifest(
        s3_bucket, s3_connection_host, s3_ssenc, s3_base_path,
        aws_access_key_id, aws_secret_access_key, manifest,
        bufsize, reduced_redundancy, rate_limit, concurrency=None,
        incremental_backups=False):
    """
    Uploads files listed in a manifest to amazon S3
    to support larger than 5GB files multipart upload is used (chunks of 60MB)
    files are uploaded compressed with lzop, the .lzo suffix is appended

    Blank manifest lines are ignored.  ``upload_file`` is expected to return
    the uploaded path, or None on failure.  With ``incremental_backups`` set,
    every successfully uploaded file is deleted locally.  The function always
    ends by exiting the process: status 0 if every upload succeeded, 1
    otherwise.
    """
    import sys

    exit_code = 0
    bucket = get_bucket(
        s3_bucket, aws_access_key_id,
        aws_secret_access_key, s3_connection_host)
    buffer_size = int(bufsize * MBFACTOR)
    with open(manifest, 'r') as manifest_fp:
        files = manifest_fp.read().splitlines()

    jobs = ((bucket, path, destination_path(s3_base_path, path), s3_ssenc,
             buffer_size, reduced_redundancy, rate_limit)
            for path in files if path)
    pool = Pool(concurrency)
    try:
        for uploaded in pool.imap(upload_file, jobs):
            if uploaded is None:
                # Upload failed.
                exit_code = 1
            elif incremental_backups:
                # Delete files that were successfully uploaded.
                os.remove(uploaded)
    finally:
        # Do not leave workers behind if an upload or a removal raises.
        pool.terminate()
    sys.exit(exit_code)
def put_from_manifest(
    s3_bucket,
    s3_connection_host,
    s3_ssenc,
    s3_base_path,
    aws_access_key_id,
    aws_secret_access_key,
    manifest,
    bufsize,
    concurrency=None,
    incremental_backups=False,
):
    """
    Uploads files listed in a manifest to amazon S3
    to support larger than 5GB files multipart upload is used (chunks of 60MB)
    files are uploaded compressed with lzop, the .lzo suffix is appended

    Blank manifest lines are ignored.  With ``incremental_backups`` set, all
    listed files are deleted locally once every upload has been consumed.
    """
    bucket = get_bucket(s3_bucket, aws_access_key_id, aws_secret_access_key, s3_connection_host)
    buffer_size = int(bufsize * MBFACTOR)
    with open(manifest, "r") as manifest_fp:
        # Skip blank lines so they are neither uploaded nor removed.
        files = [line for line in manifest_fp.read().splitlines() if line]

    pool = Pool(concurrency)
    try:
        jobs = ((bucket, f, destination_path(s3_base_path, f), s3_ssenc, buffer_size) for f in files)
        # Drain the iterator so worker exceptions are raised here.
        for _ in pool.imap(upload_file, jobs):
            pass
    finally:
        pool.terminate()

    if incremental_backups:
        for f in files:
            os.remove(f)
def main1():
    """Walk the directory given as first CLI argument and process every file.

    Each file path is submitted to ``work`` on a 10-thread pool together
    with the start timestamp and the logger.  Totals are logged once all
    files are submitted and again after the pool has drained.  On any
    exception the error is logged, the pool is terminated and the exception
    is re-raised; the pool is joined in every case.
    """
    scan_root = sys.argv[1]
    log = get_logger(level=logging.DEBUG)
    pool = ThreadPool(10)
    q = queue.Queue()
    submitted = 0
    try:
        started = time.time()
        print(time.localtime(started))
        for folder, _subdirs, names in os.walk(scan_root, True):
            for name in names:
                pool.apply_async(work, (pjoin(folder, name), started, log))
                submitted += 1
        log.info("total=%d, q=%d, t=%d", submitted, q.qsize(), time.time() - started)
        pool.close()
        pool.join()
        log.info("finish, total=%d, q=%d, t=%d", submitted, q.qsize(), time.time() - started)
        print(time.localtime())
    except Exception as err:
        log.exception(err)
        pool.terminate()
        raise
    finally:
        pool.join()
def put_from_manifest(s3_bucket,
                      s3_connection_host,
                      s3_ssenc,
                      s3_base_path,
                      aws_access_key_id,
                      aws_secret_access_key,
                      manifest,
                      bufsize,
                      concurrency=None,
                      incremental_backups=False):
    """
    Uploads files listed in a manifest to amazon S3
    to support larger than 5GB files multipart upload is used (chunks of 60MB)
    files are uploaded compressed with lzop, the .lzo suffix is appended

    Blank manifest lines are ignored.  With ``incremental_backups`` set, all
    listed files are deleted locally once every upload has been consumed.
    """
    bucket = get_bucket(s3_bucket, aws_access_key_id,
                        aws_secret_access_key, s3_connection_host)
    buffer_size = int(bufsize * MBFACTOR)
    with open(manifest, 'r') as manifest_fp:
        # Skip blank lines so they are neither uploaded nor removed.
        files = [line for line in manifest_fp.read().splitlines() if line]

    pool = Pool(concurrency)
    try:
        jobs = ((bucket, f, destination_path(s3_base_path, f), s3_ssenc,
                 buffer_size) for f in files)
        # Drain the iterator so worker exceptions are raised here.
        for _ in pool.imap(upload_file, jobs):
            pass
    finally:
        pool.terminate()

    if incremental_backups:
        for f in files:
            os.remove(f)
def abortable_worker(func, *args, **kwargs):
    """Run ``func(*args)`` on a helper thread and give up after a timeout.

    Recognised keyword arguments (none of them is forwarded to ``func``):
    ``timeout`` (seconds, None waits forever), ``instance_id``,
    ``instance_name`` and ``result``.

    Returns:
        The return value of ``func`` if it finishes in time.  On timeout, a
        list holding the caller-supplied ``result`` followed by a fresh
        result dict (from ``initResult()``) describing the timeout.
    """
    timeout = kwargs.get("timeout", None)
    instance_id = kwargs.get("instance_id", None)
    instance_name = kwargs.get("instance_name", None)
    main_result = kwargs.get("result", None)
    timeout_results = [main_result]

    timeout_result = initResult()
    p = ThreadPool(1)
    try:
        res = p.apply_async(func, args=args)
        return res.get(timeout)  # wait for function to complete
    except mp.TimeoutError:
        print("Timeout")
        timeout_result["id"] = instance_id
        timeout_result["name"] = instance_name
        timeout_result["action"] = "GET_STATUS"
        timeout_result["status"] = "Timeout checking status after X secs"
        timeout_results.append(timeout_result)
        print("terminating due to timeout")
        print(timeout_results)
        return timeout_results
    finally:
        # Release the helper pool on every path; it used to be left
        # running whenever func returned normally.
        p.terminate()
def process(self, urls):
    """Return crawled pages for ``urls``, crawling those not yet stored.

    Pages found in ``self.storage`` that carry a ``crawled_date`` are reused.
    The remaining URLs are crawled with ``self._crawl_page`` -- on a pool
    when there are more than two of them, inline otherwise.

    :param urls: iterable of URLs; duplicates are ignored.
    :return: dict built from the stored pages (keyed by ``_id``) and the
        dicts returned by ``_crawl_page``.
    """
    result = {}
    urls = list(set(urls))

    # get crawled pages
    for page in self.storage.find({'_id': {'$in': urls}}):
        if page.get('crawled_date'):
            self.logger.debug('Page was crawled: ' + page['_id'])
            result[page['_id']] = page

    self.logger.info("Num of crawled urls: %s" % len(result))

    # filter crawled page
    urls = [u for u in urls if u not in result]

    self.logger.info("Remain haven't crawled urls: %s" % len(urls))
    if not urls:
        self.logger.info('All urls has been crawled')
        return result

    if len(urls) > 2:
        # use multi thread to crawl pages
        pool = Pool(cpu_count() * 2)
        try:
            self.logger.debug('Have to crawl these urls: %s' % urls)
            for r in pool.map(self._crawl_page, urls):
                result.update(r)
            pool.close()
        finally:
            # Also reached when a crawl raises, so workers never linger.
            pool.terminate()
    else:
        for url in urls:
            result.update(self._crawl_page(url))

    return result
def bfs(start, depth, keyword):
    """Breadth-first crawl from ``start`` for at most ``depth`` levels.

    Every level is scraped in parallel on a 10-thread pool.  Children whose
    scheme-less URL was already seen are dropped from their sibling list;
    the others are recorded in ``foundUrls`` and queued for the next level.
    The crawl stops early at the first child whose URL contains ``keyword``
    (that child gets ``hasKeyword = True``).

    :param start: start URL; its query string is removed first.
    :param depth: number of levels to expand.
    :param keyword: optional substring to look for in child URLs.
    """
    start = removeQuery(start)
    foundUrls.append(removeScheme(start))
    startNode = newNode(start)
    nodeList.append(startNode)
    queue = [startNode]
    keywordFound = False
    p = ThreadPool(10)
    try:
        # Once depth is exhausted there is nothing more to search; all nodes
        # of one level are checked before moving deeper.
        while depth > 0 and queue and not keywordFound:
            results = p.map(scrape, queue)
            queue = []
            for children in results:
                kept = []
                for position, childNode in enumerate(children):
                    tempUrl = childNode['url']
                    bareUrl = removeScheme(tempUrl)
                    if bareUrl in foundUrls:
                        # Duplicate: leave it out of the sibling list.
                        continue
                    kept.append(childNode)
                    if keyword and (keyword in tempUrl):
                        keywordFound = True
                        childNode['hasKeyword'] = True
                        # Siblings after the match are left unexamined.
                        kept.extend(children[position + 1:])
                        break
                    foundUrls.append(bareUrl)
                # Update the list in place (other code may hold a reference
                # to it).  Filtering into a new list avoids removing items
                # from the list being iterated, which skipped siblings.
                children[:] = kept
                queue.extend(children)
                if keywordFound:
                    break
            depth -= 1
            print('***************** depth levels remaining = ', depth)
    finally:
        p.terminate()
        p.join()
    return
def abortable_func(func, *args, **kwargs):
    """Call ``func`` on a background thread and give up after a timeout.

    ``func`` runs on a one-worker thread pool.  If it does not return within
    ``timeout`` seconds a message string is returned instead of its result.
    The worker thread itself cannot be killed; only the waiting is
    abandoned.

    Inspired by
    http://stackoverflow.com/questions/29494001/how-can-i-abort-a-task-in-a-multiprocessing-pool-after-a-timeout
    but also supports keyword arguments of ``func``.

    Args:
        func: the callable to run.
        *args: positional arguments of ``func``.
        **kwargs: keyword arguments of ``func`` plus an optional ``timeout``
            in seconds (None waits forever).

    Returns:
        The return value of ``func``, or on timeout the string
        ``"<label>:Timeout exceeded. Process terminated.\\r\\n"`` where the
        label is the first positional argument (or the function name when
        there is none).
    """
    # Pin the exception class: the pool raises multiprocessing.TimeoutError,
    # which is not the Python 3 builtin TimeoutError.
    from multiprocessing import TimeoutError as PoolTimeoutError

    timeout = kwargs.pop('timeout', None)
    p = ThreadPool(1)
    try:
        # functools.partial folds the keyword arguments into the callable so
        # it fits the apply_async(func, args) interface.
        res = p.apply_async(partial(func, **kwargs), args)
        return res.get(timeout)
    except PoolTimeoutError:
        label = args[0] if args else getattr(func, '__name__', func)
        return "{}:Timeout exceeded. Process terminated.\r\n".format(label)
    finally:
        # Release the pool's helper threads on every path.
        p.terminate()
def main():
    """Benchmark bulk inserts, then concurrent reads mixed with inserts.

    After ``create_db()`` the database is filled with ``n`` rows and the
    time is printed.  Then ``m`` ``read`` calls and ``m2`` ``read_one``
    calls are queued on a 5-process pool, with one single-row ``fill``
    added every ``m3`` reads, and the total time to collect all results is
    printed.
    """
    n, m, m2, m3 = 1000000, 1, 10000, 100
    create_db()
    pool = Pool(processes=5)

    t0 = time.time()
    fill(n)
    fill_time = time.time() - t0
    print('{} inserts in {}s'.format(n,fill_time))

    t0 = time.time()
    pending = [pool.apply_async(read, ()) for _ in range(m)]
    for i in range(m2):
        pending.append(pool.apply_async(read_one, ()))
        if i % m3 == 0:
            pending.append(pool.apply_async(fill, (1,)))
    for handle in pending:
        handle.get(timeout=1000000)
    read_time = time.time() - t0
    pool.terminate()
    print('{}.{} reads in {}s'.format(m,m2,read_time))
def ComputeTimelineBasedMetrics(results):
  """Compute TBMv2 metrics on all story runs in parallel.

  Only runs that have traces and a non-empty tbm_metrics are processed.
  Each result is handed to results.AddMetricPageResults as soon as it is
  available.
  """
  assert not results.current_story_run, 'Cannot compute metrics while running.'

  def _GetCpuCount():
    try:
      return multiprocessing.cpu_count()
    except NotImplementedError:
      # Some platforms can raise a NotImplementedError from cpu_count()
      logging.warning('cpu_count() not implemented.')
      return 8

  available_runs = [run for run in results.IterRunsWithTraces()
                    if run.tbm_metrics]
  if not available_runs:
    return

  # Note that this is speculatively halved as an attempt to fix
  # crbug.com/953365. Floor division keeps the count an integer on
  # Python 3, where "/" would hand ThreadPool a float.
  threads_count = min(_GetCpuCount() // 2 or 1, len(available_runs))
  pool = ThreadPool(threads_count)

  def _RunMetrics(run):
    return _ComputeMetricsInPool(run, results.label, results.upload_bucket)

  try:
    for result in pool.imap_unordered(_RunMetrics, available_runs):
      results.AddMetricPageResults(result)
  finally:
    pool.terminate()
    pool.join()
def main():
    """Time a bulk insert followed by a mixed read/insert workload.

    Creates the database, inserts ``n`` rows and prints the elapsed time.
    Afterwards ``m`` full reads and ``m2`` single reads (with a one-row
    insert every ``m3`` reads) are submitted to a 5-process pool and the
    time until all of them have completed is printed.
    """
    n = 1000000
    m, m2, m3 = 1, 10000, 100
    create_db()
    pool = Pool(processes=5)

    insert_started = time.time()
    fill(n)
    fill_time = time.time() - insert_started
    print('{} inserts in {}s'.format(n, fill_time))

    read_started = time.time()
    results = []
    for _ in range(m):
        results.append(pool.apply_async(read, ()))
    for i in range(m2):
        results.append(pool.apply_async(read_one, ()))
        periodic_insert = i % m3 == 0
        if periodic_insert:
            results.append(pool.apply_async(fill, (1, )))
    for pending in results:
        pending.get(timeout=1000000)
    read_time = time.time() - read_started
    pool.terminate()
    print('{}.{} reads in {}s'.format(m, m2, read_time))
def embed_message(embed_fn, path, payload, output_dir, embed_fn_saving=False): path = utils.absolute_path(path) if not os.path.exists(output_dir): os.makedirs(output_dir) output_dir = utils.absolute_path(output_dir) # Read filenames files = [] if os.path.isdir(path): for dirpath, _, filenames in os.walk(path): for f in filenames: path = os.path.abspath(os.path.join(dirpath, f)) if not utils.is_valid_image(path): print("Warning, please provide a valid image: ", f) else: files.append(path) else: files = [path] # remove fileas already generated in a previous execution filtered_files = [] for f in files: basename = os.path.basename(f) dst_path = os.path.join(output_dir, basename) if os.path.exists(dst_path): print("Warning! file already exists, ignored:", dst_path) continue filtered_files.append(f) files = filtered_files del filtered_files def embed(path): basename = os.path.basename(path) dst_path = os.path.join(output_dir, basename) if embed_fn_saving: embed_fn(path, payload, dst_path) else: X = embed_fn(path, payload) try: scipy.misc.toimage(X, cmin=0, cmax=255).save(dst_path) except Exception as e: print(str(e)) # Process thread pool in batches batch = 1000 for i in range(0, len(files), batch): files_batch = files[i:i + batch] n_core = cpu_count() print("Using", n_core, "threads") pool = ThreadPool(n_core) results = pool.map(embed, files_batch) pool.close() pool.terminate() pool.join() """
def parallel_apply_generator(
    func, iterable, workers, max_queue_size, dummy=False, random_seeds=True
):
    """Apply ``func`` to every element of ``iterable`` using several workers.

    The application is asynchronous and unordered: for inputs a, b, c the
    outputs may arrive as func(c), func(a), func(b).  The results are
    yielded as ``(index, result)`` pairs, where ``index`` is the position
    of the input in ``iterable``.

    Args:
        func: callable applied to each element.
        iterable: input elements.
        workers: number of worker processes/threads.
        max_queue_size: capacity of the input queue.
        dummy: False uses processes, True uses threads.
        random_seeds: seeds for numpy's RNG in the workers.  True seeds each
            worker with None, None/False disables seeding, otherwise an
            iterable of seeds.
    """
    if dummy:
        from multiprocessing.dummy import Pool, Queue
    else:
        from multiprocessing import Pool, Queue

    in_queue, out_queue, seed_queue = Queue(max_queue_size), Queue(), Queue()
    if random_seeds is True:
        random_seeds = [None] * workers
    elif random_seeds is None or random_seeds is False:
        random_seeds = []
    for seed in random_seeds:
        seed_queue.put(seed)

    def worker_step(in_queue, out_queue):
        """Endless worker loop built around the single-step function."""
        if not seed_queue.empty():
            np.random.seed(seed_queue.get())
        while True:
            i, d = in_queue.get()
            r = func(d)
            out_queue.put((i, r))

    # Start the workers: the loop above runs as the pool initializer.
    pool = Pool(workers, worker_step, (in_queue, out_queue))

    try:
        # Feed inputs and hand out results as they become available.
        in_count, out_count = 0, 0
        for i, d in enumerate(iterable):
            in_count += 1
            while True:
                try:
                    in_queue.put((i, d), block=False)
                    break
                except six.moves.queue.Full:
                    while out_queue.qsize() > max_queue_size:
                        yield out_queue.get()
                        out_count += 1
            if out_queue.qsize() > 0:
                yield out_queue.get()
                out_count += 1

        while out_count != in_count:
            yield out_queue.get()
            out_count += 1
    finally:
        # Also runs when the consumer abandons the generator early or an
        # error occurs, so the workers are never left behind.
        pool.terminate()
def parallel_apply(func,
                   iterable,
                   workers,
                   max_queue_size,
                   callback=None,
                   dummy=False):
    """Apply ``func`` to every element of ``iterable`` using several workers.

    The application is asynchronous and unordered: for inputs a, b, c the
    outputs may arrive as func(c), func(a), func(b).

    Args:
        func: callable applied to each element.
        iterable: input elements.
        workers: number of worker processes/threads.
        max_queue_size: capacity of the input queue.
        callback: optional callable invoked with each single output.
        dummy: False uses processes, True uses threads.

    Returns:
        The list of outputs when ``callback`` is None, otherwise None.
    """
    if dummy:
        from multiprocessing.dummy import Pool, Queue
    else:
        from multiprocessing import Pool, Queue

    in_queue, out_queue = Queue(max_queue_size), Queue()

    # Start the workers: worker_step runs as the pool initializer.
    pool = Pool(workers, worker_step, (func, in_queue, out_queue))

    if callback is None:
        results = []

    def handle(d):
        """Store one output or pass it to the callback."""
        if callback is None:
            results.append(d)
        else:
            callback(d)

    def process_out_queue():
        """Handle everything currently queued; return how many items."""
        out_count = 0
        for _ in range(out_queue.qsize()):
            handle(out_queue.get())
            out_count += 1
        return out_count

    try:
        # Feed inputs and collect outputs.
        in_count, out_count = 0, 0
        for d in iterable:
            in_count += 1
            while True:
                try:
                    in_queue.put(d, block=False)
                    break
                except six.moves.queue.Full:
                    out_count += process_out_queue()
            if in_count % max_queue_size == 0:
                out_count += process_out_queue()

        while out_count != in_count:
            # Block for the next result instead of spinning on qsize().
            handle(out_queue.get())
            out_count += 1
            out_count += process_out_queue()
    finally:
        # Terminate the workers even if func or the callback raised.
        pool.terminate()

    if callback is None:
        return results
class MultiprocessingPool():
    """Thin wrapper that maps a function over a list on a thread pool."""

    def __init__(self, func, arg_array, workers):
        """Validate the arguments and create the pool.

        Args:
            func - callable invoked once per element
            arg_array - list of inputs for the callable
            workers - number of pool threads, at least 1

        Raises:
            ValueError - if workers < 1 or arg_array is not a list
        """
        if workers < 1:
            message = "MultiprocessingPool - Workers must be >= 1: Got value '{0}'"
            raise ValueError(message.format(workers))

        if not isinstance(arg_array, list):
            message = "MultiprocessingPool - Expected list for arg_array, got '{0}'"
            raise ValueError(message.format(type(arg_array)))

        self.__pool = ThreadPool(workers)
        self.__func = func
        self.__arg_array_iter = iter(arg_array)

    def run(self):
        """Map the function over all inputs.

        Returns:
            list with one return value per input; None if the mapping was
            interrupted or failed (the pool is terminated in that case)
        """
        pool = self.__pool
        try:
            mapped = pool.map(self.__run_func, self.__arg_array_iter)
            pool.close()
            pool.join()
            return mapped
        except KeyboardInterrupt:
            log.debug("Keyboard interrupt while mapping pool.")
            pool.terminate()
        except Exception as e:
            details = [traceback.format_exc()]
            log.error("Unhandled Exception in pool: {0} - {1}".format(
                str(e), details))
            pool.terminate()
        return None

    def __run_func(self, arg):
        """Call the wrapped function for one input.

        Args:
            arg - argument passed to the function

        Returns:
            the function's return value, or None if it raised an exception
            (which is logged)
        """
        try:
            outcome = self.__func(arg)
        except KeyboardInterrupt:
            raise RuntimeError("Keyboard Interrupt")
        except Exception as e:
            details = [traceback.format_exc()]
            log.error("Unhandled Exception in worker: {0} - {1}".format(
                str(e), details))
            return
        return outcome
def timeout_wrapper(func, *args, **kwargs):
    """Run ``func(*args)`` on a helper thread, waiting at most ``timeout``.

    Only the ``timeout`` keyword (seconds, None waits forever) is used;
    keyword arguments are not forwarded to ``func``.

    Returns:
        The return value of ``func``, or None if it did not finish in time.
        Exceptions raised by ``func`` propagate to the caller.
    """
    timeout = kwargs.get("timeout", None)
    p = ThreadPool(1)
    try:
        res = p.apply_async(func, args=args)
        # Wait timeout seconds for func to complete.
        return res.get(timeout)
    except multiprocessing.TimeoutError:
        return None
    finally:
        # Release the pool on every path; it used to leak whenever func
        # returned in time or raised.
        p.terminate()
def abortable_worker(in_args):
    """Run ``spinSingleInstance`` on a helper thread with a timeout.

    Args:
        in_args: a ``(timeout, tup)`` pair; ``tup`` is passed as the single
            argument of ``spinSingleInstance`` and ``timeout`` is the number
            of seconds to wait for it.

    Returns:
        The return value of ``spinSingleInstance``.

    Raises:
        Whatever the call raises, including the pool's timeout error when
        the call does not finish in time.
    """
    timeout, tup = in_args
    tp = ThreadPool(1)
    try:
        res = tp.apply_async(func=spinSingleInstance, args=(tup, ))
        # Wait timeout seconds for func to complete.
        return res.get(timeout)
    finally:
        # Release the pool on success as well as on failure.
        tp.terminate()
def abortable_worker(func, *args, **kwargs):
    """Run ``func(*args)`` on a helper thread, waiting at most ``timeout``.

    Only the ``timeout`` keyword (seconds, None waits forever) is used;
    keyword arguments are not forwarded to ``func``.

    Returns:
        The return value of ``func``.

    Raises:
        multiprocessing.TimeoutError: if ``func`` does not finish in time.
        Any exception raised by ``func`` itself.
    """
    timeout = kwargs.get('timeout', None)
    p = ThreadPool(1)
    try:
        res = p.apply_async(func, args=args)
        # Wait timeout seconds for func to complete.
        return res.get(timeout)
    finally:
        # Release the pool on every path; it used to be terminated only
        # after a timeout.
        p.terminate()
def __call__(self, *args, **kwargs):
    """Call ``self.func`` on a helper thread, waiting ``self.timeout`` seconds.

    Returns:
        The return value of ``self.func(*args, **kwargs)``.

    Raises:
        multiprocessing.TimeoutError: carrying ``str(self.timeout)`` if the
            call does not finish in time.
        Any exception raised by ``self.func`` itself.
    """
    from multiprocessing.dummy import Pool as ThreadPool
    p = ThreadPool(1)
    try:
        res = p.apply_async(self.func, args=args, kwds=kwargs)
        # Wait timeout seconds for func to complete.
        return res.get(self.timeout)
    except multiprocessing.TimeoutError:
        raise multiprocessing.TimeoutError(str(self.timeout))
    finally:
        # Release the pool on every path, not only after a timeout.
        p.terminate()
class SThreadPool(metaclass=metas.Singleton):
    """Shared worker pool sized by ``SConfig().WORKERS``."""

    def __init__(self):
        worker_count = SConfig().WORKERS
        self.__thread_pool__ = Pool(worker_count)

    def add(self, task, args):
        """Queue ``task(*args)`` on the pool; the async result is discarded."""
        pool = self.__thread_pool__
        pool.apply_async(task, args)

    def terminate(self):
        """Terminate the pool and wait for its workers to exit."""
        pool = self.__thread_pool__
        pool.terminate()
        pool.join()
def _multipart_upload(buf, name, file_size, client, **kwargs):
    """Upload the file behind ``buf`` in several parts on a worker pool.

    Args:
        buf: file object; its ``name`` attribute is the path each worker
            re-opens to read its own part.
        name: file name passed to ``client.files.post_multipart``.
        file_size: total size in bytes.
        client: API client providing ``files.post_multipart`` and
            ``files.post_multipart_complete``.
        **kwargs: forwarded to ``client.files.post_multipart``.

    Returns:
        The id of the created file.

    Raises:
        ValueError: if the number of upload URLs differs from the number of
            parts.
        HTTPError: if a part upload gets a non-OK response.
    """
    # scale the part size based on file size
    part_size = max(int(math.sqrt(MIN_PART_SIZE) * math.sqrt(file_size)),
                    MIN_PART_SIZE)
    num_parts = int(math.ceil((file_size) / float(part_size)))

    log.debug(
        'Uploading file with %s bytes using %s file parts with a part '
        'size of %s bytes', file_size, num_parts, part_size)
    file_response = client.files.post_multipart(name=name,
                                                num_parts=num_parts,
                                                **kwargs)

    # Platform will give us a URL for each file part
    urls = file_response.upload_urls
    if num_parts != len(urls):
        raise ValueError(
            f"There are {num_parts} file parts but only {len(urls)} urls")

    # upload function wrapped with a retry decorator
    @retry(RETRY_EXCEPTIONS)
    def _upload_part_base(item, file_path, part_size, file_size):
        part_num, part_url = item[0], item[1]
        offset = part_size * part_num
        num_bytes = min(part_size, file_size - offset)

        log.debug('Uploading file part %s', part_num)
        with open(file_path, 'rb') as fin:
            fin.seek(offset)
            partial_buf = BufferedPartialReader(fin, num_bytes)
            part_response = requests.put(part_url, data=partial_buf)

        if not part_response.ok:
            msg = _get_aws_error_message(part_response)
            raise HTTPError(msg, response=part_response)

        log.debug('Completed upload of file part %s', part_num)

    # Create the pool before entering ``try``: if the constructor fails
    # there is nothing to terminate, and ``finally`` must not raise a
    # NameError that hides the real error.
    pool = Pool(MAX_THREADS)
    # upload each part
    try:
        _upload_part = partial(_upload_part_base,
                               file_path=buf.name,
                               part_size=part_size,
                               file_size=file_size)
        pool.map(_upload_part, enumerate(urls))

    # complete the multipart upload; an abort will be triggered
    # if any part except the last failed to upload at least 5MB
    finally:
        pool.terminate()
        client.files.post_multipart_complete(file_response.id)

    log.debug('Uploaded File %d', file_response.id)
    return file_response.id
def main(self, cnt):
    """Scrape listing pages 1..cnt and parse every car link found.

    The car links of each listing page are parsed on a shared pool of 10
    threads.

    :param cnt: number of listing pages to process.
    """
    self.get_proxies()
    # One pool for the whole run instead of a new one per page.
    pool = ThreadPool(10)
    try:
        for i in range(1, cnt + 1):
            cars_links = self.get_listing(
                'https://www.otomoto.pl/osobowe/uzywane/', i)
            pool.map(self.parse_car, cars_links)
        pool.close()
        pool.join()
    finally:
        # Harmless after a clean close/join; stops the workers when a page
        # or a parse raised.
        pool.terminate()
    print('DONE!')
def run(self):
    """Fill ``self.b`` with every element of ``self.a`` incremented by one.

    The inputs are queued and four workers each take an item, sleep one
    second and emit ``item + 1``.  Results are collected in completion
    order, so ``self.b`` is not necessarily ordered like ``self.a``.
    """
    in_queue, out_queue = Queue(), Queue()
    for i in self.a:
        in_queue.put(i)

    def f(in_queue, out_queue):
        # NOTE(review): another worker may take the last item between the
        # empty() check and get(), leaving this worker blocked in get()
        # until the pool is terminated.
        while not in_queue.empty():
            time.sleep(1)
            out_queue.put(in_queue.get()+1)

    # The worker loop runs as the pool initializer.
    pool = Pool(4, f, (in_queue, out_queue))
    try:
        self.b = []
        while len(self.b) < len(self.a):
            # A blocking get waits for the next result without spinning.
            self.b.append(out_queue.get())
    finally:
        pool.terminate()
def core():
    """Repeatedly submit ``rush`` for every class for about two minutes.

    After ``prepareprocess()`` a 20-thread pool is created.  Until two
    minutes have passed, one ``rush`` task per entry of ``classtorush`` is
    queued every quarter of a second.  The function then waits ten more
    seconds before terminating and joining the pool.
    """
    logging.basicConfig(level=logging.WARNING)
    prepareprocess()
    pool = ThreadPool(20)
    deadline = datetime.now() + timedelta(minutes=2)
    while datetime.now() <= deadline:
        for target in classtorush:
            pool.apply_async(rush, args=(target,))
        time.sleep(0.25)
    time.sleep(10)
    pool.terminate()
    pool.join()
def download_pages(items):
    """Download all ``items`` with ``download_page`` on a 4-thread pool.

    On failure the pool is terminated and the whole list is retried after
    ``retry * 10`` seconds, up to 10 attempts; after the last failed attempt
    the function returns without raising.  Ctrl-C stops immediately.

    :param items: iterable of work items for ``download_page``.
    """
    retry = 0
    while retry < 10:
        pool = ThreadPool(4)
        try:
            pool.map(download_page, items)
            pool.close()
            pool.join()
            break
        except KeyboardInterrupt:
            print('download terminated by user, quit execution.')
            pool.terminate()
            break
        except Exception as e:
            pool.terminate()
            retry += 1
            delay = retry * 10
            print('download error occurred: {0}, {1} retry in {2}s'.format(
                e, retry, delay))
            time.sleep(delay)
def process_parsing(url, output_stream):
    """Collect company URLs per metro station, then parse every company.

    Stage 1 fetches ``url``, extracts the metro station links and lets
    ``collect_company_urls`` push company URLs onto a shared queue.  The
    queue is drained into a set to remove duplicates.  Stage 2 writes the
    ``FIELDS`` header line to ``output_stream`` and runs
    ``parse_company_worker`` for every unique URL.  Both stages use a
    thread pool of ``NUMBER_PROCESSES`` workers.

    :param url: listing page to start from.
    :param output_stream: writable stream given to the company workers.
    """
    started_at = time.time()
    site_root = 'http://www.yell.ru'

    listing = requests.get(url)
    metro_urls = get_metro_stations(listing.text)
    logging.info("metroes: %d" % len(metro_urls))

    # Stage 1: collect company urls for parsing
    collector_pool = ThreadPool(NUMBER_PROCESSES)
    res_queue = ProcessQueue()
    collect_jobs = [(site_root + u, res_queue) for u in metro_urls]
    collector_pool.map(collect_company_urls, collect_jobs)
    collector_pool.close()
    collector_pool.join()
    collector_pool.terminate()

    # Reduce: count everything collected, keep each url once
    collected = 0
    unique_urls = set()
    while not res_queue.empty():
        collected += 1
        unique_urls.add(res_queue.get_nowait())

    logging.info("%d" % collected)
    logging.info("%d" % len(unique_urls))

    # Stage 2: company parsing pool
    logging.info('start!!!!!!')
    parser_pool = ThreadPool(NUMBER_PROCESSES)
    for field in FIELDS:
        output_stream.write(field + ';')
    output_stream.write('\n')

    parse_jobs = [(site_root + u, output_stream) for u in unique_urls]
    parser_pool.map(parse_company_worker, parse_jobs)
    parser_pool.close()
    parser_pool.join()
    parser_pool.terminate()

    logging.info('done!!!!!')
    logging.info('finished in %s seconds' % (time.time() - started_at))
def execute_nodes(self):
    """Run the node queue on a thread pool and return the node results.

    Prints the concurrency banner, then calls ``self.run_queue`` with a
    pool of ``self.config.threads`` threads.  On Ctrl-C the pool is stopped;
    if the adapter supports cancellation, open connections are cancelled
    and the end-of-run messages are printed, otherwise a warning is
    printed.  The KeyboardInterrupt is re-raised in both cases.
    """
    printer = dbt.ui.printer
    num_threads = self.config.threads
    target_name = self.config.target_name

    banner = "Concurrency: {} threads (target='{}')".format(
        num_threads, target_name)
    printer.print_timestamped_line(banner)
    printer.print_timestamped_line("")

    pool = ThreadPool(num_threads)
    try:
        self.run_queue(pool)
    except KeyboardInterrupt:
        pool.close()
        pool.terminate()

        adapter = get_adapter(self.config)
        if adapter.is_cancelable():
            for conn_name in adapter.cancel_open_connections():
                printer.print_cancel_line(conn_name)
            pool.join()
            printer.print_run_end_messages(self.node_results,
                                           early_exit=True)
            raise

        # Nothing can be cancelled: warn and propagate the interrupt.
        warning = ("The {} adapter does not support query cancellation. "
                   "Some queries may still be running!"
                   .format(adapter.type()))
        printer.print_timestamped_line(warning, printer.COLOR_FG_YELLOW)
        raise

    pool.close()
    pool.join()
    return self.node_results
class ParallelRunner(SkeletonRunner):
    """
    This class is used to evaluate a skeleton through parallel primitives.
    The skeleton is evaluated for the different input elements in parallel
    on a worker pool and all results are then joined.
    """

    def __init__(self, processes=36):
        """Create the worker pool.

        :param processes: number of pool workers (defaults to the
            previously hard-coded 36).
        """
        # NOTE(review): SkeletonRunner.__init__ is not called here, as
        # before; confirm the base class needs no initialisation.
        self.pool = Pool(processes=processes)

    def __del__(self):
        # The attribute is missing when __init__ failed before the pool
        # was created.
        pool = getattr(self, 'pool', None)
        if pool is not None:
            pool.close()
            pool.terminate()

    def run(self, skeleton, values, *params):
        """Evaluate ``skeleton`` on every element of ``values`` in parallel.

        :param skeleton: the skeleton to evaluate.
        :param values: input elements, one evaluation each.
        :param params: at least two extra parameters; ``params[1]`` and
            ``params[0]`` are passed (in that order) to ``eval_parallel``.
        :return: list of results in input order.
        """
        # The pool is deliberately left open so that run() can be called
        # again; closing it here made every later call fail.
        return self.pool.map(
            eval_parallel(skeleton, params[1], params[0]), values)
def _fetch_photos_multi(self):
    """Download the pending photos returned by the database on 8 threads.

    Each row's ``data`` column is decoded from JSON and passed to
    ``self._download_photo``.  ``self.photo_total`` is increased by the
    batch size once all downloads have finished.  Ctrl-C terminates the
    pool and leaves the counter unchanged.
    """
    rows = self.db.get_photo_status()
    if not rows:
        print('{0}的相册里没有照片'.format(self.target_id))
        return

    photos = [json.loads(row['data']) for row in rows]
    count = len(photos)
    first, last = self.photo_total, self.photo_total + count
    print("正在下载第{0}-{1}张照片 ...".format(first, last))

    pool = ThreadPool(8)
    try:
        pool.map(self._download_photo, photos)
        pool.close()
        pool.join()
        self.photo_total += count
    except KeyboardInterrupt:
        pool.terminate()
def fetch_or_load_urls(fileanme):
    """Return the URL items, loading them from cache or scraping them.

    If the module-level ``jsonfile`` exists its JSON content is returned.
    Otherwise ``findurls`` is run for pages 1..50 on 8 threads, the
    module-level ``urls`` collection is sorted with ``url_cmp``, converted
    with ``url_to_item``, written to ``fileanme`` as UTF-8 JSON and returned.

    NOTE(review): the cache check uses the global ``jsonfile`` while the
    result is written to the ``fileanme`` argument - presumably callers pass
    the same path; confirm.

    :param fileanme: path of the JSON file to write.
    :return: list of items (or whatever the cached JSON contains).
    """
    # Local import keeps this block self-contained; cmp_to_key exists on
    # Python 2.7 and 3, unlike the ``cmp=`` argument of sorted().
    import functools

    if os.path.exists(jsonfile):
        with open(jsonfile, 'r') as cached:
            return json.load(cached)

    pool = ThreadPool(8)
    try:
        pool.map(findurls, range(1, 51))
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print('terminated by user.')
        pool.terminate()

    print(len(urls))
    ordered = sorted(urls, key=functools.cmp_to_key(url_cmp))
    items = [url_to_item(url) for url in ordered]

    # ensure_ascii=False writes non-ASCII text as real characters instead of
    # \uXXXX escapes; the ``with`` block guarantees the file is flushed and
    # closed.
    with codecs.open(fileanme, 'w', 'utf8') as out:
        json.dump(items, out, ensure_ascii=False, indent=2)
    return items
def start(self):
    """Run ``self.func`` over ``self.args`` on a thread pool, with retries.

    A fresh ``ThreadPool(self.pool_size)`` is created for every attempt.
    The loop ends when an attempt succeeds, when the user interrupts it, or
    when ``self.retry`` (decremented after every failed attempt) reaches 0.
    After a failure the traceback is printed and the method sleeps for
    ``self.sleep`` seconds.
    """
    failures = 0  # failed attempts during this call, used in the message
    while self.retry > 0:
        pool = ThreadPool(self.pool_size)
        try:
            pool.map(self.func, self.args)
            pool.close()
            pool.join()
            print('task execution completely.')
            break
        except KeyboardInterrupt as e:
            print('task terminated by user.', e)
            pool.terminate()
            pool.join()
            break
        except Exception as e:
            pool.terminate()
            pool.join()
            self.retry -= 1
            failures += 1
            traceback.print_exc()
            # Report the real delay and attempt number; the old message
            # hard-coded 60s and derived the count from sys.maxint.
            print('task error: {0}, {1} retry in {2}s'.format(
                e, failures, self.sleep))
            time.sleep(self.sleep)
def get_all_urls_from_cat_multithread(category_param, nb_threads=concurrent_limit):
    """
    Given a category URL param, returns the list of urls for all names in it.

    ``get_letter_urls`` is called once per ASCII lowercase letter on a pool
    of ``nb_threads`` threads and the per-letter lists are flattened into
    one list.

    :param category_param: category URL parameter forwarded to
                           ``get_letter_urls``.
    :param nb_threads: number of worker threads.
    :return: flat list of URLs, or ``None`` if interrupted with Ctrl-C.
    """
    # to get by letter, add parameter: &lettre=^[aA]
    logger.info('Extracting URLs for category parameter: %s', category_param)
    # ascii_lowercase is locale independent and exists on Python 2 and 3
    # (string.lowercase is Python 2 only).
    letters = string.ascii_lowercase
    pool = ThreadPool(nb_threads)

    def fetch_letter(letter):
        return get_letter_urls(letter, category_param)

    try:
        results = pool.map(fetch_letter, letters)
        pool.close()
        pool.join()
        # Results is a list of lists, make only one list
        return [url for letter_urls in results for url in letter_urls]
    except KeyboardInterrupt:
        return None
    finally:
        # Harmless after close()/join(); releases the workers on interrupt
        # and on any other error.
        pool.terminate()
class SimpleDesktopsDownloader:
    """Download every wallpaper listed on simpledesktops.com/browse.

    Listing pages are fetched one after another, the thumbnail ``<img>``
    tags found on them are turned into download jobs and the jobs are
    mapped over ``self.pool``.
    """

    def __init__(
        self,
        output="simpledesktops",
        max_threads=5,
        force=False,
        tree=False,
        logger=None,
    ):
        """
        :param output: directory the images are written to (created if
                       missing, including parent directories).
        :param max_threads: size of the worker pool.
        :param force: download even when the target file already exists.
        :param tree: keep the directory layout of the image URL instead of
                     flattening it into the file name.
        :param logger: logger to use; defaults to the root logger.
        """
        os.makedirs(output, exist_ok=True)
        self.output = output
        self.pool = Pool(max_threads)
        self.force = force
        self.tree = tree
        self.session = requests.Session()
        self.skip_count = 0
        self.update_count = 0
        self.logger = logger or logging.getLogger()

    def join(self, path):
        """Return ``path`` placed inside the output directory."""
        return os.path.join(self.output, path)

    def download_job(self, img):
        """Download the full-size image behind one thumbnail ``img`` tag.

        :param img: mapping with a ``"src"`` entry holding the thumbnail URL.
        """
        # Drop the 17-character thumbnail suffix ".295x184_q100.png".
        img_url = img["src"][:-17]
        self.logger.info("start job: %s", img_url)
        # Drop the 18-character "/uploads/desktops/" prefix.
        rel_path = urlparse(img_url).path[18:]
        if not self.tree:
            rel_path = rel_path.replace("/", "-")
        path = self.join(rel_path)
        if self.tree:
            # Create the sub-directories below the output directory, which
            # is where the file is written, not below the current directory.
            parent = os.path.dirname(path)
            if parent:
                os.makedirs(parent, exist_ok=True)
        if not self.force and os.path.exists(path):
            self.skip_count += 1
            self.logger.info("%s already exists! skip downloading ...", path)
            return
        # Fetch before opening the file so a failed request does not leave
        # an empty file that later runs would treat as already downloaded.
        r = self.session.get(img_url)
        if not r.ok:
            self.logger.error("something wrong! [%d]%s", r.status_code, img_url)
            self.pool.terminate()
            return
        with click.open_file(path, "wb") as fp:
            fp.write(r.content)
        self.update_count += 1
        self.logger.info("%s successfully downloaded.", path)

    def iter_download_job(self):
        """Yield the thumbnail ``img`` tags of every listing page.

        Pages are requested starting at 1 until one answers with HTTP 404.
        """
        url = "http://simpledesktops.com/browse/"
        page = 1
        while True:
            response = self.session.get(url + str(page))
            if response.status_code == 404:
                break
            page += 1
            bs = BeautifulSoup(response.text, "html.parser")
            for img in bs.select(".desktops .edge .desktop img"):
                yield img

    def download(self):
        """Run all download jobs on the pool and log the final counters."""
        self.logger.info("dispatching download jobs ...")
        self.pool.map(self.download_job, self.iter_download_job())
        self.logger.info(
            "all task done, %d updated, %d skipped, enjoy!",
            self.update_count,
            self.skip_count,
        )
def mergeFilesByRegion(filesByRegion, grid, outputDir):
    """Merge a set of files by region into the specified dir.

    ``filesByRegion`` maps a region name to a mapping of key
    (up/down/nominal etc.) to a list of input files. For every non-empty
    list the output file ``<outputDir>/<region>_<grid>.root`` is scheduled,
    unless it already exists. The scheduled merges are then handed to
    ``mergeFiles`` on a pool of 8 threads (initialised with ``init_worker``).

    Each pool task is the tuple
    ``(filename, files, False, region, remaining)`` where ``remaining`` is
    the running counter ``N``.

    Returns ``None``. Nothing is started when there is nothing to write.
    """
    N = 0
    filesToWrite = {}
    for r in filesByRegion:
        for key in filesByRegion[r]:
            if filesByRegion[r][key] == []:
                if key == "Nominal":
                    print("WARNING: no input files for region {0} key {1}".format(r, key))
                continue

            # The output name does not depend on ``key``: several keys of
            # one region share a single entry, the last one seen wins.
            filePrefix = "%s_%s" % (r, grid)
            filename = os.path.join(outputDir, "%s.root" % (filePrefix))
            if os.path.exists(filename):
                print("Output file {0} exists - skipping".format(os.path.basename(filename)))
                continue

            filesToWrite[filename] = {"region": r, "files": filesByRegion[r][key]}
            N += 1

    # Got anything?
    if not filesToWrite:
        return

    # build the pool arguments
    args = []
    for filename in filesToWrite:
        N -= 1
        entry = filesToWrite[filename]
        args.append((filename, entry["files"], False, entry["region"], N))

    pool = ThreadPool(8, init_worker)
    try:
        # The result iterator is deliberately not consumed: close()/join()
        # wait for all tasks to finish.
        # NOTE(review): exceptions raised inside mergeFiles are therefore
        # never surfaced - confirm this best-effort behaviour is intended.
        pool.imap_unordered(mergeFiles, args)
        pool.close()
        pool.join()
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        pool.join()

    # The hadd-based legacy code that used to follow an unconditional
    # ``return`` here was unreachable and has been removed.
    return
class TextService(QtCore.QObject):
    """
    A TextService which handles all text processing including the fetching of images and voice
    """
    change_img = QtCore.Signal()

    def __init__(self, text, window, lang_en, def_counter):
        """
        Split the text, derive one keyword per sentence on a pool of 4
        threads, then start the audio and image preparation threads.

        :param text: Complete tale/story
        :param window: Story_UI window
        :param lang_en: forwarded to ``image_from_keyword_list``
        :param def_counter: forwarded to ``AudioService.prepare_voice``
        """
        QtCore.QObject.__init__(self)
        # Raw strings: the patterns contain backslash escapes (\s, \., \!)
        # that are invalid escape sequences in ordinary string literals.
        self.word_list = re.split(r'\s', text)
        self.window = window
        self.sentence_list = regex.split(r"(?V1)(?<=\.|:|;|-|,|\!)", text)
        self.sentence_list = self.join_short_sentences()
        self.timing_list = []
        self.pool = ThreadPool(4)
        self.keyword_list = self.pool.map(derive_keyword, self.sentence_list)
        self.pool.close()
        self.pool.join()
        self.audio_service = AudioService(window)
        self.audio_thread = threading.Thread(
            target=self.audio_service.prepare_voice,
            args=(self.sentence_list, def_counter))
        # ``daemon`` attribute instead of the deprecated setDaemon().
        self.audio_thread.daemon = True
        self.audio_thread.start()
        self.image_thread = threading.Thread(
            target=image_from_keyword_list,
            args=(self.keyword_list, window, lang_en))
        self.image_thread.daemon = True
        self.image_thread.start()
        # subtitle_thread = threading.Thread(target=window.set_subtitles, args=())
        # subtitle_thread.setDaemon(True)
        # subtitle_thread.start()

    def start_story(self, wait_seconds=5):
        """
        Starts the story telling but waits a few seconds (to preload some data)
        """
        self.audio_service.set_clip_callback(self.window.switch_to_next_image)
        sleep(wait_seconds)
        self.audio_service.start_audio()

    def get_sentence_list(self):
        """Return the list of (joined) sentences."""
        return self.sentence_list

    def pause_play(self):
        """
        Pauses the audio or ends the Pause
        """
        self.audio_service.pause_play()

    def stop_play(self):
        """
        Stops the Story. Used for restart.
        """
        self.pool.terminate()
        self.audio_service.stop_play()

    def join_short_sentences(self):
        """
        Return the sentences with every fragment of four words or fewer
        appended to the sentence before it. A short fragment at the very
        beginning is kept as its own entry.
        """
        result_list = []
        for sentence in self.sentence_list:
            if len(sentence.split()) > 4 or not result_list:
                result_list.append(sentence)
            else:
                result_list[-1] += sentence
        return result_list
from multiprocessing import Pool, Process
from multiprocessing.pool import ThreadPool
import time


def test(pair):
    """Sleep on a nested thread pool, then return the squares of the pair.

    :param pair: two-item sequence ``(a, b)``.
    :return: tuple ``(a ** 2, b ** 2)``.
    """
    # Explicit unpacking: tuple parameters in the signature are Python 2 only.
    a, b = pair
    print('test %s %s' % (a, b))
    p = ThreadPool()
    try:
        p.map_async(time.sleep, [3, 2, 1]).get(120)
    finally:
        # Release the nested pool's threads; it was previously never shut down.
        p.terminate()
    return a ** 2, b ** 2


if __name__ == '__main__':
    # Created outside the try block so the handler never sees an unbound pool.
    p = ThreadPool()
    try:
        # map_async(...).get(timeout) rather than map(), so the main thread
        # waits with a timeout.
        results = p.map_async(
            test, [(1, 2), (3, 4), (4, 4), (5, 4), (6, 4), (7, 4)]).get(120)
        p.close()
        p.join()
        print(results)
    except KeyboardInterrupt:
        print('Catch KeyboardInterrupt in main')
        p.terminate()
if __name__ == '__main__':
    # Script entry point: read the arguments, build one command per device
    # and run connection_star_thread for every device on a thread pool.
    switch, user, command, debug, parameters, logger, key_based = get_argument()
    debug_logging(debug)
    if parameters is not None:
        parameters = parameters_validate(parameters, len(switch))
    command = command_to_send(command, parameters, len(switch))
    logging.debug(
        "User: %s\nNetwork devices: %s\nCommands: %s\nDebug: %s\nLogging: %s\n",
        user, switch, command, debug, logger)
    if key_based:
        # No password is requested when key_based is set.
        passwd = ''
    else:
        passwd = getpass.getpass('Please provide password for user %s:' % user)
    # One tuple per (device, command) pair; user, password and logger are
    # repeated for each. zip() stops at the shortest input and, unlike
    # itertools.izip, exists on both Python 2 and 3.
    data_holder = zip(switch, command, itertools.repeat(user),
                      itertools.repeat(passwd), itertools.repeat(logger))
    pool = ThreadPool()
    try:
        pool.map(connection_star_thread, data_holder)
        pool.close()
        pool.join()
        gracefully_exit()
    except KeyboardInterrupt:
        logging.error("Caught keyboard interrupt. Killing all threads.")
        pool.terminate()
        pool.join()
        gracefully_exit()
def _multipart_upload(syn, filename, contentType, get_chunk_function, md5, fileSize, partSize=None, **kwargs):
    """
    Multipart Upload.

    Starts (or resumes) the upload, then repeatedly uploads the parts still
    missing on a pool of 8 workers and re-reads the upload status. A round
    that completes no additional part counts as a retry; the loop stops
    after ``MAX_RETRIES`` such rounds or as soon as the upload is reported
    as completed.

    :param syn: a Synapse object
    :param filename: a string containing the base filename
    :param contentType: contentType_
    :param get_chunk_function: a function that takes a part number and size
                               and returns the bytes of that chunk of the file
    :param md5: MD5 as hex, forwarded to :py:func:`_start_multipart_upload`
                (presumably of the whole file - confirm)
    :param fileSize: total number of bytes
    :param partSize: number of bytes per part. Minimum 5MB.

    :return: a MultipartUploadStatus_ object
    :raises SynapseError: if the upload is not in state ``COMPLETED`` when
                          the loop ends

    Keyword arguments are passed down to :py:func:`_start_multipart_upload`.

    .. MultipartUploadStatus: http://rest.synapse.org/org/sagebionetworks/repo/model/file/MultipartUploadStatus.html
    .. contentType: https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.17
    """
    partSize = calculate_part_size(fileSize, partSize, MIN_PART_SIZE, MAX_NUMBER_OF_PARTS)
    status = _start_multipart_upload(syn, filename, md5, fileSize, partSize, contentType, **kwargs)

    ## only force restart once
    kwargs['forceRestart'] = False

    completedParts = count_completed_parts(status.partsState)
    retries = 0
    mp = Pool(8)
    try:
        while retries < MAX_RETRIES:
            ## keep track of the number of bytes uploaded so far
            completed = Value('d', min(completedParts * partSize, fileSize))
            printTransferProgress(completed.value, fileSize, prefix='Uploading', postfix=filename)

            def chunk_upload(part):
                # ``status`` and ``completed`` are read when the call runs;
                # ``t0`` is taken per call, as before.
                return _upload_chunk(part, completed=completed, status=status,
                                     syn=syn, filename=filename,
                                     get_chunk_function=get_chunk_function,
                                     fileSize=fileSize, partSize=partSize,
                                     t0=time.time())

            url_generator = _get_presigned_urls(syn, status.uploadId,
                                                find_parts_to_upload(status.partsState))
            mp.map(chunk_upload, url_generator)

            # Check if there are still parts
            status = _start_multipart_upload(syn, filename, md5, fileSize, partSize, contentType, **kwargs)
            oldCompletedParts, completedParts = completedParts, count_completed_parts(status.partsState)
            progress = completedParts > oldCompletedParts
            if not progress:
                retries += 1

            ## Are we done, yet?
            if completed.value >= fileSize:
                try:
                    status = _complete_multipart_upload(syn, status.uploadId)
                    if status.state == "COMPLETED":
                        break
                except Exception as ex1:
                    # Best effort: report the failure and let the loop retry.
                    sys.stderr.write(str(ex1) + "\n")
    finally:
        mp.terminate()

    if status["state"] != "COMPLETED":
        raise SynapseError("Upload {id} did not complete. Try again.".format(id=status["uploadId"]))

    return status