def scrap_all_comments(base_url, urls, max_workers=256):
    """Scrape comments for every URL that has not been visited yet.

    ``urls`` holds 3-tuples; the second item is the page URL and the third is
    its visited flag. No more than ``max_workers`` ``scrap_comments`` calls
    are in flight at once. For each call that succeeds,
    ``update_page_visit_status(base_url, url, True)`` is recorded and a
    non-empty result is stored with ``insert_all_rating``. A call that raises
    is only reported through ``tqdm.write``.
    """
    pending_urls = [page_url for _, page_url, visited in urls if not visited]
    url_source = iter(pending_urls)
    # The bar starts at the number of pages that were already visited.
    progress = tqdm(initial=len(urls) - len(pending_urls), total=len(urls))

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        in_flight = {}

        def schedule(count):
            # Take up to ``count`` further URLs from the shared iterator.
            for page_url in itertools.islice(url_source, count):
                in_flight[pool.submit(scrap_comments, url=page_url)] = page_url

        schedule(max_workers)
        while in_flight:
            finished, _ = concurrent.futures.wait(
                in_flight, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for task in finished:
                progress.update(1)
                page_url = in_flight.pop(task)
                try:
                    comments = task.result()
                except Exception as exc:
                    tqdm.write(f"{page_url} generated an exception: {exc}")
                    continue
                with DimnaDatabase(db_path, logger) as db:
                    db.update_page_visit_status(base_url, page_url, True)
                    if comments:
                        db.insert_all_rating(base_url, comments)
            # Replace every finished task so the pool stays saturated.
            schedule(len(finished))
    progress.close()
def get_all_comments(restaurants_url, pages_tracker=None, max_workers=64):
    """Fetch comment pages concurrently, store their ratings and return them.

    Parameters
    ----------
    restaurants_url : sequence of (restaurant_url, page_number) pairs
        Each pair is passed to ``get_page_comments``.
    pages_tracker : dict, optional
        Maps a restaurant URL to a mutable two-item sequence. Item 1 is
        incremented for every page fetched successfully; once it reaches
        item 0 the restaurant is flagged through
        ``update_page_visit_status``. When empty or ``None`` no visit status
        is written.
    max_workers : int
        Upper bound on concurrently running fetches.

    Returns
    -------
    list
        The results of the successful ``get_page_comments`` calls, in
        completion order.
    """
    # ``None`` replaces the former mutable ``{}`` default; both are falsy, so
    # callers that omit the argument see the same behaviour.
    restaurants_url_to_do_iterator = iter(restaurants_url)
    pages_comments = []
    pbar = tqdm(total=len(restaurants_url))
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for restaurant_url, page_number in itertools.islice(
            restaurants_url_to_do_iterator, max_workers
        ):
            future = executor.submit(get_page_comments, restaurant_url, page_number)
            futures[future] = restaurant_url

        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                pbar.update(1)
                restaurant_url = futures.pop(future)
                try:
                    comments = future.result()
                except Exception as exc:
                    tqdm.write(f"{restaurant_url} generated an exception: {exc}")
                else:
                    if pages_tracker:
                        progress = pages_tracker[restaurant_url]
                        progress[1] += 1
                        if progress[1] >= progress[0]:
                            with DimnaDatabase(db_path, logger) as db:
                                db.update_page_visit_status(
                                    base_url, restaurant_url, True,
                                )
                    pages_comments.append(comments)
                    with DimnaDatabase(db_path, logger) as db:
                        for comment, rating in comments["comments"]:
                            # NUL characters are stripped before the insert.
                            db.insert_rating(
                                base_url, comment.replace("\x00", ""), rating
                            )

            # Submit one new page per finished future.
            for restaurant_url, page_number in itertools.islice(
                restaurants_url_to_do_iterator, len(done)
            ):
                future = executor.submit(
                    get_page_comments, restaurant_url, page_number
                )
                futures[future] = restaurant_url
    pbar.close()
    return pages_comments
def callback(f):
    """Done-callback: store the future's result, or report why that failed.

    On success the result is saved in ``dataHash[data]`` and ``data`` is
    removed from ``futures``. Any ``Exception`` raised along the way is
    written to ``sys.stderr`` instead of propagating.

    :param f: the completed future
    """
    try:
        dataHash[data] = f.result()
        futures.pop(data)
    except Exception as exc:
        # Report the exception that was actually caught: ``f.exception()`` is
        # ``None`` when the failure came from ``futures.pop`` and raises when
        # the future was cancelled.
        sys.stderr.write("Invoke error for {} for {}\n".format(exc, data))
def find_all_doctors_url(base_url, cities_url, max_workers=128):
    """Collect doctor page URLs for every city using a thread pool.

    Each entry of ``cities_url`` is handed to ``find_doctors_url``, with at
    most ``max_workers`` calls in flight. A successful result is stored with
    ``insert_all_pages_url``; a call that raises is reported through
    ``tqdm.write``.
    """
    city_source = iter(cities_url)
    progress = tqdm(total=len(cities_url))

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        in_flight = {}

        def schedule(count):
            # Take up to ``count`` further cities from the shared iterator.
            for city in itertools.islice(city_source, count):
                task = pool.submit(find_doctors_url, base_url=base_url, city_url=city)
                in_flight[task] = city

        schedule(max_workers)
        while in_flight:
            finished, _ = concurrent.futures.wait(
                in_flight, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for task in finished:
                progress.update(1)
                city = in_flight.pop(task)
                try:
                    doctors_url = task.result()
                except Exception as exc:
                    tqdm.write(f"{city} generated an exception: {exc}")
                    continue
                with DimnaDatabase(db_path, logger) as db:
                    db.insert_all_pages_url(base_url, doctors_url)
            # Replace every finished task so the pool stays saturated.
            schedule(len(finished))
    progress.close()
def test_plain_future():
    """A plain executor future can be added to, queried in and popped from a FutureCollection."""
    key = 'fibonacci'
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    collection = FutureCollection()

    task = pool.submit(fib, 33)
    collection.add(key, task)

    # Checked right after submission, before fib(33) is expected to finish.
    assert collection.done(key) is False
    assert collection._state(key) is not None
    assert task in collection

    collection.pop(key)
    assert task not in collection
def process_urls_parallel(analysis_urls, script_file, container_timeout,
                          max_containers):
    """Visit URLs in batches of containers and record each visit's outcome.

    :param analysis_urls: mapping of URL id -> URL; it is copied, not mutated
    :param script_file: passed through to ``initiate_container``
    :param container_timeout: seconds to wait for a batch, also used as the
        per-future result timeout
    :param max_containers: batch size and thread-pool size
    :returns: list of ids (as strings) for which ``export_log`` returned a
        positive value
    """
    futures = {}
    processed_url_ids = []
    urls = analysis_urls.copy()
    with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_containers) as executor:
        while len(urls) > 0:
            ## Submit jobs to container ##
            for _ in range(min(len(urls), max_containers)):
                # ``urls.keys()[0]`` only works on Python 2; ``next(iter())``
                # picks the same first key on both major versions.
                url_id = next(iter(urls))
                url = urls.pop(url_id)
                future = executor.submit(initiate_container, url, str(url_id),
                                         script_file, 0, container_timeout)
                futures[future] = str(url_id)

            completed, timed_out = concurrent.futures.wait(
                futures, timeout=container_timeout,
                return_when=concurrent.futures.ALL_COMPLETED)

            for future in completed:
                container_id = futures.pop(future)
                res = -1
                try:
                    res = future.result(timeout=container_timeout)
                except Exception as exc:
                    print(get_time() + 'Container_' + str(container_id) + ': Exception ')
                    print(exc)
                # The outcome is decided by the value export_log returns.
                res = export_log(container_id)
                if res > 0:
                    print(get_time() + 'Container_' + str(container_id) + ': URL Visited successfully!!')
                    api_requests.update_url_api(container_id, 'is_visited', 'true')
                    api_requests.update_url_api(container_id, 'visit_status', '1')
                    processed_url_ids.append(container_id)
                elif res == -99:
                    print(get_time() + 'Container_' + str(container_id) + ': Chromium Crashed!!')
                    api_requests.update_url_api(container_id, 'visit_status', '3')
                else:
                    print(get_time() + 'Container_' + str(container_id) + ': URL Visit failed!!')
                    api_requests.update_url_api(container_id, 'visit_status', '2')

            # Containers that did not finish in time are stopped and left
            # flagged as not visited.
            for future in timed_out:
                container_id = futures.pop(future)
                print(get_time() + 'Container_' + str(container_id) + ': Timeout occured!!')
                stop_container(container_id)
                export_log(container_id)
                api_requests.update_url_api(container_id, 'is_visited', 'false')
    return processed_url_ids
def simpleParallelZstdReading(filename):
    """Read every block of an indexed zstd file with a thread pool and print the elapsed time.

    Block boundaries come from ``IndexedZstdFile.block_offsets()``; each block
    is read by ``readBlock(filename, offset, size)`` on a worker thread. The
    number of outstanding reads is kept below the worker count.
    """
    # os.cpu_count() may return None, which would break the ``>=`` below.
    parallelization = os.cpu_count() or 1
    with concurrent.futures.ThreadPoolExecutor(parallelization) as pool:
        futures = []
        with indexed_zstd.IndexedZstdFile(filename) as file:
            offsets = np.array(list(file.block_offsets().values()))
        sizes = offsets[1:] - offsets[:-1]
        t0 = time.time()
        for offset, size in zip(offsets[:-1], sizes):
            futures.append(pool.submit(readBlock, filename, offset, size))
            # Throttle: wait for the oldest read once the window is full.
            while len(futures) >= parallelization:
                futures.pop(0).result()
        # Wait for the remaining reads before stopping the timer, so they are
        # included in the measurement and their exceptions are surfaced.
        for future in futures:
            future.result()
        t1 = time.time()
        print(f"Reading in parallel with a thread pool took {t1-t0:.3f}s")
def find_all_comments_pages(pages_url, max_workers=128):
    """List every comment-page URL for the books that are not visited yet.

    ``pages_url`` holds 3-tuples; the second item is the book URL and the
    third is its visited flag. ``find_number_of_comments`` is called for each
    pending book on a thread pool. A book with a truthy page count
    contributes one ``[book_url, comment_url]`` entry per page; a book with a
    falsy count is recorded through ``update_page_visit_status``.

    Returns the list of ``[book_url, comment_url]`` entries.
    """
    pending_books = [book for _, book, visited in pages_url if not visited]
    book_source = iter(pending_books)
    progress = tqdm(initial=len(pages_url) - len(pending_books), total=len(pages_url))
    comments_url = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        in_flight = {}

        def schedule(count):
            # Queue the first comment page of up to ``count`` more books.
            for book in itertools.islice(book_source, count):
                book_id, book_name = book.split("/")[-2:]
                first_comment_url = f"{comments_base_url}/{book_id}/{book_name}.json"
                task = pool.submit(
                    find_number_of_comments, comment_url=first_comment_url
                )
                in_flight[task] = book

        schedule(max_workers)
        while in_flight:
            finished, _ = concurrent.futures.wait(
                in_flight, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for task in finished:
                progress.update(1)
                book = in_flight.pop(task)
                book_id, book_name = book.split("/")[-2:]
                try:
                    num_pages = task.result()
                except Exception as exc:
                    tqdm.write(f"{book} generated an exception: {exc}")
                    continue
                if not num_pages:
                    with DimnaDatabase(db_path, logger) as db:
                        db.update_page_visit_status(base_url, book, True)
                    continue
                for page in range(1, num_pages + 1):
                    comment_url = f"{comments_base_url}/{book_id}/{book_name}.json?p={page}"
                    comments_url.append([book, comment_url])
            schedule(len(finished))
    progress.close()
    return comments_url
def dask_executor(items, function, accumulator, **kwargs):
    """Map ``function`` over ``items`` with dask futures and accumulate the output.

    Parameters
    ----------
        items : list
            Input arguments, one per task
        function : callable
            Called on each input; returns an accumulator instance
        accumulator : AccumulatorABC
            Receives the reduced output of all tasks
        client : distributed.client.Client
            A dask distributed client instance (required keyword)
        treereduction : int, optional
            Number of outputs merged per reduction task (default: 20)
        status : bool, optional
            Show a progress bar when true (default)
        compression : int, optional
            LZ4 level used to compress outputs in flight (default 1);
            ``None`` disables compression
        priority : int, optional
            Task priority, default 0
        heavy_input : serializable, optional
            Broadcast to the workers and paired with every item, so that
            ``function`` receives ``(item, heavy_input)``
        function_name : str, optional
            Name of the function being passed

    Returns
    -------
        The ``accumulator`` after the reduced result has been added to it;
        returned untouched when ``items`` is empty.
    """
    if len(items) == 0:
        return accumulator

    client = kwargs.pop('client')
    defaults = (
        ('treereduction', 20),
        ('status', True),
        ('compression', 1),
        ('priority', 0),
        ('heavy_input', None),
        ('function_name', None),
    )
    options = {key: kwargs.pop(key, fallback) for key, fallback in defaults}
    ntree = options['treereduction']
    clevel = options['compression']
    priority = options['priority']
    heavy_input = options['heavy_input']

    reducer = _reduce()
    if clevel is not None:
        function = _compression_wrapper(
            clevel, function, name=options['function_name'])
        reducer = _compression_wrapper(clevel, reducer)

    if heavy_input is not None:
        heavy_token = client.scatter(heavy_input, broadcast=True, hash=False)
        items = list(zip(items, repeat(heavy_token)))

    pending = client.map(function, items, priority=priority)
    # Tree reduction: merge groups of ``ntree`` futures until one is left.
    while len(pending) > 1:
        groups = [
            pending[start:start + ntree]
            for start in range(0, len(pending), ntree)
        ]
        pending = client.map(reducer, groups, priority=priority)

    if options['status']:
        from dask.distributed import progress
        # FIXME: fancy widget doesn't appear, have to live with boring pbar
        progress(pending, multi=True, notebook=False)

    final = pending.pop()
    accumulator += _maybe_decompress(final.result())
    return accumulator
def process_futures(futures, encoding):
    """Write the results of all finished futures to stdout and drop them from ``futures``.

    Futures that are not done stay in the list, in their original order.
    A finished future is removed; its exception, if any, is logged, and a
    truthy result is written to ``sys.stdout`` (unicode results are encoded
    with ``encoding`` first). Errors while writing are logged as well.
    """
    def emit(line):
        try:
            if isinstance(line, unicode):
                line = line.encode(encoding)
            sys.stdout.write(line)
        except Exception as ex:
            LOGGER.error(str(ex))

    position = 0
    while position < len(futures):
        candidate = futures[position]
        if candidate.done():
            # Removing the entry shifts the next one into ``position``.
            futures.pop(position)
            failure = candidate.exception()
            if failure:
                LOGGER.error(str(failure))
            else:
                line = candidate.result()
                if line:
                    emit(line)
        else:
            position = position + 1
def scrap_all_comments(comments_url, max_workers=128):
    """Scrape every comment page on a thread pool and store the ratings.

    ``comments_url`` is a sequence of ``(book_url, comment_url)`` pairs. Each
    ``comment_url`` is passed to ``scrap_comments``; on success
    ``update_page_visit_status(base_url, book_url, True)`` is recorded and the
    result is stored with ``insert_all_rating``. A call that raises is
    reported through ``tqdm.write``.

    ``max_workers`` bounds both the pool size and the number of queued calls.
    """
    comments_url_iterator = iter(comments_url)
    pbar = tqdm(total=len(comments_url))
    # The pool size was hard-coded to 128, which silently ignored the
    # ``max_workers`` argument; the default keeps the old value.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {}
        for book_url, comment_url in itertools.islice(
            comments_url_iterator, max_workers
        ):
            future = executor.submit(scrap_comments, comment_url=comment_url)
            futures[future] = book_url

        while futures:
            done, _ = concurrent.futures.wait(
                futures, return_when=concurrent.futures.FIRST_COMPLETED
            )
            for future in done:
                pbar.update(1)
                book_url = futures.pop(future)
                try:
                    comments = future.result()
                except Exception as exc:
                    tqdm.write(f"{book_url} generated an exception: {exc}")
                else:
                    with DimnaDatabase(db_path, logger) as db:
                        db.update_page_visit_status(
                            base_url, book_url, True,
                        )
                        db.insert_all_rating(base_url, comments)

            # Submit one new page per finished future.
            for book_url, comment_url in itertools.islice(
                comments_url_iterator, len(done)
            ):
                future = executor.submit(scrap_comments, comment_url=comment_url)
                futures[future] = book_url
    pbar.close()
def _map_gen(self, futures: List[ConcurrentFuture], end_time=None) -> Generator: """The generator that ``map`` return when ``awaitable`` is False.""" try: while futures: future = futures.pop(0) if end_time is not None: yield from future.result(end_time - time.time()) else: yield from future.result() # Finally clause for generator exit and timeout. finally: future.cancel() # This future may have been done. for future in futures: future.cancel()
def process_urls_parallel(analysis_urls, script_file, container_timeout,
                          max_containers):
    """Run or resume containers for the URLs in batches and export their data.

    :param analysis_urls: mapping of URL id -> dict with keys ``'url'`` and
        ``'count'``; it is copied, not mutated
    :param script_file: passed through to the container functions
    :param container_timeout: seconds allowed for a whole batch to complete
    :param max_containers: batch size and process-pool size
    :returns: set of ids (as strings) for which ``export_container`` returned
        a truthy value
    """
    futures = {}
    processed_url_ids = set()
    urls = analysis_urls.copy()

    def _export_and_stop(container_id, visit_count, message):
        # Log the outcome, export the container's data, then stop it.
        try:
            logging.info(get_time() + 'Container_' + str(container_id) + message)
        except concurrent.futures.TimeoutError as ex:
            logging.info(get_time() + 'Container_' + str(container_id) + ': Timeout occured!!')
        except Exception as exc:
            logging.info(get_time() + 'Container_' + str(container_id) + ': Exception ')
            logging.info(exc)
        res = export_container(container_id, visit_count)
        stop_container(container_id)
        if res:
            processed_url_ids.add(container_id)

    with concurrent.futures.ProcessPoolExecutor(
            max_workers=max_containers) as executor:
        while len(urls) > 0:
            ## Submit jobs to container ##
            for i in range(min(len(urls), max_containers)):
                # ``urls.keys()[0]`` only works on Python 2.
                url_id = next(iter(urls))
                itm = urls.pop(url_id)
                url = itm['url']
                visit_count = itm['count']
                # Pause for 200 seconds before every fifth submission.
                if i != 0 and i % 5 == 0:
                    time.sleep(200)
                if visit_count == 0:
                    ## initiates docker container for the first time
                    target = initiate_container
                else:
                    ## Resumes docker container and waits for notifications
                    target = resume_container
                future = executor.submit(target, url, str(url_id), script_file,
                                         visit_count, container_timeout)
                futures[future] = (str(url_id), visit_count)

            try:
                ## Keep docker container active for specific duration and stop the container and export data
                for future in concurrent.futures.as_completed(
                        futures, timeout=container_timeout):
                    container_id, v_count = futures.pop(future)
                    _export_and_stop(container_id, v_count, ': Completed successfully!!')
            except Exception as e:
                ## Stop the containers that didn't complete before timeout and export data
                # Iterate over a snapshot: popping while iterating the live
                # key view raises RuntimeError on Python 3.
                for future in list(futures):
                    container_id, v_count = futures.pop(future)
                    _export_and_stop(container_id, v_count, ': Timeout Occured!!')
    return processed_url_ids
def generate( futures: List[concurrent.futures.Future], ) -> Generator[PushResult, None, None]: nonlocal reretrieve num_cached = 0 num_pushes = len(pushes) for _ in tqdm(range(num_pushes)): push = pushes.pop(0) cached = futures.pop(0).result() semaphore.release() # Regenerating a large amount of data when we update the mozci regression detection # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we # run. if cached: value, mozci_version = cached # Regenerate results which were generated with an older version of mozci. if reretrieve > 0 and mozci_version != MOZCI_VERSION: cached = None reretrieve -= 1 # Regenerate results which don't contain the fix revision. elif len(value) != 5: cached = None if cached: num_cached += 1 value, mozci_version = cached assert len(value) == 5 yield value else: logger.info( f"Analyzing {push.rev} at the {granularity} level...") key = cache_key(push) try: if granularity == "label": runnables = push.task_labels elif granularity == "group": runnables = push.group_summaries.keys() elif granularity == "config_group": runnables = push.config_group_summaries.keys() value = ( tuple(push.revs), push.backedoutby or push.bustage_fixed_by, tuple(runnables), tuple(push.get_possible_regressions(granularity)), tuple(push.get_likely_regressions(granularity)), ) mozci.config.cache.put( key, (value, MOZCI_VERSION), mozci.config["cache"]["retention"], ) assert len(value) == 5 yield value except mozci.errors.MissingDataError: logger.warning( f"Tasks for push {push.rev} can't be found on ActiveData" ) except Exception: traceback.print_exc() logger.info( f"{num_cached} pushes were already cached out of {num_pushes}")
def callback(f):
    """Done-callback: print the future's result, or report why that failed.

    On success the result is printed and ``data`` is removed from
    ``futures``. Any ``Exception`` raised along the way is reported with
    ``print`` instead of propagating.

    :param f: the completed future
    """
    try:
        print(f.result())
        futures.pop(data)
    except Exception as exc:
        # Report the exception that was actually caught: ``f.exception()`` is
        # ``None`` when the failure came from ``futures.pop`` and raises when
        # the future was cancelled.
        print("Please handle {} for {}.".format(exc, data))
# Decode the first video stream of the file named on the command line and
# convert its frames on a pool of 8 worker threads.
video = av.open(sys.argv[1])

stream = next(s for s in video.streams if s.type == b'video')

frame_count = 0

with ThreadPoolExecutor(8) as executor:
    futures = []
    for packet in video.demux(stream):
        for frame in packet.decode():
            frame_count += 1
            # reformat is not very thread happy
            new_frame = frame.reformat(1920, 1080, 'rgb48le')
            futures.append(executor.submit(convert, new_frame, frame_count, 'dpx'))

            # Keep at most 8 * 4 conversions queued; wait for the oldest
            # one once the backlog grows past that.
            while len(futures) > 8 * 4:
                f = futures.pop(0)
                f.result()

        # Stop reading packets once more than 100 frames were submitted.
        if frame_count > 100:
            break

    # Wait for the remaining conversions and surface their exceptions.
    for f in concurrent.futures.as_completed(futures):
        r = f.result()

# Single-argument call form: prints the same text on Python 2 and Python 3.
print("completed in %i secs" % (time.time() - start))
def generate(
    futures: List[concurrent.futures.Future],
) -> Generator[PushResult, None, None]:
    """Yield one result tuple per push, reusing cached results where possible.

    ``pushes`` and ``futures`` are consumed from the front in lockstep; each
    future resolves to the cached ``(value, mozci_version)`` pair for its
    push, or to a falsy value. Up to ``OLD_RESULTS_TO_REGENERATE``
    (environment variable, default 0) stale cache entries are discarded and
    recomputed per run. Pushes without a usable cache entry are analyzed,
    stored in the cache and yielded; failures are logged and the push is
    skipped.
    """
    cache_hits = 0
    total = len(pushes)

    # Regenerating a large amount of data when we update the mozci regression detection
    # algorithm is currently pretty slow, so we only regenerate a subset of pushes whenever we
    # run.
    to_regenerate = int(os.environ.get("OLD_RESULTS_TO_REGENERATE", 0))

    for _ in tqdm(range(total)):
        push = pushes.pop(0)
        cached = futures.pop(0).result()

        semaphore.release()

        if cached and to_regenerate > 0:
            value, mozci_version = cached
            stale = (
                # Results which were generated when we were not cleaning
                # up WPT groups.
                (
                    granularity == "group"
                    and any(runnable.startswith("/") for runnable in value[1])
                )
                # Results which were generated when we didn't get a correct
                # configuration for test-verify tasks.
                or (
                    granularity == "config_group"
                    and any("test-verify" in runnable[0] for runnable in value[1])
                )
                # Results which were generated with an older version of mozci.
                or mozci_version != MOZCI_VERSION
            )
            if stale:
                cached = None
                to_regenerate -= 1

        if cached:
            cache_hits += 1
            value, mozci_version = cached
            yield value
            continue

        logger.info(f"Analyzing {push.rev} at the {granularity} level...")

        key = cache_key(push)

        try:
            if granularity == "label":
                runnables = push.task_labels
            elif granularity == "group":
                runnables = push.group_summaries.keys()
            elif granularity == "config_group":
                runnables = push.config_group_summaries.keys()

            value = (
                push.revs,
                tuple(runnables),
                tuple(push.get_possible_regressions(granularity)),
                tuple(push.get_likely_regressions(granularity)),
            )
            retention = adr.config["cache"]["retention"]
            adr.config.cache.put(key, (value, MOZCI_VERSION), retention)
            yield value
        except adr.errors.MissingDataError:
            logger.warning(
                f"Tasks for push {push.rev} can't be found on ActiveData"
            )
        except Exception:
            traceback.print_exc()

    logger.info(f"{cache_hits} pushes were already cached out of {total}")
async def execute(self):
    """Send the queued bulk JSON-RPC requests and resolve their futures.

    The queued data and futures are detached from the client before the
    request is sent. ``middleware.before_request`` hooks may replace the
    outgoing data and ``middleware.after_request`` hooks the decoded
    response; both may be coroutines. Failed requests are retried after a
    random sub-second delay while ``should_retry`` is set, until the
    request timeout is exceeded (HTTP 599 and 502 are always retried).

    :returns: the processed results in response order, with ``None`` for
        entries that carried an error; ``[]`` when nothing was queued
    :raises Exception: when no bulk request was started, or the last request
        error once retrying is given up
    """
    if not self._bulk_mode:
        raise Exception("No Bulk request started")
    if len(self._bulk_data) == 0:
        return []

    # Detach the queued work so new requests can be queued meanwhile.
    data = self._bulk_data[:]
    self._bulk_data = []
    futures = self._bulk_futures.copy()
    self._bulk_futures = {}

    req_start = time.time()

    for fn in self.middleware.before_request:
        res = fn(data)
        if asyncio.iscoroutine(res):
            res = await res
        if res is not None:
            data = res

    retries = 0
    while True:
        try:
            resp = await self._httpclient.fetch(
                self._url, method="POST", body=data,
                request_timeout=60.0  # higher request timeout than other operations
            )
        except concurrent.futures.CancelledError:
            raise
        except Exception as e:
            if self.should_retry and isinstance(e, HTTPError) and (e.status == 599 or e.status == 502):
                # always retry after 599
                pass
            elif not self.should_retry or time.time() - req_start >= self._request_timeout:
                # give up after the request timeout
                raise
            # Only the first failure is logged with its traceback.
            if retries == 0:
                logger = self.log.exception
            else:
                logger = self.log.error
            logger("Error in JsonRPCClient.execute: retry {}".format(retries))
            retries += 1
            await asyncio.sleep(random.random())
            continue
        break

    rvals = await resp.json()

    for fn in self.middleware.after_request:
        res = fn(data, rvals)
        if asyncio.iscoroutine(res):
            res = await res
        if res is not None:
            rvals = res

    results = []
    for rval in rvals:
        if 'id' not in rval:
            continue
        future, result_processor = futures.pop(rval['id'], (None, None))
        if future is None:
            self.log.warning("Got unexpected id in jsonrpc bulk response")
            continue
        if "error" in rval:
            error = rval['error']
            future.set_exception(JsonRPCError(
                rval['id'], error['code'], error['message'],
                error['data'] if 'data' in error else None))
            result = None
        else:
            if result_processor:
                result = result_processor(rval['result'])
            else:
                result = rval['result']
            future.set_result(result)
        results.append(result)

    if len(futures):
        self.log.warning("Found some unprocessed requests in bulk jsonrpc request")
        # Iterate the (future, processor) values; iterating the dict itself
        # yields the request ids and never reached the futures.
        for future, _ in futures.values():
            future.set_exception(Exception("Unexpectedly missing result"))

    return results
def summary_tables(name, tables_only=False):
    """
    Creates the summary tables in a schema.

    The 'initial' and 'final' SQL files run in sequence; the 'middle' files
    run in a process pool, each one submitted once the files it depends on
    have completed.

    :param str name: the schema's name
    :param boolean tables_only: whether to create SQL tables instead of SQL views
    """
    log = logging.getLogger('ocdskingfisher.summarize.summary-tables')
    started = time()

    files = {}
    for stage in ('initial', 'middle', 'final'):
        files[stage] = sql_files(stage, tables_only=tables_only)
    graph = dependency_graph(files['middle'])

    def run_in_sequence(stage):
        """
        Runs the files of a sub-directory one after the other.

        :param str stage: a sub-directory containing SQL files
        """
        for identifier, content in files[stage].items():
            _run_file(name, identifier, content)

    def submit_if_ready(identifier):
        """
        Submits a file and drops it from the dependency graph, unless it still has unmet dependencies.

        :param str identifier: the identifier of a SQL file
        """
        if graph[identifier]:
            return
        del graph[identifier]
        task = executor.submit(_run_file, name, identifier, files['middle'][identifier])
        futures[task] = identifier

    # The initial files are fast, and don't need multiprocessing.
    run_in_sequence('initial')

    futures = {}
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # Submit files whose dependencies are met.
        for identifier in list(graph):
            submit_if_ready(identifier)

        # The for-loop terminates after its given futures, so it needs to start again with new futures.
        while futures:
            for task in concurrent.futures.as_completed(futures):
                task.result()
                completed = futures.pop(task)

                # Update dependencies, and submit files whose dependencies are met.
                for identifier in list(graph):
                    graph[identifier].discard(completed)
                    submit_if_ready(identifier)

    # The final files are fast, and can also deadlock.
    run_in_sequence('final')

    log.info('Total time: %ss', time() - started)
# Build a TournamentRun for every (Name, Year) row that is not stored yet.
mongo_download = MongoDownload(mongo_obj)
tournaments_scraped = mongo_download.getTournamentsScraped()
name_year_pairs = tournament_df[['Name', 'Year']].apply(tuple, axis=1)
filter_tournaments = tournament_df[~name_year_pairs.isin(tournaments_scraped)]
tournaments = filter_tournaments.apply(
    lambda row: TournamentRun(row[0], row[1], mongo_obj, main_logger), axis=1
).tolist()
iter_tournaments = iter(tournaments)

with concurrent.futures.ThreadPoolExecutor(max_workers=max_drivers) as executor:
    # Only schedule max_drivers amount of futures to start
    futures = {}
    for tournament in itertools.islice(iter_tournaments, max_drivers):
        futures[executor.submit(tournament.runTournament, None, True)] = tournament

    while futures:
        # Block until at least one running tournament has finished.
        finished, _ = concurrent.futures.wait(
            futures, return_when=concurrent.futures.FIRST_COMPLETED
        )

        for future in finished:
            # get the completed tournament
            completed_tournament = futures.pop(future)
            outcome = future.result()
            main_logger.info('{}'.format(outcome))

        # Start one new tournament for each one that finished.
        for tournament in itertools.islice(iter_tournaments, len(finished)):
            futures.update(
                {executor.submit(tournament.runTournament, None, True): tournament}
            )

# Persist the failures collected on the first tournament's failed_scrape_list.
failed_scrape_df = pd.DataFrame(
    columns=['Name', 'Year'], data=tournaments[0].failed_scrape_list
)
failed_scrape_df.to_csv(
    'tournaments/FailedTournamentList.csv', index=False, header=True
)