def _push(self, data):
    """Override Consecution's push such that we can push in parallel"""
    if self._logging == "output":
        self._write_log(data)

    executor_kwargs = self.context.get("executor_kwargs", None) or {}
    with self.executor_class(**executor_kwargs) as executor:
        futures = []
        do_split = self.context.get("split", False)
        info(
            "%s: split=%s, %d downstream nodes"
            % (self.__class__.__name__, do_split, len(self._downstream_nodes)),
            label="push",
        )

        if do_split:
            # Split the data among the downstream nodes
            splits = divide_data(data, len(self._downstream_nodes))
            for i, split in enumerate(splits):
                node = self._downstream_nodes[i]
                futures.append(executor.submit(node._process, split))
        else:
            # Pass complete data to each downstream node
            for downstream in self._downstream_nodes:
                futures.append(executor.submit(downstream._process, data))

        # Wait for results
        for future in self.__class__.as_completed_func(futures):
            future.result()
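# A self-contained sketch (not the library's code) of the two push modes
# above, using plain concurrent.futures: with split enabled each downstream
# worker gets one slice of the data, otherwise every worker receives the full
# payload. The chunking helper defined here is a stand-in for the module's
# divide_data and its exact semantics are an assumption.
from concurrent.futures import ThreadPoolExecutor, as_completed


def _example_divide_data(data, n):
    # Naive chunking stand-in: consecutive, roughly equal slices
    chunk = max(1, len(data) // n)
    return [data[i:i + chunk] for i in range(0, len(data), chunk)]


def _example_process(payload):
    return len(payload)


def _example_push(data, worker_count=3, do_split=True):
    with ThreadPoolExecutor(max_workers=worker_count) as executor:
        if do_split:
            futures = [
                executor.submit(_example_process, split)
                for split in _example_divide_data(data, worker_count)
            ]
        else:
            futures = [
                executor.submit(_example_process, data) for _ in range(worker_count)
            ]
        # Wait for results, surfacing any worker exceptions
        for future in as_completed(futures):
            future.result()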
def consume(self, data=None, cleanup=None, split_count=None, synchronous=False,
            timeout=None, **node_contexts):
    """Set up node contexts and consume data with the pipeline

    Parameters
    ----------
    data : iterable, optional
        Iterable of data to consume
    cleanup : dict, optional
        A mapping of arg names to clean up functions to be run after data
        processing is complete.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is to inspect the celery app and set split_count = worker
        count.
    synchronous : bool, optional
        If False, return AsyncResults. If True, wait for tasks to complete
        and return their results, if any.
    timeout : int or float, optional
        If waiting for results, pass this as timeout to AsyncResult.get().
    **node_contexts
        Keyword arguments that are node_name->param_dict

    """
    if not split_count:
        dbg("determining split count from app celery worker count")
        app_stats = self.consume_task.app.control.inspect().stats()
        split_count = len(app_stats.keys())

    split_count = split_count_helper(data, split_count)
    if data is None:
        splits = [None for s in range(split_count)]
    else:
        splits = divide_data(data, split_count)
    dbg(
        "%s: data len: %s, splits: %d"
        % (self.__class__.__name__, size(data, "n/a"), split_count)
    )

    async_results = []
    for split in splits:
        async_results.append(
            self.consume_task.delay(
                self.pipeline, split, cleanup=cleanup, **node_contexts
            )
        )

    if synchronous:
        results = []
        for async_result in async_results:
            try:
                results.append(async_result.get(timeout=timeout))
            finally:
                async_result.forget()
        return results

    return async_results
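# Hedged sketch of the Celery task shape this consume() assumes: consume_task
# must accept the pipeline, one data split, and the keyword arguments
# forwarded via .delay(). The app configuration, broker URL, and task body
# are illustrative assumptions; consume refers to the module-level helper
# used by the executor-based variant below and its import path is hypothetical.
from celery import Celery

celery_app = Celery("pipeline_workers", broker="redis://localhost:6379/0")


@celery_app.task
def consume_task(pipeline, split, cleanup=None, **node_contexts):
    # Worker-side: run the ordinary, synchronous consume over this split
    return consume(pipeline, split, cleanup=cleanup, **node_contexts)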
def consume(self, data=None, cleanup=None, split_count=None, synchronous=False,
            timeout=None, **node_contexts):
    """Set up node contexts and consume data with the pipeline

    Parameters
    ----------
    data : iterable, optional
        Iterable of data to consume
    cleanup : dict, optional
        A mapping of arg names to clean up functions to be run after data
        processing is complete.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is to use executor._max_workers.
    synchronous : bool, optional
        If False, return Futures. If True, wait for futures to complete and
        return their results, if any.
    timeout : int or float, optional
        Raises a concurrent.futures.TimeoutError if __next__() is called and
        the result isn't available after timeout seconds from the original
        call to as_completed(). Ignored if synchronous=False.
    **node_contexts
        Keyword arguments that are node_name->param_dict

    """
    with self.get_executor() as executor:
        worker_count = self.get_worker_count(executor)
        split_count = split_count_helper(data, split_count or worker_count)
        if data is None:
            splits = [None for s in range(split_count)]
        else:
            splits = divide_data(data, split_count)

        futures = []
        info(
            "%s: data len: %s, splits: %d, workers: %d"
            % (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            )
        )
        for split in splits:
            futures.append(
                executor.submit(
                    consume, self.pipeline, split, cleanup=cleanup, **node_contexts
                )
            )

        if synchronous:
            return self.get_results(futures, timeout=timeout)

        return futures
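# A minimal sketch, assuming the get_executor()/get_worker_count() hooks used
# above wrap concurrent.futures. The mixin name is hypothetical; _max_workers
# is the attribute the docstring above refers to for the default split_count.
from concurrent.futures import ProcessPoolExecutor


class ExampleProcessPoolMixin:
    def get_executor(self):
        # The returned executor is used as a context manager by consume()
        return ProcessPoolExecutor()

    def get_worker_count(self, executor):
        # Default split_count falls back to the executor's worker count
        return executor._max_workers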
def consume(self, data=None, cleanup=None, split_count=None, synchronous=False,
            timeout=None, **node_contexts):
    """Set up node contexts and consume data with the pipeline

    Parameters
    ----------
    data : iterable, optional
        Iterable of data to consume
    cleanup : dict, optional
        A mapping of arg names to clean up functions to be run after data
        processing is complete.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is the number of workers in the provided queue.
    synchronous : bool, optional
        If False, return Jobs. If True, wait for jobs to complete and return
        their results, if any.
    timeout : int or float, optional
        If waiting for results, raise an exception if polling for all results
        takes longer than timeout seconds.
    **node_contexts
        Keyword arguments that are node_name->param_dict

    """
    if not split_count:
        dbg("determining split count from rq worker count")
        workers = Worker.all(queue=self.queue)
        split_count = len(workers)

    split_count = split_count_helper(data, split_count)
    if data is None:
        splits = [None for s in range(split_count)]
    else:
        splits = divide_data(data, split_count)
    dbg(
        "%s: data len: %s, splits: %d"
        % (self.__class__.__name__, size(data, "n/a"), split_count)
    )

    async_results = []
    for split in splits:
        async_results.append(
            self.queue.enqueue(
                rq_consume,
                args=(self.pipeline, split),
                kwargs=dict(cleanup=cleanup, **node_contexts),
            )
        )

    if synchronous:
        return get_async_results(async_results, timeout=timeout)

    return async_results
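# Hedged usage sketch for the RQ variant above: the queue name, connection,
# and variable names are illustrative, and rq_consume must be importable by
# the RQ workers. Worker.all(queue=...) is the same lookup used above to
# derive the default split_count.
from redis import Redis
from rq import Queue, Worker

redis_conn = Redis()
pipeline_queue = Queue("pipeline", connection=redis_conn)

# Number of live workers attached to this queue (drives the default split_count)
rq_worker_count = len(Worker.all(queue=pipeline_queue))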
def run(self, data, func, split_count=None, timeout=None, push_type=PushTypes.Async):
    """Use asyncio to apply func to data

    Parameters
    ----------
    data
        An iterable to process
    func : callable
        An async callable that will be passed data to operate on using
        asyncio.
    split_count : int, optional
        How many slices to split the data into for concurrent processing.
        Default is to set split_count = len(data).
    timeout : int or float, optional
        Time to wait for jobs to complete before raising an error. Ignored
        unless using a push_type that waits for results.
    push_type : str, optional
        If "async", push the Futures immediately. If "input", push the input
        data immediately after task submission. If "result", collect the
        result synchronously and push it.

    """
    split_count = split_count or len(data)
    splits = divide_data(data, split_count)
    info(
        "%s: data len: %s, splits: %s"
        % (self.__class__.__name__, size(data, "n/a"), split_count)
    )

    loop, close = get_or_create_event_loop()
    try:
        futures = [loop.create_task(func(split)) for split in splits]
        if push_type == PushTypes.Async:
            for future in futures:
                self.push(future)
        elif push_type == PushTypes.Input:
            self.push(data)
        elif push_type == PushTypes.Result:
            self.push(self.get_results(futures, timeout=timeout))
        else:
            raise AssertionError("Invalid push_type: %s" % push_type)
    finally:
        if close and push_type == PushTypes.Result:
            # We can only be sure it's safe to close the event loop if it was
            # created and all processing took place in here.
            loop.close()
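# Example shape of the async callable run() expects: it receives one split of
# the data and returns that split's processed result. The name and body are
# illustrative only.
import asyncio


async def example_score_rows(rows):
    # Pretend to do async I/O for this split, then return transformed rows
    await asyncio.sleep(0)
    return [row * 2 for row in rows]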
def get_splits(self, data, split_count):
    """Split the data into split_count slices"""
    return divide_data(data, split_count)
def run(self, data, func, executor=None, executor_kwargs=None, split_count=None,
        timeout=None, push_type=PushTypes.Async, **kwargs):
    """Use a parallel executor to apply func to data

    Parameters
    ----------
    data
        An iterable to process
    func : callable
        A callable that will be passed data to operate on in parallel
    executor : Executor, optional
        If passed, use this executor instead of creating one.
    executor_kwargs : dict, optional
        Keyword arguments to pass when initializing an executor.
    split_count : int, optional
        How many slices to split the data into for parallel processing.
        Default is to set split_count = number of workers.
    timeout : int or float, optional
        Time to wait for jobs to complete before raising an error. Ignored
        unless using a push_type that waits for results.
    push_type : str, optional
        If "async", push the Futures immediately. If "input", push the input
        data immediately after task submission. If "result", collect the
        result synchronously and push it.
    **kwargs
        Keyword arguments passed to the executor when submitting work

    """
    self.check_data(data)

    shutdown = True
    if executor:
        shutdown = False
    else:
        executor_kwargs = executor_kwargs or {}
        executor = self.get_executor(**executor_kwargs)

    try:
        worker_count = self.get_worker_count(executor)
        split_count = split_count or worker_count
        splits = divide_data(data, split_count)
        info(
            "%s: data len: %s, splits: %s, workers: %d"
            % (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            )
        )

        futures = self.submit(executor, func, splits, **kwargs)
        if push_type == PushTypes.Async:
            for future in futures:
                self.push(future)
        elif push_type == PushTypes.Input:
            self.push(data)
        elif push_type == PushTypes.Result:
            self.push(self.get_results(futures, timeout=timeout))
        else:
            raise AssertionError("Invalid push_type: %s" % push_type)
    finally:
        if shutdown:
            self.shutdown_executor(executor)
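# Self-contained illustration of the submit-per-split pattern used above: one
# Future per split, func applied to each split in parallel. With a
# ProcessPoolExecutor, func must be a picklable top-level callable. Names are
# illustrative, not taken from the source.
from concurrent.futures import ProcessPoolExecutor


def example_total(split):
    return sum(split)


if __name__ == "__main__":
    example_splits = [[1, 2, 3], [4, 5], [6]]
    with ProcessPoolExecutor(max_workers=len(example_splits)) as executor:
        example_futures = [executor.submit(example_total, s) for s in example_splits]
        print([f.result() for f in example_futures])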