Example #1
    def _push(self, data):
        """Override Consecution's push such that we can push in parallel"""
        if self._logging == "output":
            self._write_log(data)

        executor_kwargs = self.context.get("executor_kwargs", None) or {}
        with self.executor_class(**executor_kwargs) as executor:
            futures = []

            do_split = self.context.get("split", False)
            info(
                "%s: split=%s, %d downstream nodes"
                % (self.__class__.__name__, do_split, len(self._downstream_nodes)),
                label="push",
            )

            if do_split:
                # Split the data among the downstream nodes
                splits = divide_data(data, len(self._downstream_nodes))
                for i, split in enumerate(splits):
                    node = self._downstream_nodes[i]
                    futures.append(executor.submit(node._process, split))
            else:
                # Pass complete data to each downstream node
                for downstream in self._downstream_nodes:
                    futures.append(executor.submit(downstream._process, data))

            # Wait for results
            for future in self.__class__.as_completed_func(futures):
                future.result()
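
A minimal, self-contained sketch of the same split-versus-broadcast pattern using concurrent.futures directly; divide_data and parallel_push here are illustrative stand-ins, not the library's own helpers:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def divide_data(data, count):
        # Naive round-robin splitter, for illustration only
        return [data[i::count] for i in range(count)]

    def parallel_push(data, consumers, split=False):
        with ThreadPoolExecutor() as executor:
            if split:
                # Each downstream consumer gets its own slice of the data
                futures = [
                    executor.submit(consumer, part)
                    for consumer, part in zip(consumers, divide_data(data, len(consumers)))
                ]
            else:
                # Every downstream consumer gets the complete data
                futures = [executor.submit(consumer, data) for consumer in consumers]
            for future in as_completed(futures):
                future.result()  # surface any exception raised in a worker

    parallel_push(list(range(10)), [print, print], split=True)
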
Example #2
    def consume(self,
                data=None,
                cleanup=None,
                split_count=None,
                synchronous=False,
                timeout=None,
                **node_contexts):
        """Setup node contexts and consume data with the pipeline

        Parameters
        ----------
        data : iterable, optional
            Iterable of data to consume
        cleanup : dict, optional
            A mapping of arg names to cleanup functions to be run after
            data processing is complete.
        split_count : int, optional
            How many slices to split the data into for parallel processing. Default
            is to use executor._max_workers.
        synchronous : bool, optional
            If False, return Futures. If True, wait for futures to complete and
            return their results, if any.
        timeout : int or float, optional
            Raises a concurrent.futures.TimeoutError if __next__() is called
            and the result isn’t available after timeout seconds from the
            original call to as_completed(). Ignored if synchronous=False.
        **node_contexts
            Keyword arguments that are node_name->param_dict

        """
        with self.get_executor() as executor:
            worker_count = self.get_worker_count(executor)
            split_count = split_count_helper(data, split_count or worker_count)
            if data is None:
                splits = [None] * split_count
            else:
                splits = divide_data(data, split_count)
            futures = []

            info("%s: data len: %s, splits: %d, workers: %d" % (
                self.__class__.__name__,
                size(data, "n/a"),
                worker_count,
                split_count,
            ))

            for split in splits:
                futures.append(
                    executor.submit(consume,
                                    self.pipeline,
                                    split,
                                    cleanup=cleanup,
                                    **node_contexts))

            if synchronous:
                return self.get_results(futures, timeout=timeout)

            return futures
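
The same flow can be sketched standalone: split the data, submit one run per split, then either wait for results or hand back the Futures. consume_in_splits is an illustrative stand-in, not the library's API:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def consume_in_splits(process, data, split_count=None, synchronous=False, timeout=None):
        with ThreadPoolExecutor() as executor:
            # Fall back to the executor's worker count, as the docstring above does
            split_count = split_count or executor._max_workers
            splits = [data[i::split_count] for i in range(split_count)]
            futures = [executor.submit(process, split) for split in splits]
            if synchronous:
                return [f.result() for f in as_completed(futures, timeout=timeout)]
            return futures

    partial_sums = consume_in_splits(sum, list(range(100)), split_count=4, synchronous=True)
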
Example #3
    def run(self,
            data,
            func,
            split_count=None,
            timeout=None,
            push_type=PushTypes.Async):
        """Use a asyncio to apply func to data

        Parameters
        ----------
        data
            An iterable to process
        func : callable
            An async callable that will be passed data to operate on using asyncio.
        split_count : int, optional
            How many slices to split the data into for concurrent processing. Default
            is to set split_count = len(data).
        timeout : int or float, optional
            Time to wait for jobs to complete before raising an error. Ignored
            unless using a push_type that waits for results.
        push_type : str, optional
            If "async", push the Futures immediately.
            If "input", push the input data immediately after task submission.
            If "result", collect the result synchronously and push it.

        """
        split_count = split_count or len(data)
        splits = divide_data(data, split_count)
        info("%s: data len: %s, splits: %s" %
             (self.__class__.__name__, size(data, "n/a"), split_count))

        loop, close = get_or_create_event_loop()

        try:
            futures = [loop.create_task(func(split)) for split in splits]

            if push_type == PushTypes.Async:
                for future in futures:
                    self.push(future)
            elif push_type == PushTypes.Input:
                self.push(data)
            elif push_type == PushTypes.Result:
                self.push(self.get_results(futures, timeout=timeout))
            else:
                raise AssertionError("Invalid push_type: %s" % push_type)
        finally:
            if close and push_type == PushTypes.Result:
                # We can only be sure it's safe to close the event loop if it
                # was created here and all processing took place here.
                loop.close()
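
For comparison, the same split-and-gather flow in plain asyncio; double_all is a hypothetical async worker, not part of the library:

    import asyncio

    async def double_all(items):
        # Hypothetical async worker operating on one slice of the data
        await asyncio.sleep(0)
        return [x * 2 for x in items]

    async def run_in_splits(data, func, split_count=None):
        split_count = split_count or len(data)
        splits = [data[i::split_count] for i in range(split_count)]
        tasks = [asyncio.ensure_future(func(split)) for split in splits]
        # The equivalent of push_type="result": wait and collect synchronously
        return await asyncio.gather(*tasks)

    results = asyncio.run(run_in_splits([1, 2, 3, 4], double_all, split_count=2))
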
Example #4
    def run(self,
            data,
            func,
            executor=None,
            executor_kwargs=None,
            split_count=None,
            timeout=None,
            push_type=PushTypes.Async,
            **kwargs):
        """Use a parallel executor to apply func to data

        Parameters
        ----------
        data
            An iterable to process
        func : callable
            A callable that will be passed data to operate on in parallel
        executor : Executor, optional
            If passed, use this executor instead of creating one.
        executor_kwargs : dict, optional
            Keyword arguments to pass when initializing an executor.
        split_count : int, optional
            How many slices to split the data into for parallel processing. Default
            is to set split_count = number of workers.
        timeout : int or float, optional
            Time to wait for jobs to complete before raising an error. Ignored
            unless using a push_type that waits for results.
        push_type : str, optional
            If "async", push the Futures immediately.
            If "input", push the input data immediately after task submission.
            If "result", collect the result synchronously and push it.
        **kwargs
            Keyword arguments passed to the executor when submitting work

        """
        self.check_data(data)

        shutdown = True
        if executor:
            shutdown = False
        else:
            executor_kwargs = executor_kwargs or {}
            executor = self.get_executor(**executor_kwargs)

        try:
            worker_count = self.get_worker_count(executor)
            split_count = split_count or worker_count
            splits = divide_data(data, split_count)
            info("%s: data len: %s, splits: %s, workers: %d" % (
                self.__class__.__name__,
                size(data, "n/a"),
                split_count,
                worker_count,
            ))
            futures = self.submit(executor, func, splits, **kwargs)

            if push_type == PushTypes.Async:
                for future in futures:
                    self.push(future)
            elif push_type == PushTypes.Input:
                self.push(data)
            elif push_type == PushTypes.Result:
                self.push(self.get_results(futures, timeout=timeout))
            else:
                raise AssertionError("Invalid push_type: %s" % push_type)

        finally:
            if shutdown:
                self.shutdown_executor(executor)
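
A condensed sketch of the three push modes against a plain ThreadPoolExecutor; run_parallel and the push callable are illustrative assumptions, not the library's own:

    from concurrent.futures import ThreadPoolExecutor

    def run_parallel(data, func, push, split_count=None, timeout=None, push_type="async"):
        with ThreadPoolExecutor() as executor:
            split_count = split_count or executor._max_workers
            splits = [data[i::split_count] for i in range(split_count)]
            futures = [executor.submit(func, split) for split in splits]
            if push_type == "async":
                for future in futures:
                    push(future)  # downstream receives Futures immediately
            elif push_type == "input":
                push(data)  # pass the input through right after submission
            elif push_type == "result":
                push([f.result(timeout=timeout) for f in futures])
            else:
                raise AssertionError("Invalid push_type: %s" % push_type)

    run_parallel(list(range(8)), sum, print, split_count=2, push_type="result")
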
Example #5
    def _get_script_args(self):
        """Generate all tlbx Args for this Glider"""
        node_lookup = self.glider.get_node_lookup()
        script_args = OrderedDict()  # Map of arg names to Args
        arg_dests = {}  # Map of arg dests back to names
        node_arg_names = defaultdict(set)

        requires_data = not isinstance(self.glider.top_node, NoInputNode)
        if requires_data and not self.blacklisted("", SCRIPT_DATA_ARG):
            script_args[SCRIPT_DATA_ARG] = Arg(SCRIPT_DATA_ARG, nargs="+")

        def add_script_arg(node, arg_name, **kwargs):
            script_arg = self._get_script_arg(node, arg_name, **kwargs)
            if not script_arg:
                return

            script_args[script_arg.name] = script_arg
            arg_dests[script_arg.dest] = script_arg.name
            node_arg_names[arg_name].add(script_arg.name)

        for node in node_lookup.values():
            node_help = {}
            if FunctionDoc:
                try:
                    # Only works if run() has docs in numpydoc format
                    docs = FunctionDoc(node.run)
                    node_help = {
                        v.name: "\n".join(v.desc)
                        for v in docs["Parameters"]
                    }
                except Exception as e:
                    info("failed to parse node '%s' run() docs: %s" %
                         (node.name, str(e)))

            for arg_name, _ in node.run_args.items():
                add_script_arg(
                    node,
                    arg_name,
                    required=True,
                    arg_help=node_help.get(arg_name, None),
                )

            for kwarg_name, kwarg_default in node.run_kwargs.items():
                add_script_arg(
                    node,
                    kwarg_name,
                    required=False,
                    default=kwarg_default,
                    arg_help=node_help.get(kwarg_name, None),
                )

        def assert_arg_present(custom_arg, arg_name):
            raiseifnot(
                arg_name in script_args,
                ("Custom arg %s with dest=%s maps to node arg=%s "
                 "which is not in the script arg list. Check for "
                 "conflicting args that cover the same node arg." %
                 (custom_arg.name, custom_arg.dest, arg_name)),
            )

        for custom_arg in self.custom_args:
            raiseif(
                self.blacklisted("", custom_arg.name),
                "Blacklisted arg '%s' passed as a custom arg" %
                custom_arg.name,
            )

            if custom_arg.dest in node_arg_names:
                # Find and delete all node-based args this will cover
                for arg_name in node_arg_names[custom_arg.dest]:
                    assert_arg_present(custom_arg, arg_name)
                    del script_args[arg_name]

            if custom_arg.dest in arg_dests:
                # Remove the original arg that this custom arg will satisfy
                arg_name = arg_dests[custom_arg.dest]
                assert_arg_present(custom_arg, arg_name)
                del script_args[arg_name]

            script_args[custom_arg.name] = custom_arg
            arg_dests[custom_arg.dest] = custom_arg.name

        return script_args.values()
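
The core idea, deriving CLI args from a run() signature, can be sketched with inspect and argparse; build_parser and load are hypothetical, not the library's generator:

    import argparse
    from inspect import Parameter, signature

    def build_parser(func):
        # Positional params become required options; keyword defaults become
        # optional flags, mirroring the run_args/run_kwargs split above
        parser = argparse.ArgumentParser()
        for name, param in signature(func).parameters.items():
            if param.default is Parameter.empty:
                parser.add_argument("--%s" % name, required=True)
            else:
                parser.add_argument("--%s" % name, default=param.default)
        return parser

    def load(path, chunk_size=100):
        """Hypothetical node run() signature"""

    args = build_parser(load).parse_args(["--path", "data.csv"])
    print(args.path, args.chunk_size)
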