Exemple #1
0
    def execute_graph(self) -> None:
        """
        Run the computation graph of this dataframe on the distributed backend.

        The execution is assembled from these pieces:

        - The logical ranges the dataset is split into (one per distributed
          task), obtained from ``self._build_ranges()``.
        - A callable wrapping the computation graph each task must run, built
          from ``self._generate_graph_dict()``.
        - A creator function, from ``self._generate_rdf_creator()``, that
          builds an RDataFrame for the range assigned to a task.
        - The backend initialization function, run by each task as setup.

        All of the above are bound into the generic ``distrdf_mapper``. The
        backend maps it over the ranges and merges partial results with
        ``distrdf_reducer``. The merged results go through
        ``self._handle_returned_values`` and are finally written onto the
        local action nodes, so that the user can read them in the application.
        """
        # Global switch selecting the optimized flavour of the workflow
        use_optimized = ROOT.RDF.Experimental.Distributed.optimized

        # Ask the backend for a partition count at every trigger, because the
        # user may have changed the resource configuration between two runs
        # (e.g. a different number of available cores).
        # NOTE(review): the assignment below is unconditional; keeping a
        # partition count explicitly chosen by the user would have to happen
        # in the `npartitions` attribute itself, which is not visible here.
        self.npartitions = self.backend.optimize_npartitions()

        # Both flavours take the graph dictionary as first argument; only the
        # generator entry point differs.
        graph_runner = (
            ComputationGraphGenerator.run_with_cppworkflow
            if use_optimized
            else ComputationGraphGenerator.trigger_computation_graph
        )
        computation_graph_callable = partial(
            graph_runner, self._generate_graph_dict())

        # Bind everything a task needs, so that the backend only has to
        # supply the range to process
        mapper = partial(
            distrdf_mapper,
            build_rdf_from_range=self._generate_rdf_creator(),
            computation_graph_callable=computation_graph_callable,
            initialization_fn=self.backend.initialization,
            optimized=use_optimized,
        )

        # Run the tasks on the backend and collect the merged results
        merged_results = self.backend.ProcessAndMerge(
            self._build_ranges(), mapper, distrdf_reducer)

        # Extra processing of the results, specific to the type of head node
        checked_values = self._handle_returned_values(merged_results)

        # Action nodes come in the same order as their computed values
        action_nodes = self._get_action_nodes()
        for action_node, computed_value in zip(action_nodes, checked_values):
            Utils.set_value_on_node(computed_value, action_node, self.backend)
Exemple #2
0
    def execute(self, generator: "ComputationGraphGenerator"):
        """
        Run an RDataFrame computation graph on this distributed backend.

        The dataset ranges and the RDataFrame creator are requested from the
        head node of the generator, bound together with the graph callable
        into ``distrdf_mapper`` and executed through ``ProcessAndMerge``. The
        merged results are then stored on the action nodes of the head node.

        Args:
            generator (ComputationGraphGenerator): A factory object for a
                computation graph. Depending on the global ``optimized`` flag,
                either its ``get_callable_optimized`` or its ``get_callable``
                method provides the function that creates the computation
                graph of a given RDataFrame object and a range of entries.
                The range is needed for the `Snapshot` operation.
        """
        # Global switch selecting the optimized flavour of the workflow
        use_optimized = ROOT.RDF.Experimental.Distributed.optimized

        # Pick the factory method first, then invoke it once
        make_graph_callable = (generator.get_callable_optimized
                               if use_optimized else generator.get_callable)
        graph_callable = make_graph_callable()

        # Grab the bound method up front, so that the mapper does not look it
        # up through `self`
        setup_fn = self.initialization

        # The head node knows how to split the dataset in ranges and how to
        # build an RDataFrame out of one of them
        head = generator.headnode
        dataset_ranges = head.build_ranges()
        rdf_creator = head.generate_rdf_creator()

        # Only the range to process is left unbound for the backend to supply
        mapper = partial(
            distrdf_mapper,
            build_rdf_from_range=rdf_creator,
            computation_graph_callable=graph_callable,
            initialization_fn=setup_fn,
            optimized=use_optimized,
        )

        # Map-Reduce over all the ranges
        merged_results = self.ProcessAndMerge(
            dataset_ranges, mapper, distrdf_reducer)

        # Turn the merged results into the values of the requested operations
        operation_values = handle_returned_values(head, merged_results)

        # Action nodes come in the same order as their computed values
        action_nodes = head.get_action_nodes()
        for action_node, computed_value in zip(action_nodes, operation_values):
            Utils.set_value_on_node(computed_value, action_node, self)
Exemple #3
0
    def execute(self, generator):
        """
        Run an RDataFrame computation graph on this distributed backend.

        A mapper and a reducer function are defined locally and handed to
        ``ProcessAndMerge`` together with the ranges of the dataset. The
        merged values are then stored on the action nodes of the generator.

        Args:
            generator (ComputationGraphGenerator): A factory object for a
                computation graph. Depending on the global ``optimized`` flag,
                either its ``get_callable_optimized`` or its ``get_callable``
                method provides the function that creates the computation
                graph of a given RDataFrame object and a range of entries.
                The range is needed for the `Snapshot` operation.
        """
        # Global switch selecting the optimized flavour of the workflow
        use_optimized = ROOT.RDF.Experimental.Distributed.optimized

        # Pick the factory method first, then invoke it once
        make_graph_callable = (generator.get_callable_optimized
                               if use_optimized else generator.get_callable)
        graph_callable = make_graph_callable()

        # Grab the bound method up front, so that the mapper does not look it
        # up through `self`
        setup_fn = self.initialization

        # The head node knows how to split the dataset in ranges and how to
        # build an RDataFrame out of one of them
        head = generator.headnode
        dataset_ranges = head.build_ranges()
        rdf_creator = head.generate_rdf_creator()

        def mapper(current_range):
            """
            Runs the computation graph on one range of the dataset.

            Args:
                current_range (Range): A Range named tuple, representing the
                    range of entries to be processed, their input files and
                    information about friend trees.

            Returns:
                list: The (mergeable) values of all the action nodes in the
                computation graph, one per action.
            """
            # No graphics needed inside a distributed task
            ROOT.gROOT.SetBatch(True)
            # Thread safety is enabled for the whole mapper: the GIL is
            # released before calling into C++ to run the event loop, so two
            # tasks could be invoking the C++ interpreter simultaneously.
            # Dask multi-threaded or even multi-process workers could trigger
            # such a scenario.
            ROOT.EnableThreadSafety()

            # TODO: decide whether declaring headers on the worker, i.e.
            # Utils.declare_headers(worker_includes), belongs here or in the
            # subclasses.

            # Prepare the runtime environment of the worker
            setup_fn()

            # RDataFrame restricted to the range of this task, built
            # according to the type of the head node
            task_rdf = rdf_creator(current_range)

            # Create the computation graph and execute it on the ranged
            # dataset
            graph_output = graph_callable(task_rdf, current_range.id)

            if not use_optimized:
                # The callable returned the list of results directly
                return [Utils.get_mergeablevalue(result_ptr)
                        for result_ptr in graph_output]

            # In optimized mode the callable returns the results of the
            # actions along with their types
            results, result_types = graph_output

            mergeables = []
            for result, result_type in zip(results, result_types):
                if isinstance(result, ROOT.RDF.RResultHandle):
                    # Recover the RResultPtr from the type-erased handle by
                    # instantiating with the type of the value
                    result = ROOT.ROOT.Detail.RDF.GetMergeableValue(
                        result.GetResultPtr[result_type]())
                mergeables.append(result)

            return mergeables

        def reducer(mergeables_out, mergeables_in):
            """
            Merges the lists of values returned by the mapper for two
            different ranges.

            Args:
                mergeables_out (list): A list of computed (mergeable) values
                    for a given entry range in a dataset. Its elements are
                    updated with the information contained in the elements
                    of the other list.

                mergeables_in (list): A list of computed (mergeable) values
                    for a given entry range in a dataset.

            Returns:
                list: ``mergeables_out``, after the merge.
            """
            # Values at the same position belong to the same action
            for target, source in zip(mergeables_out, mergeables_in):
                Utils.merge_values(target, source)

            return mergeables_out

        # Map-Reduce over all the ranges
        merged_values = self.ProcessAndMerge(dataset_ranges, mapper, reducer)

        # Action nodes come in the same order as their computed values
        action_nodes = generator.get_action_nodes()
        for action_node, computed_value in zip(action_nodes, merged_values):
            Utils.set_value_on_node(computed_value, action_node, self)