Example #1
    def distribute_headers(self, headers_paths):
        """
        Includes the C++ headers to be declared before execution.

        Args:
            headers_paths (str, iter): A string or an iterable (such as a
                list, set...) containing the paths to all necessary C++ headers
                as strings. This function accepts both paths to the headers
                themselves and paths to directories containing the headers.
        """
        headers_to_distribute = set()

        if isinstance(headers_paths, str):
            headers_to_distribute.update(
                Utils.get_paths_set_from_string(headers_paths))
        else:
            for path_string in headers_paths:
                headers_to_distribute.update(
                    Utils.get_paths_set_from_string(path_string))

        # Distribute header files to the workers
        self.distribute_unique_paths(headers_to_distribute)

        # Declare headers locally
        Utils.declare_headers(headers_to_distribute)

        # Finally, add everything to the includes set
        self.headers.update(headers_to_distribute)
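
A minimal usage sketch (hedged; `backend` stands for an instance of the class
defining this method, and the paths are illustrative):

    # A single header file, a directory of headers, or a mix of both
    backend.distribute_headers("analysis/helpers.hxx")
    backend.distribute_headers(["analysis/helpers.hxx", "analysis/include"])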
Example #2
    def distribute_shared_libraries(self, shared_libraries_paths):
        """
        Includes the C++ shared libraries to be declared before execution. If
        any pcm files are present in the same folder as the shared libraries,
        the function will try to retrieve and distribute them as well.

        Args:
            shared_libraries_paths (str, iter): A string or an iterable (such as
                a list, set...) containing the paths to all necessary C++ shared
                libraries as strings. This function accepts both paths to the
                libraries themselves and paths to directories containing the
                libraries.
        """
        libraries_to_distribute = set()
        pcm_to_distribute = set()

        if isinstance(shared_libraries_paths, str):
            pcm_to_distribute, libraries_to_distribute = (
                Utils.check_pcm_in_library_path(shared_libraries_paths))
        else:
            for path_string in shared_libraries_paths:
                pcm, libraries = Utils.check_pcm_in_library_path(path_string)
                libraries_to_distribute.update(libraries)
                pcm_to_distribute.update(pcm)

        # Distribute shared libraries and pcm files to the workers
        self.distribute_unique_paths(libraries_to_distribute)
        self.distribute_unique_paths(pcm_to_distribute)

        # Include shared libraries locally
        Utils.declare_shared_libraries(libraries_to_distribute)

        # Finally, add everything to the includes set
        self.shared_libraries.update(libraries_to_distribute)
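
A usage sketch (hedged; `backend` and the paths are illustrative). Any pcm
files sitting next to the libraries are picked up automatically:

    backend.distribute_shared_libraries("analysis/libs/libHelpers.so")
    backend.distribute_shared_libraries(["analysis/libs", "extra/libMore.so"])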
Example #3
        def spark_mapper(current_range):
            """
            Gets the paths to the header and shared-library files on the
            current executor and declares them, then runs the mapper on the
            given range.

            Args:
                current_range (tuple): A pair that contains the starting and
                    ending values of the current range.

            Returns:
                list: The (mergeable) values produced by running the mapper
                function on the current range.
            """
            # Get and declare headers on each worker
            headers_on_executor = [
                pyspark.SparkFiles.get(ntpath.basename(filepath))
                for filepath in headers
            ]
            Utils.declare_headers(headers_on_executor)

            # Get and declare shared libraries on each worker
            shared_libs_on_ex = [
                pyspark.SparkFiles.get(ntpath.basename(filepath))
                for filepath in shared_libraries
            ]
            Utils.declare_shared_libraries(shared_libs_on_ex)

            return mapper(current_range)
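
For context, a hedged sketch of how such a mapper might be driven on Spark
(`sc`, `ranges` and `reducer` are assumptions, not part of the code above):

    # One partition per range; each task declares headers/libraries first
    rdd = sc.parallelize(ranges, len(ranges))
    values = rdd.map(spark_mapper).treeReduce(reducer)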
Example #4
        def dask_mapper(current_range):
            """
            Gets the paths to the header and shared-library files on the
            current executor and declares them, then runs the mapper on the
            given range.

            Args:
                current_range (tuple): The current range of the dataset being
                    processed on the executor.

            Returns:
                list: The (mergeable) values produced by running the mapper
                function on the current range.
            """
            # Retrieve the current worker local directory
            localdir = get_worker().local_directory

            # Get and declare headers on each worker
            headers_on_executor = [
                os.path.join(localdir, os.path.basename(filepath))
                for filepath in headers
            ]
            Utils.declare_headers(headers_on_executor)

            # Get and declare shared libraries on each worker
            shared_libs_on_ex = [
                os.path.join(localdir, os.path.basename(filepath))
                for filepath in shared_libraries
            ]
            Utils.declare_shared_libraries(shared_libs_on_ex)

            return mapper(current_range)
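
A corresponding hedged sketch for Dask (`client`, `ranges` and `reducer` are
assumptions; `client` is a dask.distributed.Client):

    from functools import reduce

    futures = client.map(dask_mapper, ranges)  # one task per range
    values = reduce(reducer, client.gather(futures))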
Example #5
    def test_header_declaration_on_current_session(self):
        """Header has to be declared on the current session"""
        # Before the header declaration the function b is not present on the
        # ROOT interpreter
        with self.assertRaises(AttributeError):
            ROOT.b(1)
        Utils.declare_headers(["test_headers/header4.hxx"])
        self.assertEqual(ROOT.b(1), True)
Example #6
    def test_multiple_headers_declare(self):
        """'declare_headers' with multiple headers to be included."""
        Utils.declare_headers(
            ["test_headers/header2.hxx", "test_headers/header3.hxx"])

        self.assertEqual(ROOT.a(1), True)
        self.assertEqual(ROOT.f1(2), 2)
        self.assertEqual(ROOT.f2("myString"), "myString")
Example #7
    def execute_graph(self) -> None:
        """
        Executes an RDataFrame computation graph on a distributed backend.

        The needed ingredients are:

        - A collection of logical ranges in which the dataset is split. Each
          range is going to be assigned to a distributed task.
        - A representation of the computation graph that the task needs to
          execute.
        - A way to generate an RDataFrame instance starting from the logical
          range of the task.
        - Optionally, some setup code to be run at the beginning of each task.

        These are used as inputs to a generic mapper function. Results from
        the various mappers are then reduced, and the final results are
        retrieved in the local session. Extra checks are performed on them,
        depending on the data source. Finally, the local user-facing nodes
        are filled with the values that were computed distributedly, so that
        they can be accessed in the application as with a local RDataFrame.
        """
        # Check if the workflow must be generated in optimized mode
        optimized = ROOT.RDF.Experimental.Distributed.optimized

        # Updates the number of partitions for this dataframe if the user did
        # not specify one initially. This is done each time the computations are
        # triggered, in case the user changed the resource configuration
        # between runs (e.g. changing the number of available cores).
        self.npartitions = self.backend.optimize_npartitions()

        if optimized:
            computation_graph_callable = partial(
                ComputationGraphGenerator.run_with_cppworkflow,
                self._generate_graph_dict())
        else:
            computation_graph_callable = partial(
                ComputationGraphGenerator.trigger_computation_graph,
                self._generate_graph_dict())

        mapper = partial(distrdf_mapper,
                         build_rdf_from_range=self._generate_rdf_creator(),
                         computation_graph_callable=computation_graph_callable,
                         initialization_fn=self.backend.initialization,
                         optimized=optimized)

        # Execute graph distributedly and return the aggregated results from all
        # tasks
        returned_values = self.backend.ProcessAndMerge(self._build_ranges(),
                                                       mapper, distrdf_reducer)
        # Perform any extra checks that may be needed according to the
        # type of the head node
        final_values = self._handle_returned_values(returned_values)
        # List of action nodes in the same order as values
        local_nodes = self._get_action_nodes()
        # Set the value of every action node
        for node, value in zip(local_nodes, final_values):
            Utils.set_value_on_node(value, node, self.backend)
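
Conceptually, the ProcessAndMerge call above behaves like this sequential
sketch (hedged; `functools.reduce` stands in for the backend's real
distributed merge):

    from functools import reduce

    ranges = self._build_ranges()
    partial_results = [mapper(task_range) for task_range in ranges]
    returned_values = reduce(distrdf_reducer, partial_results)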
Example #8
def get_mergeable_values(starting_node: ROOT.RDF.RNode, range_id: int,
                         computation_graph_callable: Callable[[ROOT.RDF.RNode, int], List], optimized: bool) -> List:
    """
    Triggers the computation graph and returns a list of mergeable values.
    """
    if optimized:
        # Create the RDF computation graph and execute it on this ranged
        # dataset. The results of the actions of the graph and their types
        # are returned
        results, res_types = computation_graph_callable(starting_node, range_id)

        # Get RResultPtrs out of the type-erased RResultHandles by
        # instantiating with the type of the value
        mergeables = [
            ROOT.ROOT.Detail.RDF.GetMergeableValue(res.GetResultPtr[res_type]())
            if isinstance(res, ROOT.RDF.RResultHandle)
            else res
            for res, res_type in zip(results, res_types)
        ]
    else:
        # Output of the callable
        resultptr_list = computation_graph_callable(starting_node, range_id)

        mergeables = [Utils.get_mergeablevalue(resultptr) for resultptr in resultptr_list]

    return mergeables
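
An illustrative call (hedged; `rdf_node`, `range_id` and `graph_callable` are
assumed names for a per-task RNode, its range identifier and a callable with
the signature documented above):

    mergeables = get_mergeable_values(rdf_node, range_id,
                                      graph_callable, optimized=False)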
Example #9
    def execute(self, generator: "ComputationGraphGenerator"):
        """
        Executes an RDataFrame computation graph on a distributed backend.

        Args:
            generator (ComputationGraphGenerator): A factory object for a
                computation graph. Its ``get_callable`` method will return a
                function responsible for creating the computation graph of a
                given RDataFrame object and a range of entries. The range is
                needed for the `Snapshot` operation.
        """
        # Check if the workflow must be generated in optimized mode
        optimized = ROOT.RDF.Experimental.Distributed.optimized

        if optimized:
            computation_graph_callable = generator.get_callable_optimized()
        else:
            computation_graph_callable = generator.get_callable()

        # Avoid having references to the instance inside the mapper
        initialization_fn = self.initialization

        # Build the ranges for the current dataset
        headnode = generator.headnode

        ranges = headnode.build_ranges()
        build_rdf_from_range = headnode.generate_rdf_creator()

        mapper = partial(distrdf_mapper,
                         build_rdf_from_range=build_rdf_from_range,
                         computation_graph_callable=computation_graph_callable,
                         initialization_fn=initialization_fn,
                         optimized=optimized)

        # Values produced after Map-Reduce
        returned_values = self.ProcessAndMerge(ranges, mapper, distrdf_reducer)

        # Extract actual results of the RDataFrame operations requested
        actual_values = handle_returned_values(headnode, returned_values)
        # List of action nodes in the same order as values
        nodes = headnode.get_action_nodes()

        # Set the value of every action node
        for node, value in zip(nodes, actual_values):
            Utils.set_value_on_node(value, node, self)
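
The only contract `execute` relies on here is `ProcessAndMerge`. A minimal,
hypothetical backend satisfying it sequentially (for illustration only; real
backends dispatch the tasks to Spark or Dask):

    from functools import reduce

    class SequentialBackend(BaseBackend):  # base class name is assumed
        def ProcessAndMerge(self, ranges, mapper, reducer):
            # Run every task locally, then fold the partial results
            return reduce(reducer, (mapper(r) for r in ranges))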
Example #10
    def distribute_files(self, files_paths):
        """
        Sends the generic files needed by the user to the workers.

        Args:
            files_paths (str, iter): Paths to the files to be sent to the
                distributed workers.
        """
        files_to_distribute = set()

        if isinstance(files_paths, str):
            files_to_distribute.update(
                Utils.get_paths_set_from_string(files_paths))
        else:
            for path_string in files_paths:
                files_to_distribute.update(
                    Utils.get_paths_set_from_string(path_string))

        self.distribute_unique_paths(files_to_distribute)
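
A usage sketch (hedged; `backend` and the paths are illustrative):

    backend.distribute_files("data/calibration.json")
    backend.distribute_files(["data/calibration.json", "data/config"])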
Example #11
def merge_values(mergeables_out: Iterable, mergeables_in: Iterable) -> Iterable:
    """
    Merge values of second argument into values of first argument and return
    first argument.
    """
    # This treats the four possible cases:
    # 1. Both arguments are non-None: merge element-wise (first branch).
    # 2. First argument is None, second is not: adopt the second (elif).
    # 3. First argument is non-None, second is None: return the first list
    #    unchanged.
    # 4. Both arguments are None: return the first, it's None anyway.
    if mergeables_out is not None and mergeables_in is not None:
        for mergeable_out, mergeable_in in zip(mergeables_out, mergeables_in):
            Utils.merge_values(mergeable_out, mergeable_in)
    elif mergeables_out is None and mergeables_in is not None:
        mergeables_out = mergeables_in

    return mergeables_out
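
The None handling can be exercised directly (hedged sketch; `m1` stands for a
list of mergeable values produced by one task):

    merge_values(m1, None)    # case 3: returns m1 unchanged
    merge_values(None, m1)    # case 2: returns m1
    merge_values(None, None)  # case 4: returns None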
Example #12
        def reducer(mergeables_out, mergeables_in):
            """
            Merges two given lists of values that were
            returned by the mapper function for two different
            ranges.

            Args:
                mergeables_out (list): A list of computed (mergeable) values
                    for a given entry range in a dataset. The elements of
                    this list will be updated with the information contained
                    in the elements of the other argument list.

                mergeables_in (list): A list of computed (mergeable) values
                    for a given entry range in a dataset.

            Returns:
                list: The list of updated (mergeable) values.
            """

            for mergeable_out, mergeable_in in zip(mergeables_out,
                                                   mergeables_in):
                Utils.merge_values(mergeable_out, mergeable_in)

            return mergeables_out
Example #13
        def mapper(current_range):
            """
            Triggers the event-loop and executes all
            nodes in the computational graph using the
            callable.

            Args:
                current_range (Range): A Range named tuple, representing the
                    range of entries to be processed, their input files and
                    information about friend trees.

            Returns:
                list: The list of (mergeable) values of all action nodes in
                the computational graph.
            """
            # Disable graphics functionality in ROOT. It is not needed inside a
            # distributed task
            ROOT.gROOT.SetBatch(True)
            # Enable thread safety for the whole mapper function. We need to do
            # this since two tasks could be invoking the C++ interpreter
            # simultaneously, given that this function will release the GIL
            # before calling into C++ to run the event loop. Dask multi-threaded
            # or even multi-process workers could trigger such a scenario.
            ROOT.EnableThreadSafety()

            # We have to decide whether to do this in Dist or in subclasses
            # Utils.declare_headers(worker_includes)  # Declare headers if any
            # Run initialization method to prepare the worker runtime
            # environment
            initialization()

            # Build an RDataFrame instance for the current mapper task, based
            # on the type of the head node.
            rdf = build_rdf_from_range(current_range)

            if optimized:
                # Create the RDF computation graph and execute it on this ranged
                # dataset. The results of the actions of the graph and their types
                # are returned
                results, res_types = computation_graph_callable(
                    rdf, current_range.id)

                # Get RResultPtrs out of the type-erased RResultHandles by
                # instantiating with the type of the value
                mergeables = [
                    ROOT.ROOT.Detail.RDF.GetMergeableValue(
                        res.GetResultPtr[res_type]()) if isinstance(
                            res, ROOT.RDF.RResultHandle) else res
                    for res, res_type in zip(results, res_types)
                ]
            else:
                # Output of the callable
                resultptr_list = computation_graph_callable(
                    rdf, current_range.id)

                mergeables = [
                    Utils.get_mergeablevalue(resultptr)
                    for resultptr in resultptr_list
                ]

            return mergeables
Example #14
    def test_single_header_declare(self):
        """'declare_headers' with a single header to be included."""
        Utils.declare_headers(["test_headers/header1.hxx"])

        self.assertEqual(ROOT.f(1), True)
Example #15
    def execute(self, generator):
        """
        Executes an RDataFrame computation graph on a distributed backend.

        Args:
            generator (ComputationGraphGenerator): A factory object for a
                computation graph. Its ``get_callable`` method will return a
                function responsible for creating the computation graph of a
                given RDataFrame object and a range of entries. The range is
                needed for the `Snapshot` operation.
        """
        # Check if the workflow must be generated in optimized mode
        optimized = ROOT.RDF.Experimental.Distributed.optimized

        if optimized:
            computation_graph_callable = generator.get_callable_optimized()
        else:
            computation_graph_callable = generator.get_callable()

        # Avoid having references to the instance inside the mapper
        initialization = self.initialization

        # Build the ranges for the current dataset
        headnode = generator.headnode
        ranges = headnode.build_ranges()
        build_rdf_from_range = headnode.generate_rdf_creator()

        def mapper(current_range):
            """
            Triggers the event-loop and executes all
            nodes in the computational graph using the
            callable.

            Args:
                current_range (Range): A Range named tuple, representing the
                    range of entries to be processed, their input files and
                    information about friend trees.

            Returns:
                list: The list of (mergeable) values of all action nodes in
                the computational graph.
            """
            # Disable graphics functionality in ROOT. It is not needed inside a
            # distributed task
            ROOT.gROOT.SetBatch(True)
            # Enable thread safety for the whole mapper function. We need to do
            # this since two tasks could be invoking the C++ interpreter
            # simultaneously, given that this function will release the GIL
            # before calling into C++ to run the event loop. Dask multi-threaded
            # or even multi-process workers could trigger such a scenario.
            ROOT.EnableThreadSafety()

            # We have to decide whether to do this in Dist or in subclasses
            # Utils.declare_headers(worker_includes)  # Declare headers if any
            # Run initialization method to prepare the worker runtime
            # environment
            initialization()

            # Build an RDataFrame instance for the current mapper task, based
            # on the type of the head node.
            rdf = build_rdf_from_range(current_range)

            if optimized:
                # Create the RDF computation graph and execute it on this ranged
                # dataset. The results of the actions of the graph and their types
                # are returned
                results, res_types = computation_graph_callable(
                    rdf, current_range.id)

                # Get RResultPtrs out of the type-erased RResultHandles by
                # instantiating with the type of the value
                mergeables = [
                    ROOT.ROOT.Detail.RDF.GetMergeableValue(
                        res.GetResultPtr[res_type]()) if isinstance(
                            res, ROOT.RDF.RResultHandle) else res
                    for res, res_type in zip(results, res_types)
                ]
            else:
                # Output of the callable
                resultptr_list = computation_graph_callable(
                    rdf, current_range.id)

                mergeables = [
                    Utils.get_mergeablevalue(resultptr)
                    for resultptr in resultptr_list
                ]

            return mergeables

        def reducer(mergeables_out, mergeables_in):
            """
            Merges two given lists of values that were
            returned by the mapper function for two different
            ranges.

            Args:
                mergeables_out (list): A list of computed (mergeable) values
                    for a given entry range in a dataset. The elements of
                    this list will be updated with the information contained
                    in the elements of the other argument list.

                mergeables_in (list): A list of computed (mergeable) values
                    for a given entry range in a dataset.

            Returns:
                list: The list of updated (mergeable) values.
            """

            for mergeable_out, mergeable_in in zip(mergeables_out,
                                                   mergeables_in):
                Utils.merge_values(mergeable_out, mergeable_in)

            return mergeables_out

        # Values produced after Map-Reduce
        values = self.ProcessAndMerge(ranges, mapper, reducer)
        # List of action nodes in the same order as values
        nodes = generator.get_action_nodes()

        # Set the value of every action node
        for node, value in zip(nodes, values):
            Utils.set_value_on_node(value, node, self)
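
An end-to-end sketch of how this method might be invoked (hedged; `backend`
is a concrete subclass instance and `generator` a ComputationGraphGenerator
built from the user's dataframe):

    backend.execute(generator)
    # Every action node now holds its computed value and can be read in the
    # application as with a local RDataFrame.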