def distribute_headers(self, headers_paths):
    """
    Register C++ headers so that they are declared before execution.

    Args:
        headers_paths (str, iter): Either a single path string or an
            iterable (list, set, ...) of path strings pointing to the
            needed C++ headers. Entries may be paths to header files or
            paths to directories containing the headers.
    """
    # A lone string is handled as a one-element collection, so both kinds
    # of input go through the same expansion loop.
    if isinstance(headers_paths, str):
        path_strings = (headers_paths,)
    else:
        path_strings = headers_paths

    collected_headers = set()
    for entry in path_strings:
        collected_headers.update(Utils.get_paths_set_from_string(entry))

    # Ship the header files to the workers
    self.distribute_unique_paths(collected_headers)

    # Make the headers known to the local session
    Utils.declare_headers(collected_headers)

    # Keep track of everything that has been included so far
    self.headers.update(collected_headers)
def distribute_shared_libraries(self, shared_libraries_paths):
    """
    Register C++ shared libraries so that they are declared before
    execution.

    Any pcm file reported by ``Utils.check_pcm_in_library_path`` for the
    given paths is sent to the workers together with the libraries.

    Args:
        shared_libraries_paths (str, iter): Either a single path string or
            an iterable (list, set, ...) of path strings pointing to the
            needed C++ shared libraries. Entries may be paths to library
            files or paths to directories containing the libraries.
    """
    if isinstance(shared_libraries_paths, str):
        # Single path: use the helper's results directly
        pcm_files, library_files = Utils.check_pcm_in_library_path(
            shared_libraries_paths)
    else:
        # Several paths: accumulate the results of every entry
        pcm_files, library_files = set(), set()
        for entry in shared_libraries_paths:
            found_pcm, found_libraries = Utils.check_pcm_in_library_path(
                entry)
            library_files.update(found_libraries)
            pcm_files.update(found_pcm)

    # Ship libraries first, then pcm files, to the workers
    self.distribute_unique_paths(library_files)
    self.distribute_unique_paths(pcm_files)

    # Load the shared libraries in the local session
    Utils.declare_shared_libraries(library_files)

    # Keep track of everything that has been included so far
    self.shared_libraries.update(library_files)
def spark_mapper(current_range):
    """
    Declares the distributed headers and shared libraries on the current
    executor, then runs the mapper on the given range.

    Args:
        current_range (tuple): A pair that contains the starting and
            ending values of the current range.

    Returns:
        The value returned by ``mapper(current_range)``.
    """
    def on_executor(original_paths):
        # Files are looked up through SparkFiles by their base name
        return [
            pyspark.SparkFiles.get(ntpath.basename(original))
            for original in original_paths
        ]

    # Headers first ...
    Utils.declare_headers(on_executor(headers))

    # ... then shared libraries
    Utils.declare_shared_libraries(on_executor(shared_libraries))

    return mapper(current_range)
def dask_mapper(current_range):
    """
    Declares the distributed headers and shared libraries on the current
    executor, then runs the mapper on the given range.

    Args:
        current_range (tuple): The current range of the dataset being
            processed on the executor.

    Returns:
        The value returned by ``mapper(current_range)``.
    """
    # Local directory of the worker running this task
    worker_dir = get_worker().local_directory

    def on_executor(original_paths):
        # Files are looked up in the worker directory by their base name
        return [
            os.path.join(worker_dir, os.path.basename(original))
            for original in original_paths
        ]

    # Headers first ...
    Utils.declare_headers(on_executor(headers))

    # ... then shared libraries
    Utils.declare_shared_libraries(on_executor(shared_libraries))

    return mapper(current_range)
def test_header_declaration_on_current_session(self):
    """Header has to be declared on the current session"""
    # Before the header declaration the function b is not present on the
    # ROOT interpreter, so looking it up must raise AttributeError.
    # The lookup is the statement under test: wrapping it in a second
    # `assertRaises` call would pass the call result as the exception
    # class, which is a misuse of the API.
    with self.assertRaises(AttributeError):
        ROOT.b(1)

    Utils.declare_headers(["test_headers/header4.hxx"])

    # After the declaration the function is callable from Python
    self.assertEqual(ROOT.b(1), True)
def test_multiple_headers_declare(self):
    """'declare_headers' with multiple headers to be included."""
    header_files = [
        "test_headers/header2.hxx",
        "test_headers/header3.hxx",
    ]
    Utils.declare_headers(header_files)

    # Functions coming from the declared headers must now be callable
    self.assertEqual(ROOT.a(1), True)
    self.assertEqual(ROOT.f1(2), 2)

    text = "myString"
    self.assertEqual(ROOT.f2(text), text)
def execute_graph(self) -> None:
    """
    Executes an RDataFrame computation graph on a distributed backend.

    The ingredients handed to the generic mapper function are:

    - The logical ranges in which the dataset is split; each range is
      assigned to a distributed task.
    - A representation of the computation graph the task has to execute.
    - A way to build an RDataFrame instance from the range of the task.
    - Optionally, setup code to run at the beginning of each task.

    The results of the mappers are reduced and retrieved in the local
    session, where extra checks depending on the data source are
    performed. Finally the local user-facing nodes are filled with the
    values computed distributedly, so they can be accessed like with a
    local RDataFrame.
    """
    # Should the workflow be generated in optimized mode?
    optimized = ROOT.RDF.Experimental.Distributed.optimized

    # Refresh the number of partitions every time the computations are
    # triggered, if the user did not specify one initially: the resource
    # configuration (e.g. number of available cores) may have changed
    # between runs.
    self.npartitions = self.backend.optimize_npartitions()

    # Pick the entry point that will run the graph inside each task
    if optimized:
        graph_entry_point = ComputationGraphGenerator.run_with_cppworkflow
    else:
        graph_entry_point = (
            ComputationGraphGenerator.trigger_computation_graph)
    computation_graph_callable = partial(
        graph_entry_point, self._generate_graph_dict())

    rdf_creator = self._generate_rdf_creator()
    worker_initialization = self.backend.initialization
    mapper = partial(
        distrdf_mapper,
        build_rdf_from_range=rdf_creator,
        computation_graph_callable=computation_graph_callable,
        initialization_fn=worker_initialization,
        optimized=optimized,
    )

    # Run the graph distributedly and collect the aggregated results of
    # all tasks
    returned_values = self.backend.ProcessAndMerge(
        self._build_ranges(), mapper, distrdf_reducer)

    # Extra checks that may be needed according to the type of the head
    # node
    final_values = self._handle_returned_values(returned_values)

    # Action nodes come in the same order as the values: fill each one
    local_nodes = self._get_action_nodes()
    for action_node, computed_value in zip(local_nodes, final_values):
        Utils.set_value_on_node(computed_value, action_node, self.backend)
def get_mergeable_values(starting_node: ROOT.RDF.RNode, range_id: int,
                         computation_graph_callable: Callable[[ROOT.RDF.RNode, int], List],
                         optimized: bool) -> List:
    """
    Triggers the computation graph and returns a list of mergeable values.
    """
    if not optimized:
        # The callable directly returns the list of result pointers
        resultptr_list = computation_graph_callable(starting_node, range_id)
        return [Utils.get_mergeablevalue(ptr) for ptr in resultptr_list]

    # Optimized mode: the callable builds the RDF computation graph, runs
    # it on this ranged dataset and returns the action results together
    # with their types.
    results, res_types = computation_graph_callable(starting_node, range_id)

    mergeables = []
    for res, res_type in zip(results, res_types):
        if isinstance(res, ROOT.RDF.RResultHandle):
            # Recover the RResultPtr from the type-erased handle by
            # instantiating with the type of the value
            res = ROOT.ROOT.Detail.RDF.GetMergeableValue(
                res.GetResultPtr[res_type]())
        mergeables.append(res)
    return mergeables
def execute(self, generator: "ComputationGraphGenerator"):
    """
    Executes an RDataFrame computation graph on a distributed backend.

    Args:
        generator (ComputationGraphGenerator): A factory object for a
            computation graph. Its ``get_callable`` method will return a
            function responsible for creating the computation graph of a
            given RDataFrame object and a range of entries. The range is
            needed for the `Snapshot` operation.
    """
    # Should the workflow be generated in optimized mode?
    optimized = ROOT.RDF.Experimental.Distributed.optimized
    computation_graph_callable = (
        generator.get_callable_optimized() if optimized
        else generator.get_callable()
    )

    # Take the bound attribute now, so the mapper does not reference the
    # backend instance itself
    initialization_fn = self.initialization

    # Ranges and RDataFrame factory for the current dataset
    headnode = generator.headnode
    ranges = headnode.build_ranges()
    build_rdf_from_range = headnode.generate_rdf_creator()

    mapper = partial(
        distrdf_mapper,
        build_rdf_from_range=build_rdf_from_range,
        computation_graph_callable=computation_graph_callable,
        initialization_fn=initialization_fn,
        optimized=optimized,
    )

    # Map-Reduce over all the ranges
    returned_values = self.ProcessAndMerge(ranges, mapper, distrdf_reducer)

    # Actual results of the requested RDataFrame operations
    actual_values = handle_returned_values(headnode, returned_values)

    # Action nodes come in the same order as the values: fill each one
    nodes = headnode.get_action_nodes()
    for action_node, computed_value in zip(nodes, actual_values):
        Utils.set_value_on_node(computed_value, action_node, self)
def distribute_files(self, files_paths):
    """
    Sends to the workers the generic files needed by the user.

    Args:
        files_paths (str, iter): Paths to the files to be sent to the
            distributed workers. Either a single string or an iterable
            of strings.
    """
    # A lone string is handled as a one-element collection, so both kinds
    # of input go through the same expansion loop.
    if isinstance(files_paths, str):
        path_strings = (files_paths,)
    else:
        path_strings = files_paths

    collected_files = set()
    for entry in path_strings:
        collected_files.update(Utils.get_paths_set_from_string(entry))

    self.distribute_unique_paths(collected_files)
def merge_values(mergeables_out: Iterable, mergeables_in: Iterable) -> Iterable:
    """
    Merge values of second argument into values of first argument and return
    first argument.

    The four possible combinations of arguments are handled as follows:

    1. Second argument is None: the first argument is returned untouched
       (this also covers the case where both are None).
    2. First argument is None and the second is not: the second argument
       is returned.
    3. Neither is None: elements are merged pairwise and the first
       argument is returned.
    """
    # Nothing to merge in: hand back the first argument, whatever it is
    if mergeables_in is None:
        return mergeables_out

    # Nothing to merge into: the incoming values become the result
    if mergeables_out is None:
        return mergeables_in

    for target, source in zip(mergeables_out, mergeables_in):
        Utils.merge_values(target, source)
    return mergeables_out
def reducer(mergeables_out, mergeables_in):
    """
    Merges two given lists of values that were returned by the mapper
    function for two different ranges.

    Args:
        mergeables_out (list): A list of computed (mergeable)values for a
            given entry range in a dataset. Its elements are updated with
            the information contained in the elements of the other list.

        mergeables_in (list): A list of computed (mergeable)values for a
            given entry range in a dataset.

    Returns:
        list: The first argument, after its elements have been updated.
    """
    # Elements are paired by position; pairing stops at the shorter list
    for pair in zip(mergeables_out, mergeables_in):
        Utils.merge_values(*pair)

    return mergeables_out
def mapper(current_range):
    """
    Triggers the event-loop and executes all nodes in the computational
    graph using the callable.

    Args:
        current_range (Range): A Range named tuple, representing the range
            of entries to be processed, their input files and information
            about friend trees.

    Returns:
        list: The list of (mergeable)values of all action nodes in the
        computational graph.
    """
    # Graphics functionality is not needed inside a distributed task
    ROOT.gROOT.SetBatch(True)

    # Thread safety is enabled for the whole mapper: the GIL is released
    # before calling into C++ to run the event loop, so two tasks could
    # be invoking the C++ interpreter simultaneously. Dask multi-threaded
    # or even multi-process workers could trigger such a scenario.
    ROOT.EnableThreadSafety()

    # NOTE: whether headers should be declared here or in the backend
    # subclasses is still to be decided.

    # Prepare the worker runtime environment
    initialization()

    # RDataFrame instance for this task, built according to the type of
    # the head node
    rdf = build_rdf_from_range(current_range)

    if not optimized:
        # The callable directly returns the list of result pointers
        resultptr_list = computation_graph_callable(rdf, current_range.id)
        return [Utils.get_mergeablevalue(ptr) for ptr in resultptr_list]

    # Optimized mode: the callable builds the RDF computation graph, runs
    # it on this ranged dataset and returns the action results together
    # with their types.
    results, res_types = computation_graph_callable(rdf, current_range.id)

    mergeables = []
    for res, res_type in zip(results, res_types):
        if isinstance(res, ROOT.RDF.RResultHandle):
            # Recover the RResultPtr from the type-erased handle by
            # instantiating with the type of the value
            res = ROOT.ROOT.Detail.RDF.GetMergeableValue(
                res.GetResultPtr[res_type]())
        mergeables.append(res)
    return mergeables
def test_single_header_declare(self):
    """'declare_headers' with a single header to be included."""
    header_files = ["test_headers/header1.hxx"]
    Utils.declare_headers(header_files)

    # The function coming from the declared header must now be callable
    outcome = ROOT.f(1)
    self.assertEqual(outcome, True)
def execute(self, generator):
    """
    Executes an RDataFrame computation graph on a distributed backend.

    Args:
        generator (ComputationGraphGenerator): A factory object for a
            computation graph. Its ``get_callable`` method will return a
            function responsible for creating the computation graph of a
            given RDataFrame object and a range of entries. The range is
            needed for the `Snapshot` operation.
    """
    # Should the workflow be generated in optimized mode?
    optimized = ROOT.RDF.Experimental.Distributed.optimized
    computation_graph_callable = (
        generator.get_callable_optimized() if optimized
        else generator.get_callable()
    )

    # Take the bound attribute now, so the mapper does not reference the
    # backend instance itself
    initialization = self.initialization

    # Ranges and RDataFrame factory for the current dataset
    headnode = generator.headnode
    ranges = headnode.build_ranges()
    build_rdf_from_range = headnode.generate_rdf_creator()

    def mapper(current_range):
        """
        Triggers the event-loop and executes all nodes in the
        computational graph using the callable.

        Args:
            current_range (Range): A Range named tuple, representing the
                range of entries to be processed, their input files and
                information about friend trees.

        Returns:
            list: The list of (mergeable)values of all action nodes in
            the computational graph.
        """
        # Graphics functionality is not needed inside a distributed task
        ROOT.gROOT.SetBatch(True)

        # Thread safety is enabled for the whole mapper: the GIL is
        # released before calling into C++ to run the event loop, so two
        # tasks could be invoking the C++ interpreter simultaneously.
        # Dask multi-threaded or even multi-process workers could trigger
        # such a scenario.
        ROOT.EnableThreadSafety()

        # NOTE: whether headers should be declared here or in the backend
        # subclasses is still to be decided.

        # Prepare the worker runtime environment
        initialization()

        # RDataFrame instance for this task, built according to the type
        # of the head node
        rdf = build_rdf_from_range(current_range)

        if not optimized:
            # The callable directly returns the list of result pointers
            resultptr_list = computation_graph_callable(
                rdf, current_range.id)
            return [
                Utils.get_mergeablevalue(ptr) for ptr in resultptr_list
            ]

        # Optimized mode: the callable builds the RDF computation graph,
        # runs it on this ranged dataset and returns the action results
        # together with their types.
        results, res_types = computation_graph_callable(
            rdf, current_range.id)

        mergeables = []
        for res, res_type in zip(results, res_types):
            if isinstance(res, ROOT.RDF.RResultHandle):
                # Recover the RResultPtr from the type-erased handle by
                # instantiating with the type of the value
                res = ROOT.ROOT.Detail.RDF.GetMergeableValue(
                    res.GetResultPtr[res_type]())
            mergeables.append(res)
        return mergeables

    def reducer(mergeables_out, mergeables_in):
        """
        Merges two given lists of values that were returned by the mapper
        function for two different ranges.

        Args:
            mergeables_out (list): A list of computed (mergeable)values
                for a given entry range in a dataset. Its elements are
                updated with the information contained in the elements of
                the other list.

            mergeables_in (list): A list of computed (mergeable)values for
                a given entry range in a dataset.

        Returns:
            list: The first argument, after its elements have been
            updated.
        """
        # Elements are paired by position
        for pair in zip(mergeables_out, mergeables_in):
            Utils.merge_values(*pair)
        return mergeables_out

    # Map-Reduce over all the ranges
    values = self.ProcessAndMerge(ranges, mapper, reducer)

    # Action nodes come in the same order as the values: fill each one
    nodes = generator.get_action_nodes()
    for action_node, computed_value in zip(nodes, values):
        Utils.set_value_on_node(computed_value, action_node, self)