def execute_graph(self) -> None:
    """
    Run the RDataFrame computation graph on the distributed backend and
    store the results on the local action nodes.

    The distributed execution is assembled from:

    - The logical ranges the dataset is split into; each range is assigned
      to one distributed task.
    - A callable representation of the computation graph that every task
      executes.
    - A factory that creates an RDataFrame instance from the logical range
      of a task.
    - The backend initialization function, run at the beginning of each
      task.

    These ingredients are bound into a generic mapper. The backend applies
    the mapper to every range and reduces the partial results. The merged
    values are then passed through ``_handle_returned_values`` for any
    extra checks and finally assigned, in order, to the local user-facing
    action nodes, so that they can be accessed like with a local
    RDataFrame.
    """
    # The optimized-mode flag decides which function runs the graph inside
    # a task and is also forwarded to the mapper.
    optimized = ROOT.RDF.Experimental.Distributed.optimized

    # Refresh the number of partitions every time the computations are
    # triggered: the resource configuration (e.g. the number of available
    # cores) may have changed between runs.
    # NOTE(review): the assignment below is unconditional. Whether a value
    # explicitly requested by the user is honoured is presumably decided
    # inside `optimize_npartitions` - confirm against the backend.
    self.npartitions = self.backend.optimize_npartitions()

    # Select the function that runs the graph in a task, then bind the
    # graph description to it so the mapper only has to supply the
    # per-task arguments.
    graph_runner = (
        ComputationGraphGenerator.run_with_cppworkflow
        if optimized
        else ComputationGraphGenerator.trigger_computation_graph
    )
    computation_graph_callable = partial(
        graph_runner, self._generate_graph_dict())

    mapper = partial(
        distrdf_mapper,
        build_rdf_from_range=self._generate_rdf_creator(),
        computation_graph_callable=computation_graph_callable,
        initialization_fn=self.backend.initialization,
        optimized=optimized,
    )

    # Map over all ranges on the backend and reduce the partial results
    # into a single aggregated collection.
    merged_values = self.backend.ProcessAndMerge(
        self._build_ranges(), mapper, distrdf_reducer)

    # Extra checks that depend on the type of the head node.
    checked_values = self._handle_returned_values(merged_values)

    # The action nodes come in the same order as the computed values, so
    # they can be paired positionally.
    action_nodes = self._get_action_nodes()
    for action_node, computed_value in zip(action_nodes, checked_values):
        Utils.set_value_on_node(computed_value, action_node, self.backend)
def execute(self, generator: "ComputationGraphGenerator"):
    """
    Run an RDataFrame computation graph on this distributed backend.

    The graph is executed once per range of the dataset through a generic
    mapper, the partial results are merged by the reducer and the final
    values are written to the action nodes of the head node.

    Args:
        generator (ComputationGraphGenerator): A factory object for a
            computation graph. Its ``get_callable`` method (or
            ``get_callable_optimized`` when the optimized mode is on)
            returns the function responsible for creating the computation
            graph of a given RDataFrame object and a range of entries.
            The range is needed for the `Snapshot` operation.
    """
    # The optimized-mode flag selects which callable factory is used and
    # is also forwarded to the mapper.
    optimized = ROOT.RDF.Experimental.Distributed.optimized
    make_graph_callable = (
        generator.get_callable_optimized if optimized
        else generator.get_callable
    )
    computation_graph_callable = make_graph_callable()

    # Take the initialization function out of the instance up front, so
    # that the mapper does not hold a reference to `self`.
    worker_init = self.initialization

    # Everything that depends on the dataset comes from the head node:
    # the ranges to process and the factory that turns a range into an
    # RDataFrame.
    headnode = generator.headnode
    dataset_ranges = headnode.build_ranges()
    rdf_creator = headnode.generate_rdf_creator()

    mapper = partial(
        distrdf_mapper,
        build_rdf_from_range=rdf_creator,
        computation_graph_callable=computation_graph_callable,
        initialization_fn=worker_init,
        optimized=optimized,
    )

    # Map-Reduce over all ranges.
    merged_values = self.ProcessAndMerge(
        dataset_ranges, mapper, distrdf_reducer)

    # Extract the actual results of the requested RDataFrame operations.
    results = handle_returned_values(headnode, merged_values)

    # The action nodes come in the same order as the results, so they can
    # be paired positionally.
    action_nodes = headnode.get_action_nodes()
    for action_node, result in zip(action_nodes, results):
        Utils.set_value_on_node(result, action_node, self)
def execute(self, generator):
    """
    Run an RDataFrame computation graph on this distributed backend.

    A mapper and a reducer are defined locally, handed to
    ``ProcessAndMerge`` together with the ranges of the dataset, and the
    merged values are finally written to the action nodes of the graph.

    Args:
        generator (ComputationGraphGenerator): A factory object for a
            computation graph. Its ``get_callable`` method (or
            ``get_callable_optimized`` when the optimized mode is on)
            returns the function responsible for creating the computation
            graph of a given RDataFrame object and a range of entries.
            The range is needed for the `Snapshot` operation.
    """
    # The optimized-mode flag selects the callable factory here and the
    # result-handling path inside the mapper.
    optimized = ROOT.RDF.Experimental.Distributed.optimized
    make_graph_callable = (
        generator.get_callable_optimized if optimized
        else generator.get_callable
    )
    computation_graph_callable = make_graph_callable()

    # Take the initialization function out of the instance up front, so
    # that the mapper closure does not hold a reference to `self`.
    worker_init = self.initialization

    # Everything that depends on the dataset comes from the head node.
    headnode = generator.headnode
    dataset_ranges = headnode.build_ranges()
    rdf_creator = headnode.generate_rdf_creator()

    def mapper(current_range):
        """
        Run the event loop for one range and collect the values of all
        action nodes of the computation graph.

        Args:
            current_range (Range): A Range named tuple, representing the
                range of entries to be processed, their input files and
                information about friend trees.

        Returns:
            list: The (mergeable) values of all action nodes in the
            computation graph.
        """
        # Graphics are not needed inside a distributed task.
        ROOT.gROOT.SetBatch(True)

        # Thread safety is enabled for the whole mapper: the GIL is
        # released before calling into C++ to run the event loop, so two
        # tasks could be invoking the C++ interpreter simultaneously
        # (e.g. with Dask multi-threaded or multi-process workers).
        ROOT.EnableThreadSafety()

        # We have to decide whether to do this in Dist or in subclasses
        # Utils.declare_headers(worker_includes)  # Declare headers if any

        # Prepare the worker runtime environment.
        worker_init()

        # RDataFrame instance for this task, built according to the type
        # of the head node.
        rdf = rdf_creator(current_range)

        if not optimized:
            # The callable returns the result pointers of the actions;
            # each one is converted to its mergeable value.
            return [
                Utils.get_mergeablevalue(resultptr)
                for resultptr in computation_graph_callable(
                    rdf, current_range.id)
            ]

        # Optimized mode: the callable returns the results of the actions
        # together with their types.
        results, res_types = computation_graph_callable(
            rdf, current_range.id)

        mergeables = []
        for result, result_type in zip(results, res_types):
            if isinstance(result, ROOT.RDF.RResultHandle):
                # Recover the RResultPtr from the type-erased handle by
                # instantiating with the type of the value.
                result = ROOT.ROOT.Detail.RDF.GetMergeableValue(
                    result.GetResultPtr[result_type]())
            mergeables.append(result)
        return mergeables

    def reducer(mergeables_out, mergeables_in):
        """
        Merge two lists of values returned by the mapper for two
        different ranges.

        Args:
            mergeables_out (list): Computed (mergeable) values for one
                entry range of the dataset. Its elements are updated with
                the information held by the elements of the other list.

            mergeables_in (list): Computed (mergeable) values for another
                entry range of the dataset.

        Returns:
            list: The list of updated (mergeable) values, i.e.
            ``mergeables_out``.
        """
        for target, source in zip(mergeables_out, mergeables_in):
            Utils.merge_values(target, source)
        return mergeables_out

    # Map-Reduce over all ranges.
    merged_values = self.ProcessAndMerge(dataset_ranges, mapper, reducer)

    # The action nodes come in the same order as the merged values, so
    # they can be paired positionally.
    action_nodes = generator.get_action_nodes()
    for action_node, merged_value in zip(action_nodes, merged_values):
        Utils.set_value_on_node(merged_value, action_node, self)