def test_mapper_from_graph(self):
    """Check that the mapper traverses a freshly built graph correctly."""
    # Mock RDF object collecting the visit order of the graph nodes
    mock_rdf = ComputationGraphGeneratorTest.Temp()

    # Build the head node and wrap it in a transformation proxy
    headnode = create_dummy_headnode(1)
    headnode.backend = ComputationGraphGeneratorTest.TestBackend()
    node = Proxy.TransformationProxy(headnode)

    # Operations that shape the computation graph
    n1 = node.Define()
    n2 = node.Filter().Filter()
    n4 = n2.Count()
    n5 = n1.Count()
    n6 = node.Filter()  # noqa: avoid PEP8 F841

    # Build the generator and run the mapper over the mock RDF
    generator = ComputationGraphGenerator.ComputationGraphGenerator(
        node.proxied_node)
    triggerables = generator.generate_computation_graph(mock_rdf, 0)
    nodes = generator.get_action_nodes()

    # Expected traversal order of operation kinds
    expected_order = [1, 3, 2, 2, 3, 2]
    self.assertEqual(mock_rdf.ord_list, expected_order)
    self.assertListEqual(nodes, [n5.proxied_node, n4.proxied_node])
    self.assertListEqual(triggerables, [mock_rdf, mock_rdf])
def execute_graph(self) -> None:
    """
    Executes an RDataFrame computation graph on a distributed backend.

    The needed ingredients are:

    - A collection of logical ranges in which the dataset is split. Each
      range is going to be assigned to a distributed task.
    - A representation of the computation graph that the task needs to
      execute.
    - A way to generate an RDataFrame instance starting from the logical
      range of the task.
    - Optionally, some setup code to be run at the beginning of each task.

    These are used as inputs to a generic mapper function. Results from
    the various mappers are then reduced and the final results are
    retrieved in the local session. These are properly handled to perform
    extra checks, depending on the data source. Finally, the local
    user-facing nodes are filled with the values that were computed
    distributedly so that they can be accessed in the application like
    with local RDataFrame.
    """
    # Decide whether the workflow must be generated in optimized mode
    use_optimized = ROOT.RDF.Experimental.Distributed.optimized
    if use_optimized:
        graph_callable = ComputationGraphGenerator.get_callable_optimized()
    else:
        graph_callable = partial(
            ComputationGraphGenerator.trigger_computation_graph,
            self._generate_graph_dict())

    mapper = partial(
        distrdf_mapper,
        build_rdf_from_range=self._generate_rdf_creator(),
        computation_graph_callable=graph_callable,
        initialization_fn=self.backend.initialization,
        optimized=use_optimized)

    # Run the mapper on every range distributedly and aggregate the
    # partial results coming back from all tasks.
    merged_values = self.backend.ProcessAndMerge(
        self._build_ranges(), mapper, distrdf_reducer)

    # Extra checks that may be needed according to the head node type
    final_values = self._handle_returned_values(merged_values)

    # Action nodes are returned in the same order as the computed values,
    # so they can be filled pairwise.
    for action_node, value in zip(self._get_action_nodes(), final_values):
        Utils.set_value_on_node(value, action_node, self.backend)
def test_mapper_with_pruning(self):
    """Check that the mapper still works after the graph has been pruned."""
    # Mock RDF object collecting the visit order of the graph nodes
    mock_rdf = ComputationGraphGeneratorTest.Temp()

    # Build the head node and wrap it in a transformation proxy
    headnode = create_dummy_headnode(1)
    headnode.backend = ComputationGraphGeneratorTest.TestBackend()
    node = Proxy.TransformationProxy(headnode)

    # Operations that shape the computation graph
    n1 = node.Define()
    n2 = node.Filter().Filter()
    n4 = n2.Count()
    n5 = n1.Count()
    n6 = node.Filter()  # noqa: avoid PEP8 F841

    # Rebinding n5 drops the reference to the old Count node, which is
    # the reason it gets pruned below.
    n5 = n1.Filter()  # noqa: avoid PEP8 F841

    # Build the generator, prune, then run the mapper over the mock RDF
    generator = ComputationGraphGenerator.ComputationGraphGenerator(
        node.proxied_node)
    generator.headnode.graph_prune()
    triggerables = generator.generate_computation_graph(mock_rdf, 0)
    nodes = generator.get_action_nodes()

    # Expected traversal order of operation kinds after pruning
    expected_order = [1, 2, 2, 2, 3, 2]
    self.assertEqual(mock_rdf.ord_list, expected_order)
    self.assertListEqual(nodes, [n4.proxied_node])
    self.assertListEqual(triggerables, [mock_rdf])