def make_dataframe(self, *args, **kwargs):
    """
    Creates an instance of distributed RDataFrame that can send computations
    to a Dask cluster.
    """
    # Set the number of partitions for this dataframe, one of the following:
    # 1. User-supplied `npartitions` optional argument
    npartitions = kwargs.pop("npartitions", None)
    headnode = HeadNode.get_headnode(self, npartitions, *args)
    return DataFrame.RDataFrame(headnode)
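
# A hypothetical usage sketch for the function above, covering case 1: the
# caller supplies `npartitions` explicitly, and `kwargs.pop` consumes it
# before the remaining positional arguments reach the head node. The backend
# class name `DaskBackend` and its `daskclient` keyword are assumptions for
# illustration, not taken from the snippet itself.
from dask.distributed import Client

client = Client()  # connect to (or start) a local Dask cluster
backend = DaskBackend(daskclient=client)  # hypothetical backend constructor
df = backend.make_dataframe("mytree", ["myfile.root"], npartitions=4)
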
def make_dataframe(self, *args, **kwargs):
    """
    Creates an instance of distributed RDataFrame that can send computations
    to a Dask cluster.
    """
    # Set the number of partitions for this dataframe, one of the following:
    # 1. User-supplied `npartitions` optional argument
    # 2. An educated guess according to the backend, using the backend's
    #    `optimize_npartitions` function
    # 3. Set `npartitions` to 2
    npartitions = kwargs.pop("npartitions", self.optimize_npartitions())
    headnode = HeadNode.get_headnode(self, npartitions, *args)
    return DataFrame.RDataFrame(headnode)
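
# A minimal sketch of what `optimize_npartitions` (case 2 above) could look
# like for a Dask backend. This is an assumption for illustration: the
# attribute name `self.client` is not taken from the snippet. Dask's
# `Client.scheduler_info()` reports the currently connected workers, so their
# count is a reasonable default partition count.
def optimize_npartitions(self):
    try:
        workers = self.client.scheduler_info()["workers"]
        if workers:
            return len(workers)
    except (AttributeError, KeyError):
        pass
    # Last resort, mirroring case 3 in the comment above: two partitions.
    return 2
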
def test_count_result_invariance(self):
    """
    Tests that counting the entries in the dataset does not depend on the
    number of partitions. This could have happened if we had used a
    TEntryList to restrict processing to a certain range of entries of the
    TChain in a distributed task, before the changes in
    https://github.com/root-project/root/commit/77bd5aa82e9544811e0d5fce197ab87c739c2e23
    were implemented.
    """
    treename = "entries"
    filenames = ["1cluster_20entries.root"] * 5

    for npartitions in range(1, 6):
        headnode = HeadNode.get_headnode(npartitions, treename, filenames)
        backend = DistRDataFrameInvariants.TestBackend()
        rdf = DataFrame.RDataFrame(headnode, backend)
        self.assertEqual(rdf.Count().GetValue(), 100)
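
# The invariance the test checks holds as long as the entry ranges assigned
# to the distributed tasks form an exact partition of the dataset. A minimal
# sketch of such range building (an illustration of the idea, not the actual
# DistRDF implementation; the helper name `build_ranges` is hypothetical):
def build_ranges(nentries, npartitions):
    """Split [0, nentries) into contiguous, non-overlapping ranges."""
    base, rem = divmod(nentries, npartitions)
    ranges = []
    start = 0
    for i in range(npartitions):
        # The first `rem` ranges take one extra entry to balance the split.
        end = start + base + (1 if i < rem else 0)
        ranges.append((start, end))
        start = end
    return ranges

# Every entry falls in exactly one range, so a distributed Count sums to
# `nentries` for any `npartitions`, e.g.:
assert sum(end - start for start, end in build_ranges(100, 3)) == 100
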
def make_dataframe(self, *args, **kwargs):
    """Creates an instance of SparkDataFrame"""
    headnode = Node.HeadNode(*args)
    return DataFrame.RDataFrame(headnode, self, **kwargs)
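
# A hypothetical usage sketch for the Spark variant above. Unlike the Dask
# versions, extra keyword arguments are forwarded to `DataFrame.RDataFrame`
# rather than consumed here. The backend class name `SparkBackend` and its
# `sparkcontext` keyword are assumptions for illustration.
from pyspark import SparkContext

sc = SparkContext(master="local[2]")
backend = SparkBackend(sparkcontext=sc)  # hypothetical backend constructor
df = backend.make_dataframe("mytree", ["f1.root", "f2.root"])
print(df.Count().GetValue())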