def test_range_operation_single_thread(self):
    """
    Check that the 'Range' operation, which is only supported in
    single-threaded local mode, raises an Exception on the Spark backend.
    """
    backend = Spark()

    with self.assertRaises(Exception):
        backend.check_supported("Range")
def use(backend_name, conf={}):
    """
    Allows the user to choose the execution backend.

    Args:
        backend_name (str): The name of the chosen backend.

        conf (dict, optional): A dictionary with the necessary
            configuration parameters. Its default value is an empty
            dictionary {}.
    """
    future_backends = ["dask"]

    global current_backend

    if backend_name in future_backends:
        msg = "This backend environment will be considered in the future!"
        raise NotImplementedError(msg)
    elif backend_name == "local":
        current_backend = Local(conf)
    elif backend_name == "spark":
        from PyRDF.backend.Spark import Spark
        current_backend = Spark(conf)
    elif backend_name == "AWS":
        from PyRDF.backend.AWS import AWS
        current_backend = AWS(conf)
    else:
        msg = "Incorrect backend environment \"{}\"".format(backend_name)
        raise Exception(msg)
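# Minimal usage sketch for `use` (assumption: it is exposed as `PyRDF.use`
# at the package level, matching the `PyRDF.backend.*` imports above).
# The 'npartitions' key is the one exercised by the Spark tests below; any
# other keys are forwarded to the chosen backend unchanged.
import PyRDF

# Run locally with the default (empty) configuration.
PyRDF.use("local")

# Run on Spark, requesting 5 partitions for the distributed dataset.
PyRDF.use("spark", {"npartitions": 5})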
def test_initialization_method(self):
    """
    Check the initialization method in the Spark backend.

    Declare a variable in the ROOT interpreter called 'userValue' which
    stores the value defined by the user on the python side.
    """
    def init(value):
        import ROOT
        cpp_code = '''int userValue = %s ;''' % value
        ROOT.gInterpreter.ProcessLine(cpp_code)

    PyRDF.initialize(init, 123)
    PyRDF.current_backend = Spark()
    # The Spark backend has a limited list of supported methods, so we use
    # Histo1D, which is a supported action.
    # The code below creates an RDataFrame instance with a single entry
    # and defines a column 'u' whose value is taken from the variable
    # 'userValue'.
    # This variable is only declared inside the ROOT interpreter; however,
    # its value is passed by the user from the python side.
    # If the init function defined by the user is properly propagated to
    # the Spark backend, each worker will run the init function as a first
    # step and hence the variable 'userValue' will be defined at runtime.
    # As a result, the Define operation should read the variable
    # 'userValue' and assign it to the entries of the column 'u' (only one
    # entry).
    # Finally, Histo1D returns a histogram filled with that single value.
    # The mean of a single value is the value itself, independently of
    # the number of spawned workers.
    df = PyRDF.RDataFrame(1).Define("u", "userValue").Histo1D("u")
    h = df.GetValue()

    self.assertEqual(h.GetMean(), 123)
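# Illustrative sketch only: one way an initialization hook registered via
# PyRDF.initialize(init, 123) could be executed on every Spark worker
# before the actual event-loop work, as the comments above describe. The
# names `distribute_with_init` and `run_on_partition` are hypothetical and
# NOT part of PyRDF's real internals.
def distribute_with_init(sc, ranges, init_fn, mapper):
    """Map `mapper` over `ranges`, running `init_fn` first in each task so
    interpreter-level state (e.g. 'userValue') exists on the worker before
    any RDataFrame operation touches it."""
    def run_on_partition(entry_range):
        init_fn()                   # e.g. lambda: init(123), declares userValue
        return mapper(entry_range)  # then process this range of entries

    return sc.parallelize(ranges, len(ranges)).map(run_on_partition).collect()

# Hypothetical usage: distribute_with_init(sc, ranges, lambda: init(123), mapper)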
def test_npartitions_default(self):
    """
    Check that the default number of partitions is correctly set when
    no input value is given in the config dictionary.
    """
    backend = Spark()

    self.assertEqual(backend.npartitions, Spark.MIN_NPARTITIONS)
def test_npartitions_with_num_executors(self):
    """
    Check that the number of partitions is correctly set to the number
    of executors when no explicit 'npartitions' value is given in the
    config dictionary.
    """
    backend = Spark({'spark.executor.instances': 10})

    self.assertEqual(backend.npartitions, 10)
def test_set_npartitions_explicit(self):
    """
    Check that the number of partitions is correctly set for a given
    input value in the config dictionary.
    """
    backend = Spark({"npartitions": 5})

    self.assertEqual(backend.npartitions, 5)
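# Sketch of the partition-count resolution that the three tests above
# exercise, written as a standalone helper. The precedence used when both
# 'npartitions' and 'spark.executor.instances' are present is an
# assumption; the tests only pin down each case in isolation.
MIN_NPARTITIONS = 2  # assumed default; the real value lives in Spark.MIN_NPARTITIONS


def resolve_npartitions(config):
    """Return the number of partitions implied by a config dictionary."""
    if "npartitions" in config:
        return int(config["npartitions"])                # explicit user choice
    if "spark.executor.instances" in config:
        return int(config["spark.executor.instances"])   # one partition per executor
    return MIN_NPARTITIONS                               # fallback default


# resolve_npartitions({}) == MIN_NPARTITIONS
# resolve_npartitions({"spark.executor.instances": 10}) == 10
# resolve_npartitions({"npartitions": 5}) == 5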
def test_unsupported_operations(self):
    """Check that unsupported operations raise an Exception."""
    backend = Spark()

    with self.assertRaises(Exception):
        backend.check_supported("Take")

    with self.assertRaises(Exception):
        backend.check_supported("Foreach")

    with self.assertRaises(Exception):
        backend.check_supported("Range")
def test_set_spark_context_default(self):
    """
    Check that if the config dictionary is empty, a `SparkContext`
    object is still created with default options for the current system.
    """
    backend = Spark()

    self.assertDictEqual(backend.config, {})
    self.assertIsInstance(backend.sparkContext, SparkContext)
def test_npartitions_with_already_existing_spark_context(self):
    """
    Check that the number of partitions is correctly set when a
    SparkContext already exists.
    """
    from pyspark import SparkConf
    sparkConf = SparkConf().set('spark.executor.instances', 15)
    SparkContext(conf=sparkConf)
    backend = Spark()

    self.assertEqual(backend.npartitions, 15)
def test_set_spark_context_with_conf(self):
    """
    Check that a `SparkContext` object is correctly created for a given
    `SparkConf` object in the config dictionary.
    """
    backend = Spark({'spark.app.name': 'my-pyspark-app1'})

    self.assertIsInstance(backend.sparkContext, SparkContext)

    appname = backend.sparkContext.getConf().get('spark.app.name')
    self.assertEqual(appname, 'my-pyspark-app1')
def test_none(self):
    """Check that incorrect operations raise an Exception."""
    backend = Spark()

    with self.assertRaises(Exception):
        backend.check_supported("random")
def test_transformation(self):
    """Check that transformation nodes are classified accurately."""
    backend = Spark()
    backend.check_supported("Define")
def test_action(self):
    """Check that action nodes are classified accurately."""
    backend = Spark()
    backend.check_supported("Histo1D")
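# Minimal sketch of a check_supported implementation consistent with the
# tests above: supported operations live in a whitelist and anything else
# raises. The class name `SparkLike` and the exact contents of
# `supported_operations` are assumptions; the tests only pin down the
# behaviour for "Define", "Histo1D", "Take", "Foreach", "Range" and
# unknown names.
class SparkLike(object):
    supported_operations = {"Define", "Filter", "Histo1D", "Count"}

    def check_supported(self, operation_name):
        """Raise if `operation_name` is not supported by this backend."""
        if operation_name not in self.supported_operations:
            raise Exception(
                "The current backend doesn't support \"{}\"".format(operation_name)
            )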