def test_pyspark_runner(self, spark_context):
    sc = spark_context.return_value.__enter__.return_value

    def mock_spark_submit(task):
        from luigi.contrib.pyspark_runner import PySparkRunner
        PySparkRunner(*task.app_command()[1:]).run()
        # Check py-package exists
        self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))
        # Check that main module containing the task exists.
        run_path = os.path.dirname(task.app_command()[1])
        self.assertTrue(os.path.exists(os.path.join(run_path, os.path.basename(__file__))))
        # Check that the python path contains the run_path
        self.assertTrue(run_path in sys.path)
        # Check if find_class finds the class for the correct module name.
        with open(task.app_command()[1], 'rb') as fp:
            self.assertTrue(pickle.Unpickler(fp).find_class('spark_test', 'TestPySparkTask'))

    with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
        job = TestPySparkTask()
        with temporary_unloaded_module(b'') as task_module:
            with_config({'spark': {'py-packages': task_module}})(job.run)()

    sc.textFile.assert_called_with('input')
    sc.textFile.return_value.saveAsTextFile.assert_called_with('output')
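For context, a minimal sketch of the kind of fixture task the assertions above imply. The class name TestPySparkTask and the 'input'/'output' paths come from the test itself; the MockTarget-based input/output and the main() body are assumptions, not the actual fixture:

from luigi.contrib.spark import PySparkTask
from luigi.mock import MockTarget


class TestPySparkTask(PySparkTask):

    def input(self):
        return MockTarget('input')

    def output(self):
        return MockTarget('output')

    def main(self, sc, *args):
        # Read the mocked input path and write to the mocked output path;
        # the test verifies exactly these two calls on the SparkContext mock.
        sc.textFile(self.input().path).saveAsTextFile(self.output().path)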
def test_unimported_job_type(self):
    MODULE_CONTENTS = b'''
import luigi


class UnimportedTask(luigi.Task):
    def complete(self):
        return False
'''

    class NotImportedTask(luigi.Task):
        task_family = 'UnimportedTask'
        task_module = None

    task = NotImportedTask()

    # verify that it can't run the task without the module info necessary to import it
    self.w.add(task)
    self.assertFalse(self.assistant.run())
    self.assertEqual(list(self.sch.task_list('FAILED', '').keys()), [task.task_id])

    # check that it can import with the right module
    with temporary_unloaded_module(MODULE_CONTENTS) as task.task_module:
        self.w.add(task)
        self.assertTrue(self.assistant.run())
        self.assertEqual(list(self.sch.task_list('DONE', '').keys()), [task.task_id])
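All of these tests lean on the temporary_unloaded_module helper from Luigi's test suite: it materializes the given source bytes as an importable, but not yet imported, module and yields its name. A rough sketch of how such a helper can be written, under that assumption (the details below are not the actual implementation):

import contextlib
import os
import sys
import tempfile


@contextlib.contextmanager
def temporary_unloaded_module(python_file_contents):
    # Write the source bytes to a temporary .py file, make its directory
    # importable, and yield the module name without ever importing it.
    with tempfile.NamedTemporaryFile(prefix='_test_generated_module_',
                                     suffix='.py', delete=False) as f:
        f.write(python_file_contents)
        path = f.name
    module_dir = os.path.dirname(path)
    sys.path.insert(0, module_dir)
    try:
        yield os.path.splitext(os.path.basename(path))[0]
    finally:
        sys.path.remove(module_dir)
        os.unlink(path)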
def test_dynamic_loading(self):
    with temporary_unloaded_module(CONTENTS) as temp_module_name:
        luigi.interface.run([
            '--module', temp_module_name, 'FooTask',
            '--x', '123', '--local-scheduler', '--no-lock'
        ])
        self.assertEqual(luigi._testing_glob_var, 123)
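CONTENTS is defined elsewhere in the test module. Given the command line and the final assertion, it presumably looks something like the following sketch; the FooTask name, the x parameter, and the _testing_glob_var side effect are dictated by the test, everything else is assumed:

CONTENTS = b'''
import luigi


class FooTask(luigi.Task):
    x = luigi.IntParameter()

    def run(self):
        # Record the parameter so the test can observe that the module was
        # actually loaded and the task actually ran.
        luigi._testing_glob_var = self.x
'''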
def test_pyspark_runner(self, spark_context):
    sc = spark_context.return_value.__enter__.return_value

    def mock_spark_submit(task):
        from luigi.contrib.pyspark_runner import PySparkRunner
        PySparkRunner(*task.app_command()[1:]).run()
        # Check py-package exists
        self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))

    with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
        job = TestPySparkTask()
        with temporary_unloaded_module(b'') as task_module:
            with_config({'spark': {'py-packages': task_module}})(job.run)()

    sc.textFile.assert_called_with('input')
    sc.textFile.return_value.saveAsTextFile.assert_called_with('output')
def _test_context_manager(self, force_multiprocessing):
    CONTEXT_MANAGER_MODULE = b'''
class MyContextManager(object):
    def __init__(self, task_process):
        self.task = task_process.task

    def __enter__(self):
        assert not self.task.run_event.is_set(), "the task should not have run yet"
        self.task.enter_event.set()
        return self

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        assert self.task.run_event.is_set(), "the task should have run"
        self.task.exit_event.set()
'''

    class DummyEventRecordingTask(luigi.Task):
        def __init__(self, *args, **kwargs):
            self.enter_event = multiprocessing.Event()
            self.exit_event = multiprocessing.Event()
            self.run_event = multiprocessing.Event()
            super(DummyEventRecordingTask, self).__init__(*args, **kwargs)

        def run(self):
            assert self.enter_event.is_set(), "the context manager should have been entered"
            assert not self.exit_event.is_set(), "the context manager should not have been exited yet"
            assert not self.run_event.is_set(), "the task should not have run yet"
            self.run_event.set()

        def complete(self):
            return self.run_event.is_set()

    with temporary_unloaded_module(CONTEXT_MANAGER_MODULE) as module_name:
        t = DummyEventRecordingTask()
        w = Worker(task_process_context=module_name + '.MyContextManager',
                   force_multiprocessing=force_multiprocessing)
        w.add(t)
        self.assertTrue(w.run())
        self.assertTrue(t.complete())
        self.assertTrue(t.enter_event.is_set())
        self.assertTrue(t.exit_event.is_set())
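The leading underscore marks this as a shared helper; it is presumably exercised by two thin test methods, one per worker mode, along these lines (the method names here are assumptions):

def test_context_manager(self):
    self._test_context_manager(False)

def test_context_manager_with_multiprocessing(self):
    self._test_context_manager(True)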
def test_pyspark_session_runner_use_spark_session_true_spark1(self):
    pyspark = MagicMock()
    pyspark.__version__ = '1.6.3'
    pyspark_sql = MagicMock()
    with patch.dict(sys.modules, {'pyspark': pyspark, 'pyspark.sql': pyspark_sql}):

        def mock_spark_submit(task):
            from luigi.contrib.pyspark_runner import PySparkSessionRunner
            # With pyspark reporting a 1.x version there is no usable SparkSession,
            # so the runner is expected to fail with a RuntimeError.
            self.assertRaises(RuntimeError, PySparkSessionRunner(*task.app_command()[1:]).run)

        with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
            job = TestPySparkSessionTask()
            with temporary_unloaded_module(b'') as task_module:
                with_config({'spark': {'py-packages': task_module}})(job.run)()
def test_pyspark_session_runner_use_spark_session_true(self):
    pyspark = MagicMock()
    pyspark.__version__ = '2.1.0'
    pyspark_sql = MagicMock()
    with patch.dict(sys.modules, {'pyspark': pyspark, 'pyspark.sql': pyspark_sql}):
        spark = pyspark_sql.SparkSession.builder.config.return_value.enableHiveSupport.return_value.getOrCreate.return_value
        sc = spark.sparkContext

        def mock_spark_submit(task):
            from luigi.contrib.pyspark_runner import PySparkSessionRunner
            PySparkSessionRunner(*task.app_command()[1:]).run()
            # Check py-package exists
            self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))
            # Check that main module containing the task exists.
            run_path = os.path.dirname(task.app_command()[1])
            self.assertTrue(os.path.exists(os.path.join(run_path, os.path.basename(__file__))))
            # Check that the python path contains the run_path
            self.assertTrue(run_path in sys.path)
            # Check if find_class finds the class for the correct module name.
            with open(task.app_command()[1], 'rb') as fp:
                self.assertTrue(pickle.Unpickler(fp).find_class('spark_test', 'TestPySparkSessionTask'))

        with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
            job = TestPySparkSessionTask()
            with temporary_unloaded_module(b'') as task_module:
                with_config({'spark': {'py-packages': task_module}})(job.run)()

        spark.sql.assert_called_with('input')
        spark.sql.return_value.write.saveAsTable.assert_called_with('output')
        spark.stop.assert_called_once_with()
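As with TestPySparkTask above, a sketch of the fixture these assertions imply. The class name comes from the test; the MockTarget paths and the main() body are assumptions:

from luigi.contrib.spark import PySparkTask
from luigi.mock import MockTarget


class TestPySparkSessionTask(PySparkTask):

    def input(self):
        return MockTarget('input')

    def output(self):
        return MockTarget('output')

    def main(self, session, *args):
        # The test asserts exactly these calls on the SparkSession mock.
        session.sql(self.input().path).write.saveAsTable(self.output().path)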