Example #1
    def test_pyspark_runner(self, spark_context):
        sc = spark_context.return_value.__enter__.return_value

        def mock_spark_submit(task):
            from luigi.contrib.pyspark_runner import PySparkRunner
            PySparkRunner(*task.app_command()[1:]).run()
            # Check py-package exists
            self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))
            # Check that main module containing the task exists.
            run_path = os.path.dirname(task.app_command()[1])
            self.assertTrue(
                os.path.exists(
                    os.path.join(run_path, os.path.basename(__file__))))
            # Check that the python path contains the run_path
            self.assertTrue(run_path in sys.path)
            # Check if find_class finds the class for the correct module name.
            with open(task.app_command()[1], 'rb') as fp:
                self.assertTrue(
                    pickle.Unpickler(fp).find_class('spark_test',
                                                    'TestPySparkTask'))

        with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
            job = TestPySparkTask()
            with temporary_unloaded_module(b'') as task_module:
                with_config({'spark': {'py-packages': task_module}})(job.run)()

        sc.textFile.assert_called_with('input')
        sc.textFile.return_value.saveAsTextFile.assert_called_with('output')
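
Every example on this page exercises `temporary_unloaded_module`, a context manager from luigi's test helpers that writes a bytestring of Python source to a temporary module file and yields the module's name without importing it, so the code under test has to perform the import itself. The helper's implementation is not shown on this page; a minimal self-contained sketch with the same observable behavior might look like the following (the details are an assumption, not luigi's actual code):

import contextlib
import os
import sys
import tempfile


@contextlib.contextmanager
def temporary_unloaded_module(python_file_contents):
    # Hypothetical sketch, not luigi's actual test helper.
    # Write the source bytes to a temporary .py file, make its directory
    # importable, and yield the module name *without* importing it.
    with tempfile.NamedTemporaryFile(suffix='.py', delete=False) as f:
        f.write(python_file_contents)
        module_path = f.name
    module_dir = os.path.dirname(module_path)
    module_name = os.path.basename(module_path)[:-len('.py')]
    sys.path.insert(0, module_dir)
    try:
        yield module_name
    finally:
        sys.path.remove(module_dir)
        sys.modules.pop(module_name, None)  # keep the module "unloaded"
        os.unlink(module_path)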
Example #2
    def test_unimported_job_type(self):
        MODULE_CONTENTS = b'''
import luigi


class UnimportedTask(luigi.Task):
    def complete(self):
        return False
'''

        class NotImportedTask(luigi.Task):
            task_family = 'UnimportedTask'
            task_module = None

        task = NotImportedTask()

        # verify that it can't run the task without the module info necessary to import it
        self.w.add(task)
        self.assertFalse(self.assistant.run())
        self.assertEqual(list(self.sch.task_list('FAILED', '').keys()), [task.task_id])

        # check that it can import with the right module
        with temporary_unloaded_module(MODULE_CONTENTS) as task.task_module:
            self.w.add(task)
            self.assertTrue(self.assistant.run())
            self.assertEqual(list(self.sch.task_list('DONE', '').keys()), [task.task_id])
Example #3
    def test_dynamic_loading(self):
        with temporary_unloaded_module(CONTENTS) as temp_module_name:
            luigi.interface.run([
                '--module', temp_module_name, 'FooTask', '--x', '123',
                '--local-scheduler', '--no-lock'
            ])
            self.assertEqual(luigi._testing_glob_var, 123)
Example #4
    def test_pyspark_runner(self, spark_context):
        sc = spark_context.return_value.__enter__.return_value

        def mock_spark_submit(task):
            from luigi.contrib.pyspark_runner import PySparkRunner
            PySparkRunner(*task.app_command()[1:]).run()
            # Check py-package exists
            self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))

        with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
            job = TestPySparkTask()
            with temporary_unloaded_module(b'') as task_module:
                with_config({'spark': {'py-packages': task_module}})(job.run)()

        sc.textFile.assert_called_with('input')
        sc.textFile.return_value.saveAsTextFile.assert_called_with('output')
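
The pyspark examples apply a one-off configuration override with `with_config({'spark': {'py-packages': task_module}})(job.run)()`: a decorator factory that patches luigi's configuration for the duration of a single call. A simplified sketch of such a decorator, assuming luigi's real `configuration.get_config()` API (and, as a simplification, dropping the patched sections on exit instead of restoring prior values as the real helper presumably does):

import functools

from luigi import configuration


def with_config(config):
    # Hypothetical sketch, not luigi's actual test helper.
    # config is a {section: {option: value}} mapping.
    def decorator(fun):
        @functools.wraps(fun)
        def wrapper(*args, **kwargs):
            parser = configuration.get_config()
            for section, options in config.items():
                if not parser.has_section(section):
                    parser.add_section(section)
                for option, value in options.items():
                    parser.set(section, option, value)
            try:
                return fun(*args, **kwargs)
            finally:
                # Simplification: remove the patched sections entirely.
                for section in config:
                    parser.remove_section(section)
        return wrapper
    return decorator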
Example #5
    def _test_context_manager(self, force_multiprocessing):
        CONTEXT_MANAGER_MODULE = b'''
class MyContextManager(object):
    def __init__(self, task_process):
        self.task = task_process.task
    def __enter__(self):
        assert not self.task.run_event.is_set(), "the task should not have run yet"
        self.task.enter_event.set()
        return self
    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        assert self.task.run_event.is_set(), "the task should have run"
        self.task.exit_event.set()
'''

        class DummyEventRecordingTask(luigi.Task):
            def __init__(self, *args, **kwargs):
                self.enter_event = multiprocessing.Event()
                self.exit_event = multiprocessing.Event()
                self.run_event = multiprocessing.Event()
                super(DummyEventRecordingTask, self).__init__(*args, **kwargs)

            def run(self):
                assert self.enter_event.is_set(), "the context manager should have been entered"
                assert not self.exit_event.is_set(), "the context manager should not have been exited yet"
                assert not self.run_event.is_set(), "the task should not have run yet"
                self.run_event.set()

            def complete(self):
                return self.run_event.is_set()

        with temporary_unloaded_module(CONTEXT_MANAGER_MODULE) as module_name:
            t = DummyEventRecordingTask()
            w = Worker(task_process_context=module_name + '.MyContextManager',
                       force_multiprocessing=force_multiprocessing)
            w.add(t)
            self.assertTrue(w.run())
            self.assertTrue(t.complete())
            self.assertTrue(t.enter_event.is_set())
            self.assertTrue(t.exit_event.is_set())
Example #6
    def test_pyspark_session_runner_use_spark_session_true_spark1(self):
        pyspark = MagicMock()
        pyspark.__version__ = '1.6.3'
        pyspark_sql = MagicMock()
        with patch.dict(sys.modules, {
                'pyspark': pyspark,
                'pyspark.sql': pyspark_sql
        }):

            def mock_spark_submit(task):
                from luigi.contrib.pyspark_runner import PySparkSessionRunner
                self.assertRaises(
                    RuntimeError,
                    PySparkSessionRunner(*task.app_command()[1:]).run)

            with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
                job = TestPySparkSessionTask()
                with temporary_unloaded_module(b'') as task_module:
                    with_config({'spark': {
                        'py-packages': task_module
                    }})(job.run)()
Example #7
    def test_pyspark_session_runner_use_spark_session_true(self):
        pyspark = MagicMock()
        pyspark.__version__ = '2.1.0'
        pyspark_sql = MagicMock()
        with patch.dict(sys.modules, {
                'pyspark': pyspark,
                'pyspark.sql': pyspark_sql
        }):
            spark = pyspark_sql.SparkSession.builder.config.return_value.enableHiveSupport.return_value.getOrCreate.return_value
            sc = spark.sparkContext

            def mock_spark_submit(task):
                from luigi.contrib.pyspark_runner import PySparkSessionRunner
                PySparkSessionRunner(*task.app_command()[1:]).run()
                # Check py-package exists
                self.assertTrue(os.path.exists(sc.addPyFile.call_args[0][0]))
                # Check that main module containing the task exists.
                run_path = os.path.dirname(task.app_command()[1])
                self.assertTrue(
                    os.path.exists(
                        os.path.join(run_path, os.path.basename(__file__))))
                # Check that the python path contains the run_path
                self.assertTrue(run_path in sys.path)
                # Check if find_class finds the class for the correct module name.
                with open(task.app_command()[1], 'rb') as fp:
                    self.assertTrue(
                        pickle.Unpickler(fp).find_class(
                            'spark_test', 'TestPySparkSessionTask'))

            with patch.object(SparkSubmitTask, 'run', mock_spark_submit):
                job = TestPySparkSessionTask()
                with temporary_unloaded_module(b'') as task_module:
                    with_config({'spark': {
                        'py-packages': task_module
                    }})(job.run)()

            spark.sql.assert_called_with('input')
            spark.sql.return_value.write.saveAsTable.assert_called_with(
                'output')
            spark.stop.assert_called_once_with()