Ejemplo n.º 1
0
 def test_luigi_sanity_complex_foo(self, complex_foo):
     """Build the given ``complex_foo`` task through dbnd and expect success.

     The build runs with an explicit ``databand_url`` and the ``file`` and
     ``console`` trackers configured. A ``/tmp/bar`` directory left over from
     an earlier run is removed before building.
     """
     with dbnd_config({CoreConfig.databand_url: "http://localhost:8080"}), dbnd_config(
         {CoreConfig.tracker: ["file", "console"]}
     ):
         try:
             shutil.rmtree("/tmp/bar")
         except FileNotFoundError:
             # Nothing to clean up: the directory does not exist yet.
             pass
         build_result = dbnd_luigi_build(tasks=[complex_foo])
     assert build_result.status == LuigiStatusCode.SUCCESS
Ejemplo n.º 2
0
    def test_build_fat_wheel(self):
        """Build a fat wheel for the test package; check its content and caching.

        The produced archive must exist and contain ``six.py``,
        ``luigi/task.py``, the package module and its dist-info metadata.
        A second call to ``build_fat_wheel`` must return the same path.
        """
        with dbnd_config(
            {
                "bdist_zip": {
                    "package_dir": test_scenario_path("dbnd-test-package"),
                    "requirements_file": test_scenario_path(
                        "dbnd-test-package/requirements.txt"
                    ),
                }
            }
        ):
            bdist_file = build_fat_wheel()
            assert os.path.exists(bdist_file)

            # Open the archive in a ``with`` block so the file handle is always
            # closed, and list members through the public ``namelist()`` API
            # rather than the undocumented ``NameToInfo`` attribute.
            with zipfile.ZipFile(file=bdist_file, mode="r") as wheel_zip:
                all_files = set(wheel_zip.namelist())

            assert "six.py" in all_files
            assert "dbnd_test_package/my_lib.py" in all_files
            assert "dbnd_test_package-0.1.dist-info/METADATA" in all_files
            assert "luigi/task.py" in all_files

            # check cache
            new_bdist_file = build_fat_wheel()
            assert bdist_file == new_bdist_file
Ejemplo n.º 3
0
 def test__by_day_simple_local(self):
     """Run ``ByDayExamplePipeline`` for a ``2d`` period with a local task env.

     The fetch tasks are explicitly enabled in prod through config.
     """
     with dbnd_config(
         {
             ProductionIdsAndData.task_env: "local",
             FetchIds.task_enabled_in_prod: True,
             FetchData.task_enabled_in_prod: True,
         }
     ):
         pipeline = ByDayExamplePipeline(period="2d")
         assert_run_task(pipeline)
    def test_submit_driver_req(self, mock_client):
        """Check the pod request created when the driver is submitted.

        Runs ``dbnd_sanity_check`` against the ``gcp_k8s`` env and inspects the
        single ``create_namespaced_pod`` call recorded on ``mock_client``
        (presumably a patched client factory injected outside this view).
        """
        cli_args = [
            "dbnd_sanity_check",
            "--env=gcp_k8s",
            "--set-config",
            "kubernetes.container_tag=tag",
        ]
        with dbnd_config(K8S_CONFIG):
            dbnd_run_cmd(cli_args)

        create_pod_calls = mock_client().create_namespaced_pod.call_args_list
        assert len(create_pod_calls) == 1

        pod_body = create_pod_calls[0].kwargs["body"]

        # 1) the default labels must be part of the request metadata
        expected_labels = {
            "dbnd_task_family": "d.d.d.docker-run-task",
            "dbnd_task_name": "dbnd-driver-run",
            "dbnd_task_af_id": "dbnd-driver-run",
            "dbnd": "dbnd-system-task-run",
        }
        assert is_sub_dict(pod_body["metadata"]["labels"], expected_labels)

        # 2) the driver container runs with the global resources
        expected_resources = {
            "limits": {"test_limits": 1},
            "requests": {"memory": "1536Mi", "cpu": "1"},
        }
        assert pod_body["spec"]["containers"][0]["resources"] == expected_resources
Ejemplo n.º 5
0
 def test_multiple_input_tracking(self, task_c):
     """Check that the wrapped ``task_c`` tracks its four params as inputs.

     Each of ``output1``, ``output10``, ``output2`` and ``output20`` must be a
     truthy attribute of the wrapped task and have a ``task_input`` kind.
     """
     # Output1 and 2 are actually inputs from TaskB, just badly named
     input_names = ("output1", "output10", "output2", "output20")
     with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
         dbnd_task = wrap_luigi_task(task_c)
         assert dbnd_task
         for attr_name in input_names:
             assert getattr(dbnd_task, attr_name)
         # Resolve every parameter first, then check the kinds.
         tracked_params = [
             [p for p in dbnd_task.task_meta.task_params if p.name == wanted][0]
             for wanted in input_names
         ]
         for tracked in tracked_params:
             assert tracked.parameter.kind == _ParameterKind.task_input
Ejemplo n.º 6
0
    def test_spark_conf_merge_not_overlap(self, _, __, spark_submit_hook):
        """Run TaskA, TaskB and TaskC and check the conf given to the spark hook.

        The config layer sets both ``CONFIG_1`` and ``CONFIG_2`` to
        ``"config_layer"``; after each run the hook must have been called with
        the conf expected for that task. ``_`` and ``__`` are unused
        (presumably mocks injected by decorators outside this view).
        """
        spark_layer = {
            SparkConfig.disable_sync: True,
            SparkConfig.disable_tracking_api: True,
            SparkConfig.conf: {CONFIG_1: "config_layer", CONFIG_2: "config_layer"},
        }
        # (task class, conf expected on the most recent hook call)
        scenarios = (
            (TaskA, {CONFIG_1: "TaskA", CONFIG_2: "config_layer"}),
            (TaskB, {CONFIG_1: "config_layer", CONFIG_2: "TaskB"}),
            (TaskC, {CONFIG_1: "TaskC", CONFIG_2: "config_layer"}),
        )
        with dbnd_config(spark_layer):
            for task_cls, merged_conf in scenarios:
                task_cls(text=__file__).dbnd_run()
                spark_submit_hook.assert_called_with(
                    conf=merged_conf, **self.spark_hook_params()
                )
    def test_pod_building(self):
        """Run ``pod_builder`` on ``gcp_k8s_engine`` and inspect the built pod.

        Checks the pod labels, the first container's resources and environment
        variables, and the pod namespace.
        """
        with dbnd_config(K8S_CONFIG):
            executed = pod_builder.dbnd_run(config_name="gcp_k8s_engine")
            pod = executed.run_executor.result.load("result")

        expected_labels = {
            "dbnd_task_family": "t.k.t.pod-builder",
            "dbnd_task_name": "pod-builder",
            "dbnd_task_af_id": "pod-builder",
        }
        assert is_sub_dict(pod.metadata.labels, expected_labels)

        main_container = pod.spec.containers[0]
        assert main_container.resources.limits == {"test_limits": 1}
        assert main_container.resources.requests == {"memory": "1536Mi", "cpu": "1"}

        # Flatten the env var objects into a plain name -> value mapping.
        env_by_name = {}
        for env_var in main_container.env:
            env_by_name[env_var.name] = env_var.value
        expected_env = {
            "DBND__POD_NAMESPACE": "test_namespace",
            "DBND__ENV_IMAGE": "gcr.io/dbnd-dev-260010/databand:dummy_tag",
            "DBND__GCP_K8S_ENGINE__IN_CLUSTER": "True",
            "AIRFLOW__KUBERNETES__IN_CLUSTER": "True",
        }
        assert is_sub_dict(env_by_name, expected_env)

        assert pod.metadata.namespace == "test_namespace"
    def test_req_building(self):
        """Run ``request_builder`` on ``gcp_k8s_engine`` and inspect the request.

        Checks the metadata labels and the first container's environment,
        resources and image.
        """
        with dbnd_config(K8S_CONFIG):
            executed = request_builder.dbnd_run(config_name="gcp_k8s_engine")
            req = executed.run_executor.result.load("result")

        expected_labels = {
            "dbnd_task_family": "t.k.t.request-builder",
            "dbnd_task_name": "request-builder",
            "dbnd_task_af_id": "request-builder",
        }
        assert is_sub_dict(req["metadata"]["labels"], expected_labels)

        container_spec = req["spec"]["containers"][0]

        # Only entries carrying a literal "value" are collected; entries
        # without that key are skipped.
        literal_env = {}
        for entry in container_spec["env"]:
            if "value" in entry:
                literal_env[entry["name"]] = entry["value"]
        expected_env = {
            "DBND__POD_NAMESPACE": "test_namespace",
            "DBND__ENV_IMAGE": "gcr.io/dbnd-dev-260010/databand:dummy_tag",
            "DBND__GCP_K8S_ENGINE__IN_CLUSTER": "True",
            "AIRFLOW__KUBERNETES__IN_CLUSTER": "True",
            "AIRFLOW__KUBERNETES__DAGS_IN_IMAGE": "True",
        }
        assert is_sub_dict(literal_env, expected_env)

        expected_resources = {
            "requests": {"memory": "1536Mi", "cpu": "1"},
            "limits": {"test_limits": 1},
        }
        assert container_spec["resources"] == expected_resources
        assert container_spec["image"] == "gcr.io/dbnd-dev-260010/databand:dummy_tag"
Ejemplo n.º 9
0
 def test_spark_conf_merge(self, _, __, spark_submit_hook, task, expected):
     """Run ``task`` and check the spark hook received ``expected`` as conf.

     Every other keyword argument of the hook call is matched with
     ``mock.ANY``. ``_`` and ``__`` are unused (presumably mocks injected by
     decorators outside this view).
     """
     # Hook keyword arguments whose values are not relevant to this test.
     ignored_hook_kwargs = (
         "application_args",
         "conn_id",
         "driver_class_path",
         "driver_memory",
         "env_vars",
         "exclude_packages",
         "executor_cores",
         "executor_memory",
         "files",
         "jars",
         "java_class",
         "keytab",
         "name",
         "num_executors",
         "packages",
         "principal",
         "py_files",
         "repositories",
         "total_executor_cores",
         "verbose",
     )
     any_kwargs = dict.fromkeys(ignored_hook_kwargs, mock.ANY)
     spark_layer = {
         SparkConfig.disable_sync: True,
         SparkConfig.disable_tracking_api: True,
         SparkConfig.conf: {CONFIG_1: "config_layer", CONFIG_2: "config_layer"},
     }
     with dbnd_config(spark_layer):
         task(text=__file__).dbnd_run()
         spark_submit_hook.assert_called_once_with(conf=expected, **any_kwargs)
Ejemplo n.º 10
0
 def test_serialization_runtime(self):
     """Run ``tf_pipeline`` in parallel mode with concurrent sqlite enabled."""
     # Serialization in runtime is achieved by running in parallel
     parallel_layer = {
         RunConfig.parallel: True,
         RunConfig.enable_concurent_sqlite: True,
     }
     with dbnd_config(parallel_layer):
         tf_pipeline.dbnd_run()
Ejemplo n.º 11
0
    def test_scope_and_defined_for_child_config(self):
        """Check which values a child task receives from a pipeline and config.

        The child built in ``TPipeline.band`` must end up with the pipeline's
        ``task_target_date`` and ``tdata``, while its ``tstr`` must equal the
        value set through config rather than the one passed to the pipeline.
        """

        @task
        def simple_task(tdata, tstr):
            pass

        class TPipeline(PipelineTask):
            tdata = data(scope=ParameterScope.children)
            tstr = parameter(scope=ParameterScope.children)[str]

            some_a = output

            def band(self):
                self.some_a = simple_task()

        want_target_date = date(2020, 1, 1)
        want_tstr = "teststr"
        want_tdata = __file__

        # tstr is "scoped" from TPipeline, but that only supplies a default,
        # so the value set in config is expected to win.
        config_layer = {simple_task.task.tstr: want_tstr}
        with dbnd_config(config_values=config_layer):
            pipeline = TPipeline(
                tdata=want_tdata,
                tstr="tpipeline_scope_value",
                task_target_date=want_target_date,
            )
        child_task = pipeline.some_a.task

        assert child_task.task_target_date == want_target_date
        assert str(child_task.tdata) == want_tdata
        assert child_task.tstr == want_tstr
    def test_build_task_with_task_band_through_config(self, task_band_file):
        """Check that a ``task_band`` set through config changes the result.

        Without extra config ``First(input_1=3)`` must yield ``3``; with
        ``task_band`` pointing at ``task_band_file`` the loaded result must
        be ``1``.
        """
        plain_run = First(input_1=3).dbnd_run()
        assert plain_run.run_executor.result.load("result") == 3

        band_layer = {"First": {"task_band": task_band_file.strpath}}
        with dbnd_config(band_layer):
            banded_run = First(input_1=3).dbnd_run()
        # the loaded result must be the value coming from the task_band
        assert banded_run.run_executor.result.load("result") == 1
Ejemplo n.º 13
0
 def test_prod_immutable_output_example(self):
     """Run ``ProductionIdsAndData`` in a production clone of the current env.

     Both fetch tasks are enabled in prod through config for the run.
     """
     prod_enabled = {
         FetchIds.task_enabled_in_prod: True,
         FetchData.task_enabled_in_prod: True,
     }
     with dbnd_config(prod_enabled):
         prod_env = get_databand_context().env.clone(production=True)
         assert_run_task(ProductionIdsAndData(task_env=prod_env))
Ejemplo n.º 14
0
 def test_wine_quality_deco_simple_all(self):
     """Run ``predict_wine_quality`` with ``fetch_data`` sent to ``local_prod``.

     ``local_prod`` is defined on the fly in config, derived from ``local``
     with a ``prod`` label and ``production`` switched on.
     """
     local_prod_env = {"_from": "local", "env_label": "prod", "production": True}
     with dbnd_config({"local_prod": local_prod_env}):
         env_override = {wine_quality.fetch_data.t.task_env: "local_prod"}
         predict_task = wine_quality.predict_wine_quality.t(
             alpha=0.5, override=env_override
         )
         assert_run_task(predict_task)
Ejemplo n.º 15
0
 def test_luigi_sanity_foo(self, simple_foo):
     """Build the given ``simple_foo`` task through dbnd and expect success.

     A ``/tmp/bar`` directory left over from an earlier run is removed first.
     """
     try:
         shutil.rmtree("/tmp/bar")
     except FileNotFoundError:
         # Nothing to clean up: the directory does not exist yet.
         pass
     tracker_layer = {CoreConfig.tracker: ["file", "console"]}
     with dbnd_config(tracker_layer):
         build_result = dbnd_luigi_build(tasks=[simple_foo])
     assert build_result.status == LuigiStatusCode.SUCCESS
Ejemplo n.º 16
0
 def test_luigi_build_exception(self, top10_artists_run_error):
     """Build a failing task and check handler callbacks and final status.

     With the tracking handler patched, the build must report one failure,
     three successes, three discovered dependencies and four run starts,
     and finish with ``FAILED``.
     """
     expected_call_counts = (
         ("on_failure", 1),
         ("on_success", 3),
         ("on_dependency_discovered", 3),
         ("on_run_start", 4),
     )
     with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
         with mock.patch("dbnd_luigi.luigi_tracking.handler") as handler:
             result = dbnd_luigi_build(tasks=[top10_artists_run_error])
             for callback_name, times in expected_call_counts:
                 assert getattr(handler, callback_name).call_count == times
             assert result.status == LuigiStatusCode.FAILED
Ejemplo n.º 17
0
 def test_luigi_wrapper_task_run_fail(self, wrapper_task_run_fail):
     """Build a failing wrapper task and check handler callbacks and status.

     The task output is deleted first. With the tracking handler patched, the
     build must report one failure, one success, one discovered dependency
     and two run starts, and finish with ``FAILED``.
     """
     delete_task_output(wrapper_task_run_fail)
     expected_call_counts = (
         ("on_failure", 1),
         ("on_success", 1),
         ("on_dependency_discovered", 1),
         ("on_run_start", 2),
     )
     with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
         with mock.patch("dbnd_luigi.luigi_tracking.handler") as handler:
             result = dbnd_luigi_build(tasks=[wrapper_task_run_fail])
             for callback_name, times in expected_call_counts:
                 assert getattr(handler, callback_name).call_count == times
             assert result.status == LuigiStatusCode.FAILED
Ejemplo n.º 18
0
 def test_luigi_sanity_input_target_tracking(self, top10_artists):
     """Check the wrapped task's input target matches the luigi input.

     The value of the first task param whose name contains ``artist_streams``
     must have a path that contains the path of ``top10_artists.input()``.
     """
     with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
         dbnd_task = wrap_luigi_task(top10_artists)
         assert dbnd_task
         stream_params = [
             p for p in dbnd_task.task_meta.task_params if "artist_streams" in p.name
         ]
         dbnd_input_target = stream_params[0].value
         assert dbnd_input_target
         luigi_target = top10_artists.input()
         assert luigi_target
         assert luigi_target.path in dbnd_input_target.path
Ejemplo n.º 19
0
    def test_spark_inline(self):
        """Run the inline word-count spark task on this file's own text.

        ``PYTHONPATH`` for spark is set to the directory two levels above the
        resolved location of this file.
        """
        from dbnd_test_scenarios.spark.spark_tasks_inline import word_count_inline

        # Solve "tests" module conflict on pickle loading after spark-submit
        this_dir = os.path.dirname(os.path.realpath(__file__))
        parent_directory = os.path.dirname(this_dir)
        spark_env = {"PYTHONPATH": parent_directory}
        with dbnd_config({SparkConfig.env_vars: spark_env}):
            assert_run_task(word_count_inline.t(text=__file__))
Ejemplo n.º 20
0
 def test_spark_conf_merge(self, _, __, spark_submit_hook, task, expected):
     """Run ``task`` and check the spark hook received ``expected`` as conf.

     The remaining hook arguments come from ``self.spark_hook_params()``.
     ``_`` and ``__`` are unused (presumably mocks injected by decorators
     outside this view).
     """
     layer_conf = {CONFIG_1: "config_layer", CONFIG_2: "config_layer"}
     spark_layer = {
         SparkConfig.disable_sync: True,
         SparkConfig.disable_tracking_api: True,
         SparkConfig.conf: layer_conf,
     }
     with dbnd_config(spark_layer):
         task(text=__file__).dbnd_run()
         hook_kwargs = self.spark_hook_params()
         spark_submit_hook.assert_called_once_with(conf=expected, **hook_kwargs)
Ejemplo n.º 21
0
 def test_custom_partition_from_config(self):
     """Run ``TTask`` with a custom output path format set through config.

     The resulting ``t_output`` path must contain the ``_custom`` suffix
     that the format adds after the task family.
     """
     custom_path_format = (
         "{root}/{env_label}/{task_family}{task_class_version}_custom/"
         "{output_name}{output_ext}/date={task_target_date}"
     )
     task_section = {"task_output_path_format": custom_path_format}
     with dbnd_config(config_values={"task": task_section}):
         task = TTask()
         assert_run_task(task)
         assert "TTask_custom/t_output.csv/" in str(task.t_output)
Ejemplo n.º 22
0
    def band(self):
        """Wire ``PartitionedDataTask`` into ``PartitionedDataReader``.

        Both tasks are created while a custom ``task_output_path_format`` is
        active; the reader's ``concat`` is stored on ``self.concat``.
        """
        # This is a way to override every output of underneath tasks with
        # custom output location (see "_custom")
        custom_path_format = (
            "{root}/{env_label}/{task_family}{task_class_version}_custom/"
            "{output_name}{output_ext}/date={task_target_date}"
        )
        task_section = {"task_output_path_format": custom_path_format}
        with dbnd.dbnd_config(config_values={"task": task_section}):
            partitions = PartitionedDataTask().partitioned_data
            reader = PartitionedDataReader(partitioned_data=partitions)
            self.concat = reader.concat
Ejemplo n.º 23
0
    def test_spark_complete(self, monkeypatch):
        """Check ``_complete()`` is ``False`` when a dir output does not exist.

        The task's ``_get_dir_outputs`` is replaced so it returns one mocked
        ``DirTarget`` whose ``exists()`` returns ``False``.
        """
        from dbnd_test_scenarios.spark.spark_tasks_inline import word_count_inline

        # Solve "tests" module conflict on pickle loading after spark-submit
        this_file = os.path.realpath(__file__)
        parent_directory = os.path.dirname(os.path.dirname(this_file))
        spark_env = {"PYTHONPATH": parent_directory}
        with dbnd_config({SparkConfig.env_vars: spark_env}):
            word_count = word_count_inline.t(text=__file__)
            missing_dir = mock.MagicMock(DirTarget)
            missing_dir.exists = mock.Mock(return_value=False)
            monkeypatch.setattr(
                word_count, "_get_dir_outputs", lambda: [missing_dir]
            )
            assert word_count._complete() is False
Ejemplo n.º 24
0
 def test_multiple_output_tracking(self, task_b):
     """Check that the wrapped ``task_b`` tracks both params as outputs.

     The wrapped task must expose three task outputs in total, and
     ``output1`` and ``output2`` must have a ``task_output`` kind.
     """
     output_names = ("output1", "output2")
     with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
         dbnd_task = wrap_luigi_task(task_b)
         assert dbnd_task
         assert len(dbnd_task.task_outputs) == 3
         for attr_name in output_names:
             assert getattr(dbnd_task, attr_name)
         # Resolve every parameter first, then check the kinds.
         tracked_params = [
             [p for p in dbnd_task.task_meta.task_params if p.name == wanted][0]
             for wanted in output_names
         ]
         for tracked in tracked_params:
             assert tracked.parameter.kind == _ParameterKind.task_output
Ejemplo n.º 25
0
 def test_luigi_sanity_output_target_tracking(self, top10_artists):
     """Check the wrapped task's output target matches the luigi output.

     The wrapped task must have exactly two outputs; the first one not named
     ``task_band`` must have a path containing the luigi output path.
     """
     with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
         dbnd_task = wrap_luigi_task(top10_artists)
         assert dbnd_task
         assert dbnd_task.task_outputs
         # Two outputs expected. The original note said "'result' is our
         # added output target", while the filter below skips "task_band".
         # NOTE(review): confirm which name dbnd adds.
         assert len(dbnd_task.task_outputs) == 2
         non_band_outputs = [
             target
             for name, target in dbnd_task.task_outputs.items()
             if name != "task_band"
         ]
         dbnd_output = non_band_outputs[0]
         assert dbnd_output
         luigi_output = top10_artists.output()
         assert luigi_output
         # Assert we preserve filename and directory tree format
         assert luigi_output.path in dbnd_output.path
    def test_custom_yaml(self):
        """Build a request from a custom pod yaml and check its ``dnsPolicy``.

        ``pod_yaml`` points at ``custom_pod.yaml`` next to this file; the
        resulting request spec must have ``ClusterFirstWithHostNet`` as its
        DNS policy.
        """
        kubernetes_section = {
            "pod_yaml": relative_path(__file__, "custom_pod.yaml"),
            "container_tag": "dummy_tag",
            "namespace": "test_namespace",
        }
        with dbnd_config({"kubernetes": kubernetes_section}):
            executed = request_builder.dbnd_run(config_name="gcp_k8s_engine")
            req = executed.run_executor.result.load("result")

        assert req["spec"]["dnsPolicy"] == "ClusterFirstWithHostNet"
Ejemplo n.º 27
0
 def test_luigi_run_exception(self):
     """Run a failing luigi task from the command line and check tracking.

     ``dbnd_luigi_run`` is called with ``sys.argv`` describing a
     ``Top10ArtistsRunException`` run. With the tracking handler patched, the
     run must report one failure, two successes, two discovered dependencies
     and three run starts, and finish with ``FAILED``.
     """
     luigi_argv = [
         "luigi",
         "Top10ArtistsRunException",
         "--Top10ArtistsRunException-date-interval",
         "2020-05-02",
         "--local-scheduler",
         "--module",
         "tests.luigi_examples.top_artists",
     ]
     # Patch sys.argv instead of assigning to it, so the real command line is
     # restored when the block exits and does not leak into other tests.
     with mock.patch.object(sys, "argv", luigi_argv):
         with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
             with mock.patch("dbnd_luigi.luigi_tracking.handler") as handler:
                 result = dbnd_luigi_run()
                 assert handler.on_failure.call_count == 1
                 assert handler.on_success.call_count == 2
                 assert handler.on_dependency_discovered.call_count == 2
                 assert handler.on_run_start.call_count == 3
                 assert result.status == LuigiStatusCode.FAILED
Ejemplo n.º 28
0
 def test_auto_load(self):
     """Run ``task_auto_config`` from the command line with autoload config.

     The config supplies an ``autotestconfig`` section, registers it under
     ``core.user_configs`` together with a ``user_init`` hook, and sets the
     ``databand.module`` to load.
     """
     autotest_section = {"param_datetime": "2018-01-01", "param_int": "42"}
     core_section = {
         "user_configs": "autotestconfig",
         "user_init": "test_dbnd.orchestration.config.autoloaded_config.user_code_load_config",
     }
     databand_section = {
         "module": "test_dbnd.orchestration.config.autoloaded_config"
     }
     with dbnd_config(
         {
             "autotestconfig": autotest_section,
             "core": core_section,
             "databand": databand_section,
         }
     ):
         dbnd_run_cmd("task_auto_config")
Ejemplo n.º 29
0
    def test_decorated_report_params(
        self,
        log_value_preview,
        expected_input_param,
        expected_inputs_args,
        expected_inputs_kwargs,
        expected_result_value_preview,
        mock_channel_tracker,
    ):
        """Check the params and result target reported for a decorated task.

        Runs ``my_task`` with ``tracking.log_value_preview`` set to
        ``log_value_preview``, then compares what was reported through
        ``mock_channel_tracker`` with the expected values passed in.
        """

        @task()
        def my_task(a, *args, **kwargs):
            return 6

        # run the task under the requested preview setting
        tracking_layer = {"tracking": {"log_value_preview": log_value_preview}}
        with dbnd_config(config_values=tracking_layer):
            my_task("a", 1, 2, 3, 4, 5, 5, b=20, others=123)

        # Fetch the parameters reported to the tracker. Every reported value
        # needs a matching definition, otherwise the webserver would not have
        # all the information it needs.
        reported = get_reported_params(mock_channel_tracker, "my_task")
        param_definitions, run_time_params = reported
        assert set(param_definitions) == set(run_time_params)

        # the right parameter values must have been reported
        expected_values = (
            ("args", expected_inputs_args),
            ("kwargs", expected_inputs_kwargs),
            ("a", expected_input_param),
        )
        for param_name, wanted in expected_values:
            assert run_time_params[param_name].value == wanted

        # the result target must be reported correctly
        result_target_info = get_task_target_result(mock_channel_tracker, "my_task")
        expected_target_path = (
            result_target_info.target_path if log_value_preview else "***"
        )

        assert run_time_params[RESULT_PARAM].value == expected_target_path
        assert result_target_info.value_preview == expected_result_value_preview
Ejemplo n.º 30
0
    def test_build_separate_wheels(self):
        """Build package and third-party zips; check content and caching.

        ``build_wheel_zips`` must return two archives: one holding the test
        package and its dist-info metadata, the other holding ``six.py`` and
        ``luigi/task.py``. A second call must return the same paths in the
        same order.
        """
        with dbnd_config(
            {
                "bdist_zip": {
                    "package_dir": test_scenario_path("dbnd-test-package"),
                    "requirements_file": test_scenario_path(
                        "dbnd-test-package/requirements.txt"
                    ),
                }
            }
        ):
            zip_files = build_wheel_zips()
            assert len(zip_files) == 2

            # The two archives may come back in either order: identify the
            # third-party one by its file name suffix.
            package_zip = (
                zip_files[0]
                if zip_files[1].endswith("third-party-deps.zip")
                else zip_files[1]
            )
            third_party_zip = (
                zip_files[1] if zip_files[0] == package_zip else zip_files[0]
            )

            assert os.path.exists(package_zip)
            assert os.path.exists(third_party_zip)

            # Open each archive in a ``with`` block so its file handle is
            # closed, and list members through the public ``namelist()`` API
            # rather than the undocumented ``NameToInfo`` attribute.
            with zipfile.ZipFile(file=package_zip, mode="r") as package_zip_file:
                all_package_files = set(package_zip_file.namelist())

            assert "dbnd_test_package/my_lib.py" in all_package_files
            assert "dbnd_test_package-0.1.dist-info/METADATA" in all_package_files

            with zipfile.ZipFile(file=third_party_zip, mode="r") as third_party_file:
                all_third_party_files = set(third_party_file.namelist())

            assert "six.py" in all_third_party_files
            assert "luigi/task.py" in all_third_party_files

            # check cache
            new_zip_files = build_wheel_zips()
            assert zip_files[0] == new_zip_files[0]
            assert zip_files[1] == new_zip_files[1]