def test_luigi_sanity_complex_foo(self, complex_foo):
    """Build ``complex_foo`` via ``dbnd_luigi_build`` and expect SUCCESS.

    The build runs with the databand URL set to ``http://localhost:8080``
    and the ``file`` and ``console`` trackers configured.
    """
    with dbnd_config(
        {CoreConfig.databand_url: "http://localhost:8080"}
    ), dbnd_config({CoreConfig.tracker: ["file", "console"]}):
        # Remove /tmp/bar if present; a missing directory is not an error.
        # NOTE(review): presumably stale task output from an earlier run - confirm.
        try:
            shutil.rmtree("/tmp/bar")
        except FileNotFoundError:
            pass

        build_result = dbnd_luigi_build(tasks=[complex_foo])
        assert build_result.status == LuigiStatusCode.SUCCESS
def test_build_fat_wheel(self):
    """Build the fat wheel for ``dbnd-test-package`` and inspect its contents.

    Checks that the produced archive exists, that it contains both the
    package's own files and files from its requirements, and that a second
    call to ``build_fat_wheel`` returns the same path.
    """
    bdist_settings = {
        "bdist_zip": {
            "package_dir": test_scenario_path("dbnd-test-package"),
            "requirements_file": test_scenario_path(
                "dbnd-test-package/requirements.txt"
            ),
        }
    }
    with dbnd_config(bdist_settings):
        bdist_file = build_fat_wheel()
        assert os.path.exists(bdist_file)

        # Read the member names through the public API and close the archive
        # right away so no file handle outlives the check.
        with zipfile.ZipFile(file=bdist_file, mode="r") as fat_zip:
            all_files = set(fat_zip.namelist())

        assert "six.py" in all_files
        assert "dbnd_test_package/my_lib.py" in all_files
        assert "dbnd_test_package-0.1.dist-info/METADATA" in all_files
        assert "luigi/task.py" in all_files

        # check cache
        new_bdist_file = build_fat_wheel()
        assert bdist_file == new_bdist_file
def test__by_day_simple_local(self):
    """Run ``ByDayExamplePipeline`` for a ``2d`` period under local overrides."""
    overrides = {
        ProductionIdsAndData.task_env: "local",
        FetchIds.task_enabled_in_prod: True,
        FetchData.task_enabled_in_prod: True,
    }
    with dbnd_config(overrides):
        pipeline = ByDayExamplePipeline(period="2d")
        assert_run_task(pipeline)
def test_submit_driver_req(self, mock_client):
    """Run ``dbnd_sanity_check`` on ``gcp_k8s`` and inspect the submitted pod.

    Exactly one ``create_namespaced_pod`` call is expected on the mocked
    client; its request body is checked for labels and container resources.
    """
    command = [
        "dbnd_sanity_check",
        "--env=gcp_k8s",
        "--set-config",
        "kubernetes.container_tag=tag",
    ]
    with dbnd_config(K8S_CONFIG):
        dbnd_run_cmd(command)

        calls = mock_client().create_namespaced_pod.call_args_list
        assert len(calls) == 1
        request_body = calls[0].kwargs["body"]

        # 1) test - default labels
        expected_labels = {
            "dbnd_task_family": "d.d.d.docker-run-task",
            "dbnd_task_name": "dbnd-driver-run",
            "dbnd_task_af_id": "dbnd-driver-run",
            "dbnd": "dbnd-system-task-run",
        }
        assert is_sub_dict(request_body["metadata"]["labels"], expected_labels)

        # 2) test - running the driver with the global resources
        driver_container = request_body["spec"]["containers"][0]
        assert driver_container["resources"] == {
            "limits": {"test_limits": 1},
            "requests": {"memory": "1536Mi", "cpu": "1"},
        }
def test_multiple_input_tracking(self, task_c):
    """Wrap ``task_c`` and check its four parameters are tracked as inputs."""
    # Output1 and 2 are actually inputs from TaskB, just badly named
    input_names = ("output1", "output10", "output2", "output20")

    with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
        dbnd_task = wrap_luigi_task(task_c)
        assert dbnd_task

        for name in input_names:
            assert getattr(dbnd_task, name)

        def first_param_named(name):
            # First task param whose name matches exactly; IndexError if none.
            return [p for p in dbnd_task.task_meta.task_params if p.name == name][0]

        tracked_params = [first_param_named(name) for name in input_names]
        for tracked in tracked_params:
            assert tracked.parameter.kind == _ParameterKind.task_input
def test_spark_conf_merge_not_overlap(self, _, __, spark_submit_hook):
    """Run TaskA, TaskB and TaskC and check the conf passed to the spark hook.

    Each task is run in turn under a config layer that sets both
    ``CONFIG_1`` and ``CONFIG_2`` to ``"config_layer"``; after each run the
    hook's most recent call is compared with the expected ``conf``.
    """
    config_layer = {
        SparkConfig.disable_sync: True,
        SparkConfig.disable_tracking_api: True,
        SparkConfig.conf: {CONFIG_1: "config_layer", CONFIG_2: "config_layer"},
    }
    cases = (
        (TaskA, {CONFIG_1: "TaskA", CONFIG_2: "config_layer"}),
        (TaskB, {CONFIG_1: "config_layer", CONFIG_2: "TaskB"}),
        (TaskC, {CONFIG_1: "TaskC", CONFIG_2: "config_layer"}),
    )
    with dbnd_config(config_layer):
        for task_cls, expected_conf in cases:
            task_cls(text=__file__).dbnd_run()
            spark_submit_hook.assert_called_with(
                conf=expected_conf, **self.spark_hook_params()
            )
def test_pod_building(self):
    """Run ``pod_builder`` on ``gcp_k8s_engine`` and inspect the resulting pod.

    Checks the pod labels, the first container's resources and environment
    variables, and the pod namespace.
    """
    with dbnd_config(K8S_CONFIG):
        run = pod_builder.dbnd_run(config_name="gcp_k8s_engine")
        pod = run.run_executor.result.load("result")

        expected_labels = {
            "dbnd_task_family": "t.k.t.pod-builder",
            "dbnd_task_name": "pod-builder",
            "dbnd_task_af_id": "pod-builder",
        }
        assert is_sub_dict(pod.metadata.labels, expected_labels)

        container = pod.spec.containers[0]
        resources = container.resources
        assert resources.limits == {"test_limits": 1}
        assert resources.requests == {"memory": "1536Mi", "cpu": "1"}

        # Flatten the env var objects into a plain name -> value mapping.
        raw_env = {}
        for env_var in container.env:
            raw_env[env_var.name] = env_var.value

        expected_env = {
            "DBND__POD_NAMESPACE": "test_namespace",
            "DBND__ENV_IMAGE": "gcr.io/dbnd-dev-260010/databand:dummy_tag",
            "DBND__GCP_K8S_ENGINE__IN_CLUSTER": "True",
            "AIRFLOW__KUBERNETES__IN_CLUSTER": "True",
        }
        assert is_sub_dict(raw_env, expected_env)
        assert pod.metadata.namespace == "test_namespace"
def test_req_building(self):
    """Run ``request_builder`` on ``gcp_k8s_engine`` and inspect the request.

    Checks the metadata labels plus the first container's environment,
    resources and image.
    """
    with dbnd_config(K8S_CONFIG):
        run = request_builder.dbnd_run(config_name="gcp_k8s_engine")
        req = run.run_executor.result.load("result")

        expected_labels = {
            "dbnd_task_family": "t.k.t.request-builder",
            "dbnd_task_name": "request-builder",
            "dbnd_task_af_id": "request-builder",
        }
        assert is_sub_dict(req["metadata"]["labels"], expected_labels)

        container_spec = req["spec"]["containers"][0]

        # Only entries that carry a literal "value" key are compared.
        container_spec_env = {}
        for entry in container_spec["env"]:
            if "value" in entry:
                container_spec_env[entry["name"]] = entry["value"]

        expected_env = {
            "DBND__POD_NAMESPACE": "test_namespace",
            "DBND__ENV_IMAGE": "gcr.io/dbnd-dev-260010/databand:dummy_tag",
            "DBND__GCP_K8S_ENGINE__IN_CLUSTER": "True",
            "AIRFLOW__KUBERNETES__IN_CLUSTER": "True",
            "AIRFLOW__KUBERNETES__DAGS_IN_IMAGE": "True",
        }
        assert is_sub_dict(container_spec_env, expected_env)

        assert container_spec["resources"] == {
            "requests": {"memory": "1536Mi", "cpu": "1"},
            "limits": {"test_limits": 1},
        }
        assert (
            container_spec["image"] == "gcr.io/dbnd-dev-260010/databand:dummy_tag"
        )
def test_spark_conf_merge(self, _, __, spark_submit_hook, task, expected):
    """Run ``task`` and check the spark hook was called once with ``expected`` conf.

    Every other keyword argument of the hook call is matched with
    ``mock.ANY``, so only ``conf`` is actually compared.
    """
    config_layer = {
        SparkConfig.disable_sync: True,
        SparkConfig.disable_tracking_api: True,
        SparkConfig.conf: {CONFIG_1: "config_layer", CONFIG_2: "config_layer"},
    }
    ignored_kwargs = dict.fromkeys(
        (
            "application_args",
            "conn_id",
            "driver_class_path",
            "driver_memory",
            "env_vars",
            "exclude_packages",
            "executor_cores",
            "executor_memory",
            "files",
            "jars",
            "java_class",
            "keytab",
            "name",
            "num_executors",
            "packages",
            "principal",
            "py_files",
            "repositories",
            "total_executor_cores",
            "verbose",
        ),
        mock.ANY,
    )
    with dbnd_config(config_layer):
        task(text=__file__).dbnd_run()
        spark_submit_hook.assert_called_once_with(conf=expected, **ignored_kwargs)
def test_serialization_runtime(self):
    """Run ``tf_pipeline`` with the parallel run configuration enabled."""
    # Serialization in runtime is achieved by running in parallel
    parallel_settings = {
        RunConfig.parallel: True,
        RunConfig.enable_concurent_sqlite: True,
    }
    with dbnd_config(parallel_settings):
        tf_pipeline.dbnd_run()
def test_scope_and_defined_for_child_config(self):
    """Check which values a child task receives from a children-scoped pipeline.

    ``TPipeline`` declares ``tdata`` and ``tstr`` with
    ``ParameterScope.children`` while ``tstr`` of ``simple_task`` is also
    set through config; the child is expected to end up with the config
    value for ``tstr`` and the pipeline's values for ``tdata`` and
    ``task_target_date``.
    """

    @task
    def simple_task(tdata, tstr):
        pass

    class TPipeline(PipelineTask):
        tdata = data(scope=ParameterScope.children)
        tstr = parameter(scope=ParameterScope.children)[str]
        some_a = output

        def band(self):
            self.some_a = simple_task()

    wanted_tdata = __file__
    wanted_tstr = "teststr"
    wanted_target_date = date(2020, 1, 1)

    # tstr is "scoped" from Tpipeline, however, that's default
    # so the value should come from config
    child_config = {simple_task.task.tstr: wanted_tstr}
    with dbnd_config(config_values=child_config):
        pipeline = TPipeline(
            tdata=wanted_tdata,
            tstr="tpipeline_scope_value",
            task_target_date=wanted_target_date,
        )
        child = pipeline.some_a.task
        assert child.task_target_date == wanted_target_date
        assert str(child.tdata) == wanted_tdata
        assert child.tstr == wanted_tstr
def test_build_task_with_task_band_through_config(self, task_band_file):
    """Run ``First`` without and then with a ``task_band`` set through config.

    The plain run is expected to yield ``3``; with ``task_band`` pointing at
    ``task_band_file`` the loaded result is expected to be ``1``.
    """
    plain_run = First(input_1=3).dbnd_run()
    assert plain_run.run_executor.result.load("result") == 3

    band_settings = {"First": {"task_band": task_band_file.strpath}}
    with dbnd_config(band_settings):
        banded_run = First(input_1=3).dbnd_run()
        # accessing the result and check that the used value is the one from the task_band
        assert banded_run.run_executor.result.load("result") == 1
def test_prod_immutable_output_example(self):
    """Run ``ProductionIdsAndData`` against a production clone of the current env."""
    enabled_in_prod = {
        FetchIds.task_enabled_in_prod: True,
        FetchData.task_enabled_in_prod: True,
    }
    with dbnd_config(enabled_in_prod):
        # Clone the active environment with the production flag switched on.
        prod_env = get_databand_context().env.clone(production=True)
        task = ProductionIdsAndData(task_env=prod_env)
        assert_run_task(task)
def test_wine_quality_deco_simple_all(self):
    """Run ``predict_wine_quality`` with ``fetch_data`` overridden to ``local_prod``.

    The ``local_prod`` section is declared through config, derived from
    ``local`` with the ``prod`` label and ``production`` set.
    """
    env_sections = {
        "local_prod": {"_from": "local", "env_label": "prod", "production": True}
    }
    with dbnd_config(env_sections):
        fetch_env_override = {wine_quality.fetch_data.t.task_env: "local_prod"}
        task = wine_quality.predict_wine_quality.t(
            alpha=0.5, override=fetch_env_override
        )
        assert_run_task(task)
def test_luigi_sanity_foo(self, simple_foo):
    """Build ``simple_foo`` via ``dbnd_luigi_build`` and expect SUCCESS."""
    # Remove /tmp/bar if present; a missing directory is not an error.
    try:
        shutil.rmtree("/tmp/bar")
    except FileNotFoundError:
        pass

    tracker_settings = {CoreConfig.tracker: ["file", "console"]}
    with dbnd_config(tracker_settings):
        build_result = dbnd_luigi_build(tasks=[simple_foo])
        assert build_result.status == LuigiStatusCode.SUCCESS
def test_luigi_build_exception(self, top10_artists_run_error):
    """Build a failing task and check handler callback counts and FAILED status.

    ``dbnd_luigi.luigi_tracking.handler`` is replaced by a mock so the number
    of calls to each callback can be counted.
    """
    with dbnd_config({CoreConfig.tracker: ["file", "console"]}), mock.patch(
        "dbnd_luigi.luigi_tracking.handler"
    ) as handler:
        build_result = dbnd_luigi_build(tasks=[top10_artists_run_error])

        assert handler.on_failure.call_count == 1
        assert handler.on_success.call_count == 3
        assert handler.on_dependency_discovered.call_count == 3
        assert handler.on_run_start.call_count == 4
        assert build_result.status == LuigiStatusCode.FAILED
def test_luigi_wrapper_task_run_fail(self, wrapper_task_run_fail):
    """Build a failing wrapper task and check callback counts and FAILED status.

    The task's output is deleted first, and
    ``dbnd_luigi.luigi_tracking.handler`` is mocked so its callbacks can be
    counted.
    """
    delete_task_output(wrapper_task_run_fail)

    with dbnd_config({CoreConfig.tracker: ["file", "console"]}), mock.patch(
        "dbnd_luigi.luigi_tracking.handler"
    ) as handler:
        build_result = dbnd_luigi_build(tasks=[wrapper_task_run_fail])

        assert handler.on_failure.call_count == 1
        assert handler.on_success.call_count == 1
        assert handler.on_dependency_discovered.call_count == 1
        assert handler.on_run_start.call_count == 2
        assert build_result.status == LuigiStatusCode.FAILED
def test_luigi_sanity_input_target_tracking(self, top10_artists):
    """Check the wrapped task's ``artist_streams`` input matches luigi's input path."""
    with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
        dbnd_task = wrap_luigi_task(top10_artists)
        assert dbnd_task

        # First task param whose name mentions "artist_streams".
        stream_params = [
            param
            for param in dbnd_task.task_meta.task_params
            if "artist_streams" in param.name
        ]
        dbnd_input_target = stream_params[0].value
        assert dbnd_input_target

        luigi_target = top10_artists.input()
        assert luigi_target
        assert luigi_target.path in dbnd_input_target.path
def test_spark_inline(self):
    """Run the inline word-count spark task with this file as its ``text`` input."""
    from dbnd_test_scenarios.spark.spark_tasks_inline import word_count_inline

    # Solve "tests" module conflict on pickle loading after spark-submit
    this_file = os.path.realpath(__file__)
    parent_directory = os.path.dirname(os.path.dirname(this_file))

    spark_env = {SparkConfig.env_vars: {"PYTHONPATH": parent_directory}}
    with dbnd_config(spark_env):
        assert_run_task(word_count_inline.t(text=__file__))
def test_spark_conf_merge(self, _, __, spark_submit_hook, task, expected):
    """Run ``task`` and check the spark hook was called once with ``expected`` conf.

    The remaining hook keyword arguments come from ``self.spark_hook_params()``.
    """
    config_layer = {
        SparkConfig.disable_sync: True,
        SparkConfig.disable_tracking_api: True,
        SparkConfig.conf: {CONFIG_1: "config_layer", CONFIG_2: "config_layer"},
    }
    with dbnd_config(config_layer):
        task(text=__file__).dbnd_run()
        spark_submit_hook.assert_called_once_with(
            conf=expected, **self.spark_hook_params()
        )
def test_custom_partition_from_config(self):
    """Run ``TTask`` with a custom output path format and check its output path."""
    custom_path_format = (
        "{root}/{env_label}/{task_family}{task_class_version}_custom/"
        "{output_name}{output_ext}/date={task_target_date}"
    )
    task_section = {"task": {"task_output_path_format": custom_path_format}}

    with dbnd_config(config_values=task_section):
        task = TTask()
        assert_run_task(task)
        assert "TTask_custom/t_output.csv/" in str(task.t_output)
def band(self):
    """Wire ``PartitionedDataTask`` into ``PartitionedDataReader`` and expose ``concat``.

    Both tasks are created while a custom ``task_output_path_format`` is
    active.
    """
    # This is a way to override every output of underneath tasks with custom output location (see "_custom")
    custom_path_format = (
        "{root}/{env_label}/{task_family}{task_class_version}_custom/"
        "{output_name}{output_ext}/date={task_target_date}"
    )
    task_section = {"task": {"task_output_path_format": custom_path_format}}

    with dbnd.dbnd_config(config_values=task_section):
        partitioned_data = PartitionedDataTask().partitioned_data
        reader = PartitionedDataReader(partitioned_data=partitioned_data)
        self.concat = reader.concat
def test_spark_complete(self, monkeypatch):
    """Check ``_complete()`` is ``False`` when the task's directory output is missing.

    ``_get_dir_outputs`` is patched to return a single mocked ``DirTarget``
    whose ``exists()`` returns ``False``.
    """
    from dbnd_test_scenarios.spark.spark_tasks_inline import word_count_inline

    # Solve "tests" module conflict on pickle loading after spark-submit
    this_file = os.path.realpath(__file__)
    parent_directory = os.path.dirname(os.path.dirname(this_file))

    spark_env = {SparkConfig.env_vars: {"PYTHONPATH": parent_directory}}
    with dbnd_config(spark_env):
        t = word_count_inline.t(text=__file__)

        missing_dir = mock.MagicMock(DirTarget)
        missing_dir.exists = mock.Mock(return_value=False)

        def fake_dir_outputs():
            return [missing_dir]

        monkeypatch.setattr(t, "_get_dir_outputs", fake_dir_outputs)
        assert t._complete() is False
def test_multiple_output_tracking(self, task_b):
    """Wrap ``task_b`` and check its two outputs are tracked as task outputs.

    Three entries are expected in ``task_outputs`` overall.
    """
    output_names = ("output1", "output2")

    with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
        dbnd_task = wrap_luigi_task(task_b)
        assert dbnd_task
        assert len(dbnd_task.task_outputs) == 3

        for name in output_names:
            assert getattr(dbnd_task, name)

        def first_param_named(name):
            # First task param whose name matches exactly; IndexError if none.
            return [p for p in dbnd_task.task_meta.task_params if p.name == name][0]

        tracked_params = [first_param_named(name) for name in output_names]
        for tracked in tracked_params:
            assert tracked.parameter.kind == _ParameterKind.task_output
def test_luigi_sanity_output_target_tracking(self, top10_artists):
    """Check the wrapped task's output path contains the luigi output path."""
    with dbnd_config({CoreConfig.tracker: ["file", "console"]}):
        dbnd_task = wrap_luigi_task(top10_artists)
        assert dbnd_task

        outputs = dbnd_task.task_outputs
        assert outputs
        # 'result' is our added output target
        assert len(dbnd_task.task_outputs) == 2

        # First output other than the "task_band" entry.
        non_band_outputs = [
            target for key, target in outputs.items() if key != "task_band"
        ]
        dbnd_output = non_band_outputs[0]
        assert dbnd_output

        luigi_output = top10_artists.output()
        assert luigi_output
        # Assert we preserve filename and directory tree format
        assert luigi_output.path in dbnd_output.path
def test_custom_yaml(self):
    """Build a request from ``custom_pod.yaml`` and check its ``dnsPolicy``."""
    kubernetes_section = {
        "kubernetes": {
            "pod_yaml": relative_path(__file__, "custom_pod.yaml"),
            "container_tag": "dummy_tag",
            "namespace": "test_namespace",
        }
    }
    with dbnd_config(kubernetes_section):
        run = request_builder.dbnd_run(config_name="gcp_k8s_engine")
        request = run.run_executor.result.load("result")
        pod_spec = request["spec"]
        assert pod_spec["dnsPolicy"] == "ClusterFirstWithHostNet"
def test_luigi_run_exception(self):
    """Run the failing ``Top10ArtistsRunException`` task through ``dbnd_luigi_run``.

    The luigi command line is supplied through ``sys.argv``, which is patched
    only for the duration of the run so the fake arguments do not leak into
    other tests. The tracking handler is mocked to count its callbacks, and
    the run is expected to end with ``LuigiStatusCode.FAILED``.
    """
    luigi_argv = [
        "luigi",
        "Top10ArtistsRunException",
        "--Top10ArtistsRunException-date-interval",
        "2020-05-02",
        "--local-scheduler",
        "--module",
        str("tests.luigi_examples.top_artists"),
    ]
    # patch.object restores the original sys.argv on exit, even on failure.
    with mock.patch.object(sys, "argv", luigi_argv):
        with dbnd_config({CoreConfig.tracker: ["file", "console"]}), mock.patch(
            "dbnd_luigi.luigi_tracking.handler"
        ) as handler:
            result = dbnd_luigi_run()

            assert handler.on_failure.call_count == 1
            assert handler.on_success.call_count == 2
            assert handler.on_dependency_discovered.call_count == 2
            assert handler.on_run_start.call_count == 3
            assert result.status == LuigiStatusCode.FAILED
def test_auto_load(self):
    """Run ``task_auto_config`` with the autoloaded user config wired in."""
    autoload_module = "test_dbnd.orchestration.config.autoloaded_config"
    settings = {
        "autotestconfig": {"param_datetime": "2018-01-01", "param_int": "42"},
        "core": {
            "user_configs": "autotestconfig",
            "user_init": "test_dbnd.orchestration.config.autoloaded_config.user_code_load_config",
        },
        "databand": {"module": autoload_module},
    }
    with dbnd_config(settings):
        dbnd_run_cmd("task_auto_config")
def test_decorated_report_params(
    self,
    log_value_preview,
    expected_input_param,
    expected_inputs_args,
    expected_inputs_kwargs,
    expected_result_value_preview,
    mock_channel_tracker,
):
    """Check the parameters and result target reported for a decorated task.

    ``my_task`` is called with positional, ``*args`` and ``**kwargs`` values
    under the given ``log_value_preview`` setting; the values recorded by
    ``mock_channel_tracker`` are then compared with the expected ones.
    """

    @task()
    def my_task(a, *args, **kwargs):
        return 6

    # executing the task
    tracking_section = {"tracking": {"log_value_preview": log_value_preview}}
    with dbnd_config(config_values=tracking_section):
        my_task("a", 1, 2, 3, 4, 5, 5, b=20, others=123)

    # get the parameters reported to the tracker
    # we want to compare that for each parameter value we have a definition
    # otherwise the webserver wouldn't have all the needed information
    reported = get_reported_params(mock_channel_tracker, "my_task")
    param_definitions, run_time_params = reported
    assert set(param_definitions) == set(run_time_params)

    # we want to be sure that the right parameter values where reported
    assert run_time_params["args"].value == expected_inputs_args
    assert run_time_params["kwargs"].value == expected_inputs_kwargs
    assert run_time_params["a"].value == expected_input_param

    # we want to check that we report the result target correctly
    result_target_info = get_task_target_result(mock_channel_tracker, "my_task")
    # Without value previews the reported result value is expected to be masked.
    expected_target_path = (
        result_target_info.target_path if log_value_preview else "***"
    )

    assert run_time_params[RESULT_PARAM].value == expected_target_path
    assert result_target_info.value_preview == expected_result_value_preview
def test_build_separate_wheels(self):
    """Build separate package and third-party zips and inspect both.

    ``build_wheel_zips`` is expected to return exactly two paths: one ending
    in ``third-party-deps.zip`` and one for the package itself. Both must
    exist and contain the expected members, and a second call must return
    the same paths in the same order.
    """
    bdist_settings = {
        "bdist_zip": {
            "package_dir": test_scenario_path("dbnd-test-package"),
            "requirements_file": test_scenario_path(
                "dbnd-test-package/requirements.txt"
            ),
        }
    }
    with dbnd_config(bdist_settings):
        zip_files = build_wheel_zips()
        assert len(zip_files) == 2

        # The two zips may come back in either order; tell them apart by the
        # third-party archive's file name suffix.
        package_zip = (
            zip_files[0]
            if zip_files[1].endswith("third-party-deps.zip")
            else zip_files[1]
        )
        third_party_zip = (
            zip_files[1] if zip_files[0] == package_zip else zip_files[0]
        )

        assert os.path.exists(package_zip)
        assert os.path.exists(third_party_zip)

        # Read member names via the public API and close each archive at once
        # so no file handle outlives the check.
        with zipfile.ZipFile(file=package_zip, mode="r") as package_archive:
            all_package_files = set(package_archive.namelist())
        assert "dbnd_test_package/my_lib.py" in all_package_files
        assert "dbnd_test_package-0.1.dist-info/METADATA" in all_package_files

        with zipfile.ZipFile(file=third_party_zip, mode="r") as deps_archive:
            all_third_party_files = set(deps_archive.namelist())
        assert "six.py" in all_third_party_files
        assert "luigi/task.py" in all_third_party_files

        # check cache
        new_zip_files = build_wheel_zips()
        assert zip_files[0] == new_zip_files[0]
        assert zip_files[1] == new_zip_files[1]