def test_memoization_on_celery_k8s(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, helm_namespace, dagit_url
):
    """Launch ``memoization_pipeline`` twice on celery-k8s and compare step events.

    Both runs use the same run config, whose ``io_manager`` resource points at a
    freshly generated ``s3_prefix``. The logs of the first run are expected to
    yield four step events and the logs of the second run none. Memoized results
    are cleaned up in the ``finally`` clause whether or not the assertions pass.
    """
    ephemeral_prefix = str(uuid.uuid4())

    s3_env_path = os.path.join(get_test_project_environments_path(), "env_s3.yaml")
    base_config = deep_merge_dicts(
        merge_yamls([s3_env_path]),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )
    io_manager_override = {
        "resources": {"io_manager": {"config": {"s3_prefix": ephemeral_prefix}}}
    }
    run_config = deep_merge_dicts(base_config, io_manager_override)

    try:
        run_ids = []
        for _ in range(2):
            run_id = launch_run_over_graphql(
                dagit_url,
                run_config=run_config,
                pipeline_name="memoization_pipeline",
                mode="celery",
            )
            job_name = "dagster-run-%s" % run_id
            result = wait_for_job_and_get_raw_logs(
                job_name=job_name, namespace=helm_namespace
            )
            assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)
            run_ids.append(run_id)

        # First run: nothing has been memoized yet.
        unmemoized_run_id = run_ids[0]
        unmemoized_logs = dagster_instance.all_logs(unmemoized_run_id)
        assert len(_get_step_events(unmemoized_logs)) == 4

        # Second run: same config, so no step events are expected.
        memoized_run_id = run_ids[1]
        memoized_logs = dagster_instance.all_logs(memoized_run_id)
        assert len(_get_step_events(memoized_logs)) == 0
    finally:
        cleanup_memoized_results(
            define_memoization_pipeline(), "celery", dagster_instance, run_config
        )
def daily_weather_ingest_schedule(date):
    """Build the weather ETL run config for ``date``.

    ``date`` is converted to whole seconds elapsed since 1970-01-01 and supplied
    as the ``epoch_date`` input of ``download_weather_report_from_weather_api``.
    That input config is deep-merged with the ``resources`` and ``weather_etl``
    solid config taken from ``weather_etl_environment``.
    """
    epoch_start = datetime(year=1970, month=1, day=1)
    unix_seconds_since_epoch = int((date - epoch_start).total_seconds())

    epoch_date_config = {
        "solids": {
            "weather_etl": {
                "solids": {
                    "download_weather_report_from_weather_api": {
                        "inputs": {"epoch_date": {"value": unix_seconds_since_epoch}}
                    },
                },
            }
        }
    }
    environment_config = {
        "resources": weather_etl_environment["resources"],
        "solids": {"weather_etl": weather_etl_environment["solids"]["weather_etl"]},
    }
    return deep_merge_dicts(epoch_date_config, environment_config)
def monthly_trip_ingest_schedule(date):
    """Build the trip ETL run config for the month containing ``date``.

    The ``file_name`` input of ``download_baybike_zipfile_from_url`` is derived
    from the year and month of ``date`` (``YYYYMM``). That input config is
    deep-merged with the ``resources`` and ``trip_etl`` solid config taken from
    ``trip_etl_environment``.
    """
    year_month = date.date().strftime("%Y%m")
    file_name = "{}-fordgobike-tripdata.csv.zip".format(year_month)

    file_name_config = {
        "solids": {
            "trip_etl": {
                "solids": {
                    "download_baybike_zipfile_from_url": {
                        "inputs": {"file_name": {"value": file_name}}
                    }
                }
            }
        }
    }
    environment_config = {
        "resources": trip_etl_environment["resources"],
        "solids": {"trip_etl": trip_etl_environment["solids"]["trip_etl"]},
    }
    return deep_merge_dicts(file_name_config, environment_config)
def update_job(self, job_id: int, **kwargs) -> Dict[str, Any]:
    """
    Change selected properties of a dbt Cloud job.

    The full set of accepted properties is documented here:
    https://docs.getdbt.com/dbt-cloud/api-v2#operation/updateJobById

    Args:
        job_id (int): The ID of the relevant dbt Cloud job. You can find this value by going to
            the details page of your job in the dbt Cloud UI. It will be the final number in the
            url, e.g.:
            ``https://cloud.getdbt.com/#/accounts/{account_id}/projects/{project_id}/jobs/{job_id}/``
        kwargs: The properties to change, deep-merged over the job's current state.

    Returns:
        Dict[str, Any]: Parsed json data from the response to this request

    Examples:

    .. code-block:: python

        # disable schedule for job with id=12345
        my_dbt_cloud_resource.update_job(12345, triggers={"schedule": False})
    """
    # The API requires a bunch of values to be supplied, so the job's current
    # state is fetched and used as the defaults for everything the caller did
    # not override.
    current_state = self.get_job(job_id)
    endpoint = f"{self._account_id}/jobs/{job_id}/"
    payload = deep_merge_dicts(current_state, kwargs)
    return self.make_request("POST", endpoint, data=payload)
def daily_weather_ingest_schedule(date):
    '''Build the weather ETL environment dict for ``date``.

    ``date`` is converted to whole seconds elapsed since 1970-01-01 and supplied
    as the ``epoch_date`` input of ``download_weather_report_from_weather_api``.
    That input config is deep-merged with the ``resources`` and ``weather_etl``
    solid config taken from ``weather_etl_environment``.
    '''
    epoch_start = datetime(year=1970, month=1, day=1)
    unix_seconds_since_epoch = int((date - epoch_start).total_seconds())

    epoch_date_config = {
        'solids': {
            'weather_etl': {
                'solids': {
                    'download_weather_report_from_weather_api': {
                        'inputs': {'epoch_date': {'value': unix_seconds_since_epoch}}
                    },
                },
            }
        }
    }
    environment_config = {
        'resources': weather_etl_environment['resources'],
        'solids': {'weather_etl': weather_etl_environment['solids']['weather_etl']},
    }
    return deep_merge_dicts(epoch_date_config, environment_config)
def test_pyspark_databricks(mock_wait, mock_get_step_events, mock_put_file, mock_submit_run):
    '''Run the do-nothing pipeline in ``prod_s3`` mode against mocked Databricks calls.

    The ``pyspark_step_launcher`` resource is configured from
    ``BASE_DATABRICKS_PYSPARK_STEP_LAUNCHER_CONFIG`` with an empty host and token.
    The test then checks that the run succeeded and how often each mock was called.
    '''
    pipeline = reconstructable(define_do_nothing_pipe)
    step_launcher_config = deep_merge_dicts(
        BASE_DATABRICKS_PYSPARK_STEP_LAUNCHER_CONFIG,
        {'databricks_host': '', 'databricks_token': ''},
    )
    environment_dict = {
        'resources': {
            'pyspark_step_launcher': {'config': step_launcher_config},
        },
    }

    result = execute_pipeline(
        pipeline=pipeline,
        mode='prod_s3',
        environment_dict=environment_dict,
    )

    assert result.success
    assert mock_wait.call_count == 1
    assert mock_get_step_events.call_count == 1
    # TODO: revisit this expected count once the set of uploaded packages is more stable
    assert mock_put_file.call_count == 3
    assert mock_submit_run.call_count == 1
def make_environment_dict(scratch_dir, mode):
    '''Return ``ENVIRONMENT_DICT_BASE`` deep-merged with scratch-directory config.

    ``second_step_launcher`` always gets ``scratch_dir`` as its scratch directory;
    ``first_step_launcher`` gets it too when ``mode`` is ``'external'`` or
    ``'request_retry'``. Filesystem storage is pointed at ``scratch_dir`` as its
    ``base_dir``.
    '''
    uses_first_launcher = mode in ['external', 'request_retry']
    step_launcher_resource_keys = (
        ['first_step_launcher', 'second_step_launcher']
        if uses_first_launcher
        else ['second_step_launcher']
    )

    resources = {}
    for resource_key in step_launcher_resource_keys:
        resources[resource_key] = {'config': {'scratch_dir': scratch_dir}}

    overrides = {
        'resources': resources,
        'storage': {'filesystem': {'config': {'base_dir': scratch_dir}}},
    }
    return deep_merge_dicts(ENVIRONMENT_DICT_BASE, overrides)
def test_pyspark_databricks(mock_wait, mock_get_step_events, mock_put_file, mock_submit_run):
    """Run the do-nothing pipeline in ``prod_s3`` mode against mocked Databricks calls.

    The ``pyspark_step_launcher`` resource is configured from
    ``BASE_DATABRICKS_PYSPARK_STEP_LAUNCHER_CONFIG`` with an empty host and token.
    The test then checks that the run succeeded and how often each mock was called.
    """
    pipeline = reconstructable(define_do_nothing_pipe)
    step_launcher_config = deep_merge_dicts(
        BASE_DATABRICKS_PYSPARK_STEP_LAUNCHER_CONFIG,
        {"databricks_host": "", "databricks_token": ""},
    )
    run_config = {
        "resources": {
            "pyspark_step_launcher": {"config": step_launcher_config},
        },
    }

    result = execute_pipeline(
        pipeline=pipeline,
        mode="prod_s3",
        run_config=run_config,
    )

    assert result.success
    assert mock_wait.call_count == 1
    assert mock_get_step_events.call_count == 1
    # TODO: revisit this expected count once the set of uploaded packages is more stable
    assert mock_put_file.call_count == 3
    assert mock_submit_run.call_count == 1
def make_run_config(scratch_dir, mode):
    '''Return ``RUN_CONFIG_BASE`` deep-merged with scratch-directory config.

    ``second_step_launcher`` always gets ``scratch_dir`` as its scratch directory;
    ``first_step_launcher`` gets it too when ``mode`` is ``'external'`` or
    ``'request_retry'``. Filesystem storage is pointed at ``scratch_dir`` as its
    ``base_dir``.
    '''
    uses_first_launcher = mode in ['external', 'request_retry']
    step_launcher_resource_keys = (
        ['first_step_launcher', 'second_step_launcher']
        if uses_first_launcher
        else ['second_step_launcher']
    )

    resources = {}
    for resource_key in step_launcher_resource_keys:
        resources[resource_key] = {'config': {'scratch_dir': scratch_dir}}

    overrides = {
        'resources': resources,
        'storage': {'filesystem': {'config': {'base_dir': scratch_dir}}},
    }
    return deep_merge_dicts(RUN_CONFIG_BASE, overrides)
def monthly_trip_ingest_schedule(date):
    '''Build the trip ETL environment dict for the month containing ``date``.

    The ``file_name`` input of ``download_baybike_zipfile_from_url`` is derived
    from the year and month of ``date`` (``YYYYMM``). That input config is
    deep-merged with the ``resources`` and ``trip_etl`` solid config taken from
    ``trip_etl_environment``.
    '''
    year_month = date.date().strftime('%Y%m')
    file_name = '{}-fordgobike-tripdata.csv.zip'.format(year_month)

    file_name_config = {
        'solids': {
            'trip_etl': {
                'solids': {
                    'download_baybike_zipfile_from_url': {
                        'inputs': {'file_name': {'value': file_name}}
                    }
                }
            }
        }
    }
    environment_config = {
        'resources': trip_etl_environment['resources'],
        'solids': {'trip_etl': trip_etl_environment['solids']['trip_etl']},
    }
    return deep_merge_dicts(file_name_config, environment_config)
def sample_runs_details(include_related=None, **kwargs):
    """Return a canned successful response whose ``data`` is a list of 100 sample runs.

    Each entry is the value returned by ``sample_run_details(include_related, **kwargs)``.
    When ``include_related`` contains ``"environment"``, every entry also gets an
    ``"environment"`` key holding a sample environment payload.
    """
    runs = []
    for _ in range(100):
        runs.append(sample_run_details(include_related, **kwargs))

    wants_environment = include_related and "environment" in include_related
    if wants_environment:
        for run in runs:
            run["environment"] = {
                "dbt_project_subdirectory": None,
                "project_id": 50000,
                "id": 47000,
                "account_id": SAMPLE_ACCOUNT_ID,
                "connection_id": 56000,
                "repository_id": 58000,
                "credentials_id": 52000,
                "created_by_id": None,
                "name": "dbt-environment",
                "use_custom_branch": False,
                "custom_branch": None,
                "dbt_version": "0.21.0",
                "supports_docs": False,
                "state": 10,
            }
            # NOTE(review): this only rebinds the loop variable; the entry stored in
            # ``runs`` changes only if deep_merge_dicts mutates its first argument
            # in place -- confirm the intent.
            run = deep_merge_dicts(run, kwargs)

    status = {
        "code": 200,
        "is_success": True,
        "user_message": "Success!",
        "developer_message": "",
    }
    return {"status": status, "data": runs}
def test_realistic():
    '''Merging two configs with different resources under one context keeps both.'''

    def under_unittest_context(resources):
        # Wrap a resources dict in the nesting shared by every dict in this test.
        return {'context': {'unittest': {'resources': resources}}}

    from_dict = under_unittest_context(
        {'db_resource': {'config': {'user': '******', 'password': '******'}}}
    )
    onto_dict = under_unittest_context({'another': {'config': 'not_sensitive'}})

    result_dict = under_unittest_context(
        {
            'db_resource': {'config': {'user': '******', 'password': '******'}},
            'another': {'config': 'not_sensitive'},
        }
    )

    merged = deep_merge_dicts(onto_dict, from_dict)
    assert merged == result_dict
def test_realistic():
    """Merging two configs with different resources under one context keeps both."""

    def under_unittest_context(resources):
        # Wrap a resources dict in the nesting shared by every dict in this test.
        return {"context": {"unittest": {"resources": resources}}}

    from_dict = under_unittest_context(
        {"db_resource": {"config": {"user": "******", "password": "******"}}}
    )
    onto_dict = under_unittest_context({"another": {"config": "not_sensitive"}})

    result_dict = under_unittest_context(
        {
            "db_resource": {"config": {"user": "******", "password": "******"}},
            "another": {"config": "not_sensitive"},
        }
    )

    merged = deep_merge_dicts(onto_dict, from_dict)
    assert merged == result_dict
def make_run_config(scratch_dir, mode):
    """Return ``RUN_CONFIG_BASE`` deep-merged with scratch-directory resource config.

    The ``io_manager`` resource gets ``scratch_dir`` as its ``base_dir``.
    ``second_step_launcher`` always gets ``scratch_dir`` as its scratch directory;
    ``first_step_launcher`` gets it too when ``mode`` is ``"external"`` or
    ``"request_retry"``.
    """
    uses_first_launcher = mode in ["external", "request_retry"]
    step_launcher_resource_keys = (
        ["first_step_launcher", "second_step_launcher"]
        if uses_first_launcher
        else ["second_step_launcher"]
    )

    io_manager_resources = {"io_manager": {"config": {"base_dir": scratch_dir}}}
    step_launcher_resources = {}
    for resource_key in step_launcher_resource_keys:
        step_launcher_resources[resource_key] = {"config": {"scratch_dir": scratch_dir}}

    overrides = {
        "resources": merge_dicts(io_manager_resources, step_launcher_resources),
    }
    return deep_merge_dicts(RUN_CONFIG_BASE, overrides)
def test_pyspark_databricks(mock_wait, mock_get_step_events, mock_put_file, mock_submit_run):
    """Run the do-nothing pipeline in ``prod_s3`` mode against mocked Databricks calls.

    The step events the mock hands back are taken from a real ``local``-mode
    execution of the same pipeline. The test then checks that the ``prod_s3`` run
    succeeded and how often each mock was called.
    """
    # Execute locally first so the mocked step-events call can return the
    # events recorded for ``do_nothing_solid``.
    local_result = execute_pipeline(
        pipeline=reconstructable(define_do_nothing_pipe), mode="local"
    )
    mock_get_step_events.return_value = local_result.events_by_step_key["do_nothing_solid"]

    pipeline = reconstructable(define_do_nothing_pipe)
    step_launcher_config = deep_merge_dicts(
        BASE_DATABRICKS_PYSPARK_STEP_LAUNCHER_CONFIG,
        {"databricks_host": "", "databricks_token": ""},
    )
    run_config = {
        "resources": {
            "pyspark_step_launcher": {"config": step_launcher_config},
        },
    }

    result = execute_pipeline(pipeline=pipeline, mode="prod_s3", run_config=run_config)

    assert result.success
    assert mock_wait.call_count == 1
    assert mock_get_step_events.call_count == 1
    assert mock_put_file.call_count == 4
    assert mock_submit_run.call_count == 1
def test_nested_merge():
    """Dicts nested under the same key are combined rather than replaced."""
    from_dict = {"key": {"nested_one": 1}}
    onto_dict = {"key": {"nested_two": 2}}
    expected = {"key": {"nested_one": 1, "nested_two": 2}}

    merged = deep_merge_dicts(onto_dict, from_dict)
    assert merged == expected
def run_pipeline(
    self,
    pipeline: PipelineDefinition,
    config_name: str,
    extra_config: "dict[str, Any] | None" = None,
    pipeline_mode='test',
) -> PipelineExecutionResult:
    """Execute ``pipeline`` with a YAML-backed run config.

    Args:
        pipeline: The pipeline to execute.
        config_name: Passed to ``config_path`` to locate the YAML file(s) loaded
            as the base run config.
        extra_config: Optional config deep-merged over the loaded config.
            ``None`` (the default) means no extra config.
        pipeline_mode: The mode the pipeline is executed in.

    Returns:
        The result of ``execute_pipeline``.
    """
    # A ``None`` sentinel replaces the previous ``{}`` default so that no dict
    # object is shared between calls.
    overrides = {} if extra_config is None else extra_config
    config_dict = load_yaml_from_globs(config_path(config_name))
    config_dict = deep_merge_dicts(config_dict, overrides)
    return execute_pipeline(pipeline, run_config=config_dict, mode=pipeline_mode)
def test_pyspark_databricks(
    mock_get_run_state,
    mock_get_step_events,
    mock_put_file,
    mock_read_file,
    mock_submit_run,
):
    """Run the do-nothing pipeline in ``test`` mode against mocked Databricks calls.

    The mocked run state reports ``Running`` five times and then a successful
    ``Terminated`` state. The step events the mock hands back are taken from a
    real ``local``-mode execution of the same pipeline. The test then checks that
    the run succeeded and how often each mock was called.
    """
    mock_submit_run.return_value = 12345
    mock_read_file.return_value = "somefilecontents".encode()

    running_state = DatabricksRunState(DatabricksRunLifeCycleState.Running, None, "")
    final_state = DatabricksRunState(
        DatabricksRunLifeCycleState.Terminated, DatabricksRunResultState.Success, ""
    )
    mock_get_run_state.side_effect = [running_state] * 5 + [final_state]

    with instance_for_test() as instance:
        execute_pipeline(
            pipeline=reconstructable(define_do_nothing_pipe),
            mode="local",
            instance=instance,
        )
        # Keep only the event log entries recorded for ``do_nothing_solid``.
        step_event_entries = []
        for record in instance.get_event_records():
            if record.event_log_entry.step_key == "do_nothing_solid":
                step_event_entries.append(record.event_log_entry)
        mock_get_step_events.return_value = step_event_entries

    # Work on a copy of the base config so removing a key does not touch the
    # shared module-level dict.
    config = BASE_DATABRICKS_PYSPARK_STEP_LAUNCHER_CONFIG.copy()
    config.pop("local_pipeline_package_path")

    pipeline = reconstructable(define_do_nothing_pipe)
    step_launcher_config = deep_merge_dicts(
        config,
        {
            "databricks_host": "",
            "databricks_token": "",
            "poll_interval_sec": 0.1,
            "local_dagster_job_package_path": os.path.abspath(os.path.dirname(__file__)),
        },
    )
    run_config = {
        "resources": {
            "pyspark_step_launcher": {"config": step_launcher_config},
        },
    }

    result = execute_pipeline(pipeline=pipeline, mode="test", run_config=run_config)

    assert result.success
    assert mock_get_run_state.call_count == 6
    assert mock_get_step_events.call_count == 6
    assert mock_put_file.call_count == 4
    assert mock_read_file.call_count == 2
    assert mock_submit_run.call_count == 1
def with_additional_config(self, run_config):
    """Return a preset whose run config has ``run_config`` deep-merged into it.

    When ``run_config`` is ``None`` this preset itself is returned. Otherwise a
    new ``PresetDefinition`` is built with the same name, solid selection, mode
    and tags, and with ``run_config`` deep-merged over the existing run config.
    """
    check.opt_nullable_dict_param(run_config, "run_config")

    if run_config is None:
        return self

    merged_run_config = deep_merge_dicts(self.run_config, run_config)
    return PresetDefinition(
        name=self.name,
        solid_selection=self.solid_selection,
        mode=self.mode,
        tags=self.tags,
        run_config=merged_run_config,
    )
def test_pyspark_emr(mock_is_emr_step_complete, mock_read_events, mock_s3_bucket):
    """Run the do-nothing pipeline in ``prod`` mode against a test EMR cluster.

    The events the mocked reader hands back are taken from a real ``local``-mode
    execution of the same pipeline. The test checks that the ``prod`` run
    succeeded and that the step-completion mock was called.
    """
    local_result = execute_pipeline(
        reconstructable(define_do_nothing_pipe), mode="local"
    )
    mock_read_events.return_value = local_result.events_by_step_key["do_nothing_solid"]

    run_job_flow_args = {
        "Instances": {
            "InstanceCount": 1,
            "KeepJobFlowAliveWhenNoSteps": True,
            "MasterInstanceType": "c3.medium",
            "Placement": {"AvailabilityZone": "us-west-1a"},
            "SlaveInstanceType": "c3.xlarge",
        },
        "JobFlowRole": "EMR_EC2_DefaultRole",
        "LogUri": "s3://{bucket}/log".format(bucket=mock_s3_bucket.name),
        "Name": "cluster",
        "ServiceRole": "EMR_DefaultRole",
        "VisibleToAllUsers": True,
    }

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed
    # through to the pyspark EMR resource.
    job_runner = EmrJobRunner(region="us-west-1")
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    pipeline = reconstructable(define_do_nothing_pipe)
    step_launcher_config = deep_merge_dicts(
        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG,
        {"cluster_id": cluster_id, "staging_bucket": mock_s3_bucket.name},
    )
    run_config = {
        "resources": {
            "pyspark_step_launcher": {"config": step_launcher_config},
        },
    }

    result = execute_pipeline(pipeline=pipeline, mode="prod", run_config=run_config)

    assert result.success
    assert mock_is_emr_step_complete.called
def run_config_storage_field_backcompat(run_config):
    """Treat the run config's "storage" entry as an alias of "intermediate_storage".

    This method will be removed after "storage" is removed in run config.

    For backwards compatibility, run config passed in through the "storage" entry
    is used to define intermediate storage. When "storage" and
    "intermediate_storage" are both specified, the intermediate storage config
    overrides the storage config.

    Tracking https://github.com/dagster-io/dagster/issues/3280
    """
    storage_config = run_config.get("storage")

    intermediate_storage_dict = {}
    if storage_config:
        # An explicit "intermediate_storage" entry wins; "storage" is the fallback.
        intermediate_storage_dict["intermediate_storage"] = (
            run_config.get("intermediate_storage") or storage_config
        )

    return deep_merge_dicts(run_config, intermediate_storage_dict)
def test_pyspark_emr(mock_wait, mock_get_step_events):
    '''Run the do-nothing pipeline in ``prod`` mode against a test EMR cluster.

    The test checks that the run succeeded and that the two mocks were each
    called exactly once.
    '''
    run_job_flow_args = dict(
        Instances={
            'InstanceCount': 1,
            'KeepJobFlowAliveWhenNoSteps': True,
            'MasterInstanceType': 'c3.medium',
            'Placement': {'AvailabilityZone': 'us-west-1a'},
            'SlaveInstanceType': 'c3.xlarge',
        },
        JobFlowRole='EMR_EC2_DefaultRole',
        LogUri='s3://mybucket/log',
        Name='cluster',
        ServiceRole='EMR_DefaultRole',
        VisibleToAllUsers=True,
    )

    # Doing cluster setup outside of a solid here, because run_job_flow is not yet plumbed
    # through to the pyspark EMR resource.
    job_runner = EmrJobRunner(region='us-west-1')
    context = create_test_pipeline_execution_context()
    cluster_id = job_runner.run_job_flow(context.log, run_job_flow_args)

    pipeline_def = ExecutionTargetHandle.for_pipeline_fn(
        define_do_nothing_pipe
    ).build_pipeline_definition()
    result = execute_pipeline(
        pipeline=pipeline_def,
        mode='prod',
        environment_dict={
            'resources': {
                'pyspark_step_launcher': {
                    'config': deep_merge_dicts(
                        BASE_EMR_PYSPARK_STEP_LAUNCHER_CONFIG, {'cluster_id': cluster_id}
                    ),
                }
            },
        },
    )

    assert result.success
    # ``called_once`` is not part of the Mock API: reading it auto-creates a truthy
    # child mock (so the old ``assert mock.called_once`` could never fail) and it
    # raises AttributeError on Python 3.12+. Use the real assertion helper instead.
    mock_wait.assert_called_once()
    mock_get_step_events.assert_called_once()
def test_volume_mounts(dagster_docker_image, dagster_instance, helm_namespace, dagit_url):
    """Launch ``volume_mount_pipeline`` on celery-k8s and check that it succeeds.

    Success is detected by finding ``PIPELINE_SUCCESS`` in the raw logs of the
    run's Kubernetes job.
    """
    s3_env_path = os.path.join(get_test_project_environments_path(), "env_s3.yaml")
    celery_engine_config = get_celery_engine_config(
        dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
    )
    run_config = deep_merge_dicts(merge_yamls([s3_env_path]), celery_engine_config)

    run_id = launch_run_over_graphql(
        dagit_url,
        run_config=run_config,
        pipeline_name="volume_mount_pipeline",
        mode="celery",
    )

    job_name = "dagster-run-%s" % run_id
    result = wait_for_job_and_get_raw_logs(job_name=job_name, namespace=helm_namespace)

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)
def run_config_storage_field_backcompat(run_config):
    """Treat the run config's "storage" entry as an alias of "intermediate_storage".

    This method will be removed after "storage" is removed in run config.

    For backwards compatibility, run config passed in through the "storage" entry
    is used to define intermediate storage, and a deprecation warning is emitted.
    When "storage" and "intermediate_storage" are both specified, the intermediate
    storage config overrides the storage config.

    Tracking https://github.com/dagster-io/dagster/issues/3280
    """
    storage_config = run_config.get("storage")

    intermediate_storage_dict = {}
    if storage_config:
        deprecation_message = (
            'the "storage" entry in the run config is deprecated and will removed in the '
            'dagster 0.11.0 release. Please use "intermediate_storage" instead and update '
            "the corresponding `system_storage_defs` argument in `ModeDefinition` to "
            "`intermediate_storage_defs`."
        )
        warnings.warn(deprecation_message)
        # An explicit "intermediate_storage" entry wins; "storage" is the fallback.
        intermediate_storage_dict["intermediate_storage"] = (
            run_config.get("intermediate_storage") or storage_config
        )

    return deep_merge_dicts(run_config, intermediate_storage_dict)
def get_sample_connector_response(**kwargs):
    """Return a sample successful connector response with ``kwargs`` deep-merged in.

    The defaults describe a connected, scheduled connector whose id is
    ``DEFAULT_CONNECTOR_ID``. Keyword arguments are deep-merged over those
    defaults.
    """
    status = {
        "setup_state": "connected",
        "sync_state": "scheduled",
        "update_state": "on_schedule",
        "is_historical_sync": False,
        "tasks": [],
        "warnings": [],
    }
    connector_data = {
        "id": DEFAULT_CONNECTOR_ID,
        "group_id": "some_group",
        "service": "some_service",
        "service_version": 1,
        "schema": "some_service.some_name",
        "connected_by": "some_user",
        "created_at": "2021-01-01T00:00:00.0Z",
        "succeeded_at": "2021-01-01T01:00:00.0Z",
        "failed_at": None,
        "paused": False,
        "pause_after_trial": False,
        "sync_frequency": 360,
        "schedule_type": "auto",
        "status": status,
        "config": {"auth_type": "OAuth"},
        "source_sync_details": {"last_synced": "2021-10-27T16:58:40.035Z"},
    }
    default_response = {"code": "Success", "data": connector_data}
    return deep_merge_dicts(default_response, kwargs)
def test_simple_merge():
    """Merging with an empty dict on either side returns the other side's content."""
    # Each case is (onto_dict, from_dict, expected result).
    cases = [
        ({}, {}, {}),
        ({1: 2}, {}, {1: 2}),
        ({}, {1: 2}, {1: 2}),
    ]
    for onto_dict, from_dict, expected in cases:
        assert deep_merge_dicts(onto_dict, from_dict) == expected
def test_smash():
    """On a conflicting non-dict value, the second argument's value wins."""
    from_dict = {"value": "smasher"}
    onto_dict = {"value": "got_smashed"}

    merged = deep_merge_dicts(onto_dict, from_dict)
    assert merged["value"] == "smasher"
def test_nested_merge():
    '''Dicts nested under the same key are combined rather than replaced.'''
    from_dict = {'key': {'nested_one': 1}}
    onto_dict = {'key': {'nested_two': 2}}
    expected = {'key': {'nested_one': 1, 'nested_two': 2}}

    merged = deep_merge_dicts(onto_dict, from_dict)
    assert merged == expected
def sample_run_details(include_related=None, **kwargs):
    """Return a canned successful response describing one sample dbt Cloud run.

    The ``data`` payload starts from a fixed set of run fields. When
    ``include_related`` contains ``"trigger"`` and/or ``"job"``, the matching
    field is filled with a sample payload instead of ``None``. Keyword arguments
    are deep-merged over the resulting run data.
    """
    run_data = {
        "id": SAMPLE_RUN_ID,
        "trigger_id": 33000000,
        "account_id": SAMPLE_ACCOUNT_ID,
        "environment_id": 47000,
        "project_id": 50000,
        "job_definition_id": SAMPLE_JOB_ID,
        "status": 10,
        "dbt_version": "0.21.0",
        "git_branch": "master",
        "git_sha": "a32e8239326887421f1314ee0890e5174c8f644a",
        "status_message": None,
        "owner_thread_id": None,
        "executed_by_thread_id": "dbt-run-3000000-6v5pm",
        "deferring_run_id": None,
        "artifacts_saved": True,
        "artifact_s3_path": "prod/runs/3000000/artifacts/target",
        "has_docs_generated": False,
        "has_sources_generated": False,
        "notifications_sent": True,
        "blocked_by": [],
        "scribe_enabled": True,
        "created_at": "2021-11-01 22:47:48.501943+00:00",
        "updated_at": "2021-11-01 22:48:44.860334+00:00",
        "dequeued_at": "2021-11-01 22:48:22.880352+00:00",
        "started_at": "2021-11-01 22:48:28.595439+00:00",
        "finished_at": "2021-11-01 22:48:44.684263+00:00",
        "last_checked_at": None,
        "last_heartbeat_at": None,
        "should_start_at": "2021-11-01 22:47:48.501943+00:00",
        "trigger": None,
        "job": None,
        "environment": None,
        "run_steps": [],
        "status_humanized": "Success",
        "in_progress": False,
        "is_complete": True,
        "is_success": True,
        "is_error": False,
        "is_cancelled": False,
        "href": "https://cloud.getdbt.com/#/accounts/30000/projects/50000/runs/3000000/",
        "duration": "00:00:56",
        "queued_duration": "00:00:40",
        "run_duration": "00:00:16",
        "duration_humanized": "56 seconds",
        "queued_duration_humanized": "40 seconds",
        "run_duration_humanized": "16 seconds",
        "created_at_humanized": "26 minutes, 12 seconds ago",
        "finished_at_humanized": "25 minutes, 16 seconds ago",
        "job_id": SAMPLE_JOB_ID,
    }

    # A falsy ``include_related`` (e.g. None) requests no related objects.
    requested = include_related or ()

    if "trigger" in requested:
        run_data["trigger"] = {
            "id": 33149624,
            "cause": "Triggered via Dagster",
            "job_definition_id": SAMPLE_JOB_ID,
            "git_branch": None,
            "git_sha": None,
            "github_pull_request_id": None,
            "gitlab_merge_request_id": None,
            "schema_override": None,
            "dbt_version_override": None,
            "threads_override": None,
            "target_name_override": None,
            "generate_docs_override": None,
            "timeout_seconds_override": None,
            "steps_override": None,
            "created_at": "2021-11-01 22:47:48.494450+00:00",
            "cause_humanized": "Triggered via Dagster",
            "job": None,
        }

    if "job" in requested:
        run_data["job"] = {
            "execution": {"timeout_seconds": 0},
            "generate_docs": False,
            "run_generate_sources": False,
            "id": SAMPLE_JOB_ID,
            "account_id": SAMPLE_ACCOUNT_ID,
            "project_id": 50000,
            "environment_id": 47071,
            "name": "MyCoolJob",
            "dbt_version": None,
            "created_at": "2021-10-29T21:35:33.278228Z",
            "updated_at": "2021-11-01T23:03:20.887248Z",
            "execute_steps": ["dbt run"],
            "state": 1,
            "deferring_job_definition_id": None,
            "lifecycle_webhooks": False,
            "lifecycle_webhooks_url": None,
            "triggers": {
                "github_webhook": False,
                "git_provider_webhook": False,
                "custom_branch_only": False,
                "schedule": False,
            },
            "settings": {"threads": 4, "target_name": "default"},
        }

    status = {
        "code": 200,
        "is_success": True,
        "user_message": "Success!",
        "developer_message": "",
    }
    return {"status": status, "data": deep_merge_dicts(run_data, kwargs)}
def test_memoization_k8s_executor(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagster_docker_image,
    dagit_url_for_k8s_run_launcher,
):
    """Launch ``memoization_pipeline`` twice with the k8s executor and compare runs.

    Both runs use the same run config, whose ``io_manager`` resource points at a
    freshly generated ``s3_prefix``. The first run is expected to produce one
    step execution event and the second run none.
    """
    ephemeral_path = str(uuid.uuid4())

    s3_env_path = os.path.join(get_test_project_environments_path(), "env_s3.yaml")
    k8s_executor_config = {
        "execution": {
            "k8s": {
                "config": {
                    "job_namespace": helm_namespace_for_k8s_run_launcher,
                    "job_image": dagster_docker_image,
                    "image_pull_policy": image_pull_policy(),
                }
            }
        },
    }
    base_config = deep_merge_dicts(load_yaml_from_path(s3_env_path), k8s_executor_config)

    io_manager_override = {
        "resources": {"io_manager": {"config": {"s3_prefix": ephemeral_path}}}
    }
    run_config = deep_merge_dicts(base_config, io_manager_override)

    # The try/finally ensures that memoized results are always cleaned from the s3 bucket.
    try:
        pipeline_name = "memoization_pipeline"

        run_ids = []
        for _ in range(2):
            run_id = launch_run_over_graphql(
                dagit_url_for_k8s_run_launcher,
                run_config=run_config,
                pipeline_name=pipeline_name,
                mode="k8s",
            )
            job_name = "dagster-run-%s" % run_id
            result = wait_for_job_and_get_raw_logs(
                job_name=job_name,
                namespace=helm_namespace_for_k8s_run_launcher,
            )
            assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)
            run_ids.append(run_id)

        # The first run has to execute the step, since nothing has been memoized yet.
        unmemoized_run_id = run_ids[0]
        unmemoized_events = dagster_instance_for_k8s_run_launcher.all_logs(unmemoized_run_id)
        assert len(_get_step_execution_events(unmemoized_events)) == 1

        # The second run should not execute the step, since it has been memoized.
        memoized_run_id = run_ids[1]
        memoized_events = dagster_instance_for_k8s_run_launcher.all_logs(memoized_run_id)
        assert len(_get_step_execution_events(memoized_events)) == 0
    finally:
        cleanup_memoized_results(
            define_memoization_pipeline(),
            "k8s",
            dagster_instance_for_k8s_run_launcher,
            run_config,
        )