def config_type(cls):
    '''Include all arguments required for DagsterK8sJobConfig along with additional arguments
    needed for the RunLauncher itself.
    '''
    job_cfg = DagsterK8sJobConfig.config_type()

    run_launcher_extra_cfg = {
        'job_namespace': str,
        'load_incluster_config': Field(bool, is_required=False, default_value=True),
        'kubeconfig_file': Field(Noneable(str), is_required=False, default_value=None),
    }
    return merge_dicts(job_cfg, run_launcher_extra_cfg)

def test_backfill_partition_range(backfill_args_context):
    with backfill_args_context as (cli_args, instance):
        args = merge_dicts(
            cli_args,
            {"pipeline": "baz", "partition_set": "baz_partitions", "from": "7"},
        )
        run_test_backfill(args, instance, expected_count=3)

        args = merge_dicts(
            cli_args,
            {"pipeline": "baz", "partition_set": "baz_partitions", "to": "2"},
        )
        run_test_backfill(args, instance, expected_count=6)  # 3 more runs

        args = merge_dicts(
            cli_args,
            {"pipeline": "baz", "partition_set": "baz_partitions", "from": "2", "to": "5"},
        )
        run_test_backfill(args, instance, expected_count=10)  # 4 more runs

def _create_sensor_run(instance, repo_location, external_sensor, external_pipeline, run_request):
    external_execution_plan = repo_location.get_external_execution_plan(
        external_pipeline,
        run_request.run_config,
        external_sensor.mode,
        step_keys_to_execute=None,
    )
    execution_plan_snapshot = external_execution_plan.execution_plan_snapshot

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(
        merge_dicts(pipeline_tags, run_request.tags),
        PipelineRun.tags_for_sensor(external_sensor),
    )
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    return instance.create_run(
        pipeline_name=external_sensor.pipeline_name,
        run_id=None,
        run_config=run_request.run_config,
        mode=external_sensor.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        status=PipelineRunStatus.NOT_STARTED,
        solid_selection=external_sensor.solid_selection,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
    )

def config_type(cls):
    """Include all arguments required for DagsterK8sJobConfig along with additional arguments
    needed for the RunLauncher itself.
    """
    job_cfg = DagsterK8sJobConfig.config_type()

    run_launcher_extra_cfg = {
        "job_namespace": Field(StringSource, is_required=False, default_value="default"),
        "load_incluster_config": Field(bool, is_required=False, default_value=True),
        "kubeconfig_file": Field(Noneable(str), is_required=False, default_value=None),
    }
    return merge_dicts(job_cfg, run_launcher_extra_cfg)

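# The composition above relies on merge_dicts keeping the keys of both schema dicts and
# letting the second argument win on key collisions. A minimal sketch of that precedence
# (illustrative only, not the dagster.utils implementation):
def _merge_dicts_sketch(onto_dict, from_dict):
    result = dict(onto_dict)  # start from the lower-precedence dict
    result.update(from_dict)  # keys from the later dict override on collisions
    return result

assert _merge_dicts_sketch({"a": 1, "b": 2}, {"b": 3, "c": 4}) == {"a": 1, "b": 3, "c": 4}
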
def merge(self, other: "DockerContainerContext"):
    # Combines config set at a higher level with overrides/additions that are set at a lower
    # level. For example, a certain set of config set in the `DockerRunLauncher` can be
    # combined with config set at the step level in the `docker_executor`.
    # Lists of env vars and networks are appended, the registry is replaced, and the
    # `container_kwargs` field does a shallow merge so that different kwargs can be combined
    # or replaced without replacing the full set of arguments.
    return DockerContainerContext(
        registry=other.registry if other.registry is not None else self.registry,
        env_vars=self.env_vars + other.env_vars,
        networks=self.networks + other.networks,
        container_kwargs=merge_dicts(self.container_kwargs, other.container_kwargs),
    )

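# Hypothetical usage sketch for the merge above. The four constructor fields mirror the
# call in merge(); the concrete values (registry dict shape, env var strings, volume
# kwargs) are illustrative assumptions, not taken from the source.
launcher_level = DockerContainerContext(
    registry={"url": "hub.example.com", "username": "user", "password": "pw"},
    env_vars=["SHARED_VAR=1"],
    networks=["default"],
    container_kwargs={"volumes": ["/base:/base"], "auto_remove": True},
)
step_level = DockerContainerContext(
    registry=None,  # None here keeps the launcher-level registry
    env_vars=["STEP_VAR=2"],  # appended after SHARED_VAR=1
    networks=[],
    container_kwargs={"volumes": ["/step:/step"]},  # "volumes" replaced, "auto_remove" kept
)
merged = launcher_level.merge(step_level)
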
def test_execute_on_celery_k8s_with_termination(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, helm_namespace
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )

    _test_termination(dagster_instance, run_config)

def __new__(
    cls,
    pipeline_name,
    key_suffix,
    step_inputs,
    step_outputs,
    compute_fn,
    kind,
    solid_handle,
    solid,
    logging_tags=None,
):
    check.inst_param(solid, "solid", Solid)

    return super(ExecutionStep, cls).__new__(
        cls,
        pipeline_name=check.str_param(pipeline_name, "pipeline_name"),
        key_suffix=check.str_param(key_suffix, "key_suffix"),
        step_inputs=check.list_param(step_inputs, "step_inputs", of_type=StepInput),
        step_input_dict={si.name: si for si in step_inputs},
        step_outputs=check.list_param(step_outputs, "step_outputs", of_type=StepOutput),
        step_output_dict={so.name: so for so in step_outputs},
        # compute_fn is the compute function for the step.
        # Not to be confused with the compute_fn of the passed in solid.
        compute_fn=check.callable_param(compute_fn, "compute_fn"),
        kind=check.inst_param(kind, "kind", StepKind),
        solid_handle=check.inst_param(solid_handle, "solid_handle", SolidHandle),
        solid_version=solid.definition.version,
        logging_tags=merge_dicts(
            {
                "step_key": str(solid_handle) + "." + key_suffix,
                "pipeline": pipeline_name,
                "solid": solid_handle.name,
                "solid_definition": solid.definition.name,
            },
            check.opt_dict_param(logging_tags, "logging_tags"),
        ),
        tags=solid.tags,
        hook_defs=solid.hook_defs,
    )

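# Illustrative only: for a hypothetical step built from solid handle "parent.add_one" with
# key_suffix "compute" in pipeline "my_pipeline", the default logging_tags assembled above
# would be the following dict, with any caller-supplied logging_tags keys overriding it:
example_logging_tags = {
    "step_key": "parent.add_one.compute",
    "pipeline": "my_pipeline",
    "solid": "add_one",
    "solid_definition": "add_one_def",  # hypothetical solid definition name
}
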
def test_execute_on_celery(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, helm_namespace
):
    environment_dict = merge_dicts(
        merge_yamls(
            [
                os.path.join(test_project_environments_path(), 'env.yaml'),
                os.path.join(test_project_environments_path(), 'env_s3.yaml'),
            ]
        ),
        {
            'execution': {
                'celery-k8s': {
                    'config': {
                        'broker': {'env': 'DAGSTER_K8S_CELERY_BROKER'},
                        'backend': {'env': 'DAGSTER_K8S_CELERY_BACKEND'},
                        'job_image': dagster_docker_image,
                        'job_namespace': helm_namespace,
                        'instance_config_map': 'dagster-instance',
                        'postgres_password_secret': 'dagster-postgresql-secret',
                        'image_pull_policy': 'Always',
                        'env_config_maps': ['dagster-pipeline-env'],
                    }
                }
            },
        },
    )

    pipeline_name = 'demo_pipeline_celery'

    run = create_run_for_test(
        dagster_instance,
        pipeline_name=pipeline_name,
        environment_dict=environment_dict,
        mode='default',
    )

    dagster_instance.launch_run(run.run_id)

    result = wait_for_job_and_get_logs(
        job_name='dagster-run-%s' % run.run_id, namespace=helm_namespace
    )

    assert not result.get('errors')
    assert result['data']
    assert (
        result['data']['startPipelineExecutionForCreatedRun']['__typename']
        == 'StartPipelineRunSuccess'
    )

def test_k8s_run_launcher_image_from_origin(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagit_url_for_k8s_run_launcher,
):
    # Like the previous test, but the executor doesn't supply an image - it's pulled
    # from the origin on the run instead
    pods = DagsterKubernetesClient.production_client().core_api.list_namespaced_pod(
        namespace=helm_namespace_for_k8s_run_launcher
    )
    celery_pod_names = [p.metadata.name for p in pods.items if "celery-workers" in p.metadata.name]
    check.invariant(not celery_pod_names)

    run_config = merge_dicts(
        load_yaml_from_path(os.path.join(get_test_project_environments_path(), "env.yaml")),
        load_yaml_from_path(os.path.join(get_test_project_environments_path(), "env_s3.yaml")),
        {
            "execution": {
                "k8s": {
                    "config": {
                        "job_namespace": helm_namespace_for_k8s_run_launcher,
                        "image_pull_policy": image_pull_policy(),
                    }
                }
            },
        },
    )

    pipeline_name = "demo_k8s_executor_pipeline"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher, run_config=run_config, pipeline_name=pipeline_name
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == get_test_project_docker_image()

def launch_run(self, instance, run, external_pipeline):
    check.inst_param(run, "run", PipelineRun)
    check.inst_param(external_pipeline, "external_pipeline", ExternalPipeline)

    repository_location_handle = external_pipeline.repository_handle.repository_location_handle

    check.inst(
        repository_location_handle,
        GRPC_REPOSITORY_LOCATION_HANDLE_TYPES,
        "DefaultRunLauncher: Can't launch runs for pipeline not loaded from a GRPC server",
    )

    self._instance.add_run_tags(
        run.run_id,
        {
            GRPC_INFO_TAG: seven.json.dumps(
                merge_dicts(
                    {"host": repository_location_handle.host},
                    (
                        {"port": repository_location_handle.port}
                        if repository_location_handle.port
                        else {"socket": repository_location_handle.socket}
                    ),
                    ({"use_ssl": True} if repository_location_handle.use_ssl else {}),
                )
            )
        },
    )

    res = repository_location_handle.client.start_run(
        ExecuteExternalPipelineArgs(
            pipeline_origin=external_pipeline.get_external_origin(),
            pipeline_run_id=run.run_id,
            instance_ref=self._instance.get_ref(),
        )
    )

    if not res.success:
        raise DagsterLaunchFailedError(
            res.message, serializable_error_info=res.serializable_error_info
        )

    self._run_id_to_repository_location_handle_cache[run.run_id] = repository_location_handle

    return run

def get_failing_celery_job_engine_config(dagster_docker_image, job_namespace):
    return {
        "execution": {
            "config": merge_dicts(
                (
                    {
                        "job_image": dagster_docker_image,
                    }
                    if dagster_docker_image
                    else {}
                ),
                {
                    "job_namespace": job_namespace,
                    "image_pull_policy": image_pull_policy(),
                    "env_config_maps": ["non-existent-config-map"],
                },
            )
        },
    }

def k8s_mode_defs(resources=None, name="default"):
    from dagster_k8s.executor import k8s_job_executor

    resources = resources if resources else {"s3": s3_resource}
    resources = merge_dicts(resources, {"io_manager": s3_pickle_io_manager})

    return [
        ModeDefinition(
            name=name,
            resource_defs=resources
            if resources
            else {"s3": s3_resource, "io_manager": s3_pickle_io_manager},
            executor_defs=default_executors + [k8s_job_executor],
        )
    ]

def test_map_fail(run_config):
    with instance_for_test() as instance:
        result = execute_pipeline(
            reconstructable(dynamic_pipeline),
            instance=instance,
            run_config=merge_dicts({"solids": {"emit": {"config": {"fail": True}}}}, run_config),
            raise_on_error=False,
        )
        assert not result.success

def _default_cli_test_instance_tempdir(temp_dir, overrides=None):
    default_overrides = {
        "run_launcher": {
            "module": "dagster.core.test_utils",
            "class": "MockedRunLauncher",
        }
    }
    with instance_for_test(
        temp_dir=temp_dir,
        overrides=merge_dicts(default_overrides, (overrides if overrides else {})),
    ) as instance:
        with mock.patch("dagster.core.instance.DagsterInstance.get") as _instance:
            _instance.return_value = instance
            yield instance

def scheduler_instance(overrides=None):
    with tempfile.TemporaryDirectory() as temp_dir:
        with _default_cli_test_instance_tempdir(
            temp_dir,
            overrides=merge_dicts(
                {
                    "scheduler": {
                        "module": "dagster.utils.test",
                        "class": "FilesystemTestScheduler",
                        "config": {"base_dir": temp_dir},
                    }
                },
                overrides if overrides else {},
            ),
        ) as instance:
            yield instance

def test_map_empty(run_config):
    with instance_for_test() as instance:
        result = execute_pipeline(
            reconstructable(dynamic_pipeline),
            instance=instance,
            run_config=merge_dicts({"solids": {"num_range": {"config": {"range": 0}}}}, run_config),
        )
        assert result.success
        assert result.result_for_solid("double_total").output_value() == 0

def test_execute_on_celery_k8s_with_env_var_and_termination(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, set_dagster_k8s_pipeline_run_namespace_env
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image,
            job_namespace={"env": "DAGSTER_K8S_PIPELINE_RUN_NAMESPACE"},
        ),
    )

    _test_termination(dagster_instance, run_config)

def _get_existing_run_for_request(instance, external_schedule, schedule_time, run_request):
    tags = merge_dicts(
        PipelineRun.tags_for_schedule(external_schedule),
        {
            SCHEDULED_EXECUTION_TIME_TAG: schedule_time.in_tz("UTC").isoformat(),
        },
    )
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key
    runs_filter = PipelineRunsFilter(tags=tags)
    existing_runs = instance.get_runs(runs_filter)
    if not len(existing_runs):
        return None
    return existing_runs[0]

def celery_docker_config():
    additional_config = {
        'docker': Field(
            {
                'image': Field(
                    StringSource,
                    is_required=True,
                    description='The docker image to be used for step execution.',
                ),
                'registry': Field(
                    {
                        'url': Field(StringSource),
                        'username': Field(StringSource),
                        'password': Field(StringSource),
                    },
                    is_required=False,
                    description='Information for using a non local/public docker registry',
                ),
                'env_vars': Field(
                    [str],
                    is_required=False,
                    description='The list of environment variable names to forward from the celery worker into the docker container',
                ),
            },
            is_required=True,
            description='The configuration for interacting with docker in the celery worker.',
        ),
        'repo_location_name': Field(
            StringSource,
            is_required=False,
            default_value=IN_PROCESS_NAME,
            description='[temporary workaround] The repository location name to use for execution.',
        ),
    }

    cfg = merge_dicts(CELERY_CONFIG, additional_config)
    return cfg

def _create_scheduler_run(
    instance,
    schedule_time,
    repo_location,
    external_schedule,
    external_pipeline,
    run_request,
):
    run_config = run_request.run_config
    schedule_tags = run_request.tags

    external_execution_plan = repo_location.get_external_execution_plan(
        external_pipeline,
        run_config,
        external_schedule.mode,
        step_keys_to_execute=None,
        known_state=None,
    )
    execution_plan_snapshot = external_execution_plan.execution_plan_snapshot

    pipeline_tags = external_pipeline.tags or {}
    check_tags(pipeline_tags, "pipeline_tags")
    tags = merge_dicts(pipeline_tags, schedule_tags)

    tags[SCHEDULED_EXECUTION_TIME_TAG] = to_timezone(schedule_time, "UTC").isoformat()
    if run_request.run_key:
        tags[RUN_KEY_TAG] = run_request.run_key

    return instance.create_run(
        pipeline_name=external_schedule.pipeline_name,
        run_id=None,
        run_config=run_config,
        mode=external_schedule.mode,
        solids_to_execute=external_pipeline.solids_to_execute,
        step_keys_to_execute=None,
        solid_selection=external_pipeline.solid_selection,
        status=PipelineRunStatus.NOT_STARTED,
        root_run_id=None,
        parent_run_id=None,
        tags=tags,
        pipeline_snapshot=external_pipeline.pipeline_snapshot,
        execution_plan_snapshot=execution_plan_snapshot,
        parent_pipeline_snapshot=external_pipeline.parent_pipeline_snapshot,
        external_pipeline_origin=external_pipeline.get_external_origin(),
        pipeline_code_origin=external_pipeline.get_python_origin(),
    )

def start_scheduled_execution(graphene_info, schedule_name):
    from dagster_graphql.schema.roots import create_execution_metadata

    check.inst_param(graphene_info, 'graphene_info', ResolveInfo)
    check.str_param(schedule_name, 'schedule_name')

    schedule = get_dagster_schedule(graphene_info, schedule_name)
    schedule_def = get_dagster_schedule_def(graphene_info, schedule_name)
    schedule_context = ScheduleExecutionContext(graphene_info.context.instance)

    # Run should_execute and halt if it returns False
    if not schedule_def.should_execute(schedule_context):
        return graphene_info.schema.type_named('ScheduledExecutionBlocked')(
            message='Schedule {schedule_name} did not run because the should_execute did not return'
            ' True'.format(schedule_name=schedule_name)
        )

    # Get environment_dict
    environment_dict = schedule_def.get_environment_dict(schedule_context)
    tags = schedule_def.get_tags(schedule_context)

    check.invariant('dagster/schedule_id' not in tags)
    tags['dagster/schedule_id'] = schedule.schedule_id

    check.invariant('dagster/schedule_name' not in tags)
    tags['dagster/schedule_name'] = schedule_def.name

    execution_metadata_tags = [{'key': key, 'value': value} for key, value in tags.items()]
    execution_params = merge_dicts(
        schedule_def.execution_params, {'executionMetadata': {'tags': execution_metadata_tags}}
    )

    selector = ExecutionSelector(
        execution_params['selector']['name'], execution_params['selector'].get('solidSubset')
    )

    execution_params = ExecutionParams(
        selector=selector,
        environment_dict=environment_dict,
        mode=execution_params.get('mode'),
        execution_metadata=create_execution_metadata(execution_params.get('executionMetadata')),
        step_keys=execution_params.get('stepKeys'),
        previous_run_id=None,
    )

    return start_pipeline_execution(graphene_info, execution_params)

def test_success_whole_execution_plan_with_in_memory_config(graphql_context, snapshot):
    instance = graphql_context.instance
    environment_dict = merge_dicts(
        csv_hello_world_solids_config(), {'storage': {'in_memory': {}}}
    )
    pipeline_run = instance.create_run_for_pipeline(
        pipeline_def=csv_hello_world, environment_dict=environment_dict
    )
    result = execute_dagster_graphql(
        graphql_context,
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': environment_dict,
                'stepKeys': None,
                'executionMetadata': {'runId': pipeline_run.run_id},
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']
    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False

    step_events = {
        step_event['step']['key']: step_event
        for step_event in query_result['stepEvents']
        if step_event['step']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, pipeline_run.run_id)
    intermediates_manager = IntermediateStoreIntermediatesManager(store)
    assert not intermediates_manager.has_intermediate(None, StepOutputHandle('sum_solid.compute'))
    assert not intermediates_manager.has_intermediate(
        None, StepOutputHandle('sum_sq_solid.compute')
    )

def frequent_celery():
    from dagster_celery_k8s.config import get_celery_engine_config

    additional_env_config_maps = ["test-aws-env-configmap"] if not IS_BUILDKITE else []

    return merge_dicts(
        merge_yamls(
            [
                file_relative_path(__file__, os.path.join("..", "environments", "env.yaml")),
                file_relative_path(__file__, os.path.join("..", "environments", "env_s3.yaml")),
            ]
        ),
        get_celery_engine_config(
            image_pull_policy=image_pull_policy(),
            additional_env_config_maps=additional_env_config_maps,
        ),
    )

def test_map_selection(run_config):
    with instance_for_test() as instance:
        result = execute_pipeline(
            reconstructable(dynamic_pipeline),
            instance=instance,
            run_config=merge_dicts({"solids": {"emit": {"inputs": {"num": 2}}}}, run_config),
            solid_selection=["emit*", "emit_ten"],
        )
        assert result.success
        assert result.result_for_solid("double_total").output_value() == 40

def __new__(
    cls,
    pipeline_name,
    key_suffix,
    step_inputs,
    step_outputs,
    compute_fn,
    kind,
    solid_handle,
    solid,
    logging_tags=None,
    tags=None,
    hook_defs=None,
):
    return super(ExecutionStep, cls).__new__(
        cls,
        pipeline_name=check.str_param(pipeline_name, 'pipeline_name'),
        key_suffix=check.str_param(key_suffix, 'key_suffix'),
        step_inputs=check.list_param(step_inputs, 'step_inputs', of_type=StepInput),
        step_input_dict={si.name: si for si in step_inputs},
        step_outputs=check.list_param(step_outputs, 'step_outputs', of_type=StepOutput),
        step_output_dict={so.name: so for so in step_outputs},
        compute_fn=check.callable_param(compute_fn, 'compute_fn'),
        kind=check.inst_param(kind, 'kind', StepKind),
        solid_handle=check.inst_param(solid_handle, 'solid_handle', SolidHandle),
        logging_tags=merge_dicts(
            {
                'step_key': str(solid_handle) + '.' + key_suffix,
                'pipeline': pipeline_name,
                'solid': solid_handle.name,
                'solid_definition': solid.definition.name,
            },
            check.opt_dict_param(logging_tags, 'logging_tags'),
        ),
        tags=check.opt_inst_param(tags, 'tags', frozentags),
        hook_defs=check.opt_set_param(hook_defs, 'hook_defs', of_type=HookDefinition),
    )

def dagster_instance_config(
    base_dir, config_filename=DAGSTER_CONFIG_YAML_FILENAME, overrides=None
):
    overrides = check.opt_dict_param(overrides, 'overrides')
    dagster_config_dict = merge_dicts(
        load_yaml_from_globs(os.path.join(base_dir, config_filename)), overrides
    )
    dagster_config_type = resolve_to_config_type(define_dagster_config_cls())
    dagster_config = validate_config(dagster_config_type, dagster_config_dict)
    if not dagster_config.success:
        raise DagsterInvalidConfigError(
            'Errors whilst loading dagster instance config at {}.'.format(config_filename),
            dagster_config.errors,
            dagster_config_dict,
        )
    return dagster_config.value

def celery_docker_config():
    additional_config = {
        "docker": Field(
            {
                "image": Field(
                    StringSource,
                    is_required=False,
                    description="The docker image to be used for step execution.",
                ),
                "registry": Field(
                    {
                        "url": Field(StringSource),
                        "username": Field(StringSource),
                        "password": Field(StringSource),
                    },
                    is_required=False,
                    description="Information for using a non local/public docker registry",
                ),
                "env_vars": Field(
                    [str],
                    is_required=False,
                    description="The list of environment variable names to forward from the celery worker into the docker container",
                ),
                "network": Field(
                    str,
                    is_required=False,
                    description="Name of the network this container will be connected to at creation time",
                ),
            },
            is_required=True,
            description="The configuration for interacting with docker in the celery worker.",
        ),
    }

    cfg = merge_dicts(CELERY_CONFIG, additional_config)
    return cfg

def default_instance_tempdir(temp_dir, overrides=None):
    default_overrides = {
        "run_launcher": {
            "module": "dagster_tests.cli_tests.command_tests.test_cli_commands",
            "class": "InMemoryRunLauncher",
        }
    }
    with instance_for_test_tempdir(
        temp_dir, overrides=merge_dicts(default_overrides, (overrides if overrides else {}))
    ) as instance:
        with mock.patch("dagster.core.instance.DagsterInstance.get") as _instance:
            _instance.return_value = instance
            yield instance

def test_success_whole_execution_plan_with_in_memory_config(snapshot):
    run_id = make_new_run_id()
    instance = DagsterInstance.ephemeral()
    instance.create_empty_run(run_id, 'csv_hello_world')
    result = execute_dagster_graphql(
        define_test_context(instance=instance),
        EXECUTE_PLAN_QUERY,
        variables={
            'executionParams': {
                'selector': {'name': 'csv_hello_world'},
                'environmentConfigData': merge_dicts(
                    csv_hello_world_solids_config(), {'storage': {'in_memory': {}}}
                ),
                'stepKeys': None,
                'executionMetadata': {'runId': run_id},
                'mode': 'default',
            }
        },
    )

    query_result = result.data['executePlan']
    assert query_result['__typename'] == 'ExecutePlanSuccess'
    assert query_result['pipeline']['name'] == 'csv_hello_world'
    assert query_result['hasFailures'] is False

    step_events = {
        step_event['step']['key']: step_event
        for step_event in query_result['stepEvents']
        if step_event['step']
    }
    assert 'sum_solid.compute' in step_events
    assert 'sum_sq_solid.compute' in step_events

    snapshot.assert_match(clean_log_messages(result.data))

    store = build_fs_intermediate_store(instance.intermediates_directory, run_id)
    assert not store.has_intermediate(None, 'sum_solid.compute')
    assert not store.has_intermediate(None, 'sum_sq_solid.compute')

def test_k8s_run_launcher_with_celery_executor_fails(
    dagster_docker_image,
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagit_url_for_k8s_run_launcher,
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env.yaml"),
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image,
            job_namespace=helm_namespace_for_k8s_run_launcher,
        ),
    )

    pipeline_name = "demo_pipeline_celery"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher, run_config=run_config, pipeline_name=pipeline_name
    )

    timeout = datetime.timedelta(0, 120)
    start_time = datetime.datetime.now()

    while True:
        assert (
            datetime.datetime.now() < start_time + timeout
        ), "Timed out waiting for pipeline failure"

        event_records = dagster_instance_for_k8s_run_launcher.all_logs(run_id)

        found_pipeline_failure = False
        for event_record in event_records:
            if event_record.dagster_event:
                if event_record.dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
                    found_pipeline_failure = True

        if found_pipeline_failure:
            break

        time.sleep(5)

    assert (
        dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id).status
        == PipelineRunStatus.FAILURE
    )