def submit_run(self, pipeline_run, external_pipeline):
    """Enqueue a run for later launch and return it.

    Args:
        pipeline_run (PipelineRun): The run to submit. Must carry an
            ExternalPipelineOrigin so it can be launched out of process.
        external_pipeline (Optional[ExternalPipeline]): Handle to the external
            pipeline, if available.

    Returns:
        PipelineRun: The run that was appended to the queue.
    """
    # Fix: the param name reported to check must match the actual argument name
    # ("pipeline_run", not "run") so validation failures reference the right arg.
    check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    check.opt_inst_param(external_pipeline, "external_pipeline", ExternalPipeline)
    # The origin is required for out-of-process launching of the queued run.
    check.inst(pipeline_run.external_pipeline_origin, ExternalPipelineOrigin)
    self._queue.append(pipeline_run)
    return pipeline_run
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    """Execute the plan's steps serially, in process, yielding DagsterEvents.

    Emits an engine_event before and after execution, a step_skipped_event for
    any step whose upstream dependencies failed or whose inputs are uncovered,
    and forwards every event produced by each executed step.

    Args:
        pipeline_context (SystemPipelineExecutionContext): active execution context.
        execution_plan (ExecutionPlan): plan whose topological step levels are run.
        step_keys_to_execute (Optional[List[str]]): if provided, restrict
            execution to only these step keys.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    step_key_set = None if step_keys_to_execute is None else set(step_keys_to_execute)

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Executing steps in process (pid: {pid})'.format(pid=os.getpid()),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )

    with time_execution_scope() as timer_result:
        check.param_invariant(
            isinstance(pipeline_context.executor_config, ExecutorConfig),
            'pipeline_context',
            'Expected executor_config to be ExecutorConfig got {}'.format(
                pipeline_context.executor_config),
        )

        # Keys of steps that failed or were skipped; consulted to cascade skips
        # to downstream steps whose inputs depend on them.
        failed_or_skipped_steps = set()

        step_levels = execution_plan.topological_step_levels()

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        for step_level in step_levels:
            for step in step_level:
                if step_key_set and step.key not in step_key_set:
                    continue

                step_context = pipeline_context.for_step(step)

                # Collect upstream dependency keys that already failed/skipped.
                failed_inputs = []
                for step_input in step.step_inputs:
                    failed_inputs.extend(
                        failed_or_skipped_steps.intersection(step_input.dependency_keys))

                if failed_inputs:
                    step_context.log.info((
                        'Dependencies for step {step} failed: {failed_inputs}. Not executing.'
                    ).format(step=step.key, failed_inputs=failed_inputs))
                    failed_or_skipped_steps.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                    continue

                uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                    step_context, step)
                if uncovered_inputs:
                    # In partial pipeline execution, we may end up here without having validated the
                    # missing dependent outputs were optional
                    _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                    step_context.log.info((
                        'Not all inputs covered for {step}. Not executing. Output missing for '
                        'inputs: {uncovered_inputs}').format(
                            uncovered_inputs=uncovered_inputs, step=step.key))
                    failed_or_skipped_steps.add(step.key)
                    yield DagsterEvent.step_skipped_event(step_context)
                    continue

                # Execute the step and forward every event it produces,
                # recording failures so downstream steps get skipped.
                for step_event in check.generator(
                        dagster_event_sequence_for_step(step_context)):
                    check.inst(step_event, DagsterEvent)
                    if step_event.is_step_failure:
                        failed_or_skipped_steps.add(step.key)
                    yield step_event

    yield DagsterEvent.engine_event(
        pipeline_context,
        'Finished steps in process (pid: {pid}) in {duration_ms}'.format(
            pid=os.getpid(), duration_ms=format_duration(timer_result.millis)),
        event_specific_data=EngineEventData.in_process(os.getpid(), step_key_set),
    )
def event_generator(
    self,
    execution_plan,
    run_config,
    pipeline_run,
    instance,
    scoped_resources_builder_cm,
    intermediate_storage=None,
    raise_on_error=False,
    resource_instances_to_override=None,
    output_capture=None,
):
    """Yield resource setup events, the execution context, then teardown events.

    Initializes resources via scoped_resources_builder_cm, constructs the
    execution context (via self.construct_context), validates the plan against
    it, and yields the context to the caller. On DagsterError during
    initialization, emits a pipeline_init_failure event and still attempts
    resource teardown; a failure after the context exists is re-raised.

    Args:
        execution_plan (ExecutionPlan): plan to execute.
        run_config (dict): run configuration, string-keyed.
        pipeline_run (PipelineRun): run record for this execution.
        instance (DagsterInstance): instance to emit events against.
        scoped_resources_builder_cm (callable): context-manager factory that
            yields resource setup/teardown events and a ScopedResourcesBuilder.
        intermediate_storage (Optional[IntermediateStorage]): pre-built
            intermediate storage, if any.
        raise_on_error (bool): re-raise DagsterErrors instead of only emitting
            failure events.
        resource_instances_to_override (Optional[dict]): resources to inject
            in place of configured ones.
        output_capture: passed through to the constructed context.
    """
    execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    pipeline_def = execution_plan.pipeline.get_definition()

    run_config = check.dict_param(run_config, "run_config", key_type=str)
    pipeline_run = check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    instance = check.inst_param(instance, "instance", DagsterInstance)

    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, "scoped_resources_builder_cm")
    intermediate_storage = check.opt_inst_param(
        intermediate_storage, "intermediate_storage_data", IntermediateStorage)
    raise_on_error = check.bool_param(raise_on_error, "raise_on_error")
    resource_instances_to_override = check.opt_dict_param(
        resource_instances_to_override, "resource_instances_to_override")

    execution_context = None
    resources_manager = None

    try:
        context_creation_data = create_context_creation_data(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
        )

        log_manager = create_log_manager(context_creation_data)
        resource_defs = execution_plan.pipeline_def.get_mode_definition(
            context_creation_data.environment_config.mode).resource_defs

        resources_manager = scoped_resources_builder_cm(
            resource_defs=resource_defs,
            resource_configs=context_creation_data.environment_config.resources,
            log_manager=log_manager,
            execution_plan=execution_plan,
            pipeline_run=context_creation_data.pipeline_run,
            resource_keys_to_init=context_creation_data.resource_keys_to_init,
            instance=instance,
            resource_instances_to_override=resource_instances_to_override,
            emit_persistent_events=True,
        )
        # Forward resource initialization events before the context is usable.
        yield from resources_manager.generate_setup_events()
        scoped_resources_builder = check.inst(
            resources_manager.get_object(), ScopedResourcesBuilder)

        intermediate_storage = create_intermediate_storage(
            context_creation_data,
            intermediate_storage,
            scoped_resources_builder,
        )

        execution_context = self.construct_context(
            context_creation_data=context_creation_data,
            scoped_resources_builder=scoped_resources_builder,
            log_manager=log_manager,
            intermediate_storage=intermediate_storage,
            raise_on_error=raise_on_error,
            output_capture=output_capture,
        )

        _validate_plan_with_context(execution_context, execution_plan)

        yield execution_context
        yield from resources_manager.generate_teardown_events()
    except DagsterError as dagster_error:
        # execution_context is None means the failure happened during context
        # creation (init); otherwise it is a teardown failure.
        if execution_context is None:
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists is is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error else sys.exc_info())
            error_info = serializable_error_info_from_exc_info(user_facing_exc_info)

            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(
                    instance, pipeline_run, pipeline_def),
            )
            # Still tear down any resources that were initialized before the failure.
            if resources_manager:
                yield from resources_manager.generate_teardown_events()
        else:  # pipeline teardown failure
            raise dagster_error

        if raise_on_error:
            raise dagster_error
def _execute_step_k8s_job(
    self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod.

    Creates a Kubernetes Job that runs exactly one execution-plan step via the
    `dagster api execute_step_with_structured_logs` CLI, waits for the job to
    finish, scrapes its pod logs for Dagster events, and returns the events
    serialized as strings (Celery will serialize the return as a list). On any
    error path, an engine event is reported to the instance and an empty list
    is returned instead of raising.
    """
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.invariant(
        len(step_keys) == 1, "Celery K8s task executor can only execute 1 step at a time")
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.str_param(repo_name, "repo_name")
    check.str_param(repo_location_name, "repo_location_name")
    check.str_param(run_id, "run_id")

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")
    check.bool_param(load_incluster_config, "load_incluster_config")
    check.dict_param(retries_dict, "retries_dict")

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, "pipeline_origin_packed")  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict)
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)

    check.invariant(pipeline_run, "Could not load run {}".format(run_id))
    # Validated above: step_keys has exactly one entry.
    step_key = step_keys[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData([
            EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
            EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
        ]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    # Bail out early if the run has been terminated/finished in the meantime.
    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(step_key, "Step key"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    retries = Retries.from_config(retries_dict)

    # Suffix the job/pod names with the attempt number on retries so names
    # stay unique across attempts.
    if retries.get_attempt_count(step_key):
        attempt_number = retries.get_attempt_count(step_key)
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
        pod_name = "dagster-job-%s" % (k8s_name_key)

    # The job pod rehydrates its own instance from the env, so instance_ref=None here.
    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        ))
    command = ["dagster"]
    args = ["api", "execute_step_with_structured_logs", input_json]

    job = construct_dagster_k8s_job(job_config, command, args, job_name,
                                    user_defined_k8s_config, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                EventMetadataEntry.text(str(job_config.image_pull_secrets), "Image pull secrets"),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(
            body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name so do not procede.
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                        EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(str(e), "Error"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return []

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        # Run was terminated while we were waiting; clean up the k8s job.
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return []
    except kubernetes.client.rest.ApiException as e:
        instance.report_engine_event(
            "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(str(e), "Error"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    try:
        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        instance.report_engine_event(
            "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(str(e), "Error"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData(
            [EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        try:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")
        except kubernetes.client.rest.ApiException as e:
            # Best-effort: a single pod's log failure should not abort the rest.
            instance.report_engine_event(
                "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                "Pod name {} for step {}. Will attempt to continue with other pods."
                .format(job_name, pod_name, step_key),
                pipeline_run,
                EngineEventData([
                    EventMetadataEntry.text(step_key, "Step key"),
                    EventMetadataEntry.text(str(e), "Error"),
                ]),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [
        serialize_dagster_namedtuple(event) for event in events
    ]
    return serialized_events
def execution_context_event_generator(
    pipeline,
    execution_plan,
    run_config,
    pipeline_run,
    instance,
    retry_mode,
    scoped_resources_builder_cm=None,
    intermediate_storage=None,
    raise_on_error=False,
    output_capture=None,
):
    """Yield resource setup events, a PlanExecutionContext, then teardown events.

    Initializes resources (defaulting to resource_initialization_manager),
    builds intermediate storage and a PlanExecutionContext, validates the plan
    against the context, and yields the context between the resource setup and
    teardown event streams.

    Args:
        pipeline: pipeline object providing get_definition().
        execution_plan (ExecutionPlan): plan to execute.
        run_config (dict): run configuration, string-keyed.
        pipeline_run (PipelineRun): run record for this execution.
        instance (DagsterInstance): instance to emit events against.
        retry_mode: retry policy, folded into the plan data.
        scoped_resources_builder_cm (Optional[callable]): context-manager
            factory for resource initialization; defaults to
            resource_initialization_manager.
        intermediate_storage (Optional[IntermediateStorage]): pre-built
            intermediate storage, if any.
        raise_on_error (bool): propagated into the plan data.
        output_capture: passed through to the PlanExecutionContext.
    """
    scoped_resources_builder_cm = check.opt_callable_param(
        scoped_resources_builder_cm,
        "scoped_resources_builder_cm",
        default=resource_initialization_manager,
    )
    execution_plan = check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    pipeline_def = pipeline.get_definition()

    run_config = check.dict_param(run_config, "run_config", key_type=str)
    pipeline_run = check.inst_param(pipeline_run, "pipeline_run", PipelineRun)
    instance = check.inst_param(instance, "instance", DagsterInstance)

    intermediate_storage = check.opt_inst_param(
        intermediate_storage, "intermediate_storage_data", IntermediateStorage)
    raise_on_error = check.bool_param(raise_on_error, "raise_on_error")

    context_creation_data = create_context_creation_data(
        pipeline,
        execution_plan,
        run_config,
        pipeline_run,
        instance,
    )

    log_manager = create_log_manager(context_creation_data)
    resource_defs = pipeline_def.get_mode_definition(
        context_creation_data.environment_config.mode).resource_defs

    resources_manager = scoped_resources_builder_cm(
        resource_defs=resource_defs,
        resource_configs=context_creation_data.environment_config.resources,
        log_manager=log_manager,
        execution_plan=execution_plan,
        pipeline_run=context_creation_data.pipeline_run,
        resource_keys_to_init=context_creation_data.resource_keys_to_init,
        instance=instance,
        emit_persistent_events=True,
        pipeline_def_for_backwards_compat=pipeline_def,
    )
    # Forward resource initialization events before the context is usable.
    yield from resources_manager.generate_setup_events()
    scoped_resources_builder = check.inst(resources_manager.get_object(),
                                          ScopedResourcesBuilder)

    intermediate_storage = create_intermediate_storage(
        context_creation_data,
        intermediate_storage,
        scoped_resources_builder,
    )

    execution_context = PlanExecutionContext(
        plan_data=create_plan_data(context_creation_data, raise_on_error, retry_mode),
        execution_data=create_execution_data(context_creation_data,
                                             scoped_resources_builder,
                                             intermediate_storage),
        log_manager=log_manager,
        output_capture=output_capture,
    )

    _validate_plan_with_context(execution_context, execution_plan)

    yield execution_context
    yield from resources_manager.generate_teardown_events()
def command(input_file, output_file):
    """Read a unary input from *input_file*, apply the wrapped fn, and write
    the unary response to *output_file*.

    Input and output are type-checked against the closed-over input_cls /
    output_cls before use.
    """
    parsed_input = check.inst(read_unary_input(input_file), input_cls)
    response = check.inst(fn(parsed_input), output_cls)
    ipc_write_unary_response(output_file, response)
def from_dagster_event_record(graphene_info, event_record, dauphin_pipeline,
                              execution_plan):
    """Translate a Dagster EventRecord into its GraphQL (Dauphin) event type.

    Dispatches on event_record.dagster_event.event_type to the corresponding
    schema type, attaching event-specific fields on top of the basic params.

    Args:
        graphene_info: GraphQL resolution info providing the schema.
        event_record (EventRecord): must be a dagster event record.
        dauphin_pipeline (Optional): schema 'Pipeline' object for pipeline-level events.
        execution_plan (Optional[ExecutionPlan]): plan used for basic params.

    Raises:
        Exception: if the event type is not one of the handled DagsterEventTypes.
    """
    # Lots of event types. Pylint thinks there are too many branches
    # pylint: disable=too-many-branches
    check.inst_param(event_record, 'event_record', EventRecord)
    check.param_invariant(event_record.is_dagster_event, 'event_record')
    check.opt_inst_param(dauphin_pipeline, 'dauphin_pipeline',
                         graphene_info.schema.type_named('Pipeline'))
    check.opt_inst_param(execution_plan, 'execution_plan', ExecutionPlan)

    dagster_event = event_record.dagster_event
    basic_params = construct_basic_params(graphene_info, event_record, execution_plan)
    if dagster_event.event_type == DagsterEventType.STEP_START:
        return graphene_info.schema.type_named('ExecutionStepStartEvent')(
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SKIPPED:
        return graphene_info.schema.type_named('ExecutionStepSkippedEvent')(
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SUCCESS:
        return graphene_info.schema.type_named('ExecutionStepSuccessEvent')(
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_INPUT:
        input_data = dagster_event.event_specific_data
        return graphene_info.schema.type_named('ExecutionStepInputEvent')(
            input_name=input_data.input_name,
            type_check=input_data.type_check_data,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_OUTPUT:
        output_data = dagster_event.step_output_data
        return graphene_info.schema.type_named('ExecutionStepOutputEvent')(
            output_name=output_data.output_name,
            type_check=output_data.type_check_data,
            # parens make black not put trailing commas, which in turn break py27
            # fmt: off
            **(basic_params)
            # fmt: on
        )
    elif dagster_event.event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = dagster_event.step_materialization_data.materialization
        return graphene_info.schema.type_named('StepMaterializationEvent')(
            materialization=materialization, **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = dagster_event.event_specific_data.expectation_result
        return graphene_info.schema.type_named('StepExpectationResultEvent')(
            expectation_result=expectation_result, **(basic_params))
    elif dagster_event.event_type == DagsterEventType.STEP_FAILURE:
        check.inst(dagster_event.step_failure_data, StepFailureData)
        return graphene_info.schema.type_named('ExecutionStepFailureEvent')(
            error=graphene_info.schema.type_named('PythonError')(
                dagster_event.step_failure_data.error),
            # parens make black not put trailing commas, which in turn break py27
            # fmt: off
            **(basic_params)
            # fmt: on
        )
    elif dagster_event.event_type == DagsterEventType.PIPELINE_START:
        return graphene_info.schema.type_named('PipelineStartEvent')(
            pipeline=dauphin_pipeline, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_SUCCESS:
        return graphene_info.schema.type_named('PipelineSuccessEvent')(
            pipeline=dauphin_pipeline, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
        return graphene_info.schema.type_named('PipelineFailureEvent')(
            pipeline=dauphin_pipeline, **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_PROCESS_START:
        process_data = dagster_event.pipeline_process_start_data
        return graphene_info.schema.type_named('PipelineProcessStartEvent')(
            pipeline=dauphin_pipeline,
            pipeline_name=process_data.pipeline_name,
            run_id=process_data.run_id,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_PROCESS_STARTED:
        process_data = dagster_event.pipeline_process_started_data
        return graphene_info.schema.type_named('PipelineProcessStartedEvent')(
            pipeline=dauphin_pipeline,
            process_id=process_data.process_id,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_PROCESS_EXITED:
        process_data = dagster_event.pipeline_process_exited_data
        return graphene_info.schema.type_named('PipelineProcessExitedEvent')(
            pipeline=dauphin_pipeline,
            process_id=process_data.process_id,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_INIT_FAILURE:
        return graphene_info.schema.type_named('PipelineInitFailureEvent')(
            pipeline=dauphin_pipeline,
            error=graphene_info.schema.type_named('PythonError')(
                dagster_event.pipeline_init_failure_data.error),
            # parens make black not put trailing commas, which in turn break py27
            # fmt: off
            **(basic_params)
            # fmt: on
        )
    elif dagster_event.event_type == DagsterEventType.OBJECT_STORE_OPERATION:
        operation_result = dagster_event.event_specific_data
        return graphene_info.schema.type_named('ObjectStoreOperationEvent')(
            operation_result=operation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.ENGINE_EVENT:
        return graphene_info.schema.type_named('EngineEvent')(
            metadataEntries=_to_dauphin_metadata_entries(
                dagster_event.event_specific_data.metadata_entries),
            **basic_params)
    else:
        raise Exception(
            'Unknown DAGSTER_EVENT type {inner_type} found in logs'.format(
                inner_type=dagster_event.event_type))
def from_dagster_event_record(event_record, pipeline_name):
    """Translate a Dagster EventRecord into the corresponding Dauphin event.

    Dispatches on event_record.dagster_event.event_type, attaching
    event-specific fields (errors, metadata, input/output names, etc.) on top
    of the basic params.

    Args:
        event_record (EventRecord): must be a dagster event record.
        pipeline_name (str): name attached to pipeline-level events.

    Raises:
        Exception: if the event type is not one of the handled DagsterEventTypes.
    """
    # Lots of event types. Pylint thinks there are too many branches
    # pylint: disable=too-many-branches
    check.inst_param(event_record, "event_record", EventRecord)
    check.param_invariant(event_record.is_dagster_event, "event_record")
    check.str_param(pipeline_name, "pipeline_name")

    # circular ref at module scope
    from .errors import DauphinPythonError

    dagster_event = event_record.dagster_event
    basic_params = construct_basic_params(event_record)
    if dagster_event.event_type == DagsterEventType.STEP_START:
        return DauphinExecutionStepStartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SKIPPED:
        return DauphinExecutionStepSkippedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_UP_FOR_RETRY:
        return DauphinExecutionStepUpForRetryEvent(
            error=dagster_event.step_retry_data.error,
            secondsToWait=dagster_event.step_retry_data.seconds_to_wait,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.STEP_RESTARTED:
        return DauphinExecutionStepRestartEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_SUCCESS:
        return DauphinExecutionStepSuccessEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_INPUT:
        input_data = dagster_event.event_specific_data
        return DauphinExecutionStepInputEvent(
            input_name=input_data.input_name,
            type_check=input_data.type_check_data,
            **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_OUTPUT:
        output_data = dagster_event.step_output_data
        return DauphinExecutionStepOutputEvent(
            output_name=output_data.output_name,
            type_check=output_data.type_check_data,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.STEP_MATERIALIZATION:
        materialization = dagster_event.step_materialization_data.materialization
        return DauphinStepMaterializationEvent(materialization=materialization,
                                               **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        expectation_result = dagster_event.event_specific_data.expectation_result
        return DauphinStepExpectationResultEvent(
            expectation_result=expectation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.STEP_FAILURE:
        check.inst(dagster_event.step_failure_data, StepFailureData)
        return DauphinExecutionStepFailureEvent(
            error=DauphinPythonError(dagster_event.step_failure_data.error),
            failureMetadata=dagster_event.step_failure_data.user_failure_data,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.PIPELINE_ENQUEUED:
        return DauphinPipelineEnqueuedEvent(pipelineName=pipeline_name,
                                            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_DEQUEUED:
        return DauphinPipelineDequeuedEvent(pipelineName=pipeline_name,
                                            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_STARTING:
        return DauphinPipelineStartingEvent(pipelineName=pipeline_name,
                                            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_CANCELING:
        return DauphinPipelineCancelingEvent(pipelineName=pipeline_name,
                                             **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_CANCELED:
        return DauphinPipelineCanceledEvent(pipelineName=pipeline_name,
                                            **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_START:
        return DauphinPipelineStartEvent(pipelineName=pipeline_name,
                                         **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_SUCCESS:
        return DauphinPipelineSuccessEvent(pipelineName=pipeline_name,
                                           **basic_params)
    elif dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
        return DauphinPipelineFailureEvent(
            pipelineName=pipeline_name,
            # pipeline_failure_data and its error may both be absent.
            error=DauphinPythonError(dagster_event.pipeline_failure_data.error)
            if (dagster_event.pipeline_failure_data
                and dagster_event.pipeline_failure_data.error) else None,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.PIPELINE_INIT_FAILURE:
        return DauphinPipelineInitFailureEvent(
            pipelineName=pipeline_name,
            error=DauphinPythonError(
                dagster_event.pipeline_init_failure_data.error),
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.HANDLED_OUTPUT:
        return DauphinHandledOutputEvent(
            output_name=dagster_event.event_specific_data.output_name,
            manager_key=dagster_event.event_specific_data.manager_key,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.LOADED_INPUT:
        return DauphinLoadedInputEvent(
            input_name=dagster_event.event_specific_data.input_name,
            manager_key=dagster_event.event_specific_data.manager_key,
            upstream_output_name=dagster_event.event_specific_data.upstream_output_name,
            upstream_step_key=dagster_event.event_specific_data.upstream_step_key,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.OBJECT_STORE_OPERATION:
        operation_result = dagster_event.event_specific_data
        return DauphinObjectStoreOperationEvent(
            operation_result=operation_result, **basic_params)
    elif dagster_event.event_type == DagsterEventType.ENGINE_EVENT:
        return DauphinEngineEvent(
            metadataEntries=_to_dauphin_metadata_entries(
                dagster_event.engine_event_data.metadata_entries),
            error=DauphinPythonError(dagster_event.engine_event_data.error)
            if dagster_event.engine_event_data.error else None,
            marker_start=dagster_event.engine_event_data.marker_start,
            marker_end=dagster_event.engine_event_data.marker_end,
            **basic_params,
        )
    elif dagster_event.event_type == DagsterEventType.HOOK_COMPLETED:
        return DauphinHookCompletedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.HOOK_SKIPPED:
        return DauphinHookSkippedEvent(**basic_params)
    elif dagster_event.event_type == DagsterEventType.HOOK_ERRORED:
        return DauphinHookErroredEvent(error=DauphinPythonError(
            dagster_event.hook_errored_data.error), **basic_params)
    else:
        raise Exception(
            "Unknown DAGSTER_EVENT type {inner_type} found in logs".format(
                inner_type=dagster_event.event_type))
def __init__(
    self,
    config,
    default_value=FIELD_NO_DEFAULT_PROVIDED,
    is_required=None,
    description=None,
):
    """Construct a config Field.

    Args:
        config: The config type (or something resolvable to one via
            self._resolve_config_arg).
        default_value: Explicit default. Must not be callable, must not be a
            raw python enum value for enum types, and must validate against
            the config type. Omitting it leaves the field with no explicit
            default (FIELD_NO_DEFAULT_PROVIDED sentinel).
        is_required (Optional[bool]): Whether the field must be provided. When
            None, it is inferred: the field is optional if the type is
            all-optional or a default was provided, and an implicit default is
            resolved from the type in that case.
        description (Optional[str]): Human-readable description.

    Raises:
        DagsterInvalidDefinitionError: python enum value passed as default for
            an enum config type.
        DagsterInvalidConfigError: default value (explicit or implicit) fails
            validation against the config type.
    """
    from .validate import validate_config
    from .post_process import resolve_defaults

    self.config_type = check.inst(self._resolve_config_arg(config), ConfigType)

    self.description = check.opt_str_param(description, "description")

    check.opt_bool_param(is_required, "is_required")

    if default_value != FIELD_NO_DEFAULT_PROVIDED:
        check.param_invariant(not (callable(default_value)), "default_value",
                              "default_value cannot be a callable")

    if is_required is True:
        check.param_invariant(
            default_value == FIELD_NO_DEFAULT_PROVIDED,
            "default_value",
            "required arguments should not specify default values",
        )

    self._default_value = default_value

    # check explicit default value
    if self.default_provided:
        if self.config_type.kind == ConfigTypeKind.ENUM and is_enum_value(
                default_value):
            # Fix: previous message was garbled ("passed into a python enum
            # value ... into of a config enum type", "represention").
            raise DagsterInvalidDefinitionError((
                "You have passed a python enum value as the default value "
                "of a config enum type {name}. You must pass in the underlying "
                "string representation as the default value. One of {value_set}."
            ).format(
                value_set=[
                    ev.config_value for ev in self.config_type.enum_values
                ],
                name=self.config_type.given_name,
            ))

        evr = validate_config(self.config_type, default_value)
        if not evr.success:
            raise DagsterInvalidConfigError(
                "Invalid default_value for Field.",
                evr.errors,
                default_value,
            )

    if is_required is None:
        is_optional = all_optional_type(
            self.config_type) or self.default_provided
        is_required = not is_optional

        # on implicitly optional - set the default value
        # by resolving the defaults of the type
        if not is_required and not self.default_provided:
            evr = resolve_defaults(self.config_type, None)
            if not evr.success:
                raise DagsterInvalidConfigError(
                    "Unable to resolve implicit default_value for Field.",
                    evr.errors,
                    None,
                )
            self._default_value = evr.value
    self._is_required = is_required
def load_repository_from_target_info(info):
    """Load the repository described by *info*, asserting the result is a
    RepositoryDefinition."""
    loadable = load_repository_object_from_target_info(info)
    return check.inst(loadable.fn(), RepositoryDefinition)
def inner_plan_execution_iterator(pipeline_context, execution_plan):
    """Drive the plan's active execution, yielding DagsterEvents.

    First forwards intermediate-copy events, then repeatedly pulls the next
    ready step from the plan's active execution, runs it under the compute log
    manager (capturing its logs), and forwards its events — or skips it when
    its inputs are uncovered. Finally forwards any skip events cascading from
    failures or uncovered inputs.

    Args:
        pipeline_context (SystemExecutionContext): active execution context.
        execution_plan (ExecutionPlan): plan to execute.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    retries = pipeline_context.retries

    for event in copy_required_intermediates_for_execution(
            pipeline_context, execution_plan):
        yield event

    # It would be good to implement a reference tracking algorithm here to
    # garbage collect results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    active_execution = execution_plan.start(retries=retries)
    while not active_execution.is_complete:
        step = active_execution.get_next_step()

        step_context = pipeline_context.for_step(step)

        missing_resources = [
            resource_key
            for resource_key in step_context.required_resource_keys
            if not hasattr(step_context.resources, resource_key)
        ]
        check.invariant(
            len(missing_resources) == 0,
            ('Expected step context for solid {solid_name} to have all required resources, but '
             'missing {missing_resources}.').format(
                 solid_name=step_context.solid.name,
                 missing_resources=missing_resources),
        )

        with pipeline_context.instance.compute_log_manager.watch(
                step_context.pipeline_run,
                step_context.step.key):  # capture all of the logs for this step

            uncovered_inputs = pipeline_context.intermediates_manager.uncovered_inputs(
                step_context, step)
            if uncovered_inputs:
                # In partial pipeline execution, we may end up here without having validated the
                # missing dependent outputs were optional
                _assert_missing_inputs_optional(uncovered_inputs,
                                                execution_plan, step.key)

                step_context.log.info((
                    'Not all inputs covered for {step}. Not executing. Output missing for '
                    'inputs: {uncovered_inputs}').format(
                        uncovered_inputs=uncovered_inputs, step=step.key))
                yield DagsterEvent.step_skipped_event(step_context)
                active_execution.mark_skipped(step.key)
            else:
                # Execute the step, feeding every event back into the active
                # execution so it can track completion state.
                for step_event in check.generator(
                        _dagster_event_sequence_for_step(step_context, retries)):
                    check.inst(step_event, DagsterEvent)
                    yield step_event
                    active_execution.handle_event(step_event)

            active_execution.verify_complete(pipeline_context, step.key)

        # process skips from failures or uncovered inputs
        for event in active_execution.skipped_step_events_iterator(
                pipeline_context):
            yield event
def StartRun(self, request, _context):
    """gRPC handler: launch a pipeline run in a subprocess and block until startup resolves.

    Returns a StartRunReply whose serialized StartRunResult reports whether the
    subprocess *started* successfully (not whether the run itself succeeded).
    On failure the run is removed from the executions map before returning.
    """
    if self._shutdown_once_executions_finish_event.is_set():
        # Refuse new work once a shutdown has been requested.
        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=False,
                    message=
                    "Tried to start a run on a server after telling it to shut down",
                    serializable_error_info=None,
                )))

    try:
        execute_run_args = check.inst(
            deserialize_json_to_dagster_namedtuple(
                request.serialized_execute_run_args),
            ExecuteExternalPipelineArgs,
        )
        run_id = execute_run_args.pipeline_run_id
        recon_pipeline = self._recon_pipeline_from_origin(
            execute_run_args.pipeline_origin)
    except:  # pylint: disable=bare-except
        # Deserialization/reconstruction failures are reported to the caller as a
        # failed StartRunResult rather than crashing the server.
        return api_pb2.StartRunReply(
            serialized_start_run_result=serialize_dagster_namedtuple(
                StartRunResult(
                    success=False,
                    message=None,
                    serializable_error_info=
                    serializable_error_info_from_exc_info(sys.exc_info()),
                )))

    event_queue = multiprocessing.Queue()
    termination_event = multiprocessing.Event()
    execution_process = multiprocessing.Process(
        target=start_run_in_subprocess,
        args=[
            request.serialized_execute_run_args,
            recon_pipeline,
            event_queue,
            termination_event,
        ],
    )

    # Register the process/termination event under the lock so Terminate/CanCancel
    # see a consistent view.
    with self._execution_lock:
        execution_process.start()
        self._executions[run_id] = (
            execution_process,
            execute_run_args.instance_ref,
        )
        self._termination_events[run_id] = termination_event

    success = None
    message = None
    serializable_error_info = None

    # Poll the subprocess's event queue until it signals startup success or failure.
    while success is None:
        time.sleep(EVENT_QUEUE_POLL_INTERVAL)
        # We use `get_nowait()` instead of `get()` so that we can handle the case where the
        # execution process has died unexpectedly -- `get()` would hang forever in that case
        try:
            dagster_event_or_ipc_error_message_or_done = event_queue.get_nowait()
        except queue.Empty:
            if not execution_process.is_alive():
                # subprocess died unexpectedly
                success = False
                message = (
                    "GRPC server: Subprocess for {run_id} terminated unexpectedly with "
                    "exit code {exit_code}".format(
                        run_id=run_id,
                        exit_code=execution_process.exitcode,
                    ))
                serializable_error_info = serializable_error_info_from_exc_info(
                    sys.exc_info())
        else:
            if isinstance(dagster_event_or_ipc_error_message_or_done,
                          StartRunInSubprocessSuccessful):
                success = True
            elif isinstance(dagster_event_or_ipc_error_message_or_done,
                            RunInSubprocessComplete):
                continue
            if isinstance(dagster_event_or_ipc_error_message_or_done,
                          IPCErrorMessage):
                success = False
                message = dagster_event_or_ipc_error_message_or_done.message
                serializable_error_info = (
                    dagster_event_or_ipc_error_message_or_done.
                    serializable_error_info)

    # Ensure that if the run failed, we remove it from the executions map before
    # returning so that CanCancel will never return True
    if not success:
        with self._execution_lock:
            self._clear_run(run_id)

    return api_pb2.StartRunReply(
        serialized_start_run_result=serialize_dagster_namedtuple(
            StartRunResult(
                success=success,
                message=message,
                serializable_error_info=serializable_error_info,
            )))
def pipeline_initialization_event_generator(
    execution_plan,
    run_config,
    pipeline_run,
    instance,
    scoped_resources_builder_cm,
    system_storage_data=None,
    intermediate_storage=None,
    raise_on_error=False,
):
    """Generator that builds the pipeline execution context and yields it to the caller.

    Yields resource setup events, then exactly one pipeline context, then resource
    teardown events. On a DagsterError before the context exists, yields a
    pipeline_init_failure event instead (and re-raises when raise_on_error).
    A failure after the context exists (teardown) is always re-raised.
    """
    execution_plan = check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    pipeline_def = execution_plan.pipeline.get_definition()

    run_config = check.dict_param(run_config, 'run_config', key_type=str)
    pipeline_run = check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    instance = check.inst_param(instance, 'instance', DagsterInstance)

    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, 'scoped_resources_builder_cm')
    system_storage_data = check.opt_inst_param(
        system_storage_data, 'system_storage_data', SystemStorageData)
    intermediate_storage = check.opt_inst_param(
        intermediate_storage, 'intermediate_storage_data', IntermediateStorage)
    raise_on_error = check.bool_param(raise_on_error, 'raise_on_error')

    pipeline_context = None
    resources_manager = None

    try:
        context_creation_data = create_context_creation_data(
            execution_plan,
            run_config,
            pipeline_run,
            instance,
        )
        executor = check.inst(create_executor(context_creation_data), Executor,
                              'Must return an Executor')
        log_manager = create_log_manager(context_creation_data)
        resources_manager = scoped_resources_builder_cm(
            execution_plan,
            context_creation_data.environment_config,
            context_creation_data.pipeline_run,
            log_manager,
            context_creation_data.resource_keys_to_init,
        )
        # Resource setup may itself emit events (resource_init_start/success/...).
        for event in resources_manager.generate_setup_events():
            yield event
        scoped_resources_builder = check.inst(resources_manager.get_object(),
                                              ScopedResourcesBuilder)
        system_storage_data = create_system_storage_data(
            context_creation_data, system_storage_data, scoped_resources_builder)
        if intermediate_storage or context_creation_data.intermediate_storage_def:
            intermediate_storage = create_intermediate_storage(
                context_creation_data,
                intermediate_storage,
                scoped_resources_builder,
            )
        else:
            # remove this as part of https://github.com/dagster-io/dagster/issues/2705
            intermediate_storage = system_storage_data.intermediates_manager
        pipeline_context = construct_pipeline_execution_context(
            context_creation_data=context_creation_data,
            scoped_resources_builder=scoped_resources_builder,
            system_storage_data=system_storage_data,
            intermediate_storage=intermediate_storage,
            log_manager=log_manager,
            executor=executor,
            raise_on_error=raise_on_error,
        )

        _validate_plan_with_context(pipeline_context, execution_plan)

        yield pipeline_context
        for event in resources_manager.generate_teardown_events():
            yield event
    except DagsterError as dagster_error:
        if pipeline_context is None:
            # Failure happened during initialization: report it with a
            # context-free log manager, then best-effort resource teardown.
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists is is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error else sys.exc_info())
            error_info = serializable_error_info_from_exc_info(user_facing_exc_info)
            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(
                    instance, pipeline_run, pipeline_def),
            )
            if resources_manager:
                for event in resources_manager.generate_teardown_events():
                    yield event
        else:
            # pipeline teardown failure
            raise dagster_error

        if raise_on_error:
            raise dagster_error
def _core_resource_initialization_event_generator(
    resource_defs: Dict[str, ResourceDefinition],
    resource_configs: Dict[str, ResourceConfig],
    resource_log_manager: DagsterLogManager,
    resource_managers: Deque[EventGenerationManager],
    execution_plan: Optional[ExecutionPlan],
    pipeline_run: Optional[PipelineRun],
    resource_keys_to_init: Optional[Set[str]],
    instance: Optional[DagsterInstance],
    resource_instances_to_override: Optional[Dict[str, "InitializedResource"]],
    emit_persistent_events: Optional[bool],
    pipeline_def_for_backwards_compat: Optional[PipelineDefinition],
):
    """Initialize the requested resources in dependency (toposorted) order.

    Yields resource_init_start/success/failure DagsterEvents (only when
    emit_persistent_events is set) and, finally, a ScopedResourcesBuilder holding
    the initialized resource instances. Each resource's generation manager is
    appended to ``resource_managers`` so the caller can run teardown later.

    Raises:
        DagsterUserCodeExecutionError: re-raised after emitting a failure event.
    """
    pipeline_name = None
    if emit_persistent_events:
        # Persistent events are attributed to a run, so both identifiers are required.
        check.invariant(
            pipeline_run and execution_plan,
            "If emit_persistent_events is enabled, then pipeline_run and execution_plan must be provided",
        )
        pipeline_name = cast(PipelineRun, pipeline_run).pipeline_name
    resource_instances_to_override = check.opt_dict_param(
        resource_instances_to_override, "resource_instances_to_override"
    )
    resource_keys_to_init = check.opt_set_param(resource_keys_to_init, "resource_keys_to_init")
    resource_instances: Dict[str, "InitializedResource"] = {}
    resource_init_times = {}
    try:
        if emit_persistent_events and resource_keys_to_init:
            yield DagsterEvent.resource_init_start(
                pipeline_name,
                execution_plan,
                resource_log_manager,
                resource_keys_to_init,
            )

        resource_dependencies = _resolve_resource_dependencies(resource_defs)

        # toposort levels guarantee a resource is initialized after everything it
        # depends on; resource_instances accumulates results for downstream resources.
        for level in toposort(resource_dependencies):
            for resource_name in level:
                if resource_name in resource_instances_to_override:
                    # use the given resource instances instead of re-initiating it from resource def
                    resource_def = ResourceDefinition.hardcoded_resource(
                        resource_instances_to_override[resource_name]
                    )
                else:
                    resource_def = resource_defs[resource_name]
                if not resource_name in resource_keys_to_init:
                    continue

                resource_context = InitResourceContext(
                    resource_def=resource_def,
                    resource_config=resource_configs[resource_name].config,
                    pipeline_run=pipeline_run,
                    # Add tags with information about the resource
                    log_manager=resource_log_manager.with_tags(
                        resource_name=resource_name,
                        resource_fn_name=str(resource_def.resource_fn.__name__),
                    ),
                    resource_instance_dict=resource_instances,
                    required_resource_keys=resource_def.required_resource_keys,
                    instance=instance,
                    pipeline_def_for_backwards_compat=pipeline_def_for_backwards_compat,
                )
                manager = single_resource_generation_manager(
                    resource_context, resource_name, resource_def
                )
                for event in manager.generate_setup_events():
                    if event:
                        yield event
                initialized_resource = check.inst(manager.get_object(), InitializedResource)
                resource_instances[resource_name] = initialized_resource.resource
                resource_init_times[resource_name] = initialized_resource.duration
                resource_managers.append(manager)

        if emit_persistent_events and resource_keys_to_init:
            yield DagsterEvent.resource_init_success(
                pipeline_name,
                execution_plan,
                resource_log_manager,
                resource_instances,
                resource_init_times,
            )
        yield ScopedResourcesBuilder(resource_instances)
    except DagsterUserCodeExecutionError as dagster_user_error:
        # Can only end up in this state if we attempt to initialize a resource, so
        # resource_keys_to_init cannot be empty
        if emit_persistent_events:
            yield DagsterEvent.resource_init_failure(
                pipeline_name,
                execution_plan,
                resource_log_manager,
                resource_keys_to_init,
                serializable_error_info_from_exc_info(dagster_user_error.original_exc_info),
            )
        raise dagster_user_error
def _execute_step_k8s_job(
    self,
    execute_step_args_packed,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod.

    Celery task body: creates a Kubernetes Job that runs `dagster api execute_step`
    for a single step, waits for it, and returns the serialized DagsterEvents
    scraped from the pod logs. Returns [] on early exit (run not STARTED, or an
    unrecoverable Kubernetes error that was reported as an engine event).
    """
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        ))
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.invariant(
        len(execute_step_args.step_keys_to_execute) == 1,
        "Celery K8s task executor can only execute 1 step at a time",
    )

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")

    check.bool_param(load_incluster_config, "load_incluster_config")

    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict)
    check.opt_inst_param(
        user_defined_k8s_config,
        "user_defined_k8s_config",
        UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(execute_step_args.pipeline_run_id)

    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    step_key = execute_step_args.step_keys_to_execute[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData([
            EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
            EventMetadataEntry.text(celery_pod_name,
                                    "Celery worker Kubernetes Pod name"),
        ]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    if pipeline_run.status != PipelineRunStatus.STARTED:
        # Run was terminated/failed while this task sat in the Celery queue.
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(step_key, "Step key"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(execute_step_args.pipeline_run_id, step_key)

    retry_state = execute_step_args.known_state.get_retry_state()

    # Suffix retried attempts so each attempt gets a distinct Job/Pod name.
    if retry_state.get_attempt_count(step_key):
        attempt_number = retry_state.get_attempt_count(step_key)
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
        pod_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
        pod_name = "dagster-job-%s" % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(execute_step_args)
    args = ["dagster", "api", "execute_step", input_json]

    job = construct_dagster_k8s_job(job_config, args, job_name,
                                    user_defined_k8s_config, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy,
                                        "Image pull policy"),
                EventMetadataEntry.text(str(job_config.image_pull_secrets),
                                        "Image pull secrets"),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(
            body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name so proceed and see if the existing job succeeded
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, proceeding with existing job.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return []

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=execute_step_args.pipeline_run_id,
        )
    except (DagsterK8sError, DagsterK8sTimeoutError) as err:
        # Step-level failure: convert to a step failure event and keep going so
        # the failure is surfaced in the event log.
        step_failure_event = construct_step_failure_event_and_handle(
            pipeline_run, step_key, err, instance=instance)
        events.append(step_failure_event)
    except DagsterK8sPipelineStatusException:
        # The run left the STARTED state while we were waiting -- clean up the job.
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(step_key, "Step key"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
            ]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return []
    except (
            DagsterK8sUnrecoverableAPIError,
            DagsterK8sAPIRetryLimitExceeded,
            # We shouldn't see unwrapped APIExceptions anymore, as they should all be wrapped in
            # a retry boundary. We still catch it here just in case we missed one so that we can
            # report it to the event log
            kubernetes.client.rest.ApiException,
    ) as err:
        instance.report_engine_event(
            "Encountered unexpected error while waiting on Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    try:
        pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        # NOTE: "retreiving" typo is in the emitted runtime string; left as-is here.
        instance.report_engine_event(
            "Encountered unexpected error retreiving Pods for Kubernetes job {} for step {}, "
            "exiting.".format(job_name, step_key),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step key"),
                ],
                error=serializable_error_info_from_exc_info(sys.exc_info()),
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return []

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData(
            [EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        try:
            raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
            logs += raw_logs.split("\n")
        except kubernetes.client.rest.ApiException as e:
            # Best effort: a single pod's log failure should not lose the others.
            instance.report_engine_event(
                "Encountered unexpected error while fetching pod logs for Kubernetes job {}, "
                "Pod name {} for step {}. Will attempt to continue with other pods.".format(
                    job_name, pod_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step key"),
                    ],
                    error=serializable_error_info_from_exc_info(sys.exc_info()),
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )

    # Structured DagsterEvents are embedded in the pod log stream; extract them.
    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [
        serialize_dagster_namedtuple(event) for event in events
    ]
    return serialized_events
def _core_resource_initialization_event_generator(
    execution_plan,
    environment_config,
    pipeline_run,
    resource_keys_to_init,
    resource_log_manager,
    resource_managers,
    instance,
    resource_instances_to_override,
):
    """Initialize the resources named in resource_keys_to_init in dependency order.

    Yields resource_init_start/success DagsterEvents around the initialization,
    then a ScopedResourcesBuilder holding the initialized instances. Each
    resource's EventGenerationManager is appended to ``resource_managers`` so the
    caller can run teardown later.

    Raises:
        DagsterUserCodeExecutionError: re-raised after emitting a
            resource_init_failure event when user resource code fails.
    """
    pipeline_def = execution_plan.pipeline_def
    # Fix: tolerate a None override mapping -- `in` on None raised TypeError.
    # Mirrors the check.opt_dict_param guard used by the newer typed variant
    # of this generator.
    resource_instances_to_override = check.opt_dict_param(
        resource_instances_to_override, 'resource_instances_to_override')
    resource_instances = {}
    mode_definition = pipeline_def.get_mode_definition(pipeline_run.mode)
    resource_init_times = {}
    try:
        if resource_keys_to_init:
            yield DagsterEvent.resource_init_start(
                execution_plan,
                resource_log_manager,
                resource_keys_to_init,
            )

        resource_dependencies = _resolve_resource_dependencies(
            mode_definition.resource_defs)

        # toposort levels guarantee a resource is initialized after everything
        # it depends on.
        for level in toposort(resource_dependencies):
            for resource_name in level:
                if resource_name in resource_instances_to_override:
                    # use the given resource instances instead of re-initiating it from resource def
                    resource_def = ResourceDefinition.hardcoded_resource(
                        resource_instances_to_override[resource_name])
                else:
                    resource_def = mode_definition.resource_defs[resource_name]
                if resource_name not in resource_keys_to_init:
                    continue

                resource_context = InitResourceContext(
                    resource_def=resource_def,
                    resource_config=environment_config.resources.get(
                        resource_name, {}).get("config"),
                    pipeline_run=pipeline_run,
                    # Add tags with information about the resource
                    log_manager=resource_log_manager.with_tags(
                        resource_name=resource_name,
                        resource_fn_name=str(resource_def.resource_fn.__name__),
                    ),
                    resource_instance_dict=resource_instances,
                    required_resource_keys=resource_def.required_resource_keys,
                    instance_for_backwards_compat=instance,
                    pipeline_def_for_backwards_compat=pipeline_def,
                )
                manager = single_resource_generation_manager(
                    resource_context, resource_name, resource_def)
                for event in manager.generate_setup_events():
                    if event:
                        yield event
                initialized_resource = check.inst(manager.get_object(),
                                                  InitializedResource)
                resource_instances[resource_name] = initialized_resource.resource
                resource_init_times[resource_name] = initialized_resource.duration
                resource_managers.append(manager)

        if resource_keys_to_init:
            yield DagsterEvent.resource_init_success(execution_plan,
                                                     resource_log_manager,
                                                     resource_instances,
                                                     resource_init_times)
        yield ScopedResourcesBuilder(resource_instances)
    except DagsterUserCodeExecutionError as dagster_user_error:
        yield DagsterEvent.resource_init_failure(
            execution_plan,
            resource_log_manager,
            resource_keys_to_init,
            serializable_error_info_from_exc_info(
                dagster_user_error.original_exc_info),
        )
        raise dagster_user_error
def pipeline_initialization_event_generator(
    pipeline_def,
    environment_dict,
    pipeline_run,
    instance,
    execution_plan,
    scoped_resources_builder_cm,
    system_storage_data=None,
    raise_on_error=False,
):
    """Generator that builds the pipeline execution context and yields it to the caller.

    Yields resource setup events, then exactly one pipeline context, then resource
    teardown events. On a DagsterError before the context exists, yields a
    pipeline_init_failure event and does a best-effort teardown (swallowing
    teardown errors); a failure after the context exists is always re-raised.
    """
    pipeline_def = check.inst_param(pipeline_def, 'pipeline_def', PipelineDefinition)
    environment_dict = check.dict_param(environment_dict, 'environment_dict', key_type=str)
    pipeline_run = check.inst_param(pipeline_run, 'pipeline_run', PipelineRun)
    instance = check.inst_param(instance, 'instance', DagsterInstance)
    execution_plan = check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    scoped_resources_builder_cm = check.callable_param(
        scoped_resources_builder_cm, 'scoped_resources_builder_cm'
    )
    system_storage_data = check.opt_inst_param(
        system_storage_data, 'system_storage_data', SystemStorageData
    )
    raise_on_error = check.bool_param(raise_on_error, 'raise_on_error')

    pipeline_context = None
    resources_manager = None

    try:
        context_creation_data = create_context_creation_data(
            pipeline_def, environment_dict, pipeline_run, instance, execution_plan,
        )
        executor_config = create_executor_config(context_creation_data)
        log_manager = create_log_manager(context_creation_data)
        resources_manager = scoped_resources_builder_cm(
            context_creation_data.pipeline_def,
            context_creation_data.environment_config,
            context_creation_data.pipeline_run,
            log_manager,
            context_creation_data.resource_keys_to_init,
        )
        # Resource setup may itself emit events.
        for event in resources_manager.generate_setup_events():
            yield event
        scoped_resources_builder = check.inst(
            resources_manager.get_object(), ScopedResourcesBuilder
        )
        system_storage_data = create_system_storage_data(
            context_creation_data, system_storage_data, scoped_resources_builder
        )
        pipeline_context = construct_pipeline_execution_context(
            context_creation_data=context_creation_data,
            scoped_resources_builder=scoped_resources_builder,
            system_storage_data=system_storage_data,
            log_manager=log_manager,
            executor_config=executor_config,
            raise_on_error=raise_on_error,
        )

        yield pipeline_context
        for event in resources_manager.generate_teardown_events():
            yield event
    except DagsterError as dagster_error:
        if pipeline_context is None:
            # Failure happened during initialization: report it with a
            # context-free log manager.
            user_facing_exc_info = (
                # pylint does not know original_exc_info exists is is_user_code_error is true
                # pylint: disable=no-member
                dagster_error.original_exc_info
                if dagster_error.is_user_code_error
                else sys.exc_info()
            )
            error_info = serializable_error_info_from_exc_info(user_facing_exc_info)
            yield DagsterEvent.pipeline_init_failure(
                pipeline_name=pipeline_def.name,
                failure_data=PipelineInitFailureData(error=error_info),
                log_manager=_create_context_free_log_manager(instance, pipeline_run, pipeline_def),
            )
            try:
                if resources_manager:
                    for event in resources_manager.generate_teardown_events():
                        yield event
            except DagsterError as dagster_teardown_error:
                # we will fail/reraise based on the original error, so for now just do a best-effort
                # teardown and swallow any errors
                pass
        else:
            # pipeline teardown failure
            raise dagster_error

        if raise_on_error:
            raise dagster_error
def execute(self, pipeline_context, execution_plan):
    """Execute the plan on a Dask cluster, yielding step DagsterEvents.

    Spins up (or connects to) the cluster type named by ``self.cluster_type``,
    submits one Dask future per step (sequenced by passing dependency futures
    as arguments), then yields the events from each future as it completes.
    Requires a persistent DagsterInstance so workers can report events.
    """
    check.inst_param(pipeline_context, "pipeline_context",
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        "pipeline_context",
        "Expected executor to be DaskExecutor got {}".format(
            pipeline_context.executor),
    )

    check.invariant(
        pipeline_context.instance.is_persistent,
        "Dask execution requires a persistent DagsterInstance",
    )

    step_levels = execution_plan.execution_step_levels()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    # Each cluster backend is an optional dependency, so imports are deferred
    # to the selected branch.
    cluster_type = self.cluster_type
    if cluster_type == "local":
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "yarn":
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "ssh":
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "pbs":
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "moab":
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "sge":
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == "lsf":
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "slurm":
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "oar":
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "kube":
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            f"Must be providing one of the following ('local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                # Each step executes in-process on its worker.
                run_config = dict(pipeline_context.run_config,
                                  execution={"in_process": {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository()

                # GraphQL-style executionParams payload handed to the worker query.
                variables = {
                    "executionParams": {
                        "selector": {
                            "pipelineName": pipeline_name,
                            "repositoryName": recon_repo.get_definition().name,
                            "repositoryLocationName": "<<in_process>>",
                        },
                        "runConfigData": run_config,
                        "mode": pipeline_context.mode_def.name,
                        "executionMetadata": {
                            "runId": pipeline_context.pipeline_run.run_id
                        },
                        "stepKeys": [step.key],
                    }
                }

                dask_task_name = "%s.%s" % (pipeline_name, step.key)

                workspace = create_in_process_ephemeral_workspace(
                    pointer=pipeline_context.pipeline.
                    get_reconstructable_repository().pointer)

                future = client.submit(
                    query_on_dask_worker,
                    workspace,
                    variables,
                    dependencies,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to awaits the step executions and retrieve their results to the
        # master
        for future in dask.distributed.as_completed(execution_futures):
            for step_event in future.result():
                check.inst(step_event, DagsterEvent)
                yield step_event
def sync_list_repositories_grpc(api_client):
    """Synchronously fetch the repository list from a gRPC server.

    Args:
        api_client: a DagsterGrpcClient connected to the target server.

    Returns:
        ListRepositoriesResponse: the server's (type-checked) response.
    """
    from dagster.grpc.client import DagsterGrpcClient
    from dagster.grpc.types import ListRepositoriesResponse

    check.inst_param(api_client, 'api_client', DagsterGrpcClient)
    response = api_client.list_repositories()
    return check.inst(response, ListRepositoriesResponse)
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    resources=None,
    kubeconfig_file=None,
):
    '''Run step execution in a K8s job pod.

    Celery task body: creates a Kubernetes Job that runs
    `dagster api execute_step_with_structured_logs` for a single step, waits for
    it, and returns the serialized DagsterEvents scraped from the pod logs.
    Returns None on early exit (run not STARTED, or an unrecoverable Kubernetes
    error that was already reported as an engine event).
    '''
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, 'pipeline_origin_packed'
        )  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_key = step_keys[0]
    if pipeline_run.status != PipelineRunStatus.STARTED:
        # Run was terminated/failed while this task sat in the Celery queue.
        instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, 'Step keys'),]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    retries = Retries.from_config(retries_dict)

    # Suffix retried attempts so each attempt gets a distinct Job/Pod name.
    if retries.get_attempt_count(step_key):
        attempt_number = retries.get_attempt_count(step_key)
        job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
        pod_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
    else:
        job_name = 'dagster-job-%s' % (k8s_name_key)
        pod_name = 'dagster-job-%s' % (k8s_name_key)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )

    command = ['dagster']
    args = ['api', 'execute_step_with_structured_logs', input_json]

    job = construct_dagster_k8s_job(job_config, command, args, job_name, resources, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, 'Step keys'),
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(job_config.job_image, 'Job image'),
                EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                EventMetadataEntry.text(
                    str(job_config.image_pull_secrets), 'Image pull secrets'
                ),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), 'Service account name'
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == 'Conflict':
            # A job with this name already exists: execution falls through to
            # wait on the existing job below.
            # Fix: message previously said "exiting." which contradicted the
            # actual control flow (no return on this branch).
            instance.report_engine_event(
                'Did not create Kubernetes job {} for step {} since job name already '
                'exists, proceeding with existing job.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                'Encountered unexpected error while creating Kubernetes job {} for step {}, '
                'exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        # Fix: EventMetadataEntry.text requires a str; previously
                        # the ApiException object itself was passed here.
                        EventMetadataEntry.text(str(e), 'Error'),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
            return

    try:
        wait_for_job_success(
            job_name=job_name, namespace=job_namespace, instance=instance, run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        # The run left the STARTED state while we were waiting -- clean up the job.
        instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace, 'Kubernetes Job namespace'),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    logs = []
    for pod_name in pod_names:
        raw_logs = retrieve_pod_logs(pod_name, namespace=job_namespace)
        logs += raw_logs.split('\n')

    # Structured DagsterEvents are embedded in the pod log stream; extract them.
    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def _execute_step_docker(
    self,
    execute_step_args_packed,
    docker_config,
):
    """Run step execution in a Docker container.

    Celery task body: unpacks the serialized ``ExecuteStepArgs``, launches a
    Docker container that re-executes the step via ``dagster api execute_step``,
    and returns the serialized Dagster events the container printed to stdout.

    Args:
        execute_step_args_packed (dict): Packed (serialized) ``ExecuteStepArgs``.
        docker_config (dict): Docker settings from the executor config; keys
            observed here: ``image``, ``registry``, ``env_vars``, ``network``.

    Returns:
        List[str]: Serialized Dagster events, starting with the engine event
        that marks delegation to the container.

    Raises:
        Exception: If no image is resolvable, or the container produced no output.
        docker.errors.ContainerError: Re-raised after reporting the container's
            stderr as an engine event.
    """
    execute_step_args = unpack_value(
        check.dict_param(
            execute_step_args_packed,
            "execute_step_args_packed",
        ))
    check.inst_param(execute_step_args, "execute_step_args", ExecuteStepArgs)
    check.dict_param(docker_config, "docker_config")

    instance = DagsterInstance.from_ref(execute_step_args.instance_ref)
    pipeline_run = instance.get_run_by_id(
        execute_step_args.pipeline_run_id)
    check.inst(
        pipeline_run,
        PipelineRun,
        "Could not load run {}".format(execute_step_args.pipeline_run_id),
    )
    step_keys_str = ", ".join(execute_step_args.step_keys_to_execute)

    # The container re-runs the step through the dagster CLI; the args travel
    # as a JSON-serialized namedtuple on the command line.
    input_json = serialize_dagster_namedtuple(execute_step_args)
    command = "dagster api execute_step {}".format(json.dumps(input_json))

    # Executor-level image wins; otherwise fall back to the container image
    # recorded on the pipeline's repository origin.
    docker_image = (docker_config["image"]
                    if docker_config.get("image") else execute_step_args.
                    pipeline_origin.repository_origin.container_image)
    if not docker_image:
        raise Exception(
            "No docker image specified by either the pipeline or the repository"
        )

    client = docker.client.from_env()
    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(
            step_keys_str, docker_image),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "Step keys"),
                EventMetadataEntry.text(docker_image, "Image"),
                EventMetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=execute_step_args.step_keys_to_execute[0],
    )
    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    # Only env vars explicitly allow-listed in the config are forwarded;
    # their values are read from this worker's own environment.
    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {
            env_name: os.getenv(env_name)
            for env_name in docker_config["env_vars"]
        }

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
        )

        res = docker_response.decode("utf-8")
    except docker.errors.ContainerError as err:
        # Surface the container's stderr as an engine event before failing the task.
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(
                step_keys_str, docker_image),
            pipeline_run,
            EngineEventData([
                EventMetadataEntry.text(docker_image, "Job image"),
                EventMetadataEntry.text(err.stderr, "Docker stderr"),
            ], ),
            CeleryDockerExecutor,
            step_key=execute_step_args.step_keys_to_execute[0],
        )
        raise
    else:
        if res is None:
            raise Exception(
                "No response from execute_step in CeleryDockerExecutor")
        # The container emits one serialized event per stdout line.
        serialized_events += [event for event in res.split("\n") if event]

    return serialized_events
def execute(pipeline_context, execution_plan, step_keys_to_execute=None):
    """Execute the plan by submitting each step as a Dask future.

    Validates that the configured storage is usable from Dask workers, submits
    one future per step (passing the futures of its upstream steps as
    dependencies so Dask sequences them), then gathers all results on the
    master and re-yields every DagsterEvent.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Active execution context.
        execution_plan (ExecutionPlan): Plan whose steps are to be executed.
        step_keys_to_execute (Optional[List[str]]): Optional subset of step keys to run.

    Yields:
        DagsterEvent: Events produced by the remote step executions.
    """
    check.inst_param(pipeline_context, 'pipeline_context',
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute',
                         of_type=str)
    step_key_set = None if step_keys_to_execute is None else set(
        step_keys_to_execute)

    dask_config = pipeline_context.run_config.executor_config

    check.param_invariant(
        isinstance(pipeline_context.executor_config, DaskConfig),
        'pipeline_context',
        'Expected executor_config to be DaskConfig got {}'.format(
            pipeline_context.executor_config),
    )

    # Checks to ensure storage is compatible with Dask configuration
    storage = pipeline_context.environment_dict.get('storage')
    # BUGFIX: previously `storage.keys()` raised AttributeError when no
    # 'storage' key was configured (storage is None); guard so the intended
    # invariant message is raised instead.
    check.invariant(storage and storage.keys(),
                    'Must specify storage to use Dask execution')

    if dask_config.is_remote_execution:
        # Remote workers cannot read the master's local filesystem, so shared
        # (S3) storage is required.
        check.invariant(
            storage.get('s3'),
            'Must use S3 storage with non-local Dask address {dask_address}'
            .format(dask_address=dask_config.address),
        )
    else:
        check.invariant(
            not storage.get('in_memory'),
            'Cannot use in-memory storage with Dask, use filesystem or S3',
        )

    step_levels = execution_plan.topological_step_levels()
    pipeline_name = pipeline_context.pipeline_def.name

    with dask.distributed.Client(
            **dask_config.build_dict(pipeline_name)) as client:
        execution_futures = []
        execution_futures_dict = {}  # step key -> future, used to wire dependencies

        for step_level in step_levels:
            for step in step_level:
                if step_key_set and step.key not in step_key_set:
                    continue

                step_context = pipeline_context.for_step(step)

                check.invariant(
                    not step_context.run_config.loggers,
                    'Cannot inject loggers via RunConfig with the Dask executor',
                )
                check.invariant(
                    not step_context.event_callback,
                    'Cannot use event_callback with Dask executor',
                )

                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = [
                    execution_futures_dict[
                        step_input.prev_output_handle.step_key]
                    for step_input in step.step_inputs
                    if step_input.prev_output_handle is not None
                ]

                # GraphQL-style execution params consumed by query_on_dask_worker.
                variables = {
                    'executionParams': {
                        'selector': {
                            'name': pipeline_name
                        },
                        'environmentConfigData':
                        pipeline_context.environment_dict,
                        'mode': pipeline_context.mode_def.name,
                        'executionMetadata': {
                            'runId': pipeline_context.run_config.run_id
                        },
                        'stepKeys': [step.key],
                    }
                }

                dask_task_name = '%s.%s' % (pipeline_name, step.key)

                future = client.submit(
                    query_on_dask_worker,
                    pipeline_context.execution_target_handle,
                    variables,
                    dependencies,
                    key=dask_task_name,
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to await the step executions and retrieve their results to the master
        execution_step_events = client.gather(execution_futures)

        # execution_step_events is now a list of lists, the inner lists contain the dagster
        # events emitted by each step execution
        for step_event in itertools.chain.from_iterable(
                execution_step_events):
            check.inst(step_event, DagsterEvent)
            yield step_event
def celery_k8s_job_executor(init_context):
    """Celery-based executor which launches tasks as Kubernetes Jobs.

    The Celery executor exposes config settings for the underlying Celery app under
    the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced
    in Celery version 4.0 and the object constructed from config will be passed to the
    :py:class:`celery.Celery` constructor as its ``config_source`` argument.
    (See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.)

    The executor also exposes the ``broker``, ``backend``, and ``include`` arguments to the
    :py:class:`celery.Celery` constructor.

    In the most common case, you may want to modify the ``broker`` and ``backend`` (e.g., to use
    Redis instead of RabbitMQ). We expect that ``config_source`` will be less frequently
    modified, but that when solid executions are especially fast or slow, or when there are
    different requirements around idempotence or retry, it may make sense to execute pipelines
    with variations on these settings.

    If you'd like to configure a Celery Kubernetes Job executor in addition to the
    :py:class:`~dagster.default_executors`, you should add it to the ``executor_defs`` defined on a
    :py:class:`~dagster.ModeDefinition` as follows:

    .. literalinclude:: ../dagster_celery_k8s_tests/example_celery_mode_def.py
       :language: python

    Then you can configure the executor as follows:

    .. code-block:: YAML

        execution:
          celery-k8s:
            config:
              job_image: 'my_repo.com/image_name:latest'
              job_namespace: 'some-namespace'
              broker: 'pyamqp://guest@localhost//'  # Optional[str]: The URL of the Celery broker
              backend: 'rpc://' # Optional[str]: The URL of the Celery results backend
              include: ['my_module'] # Optional[List[str]]: Modules every worker should import
              config_source: # Dict[str, Any]: Any additional parameters to pass to the
                  #...       # Celery workers. This dict will be passed as the `config_source`
                  #...       # argument of celery.Celery().

    Note that the YAML you provide here must align with the configuration with which the Celery
    workers on which you hope to run were started. If, for example, you point the executor at a
    different broker than the one your workers are listening to, the workers will never be able to
    pick up tasks for execution.

    In deployments where the celery_k8s_job_executor is used all appropriate celery and dagster_celery
    commands must be invoked with the `-A dagster_celery_k8s.app` argument.
    """
    check_cross_process_constraints(init_context)

    launcher = init_context.instance.run_launcher
    check.inst(
        launcher,
        CeleryK8sRunLauncher,
        "This engine is only compatible with a CeleryK8sRunLauncher; configure the "
        "CeleryK8sRunLauncher on your instance to use it.",
    )

    cfg = init_context.executor_config

    # Kubernetes Job template shared by all step jobs this executor launches.
    job_config = DagsterK8sJobConfig(
        dagster_home=launcher.dagster_home,
        instance_config_map=launcher.instance_config_map,
        postgres_password_secret=launcher.postgres_password_secret,
        job_image=cfg.get("job_image") or os.getenv("DAGSTER_CURRENT_IMAGE"),
        image_pull_policy=cfg.get("image_pull_policy"),
        image_pull_secrets=cfg.get("image_pull_secrets"),
        service_account_name=cfg.get("service_account_name"),
        env_config_maps=cfg.get("env_config_maps"),
        env_secrets=cfg.get("env_secrets"),
    )

    # Celery settings set on the instance-level run launcher take precedence;
    # the executor config supplies the fallback values.
    return CeleryK8sJobExecutor(
        broker=launcher.broker or cfg.get("broker"),
        backend=launcher.backend or cfg.get("backend"),
        config_source=launcher.config_source or cfg.get("config_source"),
        include=launcher.include or cfg.get("include"),
        retries=launcher.retries or Retries.from_config(cfg.get("retries")),
        job_config=job_config,
        job_namespace=cfg.get("job_namespace"),
        load_incluster_config=cfg.get("load_incluster_config"),
        kubeconfig_file=cfg.get("kubeconfig_file"),
        repo_location_name=cfg.get("repo_location_name"),
    )
def execute(self, pipeline_context, execution_plan):
    """Execute the plan on a Dask cluster of the configured type.

    Builds (or connects to) the cluster selected by ``self.cluster_type``,
    submits one future per step with upstream futures as dependencies so Dask
    sequences them correctly, then yields each step's DagsterEvents back on
    the master as results complete.

    Args:
        pipeline_context (SystemPipelineExecutionContext): Active execution context.
        execution_plan (ExecutionPlan): Plan whose steps are to be executed.

    Yields:
        DagsterEvent: Events produced by the remote step executions.

    Raises:
        ValueError: If ``self.cluster_type`` is not one of the supported types.
    """
    check.inst_param(pipeline_context, "pipeline_context",
                     SystemPipelineExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)
    check.param_invariant(
        isinstance(pipeline_context.executor, DaskExecutor),
        "pipeline_context",
        "Expected executor to be DaskExecutor got {}".format(
            pipeline_context.executor),
    )

    # Remote workers need a shared event log, which only a persistent
    # instance provides.
    check.invariant(
        pipeline_context.instance.is_persistent,
        "Dask execution requires a persistent DagsterInstance",
    )

    step_levels = execution_plan.get_steps_to_execute_by_level()

    pipeline_name = pipeline_context.pipeline_def.name

    instance = pipeline_context.instance

    # Each cluster backend is imported lazily so only the selected optional
    # dependency needs to be installed.
    cluster_type = self.cluster_type
    if cluster_type == "existing":
        # address passed directly to Client() below to connect to existing Scheduler
        cluster = self.cluster_configuration["address"]
    elif cluster_type == "local":
        from dask.distributed import LocalCluster

        cluster = LocalCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "yarn":
        from dask_yarn import YarnCluster

        cluster = YarnCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "ssh":
        from dask.distributed import SSHCluster

        cluster = SSHCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "pbs":
        from dask_jobqueue import PBSCluster

        cluster = PBSCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "moab":
        from dask_jobqueue import MoabCluster

        cluster = MoabCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "sge":
        from dask_jobqueue import SGECluster

        cluster = SGECluster(**self.build_dict(pipeline_name))
    elif cluster_type == "lsf":
        from dask_jobqueue import LSFCluster

        cluster = LSFCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "slurm":
        from dask_jobqueue import SLURMCluster

        cluster = SLURMCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "oar":
        from dask_jobqueue import OARCluster

        cluster = OARCluster(**self.build_dict(pipeline_name))
    elif cluster_type == "kube":
        from dask_kubernetes import KubeCluster

        cluster = KubeCluster(**self.build_dict(pipeline_name))
    else:
        raise ValueError(
            f"Must be providing one of the following ('existing', 'local', 'yarn', 'ssh', 'pbs', 'moab', 'sge', 'lsf', 'slurm', 'oar', 'kube') not {cluster_type}"
        )

    with dask.distributed.Client(cluster) as client:
        execution_futures = []
        execution_futures_dict = {}  # step key -> future, used to wire dependencies

        for step_level in step_levels:
            for step in step_level:
                # We ensure correctness in sequencing by letting Dask schedule futures and
                # awaiting dependencies within each step.
                dependencies = []
                for step_input in step.step_inputs:
                    for key in step_input.dependency_keys:
                        dependencies.append(execution_futures_dict[key])

                # Each remote step runs in-process on its worker.
                run_config = dict(pipeline_context.run_config,
                                  execution={"in_process": {}})
                recon_repo = pipeline_context.pipeline.get_reconstructable_repository(
                )

                dask_task_name = "%s.%s" % (pipeline_name, step.key)

                recon_pipeline = recon_repo.get_reconstructable_pipeline(
                    pipeline_name)

                future = client.submit(
                    query_on_dask_worker,
                    dependencies,
                    recon_pipeline,
                    pipeline_context.pipeline_run,
                    run_config,
                    [step.key],
                    pipeline_context.mode_def.name,
                    instance.get_ref(),
                    key=dask_task_name,
                    resources=get_dask_resource_requirements(step.tags),
                )

                execution_futures.append(future)
                execution_futures_dict[step.key] = future

        # This tells Dask to awaits the step executions and retrieve their results to the
        # master
        futures = dask.distributed.as_completed(execution_futures,
                                                with_results=True)

        # Allow interrupts while waiting for the results from Dask
        for future, result in iterate_with_context(
                raise_execution_interrupts, futures):
            for step_event in result:
                check.inst(step_event, DagsterEvent)
                yield step_event
def start_inprocess_executor(
    pipeline_context, execution_plan, intermediates_manager, step_keys_to_execute=None
):
    """Run the execution plan serially in the current process, yielding DagsterEvents.

    Walks the plan in topological order. A step is skipped (with a skip event)
    when any of its upstream steps failed or was skipped, or when its inputs
    are not covered by the intermediates manager; otherwise it is executed
    in memory and its events are re-yielded.
    """
    check.inst_param(pipeline_context, 'pipeline_context', SystemPipelineExecutionContext)
    check.inst_param(execution_plan, 'execution_plan', ExecutionPlan)
    check.inst_param(intermediates_manager, 'intermediates_manager', IntermediatesManager)
    check.opt_list_param(step_keys_to_execute, 'step_keys_to_execute', of_type=str)

    check.param_invariant(
        isinstance(pipeline_context.executor_config, ExecutorConfig),
        'pipeline_context',
        'Expected executor_config to be ExecutorConfig got {}'.format(
            pipeline_context.executor_config
        ),
    )

    selected_keys = set(step_keys_to_execute) if step_keys_to_execute is not None else None
    unrunnable_steps = set()  # keys of steps that failed or were skipped

    # It would be good to implement a reference tracking algorithm here so we could
    # garbage collection results that are no longer needed by any steps
    # https://github.com/dagster-io/dagster/issues/811
    for level in execution_plan.topological_step_levels():
        for step in level:
            if selected_keys and step.key not in selected_keys:
                continue

            step_context = pipeline_context.for_step(step)

            failed_inputs = []
            for step_input in step.step_inputs:
                upstream_key = step_input.prev_output_handle.step_key
                if upstream_key in unrunnable_steps:
                    failed_inputs.append(upstream_key)

            if failed_inputs:
                step_context.log.info(
                    'Dependencies for step {step} failed: {failed_inputs}. Not executing.'.format(
                        step=step.key, failed_inputs=failed_inputs
                    )
                )
                unrunnable_steps.add(step.key)
                yield DagsterEvent.step_skipped_event(step_context)
                continue

            uncovered_inputs = intermediates_manager.uncovered_inputs(step_context, step)
            if uncovered_inputs:
                # In partial pipeline execution, we may end up here without having validated the
                # missing dependent outputs were optional
                _assert_missing_inputs_optional(uncovered_inputs, execution_plan, step.key)

                step_context.log.info(
                    (
                        'Not all inputs covered for {step}. Not executing. Output missing for '
                        'inputs: {uncovered_inputs}'
                    ).format(uncovered_inputs=uncovered_inputs, step=step.key)
                )
                unrunnable_steps.add(step.key)
                yield DagsterEvent.step_skipped_event(step_context)
                continue

            input_values = _create_input_values(step_context, intermediates_manager)

            for step_event in check.generator(
                execute_step_in_memory(step_context, input_values, intermediates_manager)
            ):
                check.inst(step_event, DagsterEvent)
                if step_event.is_step_failure:
                    unrunnable_steps.add(step.key)
                yield step_event
def get_step_by_key(self, step_key: str) -> ExecutionStep:
    """Return the ExecutionStep registered under ``step_key`` in the wrapped plan.

    Raises a check failure if the plan returns something other than an
    ``ExecutionStep`` (e.g. ``None`` for an unknown key).
    """
    found = self._plan.get_step_by_key(step_key)
    check.inst(found, ExecutionStep)
    return cast(ExecutionStep, found)
def load_repository_from_target_info(repo_target_info):
    """Load and return the RepositoryDefinition described by ``repo_target_info``.

    Raises a check failure if the loaded object is not a ``RepositoryDefinition``.
    """
    check.inst_param(repo_target_info, 'repo_target_info', RepositoryTargetInfo)
    loadable = load_repository_object_from_target_info(repo_target_info)
    repository = loadable.load()
    return check.inst(repository, RepositoryDefinition)
def inner_plan_execution_iterator(
    pipeline_context: PlanExecutionContext, execution_plan: ExecutionPlan
) -> Iterator[DagsterEvent]:
    """Execute the plan step by step in-process, yielding every DagsterEvent.

    For each step the iterator: verifies required resources are present, sets
    up compute log capture (best-effort — a failure there is reported as an
    engine event and execution continues), runs the step's event sequence,
    feeds each event back into ``active_execution`` so downstream scheduling
    stays correct, emits any skip events for failed/uncovered steps, and
    finally fires hooks with the full list of the step's events.
    """
    check.inst_param(pipeline_context, "pipeline_context", PlanExecutionContext)
    check.inst_param(execution_plan, "execution_plan", ExecutionPlan)

    with execution_plan.start(retry_mode=pipeline_context.retry_mode) as active_execution:

        # It would be good to implement a reference tracking algorithm here to
        # garbage collect results that are no longer needed by any steps
        # https://github.com/dagster-io/dagster/issues/811
        while not active_execution.is_complete:
            step = active_execution.get_next_step()
            step_context = cast(
                StepExecutionContext,
                pipeline_context.for_step(
                    # attempt count feeds retry-aware context (e.g. log annotations)
                    step, active_execution.retry_state.get_attempt_count(step.key)
                ),
            )
            step_event_list = []

            missing_resources = [
                resource_key
                for resource_key in step_context.required_resource_keys
                if not hasattr(step_context.resources, resource_key)
            ]
            check.invariant(
                len(missing_resources) == 0,
                (
                    "Expected step context for solid {solid_name} to have all required resources, but "
                    "missing {missing_resources}."
                ).format(solid_name=step_context.solid.name, missing_resources=missing_resources),
            )

            # capture all of the logs for this step
            with ExitStack() as stack:
                log_capture_error = None
                try:
                    stack.enter_context(
                        pipeline_context.instance.compute_log_manager.watch(
                            step_context.pipeline_run, step_context.step.key
                        )
                    )
                except Exception as e:
                    # Log capture is best-effort: report the failure as an
                    # engine event but keep executing the step.
                    yield DagsterEvent.engine_event(
                        pipeline_context=pipeline_context,
                        message="Exception while setting up compute log capture",
                        event_specific_data=EngineEventData(
                            error=serializable_error_info_from_exc_info(sys.exc_info())
                        ),
                        step_handle=step_context.step.handle,
                    )
                    log_capture_error = e

                # Only announce log capture if setup actually succeeded.
                if not log_capture_error:
                    yield DagsterEvent.capture_logs(
                        step_context, log_key=step_context.step.key, steps=[step_context.step]
                    )

                for step_event in check.generator(dagster_event_sequence_for_step(step_context)):
                    check.inst(step_event, DagsterEvent)
                    step_event_list.append(step_event)
                    yield step_event
                    # Keep the active execution's bookkeeping in sync so it can
                    # schedule/skip downstream steps correctly.
                    active_execution.handle_event(step_event)

                active_execution.verify_complete(pipeline_context, step.key)

                try:
                    stack.close()
                except Exception:
                    # Teardown failures are also reported rather than raised.
                    yield DagsterEvent.engine_event(
                        pipeline_context=pipeline_context,
                        message="Exception while cleaning up compute log capture",
                        event_specific_data=EngineEventData(
                            error=serializable_error_info_from_exc_info(sys.exc_info())
                        ),
                        step_handle=step_context.step.handle,
                    )

            # process skips from failures or uncovered inputs
            for event in active_execution.plan_events_iterator(pipeline_context):
                step_event_list.append(event)
                yield event

            # pass a list of step events to hooks
            for hook_event in _trigger_hook(step_context, step_event_list):
                yield hook_event
def solid_definition(fn):
    """Invoke ``fn`` and return its result, asserting it is a SolidDefinition."""
    produced = fn()
    return check.inst(produced, SolidDefinition)
def from_dagster_event_record(event_record, pipeline_name, execution_plan_index):
    """Translate a dagster EventRecord into its Dauphin (GraphQL) event counterpart.

    Dispatches on the record's DagsterEventType; raises for unrecognized types.
    """
    # Lots of event types. Pylint thinks there are too many branches
    # pylint: disable=too-many-branches
    check.inst_param(event_record, 'event_record', EventRecord)
    check.param_invariant(event_record.is_dagster_event, 'event_record')
    check.str_param(pipeline_name, 'pipeline_name')
    check.opt_inst_param(execution_plan_index, 'execution_plan_index',
                         ExecutionPlanIndex)

    # circular ref at module scope
    from .errors import DauphinPythonError

    dagster_event = event_record.dagster_event
    event_type = dagster_event.event_type
    basic_params = construct_basic_params(event_record, execution_plan_index)

    if event_type == DagsterEventType.STEP_START:
        return DauphinExecutionStepStartEvent(**basic_params)
    if event_type == DagsterEventType.STEP_SKIPPED:
        return DauphinExecutionStepSkippedEvent(**basic_params)
    if event_type == DagsterEventType.STEP_UP_FOR_RETRY:
        retry_data = dagster_event.step_retry_data
        return DauphinExecutionStepUpForRetryEvent(
            error=retry_data.error,
            secondsToWait=retry_data.seconds_to_wait,
            **basic_params)
    if event_type == DagsterEventType.STEP_RESTARTED:
        return DauphinExecutionStepRestartEvent(**basic_params)
    if event_type == DagsterEventType.STEP_SUCCESS:
        return DauphinExecutionStepSuccessEvent(**basic_params)
    if event_type == DagsterEventType.STEP_INPUT:
        input_data = dagster_event.event_specific_data
        return DauphinExecutionStepInputEvent(
            input_name=input_data.input_name,
            type_check=input_data.type_check_data,
            **basic_params)
    if event_type == DagsterEventType.STEP_OUTPUT:
        output_data = dagster_event.step_output_data
        return DauphinExecutionStepOutputEvent(
            output_name=output_data.output_name,
            type_check=output_data.type_check_data,
            **basic_params)
    if event_type == DagsterEventType.STEP_MATERIALIZATION:
        return DauphinStepMaterializationEvent(
            materialization=dagster_event.step_materialization_data.materialization,
            **basic_params)
    if event_type == DagsterEventType.STEP_EXPECTATION_RESULT:
        return DauphinStepExpectationResultEvent(
            expectation_result=dagster_event.event_specific_data.expectation_result,
            **basic_params)
    if event_type == DagsterEventType.STEP_FAILURE:
        failure_data = dagster_event.step_failure_data
        check.inst(failure_data, StepFailureData)
        return DauphinExecutionStepFailureEvent(
            error=DauphinPythonError(failure_data.error),
            failureMetadata=failure_data.user_failure_data,
            **basic_params)
    if event_type == DagsterEventType.PIPELINE_START:
        return DauphinPipelineStartEvent(pipelineName=pipeline_name, **basic_params)
    if event_type == DagsterEventType.PIPELINE_SUCCESS:
        return DauphinPipelineSuccessEvent(pipelineName=pipeline_name, **basic_params)
    if event_type == DagsterEventType.PIPELINE_FAILURE:
        return DauphinPipelineFailureEvent(pipelineName=pipeline_name, **basic_params)
    if event_type == DagsterEventType.PIPELINE_INIT_FAILURE:
        return DauphinPipelineInitFailureEvent(
            pipelineName=pipeline_name,
            error=DauphinPythonError(dagster_event.pipeline_init_failure_data.error),
            **basic_params)
    if event_type == DagsterEventType.OBJECT_STORE_OPERATION:
        return DauphinObjectStoreOperationEvent(
            operation_result=dagster_event.event_specific_data,
            **basic_params)
    if event_type == DagsterEventType.ENGINE_EVENT:
        engine_data = dagster_event.engine_event_data
        return DauphinEngineEvent(
            metadataEntries=_to_dauphin_metadata_entries(engine_data.metadata_entries),
            error=DauphinPythonError(engine_data.error) if engine_data.error else None,
            marker_start=engine_data.marker_start,
            marker_end=engine_data.marker_end,
            **basic_params)

    raise Exception(
        'Unknown DAGSTER_EVENT type {inner_type} found in logs'.format(
            inner_type=dagster_event.event_type))