def _execute_plan(_self, instance_ref_dict, handle_dict, run_id, step_keys, retries_dict):
    '''Execute a subset of a run's execution plan and return the serialized events.

    Args:
        instance_ref_dict (dict): Dict form of an InstanceRef, used to rebuild the instance.
        handle_dict (dict): Dict form of an ExecutionTargetHandle for the pipeline.
        run_id (str): ID of the pipeline run to load from the instance.
        step_keys (List[str]): Keys of the steps to execute.
        retries_dict (dict): Config handed to Retries.from_config.

    Returns:
        list: The engine event reported before execution, followed by every event yielded
        by execute_plan_iterator, each passed through serialize_dagster_namedtuple.
    '''
    # Argument validation.
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.dict_param(handle_dict, 'handle_dict')
    check.str_param(run_id, 'run_id')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.dict_param(retries_dict, 'retries_dict')

    # Rebuild the runtime objects from their dict representations.
    dagster_instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    target_handle = ExecutionTargetHandle.from_dict(handle_dict)
    retry_config = Retries.from_config(retries_dict)

    run = dagster_instance.get_run_by_id(run_id)
    check.invariant(run, 'Could not load run {}'.format(run_id))

    # Restrict the pipeline to the solid subset recorded on the run.
    full_pipeline_def = target_handle.build_pipeline_definition()
    pipeline_def = full_pipeline_def.build_sub_pipeline(run.selector.solid_subset)

    joined_step_keys = ", ".join(step_keys)

    run_plan = create_execution_plan(
        pipeline_def,
        run.environment_dict,
        mode=run.mode,
        step_keys_to_execute=run.step_keys_to_execute,
    )
    plan = run_plan.build_subset_plan(step_keys)

    # Report that this worker is taking over the requested steps.
    start_message = 'Executing steps {} in celery worker'.format(joined_step_keys)
    start_data = EngineEventData(
        [EventMetadataEntry.text(joined_step_keys, 'step_keys')],
        marker_end=DELEGATE_MARKER,
    )
    start_event = dagster_instance.report_engine_event(
        start_message,
        run,
        start_data,
        CeleryEngine,
        step_key=plan.step_key_for_single_step_plans(),
    )

    collected = [start_event]
    collected.extend(
        execute_plan_iterator(
            plan,
            pipeline_run=run,
            environment_dict=run.environment_dict,
            instance=dagster_instance,
            retries=retry_config,
        )
    )

    return [serialize_dagster_namedtuple(item) for item in collected]
def _execute_plan(self, instance_ref_dict, executable_dict, run_id, step_keys, retries_dict):
    """Execute a subset of a run's execution plan and return the serialized events.

    Args:
        instance_ref_dict (dict): Dict form of an InstanceRef, used to rebuild the instance.
        executable_dict (dict): Dict form of a ReconstructablePipeline.
        run_id (str): ID of the pipeline run to load from the instance.
        step_keys (List[str]): Keys of the steps to execute.
        retries_dict (dict): Config handed to Retries.from_config.

    Returns:
        list: The engine event reported before execution, followed by every event yielded
        by execute_plan_iterator, each passed through serialize_dagster_namedtuple.
    """
    # Argument validation.
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.dict_param(executable_dict, "executable_dict")
    check.str_param(run_id, "run_id")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.dict_param(retries_dict, "retries_dict")

    # Rebuild the runtime objects from their dict representations.
    dagster_instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    recon_pipeline = ReconstructablePipeline.from_dict(executable_dict)
    retry_config = Retries.from_config(retries_dict)

    run = dagster_instance.get_run_by_id(run_id)
    check.invariant(run, "Could not load run {}".format(run_id))

    joined_step_keys = ", ".join(step_keys)

    run_plan = create_execution_plan(
        recon_pipeline,
        run.run_config,
        mode=run.mode,
        step_keys_to_execute=run.step_keys_to_execute,
    )
    plan = run_plan.build_subset_plan(step_keys)

    # Report that this worker is taking over the requested steps, tagged with its hostname.
    start_message = "Executing steps {} in celery worker".format(joined_step_keys)
    start_data = EngineEventData(
        [
            EventMetadataEntry.text(joined_step_keys, "step_keys"),
            EventMetadataEntry.text(self.request.hostname, "Celery worker"),
        ],
        marker_end=DELEGATE_MARKER,
    )
    start_event = dagster_instance.report_engine_event(
        start_message,
        run,
        start_data,
        CeleryExecutor,
        step_key=plan.step_key_for_single_step_plans(),
    )

    collected = [start_event]
    collected.extend(
        execute_plan_iterator(
            plan,
            pipeline_run=run,
            run_config=run.run_config,
            instance=dagster_instance,
            retries=retry_config,
        )
    )

    return [serialize_dagster_namedtuple(item) for item in collected]
def _execute_query(_self, handle_dict, variables, instance_ref_dict):
    '''Run the execute-plan mutation and return its events in serialized form.

    Args:
        handle_dict: Dict form of an ExecutionTargetHandle.
        variables: Variables forwarded unchanged to execute_execute_plan_mutation.
        instance_ref_dict: Dict form of an InstanceRef.

    Returns:
        list: Each event from the mutation, passed through serialize_dagster_namedtuple.
    '''
    ref = InstanceRef.from_dict(instance_ref_dict)
    target_handle = ExecutionTargetHandle.from_dict(handle_dict)

    mutation_events = execute_execute_plan_mutation(
        handle=target_handle,
        variables=variables,
        instance_ref=ref,
    )

    return [serialize_dagster_namedtuple(item) for item in mutation_events]
def _execute_step_k8s_job(
    self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    user_defined_k8s_config_dict=None,
    kubeconfig_file=None,
):
    """Run step execution in a K8s job pod.

    Creates a Kubernetes Job that runs ``dagster api execute_step_with_structured_logs``
    for a single step, waits for it, and collects the Dagster events found in the pod logs.

    Args:
        instance_ref_dict (dict): Dict form of an InstanceRef.
        step_keys (List[str]): Must contain exactly one step key.
        run_config (dict): Run config forwarded in ExecuteStepArgs.
        mode (str): Pipeline mode forwarded in ExecuteStepArgs.
        repo_name (str): Validated as a string; not otherwise used here.
        repo_location_name (str): Validated as a string; not otherwise used here.
        run_id (str): ID of the pipeline run to load from the instance.
        job_config_dict (dict): Dict form of a DagsterK8sJobConfig.
        job_namespace (str): Namespace in which the Job is created.
        load_incluster_config (bool): Load in-cluster kube config when True, otherwise
            load ``kubeconfig_file``.
        retries_dict (dict): Config handed to Retries.from_config.
        pipeline_origin_packed (dict): Packed PipelineOrigin, unpacked with unpack_value.
        user_defined_k8s_config_dict (Optional[dict]): Dict form of a
            UserDefinedDagsterK8sConfig.
        kubeconfig_file (Optional[str]): Path passed to load_kube_config.

    Returns:
        Optional[list]: Serialized events for the step, or None when the function exits
        early (run not STARTED, Job creation failed, or the run left STARTED while waiting).
    """
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.invariant(
        len(step_keys) == 1, "Celery K8s task executor can only execute 1 step at a time"
    )
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.str_param(repo_name, "repo_name")
    check.str_param(repo_location_name, "repo_location_name")
    check.str_param(run_id, "run_id")

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, "job_config", DagsterK8sJobConfig)
    check.str_param(job_namespace, "job_namespace")
    check.bool_param(load_incluster_config, "load_incluster_config")
    check.dict_param(retries_dict, "retries_dict")

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, "pipeline_origin_packed"
        )  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    # NOTE(review): from_dict is called even when user_defined_k8s_config_dict is None
    # (its default) — confirm UserDefinedDagsterK8sConfig.from_dict accepts None.
    user_defined_k8s_config = UserDefinedDagsterK8sConfig.from_dict(
        user_defined_k8s_config_dict
    )
    check.opt_inst_param(
        user_defined_k8s_config, "user_defined_k8s_config", UserDefinedDagsterK8sConfig,
    )
    check.opt_str_param(kubeconfig_file, "kubeconfig_file")

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)

    check.invariant(pipeline_run, "Could not load run {}".format(run_id))
    step_key = step_keys[0]

    celery_worker_name = self.request.hostname
    celery_pod_name = os.environ.get("HOSTNAME")
    instance.report_engine_event(
        "Task for step {step_key} picked up by Celery".format(step_key=step_key),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(celery_worker_name, "Celery worker name"),
                EventMetadataEntry.text(celery_pod_name, "Celery worker Kubernetes Pod name"),
            ]
        ),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            "Not scheduling step because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, "Step keys")]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    # A non-zero attempt count is appended to the name so each retry gets its own Job name.
    retries = Retries.from_config(retries_dict)
    attempt_number = retries.get_attempt_count(step_key)
    if attempt_number:
        job_name = "dagster-job-%s-%d" % (k8s_name_key, attempt_number)
    else:
        job_name = "dagster-job-%s" % (k8s_name_key)
    pod_name = job_name

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )
    command = ["dagster"]
    args = ["api", "execute_step_with_structured_logs", input_json]

    job = construct_dagster_k8s_job(
        job_config, command, args, job_name, user_defined_k8s_config, pod_name
    )

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        "Executing step {} in Kubernetes job {}".format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, "Step keys"),
                EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                EventMetadataEntry.text(job_config.job_image, "Job image"),
                EventMetadataEntry.text(job_config.image_pull_policy, "Image pull policy"),
                EventMetadataEntry.text(
                    str(job_config.image_pull_secrets), "Image pull secrets"
                ),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), "Service account name"
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == "Conflict":
            # There is an existing job with the same name so do not proceed.
            instance.report_engine_event(
                "Did not create Kubernetes job {} for step {} since job name already "
                "exists, exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                        EventMetadataEntry.text(pod_name, "Kubernetes Pod name"),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                "Encountered unexpected error while creating Kubernetes job {} for step {}, "
                "exiting.".format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, "Step keys"),
                        # The metadata entry takes text, so render the exception rather
                        # than passing the exception object itself.
                        EventMetadataEntry.text(str(e), "Error"),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return

    try:
        wait_for_job_success(
            job_name=job_name, namespace=job_namespace, instance=instance, run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            "Terminating Kubernetes Job because pipeline run status is not STARTED",
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, "Step keys"),
                    EventMetadataEntry.text(job_name, "Kubernetes Job name"),
                    EventMetadataEntry.text(job_namespace, "Kubernetes Job namespace"),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        "Retrieving logs from Kubernetes Job pods",
        pipeline_run,
        EngineEventData([EventMetadataEntry.text("\n".join(pod_names), "Pod names")]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    # Gather the log lines of every pod in the Job, then keep the Dagster events among them.
    logs = []
    for job_pod_name in pod_names:
        raw_logs = retrieve_pod_logs(job_pod_name, namespace=job_namespace)
        logs += raw_logs.split("\n")

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    environment_dict,
    mode,
    pipeline_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    kubeconfig_file=None,
):
    '''Run a single step through the executePlan GraphQL mutation inside a K8s Job.

    The Job is created in ``job_namespace``, awaited via wait_for_job_success, and the log
    lines of its pods are parsed into step events.

    Returns:
        list: The two engine events reported here plus the parsed step events, each passed
        through serialize_dagster_namedtuple.
    '''
    from dagster_k8s.job import DagsterK8sJobConfig, construct_dagster_graphql_k8s_job
    from dagster_k8s.utils import get_pod_names_in_job, retrieve_pod_logs, wait_for_job_success

    import kubernetes

    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(environment_dict, 'environment_dict')
    check.str_param(mode, 'mode')
    check.str_param(pipeline_name, 'pipeline_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    dagster_instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    run = dagster_instance.get_run_by_id(run_id)
    check.invariant(run, 'Could not load run {}'.format(run_id))

    joined_step_keys = ", ".join(step_keys)

    # Ensure we stay below k8s name length limits
    name_key = _get_k8s_name_key(run_id, step_keys)
    requested_job_name = 'dagster-stepjob-%s' % name_key
    pod_name = 'dagster-stepjob-%s' % name_key

    graphql_variables = construct_variables(
        mode, environment_dict, pipeline_name, run_id, step_keys
    )
    job_args = ['-p', 'executePlan', '-v', seven.json.dumps(graphql_variables)]
    job = construct_dagster_graphql_k8s_job(job_config, job_args, requested_job_name, pod_name)

    # Post event for starting execution
    start_metadata = [
        EventMetadataEntry.text(joined_step_keys, 'Step keys'),
        EventMetadataEntry.text(job.metadata.name, 'Kubernetes Job name'),
        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
        EventMetadataEntry.text(job_config.job_image, 'Job image'),
        EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
        EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
        EventMetadataEntry.text(str(job_config.service_account_name), 'Service account name'),
    ]
    start_event = dagster_instance.report_engine_event(
        'Executing steps {} in Kubernetes job {}'.format(joined_step_keys, job.metadata.name),
        run,
        EngineEventData(start_metadata, marker_end=DELEGATE_MARKER),
        CeleryK8sJobEngine,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_keys[0],
    )

    # Running list of events generated from this task execution
    collected = [start_event]

    kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    wait_for_job_success(job.metadata.name, namespace=job_namespace)

    pod_names = get_pod_names_in_job(job.metadata.name, namespace=job_namespace)

    # Post engine event for log retrieval
    collected.append(
        dagster_instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            run,
            EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobEngine,
            step_key=step_keys[0],
        )
    )

    # Concatenate the log lines of every pod, in pod order.
    log_lines = [
        line
        for job_pod in pod_names
        for line in retrieve_pod_logs(job_pod, namespace=job_namespace).split('\n')
    ]

    result = parse_raw_log_lines(log_lines)
    handle_execution_errors(result, 'executePlan')
    collected.extend(handle_execute_plan_result(result))

    return [serialize_dagster_namedtuple(item) for item in collected]
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    pipeline_origin_packed,
    resources=None,
    kubeconfig_file=None,
):
    '''Run step execution in a K8s job pod.

    Creates a Kubernetes Job that runs ``dagster api execute_step_with_structured_logs``
    for a single step, waits for it, and collects the Dagster events found in the pod logs.

    Args:
        instance_ref_dict (dict): Dict form of an InstanceRef.
        step_keys (List[str]): Must contain exactly one step key.
        run_config (dict): Run config forwarded in ExecuteStepArgs.
        mode (str): Pipeline mode forwarded in ExecuteStepArgs.
        repo_name (str): Validated as a string; not otherwise used here.
        repo_location_name (str): Validated as a string; not otherwise used here.
        run_id (str): ID of the pipeline run to load from the instance.
        job_config_dict (dict): Dict form of a DagsterK8sJobConfig.
        job_namespace (str): Namespace in which the Job is created.
        load_incluster_config (bool): Load in-cluster kube config when True, otherwise
            load ``kubeconfig_file``.
        retries_dict (dict): Config handed to Retries.from_config.
        pipeline_origin_packed (dict): Packed PipelineOrigin, unpacked with unpack_value.
        resources (Optional[Dict[str, dict]]): Forwarded to construct_dagster_k8s_job.
        kubeconfig_file (Optional[str]): Path passed to load_kube_config.

    Returns:
        Optional[list]: Serialized events for the step, or None when the function exits
        early (run not STARTED, Job creation failed, or the run left STARTED while waiting).
    '''
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')

    pipeline_origin = unpack_value(
        check.dict_param(
            pipeline_origin_packed, 'pipeline_origin_packed'
        )  # TODO: make part of args
    )
    check.inst(pipeline_origin, PipelineOrigin)

    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)

    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))
    step_key = step_keys[0]

    if pipeline_run.status != PipelineRunStatus.STARTED:
        instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData([EventMetadataEntry.text(step_key, 'Step keys')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    k8s_name_key = get_k8s_job_name(run_id, step_key)

    # A non-zero attempt count is appended to the name so each retry gets its own Job name.
    retries = Retries.from_config(retries_dict)
    attempt_number = retries.get_attempt_count(step_key)
    if attempt_number:
        job_name = 'dagster-job-%s-%d' % (k8s_name_key, attempt_number)
    else:
        job_name = 'dagster-job-%s' % (k8s_name_key)
    pod_name = job_name

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=None,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )
    command = ['dagster']
    args = ['api', 'execute_step_with_structured_logs', input_json]

    job = construct_dagster_k8s_job(job_config, command, args, job_name, resources, pod_name)

    # Running list of events generated from this task execution
    events = []

    # Post event for starting execution
    job_name = job.metadata.name
    engine_event = instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(step_key, job_name),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_key, 'Step keys'),
                EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                EventMetadataEntry.text(job_config.job_image, 'Job image'),
                EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
                EventMetadataEntry.text(
                    str(job_config.image_pull_secrets), 'Image pull secrets'
                ),
                EventMetadataEntry.text(
                    str(job_config.service_account_name), 'Service account name'
                ),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )
    events.append(engine_event)

    try:
        kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)
    except kubernetes.client.rest.ApiException as e:
        if e.reason == 'Conflict':
            # There is an existing job with the same name so do not proceed.
            instance.report_engine_event(
                'Did not create Kubernetes job {} for step {} since job name already '
                'exists, exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
                    ],
                    marker_end=DELEGATE_MARKER,
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        else:
            instance.report_engine_event(
                'Encountered unexpected error while creating Kubernetes job {} for step {}, '
                'exiting.'.format(job_name, step_key),
                pipeline_run,
                EngineEventData(
                    [
                        EventMetadataEntry.text(step_key, 'Step keys'),
                        # The metadata entry takes text, so render the exception rather
                        # than passing the exception object itself.
                        EventMetadataEntry.text(str(e), 'Error'),
                    ]
                ),
                CeleryK8sJobExecutor,
                step_key=step_key,
            )
        return

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace, 'Kubernetes Job namespace'),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    engine_event = instance.report_engine_event(
        'Retrieving logs from Kubernetes Job pods',
        pipeline_run,
        EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
        CeleryK8sJobExecutor,
        step_key=step_key,
    )
    events.append(engine_event)

    # Gather the log lines of every pod in the Job, then keep the Dagster events among them.
    logs = []
    for job_pod_name in pod_names:
        raw_logs = retrieve_pod_logs(job_pod_name, namespace=job_namespace)
        logs += raw_logs.split('\n')

    events += filter_dagster_events_from_pod_logs(logs)
    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def _execute_step_docker(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    docker_config,
):
    '''Run step execution in a Docker container.

    Runs ``dagster-graphql ... -p executePlan`` in the image named by
    ``docker_config['image']`` and parses the container output as the JSON result.

    Args:
        instance_ref_dict: Dict form of an InstanceRef.
        step_keys: Step keys to execute; the first key tags the engine events.
        run_config: Sent as ``runConfigData`` in the GraphQL variables.
        mode: Sent as ``mode`` in the GraphQL variables.
        repo_name: Sent as ``repositoryName`` in the selector.
        repo_location_name: Sent as ``repositoryLocationName`` in the selector.
        run_id: ID of the pipeline run to load from the instance.
        docker_config: Mapping read for ``image`` and, optionally, ``registry``
            (``url``/``username``/``password``) and ``env_vars``.

    Returns:
        list: The starting engine event plus the step events from the result, each passed
        through serialize_dagster_namedtuple.

    Raises:
        docker.errors.ContainerError: Re-raised after reporting an engine event.
        JSONDecodeError: Re-raised after reporting an engine event when the container
            output is not valid JSON.
    '''
    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, 'Could not load run {}'.format(run_id))

    step_keys_str = ", ".join(step_keys)

    variables = {
        'executionParams': {
            'runConfigData': run_config,
            'mode': mode,
            'selector': {
                'repositoryLocationName': repo_location_name,
                'repositoryName': repo_name,
                'pipelineName': pipeline_run.pipeline_name,
                'solidSelection': pipeline_run.solid_selection,
            },
            'executionMetadata': {'runId': run_id},
            'stepKeys': step_keys,
        }
    }

    # Pass the command as an argument list instead of a shell-style string: a string is
    # tokenized with shell quoting rules, so a single quote anywhere in the JSON (e.g. in
    # a run config value) would break the previously single-quoted ``-v`` argument.
    command = ['dagster-graphql', '-v', seven.json.dumps(variables), '-p', 'executePlan']
    docker_image = docker_config['image']
    client = docker.client.from_env()

    if docker_config.get('registry'):
        client.login(
            registry=docker_config['registry']['url'],
            username=docker_config['registry']['username'],
            password=docker_config['registry']['password'],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        'Executing steps {} in Docker container {}'.format(step_keys_str, docker_image),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, 'Step keys'),
                EventMetadataEntry.text(docker_image, 'Image'),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=step_keys[0],
    )
    events = [engine_event]

    docker_env = {}
    if docker_config.get('env_vars'):
        docker_env = {env_name: os.getenv(env_name) for env_name in docker_config['env_vars']}

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
        )

        res = seven.json.loads(docker_response)

    except docker.errors.ContainerError as err:
        instance.report_engine_event(
            'Failed to run steps {} in Docker container {}'.format(step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(docker_image, 'Job image'),
                    EventMetadataEntry.text(err.stderr, 'Docker stderr'),
                ],
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )
        raise

    except JSONDecodeError:
        # Only reachable after containers.run succeeded, so docker_response is bound here.
        instance.report_engine_event(
            'Failed to parse response for steps {} from Docker container {}'.format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(docker_image, 'Job image'),
                    EventMetadataEntry.text(docker_response, 'Docker Response'),
                ],
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )
        raise

    else:
        handle_execution_errors(res, 'executePlan')
        step_events = handle_execute_plan_result(res)

    events += step_events

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def _execute_step_docker(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    docker_config,
):
    """Run step execution in a Docker container.

    Runs ``dagster-graphql ... -p executePlan`` in the image named by
    ``docker_config["image"]`` and parses the container output as the JSON result.

    Args:
        instance_ref_dict: Dict form of an InstanceRef.
        step_keys: Step keys to execute; the first key tags the engine events.
        run_config: Sent as ``runConfigData`` in the GraphQL variables.
        mode: Sent as ``mode`` in the GraphQL variables.
        repo_name: Sent as ``repositoryName`` in the selector.
        repo_location_name: Sent as ``repositoryLocationName`` in the selector.
        run_id: ID of the pipeline run to load from the instance.
        docker_config: Mapping read for ``image`` and, optionally, ``registry``
            (``url``/``username``/``password``) and ``env_vars``.

    Returns:
        list: The starting engine event plus the step events from the result, each passed
        through serialize_dagster_namedtuple.

    Raises:
        docker.errors.ContainerError: Re-raised after reporting an engine event.
        JSONDecodeError: Re-raised after reporting an engine event when the container
            output is not valid JSON.
    """
    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))

    step_keys_str = ", ".join(step_keys)

    variables = {
        "executionParams": {
            "runConfigData": run_config,
            "mode": mode,
            "selector": {
                "repositoryLocationName": repo_location_name,
                "repositoryName": repo_name,
                "pipelineName": pipeline_run.pipeline_name,
                "solidSelection": list(pipeline_run.solids_to_execute)
                if pipeline_run.solids_to_execute
                else None,
            },
            "executionMetadata": {"runId": run_id},
            "stepKeys": step_keys,
        }
    }

    # Pass the command as an argument list instead of a shell-style string: a string is
    # tokenized with shell quoting rules, so a single quote anywhere in the JSON (e.g. in
    # a run config value) would break the previously single-quoted ``-v`` argument.
    command = ["dagster-graphql", "-v", seven.json.dumps(variables), "-p", "executePlan"]
    docker_image = docker_config["image"]
    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "Step keys"),
                EventMetadataEntry.text(docker_image, "Image"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=step_keys[0],
    )
    events = [engine_event]

    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
        )

        res = seven.json.loads(docker_response)

    except docker.errors.ContainerError as err:
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ],
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )
        raise

    except JSONDecodeError:
        # Only reachable after containers.run succeeded, so docker_response is bound here.
        instance.report_engine_event(
            "Failed to parse response for steps {} from Docker container {}".format(
                step_keys_str, docker_image
            ),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(docker_response, "Docker Response"),
                ],
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )
        raise

    else:
        handle_execution_errors(res, "executePlan")
        step_events = handle_execute_plan_result(res)

    events += step_events

    serialized_events = [serialize_dagster_namedtuple(event) for event in events]
    return serialized_events
def _execute_step_k8s_job(
    _self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    repo_location_name,
    run_id,
    job_config_dict,
    job_namespace,
    load_incluster_config,
    retries_dict,
    resources=None,
    kubeconfig_file=None,
):
    '''Run a single step through the executePlan GraphQL mutation inside a K8s Job.

    The Job is created in ``job_namespace`` and awaited via wait_for_job_success; the log
    lines of its pods are then parsed into step events.

    Returns:
        Optional[list]: Serialized events for the step, or None when the run is not STARTED
        on entry or stops being STARTED while the Job is awaited (the Job is then deleted).
    '''
    check.dict_param(instance_ref_dict, 'instance_ref_dict')
    check.list_param(step_keys, 'step_keys', of_type=str)
    check.invariant(
        len(step_keys) == 1, 'Celery K8s task executor can only execute 1 step at a time'
    )
    check.dict_param(run_config, 'run_config')
    check.str_param(mode, 'mode')
    check.str_param(repo_name, 'repo_name')
    check.str_param(repo_location_name, 'repo_location_name')
    check.str_param(run_id, 'run_id')

    # Celery will serialize this as a list
    job_config = DagsterK8sJobConfig.from_dict(job_config_dict)
    check.inst_param(job_config, 'job_config', DagsterK8sJobConfig)
    check.str_param(job_namespace, 'job_namespace')
    check.bool_param(load_incluster_config, 'load_incluster_config')
    check.dict_param(retries_dict, 'retries_dict')
    check.opt_dict_param(resources, 'resources', key_type=str, value_type=dict)
    check.opt_str_param(kubeconfig_file, 'kubeconfig_file')

    # For when launched via DinD or running the cluster
    if load_incluster_config:
        kubernetes.config.load_incluster_config()
    else:
        kubernetes.config.load_kube_config(kubeconfig_file)

    dagster_instance = DagsterInstance.from_ref(InstanceRef.from_dict(instance_ref_dict))
    run = dagster_instance.get_run_by_id(run_id)

    check.invariant(run, 'Could not load run {}'.format(run_id))
    step_key = step_keys[0]

    # Guard: do nothing but report when the run is no longer in the STARTED state.
    if run.status != PipelineRunStatus.STARTED:
        dagster_instance.report_engine_event(
            'Not scheduling step because pipeline run status is not STARTED',
            run,
            EngineEventData([EventMetadataEntry.text(step_key, 'Step keys')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        return

    # Ensure we stay below k8s name length limits
    name_key = _get_k8s_name_key(run_id, step_keys)

    retry_config = Retries.from_config(retries_dict)

    # A non-zero attempt count is appended to both the Job and Pod name.
    if retry_config.get_attempt_count(step_key):
        name_parts = (name_key, retry_config.get_attempt_count(step_key))
        job_name = 'dagster-job-%s-%d' % name_parts
        pod_name = 'dagster-job-%s-%d' % name_parts
    else:
        job_name = 'dagster-job-%s' % (name_key)
        pod_name = 'dagster-job-%s' % (name_key)

    selector = {
        'repositoryLocationName': repo_location_name,
        'repositoryName': repo_name,
        'pipelineName': run.pipeline_name,
        'solidSelection': list(run.solids_to_execute) if run.solids_to_execute else None,
    }
    graphql_variables = {
        'executionParams': {
            'runConfigData': run_config,
            'mode': mode,
            'selector': selector,
            'executionMetadata': {'runId': run_id},
            'stepKeys': step_keys,
        },
        'retries': retry_config.to_graphql_input(),
    }
    job_args = [
        '-p',
        'executePlan',
        '-v',
        seven.json.dumps(graphql_variables),
        '--remap-sigterm',
    ]

    job = construct_dagster_graphql_k8s_job(job_config, job_args, job_name, resources, pod_name)

    # Post event for starting execution
    job_name = job.metadata.name
    start_metadata = [
        EventMetadataEntry.text(step_key, 'Step keys'),
        EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
        EventMetadataEntry.text(pod_name, 'Kubernetes Pod name'),
        EventMetadataEntry.text(job_config.job_image, 'Job image'),
        EventMetadataEntry.text(job_config.image_pull_policy, 'Image pull policy'),
        EventMetadataEntry.text(str(job_config.image_pull_secrets), 'Image pull secrets'),
        EventMetadataEntry.text(str(job_config.service_account_name), 'Service account name'),
    ]
    start_event = dagster_instance.report_engine_event(
        'Executing step {} in Kubernetes job {}'.format(step_key, job_name),
        run,
        EngineEventData(start_metadata, marker_end=DELEGATE_MARKER),
        CeleryK8sJobExecutor,
        # validated above that step_keys is length 1, and it is not possible to use ETH or
        # execution plan in this function (Celery K8s workers should not access to user code)
        step_key=step_key,
    )

    # Running list of events generated from this task execution
    collected = [start_event]

    kubernetes.client.BatchV1Api().create_namespaced_job(body=job, namespace=job_namespace)

    try:
        wait_for_job_success(
            job_name=job_name,
            namespace=job_namespace,
            instance=dagster_instance,
            run_id=run_id,
        )
    except DagsterK8sPipelineStatusException:
        dagster_instance.report_engine_event(
            'Terminating Kubernetes Job because pipeline run status is not STARTED',
            run,
            EngineEventData(
                [
                    EventMetadataEntry.text(step_key, 'Step keys'),
                    EventMetadataEntry.text(job_name, 'Kubernetes Job name'),
                    EventMetadataEntry.text(job_namespace, 'Kubernetes Job namespace'),
                ]
            ),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
        delete_job(job_name=job_name, namespace=job_namespace)
        return

    pod_names = get_pod_names_in_job(job_name, namespace=job_namespace)

    # Post engine event for log retrieval
    collected.append(
        dagster_instance.report_engine_event(
            'Retrieving logs from Kubernetes Job pods',
            run,
            EngineEventData([EventMetadataEntry.text('\n'.join(pod_names), 'Pod names')]),
            CeleryK8sJobExecutor,
            step_key=step_key,
        )
    )

    # Concatenate the log lines of every pod, in pod order.
    log_lines = [
        line
        for job_pod in pod_names
        for line in retrieve_pod_logs(job_pod, namespace=job_namespace).split('\n')
    ]

    result = parse_raw_log_lines(log_lines)
    handle_execution_errors(result, 'executePlan')
    collected.extend(handle_execute_plan_result(result))

    return [serialize_dagster_namedtuple(item) for item in collected]
def _execute_step_docker(
    self,
    instance_ref_dict,
    step_keys,
    run_config,
    mode,
    repo_name,
    run_id,
    docker_config,
    pipeline_origin_packed,
    retries_dict,
):
    """Run step execution in a Docker container.

    Runs ``dagster api execute_step_with_structured_logs`` in the image named by
    ``docker_config["image"]`` and returns the non-empty lines of its output as events.

    Args:
        instance_ref_dict (dict): Dict form of an InstanceRef.
        step_keys (List[str]): Step keys to execute; the first key tags the engine events.
        run_config (dict): Run config forwarded in ExecuteStepArgs.
        mode (str): Pipeline mode forwarded in ExecuteStepArgs.
        repo_name (str): Validated as a string; not otherwise used here.
        run_id (str): ID of the pipeline run to load from the instance.
        docker_config (dict): Read for ``image`` and, optionally, ``registry``
            (``url``/``username``/``password``), ``env_vars`` and ``network``.
        pipeline_origin_packed (dict): Packed pipeline origin, unpacked with unpack_value.
        retries_dict (dict): Retry config forwarded in ExecuteStepArgs.

    Returns:
        List[str]: The serialized starting engine event followed by each non-empty line of
        the container output.

    Raises:
        docker.errors.ContainerError: Re-raised after reporting an engine event.
        Exception: If the container run returned no output at all.
    """
    check.dict_param(instance_ref_dict, "instance_ref_dict")
    check.list_param(step_keys, "step_keys", of_type=str)
    check.dict_param(run_config, "run_config")
    check.str_param(mode, "mode")
    check.str_param(repo_name, "repo_name")
    check.str_param(run_id, "run_id")
    check.dict_param(docker_config, "docker_config")
    pipeline_origin = unpack_value(
        check.dict_param(pipeline_origin_packed, "pipeline_origin_packed")
    )
    check.dict_param(retries_dict, "retries_dict")

    instance_ref = InstanceRef.from_dict(instance_ref_dict)
    instance = DagsterInstance.from_ref(instance_ref)
    pipeline_run = instance.get_run_by_id(run_id)
    check.invariant(pipeline_run, "Could not load run {}".format(run_id))

    step_keys_str = ", ".join(step_keys)

    input_json = serialize_dagster_namedtuple(
        ExecuteStepArgs(
            pipeline_origin=pipeline_origin,
            pipeline_run_id=run_id,
            instance_ref=instance_ref,
            mode=mode,
            step_keys_to_execute=step_keys,
            run_config=run_config,
            retries_dict=retries_dict,
        )
    )

    # json.dumps wraps the serialized args in double quotes with escaping, so they form a
    # single argument of the command string.
    command = "dagster api execute_step_with_structured_logs {}".format(json.dumps(input_json))

    docker_image = docker_config["image"]
    client = docker.client.from_env()

    if docker_config.get("registry"):
        client.login(
            registry=docker_config["registry"]["url"],
            username=docker_config["registry"]["username"],
            password=docker_config["registry"]["password"],
        )

    # Post event for starting execution
    engine_event = instance.report_engine_event(
        "Executing steps {} in Docker container {}".format(step_keys_str, docker_image),
        pipeline_run,
        EngineEventData(
            [
                EventMetadataEntry.text(step_keys_str, "Step keys"),
                EventMetadataEntry.text(docker_image, "Image"),
                EventMetadataEntry.text(self.request.hostname, "Celery worker"),
            ],
            marker_end=DELEGATE_MARKER,
        ),
        CeleryDockerExecutor,
        step_key=step_keys[0],
    )

    serialized_events = [serialize_dagster_namedtuple(engine_event)]

    docker_env = {}
    if docker_config.get("env_vars"):
        docker_env = {env_name: os.getenv(env_name) for env_name in docker_config["env_vars"]}

    try:
        docker_response = client.containers.run(
            docker_image,
            command=command,
            detach=False,
            auto_remove=True,
            # pass through this worker's environment for things like AWS creds etc.
            environment=docker_env,
            network=docker_config.get("network", None),
        )
    except docker.errors.ContainerError as err:
        instance.report_engine_event(
            "Failed to run steps {} in Docker container {}".format(step_keys_str, docker_image),
            pipeline_run,
            EngineEventData(
                [
                    EventMetadataEntry.text(docker_image, "Job image"),
                    EventMetadataEntry.text(err.stderr, "Docker stderr"),
                ],
            ),
            CeleryDockerExecutor,
            step_key=step_keys[0],
        )
        raise

    # Check for a missing response BEFORE decoding: decode() never returns None, so a None
    # response would otherwise surface as an AttributeError instead of this explicit error.
    if docker_response is None:
        raise Exception(
            "No response from execute_step_with_structured_logs in CeleryDockerExecutor"
        )

    res = docker_response.decode("utf-8")
    serialized_events += [event for event in res.split("\n") if event]

    return serialized_events