def _launch_run_and_wait_for_resume(
    dagit_url_for_k8s_run_launcher,
    run_config,
    instance,
    namespace,
    pipeline_name="slow_pipeline",
):
    run_id = None
    try:
        run_id = launch_run_over_graphql(
            dagit_url_for_k8s_run_launcher,
            run_config=run_config,
            pipeline_name=pipeline_name,
            mode="k8s",
        )

        start_time = time.time()
        while True:
            assert time.time() - start_time < 60, "Timed out waiting for run to start"
            run = instance.get_run_by_id(run_id)
            if run.status == PipelineRunStatus.STARTED:
                break
            assert run.status == PipelineRunStatus.STARTING
            time.sleep(1)

        time.sleep(5)

        # Delete the run worker job out from under the run; run monitoring should
        # resume the run in a fresh job and carry it to success.
        assert delete_job(get_job_name_from_run_id(run_id), namespace)

        poll_for_finished_run(instance, run_id, timeout=120)
        assert instance.get_run_by_id(run_id).status == PipelineRunStatus.SUCCESS
    finally:
        # Guard against a NameError if the run never launched.
        if run_id:
            log_run_events(instance, run_id)

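# NOTE: `log_run_events` and `get_job_name_from_run_id` are imported from shared test
# utilities and are not shown in this excerpt. For orientation only, a minimal sketch
# of the logging helper consistent with its usage above might look like the following
# (the real implementation may differ):
#
# def log_run_events(instance, run_id):
#     # Dump every event record for the run so failures leave a trail in CI output.
#     for record in instance.all_logs(run_id):
#         print(str(record))  # pylint: disable=print-call
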
def test_execute_on_celery_k8s_default(  # pylint: disable=redefined-outer-name
    dagster_docker_image,
    dagster_instance,
    helm_namespace,
    dagit_url,
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env.yaml"),
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )

    run_id = launch_run_over_graphql(
        dagit_url, run_config=run_config, pipeline_name="demo_pipeline_celery"
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == dagster_docker_image

def test_execute_queued_run_on_celery_k8s(  # pylint: disable=redefined-outer-name
    dagster_docker_image,
    dagster_instance_for_daemon,
    helm_namespace_for_daemon,
    dagit_url_for_daemon,
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env.yaml"),
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image,
            job_namespace=helm_namespace_for_daemon,
        ),
    )

    run_id = launch_run_over_graphql(
        dagit_url_for_daemon, run_config=run_config, pipeline_name="demo_pipeline_celery"
    )

    wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_daemon
    )

    logs = dagster_instance_for_daemon.all_logs(run_id)

    assert_events_in_order(
        logs,
        [
            "PIPELINE_ENQUEUED",
            "PIPELINE_DEQUEUED",
            "PIPELINE_STARTING",
            "PIPELINE_SUCCESS",
        ],
    )

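# `assert_events_in_order` also comes from the shared test utilities and is not shown
# here. A sketch consistent with its usage above (event type names that must appear in
# this relative order in the run's event log) might be:
#
# def assert_events_in_order(logs, expected_order):
#     logged = [r.dagster_event.event_type_value for r in logs if r.is_dagster_event]
#     idx = 0
#     for expected in expected_order:
#         while idx < len(logged) and logged[idx] != expected:
#             idx += 1
#         assert idx < len(logged), "Did not find {} in order".format(expected)
#         idx += 1
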
def _launch_executor_run(
    dagit_url,
    run_config,
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    pipeline_name="demo_k8s_executor_pipeline",
    num_steps=2,
    mode="default",
):
    run_id = launch_run_over_graphql(
        dagit_url, run_config=run_config, pipeline_name=pipeline_name, mode=mode
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == get_test_project_docker_image()

    events = dagster_instance_for_k8s_run_launcher.all_logs(run_id)
    assert len(_get_step_execution_events(events)) == num_steps

    return run_id

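# `_get_step_execution_events` is a module-level helper whose definition did not survive
# in this excerpt. The definition below is a reconstruction from how it is used above
# (counting how many steps actually executed), assuming one STEP_START event per executed
# step; the original may have filtered differently.
def _get_step_execution_events(events):
    return [
        event
        for event in events
        if event.is_dagster_event
        and event.dagster_event.event_type == DagsterEventType.STEP_START
    ]
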
def test_run_monitoring_fails_on_interrupt(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, helm_namespace, dagit_url
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env.yaml"),
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_job_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )

    pipeline_name = "demo_job_celery"

    run_id = None
    try:
        run_id = launch_run_over_graphql(
            dagit_url, run_config=run_config, pipeline_name=pipeline_name
        )

        start_time = time.time()
        while True:
            # Fail loudly on timeout rather than falling through with the run
            # still in STARTING.
            assert time.time() - start_time < 60, "Timed out waiting for run to start"
            run = dagster_instance.get_run_by_id(run_id)
            if run.status == PipelineRunStatus.STARTED:
                break
            assert run.status == PipelineRunStatus.STARTING
            time.sleep(1)

        # Simulate an interrupt by deleting the run worker job out from under the run.
        assert delete_job(get_job_name_from_run_id(run_id), helm_namespace)
        poll_for_finished_run(dagster_instance, run.run_id, timeout=120)
        assert dagster_instance.get_run_by_id(run_id).status == PipelineRunStatus.FAILURE
    finally:
        # Guard against a NameError if the run never launched.
        if run_id:
            log_run_events(dagster_instance, run_id)

def test_k8s_run_launcher_default(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagit_url_for_k8s_run_launcher,
):
    # sanity check that we have a K8sRunLauncher (no celery workers in the namespace)
    pods = DagsterKubernetesClient.production_client().core_api.list_namespaced_pod(
        namespace=helm_namespace_for_k8s_run_launcher
    )
    celery_pod_names = [p.metadata.name for p in pods.items if "celery-workers" in p.metadata.name]
    check.invariant(not celery_pod_names)

    run_config = load_yaml_from_path(
        os.path.join(get_test_project_environments_path(), "env.yaml")
    )
    pipeline_name = "demo_pipeline"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher, run_config=run_config, pipeline_name=pipeline_name
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == get_test_project_docker_image()

def test_execute_on_celery_k8s_job_api_with_legacy_configmap_set(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, helm_namespace, dagit_url
):
    # Originally, jobs needed to include "dagster-pipeline-env" to pick up needed config when
    # using the helm chart - it's no longer needed, but verify that nothing breaks if it's included
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env.yaml"),
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_job_engine_config(
            dagster_docker_image=dagster_docker_image,
            job_namespace=helm_namespace,
            include_dagster_pipeline_env=True,
        ),
    )

    run_id = launch_run_over_graphql(
        dagit_url, run_config=run_config, pipeline_name="demo_job_celery"
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == dagster_docker_image

def test_failing_k8s_run_launcher(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagit_url_for_k8s_run_launcher,
):
    run_config = load_yaml_from_path(
        os.path.join(get_test_project_environments_path(), "env.yaml")
    )
    pipeline_name = "always_fail_pipeline"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher, run_config=run_config, pipeline_name=pipeline_name
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher
    )

    assert "PIPELINE_SUCCESS" not in result, "unexpected success, result: {}".format(result)

    event_records = dagster_instance_for_k8s_run_launcher.all_logs(run_id)

    assert any("Op Exception Message" in str(event) for event in event_records)

def test_k8s_run_launcher_terminate(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagster_docker_image,
    dagit_url_for_k8s_run_launcher,
):
    pipeline_name = "slow_pipeline"

    run_config = merge_dicts(
        load_yaml_from_path(os.path.join(get_test_project_environments_path(), "env_s3.yaml")),
        {
            "execution": {
                "k8s": {
                    "config": {
                        "job_namespace": helm_namespace_for_k8s_run_launcher,
                        "job_image": dagster_docker_image,
                        "image_pull_policy": image_pull_policy(),
                    }
                }
            },
        },
    )

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher,
        run_config=run_config,
        pipeline_name=pipeline_name,
        mode="k8s",
    )

    wait_for_job(job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher)

    timeout = datetime.timedelta(0, 30)
    start_time = datetime.datetime.now()
    while True:
        assert datetime.datetime.now() < start_time + timeout, "Timed out waiting for can_terminate"
        if can_terminate_run_over_graphql(dagit_url_for_k8s_run_launcher, run_id):
            break
        time.sleep(5)

    terminate_run_over_graphql(dagit_url_for_k8s_run_launcher, run_id=run_id)

    start_time = datetime.datetime.now()
    pipeline_run = None
    while True:
        assert datetime.datetime.now() < start_time + timeout, "Timed out waiting for termination"
        pipeline_run = dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id)
        if pipeline_run.status == PipelineRunStatus.CANCELED:
            break
        time.sleep(5)

    # useful to have logs here, because the worker pods get deleted
    print(dagster_instance_for_k8s_run_launcher.all_logs(run_id))  # pylint: disable=print-call

    assert pipeline_run.status == PipelineRunStatus.CANCELED
    assert not can_terminate_run_over_graphql(dagit_url_for_k8s_run_launcher, run_id)

def test_memoization_on_celery_k8s(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, helm_namespace, dagit_url
):
    ephemeral_prefix = str(uuid.uuid4())
    run_config = deep_merge_dicts(
        merge_yamls([os.path.join(get_test_project_environments_path(), "env_s3.yaml")]),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )
    run_config = deep_merge_dicts(
        run_config,
        {"resources": {"io_manager": {"config": {"s3_prefix": ephemeral_prefix}}}},
    )

    # wrap in try-finally to ensure that memoized results are always cleaned from the s3 bucket
    try:
        run_ids = []
        for _ in range(2):
            run_id = launch_run_over_graphql(
                dagit_url,
                run_config=run_config,
                pipeline_name="memoization_pipeline",
                mode="celery",
            )

            result = wait_for_job_and_get_raw_logs(
                job_name="dagster-run-%s" % run_id, namespace=helm_namespace
            )

            assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

            run_ids.append(run_id)

        # The first run executes all four steps, since nothing has been memoized yet.
        unmemoized_run_id = run_ids[0]
        step_events = _get_step_events(dagster_instance.all_logs(unmemoized_run_id))
        assert len(step_events) == 4

        # The second run should find every output memoized and execute no steps.
        memoized_run_id = run_ids[1]
        step_events = _get_step_events(dagster_instance.all_logs(memoized_run_id))
        assert len(step_events) == 0
    finally:
        cleanup_memoized_results(
            define_memoization_pipeline(), "celery", dagster_instance, run_config
        )

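# `_get_step_events` is likewise missing from this excerpt. A reconstruction from its
# usage above (collecting the step-level events of a run), assuming
# DagsterEvent.is_step_event is the intended filter:
def _get_step_events(event_logs):
    return [
        record.dagster_event
        for record in event_logs
        if record.is_dagster_event and record.dagster_event.is_step_event
    ]
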
def test_execute_on_k8s_retry_pipeline(  # pylint: disable=redefined-outer-name
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagster_docker_image,
    dagit_url_for_k8s_run_launcher,
):
    run_config = merge_dicts(
        load_yaml_from_path(os.path.join(get_test_project_environments_path(), "env_s3.yaml")),
        {
            "execution": {
                "k8s": {
                    "config": {
                        "job_namespace": helm_namespace_for_k8s_run_launcher,
                        "job_image": dagster_docker_image,
                        "image_pull_policy": image_pull_policy(),
                    }
                }
            },
        },
    )

    pipeline_name = "retry_pipeline"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher,
        run_config=run_config,
        pipeline_name=pipeline_name,
        mode="k8s",
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    stats = dagster_instance_for_k8s_run_launcher.get_run_stats(run_id)
    assert stats.steps_succeeded == 1

    all_logs = dagster_instance_for_k8s_run_launcher.all_logs(run_id)
    event_types = [
        event.dagster_event.event_type for event in all_logs if event.is_dagster_event
    ]
    assert DagsterEventType.STEP_START in event_types
    assert DagsterEventType.STEP_UP_FOR_RETRY in event_types
    assert DagsterEventType.STEP_RESTARTED in event_types
    assert DagsterEventType.STEP_SUCCESS in event_types

def test_execute_on_celery_k8s_with_hard_failure(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, set_dagster_k8s_pipeline_run_namespace_env, dagit_url
):
    run_config = merge_dicts(
        merge_dicts(
            merge_yamls(
                [
                    os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
                ]
            ),
            get_celery_engine_config(
                dagster_docker_image=dagster_docker_image,
                job_namespace={"env": "DAGSTER_K8S_PIPELINE_RUN_NAMESPACE"},
            ),
        ),
        {"solids": {"hard_fail_or_0": {"config": {"fail": True}}}},
    )

    run_id = launch_run_over_graphql(dagit_url, run_config=run_config, pipeline_name="hard_failer")

    # Check that the pipeline run is marked as failed
    pipeline_run_status_failure = False
    start_time = datetime.datetime.now()
    timeout = datetime.timedelta(0, 120)

    while datetime.datetime.now() < start_time + timeout:
        pipeline_run = dagster_instance.get_run_by_id(run_id)
        if pipeline_run.status == PipelineRunStatus.FAILURE:
            pipeline_run_status_failure = True
            break
        time.sleep(5)
    assert pipeline_run_status_failure

    # Check for a step failure for hard_fail_or_0
    start_time = datetime.datetime.now()
    step_failure_found = False
    while datetime.datetime.now() < start_time + timeout:
        event_records = dagster_instance.all_logs(run_id)
        for event_record in event_records:
            if event_record.dagster_event:
                if (
                    event_record.dagster_event.event_type == DagsterEventType.STEP_FAILURE
                    and event_record.dagster_event.step_key == "hard_fail_or_0"
                ):
                    step_failure_found = True
                    break
        if step_failure_found:
            # The inner break only exits the for loop; exit the polling loop too
            # instead of sleeping until the timeout expires.
            break
        time.sleep(5)
    assert step_failure_found

def test_k8s_executor_resource_requirements(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagster_docker_image,
    dagit_url_for_k8s_run_launcher,
):
    # sanity check that we have a K8sRunLauncher
    pods = DagsterKubernetesClient.production_client().core_api.list_namespaced_pod(
        namespace=helm_namespace_for_k8s_run_launcher
    )
    celery_pod_names = [p.metadata.name for p in pods.items if "celery-workers" in p.metadata.name]
    check.invariant(not celery_pod_names)

    run_config = merge_dicts(
        load_yaml_from_path(os.path.join(get_test_project_environments_path(), "env_s3.yaml")),
        {
            "execution": {
                "k8s": {
                    "config": {
                        "job_namespace": helm_namespace_for_k8s_run_launcher,
                        "job_image": dagster_docker_image,
                        "image_pull_policy": image_pull_policy(),
                    }
                }
            },
        },
    )

    pipeline_name = "resources_limit_pipeline"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher,
        run_config=run_config,
        pipeline_name=pipeline_name,
        mode="k8s",
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == get_test_project_docker_image()

def test_k8s_run_launcher_image_from_origin(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagit_url_for_k8s_run_launcher,
):
    # Like the previous test, but the executor doesn't supply an image - it's pulled
    # from the origin on the run instead
    pods = DagsterKubernetesClient.production_client().core_api.list_namespaced_pod(
        namespace=helm_namespace_for_k8s_run_launcher
    )
    celery_pod_names = [p.metadata.name for p in pods.items if "celery-workers" in p.metadata.name]
    check.invariant(not celery_pod_names)

    run_config = merge_dicts(
        load_yaml_from_path(os.path.join(get_test_project_environments_path(), "env.yaml")),
        load_yaml_from_path(os.path.join(get_test_project_environments_path(), "env_s3.yaml")),
        {
            "execution": {
                "k8s": {
                    "config": {
                        "job_namespace": helm_namespace_for_k8s_run_launcher,
                        "image_pull_policy": image_pull_policy(),
                    }
                }
            },
        },
    )

    pipeline_name = "demo_k8s_executor_pipeline"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher, run_config=run_config, pipeline_name=pipeline_name
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    updated_run = dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id)
    assert updated_run.tags[DOCKER_IMAGE_TAG] == get_test_project_docker_image()

def test_execute_on_celery_k8s_retry_pipeline(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, helm_namespace, dagit_url
):
    run_config = merge_dicts(
        merge_yamls([os.path.join(get_test_project_environments_path(), "env_s3.yaml")]),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )

    run_id = launch_run_over_graphql(
        dagit_url, run_config=run_config, pipeline_name="retry_pipeline"
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

    stats = dagster_instance.get_run_stats(run_id)
    assert stats.steps_succeeded == 1

    # Fetch the event log once rather than re-querying it for each assertion.
    event_types = [
        event.dagster_event.event_type
        for event in dagster_instance.all_logs(run_id)
        if event.is_dagster_event
    ]
    assert DagsterEventType.STEP_START in event_types
    assert DagsterEventType.STEP_UP_FOR_RETRY in event_types
    assert DagsterEventType.STEP_RESTARTED in event_types
    assert DagsterEventType.STEP_SUCCESS in event_types

def test_k8s_run_launcher_with_celery_executor_fails(
    dagster_docker_image,
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagit_url_for_k8s_run_launcher,
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env.yaml"),
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image,
            job_namespace=helm_namespace_for_k8s_run_launcher,
        ),
    )

    pipeline_name = "demo_pipeline_celery"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher, run_config=run_config, pipeline_name=pipeline_name
    )

    timeout = datetime.timedelta(0, 120)
    start_time = datetime.datetime.now()
    while True:
        assert (
            datetime.datetime.now() < start_time + timeout
        ), "Timed out waiting for pipeline failure"

        event_records = dagster_instance_for_k8s_run_launcher.all_logs(run_id)

        found_pipeline_failure = False
        for event_record in event_records:
            if event_record.dagster_event:
                if event_record.dagster_event.event_type == DagsterEventType.PIPELINE_FAILURE:
                    found_pipeline_failure = True

        if found_pipeline_failure:
            break

        time.sleep(5)

    assert (
        dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id).status
        == PipelineRunStatus.FAILURE
    )

def test_execute_on_celery_k8s_with_resource_requirements(  # pylint: disable=redefined-outer-name
    dagster_docker_image, dagster_instance, helm_namespace, dagit_url
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )

    run_id = launch_run_over_graphql(
        dagit_url, run_config=run_config, pipeline_name="resources_limit_pipeline"
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

def test_k8s_run_launcher_terminate_no_executor_config(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagit_url_for_k8s_run_launcher,
):
    # Renamed from test_k8s_run_launcher_terminate: a second definition with the same
    # name would silently shadow the earlier test above. This variant launches with
    # only the env_s3.yaml run config and no explicit k8s executor config.
    pipeline_name = "slow_pipeline"

    run_config = load_yaml_from_path(
        os.path.join(get_test_project_environments_path(), "env_s3.yaml")
    )

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher, run_config=run_config, pipeline_name=pipeline_name
    )

    wait_for_job(job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher)

    timeout = datetime.timedelta(0, 30)
    start_time = datetime.datetime.now()
    while True:
        assert datetime.datetime.now() < start_time + timeout, "Timed out waiting for can_terminate"
        if can_terminate_run_over_graphql(dagit_url_for_k8s_run_launcher, run_id):
            break
        time.sleep(5)

    terminate_run_over_graphql(dagit_url_for_k8s_run_launcher, run_id=run_id)

    start_time = datetime.datetime.now()
    pipeline_run = None
    while True:
        assert datetime.datetime.now() < start_time + timeout, "Timed out waiting for termination"
        pipeline_run = dagster_instance_for_k8s_run_launcher.get_run_by_id(run_id)
        if pipeline_run.status == PipelineRunStatus.CANCELED:
            break
        time.sleep(5)

    assert pipeline_run.status == PipelineRunStatus.CANCELED
    assert not can_terminate_run_over_graphql(dagit_url_for_k8s_run_launcher, run_id)

def test_volume_mounts(dagster_docker_image, dagster_instance, helm_namespace, dagit_url):
    run_config = deep_merge_dicts(
        merge_yamls([os.path.join(get_test_project_environments_path(), "env_s3.yaml")]),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )

    run_id = launch_run_over_graphql(
        dagit_url,
        run_config=run_config,
        pipeline_name="volume_mount_pipeline",
        mode="celery",
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

def test_k8s_run_launcher_secret_from_deployment(
    helm_namespace_for_k8s_run_launcher,
    dagit_url_for_k8s_run_launcher,
):
    # This run_config requires that WORD_FACTOR be set on both the user code deployment
    # and the run launcher. It will only work if secrets are propagated from the deployment
    # to the run launcher, since TEST_DEPLOYMENT_SECRET_NAME is only set on the user code
    # deployment but not on the run launcher config.
    run_config = load_yaml_from_path(
        os.path.join(get_test_project_environments_path(), "env_config_from_secrets.yaml")
    )
    pipeline_name = "demo_pipeline"

    run_id = launch_run_over_graphql(
        dagit_url_for_k8s_run_launcher, run_config=run_config, pipeline_name=pipeline_name
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace_for_k8s_run_launcher
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

def test_execute_subset_on_celery_k8s(  # pylint: disable=redefined-outer-name
    dagster_docker_image, helm_namespace, dagit_url
):
    run_config = merge_dicts(
        merge_yamls(
            [
                os.path.join(get_test_project_environments_path(), "env_subset.yaml"),
                os.path.join(get_test_project_environments_path(), "env_s3.yaml"),
            ]
        ),
        get_celery_engine_config(
            dagster_docker_image=dagster_docker_image, job_namespace=helm_namespace
        ),
    )

    run_id = launch_run_over_graphql(
        dagit_url,
        run_config=run_config,
        pipeline_name="demo_pipeline_celery",
        solid_selection=["count_letters"],
    )

    result = wait_for_job_and_get_raw_logs(
        job_name="dagster-run-%s" % run_id, namespace=helm_namespace
    )

    assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

def test_memoization_k8s_executor(
    dagster_instance_for_k8s_run_launcher,
    helm_namespace_for_k8s_run_launcher,
    dagster_docker_image,
    dagit_url_for_k8s_run_launcher,
):
    ephemeral_path = str(uuid.uuid4())
    run_config = deep_merge_dicts(
        load_yaml_from_path(os.path.join(get_test_project_environments_path(), "env_s3.yaml")),
        {
            "execution": {
                "k8s": {
                    "config": {
                        "job_namespace": helm_namespace_for_k8s_run_launcher,
                        "job_image": dagster_docker_image,
                        "image_pull_policy": image_pull_policy(),
                    }
                }
            },
        },
    )

    run_config = deep_merge_dicts(
        run_config,
        {"resources": {"io_manager": {"config": {"s3_prefix": ephemeral_path}}}},
    )

    # wrap in try-finally to ensure that memoized results are always cleaned from the s3 bucket
    try:
        pipeline_name = "memoization_pipeline"

        run_ids = []
        for _ in range(2):
            run_id = launch_run_over_graphql(
                dagit_url_for_k8s_run_launcher,
                run_config=run_config,
                pipeline_name=pipeline_name,
                mode="k8s",
            )

            result = wait_for_job_and_get_raw_logs(
                job_name="dagster-run-%s" % run_id,
                namespace=helm_namespace_for_k8s_run_launcher,
            )

            assert "PIPELINE_SUCCESS" in result, "no match, result: {}".format(result)

            run_ids.append(run_id)

        # We expect that the first run should have to run the step, since it has not yet
        # been memoized.
        unmemoized_run_id = run_ids[0]
        events = dagster_instance_for_k8s_run_launcher.all_logs(unmemoized_run_id)
        assert len(_get_step_execution_events(events)) == 1

        # We expect that the second run should not have to run the step, since it has
        # been memoized.
        memoized_run_id = run_ids[1]
        events = dagster_instance_for_k8s_run_launcher.all_logs(memoized_run_id)
        assert len(_get_step_execution_events(events)) == 0
    finally:
        cleanup_memoized_results(
            define_memoization_pipeline(),
            "k8s",
            dagster_instance_for_k8s_run_launcher,
            run_config,
        )

def _test_termination(dagit_url, dagster_instance, run_config):
    run_id = launch_run_over_graphql(
        dagit_url, run_config=run_config, pipeline_name="resource_pipeline"
    )

    # Wait for the run to become terminable
    timeout = datetime.timedelta(0, 120)
    start_time = datetime.datetime.now()
    while True:
        assert datetime.datetime.now() < start_time + timeout, "Timed out waiting for can_terminate"
        if can_terminate_run_over_graphql(dagit_url, run_id):
            break
        time.sleep(5)

    # Wait for step to start
    step_start_found = False
    start_time = datetime.datetime.now()
    while datetime.datetime.now() < start_time + timeout:
        event_records = dagster_instance.all_logs(run_id)
        for event_record in event_records:
            if (
                event_record.dagster_event
                and event_record.dagster_event.event_type == DagsterEventType.STEP_START
            ):
                step_start_found = True
                break

        if step_start_found:
            break

        time.sleep(5)
    assert step_start_found

    # Terminate run
    assert can_terminate_run_over_graphql(dagit_url, run_id=run_id)
    terminate_run_over_graphql(dagit_url, run_id=run_id)

    # Check that pipeline run is marked as canceled
    pipeline_run_status_canceled = False
    start_time = datetime.datetime.now()
    while datetime.datetime.now() < start_time + timeout:
        pipeline_run = dagster_instance.get_run_by_id(run_id)
        if pipeline_run.status == PipelineRunStatus.CANCELED:
            pipeline_run_status_canceled = True
            break
        time.sleep(5)
    assert pipeline_run_status_canceled

    # Check that terminate cannot be called again
    assert not can_terminate_run_over_graphql(dagit_url, run_id=run_id)

    # Check for step failure and resource tear down
    expected_events_found = False
    start_time = datetime.datetime.now()
    while datetime.datetime.now() < start_time + timeout:
        step_failures_count = 0
        resource_tear_down_count = 0
        resource_init_count = 0
        termination_request_count = 0
        termination_success_count = 0
        event_records = dagster_instance.all_logs(run_id)
        for event_record in event_records:
            if event_record.dagster_event:
                if event_record.dagster_event.event_type == DagsterEventType.STEP_FAILURE:
                    step_failures_count += 1
                elif event_record.dagster_event.event_type == DagsterEventType.PIPELINE_CANCELING:
                    termination_request_count += 1
                elif event_record.dagster_event.event_type == DagsterEventType.PIPELINE_CANCELED:
                    termination_success_count += 1
            elif event_record.message:
                if "initializing s3_resource_with_context_manager" in event_record.message:
                    resource_init_count += 1
                if "tearing down s3_resource_with_context_manager" in event_record.message:
                    resource_tear_down_count += 1
        if (
            step_failures_count == 1
            and resource_init_count == 1
            and resource_tear_down_count == 1
            and termination_request_count == 1
            and termination_success_count == 1
        ):
            expected_events_found = True
            break
        time.sleep(5)
    assert expected_events_found

    s3 = boto3.resource("s3", region_name="us-west-1", use_ssl=True, endpoint_url=None).meta.client
    bucket = "dagster-scratch-80542c2"
    key = "resource_termination_test/{}".format(run_id)
    assert s3.get_object(Bucket=bucket, Key=key)