def monitor_started_run(instance: DagsterInstance, workspace, run, logger):
    """Check the run worker health of a STARTED run; resume it or mark it failed.

    If the launcher reports the worker is no longer RUNNING, the run is resumed
    with a new worker while resume attempts remain; once the configured maximum
    number of resume attempts is exhausted (or the launcher does not support
    resuming at all), the run is reported failed.

    Args:
        instance: the DagsterInstance whose launcher/monitoring config is used.
        workspace: workspace handle passed through to ``resume_run``.
        run: the run being monitored; must be in STARTED status.
        logger: daemon logger for status messages.
    """
    check.invariant(run.status == PipelineRunStatus.STARTED)
    check_health_result = instance.run_launcher.check_run_worker_health(run)
    if check_health_result.status != WorkerStatus.RUNNING:
        num_prev_attempts = count_resume_run_attempts(instance, run)
        if num_prev_attempts < instance.run_monitoring_max_resume_run_attempts:
            msg = (
                f"Detected run worker status {check_health_result}. Resuming run {run.run_id} with "
                "a new worker."
            )
            logger.info(msg)
            instance.report_engine_event(msg, run)
            attempt_number = num_prev_attempts + 1
            instance.resume_run(
                run.run_id,
                workspace,
                attempt_number,
            )
        else:
            if instance.run_launcher.supports_resume_run:
                # BUGFIX: the second segment was previously a plain (non-f) string,
                # so "{max_resume_run_attempts}" appeared literally in the message.
                # Interpolate the actual configured limit instead.
                msg = (
                    f"Detected run worker status {check_health_result}. Marking run {run.run_id} as "
                    "failed, because it has surpassed the configured maximum attempts to resume the run: "
                    f"{instance.run_monitoring_max_resume_run_attempts}."
                )
            else:
                msg = (
                    f"Detected run worker status {check_health_result}. Marking run {run.run_id} as "
                    "failed."
                )
            logger.info(msg)
            instance.report_run_failed(run, msg)
def monitor_starting_run(instance: DagsterInstance, run, logger):
    """Fail a STARTING run whose worker hasn't started within the configured timeout.

    Compares the elapsed time since the run's launch against
    ``instance.run_monitoring_start_timeout_seconds`` and reports the run
    failed once that timeout is exceeded.

    Args:
        instance: the DagsterInstance supplying run stats and the timeout config.
        run: the run being monitored; must be in STARTING status.
        logger: daemon logger for status messages.
    """
    check.invariant(run.status == PipelineRunStatus.STARTING)
    run_stats = instance.get_run_stats(run.run_id)
    check.invariant(
        run_stats.launch_time is not None,
        "Run in status STARTING doesn't have a launch time.",
    )
    # BUGFIX: read the clock once so the elapsed time reported in the message
    # is the same value that was compared against the timeout (previously
    # time.time() was called twice and the two values could differ).
    seconds_since_launch = time.time() - run_stats.launch_time
    if seconds_since_launch >= instance.run_monitoring_start_timeout_seconds:
        msg = (
            f"Run {run.run_id} has been running for {seconds_since_launch} seconds, "
            f"which is longer than the timeout of {instance.run_monitoring_start_timeout_seconds} seconds to start. "
            "Marking run failed"
        )
        logger.info(msg)
        instance.report_run_failed(run, msg)