Beispiel #1
0
def monitor_started_run(instance: DagsterInstance, workspace, run, logger):
    """Check the worker of a STARTED run and resume or fail the run if it is not running.

    The run launcher is asked for the health of the run's worker. If the reported
    status is anything other than ``WorkerStatus.RUNNING``, the run is either
    resumed with a new worker (while the number of previous resume attempts is
    below ``instance.run_monitoring_max_resume_run_attempts``) or reported as
    failed.

    Args:
        instance: Instance used to query the run launcher, report events and
            resume or fail the run.
        workspace: Passed through unchanged to ``instance.resume_run``.
        run: The run to monitor; its status must be ``PipelineRunStatus.STARTED``.
        logger: Logger that receives the same message that is reported on the run.
    """
    check.invariant(run.status == PipelineRunStatus.STARTED)
    check_health_result = instance.run_launcher.check_run_worker_health(run)
    if check_health_result.status != WorkerStatus.RUNNING:
        num_prev_attempts = count_resume_run_attempts(instance, run)
        max_attempts = instance.run_monitoring_max_resume_run_attempts
        if num_prev_attempts < max_attempts:
            msg = (
                f"Detected run worker status {check_health_result}. Resuming run {run.run_id} with "
                "a new worker."
            )
            logger.info(msg)
            instance.report_engine_event(msg, run)
            attempt_number = num_prev_attempts + 1
            instance.resume_run(
                run.run_id,
                workspace,
                attempt_number,
            )
        else:
            # The attempt limit is only mentioned when the launcher can resume
            # runs at all; otherwise the plain failure message is used.
            if instance.run_launcher.supports_resume_run:
                # Both fragments must be f-strings: previously the second one was a
                # plain string, so the placeholder was emitted verbatim.
                msg = (
                    f"Detected run worker status {check_health_result}. Marking run {run.run_id} as "
                    "failed, because it has surpassed the configured maximum attempts to resume "
                    f"the run: {max_attempts}."
                )
            else:
                msg = (
                    f"Detected run worker status {check_health_result}. Marking run {run.run_id} as "
                    "failed."
                )
            logger.info(msg)
            instance.report_run_failed(run, msg)
Beispiel #2
0
def monitor_starting_run(instance: DagsterInstance, run, logger):
    """Fail a STARTING run whose time since launch has reached the start timeout.

    The run's launch time is read from ``instance.get_run_stats``. When the time
    elapsed since then is at least
    ``instance.run_monitoring_start_timeout_seconds``, the timeout message is
    logged and the run is reported as failed; otherwise nothing happens.

    Args:
        instance: Instance used to read run stats and the timeout, and to
            report the failure.
        run: The run to monitor; its status must be ``PipelineRunStatus.STARTING``.
        logger: Logger that receives the failure message.
    """
    check.invariant(run.status == PipelineRunStatus.STARTING)
    stats = instance.get_run_stats(run.run_id)

    check.invariant(
        stats.launch_time is not None,
        "Run in status STARTING doesn't have a launch time.",
    )

    waited = time.time() - stats.launch_time
    timed_out = waited >= instance.run_monitoring_start_timeout_seconds
    if not timed_out:
        return

    # The clock is read again here, so the reported duration is measured at
    # message-construction time rather than at the comparison above.
    failure_msg = (
        f"Run {run.run_id} has been running for {time.time() - stats.launch_time} seconds, "
        f"which is longer than the timeout of {instance.run_monitoring_start_timeout_seconds} seconds to start. "
        "Marking run failed"
    )
    logger.info(failure_msg)
    instance.report_run_failed(run, failure_msg)