def check_and_fix_integrity(wf_ex):
    """Look for RUNNING tasks whose children have all finished and nudge them.

    For every task execution of the given workflow execution that is in
    RUNNING state, checks whether all of its child executions are completed.
    If so, and enough time has passed since both the task and its most
    recently updated child were last touched, schedules
    "on action complete" processing for the task via the task handler.

    The minimum idle period is read from
    ``CONF.engine.execution_integrity_check_delay``; a negative value
    disables the check.

    :param wf_ex: Workflow execution whose task executions are inspected.
    """
    min_idle_seconds = CONF.engine.execution_integrity_check_delay

    # A negative delay means the integrity check is switched off.
    if min_idle_seconds < 0:
        return

    # Imported locally to break a cyclic dependency.
    from mistral.engine import task_handler

    candidates = db_api.get_task_executions(
        workflow_execution_id=wf_ex.id,
        state=states.RUNNING
    )

    for task_ex in candidates:
        # Use the latest known timestamp of the task execution: it is only
        # eligible for checking once a minimum period has elapsed since
        # its last update.
        last_touched = task_ex.updated_at or task_ex.created_at
        task_idle = timeutils.delta_seconds(last_touched, timeutils.utcnow())

        if task_idle < min_idle_seconds:
            continue

        children = task_ex.executions

        if not children:
            continue

        completion_flags = [
            states.is_completed(child.state) for child in children
        ]

        if not all(completion_flags):
            continue

        # Timestamp of the most recently finished child.
        newest_child_ts = max(
            [child.updated_at or child.created_at for child in children]
        )

        children_idle = timeutils.delta_seconds(
            newest_child_ts,
            timeutils.utcnow()
        )

        if children_idle > min_idle_seconds:
            # A RUNNING task whose child executions are all finished was
            # found. Calling "schedule_on_action_complete" for any of the
            # children makes the task state get calculated and updated
            # properly.
            LOG.warning(
                "Found a task execution that is likely stuck in RUNNING"
                " state because all child executions are finished,"
                " will try to recover [task_execution=%s]",
                task_ex.id
            )

            task_handler.schedule_on_action_complete(children[-1])
def on_action_complete(action_ex, result):
    """Complete an action with the given result and notify its task.

    If completing the action raises ``exc.MistralException``, the error is
    logged, the action is failed with the error message and, when the
    action belongs to a task, that task is force-failed as well. On success
    the task handler is asked to schedule "on action complete" processing
    for the action, again only when the action belongs to a task.

    :param action_ex: Action execution being completed.
    :param result: Result passed to the action's ``complete`` method.
    """
    task_ex = action_ex.task_execution

    action = _build_action(action_ex)

    try:
        action.complete(result)
    except exc.MistralException as e:
        # This function treats "task_ex" as optional (see the checks
        # below), so it must not be dereferenced unconditionally here:
        # otherwise an AttributeError would escape from the handler and
        # the action would never be marked as failed.
        task_name = task_ex.name if task_ex else None

        msg = ("Failed to complete action [action=%s, task=%s]: %s\n%s" %
               (action_ex.name, task_name, e, tb.format_exc()))

        LOG.error(msg)

        action.fail(msg)

        if task_ex:
            task_handler.force_fail_task(task_ex, msg)

        return

    if task_ex:
        task_handler.schedule_on_action_complete(action_ex)
def on_action_complete(action_ex, result):
    """Complete an action with the given result and notify its task.

    If completing the action raises ``exc.MistralException``, the error is
    logged, the action is failed with the error message and, when the
    action belongs to a task, that task is force-failed as well. On success
    the task handler is asked to schedule "on action complete" processing
    for the action, again only when the action belongs to a task.

    :param action_ex: Action execution being completed.
    :param result: Result passed to the action's ``complete`` method.
    """
    task_ex = action_ex.task_execution

    action = _build_action(action_ex)

    try:
        action.complete(result)
    except exc.MistralException as e:
        # This function treats "task_ex" as optional (see the checks
        # below), so it must not be dereferenced unconditionally here:
        # otherwise an AttributeError would escape from the handler and
        # the action would never be marked as failed.
        task_name = task_ex.name if task_ex else None

        msg = (
            "Failed to complete action [error=%s, action=%s, task=%s]:\n%s"
            % (e, action_ex.name, task_name, tb.format_exc())
        )

        LOG.error(msg)

        action.fail(msg)

        if task_ex:
            task_handler.force_fail_task(task_ex, msg)

        return

    if task_ex:
        task_handler.schedule_on_action_complete(action_ex)
def _check_and_fix_integrity(wf_ex_id):
    """Look for stuck RUNNING tasks of a workflow execution and nudge them.

    Loads the workflow execution by id inside a DB transaction. If it is
    already completed nothing is done. Otherwise the next integrity check
    is scheduled and a batch of RUNNING task executions (limited by
    ``CONF.engine.execution_integrity_check_batch_size``) is inspected:
    for each task whose child executions are all completed, and where
    enough time has passed since both the task and its most recently
    updated child were last touched, "on action complete" processing is
    scheduled via the task handler.

    The minimum idle period is read from
    ``CONF.engine.execution_integrity_check_delay``; a negative value
    disables the check.

    :param wf_ex_id: Id of the workflow execution to check.
    """
    min_idle_seconds = CONF.engine.execution_integrity_check_delay

    # A negative delay means the integrity check is switched off.
    if min_idle_seconds < 0:
        return

    # Imported locally to break a cyclic dependency.
    from mistral.engine import task_handler

    with db_api.transaction():
        wf_ex = db_api.get_workflow_execution(wf_ex_id)

        if states.is_completed(wf_ex.state):
            return

        _schedule_check_and_fix_integrity(wf_ex, delay=120)

        candidates = db_api.get_task_executions(
            workflow_execution_id=wf_ex.id,
            state=states.RUNNING,
            limit=CONF.engine.execution_integrity_check_batch_size
        )

        for task_ex in candidates:
            # Use the latest known timestamp of the task execution: it is
            # only eligible for checking once a minimum period has elapsed
            # since its last update.
            last_touched = task_ex.updated_at or task_ex.created_at
            task_idle = timeutils.delta_seconds(
                last_touched,
                timeutils.utcnow()
            )

            if task_idle < min_idle_seconds:
                continue

            children = task_ex.executions

            if not children:
                continue

            completion_flags = [
                states.is_completed(child.state) for child in children
            ]

            if not all(completion_flags):
                continue

            # Timestamp of the most recently finished child.
            newest_child_ts = max(
                [child.updated_at or child.created_at for child in children]
            )

            children_idle = timeutils.delta_seconds(
                newest_child_ts,
                timeutils.utcnow()
            )

            if children_idle > min_idle_seconds:
                # A RUNNING task whose child executions are all finished
                # was found. Calling "schedule_on_action_complete" for any
                # of the children makes the task state get calculated and
                # updated properly.
                LOG.warning(
                    "Found a task execution that is likely stuck in"
                    " RUNNING state because all child executions are"
                    " finished, will try to recover [task_execution=%s]",
                    task_ex.id
                )

                task_handler.schedule_on_action_complete(children[-1])