def _killall(self):
    """
    Sends a stop (SIGTERM) signal to all worker processes.
    """
    for process in self._processes.values():
        logger.info("process: sending SIGTERM to pid={}".format(process.pid))
        process.terminate()
def target(self):
    """
    Supervisor's main "target", as defined in the `multiprocessing` API. It's
    the code that the manager will execute once started.
    """
    # handle signals
    self.bind_signal_handlers()

    # protection against double use of ".start()"
    if len(self._processes) != 0:
        raise Exception("Child processes map is not empty, already called .start()?")

    # wait for all processes to finish
    while True:
        # if terminating, join all processes and exit the loop so we finish
        # the supervisor process
        if self._terminating:
            for proc in self._processes.values():
                logger.info("process: waiting for process={} to finish.".format(proc))
                proc.wait()
            break

        # start worker processes
        self._cleanup_worker_processes()
        self._start_worker_processes()

        # Re-evaluate state at least every 5 seconds. If a SIGCHLD happens during
        # the "time.sleep()" below, it will be interrupted, making the code above
        # run nearly immediately; but if a SIGCHLD happens during the two calls
        # above, the "time.sleep()" here won't be stopped, so better have it
        # relatively short. Not too short either, since the methods above involve
        # scanning a bunch of entries in /proc, which could become slow if we did
        # it every 0.1s.
        time.sleep(5)
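For reference, here is a minimal sketch of what `bind_signal_handlers()` could install, assuming the graceful-shutdown handler shown further below; the real Supervisor may wire things differently (for instance it may also handle SIGCHLD to wake the loop above earlier):

import signal

def bind_signal_handlers(self):
    def _handle_graceful_shutdown(signum, frame):
        # Assumed behaviour: any termination signal triggers Supervisor.terminate().
        self.terminate()

    # SIGTERM and SIGINT both request a graceful shutdown of the workers.
    signal.signal(signal.SIGTERM, _handle_graceful_shutdown)
    signal.signal(signal.SIGINT, _handle_graceful_shutdown)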
def run(self, token, task):
    ppid = os.getppid()
    while True:
        time.sleep(self._interval)
        # If our parent PID changed, the process that spawned us is gone:
        # stop heartbeating immediately.
        if os.getppid() != ppid:
            os._exit(1)
        try:
            logger.info("heartbeat {} for task {}".format(
                time.time(), task.activity_type.name))
        except Exception:
            # Do not crash for debug
            pass
        try:
            response = self.send_heartbeat(token)
        except swf.exceptions.DoesNotExistError:
            # Either the task or the workflow execution no longer exists.
            logger.warning(
                "task {} no longer exists. Stopping heartbeat".format(
                    task.activity_type.name))
            return
        except Exception as error:
            # Let's crash if it cannot notify the heartbeat failed.
            logger.error("cannot send heartbeat for task {}: {}".format(
                task.activity_type.name, error))
            raise
        if response and response.get("cancelRequested"):
            return
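For illustration, such a heartbeater is typically run in a process of its own next to the actual task. A hypothetical wiring; the `Heartbeater` constructor, `execute_task` and the variable names are assumptions for the example, not the library's confirmed API:

import multiprocessing

heartbeater = Heartbeater(interval=30)  # assumed constructor
proc = multiprocessing.Process(target=heartbeater.run, args=(token, task))
proc.start()
try:
    result = execute_task(task)  # placeholder for the real work
finally:
    # Stop heartbeating once the task is done (or has failed).
    proc.terminate()
    proc.join()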
def terminate(self):
    """
    Terminate all worker processes managed by this Supervisor.
    """
    self._terminating = True
    logger.info(
        "process: will stop workers, this might take up to several minutes. "
        "Please be patient.")
    self._killall()
def process(self, poller, token, task):
    """
    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    """
    logger.debug('ActivityWorker.process() pid={}'.format(os.getpid()))
    try:
        activity = self.dispatch(task)
        input = format.decode(task.input)
        args = input.get('args', ())
        kwargs = input.get('kwargs', {})
        context = sanitize_activity_context(task.context)
        context['domain_name'] = poller.domain.name
        if input.get('meta', {}).get('binaries'):
            download_binaries(input['meta']['binaries'])
        result = ActivityTask(activity, *args, context=context, **kwargs).execute()
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        logger.exception("process error: {}".format(str(exc_value)))
        if isinstance(exc_value, ExecutionError) and len(exc_value.args):
            details = exc_value.args[0]
            reason = format_exc(exc_value)  # FIXME json.loads and rebuild?
        else:
            tb = traceback.format_tb(exc_traceback)
            reason = format_exc(exc_value)
            details = json_dumps(
                {
                    'error': exc_type.__name__,
                    'message': str(exc_value),
                    'traceback': tb,
                },
                default=repr,
            )
        return poller.fail_with_retry(token, task, reason=reason, details=details)

    try:
        logger.info('completing activity')
        poller.complete_with_retry(token, result)
    except Exception as err:
        logger.exception("complete error")
        reason = 'cannot complete task {}: {} {}'.format(
            task.activity_id,
            err.__class__.__name__,
            err,
        )
        poller.fail_with_retry(token, task, reason)
def _download_binary(self):
    logger.info(
        "Downloading binary: {} -> {}".format(
            self.remote_location, self.local_location
        )
    )
    bucket, path = self.remote_location.replace("s3://", "", 1).split("/", 1)
    # with FileLock(dest):
    pull(bucket, path, self.local_location)
    os.chmod(self.local_location, 0o755)
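The `pull()` helper is not shown here; a minimal sketch of what such an S3 download could look like with boto3 (the actual implementation may use a different client or add locking and retries):

import boto3

def pull(bucket, path, dest):
    # Download s3://<bucket>/<path> into the local file <dest>.
    boto3.client("s3").download_file(bucket, path, dest)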
def start(self):
    """
    Used to start the Supervisor process once it's configured. Has to be
    called explicitly on a Supervisor instance so it starts (no auto-start
    from __init__()).
    """
    logger.info('starting {}'.format(self._payload))
    if self._background:
        p = multiprocessing.Process(target=self.target)
        p.start()
    else:
        self.target()
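A hypothetical usage sketch; the `Supervisor` constructor arguments shown here (a payload callable and a background flag) are assumptions for illustration and may not match the actual signature:

# Run a poller under the supervisor, without blocking the current process.
supervisor = Supervisor(payload=poller.start, background=True)  # assumed signature
supervisor.start()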
def process_decision(poller, decision_response):
    # type: (DeciderPoller, Response) -> None
    workflow_id = decision_response.execution.workflow_id
    workflow_str = "workflow {} ({})".format(workflow_id, poller.workflow_name)
    logger.debug("process_decision() pid={}".format(os.getpid()))
    logger.info("taking decision for {}".format(workflow_str))
    format.JUMBO_FIELDS_MEMORY_CACHE.clear()
    decisions = poller.decide(decision_response)
    try:
        logger.info("completing decision for {}".format(workflow_str))
        poller.complete_with_retry(decision_response.token, decisions)
    except Exception as err:
        logger.error("cannot complete decision for {}: {}".format(workflow_str, err))
def run_once(self):
    """
    Run the main poller process and exit after the first task is processed.
    """
    logger.info("starting %s on domain %s", self.name, self.domain.name)
    self.bind_signal_handlers()
    self.is_alive = True
    self.set_process_name()
    while self.is_alive:
        try:
            response = self.poll_with_retry()
        except swf.exceptions.PollTimeout:
            continue
        self.process(response)
        break
def start(self):
    """
    Start the main poller process. There is no daemonization. The process
    is intended to be run inside a supervisor process.
    """
    logger.info("starting %s on domain %s", self.name, self.domain.name)
    self.bind_signal_handlers()
    self.is_alive = True
    self.set_process_name()
    while self.is_alive:
        try:
            response = self.poll_with_retry()
        except swf.exceptions.PollTimeout:
            continue
        self.process(response)
def activity_rerun(domain, workflow_id, run_id, input, scheduled_id, activity_id):
    # handle params
    if not activity_id and not scheduled_id:
        logger.error("Please supply --scheduled-id or --activity-id.")
        sys.exit(1)

    input_override = None
    if input:
        input_override = format.decode(input)

    # find workflow execution
    try:
        wfe = helpers.get_workflow_execution(domain, workflow_id, run_id)
    except (swf.exceptions.DoesNotExistError, IndexError):
        logger.error("Couldn't find execution, exiting.")
        sys.exit(1)
    logger.info("Found execution: workflowId={} runId={}".format(
        wfe.workflow_id, wfe.run_id))

    # now rerun the specified activity
    history = History(wfe.history())
    history.parse()
    task, args, kwargs, meta, params = helpers.find_activity(
        history,
        scheduled_id=scheduled_id,
        activity_id=activity_id,
        input=input_override,
    )
    kwargs["context"].update({
        "workflow_id": wfe.workflow_id,
        "run_id": wfe.run_id,
    })
    logger.debug("Found activity. Last execution:")
    for line in json_dumps(params, pretty=True).split("\n"):
        logger.debug(line)
    if input_override:
        logger.info("NB: input will be overridden with the passed one!")
    logger.info("Will re-run: {}(*{}, **{}) [+meta={}]".format(
        task, args, kwargs, meta))

    # download binaries if needed
    download_binaries(meta.get("binaries", {}))

    # execute the activity task with the correct arguments
    instance = ActivityTask(task, *args, **kwargs)
    result = instance.execute()
    if hasattr(instance, "post_execute"):
        instance.post_execute()
    logger.info("Result (JSON): {}".format(json_dumps(result, compact=False)))
def _handle_graceful_shutdown(signum, frame):
    signals_map = {2: "SIGINT", 15: "SIGTERM"}
    signal_name = signals_map.get(signum, signum)
    logger.info("process: caught signal signal={} pid={}".format(
        signal_name, os.getpid()))
    self.terminate()
def _handle_graceful_shutdown(signum, frame):
    logger.info("process: caught signal signal=SIGTERM pid={}".format(os.getpid()))
    self.stop_gracefully()
def spawn_kubernetes_job(poller, swf_response):
    logger.info("scheduling new kubernetes job name={}".format(poller.job_name))
    job = KubernetesJob(poller.job_name, poller.domain.name, swf_response)
    job.schedule()
def submit(self, func, *args, **kwargs):
    logger.info('executing task {}(args={}, kwargs={})'.format(func, args, kwargs))

    future = futures.Future()

    context = self.get_run_context()
    context["activity_id"] = str(self.nb_activities)
    self.nb_activities += 1

    # Ensure signals ordering
    if isinstance(func, SignalTask):
        self.signals_sent.add(func.name)
    elif isinstance(func, WaitForSignal):
        signal_name = func.signal_name
        if signal_name not in self.signals_sent:
            raise NotImplementedError(
                'wait_signal({}) before signal was sent: unsupported by the local executor'.format(
                    signal_name)
            )
    elif isinstance(func, MarkerTask):
        self._markers.setdefault(func.name, []).append(Marker(func.name, func.details))

    if isinstance(func, Submittable):
        task = func  # *args, **kwargs already resolved.
        task.context = context
        func = getattr(task, 'activity', None)
    elif isinstance(func, Activity):
        task = ActivityTask(func, context=context, *args, **kwargs)
    elif issubclass(func, Workflow):
        task = WorkflowTask(self, func, *args, **kwargs)
    else:
        raise TypeError('invalid type {} for {}'.format(type(func), func))

    if isinstance(task, WorkflowTask):
        self.on_new_workflow(task)

    try:
        future._result = task.execute()
        if hasattr(task, 'post_execute'):
            task.post_execute()
        state = 'completed'
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        future._exception = exc_value
        logger.exception('rescuing exception: {}'.format(exc_value))
        if (isinstance(func, Activity) or issubclass_(func, Workflow)) and getattr(
                func, 'raises_on_failure', None):
            tb = traceback.format_tb(exc_traceback)
            message = format_exc(exc_value)
            details = json_dumps(
                {
                    'error': exc_type.__name__,
                    'message': str(exc_value),
                    'traceback': tb,
                },
                default=repr,
            )
            raise exceptions.TaskFailed(
                func.name,
                message,
                details,
            )
        state = 'failed'
    finally:
        if isinstance(task, WorkflowTask):
            self.on_completed_workflow()
        future._state = futures.FINISHED

    if func:
        self._history.add_activity_task(
            func,
            decision_id=None,
            last_state=state,
            activity_id=context["activity_id"],
            input={'args': args, 'kwargs': kwargs},
            result=future.result,
        )
    return future
def on_terminate(p):
    logger.info('process: terminated pid={} retcode={}'.format(p.pid, p.returncode))
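A callback like this is the kind of hook psutil.wait_procs() accepts. As a minimal sketch, a reap_process_tree() helper could use it as shown below, assuming the escalation is simply SIGTERM then SIGKILL (the actual implementation may differ):

import psutil

def reap_process_tree(pid, wait_timeout=5):
    # Terminate a process and all of its descendants, then kill whatever
    # is still alive after the timeout.
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    procs = parent.children(recursive=True) + [parent]
    for proc in procs:
        proc.terminate()
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout, callback=on_terminate)
    for proc in alive:
        proc.kill()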
def stop_gracefully(self):
    """
    Stop the actor processes and subprocesses.
    """
    logger.info('stopping %s', self.name)
    self.is_alive = False  # No longer take requests.
def spawn(poller, token, task, heartbeat=60):
    """
    Spawn a process and wait for it to end, sending heartbeats to SWF.

    On activity timeouts and termination, we reap the worker process and
    its children.

    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    """
    logger.info(
        "spawning new activity worker pid={} heartbeat={}".format(
            os.getpid(), heartbeat
        )
    )
    worker = multiprocessing.Process(target=process_task, args=(poller, token, task))
    worker.start()

    def worker_alive():
        return psutil.pid_exists(worker.pid)

    while worker_alive():
        worker.join(timeout=heartbeat)
        if not worker_alive():
            # Most certainly unneeded: we'll see
            if worker.exitcode is None:
                # race condition, try and re-join
                worker.join(timeout=0)
                if worker.exitcode is None:
                    logger.warning(
                        "process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                            worker.pid
                        )
                    )
            if worker.exitcode != 0:
                poller.fail_with_retry(
                    token,
                    task,
                    reason="process {} died: exit code {}".format(
                        worker.pid, worker.exitcode
                    ),
                )
            return
        try:
            logger.debug("heartbeating for pid={} (token={})".format(worker.pid, token))
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as error:
            # Either the task or the workflow execution no longer exists,
            # let's kill the worker process.
            logger.warning("heartbeat failed: {}".format(error))
            logger.warning("killing (KILL) worker with pid={}".format(worker.pid))
            reap_process_tree(worker.pid)
            return
        except swf.exceptions.RateLimitExceededError as error:
            # ignore rate limit errors: high chances the next heartbeat will be
            # ok anyway, so it would be stupid to break the task for that
            logger.warning(
                'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                    task.activity_type.name, error
                )
            )
            continue
        except Exception as error:
            # Let's crash if it cannot notify the heartbeat failed. The
            # subprocess will become orphan and the heartbeat timeout may
            # eventually trigger on Amazon SWF side.
            logger.error(
                "cannot send heartbeat for task {}: {}".format(
                    task.activity_type.name, error
                )
            )
            raise

        # Task cancelled.
        if response and response.get("cancelRequested"):
            reap_process_tree(worker.pid)
            return
def standalone(
    context,
    workflow,
    domain,
    workflow_id,
    execution_timeout,
    tags,
    decision_tasks_timeout,
    input,
    input_file,
    nb_workers,
    nb_deciders,
    heartbeat,
    display_status,
    repair,
    force_activities,
):
    """
    This command spawns a decider and an activity worker to execute a workflow
    from a single main process.
    """
    disable_boto_connection_pooling()

    if force_activities and not repair:
        raise ValueError("You should only use --force-activities with --repair.")

    workflow_class = get_workflow(workflow)
    if not workflow_id:
        workflow_id = workflow_class.name

    wf_input = {}
    if input or input_file:
        wf_input = get_or_load_input(input_file, input)

    if repair:
        repair_run_id = None
        if " " in repair:
            repair, repair_run_id = repair.split(" ", 1)
        # get the previous execution history; it will serve as "default history"
        # for activities that succeeded in the previous execution
        logger.info(
            "retrieving history of previous execution: domain={} "
            "workflow_id={} run_id={}".format(domain, repair, repair_run_id)
        )
        workflow_execution = get_workflow_execution(domain, repair, run_id=repair_run_id)
        previous_history = History(workflow_execution.history())
        repair_run_id = workflow_execution.run_id
        previous_history.parse()
        # get the previous execution input if none passed
        if not input and not input_file:
            wf_input = previous_history.events[0].input
        if not tags:
            tags = workflow_execution.tag_list
    else:
        previous_history = None
        repair_run_id = None
        if not tags:
            get_tag_list = getattr(workflow_class, "get_tag_list", None)
            if get_tag_list:
                tags = get_tag_list(
                    workflow_class,
                    *wf_input.get("args", ()),
                    **wf_input.get("kwargs", {})
                )
            else:
                tags = getattr(workflow_class, "tag_list", None)
            if tags == Workflow.INHERIT_TAG_LIST:
                tags = None

    task_list = create_unique_task_list(workflow_id)
    logger.info("using task list {}".format(task_list))
    decider_proc = multiprocessing.Process(
        target=decider.command.start,
        args=(
            [workflow],
            domain,
            task_list,
        ),
        kwargs={
            "nb_processes": nb_deciders,
            "repair_with": previous_history,
            "force_activities": force_activities,
            "is_standalone": True,
            "repair_workflow_id": repair or None,
            "repair_run_id": repair_run_id,
        },
    )
    decider_proc.start()

    worker_proc = multiprocessing.Process(
        target=worker.command.start,
        args=(
            domain,
            task_list,
        ),
        kwargs={
            "nb_processes": nb_workers,
            "heartbeat": heartbeat,
        },
    )
    worker_proc.start()

    print("starting workflow {}".format(workflow), file=sys.stderr)
    ex = start_workflow.callback(
        workflow,
        domain,
        workflow_id,
        task_list,
        execution_timeout,
        tags,
        decision_tasks_timeout,
        format.input(wf_input),
        None,
        local=False,
    )
    while True:
        time.sleep(2)
        ex = helpers.get_workflow_execution(
            domain,
            ex.workflow_id,
            ex.run_id,
        )
        if display_status:
            print("status: {}".format(ex.status), file=sys.stderr)
        if ex.status == ex.STATUS_CLOSED:
            print("execution {} finished".format(ex.workflow_id), file=sys.stderr)
            break

    os.kill(worker_proc.pid, signal.SIGTERM)
    worker_proc.join()
    os.kill(decider_proc.pid, signal.SIGTERM)
    decider_proc.join()
def submit(self, func, *args, **kwargs):
    logger.info("executing task {}(args={}, kwargs={})".format(func, args, kwargs))

    future = futures.Future()

    context = self.get_run_context()
    context["activity_id"] = str(self.nb_activities)
    self.nb_activities += 1

    # Ensure signals ordering
    if isinstance(func, SignalTask):
        self.signals_sent.add(func.name)
    elif isinstance(func, WaitForSignal):
        signal_name = func.signal_name
        if signal_name not in self.signals_sent:
            raise NotImplementedError(
                "wait_signal({}) before signal was sent: unsupported by the local executor".format(
                    signal_name
                )
            )
    elif isinstance(func, MarkerTask):
        self._markers.setdefault(func.name, []).append(
            Marker(func.name, func.details)
        )

    if isinstance(func, Submittable):
        task = func  # *args, **kwargs already resolved.
        task.context = context
        func = getattr(task, "activity", None)
    elif isinstance(func, Activity):
        task = ActivityTask(func, context=context, *args, **kwargs)
    elif issubclass(func, Workflow):
        task = WorkflowTask(self, func, *args, **kwargs)
    else:
        raise TypeError("invalid type {} for {}".format(type(func), func))

    if isinstance(task, WorkflowTask):
        self.on_new_workflow(task)

    try:
        future._result = task.execute()
        if hasattr(task, "post_execute"):
            task.post_execute()
        state = "completed"
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        tb = traceback.format_tb(exc_traceback)
        task_failed = exceptions.TaskFailed(
            name=getattr(task, "name", "unknown"),
            reason=format_exc(exc_value),
            details=json_dumps(
                {
                    "error": exc_type.__name__,
                    "error_type": format_exc_type(exc_type),
                    "message": str(exc_value),
                    "traceback": tb,
                },
                default=repr,
            ),
        )
        future.set_exception(task_failed)
        logger.exception("rescuing exception: {}".format(exc_value))
        if (isinstance(func, Activity) or issubclass_(func, Workflow)) and getattr(
            func, "raises_on_failure", None
        ):
            raise task_failed
        state = "failed"
    finally:
        if isinstance(task, WorkflowTask):
            self.on_completed_workflow()
        future._state = futures.FINISHED

    if func:
        self._history.add_activity_task(
            func,
            decision_id=None,
            last_state=state,
            activity_id=context["activity_id"],
            input={"args": args, "kwargs": kwargs},
            result=future.result,
        )
    return future