def _get_cached(path):
    # 1/ memory cache
    if path in JUMBO_FIELDS_MEMORY_CACHE:
        return JUMBO_FIELDS_MEMORY_CACHE[path]

    # 2/ disk cache
    if SIMPLEFLOW_ENABLE_DISK_CACHE:
        try:
            # NB: this cache may also be triggered on activity workers, where it's not that
            # useful. The performance hit should be minimal. To be improved later.
            # NB2: cache has to be lazily instantiated here, cache objects do not survive forks,
            # see DiskCache docs.
            cache = Cache(constants.CACHE_DIR)
            # generate a dedicated cache key because this cache may be shared with other
            # features of simpleflow at some point
            cache_key = "jumbo_fields/" + path.split("/")[-1]
            if cache_key in cache:
                logger.debug("diskcache: getting key={} from cache_dir={}".format(
                    cache_key, constants.CACHE_DIR))
                return cache[cache_key]
        except OperationalError:
            logger.warning("diskcache: got an OperationalError, skipping cache usage")

    # nothing to return, but better be explicit here
    return
def schedule(self, *args, **kwargs):
    input = {
        'args': self.args,
        'kwargs': self.kwargs,
    }
    if self.extra_input:
        input.update(self.extra_input)
    logger.debug(
        'scheduling signal name={name}, workflow_id={workflow_id}, run_id={run_id}, control={control}, '
        'extra_input={extra_input}'.format(
            name=self.name,
            workflow_id=self.workflow_id,
            run_id=self.run_id,
            control=self.control,
            extra_input=self.extra_input,
        ))
    decision = swf.models.decision.ExternalWorkflowExecutionDecision()
    decision.signal(
        signal_name=self.name,
        input=input,
        workflow_id=self.workflow_id,
        run_id=self.run_id,
        control=self.control,
    )
    return [decision]
def spawn(poller, decision_response):
    logger.debug("spawn() pid={}".format(os.getpid()))
    worker = multiprocessing.Process(
        target=process_decision,
        args=(poller, decision_response),
    )
    worker.start()
    worker.join()
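# A minimal sketch of how the decider-side spawn() above might be driven.
# The run loop itself and the None check are assumptions for illustration,
# not simpleflow's actual poller loop; poll_with_retry() is the method shown
# further down in this section.
def decider_loop_sketch(poller):
    while True:
        decision_response = poller.poll_with_retry()
        if decision_response is None:
            # long-poll returned nothing; poll again
            continue
        # one short-lived process per decision keeps memory and caches clean
        spawn(poller, decision_response)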
def process(self, poller, token, task):
    """
    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    """
    logger.debug('ActivityWorker.process() pid={}'.format(os.getpid()))
    try:
        activity = self.dispatch(task)
        input = format.decode(task.input)
        args = input.get('args', ())
        kwargs = input.get('kwargs', {})
        context = sanitize_activity_context(task.context)
        context['domain_name'] = poller.domain.name
        if input.get('meta', {}).get('binaries'):
            download_binaries(input['meta']['binaries'])
        result = ActivityTask(activity, *args, context=context, **kwargs).execute()
    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        logger.exception("process error: {}".format(str(exc_value)))
        if isinstance(exc_value, ExecutionError) and len(exc_value.args):
            details = exc_value.args[0]
            reason = format_exc(exc_value)  # FIXME json.loads and rebuild?
        else:
            tb = traceback.format_tb(exc_traceback)
            reason = format_exc(exc_value)
            details = json_dumps(
                {
                    'error': exc_type.__name__,
                    'message': str(exc_value),
                    'traceback': tb,
                },
                default=repr,
            )
        return poller.fail_with_retry(token, task, reason=reason, details=details)

    try:
        logger.info('completing activity')
        poller.complete_with_retry(token, result)
    except Exception as err:
        logger.exception("complete error")
        reason = 'cannot complete task {}: {} {}'.format(
            task.activity_id,
            err.__class__.__name__,
            err,
        )
        poller.fail_with_retry(token, task, reason)
def _set_cached(path, content):
    # 1/ memory cache
    JUMBO_FIELDS_MEMORY_CACHE[path] = content

    # 2/ disk cache
    if SIMPLEFLOW_ENABLE_DISK_CACHE:
        try:
            cache = Cache(constants.CACHE_DIR)
            cache_key = "jumbo_fields/" + path.split("/")[-1]
            logger.debug("diskcache: setting key={} on cache_dir={}".format(
                cache_key, constants.CACHE_DIR))
            cache.set(cache_key, content, expire=3 * constants.HOUR)
        except OperationalError:
            logger.warning("diskcache: got an OperationalError on write, skipping cache write")
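# A minimal sketch of the read-through pattern that _get_cached()/_set_cached()
# enable. Both "_pull_jumbo_field" and "storage_fetch" are hypothetical names:
# the real fetch logic (downloading the jumbo field from its backing store)
# lives elsewhere in simpleflow.
def _pull_jumbo_field(path):
    content = _get_cached(path)
    if content is not None:
        return content
    content = storage_fetch(path)  # hypothetical: actual backend fetch
    _set_cached(path, content)
    return content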
def process_decision(poller, decision_response):
    # type: (DeciderPoller, Response) -> None
    workflow_id = decision_response.execution.workflow_id
    workflow_str = "workflow {} ({})".format(workflow_id, poller.workflow_name)
    logger.debug("process_decision() pid={}".format(os.getpid()))
    logger.info("taking decision for {}".format(workflow_str))
    format.JUMBO_FIELDS_MEMORY_CACHE.clear()
    decisions = poller.decide(decision_response)
    try:
        logger.info("completing decision for {}".format(workflow_str))
        poller.complete_with_retry(decision_response.token, decisions)
    except Exception as err:
        logger.error("cannot complete decision for {}: {}".format(workflow_str, err))
def process_task(poller, token, task):
    """
    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    """
    logger.debug("process_task() pid={}".format(os.getpid()))
    format.JUMBO_FIELDS_MEMORY_CACHE.clear()
    worker = ActivityWorker()
    worker.process(poller, token, task)
def activity_rerun(domain, workflow_id, run_id, input, scheduled_id, activity_id):
    # handle params
    if not activity_id and not scheduled_id:
        logger.error("Please supply --scheduled-id or --activity-id.")
        sys.exit(1)

    input_override = None
    if input:
        input_override = format.decode(input)

    # find workflow execution
    try:
        wfe = helpers.get_workflow_execution(domain, workflow_id, run_id)
    except (swf.exceptions.DoesNotExistError, IndexError):
        logger.error("Couldn't find execution, exiting.")
        sys.exit(1)
    logger.info("Found execution: workflowId={} runId={}".format(
        wfe.workflow_id, wfe.run_id))

    # now rerun the specified activity
    history = History(wfe.history())
    history.parse()
    task, args, kwargs, meta, params = helpers.find_activity(
        history,
        scheduled_id=scheduled_id,
        activity_id=activity_id,
        input=input_override,
    )
    kwargs["context"].update({
        "workflow_id": wfe.workflow_id,
        "run_id": wfe.run_id,
    })
    logger.debug("Found activity. Last execution:")
    for line in json_dumps(params, pretty=True).split("\n"):
        logger.debug(line)
    if input_override:
        logger.info("NB: input will be overridden with the passed one!")
    logger.info("Will re-run: {}(*{}, **{}) [+meta={}]".format(
        task, args, kwargs, meta))

    # download binaries if needed
    download_binaries(meta.get("binaries", {}))

    # execute the activity task with the correct arguments
    instance = ActivityTask(task, *args, **kwargs)
    result = instance.execute()
    if hasattr(instance, "post_execute"):
        instance.post_execute()
    logger.info("Result (JSON): {}".format(json_dumps(result, compact=False)))
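# Example invocation of the command above. Hedged: the flag names are inferred
# from the function parameters and the error message, so check
# `simpleflow activity.rerun --help` for the authoritative list.
#
#   simpleflow activity.rerun \
#       --domain TestDomain \
#       --workflow-id my-workflow-id \
#       --scheduled-id 42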
def load_workflow_executor(
    domain,
    workflow_name,
    task_list=None,
    repair_with=None,
    force_activities=None,
    repair_workflow_id=None,
    repair_run_id=None,
):
    """
    Load a workflow executor.

    :param domain:
    :type domain: str | swf.models.Domain
    :param workflow_name:
    :type workflow_name: str
    :param task_list:
    :type task_list: Optional[str]
    :param repair_with:
    :type repair_with: Optional[simpleflow.history.History]
    :param force_activities:
    :type force_activities: Optional[str]
    :param repair_workflow_id: workflow ID to repair
    :type repair_workflow_id: Optional[str]
    :param repair_run_id: run ID to repair
    :type repair_run_id: Optional[str]
    :return: Executor for this workflow
    :rtype: Executor
    """
    logger.debug('load_workflow_executor(workflow_name="{}")'.format(workflow_name))
    module_name, object_name = workflow_name.rsplit(".", 1)
    module = __import__(module_name, fromlist=["*"])
    workflow = getattr(module, object_name)

    # TODO: find the cause of this differentiated behaviour
    if not isinstance(domain, swf.models.Domain):
        domain = swf.models.Domain(domain)

    return Executor(
        domain,
        workflow,
        task_list,
        repair_with=repair_with,
        force_activities=force_activities,
        repair_workflow_id=repair_workflow_id,
        repair_run_id=repair_run_id,
    )
def __init__(self, *args, **kwargs):
    self.region = (SETTINGS.get('region') or
                   kwargs.get('region') or
                   boto.swf.layer1.Layer1.DefaultRegionName)
    # Use settings-provided keys if available, otherwise pass empty
    # dictionary to boto SWF client, which will use its default credentials
    # chain provider.
    cred_keys = ['aws_access_key_id', 'aws_secret_access_key']
    creds_ = {k: SETTINGS[k] for k in cred_keys if SETTINGS.get(k, None)}
    self.connection = (kwargs.pop('connection', None) or
                       boto.swf.connect_to_region(self.region, **creds_))
    if self.connection is None:
        raise ValueError('invalid region: {}'.format(self.region))
    logger.debug("initiated connection to region={}".format(self.region))
def _cleanup_worker_processes(self):
    # cleanup children
    to_remove = []
    for pid, child in self._processes.items():
        try:
            name, status = child.name(), child.status()
        except psutil.NoSuchProcess:
            # May be untimely deceased
            name, status = "unknown", "unknown"
        logger.debug(" child: name=%s pid=%d status=%s" % (name, child.pid, status))
        if status in (psutil.STATUS_ZOMBIE, "unknown"):
            logger.debug(" process {} is zombie, will cleanup".format(child.pid))
            # join process to clean it up
            child.wait()
            # set the process to be removed from self._processes
            to_remove.append(pid)

    # cleanup our internal state (self._processes)
    for pid in to_remove:
        del self._processes[pid]
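# A minimal sketch of where _cleanup_worker_processes() could be called from.
# The method name, the "_shutting_down" flag, and the 1-second cadence are
# assumptions for illustration, not simpleflow's actual supervisor code
# (assumes `import time`).
def _watch_processes_sketch(self):
    while not self._shutting_down:  # hypothetical shutdown flag
        self._cleanup_worker_processes()
        time.sleep(1)  # reap zombies roughly once per second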
def poll_with_retry(self):
    """
    Polls a task represented by its token and data. It uses long-polling
    with a timeout of one minute.

    See also
    http://docs.aws.amazon.com/amazonswf/latest/apireference/API_PollForDecisionTask.html#API_PollForDecisionTask_RequestSyntax
    http://docs.aws.amazon.com/amazonswf/latest/apireference/API_PollForActivityTask.html#API_PollForActivityTask_RequestSyntax

    :returns:
    :rtype: swf.responses.Response
    """
    task_list = self.task_list
    identity = self.identity
    logger.debug("polling task on %s", task_list)
    poll = utils.retry.with_delay(
        nb_times=self.nb_retries,
        delay=utils.retry.exponential,
        log_with=logger.exception,
        on_exceptions=swf.exceptions.ResponseError,
    )(self.poll)
    response = poll(task_list, identity=identity)
    return response
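# A standalone sketch of the retry semantics used above, under the assumption
# that utils.retry.with_delay retries the wrapped callable up to nb_times on
# the given exceptions with a growing delay, and that the delay function maps
# an attempt number to seconds. Illustrative only, not simpleflow's actual
# implementation.
import functools
import time


def with_delay_sketch(nb_times, delay, log_with, on_exceptions):
    def decorate(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(nb_times):
                try:
                    return func(*args, **kwargs)
                except on_exceptions:
                    if attempt == nb_times - 1:
                        raise  # out of retries: propagate the last error
                    log_with("retryable error, attempt %d/%d", attempt + 1, nb_times)
                    time.sleep(delay(attempt))  # e.g. exponential backoff
        return wrapper
    return decorate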
def wrapped(self, *args, **kwargs):
    logger.debug("entering state {}: {}(args={}, kwargs={})".format(
        state, method.__name__, args, kwargs))
    self.state = state
    return method(self, *args, **kwargs)
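# wrapped() above is the inner function of a state-transition decorator; a
# plausible enclosing shape is sketched below. The decorator name "with_state"
# is an assumption (assumes `import functools`).
def with_state(state):
    def decorator(method):
        @functools.wraps(method)
        def wrapped(self, *args, **kwargs):
            logger.debug("entering state {}: {}(args={}, kwargs={})".format(
                state, method.__name__, args, kwargs))
            self.state = state
            return method(self, *args, **kwargs)
        return wrapped
    return decorator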
def spawn(poller, token, task, heartbeat=60):
    """
    Spawn a process and wait for it to end, sending heartbeats to SWF.

    On activity timeouts and termination, we reap the worker process and
    its children.

    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    """
    logger.info("spawning new activity worker pid={} heartbeat={}".format(
        os.getpid(), heartbeat))
    worker = multiprocessing.Process(
        target=process_task,
        args=(poller, token, task),
    )
    worker.start()

    def worker_alive():
        return psutil.pid_exists(worker.pid)

    while worker_alive():
        worker.join(timeout=heartbeat)
        if not worker_alive():
            # Most certainly unneeded: we'll see
            if worker.exitcode is None:
                # race condition, try and re-join
                worker.join(timeout=0)
                if worker.exitcode is None:
                    logger.warning(
                        "process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                            worker.pid))
            if worker.exitcode != 0:
                poller.fail_with_retry(
                    token,
                    task,
                    reason="process {} died: exit code {}".format(
                        worker.pid, worker.exitcode),
                )
            return
        try:
            logger.debug("heartbeating for pid={} (token={})".format(worker.pid, token))
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as error:
            # Either the task or the workflow execution no longer exists,
            # let's kill the worker process.
            logger.warning("heartbeat failed: {}".format(error))
            logger.warning("killing (KILL) worker with pid={}".format(worker.pid))
            reap_process_tree(worker.pid)
            return
        except swf.exceptions.RateLimitExceededError as error:
            # ignore rate limit errors: high chances the next heartbeat will be
            # ok anyway, so it would be stupid to break the task for that
            logger.warning(
                'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                    task.activity_type.name, error))
            continue
        except Exception as error:
            # Let's crash if it cannot notify the heartbeat failed. The
            # subprocess will become orphan and the heartbeat timeout may
            # eventually trigger on Amazon SWF side.
            logger.error("cannot send heartbeat for task {}: {}".format(
                task.activity_type.name, error))
            raise

        # Task cancelled.
        if response and response.get("cancelRequested"):
            reap_process_tree(worker.pid)
            return
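# reap_process_tree() is referenced above but not shown in this section; a
# minimal sketch of what such a helper could do with psutil follows. The TERM
# then KILL escalation and the 5-second grace period are assumptions.
def reap_process_tree_sketch(pid, wait_timeout=5):
    try:
        parent = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return  # already gone, nothing to reap
    procs = parent.children(recursive=True) + [parent]
    for proc in procs:
        proc.terminate()  # polite SIGTERM first
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout)
    for proc in alive:
        proc.kill()  # escalate to SIGKILL for survivors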