Example #1
 def _killall(self):
     """
     Sends a stop (SIGTERM) signal to all worker processes.
     """
     for process in self._processes.values():
         logger.info("process: sending SIGTERM to pid={}".format(process.pid))
         process.terminate()
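
On POSIX systems, multiprocessing.Process.terminate() delivers SIGTERM, which is what _killall() relies on. The following standalone sketch (names are illustrative, not simpleflow's) shows a worker that traps SIGTERM so it can exit cleanly when the supervisor terminates it:

import multiprocessing
import signal
import sys
import time

def worker():
    # Exit cleanly when the supervisor calls process.terminate() (SIGTERM on POSIX).
    signal.signal(signal.SIGTERM, lambda signum, frame: sys.exit(0))
    while True:
        time.sleep(1)

if __name__ == "__main__":
    processes = {i: multiprocessing.Process(target=worker) for i in range(2)}
    for process in processes.values():
        process.start()
    time.sleep(1)
    for process in processes.values():
        process.terminate()  # same call as in _killall() above
        process.join()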
Example #2
    def target(self):
        """
        Supervisor's main "target", as defined in the `multiprocessing` API. It's the
        code that the manager will execute once started.
        """
        # handle signals
        self.bind_signal_handlers()

        # protection against double use of ".start()"
        if len(self._processes) != 0:
            raise Exception("Child processes map is not empty, already called .start() ?")

        # wait for all processes to finish
        while True:
            # if terminating, join all processes and exit the loop so we finish
            # the supervisor process
            if self._terminating:
                for proc in self._processes.values():
                    logger.info("process: waiting for proces={} to finish.".format(proc))
                    proc.wait()
                break

            # start worker processes
            self._cleanup_worker_processes()
            self._start_worker_processes()

            # re-evaluate state at least every 5 seconds. If a SIGCHLD arrives during
            # the "time.sleep()" below, it interrupts the sleep and the code above runs
            # again almost immediately; but if a SIGCHLD arrives during the two calls
            # above, the "time.sleep()" here won't be cut short, so better keep it
            # relatively short. Not too short though, since the methods above scan a
            # bunch of entries in /proc, which could become slow if done every 0.1s.
            time.sleep(5)
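
bind_signal_handlers() is not part of this example; based on the _handle_graceful_shutdown handlers shown further down, a minimal version could simply map SIGTERM and SIGINT to terminate(), which flips the _terminating flag checked by the loop above. A hedged sketch (the class body is a stand-in, not simpleflow's Supervisor):

import signal

class SupervisorSketch(object):
    def __init__(self):
        self._terminating = False

    def terminate(self):
        self._terminating = True

    def bind_signal_handlers(self):
        # Assumed wiring: a graceful shutdown request sets the flag that the
        # main loop in target() checks on every iteration.
        def _handle_graceful_shutdown(signum, frame):
            self.terminate()

        signal.signal(signal.SIGTERM, _handle_graceful_shutdown)
        signal.signal(signal.SIGINT, _handle_graceful_shutdown)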
Example #3
    def run(self, token, task):
        ppid = os.getppid()

        while True:
            time.sleep(self._interval)

            if os.getppid() != ppid:
                os._exit(1)

            try:
                logger.info("heartbeat {} for task {}".format(
                    time.time(), task.activity_type.name))
            except Exception:
                # This is debug logging only: do not crash because of it
                pass

            try:
                response = self.send_heartbeat(token)
            except swf.exceptions.DoesNotExistError:
                # Either the task or the workflow execution no longer exists.
                logger.warning(
                    "task {} no longer exists. Stopping heartbeat".format(
                        task.activity_type.name))
                return
            except Exception as error:
                # Let's crash if the heartbeat cannot be sent.
                logger.error("cannot send heartbeat for task {}: {}".format(
                    task.activity_type.name, error))
                raise

            if response and response.get("cancelRequested"):
                return
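
The os.getppid() comparison is how this heartbeater notices that its parent died: on Unix an orphaned child is re-parented (usually to PID 1 or a subreaper), so the parent PID changes. A minimal version of the same guard, outside any simpleflow class:

import os
import time

def exit_when_orphaned(interval=1.0):
    # Remember the PID of the process that started us; if it dies we get
    # re-parented and os.getppid() returns a different value.
    ppid = os.getppid()
    while True:
        time.sleep(interval)
        if os.getppid() != ppid:
            os._exit(1)  # hard exit, as in run() above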
Example #4
 def _killall(self):
     """
     Sends a stop (SIGTERM) signal to all worker processes.
     """
     for process in self._processes.values():
         logger.info("process: sending SIGTERM to pid={}".format(
             process.pid))
         process.terminate()
Example #5
 def terminate(self):
     """
     Terminate all worker processes managed by this Supervisor.
     """
     self._terminating = True
     logger.info(
         "process: will stop workers, this might take up several minutes. "
         "Please, be patient.")
     self._killall()
Example #6
    def process(self, poller, token, task):
        """

        :param poller:
        :type poller: ActivityPoller
        :param token:
        :type token: str
        :param task:
        :type task: swf.models.ActivityTask
        """
        logger.debug('ActivityWorker.process() pid={}'.format(os.getpid()))
        try:
            activity = self.dispatch(task)
            input = format.decode(task.input)
            args = input.get('args', ())
            kwargs = input.get('kwargs', {})
            context = sanitize_activity_context(task.context)
            context['domain_name'] = poller.domain.name
            if input.get('meta', {}).get('binaries'):
                download_binaries(input['meta']['binaries'])
            result = ActivityTask(activity, *args, context=context, **kwargs).execute()
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            logger.exception("process error: {}".format(str(exc_value)))
            if isinstance(exc_value, ExecutionError) and len(exc_value.args):
                details = exc_value.args[0]
                reason = format_exc(exc_value)  # FIXME json.loads and rebuild?
            else:
                tb = traceback.format_tb(exc_traceback)
                reason = format_exc(exc_value)
                details = json_dumps(
                    {
                        'error': exc_type.__name__,
                        'message': str(exc_value),
                        'traceback': tb,
                    },
                    default=repr
                )
            return poller.fail_with_retry(
                token,
                task,
                reason=reason,
                details=details
            )

        try:
            logger.info('completing activity')
            poller.complete_with_retry(token, result)
        except Exception as err:
            logger.exception("complete error")
            reason = 'cannot complete task {}: {} {}'.format(
                task.activity_id,
                err.__class__.__name__,
                err,
            )
            poller.fail_with_retry(token, task, reason)
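
The FIXME about "json.loads and rebuild" refers to the case where ExecutionError already carries JSON-encoded details in args[0]. A hypothetical helper for that FIXME could look like this (not part of simpleflow):

import json

def rebuild_details(raw_details):
    # If the details string is valid JSON, keep only the expected fields and
    # re-serialize them; otherwise pass the raw string through unchanged.
    try:
        parsed = json.loads(raw_details)
    except (TypeError, ValueError):
        return raw_details
    return json.dumps(
        {
            "error": parsed.get("error"),
            "message": parsed.get("message"),
            "traceback": parsed.get("traceback"),
        },
        default=repr,
    )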
Example #7
 def _download_binary(self):
     logger.info(
         "Downloading binary: {} -> {}".format(
             self.remote_location, self.local_location
         )
     )
     bucket, path = self.remote_location.replace("s3://", "", 1).split("/", 1)
     # with FileLock(dest):
     pull(bucket, path, self.local_location)
     os.chmod(self.local_location, 0o755)
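
The pull() helper is not shown in these examples. Assuming a plain S3 download, a boto3-based sketch could look like this (boto3 is an assumption here, not necessarily what simpleflow uses):

import boto3

def pull(bucket, path, local_location):
    # Illustrative only: download s3://<bucket>/<path> to local_location.
    boto3.client("s3").download_file(bucket, path, local_location)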
Example #8
 def terminate(self):
     """
     Terminate all worker processes managed by this Supervisor.
     """
     self._terminating = True
     logger.info(
         "process: will stop workers, this might take up several minutes. "
         "Please, be patient."
     )
     self._killall()
Example #9
 def start(self):
     """
     Used to start the Supervisor process once it's configured. Has to be called
     explicitly on a Supervisor instance so it starts (no auto-start from __init__()).
     """
     logger.info('starting {}'.format(self._payload))
     if self._background:
         p = multiprocessing.Process(target=self.target)
         p.start()
     else:
         self.target()
Example #10
 def start(self):
     """
     Used to start the Supervisor process once it's configured. Has to be called
     explicitly on a Supervisor instance so it starts (no auto-start from __init__()).
     """
     logger.info("starting {}".format(self._payload))
     if self._background:
         p = multiprocessing.Process(target=self.target)
         p.start()
     else:
         self.target()
Example #11
def process_decision(poller, decision_response):
    # type: (DeciderPoller, Response) -> None
    workflow_id = decision_response.execution.workflow_id
    workflow_str = "workflow {} ({})".format(workflow_id, poller.workflow_name)
    logger.debug("process_decision() pid={}".format(os.getpid()))
    logger.info("taking decision for {}".format(workflow_str))
    format.JUMBO_FIELDS_MEMORY_CACHE.clear()
    decisions = poller.decide(decision_response)
    try:
        logger.info("completing decision for {}".format(workflow_str))
        poller.complete_with_retry(decision_response.token, decisions)
    except Exception as err:
        logger.error("cannot complete decision for {}: {}".format(workflow_str, err))
Example #12
def process_decision(poller, decision_response):
    # type: (DeciderPoller, Response) -> None
    workflow_id = decision_response.execution.workflow_id
    workflow_str = "workflow {} ({})".format(workflow_id, poller.workflow_name)
    logger.debug("process_decision() pid={}".format(os.getpid()))
    logger.info("taking decision for {}".format(workflow_str))
    format.JUMBO_FIELDS_MEMORY_CACHE.clear()
    decisions = poller.decide(decision_response)
    try:
        logger.info("completing decision for {}".format(workflow_str))
        poller.complete_with_retry(decision_response.token, decisions)
    except Exception as err:
        logger.error("cannot complete decision for {}: {}".format(workflow_str, err))
Example #13
 def run_once(self):
     """
     Run the main poller process and exit after the first task is processed.
     """
     logger.info("starting %s on domain %s", self.name, self.domain.name)
     self.bind_signal_handlers()
     self.is_alive = True
     self.set_process_name()
     while self.is_alive:
         try:
             response = self.poll_with_retry()
         except swf.exceptions.PollTimeout:
             continue
         self.process(response)
         break
Example #14
 def run_once(self):
     """
     Run the main poller process and exit after the first task is processed.
     """
     logger.info("starting %s on domain %s", self.name, self.domain.name)
     self.bind_signal_handlers()
     self.is_alive = True
     self.set_process_name()
     while self.is_alive:
         try:
             response = self.poll_with_retry()
         except swf.exceptions.PollTimeout:
             continue
         self.process(response)
         break
Example #15
    def start(self):
        """
        Start the main poller process. There is no daemonization. The process
        is intended to be run inside a supervisor process.

        """
        logger.info("starting %s on domain %s", self.name, self.domain.name)
        self.bind_signal_handlers()
        self.is_alive = True
        self.set_process_name()
        while self.is_alive:
            try:
                response = self.poll_with_retry()
            except swf.exceptions.PollTimeout:
                continue
            self.process(response)
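
set_process_name() is not shown in these examples either; one common way to implement it is the setproctitle package, so a hypothetical sketch could be:

from setproctitle import setproctitle

def set_process_name(name):
    # Rename the current process so pollers are identifiable in ps/top output.
    setproctitle(name)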
Example #16
    def start(self):
        """
        Start the main poller process. There is no daemonization. The process
        is intended to be run inside a supervisor process.

        """
        logger.info("starting %s on domain %s", self.name, self.domain.name)
        self.bind_signal_handlers()
        self.is_alive = True
        self.set_process_name()
        while self.is_alive:
            try:
                response = self.poll_with_retry()
            except swf.exceptions.PollTimeout:
                continue
            self.process(response)
Example #17
def activity_rerun(domain, workflow_id, run_id, input, scheduled_id,
                   activity_id):
    # handle params
    if not activity_id and not scheduled_id:
        logger.error("Please supply --scheduled-id or --activity-id.")
        sys.exit(1)

    input_override = None
    if input:
        input_override = format.decode(input)

    # find workflow execution
    try:
        wfe = helpers.get_workflow_execution(domain, workflow_id, run_id)
    except (swf.exceptions.DoesNotExistError, IndexError):
        logger.error("Couldn't find execution, exiting.")
        sys.exit(1)
    logger.info("Found execution: workflowId={} runId={}".format(
        wfe.workflow_id, wfe.run_id))

    # now rerun the specified activity
    history = History(wfe.history())
    history.parse()
    task, args, kwargs, meta, params = helpers.find_activity(
        history,
        scheduled_id=scheduled_id,
        activity_id=activity_id,
        input=input_override,
    )
    kwargs["context"].update({
        "workflow_id": wfe.workflow_id,
        "run_id": wfe.run_id,
    })
    logger.debug("Found activity. Last execution:")
    for line in json_dumps(params, pretty=True).split("\n"):
        logger.debug(line)
    if input_override:
        logger.info("NB: input will be overriden with the passed one!")
    logger.info("Will re-run: {}(*{}, **{}) [+meta={}]".format(
        task, args, kwargs, meta))

    # download binaries if needed
    download_binaries(meta.get("binaries", {}))

    # execute the activity task with the correct arguments
    instance = ActivityTask(task, *args, **kwargs)
    result = instance.execute()
    if hasattr(instance, "post_execute"):
        instance.post_execute()
    logger.info("Result (JSON): {}".format(json_dumps(result, compact=False)))
Example #18
    def target(self):
        """
        Supervisor's main "target", as defined in the `multiprocessing` API. It's the
        code that the manager will execute once started.
        """
        # handle signals
        self.bind_signal_handlers()

        # protection against double use of ".start()"
        if len(self._processes) != 0:
            raise Exception(
                "Child processes map is not empty, already called .start() ?")

        # wait for all processes to finish
        while True:
            # if terminating, join all processes and exit the loop so we finish
            # the supervisor process
            if self._terminating:
                for proc in self._processes.values():
                    logger.info(
                        "process: waiting for proces={} to finish.".format(
                            proc))
                    proc.wait()
                break

            # start worker processes
            self._cleanup_worker_processes()
            self._start_worker_processes()

            # re-evaluate state at least every 5 seconds. If a SIGCHLD arrives during
            # the "time.sleep()" below, it interrupts the sleep and the code above runs
            # again almost immediately; but if a SIGCHLD arrives during the two calls
            # above, the "time.sleep()" here won't be cut short, so better keep it
            # relatively short. Not too short though, since the methods above scan a
            # bunch of entries in /proc, which could become slow if done every 0.1s.
            time.sleep(5)
Example #19
def activity_rerun(domain,
                   workflow_id,
                   run_id,
                   input,
                   scheduled_id,
                   activity_id):
    # handle params
    if not activity_id and not scheduled_id:
        logger.error("Please supply --scheduled-id or --activity-id.")
        sys.exit(1)

    input_override = None
    if input:
        input_override = format.decode(input)

    # find workflow execution
    try:
        wfe = helpers.get_workflow_execution(domain, workflow_id, run_id)
    except (swf.exceptions.DoesNotExistError, IndexError):
        logger.error("Couldn't find execution, exiting.")
        sys.exit(1)
    logger.info("Found execution: workflowId={} runId={}".format(wfe.workflow_id, wfe.run_id))

    # now rerun the specified activity
    history = History(wfe.history())
    history.parse()
    task, args, kwargs, meta, params = helpers.find_activity(
        history, scheduled_id=scheduled_id, activity_id=activity_id, input=input_override,
    )
    logger.debug("Found activity. Last execution:")
    for line in json_dumps(params, pretty=True).split("\n"):
        logger.debug(line)
    if input_override:
        logger.info("NB: input will be overriden with the passed one!")
    logger.info("Will re-run: {}(*{}, **{}) [+meta={}]".format(task, args, kwargs, meta))

    # download binaries if needed
    download_binaries(meta.get("binaries", {}))

    # execute the activity task with the correct arguments
    instance = ActivityTask(task, *args, **kwargs)
    result = instance.execute()
    if hasattr(instance, 'post_execute'):
        instance.post_execute()
    logger.info("Result (JSON): {}".format(json_dumps(result, compact=False)))
Example #20
 def _handle_graceful_shutdown(signum, frame):
     signals_map = {2: "SIGINT", 15: "SIGTERM"}
     signal_name = signals_map.get(signum, signum)
     logger.info("process: caught signal signal={} pid={}".format(
         signal_name, os.getpid()))
     self.terminate()
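
On Python 3.5+ the hardcoded signals_map can be replaced by the signal.Signals enum; a small equivalent helper:

import signal

def signal_name(signum):
    # signal.Signals exists since Python 3.5; fall back to the raw number for
    # values that are not in the enum.
    try:
        return signal.Signals(signum).name
    except ValueError:
        return signum

For example, signal_name(signal.SIGTERM) returns "SIGTERM".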
Example #21
 def _handle_graceful_shutdown(signum, frame):
     signals_map = {2: "SIGINT", 15: "SIGTERM"}
     signal_name = signals_map.get(signum, signum)
     logger.info("process: caught signal signal={} pid={}".format(
         signal_name, os.getpid()))
     self.terminate()
Example #22
 def _handle_graceful_shutdown(signum, frame):
     logger.info("process: caught signal signal=SIGTERM pid={}".format(
         os.getpid()))
     self.stop_gracefully()
Example #23
 def _handle_graceful_shutdown(signum, frame):
     logger.info("process: caught signal signal=SIGTERM pid={}".format(os.getpid()))
     self.stop_gracefully()
Example #24
def spawn_kubernetes_job(poller, swf_response):
    logger.info("scheduling new kubernetes job name={}".format(poller.job_name))
    job = KubernetesJob(poller.job_name, poller.domain.name, swf_response)
    job.schedule()
Example #25
    def submit(self, func, *args, **kwargs):
        logger.info('executing task {}(args={}, kwargs={})'.format(
            func, args, kwargs))

        future = futures.Future()

        context = self.get_run_context()
        context["activity_id"] = str(self.nb_activities)
        self.nb_activities += 1

        # Ensure signals ordering
        if isinstance(func, SignalTask):
            self.signals_sent.add(func.name)
        elif isinstance(func, WaitForSignal):
            signal_name = func.signal_name
            if signal_name not in self.signals_sent:
                raise NotImplementedError(
                    'wait_signal({}) before signal was sent: unsupported by the local executor'.format(signal_name)
                )
        elif isinstance(func, MarkerTask):
            self._markers.setdefault(func.name, []).append(Marker(func.name, func.details))

        if isinstance(func, Submittable):
            task = func  # *args, **kwargs already resolved.
            task.context = context
            func = getattr(task, 'activity', None)
        elif isinstance(func, Activity):
            task = ActivityTask(func, context=context, *args, **kwargs)
        elif issubclass(func, Workflow):
            task = WorkflowTask(self, func, *args, **kwargs)
        else:
            raise TypeError('invalid type {} for {}'.format(
                type(func), func))

        if isinstance(task, WorkflowTask):
            self.on_new_workflow(task)

        try:
            future._result = task.execute()
            if hasattr(task, 'post_execute'):
                task.post_execute()
            state = 'completed'
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            future._exception = exc_value
            logger.exception('rescuing exception: {}'.format(exc_value))
            if (isinstance(func, Activity) or issubclass_(func, Workflow)) and getattr(func, 'raises_on_failure', None):
                tb = traceback.format_tb(exc_traceback)
                message = format_exc(exc_value)
                details = json_dumps(
                    {
                        'error': exc_type.__name__,
                        'message': str(exc_value),
                        'traceback': tb,
                    },
                    default=repr
                )
                raise exceptions.TaskFailed(
                    func.name,
                    message,
                    details,
                )
            state = 'failed'
        finally:
            if isinstance(task, WorkflowTask):
                self.on_completed_workflow()
            future._state = futures.FINISHED

        if func:
            self._history.add_activity_task(
                func,
                decision_id=None,
                last_state=state,
                activity_id=context["activity_id"],
                input={'args': args, 'kwargs': kwargs},
                result=future.result)
        return future
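
For context, this submit() is typically reached through a workflow's self.submit() calls. A usage sketch modelled on the simpleflow quickstart (the decorator arguments and class attributes are illustrative and may differ between versions):

from simpleflow import Workflow, activity, futures

@activity.with_attributes(task_list="quickstart", version="example")
def increment(x):
    return x + 1

class BasicWorkflow(Workflow):
    name = "basic"
    version = "example"
    task_list = "example"

    def run(self, x):
        y = self.submit(increment, x)  # returns a simpleflow future
        futures.wait(y)
        return y.result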
Example #26
 def on_terminate(p):
     logger.info('process: terminated pid={} retcode={}'.format(p.pid, p.returncode))
Example #27
def spawn_kubernetes_job(poller, swf_response):
    logger.info('scheduling new kubernetes job name={}'.format(poller.job_name))
    job = KubernetesJob(poller.job_name, poller.domain.name, swf_response)
    job.schedule()
Example #28
 def stop_gracefully(self):
     """
     Stop the actor processes and subprocesses.
     """
     logger.info('stopping %s', self.name)
     self.is_alive = False  # No longer take requests.
Example #29
def spawn(poller, token, task, heartbeat=60):
    """
    Spawn a process and wait for it to end, sending heartbeats to SWF.

    On activity timeouts and termination, we reap the worker process and its
    children.

    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    """
    logger.info(
        "spawning new activity worker pid={} heartbeat={}".format(
            os.getpid(), heartbeat
        )
    )
    worker = multiprocessing.Process(target=process_task, args=(poller, token, task),)
    worker.start()

    def worker_alive():
        return psutil.pid_exists(worker.pid)

    while worker_alive():
        worker.join(timeout=heartbeat)
        if not worker_alive():
            # Most certainly unneeded: we'll see
            if worker.exitcode is None:
                # race condition, try and re-join
                worker.join(timeout=0)
                if worker.exitcode is None:
                    logger.warning(
                        "process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                            worker.pid
                        )
                    )
            if worker.exitcode != 0:
                poller.fail_with_retry(
                    token,
                    task,
                    reason="process {} died: exit code {}".format(
                        worker.pid, worker.exitcode
                    ),
                )
            return
        try:
            logger.debug("heartbeating for pid={} (token={})".format(worker.pid, token))
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as error:
            # Either the task or the workflow execution no longer exists,
            # let's kill the worker process.
            logger.warning("heartbeat failed: {}".format(error))
            logger.warning("killing (KILL) worker with pid={}".format(worker.pid))
            reap_process_tree(worker.pid)
            return
        except swf.exceptions.RateLimitExceededError as error:
            # ignore rate limit errors: chances are high that the next heartbeat
            # will be ok anyway, so it would be stupid to break the task for that
            logger.warning(
                'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                    task.activity_type.name, error
                )
            )
            continue
        except Exception as error:
            # Let's crash if the heartbeat cannot be sent. The subprocess will
            # become an orphan and the heartbeat timeout may eventually trigger
            # on the Amazon SWF side.
            logger.error(
                "cannot send heartbeat for task {}: {}".format(
                    task.activity_type.name, error
                )
            )
            raise

        # Task cancelled.
        if response and response.get("cancelRequested"):
            reap_process_tree(worker.pid)
            return
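
reap_process_tree() is not included in these examples. A hedged, psutil-based sketch of what reaping a worker and its children could look like (an assumption about its behaviour, not simpleflow's actual code):

import psutil

def reap_process_tree(pid, wait_timeout=5.0):
    # Terminate the whole tree rooted at pid, then SIGKILL whatever is still
    # alive after the grace period.
    try:
        root = psutil.Process(pid)
    except psutil.NoSuchProcess:
        return
    procs = root.children(recursive=True) + [root]
    for proc in procs:
        try:
            proc.terminate()
        except psutil.NoSuchProcess:
            pass
    _, alive = psutil.wait_procs(procs, timeout=wait_timeout)
    for proc in alive:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            pass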
Example #30
def standalone(
    context,
    workflow,
    domain,
    workflow_id,
    execution_timeout,
    tags,
    decision_tasks_timeout,
    input,
    input_file,
    nb_workers,
    nb_deciders,
    heartbeat,
    display_status,
    repair,
    force_activities,
):
    """
    This command spawns a decider and an activity worker to execute a workflow
    within a single main process.

    """
    disable_boto_connection_pooling()

    if force_activities and not repair:
        raise ValueError(
            "You should only use --force-activities with --repair.")

    workflow_class = get_workflow(workflow)
    if not workflow_id:
        workflow_id = workflow_class.name

    wf_input = {}
    if input or input_file:
        wf_input = get_or_load_input(input_file, input)

    if repair:
        repair_run_id = None
        if " " in repair:
            repair, repair_run_id = repair.split(" ", 1)
        # get the previous execution history; it will serve as the "default history"
        # for activities that succeeded in the previous execution
        logger.info("retrieving history of previous execution: domain={} "
                    "workflow_id={} run_id={}".format(domain, repair,
                                                      repair_run_id))
        workflow_execution = get_workflow_execution(domain,
                                                    repair,
                                                    run_id=repair_run_id)
        previous_history = History(workflow_execution.history())
        repair_run_id = workflow_execution.run_id
        previous_history.parse()
        # get the previous execution input if none passed
        if not input and not input_file:
            wf_input = previous_history.events[0].input
        if not tags:
            tags = workflow_execution.tag_list
    else:
        previous_history = None
        repair_run_id = None
        if not tags:
            get_tag_list = getattr(workflow_class, "get_tag_list", None)
            if get_tag_list:
                tags = get_tag_list(workflow_class, *wf_input.get("args", ()),
                                    **wf_input.get("kwargs", {}))
            else:
                tags = getattr(workflow_class, "tag_list", None)
            if tags == Workflow.INHERIT_TAG_LIST:
                tags = None

    task_list = create_unique_task_list(workflow_id)
    logger.info("using task list {}".format(task_list))
    decider_proc = multiprocessing.Process(
        target=decider.command.start,
        args=(
            [workflow],
            domain,
            task_list,
        ),
        kwargs={
            "nb_processes": nb_deciders,
            "repair_with": previous_history,
            "force_activities": force_activities,
            "is_standalone": True,
            "repair_workflow_id": repair or None,
            "repair_run_id": repair_run_id,
        },
    )
    decider_proc.start()

    worker_proc = multiprocessing.Process(
        target=worker.command.start,
        args=(
            domain,
            task_list,
        ),
        kwargs={
            "nb_processes": nb_workers,
            "heartbeat": heartbeat,
        },
    )
    worker_proc.start()

    print("starting workflow {}".format(workflow), file=sys.stderr)
    ex = start_workflow.callback(
        workflow,
        domain,
        workflow_id,
        task_list,
        execution_timeout,
        tags,
        decision_tasks_timeout,
        format.input(wf_input),
        None,
        local=False,
    )
    while True:
        time.sleep(2)
        ex = helpers.get_workflow_execution(
            domain,
            ex.workflow_id,
            ex.run_id,
        )
        if display_status:
            print("status: {}".format(ex.status), file=sys.stderr)
        if ex.status == ex.STATUS_CLOSED:
            print("execution {} finished".format(ex.workflow_id),
                  file=sys.stderr)
            break

    os.kill(worker_proc.pid, signal.SIGTERM)
    worker_proc.join()
    os.kill(decider_proc.pid, signal.SIGTERM)
    decider_proc.join()
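
create_unique_task_list() is also not shown; a hypothetical implementation only needs to avoid collisions between concurrent standalone runs, for instance:

import uuid

def create_unique_task_list(workflow_id=""):
    # Illustrative: per-run task list name derived from the workflow id.
    return "{}-{}".format(workflow_id, uuid.uuid4())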
Example #31
def spawn(poller, token, task, heartbeat=60):
    """
    Spawn a process and wait for it to end, sending heartbeats to SWF.

    On activity timeouts and termination, we reap the worker process and its
    children.

    :param poller:
    :type poller: ActivityPoller
    :param token:
    :type token: str
    :param task:
    :type task: swf.models.ActivityTask
    :param heartbeat: heartbeat delay (seconds)
    :type heartbeat: int
    """
    logger.info('spawning new activity worker pid={} heartbeat={}'.format(os.getpid(), heartbeat))
    worker = multiprocessing.Process(
        target=process_task,
        args=(poller, token, task),
    )
    worker.start()

    def worker_alive():
        return psutil.pid_exists(worker.pid)

    while worker_alive():
        worker.join(timeout=heartbeat)
        if not worker_alive():
            # Most certainly unneeded: we'll see
            if worker.exitcode is None:
                # race condition, try and re-join
                worker.join(timeout=0)
                if worker.exitcode is None:
                    logger.warning("process {} is dead but multiprocessing doesn't know it (simpleflow bug)".format(
                        worker.pid
                    ))
            if worker.exitcode != 0:
                poller.fail_with_retry(
                    token,
                    task,
                    reason='process {} died: exit code {}'.format(
                        worker.pid,
                        worker.exitcode)
                )
            return
        try:
            logger.debug(
                'heartbeating for pid={} (token={})'.format(worker.pid, token)
            )
            response = poller.heartbeat(token)
        except swf.exceptions.DoesNotExistError as error:
            # Either the task or the workflow execution no longer exists,
            # let's kill the worker process.
            logger.warning('heartbeat failed: {}'.format(error))
            logger.warning('killing (KILL) worker with pid={}'.format(worker.pid))
            reap_process_tree(worker.pid)
            return
        except swf.exceptions.RateLimitExceededError as error:
            # ignore rate limit errors: chances are high that the next heartbeat
            # will be ok anyway, so it would be stupid to break the task for that
            logger.warning(
                'got a "ThrottlingException / Rate exceeded" when heartbeating for task {}: {}'.format(
                    task.activity_type.name,
                    error))
            continue
        except Exception as error:
            # Let's crash if the heartbeat cannot be sent. The subprocess will
            # become an orphan and the heartbeat timeout may eventually trigger
            # on the Amazon SWF side.
            logger.error('cannot send heartbeat for task {}: {}'.format(
                task.activity_type.name,
                error))
            raise

        # Task cancelled.
        if response and response.get('cancelRequested'):
            reap_process_tree(worker.pid)
            return
Example #32
    def submit(self, func, *args, **kwargs):
        logger.info('executing task {}(args={}, kwargs={})'.format(
            func, args, kwargs))

        future = futures.Future()

        context = self.get_run_context()
        context["activity_id"] = str(self.nb_activities)
        self.nb_activities += 1

        # Ensure signals ordering
        if isinstance(func, SignalTask):
            self.signals_sent.add(func.name)
        elif isinstance(func, WaitForSignal):
            signal_name = func.signal_name
            if signal_name not in self.signals_sent:
                raise NotImplementedError(
                    'wait_signal({}) before signal was sent: unsupported by the local executor'
                    .format(signal_name))
        elif isinstance(func, MarkerTask):
            self._markers.setdefault(func.name,
                                     []).append(Marker(func.name,
                                                       func.details))

        if isinstance(func, Submittable):
            task = func  # *args, **kwargs already resolved.
            task.context = context
            func = getattr(task, 'activity', None)
        elif isinstance(func, Activity):
            task = ActivityTask(func, context=context, *args, **kwargs)
        elif issubclass(func, Workflow):
            task = WorkflowTask(self, func, *args, **kwargs)
        else:
            raise TypeError('invalid type {} for {}'.format(type(func), func))

        if isinstance(task, WorkflowTask):
            self.on_new_workflow(task)

        try:
            future._result = task.execute()
            if hasattr(task, 'post_execute'):
                task.post_execute()
            state = 'completed'
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            future._exception = exc_value
            logger.exception('rescuing exception: {}'.format(exc_value))
            if (isinstance(func, Activity)
                    or issubclass_(func, Workflow)) and getattr(
                        func, 'raises_on_failure', None):
                tb = traceback.format_tb(exc_traceback)
                message = format_exc(exc_value)
                details = json_dumps(
                    {
                        'error': exc_type.__name__,
                        'message': str(exc_value),
                        'traceback': tb,
                    },
                    default=repr)
                raise exceptions.TaskFailed(
                    func.name,
                    message,
                    details,
                )
            state = 'failed'
        finally:
            if isinstance(task, WorkflowTask):
                self.on_completed_workflow()
            future._state = futures.FINISHED

        if func:
            self._history.add_activity_task(func,
                                            decision_id=None,
                                            last_state=state,
                                            activity_id=context["activity_id"],
                                            input={
                                                'args': args,
                                                'kwargs': kwargs
                                            },
                                            result=future.result)
        return future
Example #33
    def submit(self, func, *args, **kwargs):
        logger.info("executing task {}(args={}, kwargs={})".format(func, args, kwargs))

        future = futures.Future()

        context = self.get_run_context()
        context["activity_id"] = str(self.nb_activities)
        self.nb_activities += 1

        # Ensure signals ordering
        if isinstance(func, SignalTask):
            self.signals_sent.add(func.name)
        elif isinstance(func, WaitForSignal):
            signal_name = func.signal_name
            if signal_name not in self.signals_sent:
                raise NotImplementedError(
                    "wait_signal({}) before signal was sent: unsupported by the local executor".format(
                        signal_name
                    )
                )
        elif isinstance(func, MarkerTask):
            self._markers.setdefault(func.name, []).append(
                Marker(func.name, func.details)
            )

        if isinstance(func, Submittable):
            task = func  # *args, **kwargs already resolved.
            task.context = context
            func = getattr(task, "activity", None)
        elif isinstance(func, Activity):
            task = ActivityTask(func, context=context, *args, **kwargs)
        elif issubclass(func, Workflow):
            task = WorkflowTask(self, func, *args, **kwargs)
        else:
            raise TypeError("invalid type {} for {}".format(type(func), func))

        if isinstance(task, WorkflowTask):
            self.on_new_workflow(task)

        try:
            future._result = task.execute()
            if hasattr(task, "post_execute"):
                task.post_execute()
            state = "completed"
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            tb = traceback.format_tb(exc_traceback)
            task_failed = exceptions.TaskFailed(
                name=getattr(task, "name", "unknown"),
                reason=format_exc(exc_value),
                details=json_dumps(
                    {
                        "error": exc_type.__name__,
                        "error_type": format_exc_type(exc_type),
                        "message": str(exc_value),
                        "traceback": tb,
                    },
                    default=repr,
                ),
            )
            future.set_exception(task_failed)
            logger.exception("rescuing exception: {}".format(exc_value))
            if (isinstance(func, Activity) or issubclass_(func, Workflow)) and getattr(
                func, "raises_on_failure", None
            ):
                raise task_failed
            state = "failed"
        finally:
            if isinstance(task, WorkflowTask):
                self.on_completed_workflow()
            future._state = futures.FINISHED

        if func:
            self._history.add_activity_task(
                func,
                decision_id=None,
                last_state=state,
                activity_id=context["activity_id"],
                input={"args": args, "kwargs": kwargs},
                result=future.result,
            )
        return future
Example #34
 def on_terminate(p):
     logger.info("process: terminated pid={} retcode={}".format(p.pid, p.returncode))
Example #35
 def _download_binary(self):
     logger.info("Downloading binary: {} -> {}".format(self.remote_location, self.local_location))
     bucket, path = self.remote_location.replace("s3://", "", 1).split("/", 1)
     # with FileLock(dest):
     pull(bucket, path, self.local_location)
     os.chmod(self.local_location, 0o755)
Example #36
 def stop_gracefully(self):
     """
     Stop the actor processes and subprocesses.
     """
     logger.info('stopping %s', self.name)
     self.is_alive = False  # No longer take requests.
Example #37
def standalone(context,
               workflow,
               domain,
               workflow_id,
               execution_timeout,
               tags,
               decision_tasks_timeout,
               input,
               input_file,
               nb_workers,
               nb_deciders,
               heartbeat,
               display_status,
               repair,
               force_activities,
               ):
    """
    This command spawns a decider and an activity worker to execute a workflow
    within a single main process.

    """
    disable_boto_connection_pooling()

    if force_activities and not repair:
        raise ValueError(
            "You should only use --force-activities with --repair."
        )

    workflow_class = get_workflow(workflow)
    if not workflow_id:
        workflow_id = workflow_class.name

    wf_input = {}
    if input or input_file:
        wf_input = get_or_load_input(input_file, input)

    if repair:
        repair_run_id = None
        if " " in repair:
            repair, repair_run_id = repair.split(" ", 1)
        # get the previous execution history; it will serve as the "default history"
        # for activities that succeeded in the previous execution
        logger.info(
            'retrieving history of previous execution: domain={} '
            'workflow_id={} run_id={}'.format(domain, repair, repair_run_id)
        )
        workflow_execution = get_workflow_execution(domain, repair, run_id=repair_run_id)
        previous_history = History(workflow_execution.history())
        repair_run_id = workflow_execution.run_id
        previous_history.parse()
        # get the previous execution input if none passed
        if not input and not input_file:
            wf_input = previous_history.events[0].input
        if not tags:
            tags = workflow_execution.tag_list
    else:
        previous_history = None
        repair_run_id = None
        if not tags:
            get_tag_list = getattr(workflow_class, 'get_tag_list', None)
            if get_tag_list:
                tags = get_tag_list(workflow_class, *wf_input.get('args', ()), **wf_input.get('kwargs', {}))
            else:
                tags = getattr(workflow_class, 'tag_list', None)
            if tags == Workflow.INHERIT_TAG_LIST:
                tags = None

    task_list = create_unique_task_list(workflow_id)
    logger.info('using task list {}'.format(task_list))
    decider_proc = multiprocessing.Process(
        target=decider.command.start,
        args=(
            [workflow],
            domain,
            task_list,
        ),
        kwargs={
            'nb_processes': nb_deciders,
            'repair_with': previous_history,
            'force_activities': force_activities,
            'is_standalone': True,
            'repair_workflow_id': repair or None,
            'repair_run_id': repair_run_id,
        },
    )
    decider_proc.start()

    worker_proc = multiprocessing.Process(
        target=worker.command.start,
        args=(
            domain,
            task_list,
        ),
        kwargs={
            'nb_processes': nb_workers,
            'heartbeat': heartbeat,
        },
    )
    worker_proc.start()

    print('starting workflow {}'.format(workflow), file=sys.stderr)
    ex = start_workflow.callback(
        workflow,
        domain,
        workflow_id,
        task_list,
        execution_timeout,
        tags,
        decision_tasks_timeout,
        format.input(wf_input),
        None,
        local=False,
    )
    while True:
        time.sleep(2)
        ex = helpers.get_workflow_execution(
            domain,
            ex.workflow_id,
            ex.run_id,
        )
        if display_status:
            print('status: {}'.format(ex.status), file=sys.stderr)
        if ex.status == ex.STATUS_CLOSED:
            print('execution {} finished'.format(ex.workflow_id), file=sys.stderr)
            break

    os.kill(worker_proc.pid, signal.SIGTERM)
    worker_proc.join()
    os.kill(decider_proc.pid, signal.SIGTERM)
    decider_proc.join()