コード例 #1
0
 def create(self, parent):
     """
     Build a fresh environment and populate it with the added resources.

     :param parent: Object passed to ``Environment`` as its ``parent``.
     :return: The newly created environment, holding every item of
         ``self._resources`` in iteration order.
     """
     environment = Environment(parent=parent)
     # Register the resources one by one, preserving their stored order.
     for resource in self._resources:
         environment.add(resource)
     return environment
コード例 #2
0
 def __init__(self, **options):
     """
     Set up the pool's scheduling state, worker container and connection.

     :param options: Keyword options forwarded unchanged to the parent
         class initializer.
     """
     super(Pool, self).__init__(**options)

     # Plain bookkeeping containers.
     self.unassigned = []  # uids of tasks not yet handed to a worker
     self.task_assign_cnt = {}  # uid -> number of times it was assigned
     self._metadata = {}

     # Reschedule policy and synchronization primitive.
     self.should_reschedule = default_check_reschedule
     self._pool_lock = threading.Lock()

     # Collaborators built from this pool / its configuration.
     self._workers = Environment(parent=self)
     self._conn = self.CONN_MANAGER(self._cfg)
コード例 #3
0
class Pool(Executor):
    """
    Pool task executor object that initializes workers and dispatches tasks.

    Tasks are added with :py:meth:`add`, queued in ``unassigned`` and handed
    out to workers when they send ``TaskPullRequest`` messages. Results come
    back through ``TaskResults`` messages and are stored in ``_results``.
    """

    CONFIG = PoolConfig
    CONN_MANAGER = ConnectionManager

    def __init__(self, **options):
        """
        Initialize scheduling state, the worker container and the connection
        manager.

        :param options: Keyword options forwarded to the parent initializer.
        """
        super(Pool, self).__init__(**options)
        self.unassigned = []  # unassigned tasks
        self.task_assign_cnt = {}  # uid: times_assigned
        self.should_reschedule = default_check_reschedule
        self._workers = Environment(parent=self)
        self._workers_last_result = {}
        # worker: timestamp of first handler check or of the last restart.
        # Kept per pool instance so that state (and worker references) are
        # not shared between unrelated pools.
        self._workers_last_killed = {}
        self._conn = self.CONN_MANAGER(self._cfg)
        self._pool_lock = threading.Lock()
        self._request_handlers = {}
        self._metadata = {}

    def uid(self):
        """Pool name."""
        return self.cfg.name

    def add(self, task, uid):
        """
        Add a task for execution.

        :param task: Task to be scheduled to workers.
        :type task: :py:class:`~testplan.runners.pools.tasks.base.Task`
        :param uid: Task uid.
        :type uid: ``str``
        :raises ValueError: If ``task`` is not a ``Task`` instance.
        """
        if not isinstance(task, Task):
            raise ValueError('Task was expected, got {} instead.'.format(
                type(task)))
        super(Pool, self).add(task, uid)
        self.unassigned.append(uid)

    def set_reschedule_check(self, check_reschedule):
        """
        Sets callable with custom rules to determine if a task should be
        rescheduled. It must accept the pool object and the task result,
        and based on these it returns if the task should be rescheduled
        (i.e due to a known rare system error).

        The callable itself is expected to return ``True`` if the task
        should be rescheduled and ``False`` otherwise; this method returns
        nothing.

        :param check_reschedule: Custom callable for task reschedule.
        :type check_reschedule: ``callable`` that takes
          ``pool``, ``task_result`` arguments.
        """
        validate_func('pool', 'task_result')(check_reschedule)
        self.should_reschedule = check_reschedule

    def _loop(self):
        """
        Main pool loop: start the worker monitoring thread, then keep
        accepting and handling worker messages while the pool is active.
        """
        worker_monitor = threading.Thread(target=self._workers_monitoring)
        worker_monitor.daemon = True
        worker_monitor.start()

        while self.active:
            if self.status.tag == self.status.STARTING:
                self.status.change(self.status.STARTED)
            elif self.status.tag == self.status.STOPPING:
                self.status.change(self.status.STOPPED)
                break
            else:
                msg = self._conn.accept()
                if msg:
                    try:
                        # Serialize request handling with the monitor thread,
                        # which takes the same lock.
                        with self._pool_lock:
                            self.handle_request(msg)
                    except Exception as exc:
                        # A failing request must not terminate the loop.
                        self.logger.error(format_trace(inspect.trace(), exc))
            time.sleep(self.cfg.active_loop_sleep)

    def handle_request(self, request):
        """
        Handles a worker request. I.e TaskPull, TaskResults, Heartbeat etc.

        Every request is answered through ``worker.respond`` except those
        dispatched to a handler registered in ``_request_handlers``, which
        receives the worker and the response object and is left to reply.

        :param request: Worker request.
        :type request: :py:class:`~testplan.runners.pools.communication.Message`
        """
        sender_index = request.sender_metadata['index']
        worker = self._workers[sender_index]
        if not worker.active:
            self.logger.critical(
                'Ignoring message {} - {} from inactive worker {}'.format(
                    request.cmd, request.data, worker))
            # TODO check whether should we respond.
            worker.respond(Message(**self._metadata).make(Message.Ack))
            return
        else:
            # Any message from an active worker counts as a sign of life.
            worker.last_heartbeat = time.time()

        self.logger.debug('Pool {} request received by {} - {}, {}'.format(
            self.cfg.name, worker, request.cmd, request.data))
        response = Message(**self._metadata)  # Pool metadata

        if not self.active or self.status.tag == self.STATUS.STOPPING:
            worker.respond(response.make(Message.Stop))
        elif request.cmd == Message.ConfigRequest:
            # Collect the denormalized config of this pool and of every
            # ancestor config reachable through ``parent``.
            options = []
            cfg = self.cfg
            while cfg:
                try:
                    options.append(cfg.denormalize())
                except Exception as exc:
                    self.logger.error('Could not denormalize: {} - {}'.format(
                        cfg, exc))
                cfg = cfg.parent
            worker.respond(response.make(Message.ConfigSending, data=options))
        elif request.cmd == Message.TaskPullRequest:
            tasks = []
            if self.status.tag == self.status.STARTED:
                # ``request.data`` is the number of tasks the worker asks for.
                for _ in range(request.data):
                    try:
                        uid = self.unassigned.pop(0)
                    except IndexError:
                        # Nothing left to schedule.
                        break
                    if uid not in self.task_assign_cnt:
                        self.task_assign_cnt[uid] = 0
                    if self.task_assign_cnt[uid] >= self.cfg.task_retries_limit:
                        self._discard_task(
                            uid, '{} already reached max retries: {}'.format(
                                self._input[uid], self.cfg.task_retries_limit))
                        continue
                    else:
                        self.task_assign_cnt[uid] += 1
                        task = self._input[uid]
                        self.logger.test_info('Scheduling {} to {}'.format(
                            task, worker))
                        worker.assigned.add(uid)
                        tasks.append(task)
                if tasks:
                    worker.respond(
                        response.make(Message.TaskSending, data=tasks))
                    # Remember how many requested tasks could not be served.
                    worker.requesting = request.data - len(tasks)
                    return
            worker.requesting = request.data
            worker.respond(response.make(Message.Ack))
        elif request.cmd == Message.TaskResults:
            for task_result in request.data:
                uid = task_result.task.uid()
                worker.assigned.remove(uid)
                # NOTE(review): the timestamp is only stored the first time
                # a worker reports a result and is never refreshed here;
                # confirm whether it should be updated on every result.
                if worker not in self._workers_last_result:
                    self._workers_last_result[worker] = time.time()
                self.logger.test_info('De-assign {} from {}'.format(
                    task_result.task, worker))

                if self.should_reschedule(self, task_result):
                    if self.task_assign_cnt[uid] >= self.cfg.task_retries_limit:
                        # Retry budget exhausted: fall through and record
                        # the result as it is.
                        self.logger.test_info(
                            'Will not reschedule {} again as it '
                            'reached max retries'.format(self._input[uid]))
                    else:
                        self.logger.test_info(
                            'Rescheduling {} due to '
                            'should_reschedule() cfg option of {}'.format(
                                task_result.task, self))
                        self.unassigned.append(uid)
                        continue

                self._print_test_result(task_result)
                self._results[uid] = task_result
                self.ongoing.remove(uid)
            worker.respond(response.make(Message.Ack))
        elif request.cmd == Message.Heartbeat:
            worker.last_heartbeat = time.time()
            self.logger.debug(
                'Received heartbeat from {} at {} after {}s.'.format(
                    worker, request.data,
                    time.time() - request.data))
            worker.respond(
                response.make(Message.Ack, data=worker.last_heartbeat))
        elif request.cmd == Message.SetupFailed:
            self.logger.test_info('Worker {} setup failed:{}{}'.format(
                worker, os.linesep, request.data))
            worker.respond(response.make(Message.Ack))
            self._deco_worker(worker, 'Aborting {}, setup failed.')
        elif request.cmd in self._request_handlers:
            self._request_handlers[request.cmd](worker, response)
        else:
            self.logger.error('Unknown request: {} {} {} {}'.format(
                request, dir(request), request.cmd, request.data))
            worker.respond(response.make(Message.Ack))

    def _deco_worker(self, worker, message):
        """
        Decommission a worker: log the reason, put its assigned tasks back
        into the unassigned queue and abort it.

        :param worker: Worker to decommission.
        :param message: Log message template; formatted with the worker.
        :type message: ``str``
        """
        self.logger.critical(message.format(worker))
        if os.path.exists(worker.outfile):
            self.logger.critical('\tlogfile: {}'.format(worker.outfile))
        while worker.assigned:
            uid = worker.assigned.pop()
            self.logger.test_info('Re-assigning {} from {} to {}.'.format(
                self._input[uid], worker, self))
            self.unassigned.append(uid)
        worker.abort()

    def _workers_handler_monitoring(self, worker, workers_last_killed=None):
        """
        Check the process behind ``worker.handler`` and restart the worker
        when all of its child processes are zombies and it has not reported
        a result for longer than ``cfg.worker_inactivity_threshold``.

        Nothing is done when the worker has no assigned tasks or when less
        than the inactivity threshold has elapsed since it was first checked
        or last restarted.

        :param worker: Worker whose handler process is inspected.
        :param workers_last_killed: Optional mapping of worker to the
            timestamp of its first check / last restart. Defaults to the
            per-pool mapping kept on this instance.
        :type workers_last_killed: ``dict`` or ``NoneType``
        """
        if workers_last_killed is None:
            workers_last_killed = self._workers_last_killed

        inactivity_threshold = self.cfg.worker_inactivity_threshold

        if worker not in workers_last_killed:
            workers_last_killed[worker] = time.time()

        worker_last_killed = workers_last_killed[worker]
        if not worker.assigned or\
            time.time() - worker_last_killed < inactivity_threshold:
            return

        try:
            proc = psutil.Process(worker.handler.pid)
            children = list(proc.children(recursive=True))
            worker_last_result = self._workers_last_result.get(worker, 0)
            # ``all`` over an empty list is True, so a handler without any
            # child process also satisfies the first condition.
            if all(item.status() == 'zombie' for item in children) and\
                    time.time() - worker_last_result > inactivity_threshold:
                workers_last_killed[worker] = time.time()
                try:
                    while worker.assigned:
                        uid = worker.assigned.pop()
                        self.logger.test_info(
                            'Re-assigning {} from {} to {}.'.format(
                                self._input[uid], worker, self))
                        self.unassigned.append(uid)
                    self.logger.test_info(
                        'Restarting worker: {}'.format(worker))
                    worker.stop()
                    worker.start()
                except Exception as exc:
                    self.logger.critical(
                        'Worker {} failed to restart: {}'.format(worker, exc))
                    self._deco_worker(
                        worker, 'Aborting {}, due to defunct child process.')
        except psutil.NoSuchProcess:
            # The handler process is already gone; nothing to inspect.
            pass

    def _workers_monitoring(self):
        """
        Worker fault-tolerance loop, run in a separate thread by ``_loop``.

        Periodically classifies workers by their heartbeat status and
        decommissions the ones that failed to initialize within
        ``cfg.heartbeat_init_window`` or stopped sending heartbeats. Aborts
        the whole pool once every worker is inactive. Returns immediately
        when ``cfg.worker_heartbeat`` is not set.
        """
        if not self.cfg.worker_heartbeat:
            # No heartbeat means no fault tolerance for worker.
            return

        monitor_started = time.time()
        # Maximum tolerated silence of a worker, also used as loop period.
        loop_sleep = self.cfg.worker_heartbeat * self.cfg.heartbeats_miss_limit

        while self._loop_handler.is_alive():
            w_total = set()
            w_uninitialized = set()
            w_active = set()
            w_inactive = set()

            monitor_alive = time.time() - monitor_started
            init_window = monitor_alive <= self.cfg.heartbeat_init_window
            with self._pool_lock:
                for worker in self._workers:
                    if getattr(worker, 'handler', None):
                        self._workers_handler_monitoring(worker)
                    w_total.add(worker)
                    if not worker.active:
                        w_inactive.add(worker)
                    elif worker.last_heartbeat is None:
                        w_uninitialized.add(worker)
                        if not init_window:
                            self._deco_worker(
                                worker, 'Aborting {}, could not initialize.')
                    elif time.time() - worker.last_heartbeat > loop_sleep:
                        w_inactive.add(worker)
                        self._deco_worker(
                            worker, 'Aborting {}, failed to send heartbeats.')
                    else:
                        w_active.add(worker)

                if w_total:
                    if len(w_inactive) == len(w_total):
                        self.logger.critical(
                            'All workers of {} are inactive.'.format(self))
                        self.abort()
                        break
            try:
                # For early finish of worker monitoring thread.
                wait_until_predicate(lambda: not self._loop_handler.is_alive(),
                                     timeout=loop_sleep,
                                     interval=0.05)
            except RuntimeError:
                return

    def _discard_task(self, uid, reason):
        """
        Record a failed ``TaskResult`` for a task and remove it from the
        ongoing list.

        :param uid: Uid of the task to discard.
        :type uid: ``str``
        :param reason: Explanation included in the log and in the result.
        :type reason: ``str``
        """
        self.logger.critical('Discard task {} of {} - {}.'.format(
            self._input[uid], self, reason))
        self._results[uid] = TaskResult(
            task=self._input[uid],
            status=False,
            reason='Task discarded by {} - {}.'.format(self, reason))
        self.ongoing.remove(uid)

    def _discard_pending_tasks(self):
        """
        Record a failed ``TaskResult`` for every task still in ``ongoing``
        and empty that list.
        """
        self.logger.critical('Discard pending tasks of {}.'.format(self))
        while self.ongoing:
            uid = self.ongoing[0]
            self._results[uid] = TaskResult(
                task=self._input[uid],
                status=False,
                reason='Task discarding due to pool {} abort.'.format(self))
            self.ongoing.pop(0)

    def _print_test_result(self, task_result):
        """
        Log a one-line Pass/Fail summary for a task result.

        Results that are not a ``RunnableResult`` or carry no ``report``
        attribute are silently skipped.

        :param task_result: Result received from a worker.
        """
        if not isinstance(task_result.result, RunnableResult) or\
           not hasattr(task_result.result, 'report'):
            return

        # Currently prints report top level result and not details.
        name = task_result.result.report.name
        if task_result.result.report.passed is True:
            self.logger.test_info('{} -> {}'.format(name, Color.green('Pass')))
        else:
            self.logger.test_info('{} -> {}'.format(name, Color.red('Fail')))

    def _add_workers(self):
        """
        Create ``cfg.size`` workers of type ``cfg.worker_type``, attach them
        to this pool and register them with the connection manager.

        Workers are indexed by the string form of their position
        (``'0'``, ``'1'``, ...), which is also their uid in the worker
        environment.
        """
        for idx in (str(i) for i in range(self.cfg.size)):
            worker = self.cfg.worker_type(index=idx)
            self.logger.debug('Initialized %s', worker)
            worker.parent = self
            worker.cfg.parent = self.cfg
            # ``idx`` is a string, so it must be rendered with %s (a %d
            # conversion would fail when the record is formatted).
            self.logger.debug('Worker %(index)s outfile = %(outfile)s', {
                'index': idx,
                'outfile': worker.outfile
            })
            self._workers.add(worker, uid=idx)
            self._conn.register(worker)
            self.logger.debug('Added {}.'.format(worker))

    def starting(self):
        """
        Starting the pool and workers.

        :raises RuntimeError: If the worker environment reports any start
            exceptions; the workers are stopped before raising.
        """
        super(Pool, self).starting()
        self.make_runpath_dirs()
        self._metadata['runpath'] = self.runpath
        self._add_workers()
        self._workers.start()
        if self._workers.start_exceptions:
            for msg in self._workers.start_exceptions.values():
                self.logger.error(msg)
            self._workers.stop()
            raise RuntimeError(
                'All workers of {} failed to start.'.format(self))

    def workers_requests(self):
        """Count how many tasks workers are requesting."""
        return sum(worker.requesting for worker in self._workers)

    def stopping(self):
        """Stop connections and workers."""
        self._conn.close()
        self._workers.stop()
        super(Pool, self).stopping()

    def abort_dependencies(self):
        """Empty generator to override parent implementation."""
        return
        # The unreachable ``yield`` is what makes this function a generator.
        yield

    def aborting(self):
        """
        Aborting logic: close the connection, abort every worker and mark
        all pending tasks as failed.
        """
        self.logger.debug('Aborting pool {}'.format(self))
        self._conn.close()
        for worker in self._workers:
            worker.abort()
        self._discard_pending_tasks()
        self.logger.debug('Aborted pool {}'.format(self))