Example #1
    def __init__(self, conf, beanstalkd_addr=None, logger=None):
        self.conf = conf
        self.logger = logger or get_logger(self.conf)
        self.namespace = conf['namespace']
        self.success = True

        # counters
        self.items_processed = 0
        self.total_items_processed = 0
        self.errors = 0
        self.total_errors = 0
        self.total_expected_items = None

        # report
        self.start_time = 0
        self.last_report = 0
        self.report_interval = int_value(self.conf.get('report_interval'),
                                         self.DEFAULT_REPORT_INTERVAL)

        # dispatcher
        self.dispatcher = None

        # input
        self.beanstalkd = None
        if beanstalkd_addr:
            self.beanstalkd = BeanstalkdListener(
                beanstalkd_addr,
                self.conf.get('beanstalkd_worker_tube')
                or self.DEFAULT_BEANSTALKD_WORKER_TUBE, self.logger)
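The constructor above reads each tunable from conf through int_value(), falling back to a class-level default when the key is missing. A minimal sketch of that helper's contract (an illustrative assumption; the real oio helper may differ in details):

def int_value(value, default):
    """Return `value` as an int, or `default` when no value was given."""
    if value is None:
        return default
    return int(value)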
Example #2
    def __init__(self, conf, logger, beanstalkd_addr, **kwargs):
        super(BlobImprover, self).__init__(conf, logger, volume=None, **kwargs)
        self.content_factory = ContentFactory(self.conf, logger=self.logger)
        beanstalkd_tube = self.conf.get('beanstalkd_tube',
                                        DEFAULT_IMPROVER_TUBE)
        self.listener = BeanstalkdListener(beanstalkd_addr, beanstalkd_tube,
                                           self.logger, **kwargs)
        self.sender = BeanstalkdSender(beanstalkd_addr, beanstalkd_tube,
                                       self.logger, **kwargs)
        self.retry_delay = int_value(self.conf.get('retry_delay'), 30)
        self.reqid_prefix = 'blob-impr-'
Example #3
    def listen_beanstalkd_reply_forever(self):
        """
            Process this orchestrator's job replies
        """

        self.logger.info('Connecting to the reply beanstalkd')

        while self.running:
            try:
                listener = BeanstalkdListener(
                    addr=self.beanstalkd_reply_addr,
                    tube=self.beanstalkd_reply_tube,
                    logger=self.logger)

                break
            except ConnectionError:
                self.logger.error('Failed to connect to the reply beanstalkd')

            sleep(5)

        self.logger.info('Listening to replies on %s (tube=%s)',
                         self.beanstalkd_reply_addr,
                         self.beanstalkd_reply_tube)

        # keep the job results in memory
        while self.running:
            connection_error = self.listen_loop(listener)

            # in case of a beanstalkd connection error
            # sleep to avoid spamming
            if connection_error:
                sleep(2)

        self.logger.info('Exited listening thread')
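The connect-with-retry loop above can be factored into a small helper; a sketch, assuming that only ConnectionError signals an unreachable beanstalkd:

def connect_reply_listener(addr, tube, logger, running=lambda: True):
    # Retry until the reply beanstalkd accepts the connection,
    # sleeping between attempts to avoid spamming it.
    while running():
        try:
            return BeanstalkdListener(addr=addr, tube=tube, logger=logger)
        except ConnectionError:
            logger.error('Failed to connect to the reply beanstalkd')
        sleep(5)
    return None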
Example #4
    def __init__(self, conf, beanstalkd_addr=None, logger=None):
        self.conf = conf
        self.logger = logger or get_logger(self.conf)
        self.namespace = conf['namespace']
        self.success = True

        # exit gracefully
        self.running = True
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

        # counters
        self.items_processed = 0
        self.total_items_processed = 0
        self.errors = 0
        self.total_errors = 0
        self.total_expected_items = None

        # report
        self.start_time = 0
        self.last_report = 0
        self.report_interval = int_value(self.conf.get('report_interval'),
                                         self.DEFAULT_REPORT_INTERVAL)

        # dispatcher
        self.dispatcher = None

        # input
        self.beanstalkd = None
        if beanstalkd_addr:
            self.beanstalkd = BeanstalkdListener(
                beanstalkd_addr,
                self.conf.get('beanstalkd_worker_tube')
                or self.DEFAULT_BEANSTALKD_WORKER_TUBE,
                self.logger)

        # retry
        self.retryer = None
        self.retry_queue = None
        if self.beanstalkd:
            self.retryer = BeanstalkdSender(
                self.beanstalkd.addr, self.beanstalkd.tube, self.logger)
            self.retry_queue = eventlet.Queue()
        self.retry_delay = int_value(self.conf.get('retry_delay'),
                                     self.DEFAULT_RETRY_DELAY)
Example #5
class BlobImprover(Rebuilder):
    """
    Move chunks of objects declared as "perfectible",
    if possible to improve them (increased distance between chunks or
    better hosting service).
    """

    supported_events = (EventTypes.CONTENT_PERFECTIBLE, )

    def __init__(self, conf, logger, beanstalkd_addr, **kwargs):
        super(BlobImprover, self).__init__(conf, logger, volume=None, **kwargs)
        self.content_factory = ContentFactory(self.conf, logger=self.logger)
        beanstalkd_tube = self.conf.get('beanstalkd_tube',
                                        DEFAULT_IMPROVER_TUBE)
        self.listener = BeanstalkdListener(beanstalkd_addr, beanstalkd_tube,
                                           self.logger, **kwargs)
        self.sender = BeanstalkdSender(beanstalkd_addr, beanstalkd_tube,
                                       self.logger, **kwargs)
        self.retry_delay = int_value(self.conf.get('retry_delay'), 30)
        self.reqid_prefix = 'blob-impr-'

    def exit_gracefully(self, signum, frame):
        super(BlobImprover, self).exit_gracefully(signum, frame)
        self.listener.running = False

    def _event_from_job(self, job_id, data, **kwargs):
        """Decode a JSON string into an event dictionary."""
        # pylint: disable=no-member
        event = json.loads(data)
        type_ = event.get('event')
        # Bury events that should not be there
        if type_ not in self.__class__.supported_events:
            msg = 'Discarding event %s (type=%s)' % (event.get('job_id'),
                                                     type_)
            self.logger.info(msg)
            raise exceptions.ExplicitBury(msg)
        yield event

    def _create_worker(self, **kwargs):
        return BlobImproverWorker(self, **kwargs)

    def _fill_queue(self, queue, **kwargs):
        max_events = kwargs.get('max_events', 0)  # 0 means no limit
        sent_events = 0
        # Do not block more than 2 seconds
        events = self.listener.fetch_jobs(self._event_from_job,
                                          reserve_timeout=2,
                                          **kwargs)
        for event in events:
            queue.put(event)
            sent_events += 1
            if max_events > 0 and sent_events >= max_events:
                self.logger.info('Max events (%d) reached, exiting',
                                 max_events)
                break
            if not self.running:
                break
        events.close()

    def _read_retry_queue(self, queue, **kwargs):
        while True:
            # Reschedule jobs we were not able to handle.
            item = queue.get()
            sent = False
            while not sent:
                sent = self.sender.send_job(json.dumps(item),
                                            delay=self.retry_delay)
                if not sent:
                    sleep(1.0)
            self.sender.job_done()
            queue.task_done()

    def _item_to_string(self, item, **kwargs):
        try:
            url = item['url']
            fullpath = encode_fullpath(url['account'],
                                       url['user'], url['path'],
                                       url.get('version', 1), url['content'])
            # TODO(FVE): maybe tell some numbers about chunks
            if item.get('event') == EventTypes.CONTENT_PERFECTIBLE:
                return 'perfectible object %s' % (fullpath, )
            else:
                return 'object %s' % (fullpath, )
        except (KeyError, ValueError) as err:
            return '<unknown item> ({0})'.format(repr(err))

    def _get_report(self, status, end_time, counters, **kwargs):
        items_processed, errors, total_items_processed, total_errors = counters
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        return ('%(status)s volume=%(volume)s '
                'last_report=%(last_report)s %(time_since_last_report).2fs '
                'chunks=%(chunks)d %(chunks_rate).2f/s '
                'errors=%(errors)d %(errors_rate).2f%% '
                'start_time=%(start_time)s %(total_time).2fs '
                'total_chunks=%(total_chunks)d '
                '%(total_chunks_rate).2f/s '
                'total_errors=%(total_errors)d %(total_errors_rate).2f%%' % {
                    'status': status,
                    'volume': self.volume,
                    'last_report': datetime.fromtimestamp(
                        int(self.last_report)).isoformat(),
                    'time_since_last_report': time_since_last_report,
                    'chunks': items_processed,
                    'chunks_rate': items_processed / time_since_last_report,
                    'errors': errors,
                    'errors_rate':
                        100 * errors / float(items_processed or 1),
                    'start_time': datetime.fromtimestamp(
                        int(self.start_time)).isoformat(),
                    'total_time': total_time,
                    'total_chunks': total_items_processed,
                    'total_chunks_rate': total_items_processed / total_time,
                    'total_errors': total_errors,
                    'total_errors_rate':
                        100 * total_errors / float(total_items_processed or 1)
                })
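A minimal usage sketch for the class above (the beanstalkd address is a placeholder; the Rebuilder entry point that drains the listener is not shown in this excerpt):

conf = {
    'namespace': 'OPENIO',
    'beanstalkd_tube': DEFAULT_IMPROVER_TUBE,
    'retry_delay': 30,
}
improver = BlobImprover(conf, get_logger(conf), '127.0.0.1:6014')
# The inherited Rebuilder machinery is expected to call _create_worker()
# and _fill_queue() to consume CONTENT_PERFECTIBLE events from the tube.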
Example #6
    def __init__(self, conf, tool):
        super(_DistributedDispatcher, self).__init__(conf, tool)
        self.sending = None

        self.max_items_per_second = int_value(
            self.conf.get('items_per_second'),
            self.tool.DEFAULT_ITEM_PER_SECOND)

        # All available beanstalkd
        conscience_client = ConscienceClient(self.conf)
        all_beanstalkd = conscience_client.all_services('beanstalkd')
        all_available_beanstalkd = dict()
        for beanstalkd in all_beanstalkd:
            if beanstalkd['score'] <= 0:
                continue
            all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
        if not all_available_beanstalkd:
            raise OioException('No beanstalkd available')

        # Beanstalkd workers
        workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
            or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
        self.beanstalkd_workers = dict()
        for beanstalkd in locate_tube(all_available_beanstalkd.values(),
                                      workers_tube):
            beanstalkd_worker = BeanstalkdSender(beanstalkd['addr'],
                                                 workers_tube, self.logger)
            self.beanstalkd_workers[beanstalkd['addr']] = beanstalkd_worker
            self.logger.info(
                'Beanstalkd %s using tube %s is selected as a worker',
                beanstalkd_worker.addr, beanstalkd_worker.tube)
        if not self.beanstalkd_workers:
            raise OioException('No beanstalkd worker available')
        nb_workers = len(self.beanstalkd_workers)
        if self.max_items_per_second > 0:
            # Max 2 seconds in advance
            queue_size_per_worker = (
                self.max_items_per_second * 2 // nb_workers)
        else:
            queue_size_per_worker = 64
        for _, beanstalkd_worker in self.beanstalkd_workers.items():
            beanstalkd_worker.low_limit = queue_size_per_worker // 2
            beanstalkd_worker.high_limit = queue_size_per_worker

        # Beanstalkd reply
        beanstalkd_reply = dict()
        try:
            local_services = conscience_client.local_services()
            for local_service in local_services:
                if local_service['type'] != 'beanstalkd':
                    continue
                beanstalkd = all_available_beanstalkd.get(
                    local_service['addr'])
                if beanstalkd is None:
                    continue
                if beanstalkd_reply \
                        and beanstalkd_reply['score'] >= beanstalkd['score']:
                    continue
                beanstalkd_reply = beanstalkd
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning(
                'ERROR when searching for beanstalkd locally: %s', exc)
        if not beanstalkd_reply:
            self.logger.warning('No beanstalkd available locally')

            try:
                beanstalkd = conscience_client.next_instance('beanstalkd')
                beanstalkd_reply = all_available_beanstalkd[beanstalkd['addr']]
            except Exception as exc:  # pylint: disable=broad-except
                self.logger.warning('ERROR when searching for beanstalkd: %s',
                                    exc)
        beanstalkd_reply_addr = beanstalkd_reply['addr']

        # The reply tube name embeds a timestamp; if it already exists,
        # another service must already be using it
        tube_reply = workers_tube + '.reply.' + str(time.time())
        tubes = Beanstalk.from_url('beanstalk://' +
                                   beanstalkd_reply_addr).tubes()
        if tube_reply in tubes:
            raise OioException(
                'Beanstalkd %s using tube %s is already used' %
                (beanstalkd_reply_addr, tube_reply))

        self.beanstalkd_reply = BeanstalkdListener(beanstalkd_reply_addr,
                                                   tube_reply, self.logger)
        self.logger.info(
            'Beanstalkd %s using tube %s is selected for the replies',
            self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)
Example #7
class _DistributedDispatcher(_Dispatcher):
    """
    Dispatch tasks on the platform.
    """
    def __init__(self, conf, tool):
        super(_DistributedDispatcher, self).__init__(conf, tool)
        self.sending = None

        self.max_items_per_second = int_value(
            self.conf.get('items_per_second'),
            self.tool.DEFAULT_ITEM_PER_SECOND)

        # All available beanstalkd
        conscience_client = ConscienceClient(self.conf)
        all_beanstalkd = conscience_client.all_services('beanstalkd')
        all_available_beanstalkd = dict()
        for beanstalkd in all_beanstalkd:
            if beanstalkd['score'] <= 0:
                continue
            all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
        if not all_available_beanstalkd:
            raise OioException('No beanstalkd available')

        # Beanstalkd workers
        workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
            or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
        self.beanstalkd_workers = dict()
        for beanstalkd in locate_tube(all_available_beanstalkd.values(),
                                      workers_tube):
            beanstalkd_worker = BeanstalkdSender(beanstalkd['addr'],
                                                 workers_tube, self.logger)
            self.beanstalkd_workers[beanstalkd['addr']] = beanstalkd_worker
            self.logger.info(
                'Beanstalkd %s using tube %s is selected as a worker',
                beanstalkd_worker.addr, beanstalkd_worker.tube)
        if not self.beanstalkd_workers:
            raise OioException('No beanstalkd worker available')
        nb_workers = len(self.beanstalkd_workers)
        if self.max_items_per_second > 0:
            # Max 2 seconds in advance
            queue_size_per_worker = (
                self.max_items_per_second * 2 // nb_workers)
        else:
            queue_size_per_worker = 64
        for _, beanstalkd_worker in self.beanstalkd_workers.items():
            beanstalkd_worker.low_limit = queue_size_per_worker // 2
            beanstalkd_worker.high_limit = queue_size_per_worker

        # Beanstalkd reply
        beanstalkd_reply = dict()
        try:
            local_services = conscience_client.local_services()
            for local_service in local_services:
                if local_service['type'] != 'beanstalkd':
                    continue
                beanstalkd = all_available_beanstalkd.get(
                    local_service['addr'])
                if beanstalkd is None:
                    continue
                if beanstalkd_reply \
                        and beanstalkd_reply['score'] >= beanstalkd['score']:
                    continue
                beanstalkd_reply = beanstalkd
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning(
                'ERROR when searching for beanstalkd locally: %s', exc)
        if not beanstalkd_reply:
            self.logger.warning('No beanstalkd available locally')

            try:
                beanstalkd = conscience_client.next_instance('beanstalkd')
                beanstalkd_reply = all_available_beanstalkd[beanstalkd['addr']]
            except Exception as exc:  # pylint: disable=broad-except
                self.logger.warning('ERROR when searching for beanstalkd: %s',
                                    exc)
        beanstalkd_reply_addr = beanstalkd_reply['addr']

        # The reply tube name embeds a timestamp; if it already exists,
        # another service must already be using it
        tube_reply = workers_tube + '.reply.' + str(time.time())
        tubes = Beanstalk.from_url('beanstalk://' +
                                   beanstalkd_reply_addr).tubes()
        if tube_reply in tubes:
            raise OioException(
                'Beanstalkd %s using tube %s is already used' %
                (beanstalkd_reply_addr, tube_reply))

        self.beanstalkd_reply = BeanstalkdListener(beanstalkd_reply_addr,
                                                   tube_reply, self.logger)
        self.logger.info(
            'Beanstalkd %s using tube %s is selected for the replies',
            self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)

    def _fetch_tasks_events_to_send(self):
        items_with_beanstalkd_reply = \
            self.tool.fetch_items_with_beanstalkd_reply()
        for item, _ in items_with_beanstalkd_reply:
            yield self.tool.task_event_from_item(item)

    def _tasks_res_from_res_event(self, job_id, data, **kwargs):
        res_event = json.loads(data)
        beanstalkd_worker_addr = res_event['beanstalkd_worker']['addr']
        tasks_res = self.tool.tasks_res_from_res_event(res_event)
        self.beanstalkd_workers[beanstalkd_worker_addr].job_done()
        return tasks_res

    def _all_events_are_processed(self):
        """
        Tell if all workers have finished to process their events.
        """
        if self.sending:
            return False

        total_events = 0
        for worker in self.beanstalkd_workers.values():
            total_events += worker.nb_jobs
        return total_events <= 0

    def _send_task_event(self, task_event, reply_loc, next_worker):
        """
        Send the event through a non-full sender.
        """
        task_event['beanstalkd_reply'] = reply_loc
        workers = list(self.beanstalkd_workers.values())
        nb_workers = len(workers)
        while True:
            for _ in range(nb_workers):
                success = workers[next_worker].send_job(json.dumps(task_event))
                next_worker = (next_worker + 1) % nb_workers
                if success:
                    return next_worker
            self.logger.warning('All beanstalkd workers are full')
            sleep(5)

    def _distribute_events(self, reply_loc=None):
        next_worker = 0
        items_run_time = 0

        try:
            tasks_events = self._fetch_tasks_events_to_send()
            items_run_time = ratelimit(items_run_time,
                                       self.max_items_per_second)
            next_worker = self._send_task_event(next(tasks_events), reply_loc,
                                                next_worker)
            self.sending = True
            for task_event in tasks_events:
                items_run_time = ratelimit(items_run_time,
                                           self.max_items_per_second)
                next_worker = self._send_task_event(task_event, reply_loc,
                                                    next_worker)

                if not self.tool.running:
                    break
        except Exception as exc:
            if not isinstance(exc, StopIteration) and self.tool.running:
                self.logger.error("Failed to distribute events: %s", exc)
                self.tool.success = False
        finally:
            self.sending = False

    def run(self):
        self.tool.start_time = self.tool.last_report = time.time()
        self.tool.log_report('START', force=True)
        reply_loc = {
            'addr': self.beanstalkd_reply.addr,
            'tube': self.beanstalkd_reply.tube
        }
        # pylint: disable=no-member
        thread = threading.Thread(target=self._distribute_events,
                                  args=[reply_loc])
        thread.start()

        # Wait until the thread is started sending events
        while self.sending is None:
            sleep(0.1)

        # Retrieve responses until all events are processed
        try:
            while not self._all_events_are_processed():
                tasks_res = self.beanstalkd_reply.fetch_job(
                    self._tasks_res_from_res_event,
                    timeout=DISTRIBUTED_DISPATCHER_TIMEOUT)
                for task_res in tasks_res:
                    self.tool.update_counters(task_res)
                    yield task_res
                self.tool.log_report('RUN')
        except OioTimeout:
            self.logger.error('No response for %d seconds',
                              DISTRIBUTED_DISPATCHER_TIMEOUT)
            self.tool.success = False
        except Exception:  # pylint: disable=broad-except
            self.logger.exception('ERROR in distributed dispatcher')
            self.tool.success = False

        self.tool.log_report('DONE', force=True)
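As _send_task_event() and _tasks_res_from_res_event() above show, each task event carries the reply location, and each result event names the worker that processed it. A sketch of both envelopes (addresses and the tube name are placeholders; the tool-specific payload is elided):

task_event = {
    # ... tool-specific payload ...
    'beanstalkd_reply': {
        'addr': '127.0.0.1:6014',
        'tube': 'oio-process.reply.1532969777.52',
    },
}
res_event = {
    # ... tool-specific results ...
    'beanstalkd_worker': {'addr': '127.0.0.1:6015'},
}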
Example #8
class Tool(object):
    """
    Process all found items.

    For the task_res variable, the following format must be respected:
    (item, info, error).
    """

    DEFAULT_BEANSTALKD_WORKER_TUBE = 'oio-process'
    DEFAULT_REPORT_INTERVAL = 3600
    DEFAULT_RETRY_DELAY = 3600
    DEFAULT_ITEM_PER_SECOND = 30
    DEFAULT_CONCURRENCY = 1
    DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE = 'oio-process'

    def __init__(self, conf, beanstalkd_addr=None, logger=None):
        self.conf = conf
        self.logger = logger or get_logger(self.conf)
        self.namespace = conf['namespace']
        self.success = True

        # exit gracefully
        self.running = True
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

        # counters
        self.items_processed = 0
        self.total_items_processed = 0
        self.errors = 0
        self.total_errors = 0
        self.total_expected_items = None

        # report
        self.start_time = 0
        self.last_report = 0
        self.report_interval = int_value(self.conf.get('report_interval'),
                                         self.DEFAULT_REPORT_INTERVAL)

        # dispatcher
        self.dispatcher = None

        # input
        self.beanstalkd = None
        if beanstalkd_addr:
            self.beanstalkd = BeanstalkdListener(
                beanstalkd_addr,
                self.conf.get('beanstalkd_worker_tube')
                or self.DEFAULT_BEANSTALKD_WORKER_TUBE, self.logger)

        # retry
        self.retryer = None
        self.retry_queue = None
        if self.beanstalkd:
            self.retryer = BeanstalkdSender(self.beanstalkd.addr,
                                            self.beanstalkd.tube, self.logger)
            self.retry_queue = eventlet.Queue()
        self.retry_delay = int_value(self.conf.get('retry_delay'),
                                     self.DEFAULT_RETRY_DELAY)

    @staticmethod
    def items_from_task_event(task_event):
        """
        Convert the task event into a list (generator) of items.
        """
        raise NotImplementedError()

    @staticmethod
    def task_event_from_item(item):
        """
        Convert the item into a task event.
        """
        raise NotImplementedError()

    @staticmethod
    def tasks_res_from_res_event(res_event):
        """
        Convert the result event into a list (generator) of tasks result.
        """
        raise NotImplementedError()

    @staticmethod
    def res_event_from_task_res(task_res):
        """
        Convert the task result into a result event.
        """
        raise NotImplementedError()

    @staticmethod
    def string_from_item(item):
        """
        Convert the item into a string.
        """
        raise NotImplementedError()

    def exit_gracefully(self, signum, frame):
        self.logger.info(
            'Stop sending tasks and wait for results of those already sent')
        self.success = False
        self.running = False
        if self.beanstalkd:
            self.beanstalkd.running = False

    def _item_with_beanstalkd_reply_from_task_event(self, job_id, data):
        task_event = json.loads(data)
        beanstalkd_reply = task_event.get('beanstalkd_reply')
        items = self.items_from_task_event(task_event)
        for item in items:
            yield (item, beanstalkd_reply)

    def _fetch_items_with_beanstalkd_reply_from_beanstalkd(self):
        # Do not block more than 2 seconds
        return self.beanstalkd.fetch_jobs(
            self._item_with_beanstalkd_reply_from_task_event,
            reserve_timeout=2)

    def _fetch_items(self):
        """
        Fetch items from inputs (other than the beanstalkd).
        """
        raise NotImplementedError()

    def _fetch_items_with_beanstalkd_reply(self):
        items = self._fetch_items()
        for item in items:
            yield (item, None)

    def fetch_items_with_beanstalkd_reply(self):
        """
        Fetch items with beanstalkd reply (useful if the task is distributed).
        """
        if self.beanstalkd:
            return self._fetch_items_with_beanstalkd_reply_from_beanstalkd()
        return self._fetch_items_with_beanstalkd_reply()

    def update_counters(self, task_res):
        """
        Update all counters of the tool.
        """
        _, _, error = task_res
        self.items_processed += 1
        if error is not None:
            self.errors += 1

    def _update_total_counters(self):
        items_processed = self.items_processed
        self.items_processed = 0
        self.total_items_processed += items_processed
        errors = self.errors
        self.errors = 0
        self.total_errors += errors
        return items_processed, self.total_items_processed, \
            errors, self.total_errors

    def _get_report(self, status, end_time, counters):
        raise NotImplementedError()

    def log_report(self, status, force=False):
        """
        Log a report with a fixed interval.
        """
        end_time = time.time()
        if force or (end_time - self.last_report >= self.report_interval):
            counters = self._update_total_counters()
            self.logger.info(self._get_report(status, end_time, counters))
            self.last_report = end_time

    def create_worker(self, queue_workers, queue_reply):
        """
        Create worker to process the items.
        """
        raise NotImplementedError()

    def prepare_local_dispatcher(self):
        """
        The tool will dispatch the tasks locally.
        """
        self.dispatcher = _LocalDispatcher(self.conf, self)

    def prepare_distributed_dispatcher(self):
        """
        The tool will dispatch the tasks on the platform.
        """
        self.dispatcher = _DistributedDispatcher(self.conf, self)

    def _load_total_expected_items(self):
        raise NotImplementedError()

    def _read_retry_queue(self):
        if self.retry_queue is None:
            return
        while True:
            # Reschedule jobs we were not able to handle.
            item = self.retry_queue.get()
            if self.retryer:
                sent = False
                while not sent:
                    sent = self.retryer.send_job(
                        json.dumps(self.task_event_from_item(item)),
                        delay=self.retry_delay)
                    if not sent:
                        sleep(1.0)
                self.retryer.job_done()
            self.retry_queue.task_done()

    def run(self):
        """
        Start processing all found items.
        """
        if self.dispatcher is None:
            raise ValueError('No dispatcher')

        eventlet.spawn_n(self._load_total_expected_items)

        # spawn one worker for the retry queue
        eventlet.spawn_n(self._read_retry_queue)

        for task_res in self.dispatcher.run():
            yield task_res

        # block until the retry queue is empty
        if self.retry_queue:
            self.retry_queue.join()

    def is_success(self):
        """
        Check if there are any errors.
        """
        if not self.success:
            return False
        if self.total_items_processed == 0:
            self.logger.warn('No item to proccess')
        return self.total_errors == 0
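A minimal sketch of a concrete Tool built on the class above (all names and the task-event payload format are illustrative assumptions, not from the source):

class EchoTool(Tool):
    """Process a hard-coded list of items, doing nothing with them."""

    @staticmethod
    def items_from_task_event(task_event):
        for item in task_event['items']:
            yield item

    @staticmethod
    def task_event_from_item(item):
        return {'items': [item]}

    @staticmethod
    def string_from_item(item):
        return str(item)

    def _fetch_items(self):
        for item in ('item-1', 'item-2'):
            yield item

    def _load_total_expected_items(self):
        self.total_expected_items = 2

    def _get_report(self, status, end_time, counters):
        return '%s processed=%d errors=%d' % (
            status, counters[1], counters[3])

# create_worker() would also be needed by the (unshown) _LocalDispatcher.
tool = EchoTool({'namespace': 'OPENIO'})
tool.prepare_local_dispatcher()
for item, info, error in tool.run():
    if error:
        tool.logger.error('%s: %s', EchoTool.string_from_item(item), error)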
Example #9
    def __init__(self, conf, tool):
        super(_DistributedDispatcher, self).__init__(conf, tool)
        self.sending = False

        # All available beanstalkd
        conscience_client = ConscienceClient(self.conf)
        all_beanstalkd = conscience_client.all_services('beanstalkd')
        all_available_beanstalkd = dict()
        for beanstalkd in all_beanstalkd:
            if beanstalkd['score'] <= 0:
                continue
            all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
        if not all_available_beanstalkd:
            raise OioException('No beanstalkd available')

        # Beanstalkd workers
        workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
            or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
        self.beanstalkd_workers = dict()
        for _, beanstalkd in all_available_beanstalkd.items():
            beanstalkd_worker_addr = beanstalkd['addr']

            # If the tube already exists, a worker service is
            # presumably listening on it
            tubes = Beanstalk.from_url('beanstalk://' +
                                       beanstalkd_worker_addr).tubes()
            if workers_tube not in tubes:
                continue

            beanstalkd_worker = BeanstalkdSender(beanstalkd_worker_addr,
                                                 workers_tube, self.logger)
            self.beanstalkd_workers[beanstalkd_worker_addr] = beanstalkd_worker
            self.logger.info(
                'Beanstalkd %s using tube %s is selected as a worker',
                beanstalkd_worker.addr, beanstalkd_worker.tube)
        if not self.beanstalkd_workers:
            raise OioException('No beanstalkd worker available')

        # Beanstalkd reply
        beanstalkd_reply = dict()
        try:
            local_services = conscience_client.local_services()
            for local_service in local_services:
                if local_service['type'] != 'beanstalkd':
                    continue
                beanstalkd = all_available_beanstalkd.get(
                    local_service['addr'])
                if beanstalkd is None:
                    continue
                if beanstalkd_reply \
                        and beanstalkd_reply['score'] >= beanstalkd['score']:
                    continue
                beanstalkd_reply = beanstalkd
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning(
                'ERROR when searching for beanstalkd locally: %s', exc)
        if not beanstalkd_reply:
            self.logger.warning('No beanstalkd available locally')

            try:
                beanstalkd = conscience_client.next_instance('beanstalkd')
                beanstalkd_reply = all_available_beanstalkd[beanstalkd['addr']]
            except Exception as exc:  # pylint: disable=broad-except
                self.logger.warning('ERROR when searching for beanstalkd: %s',
                                    exc)
        beanstalkd_reply_addr = beanstalkd_reply['addr']

        # The reply tube name embeds a timestamp; if it already exists,
        # another service must already be using it
        tube_reply = workers_tube + '.reply.' + str(time.time())
        tubes = Beanstalk.from_url('beanstalk://' +
                                   beanstalkd_reply_addr).tubes()
        if tube_reply in tubes:
            raise OioException(
                'Beanstalkd %s using tube %s is already used' %
                (beanstalkd_reply_addr, tube_reply))

        self.beanstalkd_reply = BeanstalkdListener(beanstalkd_reply_addr,
                                                   tube_reply, self.logger)
        self.logger.info(
            'Beanstalkd %s using tube %s is selected for the replies',
            self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)
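The inline tube probe above is the same check that locate_tube() encapsulates in Examples #6 and #7; a sketch of that helper under the assumptions visible here:

def locate_tube(services, tube):
    # Keep only the services whose beanstalkd already knows `tube`,
    # i.e. where some worker has used or is watching that tube.
    for service in services:
        tubes = Beanstalk.from_url('beanstalk://' + service['addr']).tubes()
        if tube in tubes:
            yield service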
Example #10
class Tool(object):
    """
    Process all found items.

    For the task_res variable, the following format must be respected:
    (item, info, error).
    """

    DEFAULT_BEANSTALKD_WORKER_TUBE = 'oio-process'
    DEFAULT_REPORT_INTERVAL = 3600
    DEFAULT_ITEM_PER_SECOND = 30
    DEFAULT_WORKERS = 1
    DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE = 'oio-process'

    def __init__(self, conf, beanstalkd_addr=None, logger=None):
        self.conf = conf
        self.logger = logger or get_logger(self.conf)
        self.namespace = conf['namespace']
        self.success = True

        # counters
        self.items_processed = 0
        self.total_items_processed = 0
        self.errors = 0
        self.total_errors = 0
        self.total_expected_items = None

        # report
        self.start_time = 0
        self.last_report = 0
        self.report_interval = int_value(self.conf.get('report_interval'),
                                         self.DEFAULT_REPORT_INTERVAL)

        # dispatcher
        self.dispatcher = None

        # input
        self.beanstalkd = None
        if beanstalkd_addr:
            self.beanstalkd = BeanstalkdListener(
                beanstalkd_addr,
                self.conf.get('beanstalkd_worker_tube')
                or self.DEFAULT_BEANSTALKD_WORKER_TUBE, self.logger)

    @staticmethod
    def items_from_task_event(task_event):
        """
        Convert the task event into a list (generator) of items.
        """
        raise NotImplementedError()

    @staticmethod
    def task_event_from_item(item):
        """
        Convert the item into a task event.
        """
        raise NotImplementedError()

    @staticmethod
    def tasks_res_from_res_event(res_event):
        """
        Convert the result event into a list (generator) of tasks result.
        """
        raise NotImplementedError()

    @staticmethod
    def res_event_from_task_res(task_res):
        """
        Convert the task result into a result event.
        """
        raise NotImplementedError()

    @staticmethod
    def string_from_item(item):
        """
        Convert the item into a string.
        """
        raise NotImplementedError()

    def _item_with_beanstalkd_reply_from_task_event(self, job_id, data):
        task_event = json.loads(data)
        beanstalkd_reply = task_event.get('beanstalkd_reply')
        items = self.items_from_task_event(task_event)
        for item in items:
            yield (item, beanstalkd_reply)

    def _fetch_items_with_beanstalkd_reply_from_beanstalkd(self):
        return self.beanstalkd.fetch_jobs(
            self._item_with_beanstalkd_reply_from_task_event)

    def _fetch_items(self):
        """
        Fetch items from inputs (other than the beanstalkd).
        """
        raise NotImplementedError()

    def _fetch_items_with_beanstalkd_reply(self):
        items = self._fetch_items()
        for item in items:
            yield (item, None)

    def fetch_items_with_beanstalkd_reply(self):
        """
        Fetch items with beanstalkd reply (useful if the task is distributed).
        """
        if self.beanstalkd:
            return self._fetch_items_with_beanstalkd_reply_from_beanstalkd()
        return self._fetch_items_with_beanstalkd_reply()

    def update_counters(self, task_res):
        """
        Update all counters of the tool.
        """
        _, _, error = task_res
        self.items_processed += 1
        if error is not None:
            self.errors += 1

    def _update_total_counters(self):
        items_processed = self.items_processed
        self.items_processed = 0
        self.total_items_processed += items_processed
        errors = self.errors
        self.errors = 0
        self.total_errors += errors
        return items_processed, self.total_items_processed, \
            errors, self.total_errors

    def _get_report(self, status, end_time, counters):
        raise NotImplementedError()

    def log_report(self, status, force=False):
        """
        Log a report with a fixed interval.
        """
        end_time = time.time()
        if force or (end_time - self.last_report >= self.report_interval):
            counters = self._update_total_counters()
            self.logger.info(self._get_report(status, end_time, counters))
            self.last_report = end_time

    def create_worker(self, queue_workers, queue_reply):
        """
        Create worker to process the items.
        """
        raise NotImplementedError()

    def prepare_local_dispatcher(self):
        """
        The tool will dispatch the tasks locally.
        """
        self.dispatcher = _LocalDispatcher(self.conf, self)

    def prepare_distributed_dispatcher(self):
        """
        The tool will dispatch the tasks on the platform.
        """
        self.dispatcher = _DistributedDispatcher(self.conf, self)

    def _load_total_expected_items(self):
        raise NotImplementedError()

    def run(self):
        """
        Start processing all found items.
        """
        if self.dispatcher is None:
            raise ValueError('No dispatcher')

        self._load_total_expected_items()
        return self.dispatcher.run()

    def is_success(self):
        """
        Check if there are any errors.
        """
        if not self.success:
            return False
        if self.total_items_processed == 0:
            self.logger.warn('No item to proccess')
        return self.total_errors == 0