Example #1
 def wait(self):
     """
     Wait until all data in the queue
     has been processed by the send coroutine
     """
     while self.queue.qsize():
         sleep(0)
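
The busy-wait in `wait()` only works cooperatively because `sleep(0)` yields to the eventlet hub, giving the send coroutine a chance to drain the queue. A minimal sketch of the idiom, assuming eventlet is installed (the queue contents and consumer are illustrative):

    import eventlet
    from eventlet.queue import Queue

    queue = Queue()

    def consumer():
        while True:
            print('processed %s' % queue.get())

    eventlet.spawn(consumer)
    for i in range(3):
        queue.put(i)
    # Same pattern as wait() above: sleep(0) yields to the hub so the
    # consumer green thread can run; a bare busy loop would starve it.
    while queue.qsize():
        eventlet.sleep(0)
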
Example #2
 def _send(self):
     """Send coroutine loop"""
     self.conn.upload_start = None
     while True:
         # fetch input data from the queue
         data = self.queue.get()
         # use HTTP transfer encoding chunked
         # to write data to RAWX
         if not self.failed:
             try:
                 with ChunkWriteTimeout(self.write_timeout):
                     if self.perfdata is not None \
                             and self.conn.upload_start is None:
                         self.conn.upload_start = monotonic_time()
                     self.conn.send("%x\r\n" % len(data))
                     self.conn.send(data)
                     self.conn.send("\r\n")
                     self.bytes_transferred += len(data)
                 sleep(0)
             except (Exception, ChunkWriteTimeout) as exc:
                 self.failed = True
                 msg = str(exc)
                 logger.warn("Failed to write to %s (%s, reqid=%s)",
                             self.chunk, msg, self.reqid)
                 self.chunk['error'] = 'write: %s' % msg
Example #3
    def run(self, *args, **kwargs):
        try:
            self.logger.info('conscience agent: starting')

            pool = GreenPool(len(self.watchers))
            for watcher in self.watchers:
                pool.spawn(watcher.start)

            self.running = True
            while self.running:
                sleep(1)
                for w in list(self.watchers):  # copy: the list is mutated below
                    if w.failed:
                        self.watchers.remove(w)
                        self.logger.warn('restart watcher "%s"', w.name)
                        new_w = ServiceWatcher(self.conf, w.service)
                        self.watchers.append(new_w)
                        pool.spawn(new_w.start)

        except Exception as e:
            self.logger.error('ERROR in main loop %s', e)
            raise e
        finally:
            self.logger.warn('conscience agent: stopping')
            self.running = False
            self.stop_watchers()
Example #4
 def _send_data(self, conn):
     """
     Send data to an open connection, taking data blocks from `conn.queue`.
     """
     conn.upload_start = None
     while True:
         data = conn.queue.get()
         if isinstance(data, text_type):
             data = data.encode('utf-8')
         if not conn.failed:
             try:
                 with green.ChunkWriteTimeout(self.write_timeout):
                     if self.perfdata is not None \
                             and conn.upload_start is None:
                         conn.upload_start = monotonic_time()
                     conn.send(b'%x\r\n' % len(data))
                     conn.send(data)
                     conn.send(b'\r\n')
                 if not data:
                     # Last segment sent, disable TCP_CORK to flush buffers
                     conn.set_cork(False)
                 sleep(0)
             except (Exception, green.ChunkWriteTimeout) as err:
                 conn.failed = True
                 conn.chunk['error'] = str(err)
Example #5
    def status(self,
               volume,
               max=1000,
               prefix=None,
               marker=None,
               max_attempts=3,
               **kwargs):
        """
        Get the status of chunks belonging to the specified volume.

        :param volume: the volume to get chunks from
        :type volume: `str`
        :param max: maximum number of results to return per request
            to the rdir server.
        :type max: `int`
        :keyword prefix: get only chunks belonging to
           the specified prefix
        :type prefix: `str`
        :keyword marker: fetch only chunks that appear after
            this marker
        :type marker: `str`
        """
        req_params = {'max': max}
        if prefix:
            req_params['prefix'] = prefix
        if marker:
            req_params['marker'] = marker
        chunks = dict()
        containers = dict()

        while True:
            for i in range(max_attempts):
                try:
                    _resp, resp_body = self._rdir_request(volume,
                                                          'GET',
                                                          'status',
                                                          params=req_params,
                                                          **kwargs)
                    break
                except OioNetworkException:
                    # Monotonic backoff
                    if i < max_attempts - 1:
                        sleep(i * 1.0)
                        continue
                    # Too many attempts
                    raise

            for (key, value) in resp_body.get('chunk', dict()).items():
                chunks[key] = chunks.get(key, 0) + value
            for (cid, info) in resp_body.get('container', dict()).items():
                for (key, value) in info.items():
                    containers[cid][key] = containers.setdefault(
                        cid, dict()).get(key, 0) + value

            if not true_value(
                    _resp.headers.get(HEADER_PREFIX + 'list-truncated')):
                break
            req_params['marker'] = _resp.headers[HEADER_PREFIX + 'list-marker']

        return {'chunk': chunks, 'container': containers}
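
`status()` pages through rdir results with a marker: it keeps issuing requests until the server stops setting the list-truncated header, feeding the returned marker back into the request parameters. Here is the pagination loop in isolation, as a rough sketch where the `request` callable and header names are placeholders, not the real rdir API:

    def iter_pages(request, params):
        # request(params) -> (resp, body); header names are illustrative
        while True:
            resp, body = request(params)
            yield body
            if not resp.headers.get('x-list-truncated'):
                break
            params['marker'] = resp.headers['x-list-marker']
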
Example #6
    def run(self):
        self.tool.start_time = self.tool.last_report = time.time()
        self.tool.log_report('START', force=True)
        reply_loc = {
            'addr': self.beanstalkd_reply.addr,
            'tube': self.beanstalkd_reply.tube
        }
        # pylint: disable=no-member
        thread = threading.Thread(target=self._distribute_events,
                                  args=[reply_loc])
        thread.start()

        # Wait until the thread has started sending events
        while self.sending is None:
            sleep(0.1)

        # Retrieve responses until all events are processed
        try:
            while not self._all_events_are_processed():
                tasks_res = self.beanstalkd_reply.fetch_job(
                    self._tasks_res_from_res_event,
                    timeout=DISTRIBUTED_DISPATCHER_TIMEOUT)
                for task_res in tasks_res:
                    self.tool.update_counters(task_res)
                    yield task_res
                self.tool.log_report('RUN')
        except OioTimeout:
            self.logger.error('No response for %d seconds',
                              DISTRIBUTED_DISPATCHER_TIMEOUT)
            self.tool.success = False
        except Exception:  # pylint: disable=broad-except
            self.logger.exception('ERROR in distributed dispatcher')
            self.tool.success = False

        self.tool.log_report('DONE', force=True)
Example #7
        def send(data):
            self.checksum.update(data)
            self.global_checksum.update(data)
            # get the encoded fragments
            if self.perfdata is not None:
                ec_start = monotonic_time()
            fragments = ec_stream.send(data)
            if self.perfdata is not None:
                ec_end = monotonic_time()
                rawx_perfdata = self.perfdata.setdefault('rawx', dict())
                rawx_perfdata['ec'] = rawx_perfdata.get('ec', 0.0) \
                    + ec_end - ec_start
            if fragments is None:
                # not enough data given
                return

            current_writers = list(writers)
            failed_chunks = list()
            for writer in list(current_writers):  # copy: failed writers are removed below
                fragment = fragments[chunk_index[writer]]
                if not writer.failed:
                    if writer.checksum:
                        writer.checksum.update(fragment)
                    writer.send(fragment)
                else:
                    current_writers.remove(writer)
                    failed_chunks.append(writer.chunk)
            sleep(0)
            self.quorum_or_fail([w.chunk for w in current_writers],
                                failed_chunks)
Example #8
def run_indefinitely(checker,
                     entries=None,
                     rate_limiter=None,
                     pause_between_passes=0.0):
    def _stop(*args):
        checker.stop()

    import signal
    signal.signal(signal.SIGINT, _stop)
    signal.signal(signal.SIGQUIT, _stop)
    signal.signal(signal.SIGTERM, _stop)

    while checker.running:
        if checker.delayed_targets:
            run_once(checker,
                     entries=checker.delayed_targets.values(),
                     rate_limiter=rate_limiter)

        run_once(checker, entries, rate_limiter)

        checker.reset_stats()
        if checker.running and pause_between_passes > 0.0:
            checker.logger.info("Pausing for %.3fs", pause_between_passes)
            iterations, rest = divmod(pause_between_passes, 1)
            sleep(rest)
            for _ in range(int(iterations)):
                if not checker.running:
                    break
                sleep(1.0)
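
The pause between passes is sliced into one-second sleeps so a stop signal is honored within about a second instead of after the full pause. The same idiom extracted as a stand-alone helper, a minimal sketch with illustrative names:

    import time

    def interruptible_pause(duration, still_running):
        # Sleep `duration` seconds total, re-checking still_running()
        # after every full second.
        iterations, rest = divmod(duration, 1)
        time.sleep(rest)  # fractional part first
        for _ in range(int(iterations)):
            if not still_running():
                break
            time.sleep(1.0)
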
Example #9
    def listen_beanstalkd_reply_forever(self):
        """
            Process this orchestrator's job replies
        """

        self.logger.info('Connecting to the reply beanstalkd')

        while self.running:
            try:
                listener = BeanstalkdListener(
                    addr=self.beanstalkd_reply_addr,
                    tube=self.beanstalkd_reply_tube,
                    logger=self.logger)

                break
            except ConnectionError:
                self.logger.error('Failed to connect to the reply beanstalkd')

            sleep(5)

        self.logger.info('Listening to replies on %s (tube=%s)',
                         self.beanstalkd_reply_addr,
                         self.beanstalkd_reply_tube)

        # keep the job results in memory
        while self.running:
            connection_error = self.listen_loop(listener)

            # in case of a beanstalkd connection error
            # sleep to avoid spamming
            if connection_error:
                sleep(2)

        self.logger.info('Exited listening thread')
Example #10
 def _apply(self,
            mapping,
            moved=None,
            max_attempts=7,
            read_timeout=M0_READ_TIMEOUT):
     """
     Upload the specified mapping to the meta0 service,
     retry in case of error.
     """
     self.log.info("Saving...")
     for i in range(max_attempts):
         try:
             mapping.apply(moved=moved,
                           connection_timeout=M0_CONN_TIMEOUT,
                           read_timeout=read_timeout)
             break
         except ClientException as ex:
             # Manage several unretriable errors
             retry = (503, 504)
             if ex.status >= 400 and ex.status not in retry:
                 raise
             # Monotonic backoff (retriable and net errors)
             if i < max_attempts - 1:
                 sleep(i * 1.0)
                 continue
             # Too many attempts
             raise
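
Several examples on this page share this retry shape: try the call, sleep `i` seconds between attempts (so the first retry is immediate), and re-raise once the attempts are exhausted. A rough generic version, where the exception tuple and names are illustrative:

    import time

    def retry_linear(func, max_attempts=3, retriable=(IOError,)):
        for i in range(max_attempts):
            try:
                return func()
            except retriable:
                if i < max_attempts - 1:
                    time.sleep(i * 1.0)  # 0s, 1s, 2s, ...
                    continue
                raise  # too many attempts
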
Example #11
    def chunk_fetch(self,
                    volume,
                    limit=100,
                    rebuild=False,
                    container_id=None,
                    max_attempts=3,
                    start_after=None,
                    shuffle=False,
                    **kwargs):
        """
        Fetch the list of chunks belonging to the specified volume.

        :param volume: the volume to get chunks from
        :type volume: `str`
        :param limit: maximum number of results to return per request
            to the rdir server.
        :type limit: `int`
        :param rebuild: fetch only the chunks that were there
            before the last incident.
        :type rebuild: `bool`
        :keyword container_id: get only chunks belonging to
           the specified container
        :type container_id: `str`
        :keyword start_after: fetch only chunks that appear after
            this container ID
        :type start_after: `str`
        """
        req_body = {'limit': limit}
        if rebuild:
            req_body['rebuild'] = True
        if container_id:
            req_body['container_id'] = container_id
        if start_after:
            req_body['start_after'] = start_after

        while True:
            for i in range(max_attempts):
                try:
                    _resp, resp_body = self._rdir_request(volume,
                                                          'POST',
                                                          'fetch',
                                                          json=req_body,
                                                          **kwargs)
                    break
                except OioNetworkException:
                    # Monotonic backoff
                    if i < max_attempts - 1:
                        sleep(i * 1.0)
                        continue
                    # Too many attempts
                    raise
            if not resp_body:
                break
            req_body['start_after'] = resp_body[-1][0]
            if shuffle:
                random.shuffle(resp_body)
            for (key, value) in resp_body:
                container, content, chunk = key.split('|')
                yield container, content, chunk, value
Example #12
 def handle_backend_errors(self, func, *args, **kwargs):
     while True:
         try:
             return func(*args, **kwargs), None
         except (RedisConnectionError, RedisTimeoutError) as exc:
             self.logger.warn('Fail to communicate with redis: %s', exc)
             if not self.running:
                 return None, exc
             sleep(1)
Example #13
 def connect(*args, **ckwargs):
     headers = next(headers_iter)
     if body_iter is None:
         body = raw_body or ''
     else:
         body = next(body_iter)
     if kwargs.get("slow_connect", False):
         sleep(1)
     i, status = next(conn_id_status_iter)
     return FakeConn(status, body=body, headers=headers, conn_id=i,
                     cb_body=kwargs.get('cb_body'))
Example #14
 def run(self):
     try:
         self.container.container_create(account=self.account_name,
                                         reference=self.reference)
         super(CheckMeta2, self).run()
         self.container.container_delete(account=self.account_name,
                                         reference=self.reference)
         sleep(1)
         self.account.account_delete(self.account_name)
     except Exception as exc:
         print("Exception - " + str(exc))
Example #15
 def _read_retry_queue(self, queue, **kwargs):
     while True:
         # Reschedule jobs we were not able to handle.
         item = queue.get()
         sent = False
         while not sent:
             sent = self.sender.send_job(json.dumps(item),
                                         delay=self.retry_delay)
             if not sent:
                 sleep(1.0)
         self.sender.job_done()
         queue.task_done()
Example #16
 def ping(self, prefix):
     url = self.url_prefix + prefix.ljust(64, '0')
     max_attempts = 5
     for i in range(max_attempts):
         rep = self.pool.request('POST', url)
         if rep.status == 200:
             return
         self.log.warn("%d %s", rep.status, prefix)
         if rep.status == 503:
             sleep(i * 0.5)
         else:
             break
Example #17
 def _send_task_event(task_event, local_next_worker):
     # Send the event with a non-full sender
     task_event['beanstalkd_reply'] = beanstalkd_reply_event
     while True:
         for _ in range(nb_workers):
             success = workers[local_next_worker].send_job(
                 json.dumps(task_event))
             local_next_worker = (local_next_worker + 1) % nb_workers
             if success:
                 return local_next_worker
         self.logger.warn("All beanstalkd workers are full")
         sleep(5)
Example #18
 def watch(self):
     try:
         while self.running:
             self.check()
             self.get_stats()
             self.register()
             sleep(self.check_interval)
     except Exception as e:
         self.logger.warn('ERROR in watcher "%s"', e)
         self.failed = True
         raise e
     finally:
         self.logger.info('watcher "%s" stopped', self.name)
Example #19
    def chunk_fetch(self,
                    volume,
                    limit=100,
                    rebuild=False,
                    container_id=None,
                    max_attempts=3,
                    **kwargs):
        """
        Fetch the list of chunks belonging to the specified volume.

        :param volume: the volume to get chunks from
        :type volume: `str`
        :param limit: maximum number of results to return
        :type limit: `int`
        :param rebuild: fetch only the chunks that were there
            before the last incident.
        :type rebuild: `bool`
        :keyword container_id: get only chunks belonging to
           the specified container
        :type container_id: `str`
        """
        req_body = {'limit': limit}
        if rebuild:
            req_body['rebuild'] = True
        if container_id:
            req_body['container_id'] = container_id

        while True:
            for i in range(max_attempts):
                try:
                    _resp, resp_body = self._rdir_request(volume,
                                                          'POST',
                                                          'fetch',
                                                          json=req_body,
                                                          **kwargs)
                    break
                except OioNetworkException:
                    # Monotonic backoff
                    if i < max_attempts - 1:
                        sleep(i * 1.0)
                        continue
                    # Too many attempts
                    raise
            if len(resp_body) == 0:
                break
            key = None
            for (key, value) in resp_body:
                container, content, chunk = key.split('|')
                yield container, content, chunk, value
            if key is not None:
                req_body['start_after'] = key
Example #20
 def _send_task_event(self, task_event, reply_loc, next_worker):
     """
     Send the event through a non-full sender.
     """
     task_event['beanstalkd_reply'] = reply_loc
      workers = list(self.beanstalkd_workers.values())  # list: indexed below
     nb_workers = len(workers)
     while True:
         for _ in range(nb_workers):
             success = workers[next_worker].send_job(json.dumps(task_event))
             next_worker = (next_worker + 1) % nb_workers
             if success:
                 return next_worker
         self.logger.warn("All beanstalkd workers are full")
         sleep(5)
Example #21
    def run(self):
        """
        Fetch results and write logs until all jobs have finished.

        :returns: a generator yielding check results.
        """
        while self.pool.running() + self.pool.waiting():
            for result in self.fetch_results():
                self.log_result(result)
                yield result
            sleep(0.1)
        self.pool.waitall()
        for result in self.fetch_results():
            self.log_result(result)
            yield result
Example #22
 def _read_retry_queue(self):
     if self.retry_queue is None:
         return
     while True:
         # Reschedule jobs we were not able to handle.
         item = self.retry_queue.get()
         if self.retryer:
             sent = False
             while not sent:
                 sent = self.retryer.send_job(
                     json.dumps(self.task_event_from_item(item)),
                     delay=self.retry_delay)
                 if not sent:
                     sleep(1.0)
             self.retryer.job_done()
         self.retry_queue.task_done()
Example #23
 def finish(self, metachunk_size, metachunk_hash):
     """Send metachunk_size and metachunk_hash as trailers"""
     parts = [
         '0\r\n',
         '%s: %s\r\n' % (CHUNK_HEADERS['metachunk_size'], metachunk_size),
         '%s: %s\r\n' % (CHUNK_HEADERS['metachunk_hash'], metachunk_hash)
     ]
     if self.checksum:
         parts.append(
             '%s: %s\r\n' %
             (CHUNK_HEADERS['chunk_hash'], self.checksum.hexdigest()))
     parts.append('\r\n')
     to_send = "".join(parts)
     self.conn.send(to_send)
     # Last segment sent, disable TCP_CORK to flush buffers
     self.conn.set_cork(False)
     sleep(0)
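
The trailer block built by `finish()` terminates an HTTP chunked body: a zero-length chunk, the trailer headers, then a blank line. For illustration only, with placeholder header names standing in for the CHUNK_HEADERS values used above:

    trailer = (b'0\r\n'                          # end of body
               b'x-metachunk-size: 1048576\r\n'  # trailer header
               b'x-metachunk-hash: d41d8cd98f00b204e9800998ecf8427e\r\n'
               b'\r\n')                          # end of trailers
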
Example #24
 def wait_until_empty(self,
                      tube,
                      timeout=float('inf'),
                      poll_interval=0.2,
                      initial_delay=0.0):
     """
      Wait until the specified tube is empty, or the timeout expires.
     """
     # TODO(FVE): check tube stats to ensure some jobs have passed through
     # and then get rid of the initial_delay
     self.watch(tube)
     if initial_delay > 0.0:
         sleep(initial_delay)
     job_id, _ = self.peek_ready()
     deadline = time() + timeout
     while job_id is not None and time() < deadline:
         sleep(poll_interval)
         job_id, _ = self.peek_ready()
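
`wait_until_empty()` is an instance of the general poll-until-condition-or-deadline pattern. A minimal stand-alone sketch (predicate and names are illustrative; time.monotonic avoids wall-clock jumps):

    import time

    def wait_until(predicate, timeout=float('inf'), poll_interval=0.2):
        deadline = time.monotonic() + timeout
        while not predicate() and time.monotonic() < deadline:
            time.sleep(poll_interval)
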
Example #25
    def run(self, rate_limiter=None):
        """
        Fetch results and write logs until all jobs have finished.

        :returns: a generator yielding check results.
        """
        while self.running and (self.pool.running() + self.pool.waiting()):
            for result in self.fetch_results(rate_limiter):
                self.log_result(result)
                yield result
            sleep(0.1)
        if self.running:
            self.pool.waitall()
        # No rate limiting
        for result in self.fetch_results():
            self.log_result(result)
            yield result
        self.list_cache = CacheDict(self.list_cache.size)
Example #26
    def dispatch_tasks_batch(self, beanstalkd_workers,
                             job_id, job_type, job_config, tasks):
        """
            Try to send the task batch until a worker accepts it.
        """

        beanstalkd_payload = self.make_beanstalkd_payload(
            job_id, job_type, job_config, tasks)

        if len(beanstalkd_payload) > 2**16:
            raise ValueError('Task payload is too big (length=%s)' %
                             len(beanstalkd_payload))

        # max 2 minutes per task
        ttr = len(tasks) * DEFAULT_TTR

        i = 0
        for beanstalkd_worker in beanstalkd_workers:
            if not self.running:
                return False
            i += 1
            if beanstalkd_worker is None:
                # Try for at least 30 seconds
                if i > 30:
                    break
                continue

            try:
                beanstalkd_worker.put(beanstalkd_payload, ttr=ttr)
                self.logger.debug(
                    '[job_id=%s] Tasks sent to %s: %s', job_id,
                    beanstalkd_worker.addr, str(tasks))
                return True
            except Exception as exc:
                self.logger.warn(
                    '[job_id=%s] Fail to send beanstalkd job: %s',
                    job_id, exc)
                # TODO(adu): We could be more lenient
                # and wait for a few errors in a row
                # to happen before marking it as broken.
                beanstalkd_worker.is_broken = True
            sleep(1)
        return False
Example #27
    def run_forever(self):
        """
            Take jobs from the queue and spawn threads to dispatch them
        """

        # gather beanstalkd info
        self.refresh_beanstalkd_workers_thread = threading.Thread(
            target=self.refresh_beanstalkd_workers_forever)
        self.refresh_beanstalkd_workers_thread.start()

        # start processing replies
        self.listen_beanstalkd_reply_thread = threading.Thread(
            target=self.listen_beanstalkd_reply_forever)
        self.listen_beanstalkd_reply_thread.start()

        if not self.running:
            return

        # restart running jobs
        self.logger.debug('Look for unfinished jobs')
        orchestrator_jobs, exc = self.handle_backend_errors(
            self.backend.list_orchestrator_jobs, self.orchestrator_id)
        if exc is not None:
            self.logger.warn(
                'Unable to list running jobs for this orchestrator: %s', exc)
            return
        for job_info in orchestrator_jobs:
            if not self.running:
                return
            self.safe_handle_running_job(job_info)

        # run next jobs
        while self.running:
            sleep(1)
            job_info, exc = self.handle_backend_errors(
                self.backend.run_next, self.orchestrator_id)
            if exc is not None:
                self.logger.warn('Unable to run next job: %s', exc)
                return
            if not job_info:
                continue
            self.safe_handle_running_job(job_info)
Example #28
    def refresh_beanstalkd_workers_forever(self):
        """
        Refresh beanstalkd workers by looking at the score,
        existing tubes and tube statistics.
        """
        while self.running:
            try:
                beanstalkd_workers = self._find_beanstalkd_workers()
            except Exception as exc:
                self.logger.error(
                    'Fail to find beanstalkd workers: %s', exc)
                # TODO(adu): We could keep trying to send jobs
                # to the beanstalkd we already found.
                # But we need the score to know how to dispatch the tasks...
                beanstalkd_workers = dict()

            old_beanstalkd_workers_addr = set(self.beanstalkd_workers.keys())
            new_beanstalkd_workers_addr = set(beanstalkd_workers.keys())

            added_beanstalkds = new_beanstalkd_workers_addr \
                - old_beanstalkd_workers_addr
            for beanstalkd_addr in added_beanstalkds:
                self.logger.info('Add beanstalkd %s' % beanstalkd_addr)
                beanstalkd = beanstalkd_workers[beanstalkd_addr]
                beanstalkd.use(self.beanstalkd_workers_tube)

            removed_beanstalkds = old_beanstalkd_workers_addr \
                - new_beanstalkd_workers_addr
            for beanstalkd_addr in removed_beanstalkds:
                self.logger.info('Remove beanstalkd %s' % beanstalkd_addr)

            self.logger.info('Refresh beanstalkd workers')
            self.beanstalkd_workers = beanstalkd_workers

            for _ in range(self.refresh_time_beanstalkd_workers):
                if not self.running:
                    break
                sleep(1)

        self.logger.info('Exited beanstalkd workers thread')
Example #29
    def _reply_task_res(self, beanstalkd_reply, task_res):
        self.queue_reply.put(task_res)

        if beanstalkd_reply is None:
            return

        res_event = self.tool.res_event_from_task_res(task_res)
        if self.tool.beanstalkd is not None:
            res_event['beanstalkd_worker'] = {
                'addr': self.tool.beanstalkd.addr,
                'tube': self.tool.beanstalkd.tube,
            }

        try:
            if self.beanstalkd_reply is None \
                    or self.beanstalkd_reply.addr != beanstalkd_reply['addr'] \
                    or self.beanstalkd_reply.tube != beanstalkd_reply['tube']:
                if self.beanstalkd_reply is not None:
                    self.beanstalkd_reply.close()
                self.beanstalkd_reply = BeanstalkdSender(
                    beanstalkd_reply['addr'], beanstalkd_reply['tube'],
                    self.logger)

            sent = False
            event_json = json.dumps(res_event)
            # This will loop forever if there is a connection issue with the
            # beanstalkd server. We chose to let it loop until someone fixes
            # the problem (or the problem resolves by magic).
            while not sent:
                sent = self.beanstalkd_reply.send_job(event_json)
                if not sent:
                    sleep(1.0)
            self.beanstalkd_reply.job_done()
        except Exception as exc:  # pylint: disable=broad-except
            item, info, error = task_res
            self.logger.warn(
                'Beanstalkd reply failed %s (info=%s error=%s): %s',
                self.tool.string_from_item(item), str(info), error, exc)
Example #30
    def get_beanstalkd_workers(self):
        """
            Yield beanstalkd workers following a load-balancing strategy.
        """

        beanstalkd_workers_id = None
        beanstalkd_workers = list()
        while True:
            if not self.beanstalkd_workers:
                self.logger.info('No beanstalkd worker available')
                sleep(1)
                yield None
                continue

            if id(self.beanstalkd_workers) != beanstalkd_workers_id:
                beanstalkd_workers_id = id(self.beanstalkd_workers)
                beanstalkd_workers = list()
                for beanstalkd in self.beanstalkd_workers.values():
                    for _ in range(beanstalkd.occurrence):
                        beanstalkd_workers.append(beanstalkd)

            # Shuffle so that jobs do not all get the same sequence of workers
            random.shuffle(beanstalkd_workers)

            yielded = False
            for beanstalkd_worker in beanstalkd_workers:
                if id(self.beanstalkd_workers) != beanstalkd_workers_id:
                    break
                if beanstalkd_worker.is_broken:
                    continue
                yield beanstalkd_worker
                yielded = True
            else:
                if not yielded:
                    self.logger.info(
                        'All beanstalkd workers available are broken')
                    sleep(1)
                    yield None
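
The generator above weights each worker by its `occurrence`, shuffles the expanded list, and deals workers out in turn, yielding None (after a one-second sleep) when none are usable. The core load-balancing idea as a rough sketch, with an illustrative weight mapping:

    import random

    def round_robin(weights):
        # weights: {worker: occurrence}; expand by weight, shuffle,
        # then yield workers forever
        pool = [w for w, n in weights.items() for _ in range(n)]
        if not pool:
            return
        while True:
            random.shuffle(pool)
            for worker in pool:
                yield worker
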