def wait(self):
    """
    Wait until all data in the queue has been processed
    by the send coroutine.

    Calls `sleep(0)` between two checks of `self.queue.qsize()`,
    and returns as soon as the reported size is zero.
    """
    while True:
        if not self.queue.qsize():
            return
        sleep(0)
def _send(self): """Send coroutine loop""" self.conn.upload_start = None while True: # fetch input data from the queue data = self.queue.get() # use HTTP transfer encoding chunked # to write data to RAWX if not self.failed: try: with ChunkWriteTimeout(self.write_timeout): if self.perfdata is not None \ and self.conn.upload_start is None: self.conn.upload_start = monotonic_time() self.conn.send("%x\r\n" % len(data)) self.conn.send(data) self.conn.send("\r\n") self.bytes_transferred += len(data) sleep(0) except (Exception, ChunkWriteTimeout) as exc: self.failed = True msg = str(exc) logger.warn("Failed to write to %s (%s, reqid=%s)", self.chunk, msg, self.reqid) self.chunk['error'] = 'write: %s' % msg
def run(self, *args, **kwargs):
    """
    Spawn the `start` method of every watcher in a `GreenPool`,
    then supervise the watchers until `self.running` becomes false.

    Once per second, each watcher flagged as failed is removed from
    `self.watchers` and replaced by a new `ServiceWatcher` built from
    the same service definition, which is spawned in the same pool.
    Positional and keyword arguments are accepted but not used.

    :raises Exception: any error raised in the main loop is logged,
        then re-raised. In all cases `self.running` is reset
        and `self.stop_watchers()` is called.
    """
    try:
        self.logger.info('conscience agent: starting')
        pool = GreenPool(len(self.watchers))
        for watcher in self.watchers:
            pool.spawn(watcher.start)
        self.running = True
        while self.running:
            sleep(1)
            # Iterate over a snapshot: `self.watchers` is modified below,
            # and mutating a list while iterating it makes the loop skip
            # the element following each removed one.
            for watcher in list(self.watchers):
                if not watcher.failed:
                    continue
                self.watchers.remove(watcher)
                self.logger.warn('restart watcher "%s"', watcher.name)
                new_watcher = ServiceWatcher(self.conf, watcher.service)
                self.watchers.append(new_watcher)
                pool.spawn(new_watcher.start)
    except Exception as e:
        self.logger.error('ERROR in main loop %s', e)
        # Bare raise: re-raise the active exception with its traceback
        raise
    finally:
        self.logger.warn('conscience agent: stopping')
        self.running = False
        self.stop_watchers()
def _send_data(self, conn):
    """
    Feed an open connection with the data blocks taken from `conn.queue`.

    Text blocks are encoded to UTF-8, then each block is written as one
    HTTP chunk (hexadecimal size line, payload, CRLF). When an empty
    block has been written, TCP_CORK is disabled on the connection.
    After a failed write, `conn.failed` is set and the error is saved
    in `conn.chunk['error']`; the following blocks are still taken
    from the queue, but not sent.

    :param conn: the connection to write to
    """
    conn.upload_start = None
    while True:
        block = conn.queue.get()
        if isinstance(block, text_type):
            block = block.encode('utf-8')
        if conn.failed:
            # A previous write failed: drop the block
            continue
        try:
            with green.ChunkWriteTimeout(self.write_timeout):
                if self.perfdata is not None:
                    if conn.upload_start is None:
                        # Remember when the first block started to be sent
                        conn.upload_start = monotonic_time()
                size_line = b'%x\r\n' % len(block)
                conn.send(size_line)
                conn.send(block)
                conn.send(b'\r\n')
                if not block:
                    # Last segment sent, disable TCP_CORK to flush buffers
                    conn.set_cork(False)
            sleep(0)
        except (Exception, green.ChunkWriteTimeout) as exc:
            conn.failed = True
            conn.chunk['error'] = str(exc)
def status(self, volume, max=1000, prefix=None, marker=None,
           max_attempts=3, **kwargs):
    """
    Get the status of chunks belonging to the specified volume.

    The counters of all result pages are summed. A request failing
    with `OioNetworkException` is tried again, up to `max_attempts`
    times in total.

    :param volume: the volume to get chunks from
    :type volume: `str`
    :param max: maximum number of results to return per request
        to the rdir server.
    :type max: `int`
    :keyword prefix: get only chunks belonging to the specified prefix
    :type prefix: `str`
    :keyword marker: fetch only chunk that appear after this marker
    :type marker: `str`
    :keyword max_attempts: number of attempts for each request
    :type max_attempts: `int`
    :returns: a dictionary holding the summed counters under 'chunk',
        and the summed counters of each container under 'container'
    :rtype: `dict`
    """
    req_params = {'max': max}
    for name, val in (('prefix', prefix), ('marker', marker)):
        if val:
            req_params[name] = val
    chunks = {}
    containers = {}
    truncated_header = HEADER_PREFIX + 'list-truncated'
    while True:
        for attempt in range(max_attempts):
            try:
                _resp, resp_body = self._rdir_request(
                    volume, 'GET', 'status', params=req_params, **kwargs)
            except OioNetworkException:
                if attempt >= max_attempts - 1:
                    # Too many attempts
                    raise
                # Monotonic backoff
                sleep(attempt * 1.0)
            else:
                break
        for name, count in resp_body.get('chunk', {}).items():
            chunks[name] = chunks.get(name, 0) + count
        for cid, info in resp_body.get('container', {}).items():
            for name, count in info.items():
                # The entry of a container is created with its first counter
                counters = containers.setdefault(cid, {})
                counters[name] = counters.get(name, 0) + count
        if not true_value(_resp.headers.get(truncated_header)):
            break
        # The listing is truncated: continue after the returned marker
        req_params['marker'] = _resp.headers[HEADER_PREFIX + 'list-marker']
    return {'chunk': chunks, 'container': containers}
def run(self):
    """
    Distribute the events from a separate thread and yield the results
    of the tasks, as they are fetched from the reply beanstalkd.

    On timeout or on any other error, the problem is logged and
    `self.tool.success` is set to False. The final 'DONE' report
    is logged once the fetch loop has ended, whatever the reason.

    :returns: a generator yielding task results.
    """
    now = time.time()
    self.tool.start_time = now
    self.tool.last_report = now
    self.tool.log_report('START', force=True)
    reply_loc = dict()
    reply_loc['addr'] = self.beanstalkd_reply.addr
    reply_loc['tube'] = self.beanstalkd_reply.tube
    # pylint: disable=no-member
    distributor = threading.Thread(target=self._distribute_events,
                                   args=[reply_loc])
    distributor.start()

    # Wait until the thread has started sending events
    while True:
        if self.sending is not None:
            break
        sleep(0.1)

    # Retrieve responses until all events are processed
    try:
        while not self._all_events_are_processed():
            results = self.beanstalkd_reply.fetch_job(
                self._tasks_res_from_res_event,
                timeout=DISTRIBUTED_DISPATCHER_TIMEOUT)
            for one_result in results:
                self.tool.update_counters(one_result)
                yield one_result
            self.tool.log_report('RUN')
    except OioTimeout:
        self.logger.error('No response for %d seconds',
                          DISTRIBUTED_DISPATCHER_TIMEOUT)
        self.tool.success = False
    except Exception:  # pylint: disable=broad-except
        self.logger.exception('ERROR in distributed dispatcher')
        self.tool.success = False

    self.tool.log_report('DONE', force=True)
def send(data):
    """
    Encode a block of data and send the fragments to the chunk writers.

    Updates the checksums with `data` and pushes it into the erasure
    coding stream. When fragments come out, each writer not flagged as
    failed receives its own fragment. The chunks of the healthy writers
    and of the failed writers are finally passed to
    `self.quorum_or_fail`.

    :param data: the block of data to encode and send
    """
    self.checksum.update(data)
    self.global_checksum.update(data)
    # get the encoded fragments
    if self.perfdata is not None:
        ec_start = monotonic_time()
    fragments = ec_stream.send(data)
    if self.perfdata is not None:
        ec_end = monotonic_time()
        # Accumulate the time spent in erasure coding
        rawx_perfdata = self.perfdata.setdefault('rawx', dict())
        rawx_perfdata['ec'] = rawx_perfdata.get('ec', 0.0) \
            + ec_end - ec_start
    if fragments is None:
        # not enough data given
        return

    # Sort the writers into two lists while iterating over a snapshot.
    # Removing items from the list being iterated would make the loop
    # skip the writer following each failed one: a healthy writer would
    # not receive its fragment, a failed one would be counted as alive.
    current_writers = list()
    failed_chunks = list()
    for writer in list(writers):
        if writer.failed:
            failed_chunks.append(writer.chunk)
            continue
        fragment = fragments[chunk_index[writer]]
        if writer.checksum:
            writer.checksum.update(fragment)
        writer.send(fragment)
        current_writers.append(writer)
    sleep(0)
    self.quorum_or_fail([w.chunk for w in current_writers],
                        failed_chunks)
def run_indefinitely(checker, entries=None, rate_limiter=None,
                     pause_between_passes=0.0):
    """
    Run passes of the checker until it is no longer running.

    SIGINT, SIGQUIT and SIGTERM are bound to `checker.stop()`.
    Each pass first checks the delayed targets of the checker (if any),
    then the specified entries, and resets the statistics.

    :param checker: the checker to run
    :param entries: passed to `run_once` for the main check of each pass
    :param rate_limiter: passed to `run_once`
    :param pause_between_passes: pause between two passes, in seconds.
        The pause is done by steps of at most one second, and is
        interrupted when the checker is no longer running.
    """
    def _stop(*args):
        checker.stop()

    import signal
    for sig_name in ('SIGINT', 'SIGQUIT', 'SIGTERM'):
        signal.signal(getattr(signal, sig_name), _stop)

    while checker.running:
        if checker.delayed_targets:
            run_once(checker,
                     entries=checker.delayed_targets.values(),
                     rate_limiter=rate_limiter)

        run_once(checker, entries, rate_limiter)

        checker.reset_stats()
        if not (checker.running and pause_between_passes > 0.0):
            continue
        checker.logger.info("Pausing for %.3fs", pause_between_passes)
        # Sleep the fractional part first, then second after second
        whole_seconds, fraction = divmod(pause_between_passes, 1)
        sleep(fraction)
        for _ in range(int(whole_seconds)):
            if not checker.running:
                break
            sleep(1.0)
def listen_beanstalkd_reply_forever(self):
    """
    Process this orchestrator's job replies.

    Tries to connect to the reply beanstalkd every 5 seconds until it
    succeeds, then calls `self.listen_loop` as long as the orchestrator
    is running.
    """
    self.logger.info('Connecting to the reply beanstalkd')

    while self.running:
        try:
            listener = BeanstalkdListener(
                addr=self.beanstalkd_reply_addr,
                tube=self.beanstalkd_reply_tube,
                logger=self.logger)
        except ConnectionError:
            self.logger.error('Failed to connect to the reply beanstalkd')
            sleep(5)
        else:
            break

    self.logger.info('Listening to replies on %s (tube=%s)',
                     self.beanstalkd_reply_addr,
                     self.beanstalkd_reply_tube)

    # keep the job results in memory
    while self.running:
        if self.listen_loop(listener):
            # in case of a beanstalkd connection error,
            # sleep to avoid spamming
            sleep(2)

    self.logger.info('Exited listening thread')
def _apply(self, mapping, moved=None, max_attempts=7,
           read_timeout=M0_READ_TIMEOUT):
    """
    Upload the specified mapping to the meta0 service,
    retry in case of error.

    :param mapping: the mapping to upload (its `apply` method is called)
    :param moved: passed as is to `mapping.apply`
    :param max_attempts: maximum number of calls to `mapping.apply`
    :param read_timeout: passed as is to `mapping.apply`
    :raises ClientException: when the error status is 400 or more and is
        neither 503 nor 504, or after the last attempt has failed
    """
    self.log.info("Saving...")
    retriable = (503, 504)
    for attempt in range(max_attempts):
        try:
            mapping.apply(moved=moved,
                          connection_timeout=M0_CONN_TIMEOUT,
                          read_timeout=read_timeout)
        except ClientException as err:
            status = err.status
            # Manage several unretriable errors
            if status >= 400 and status not in retriable:
                raise
            # Too many attempts
            if attempt >= max_attempts - 1:
                raise
            # Monotonic backoff (retriable and net errors)
            sleep(attempt * 1.0)
        else:
            return
def chunk_fetch(self, volume, limit=100, rebuild=False,
                container_id=None, max_attempts=3,
                start_after=None, shuffle=False, **kwargs):
    """
    Fetch the list of chunks belonging to the specified volume.

    Pages of results are requested until an empty page is returned.
    A request failing with `OioNetworkException` is tried again,
    up to `max_attempts` times in total.

    :param volume: the volume to get chunks from
    :type volume: `str`
    :param limit: maximum number of results to return per request
        to the rdir server.
    :type limit: `int`
    :param rebuild: fetch only the chunks that were there
        before the last incident.
    :type rebuild: `bool`
    :keyword container_id: get only chunks belonging to
       the specified container
    :type container_id: `str`
    :keyword start_after: fetch only chunk that appear after
        this container ID
    :type start_after: `str`
    :keyword shuffle: shuffle the entries of each page before
        yielding them
    :type shuffle: `bool`
    :returns: a generator yielding (container, content, chunk, value)
        tuples.
    """
    req_body = {'limit': limit}
    if rebuild:
        req_body['rebuild'] = True
    optional_fields = (('container_id', container_id),
                       ('start_after', start_after))
    for field, val in optional_fields:
        if val:
            req_body[field] = val

    while True:
        for attempt in range(max_attempts):
            try:
                _resp, page = self._rdir_request(
                    volume, 'POST', 'fetch', json=req_body, **kwargs)
            except OioNetworkException:
                if attempt >= max_attempts - 1:
                    # Too many attempts
                    raise
                # Monotonic backoff
                sleep(attempt * 1.0)
            else:
                break
        if not page:
            return
        # Save the marker before the page is (maybe) shuffled
        req_body['start_after'] = page[-1][0]
        if shuffle:
            random.shuffle(page)
        for entry_key, value in page:
            container, content, chunk = entry_key.split('|')
            yield container, content, chunk, value
def handle_backend_errors(self, func, *args, **kwargs):
    """
    Call a function, and call it again every second as long as it fails
    with a redis connection or timeout error.

    :param func: the function to call with `args` and `kwargs`
    :returns: a tuple (result, None) when the call has succeeded,
        or (None, exception) when the call has failed and
        `self.running` is false.
    """
    while True:
        try:
            result = func(*args, **kwargs)
        except (RedisConnectionError, RedisTimeoutError) as err:
            self.logger.warn('Fail to communicate with redis: %s', err)
            if not self.running:
                return None, err
            sleep(1)
        else:
            return result, None
def connect(*args, **ckwargs):
    """
    Build the next fake connection.

    Headers, body, connection ID and status are taken from
    `headers_iter`, `body_iter` (or `raw_body`) and
    `conn_id_status_iter`, all defined outside this function.
    The arguments of the call itself are ignored.
    When the outer `kwargs` has a true 'slow_connect' value,
    sleeps one second before building the connection.

    :returns: a `FakeConn` instance.
    """
    headers = next(headers_iter)
    body = (raw_body or '') if body_iter is None else next(body_iter)
    if kwargs.get("slow_connect", False):
        sleep(1)
    conn_id, status = next(conn_id_status_iter)
    return FakeConn(status,
                    body=body,
                    headers=headers,
                    conn_id=conn_id,
                    cb_body=kwargs.get('cb_body'))
def run(self):
    """
    Create the container, run the parent class' check, then delete
    the container and, one second later, the account.

    Any exception is caught and printed on the standard output.
    """
    try:
        self.container.container_create(
            account=self.account_name,
            reference=self.reference)
        super(CheckMeta2, self).run()
        self.container.container_delete(
            account=self.account_name,
            reference=self.reference)
        sleep(1)
        self.account.account_delete(self.account_name)
    except Exception as err:
        print("Exception - %s" % (err, ))
def _read_retry_queue(self, queue, **kwargs): while True: # Reschedule jobs we were not able to handle. item = queue.get() sent = False while not sent: sent = self.sender.send_job(json.dumps(item), delay=self.retry_delay) if not sent: sleep(1.0) self.sender.job_done() queue.task_done()
def ping(self, prefix):
    """
    Send a POST request to the URL built from `self.url_prefix` and
    the prefix, padded with zeros up to 64 characters.

    The request is sent at most 5 times. A status other than 200 is
    logged. After a 503, the next attempt is delayed; any other
    status stops the attempts.

    :param prefix: the prefix to ping
    """
    url = self.url_prefix + prefix.ljust(64, '0')
    max_attempts = 5
    for attempt in range(max_attempts):
        reply = self.pool.request('POST', url)
        if reply.status == 200:
            return
        self.log.warn("%d %s", reply.status, prefix)
        if reply.status != 503:
            return
        sleep(attempt * 0.5)
def _send_task_event(task_event, local_next_worker):
    """
    Send the event with a non-full sender.

    The reply location is added to the event, then the workers are
    tried one after the other, starting at `local_next_worker`.
    When all of them have refused the event, waits 5 seconds
    and starts again.

    :param task_event: the event to send (modified in place)
    :param local_next_worker: index of the first worker to try
    :returns: the index of the worker following the one which
        has accepted the event.
    """
    task_event['beanstalkd_reply'] = beanstalkd_reply_event
    while True:
        for _attempt in range(nb_workers):
            candidate = workers[local_next_worker]
            accepted = candidate.send_job(json.dumps(task_event))
            local_next_worker = (local_next_worker + 1) % nb_workers
            if accepted:
                return local_next_worker
        self.logger.warn("All beanstalkd workers are full")
        sleep(5)
def watch(self):
    """
    Call `check`, `get_stats` and `register` in a loop, sleeping
    `self.check_interval` between two rounds, as long as
    `self.running` is true.

    :raises Exception: any error is logged and re-raised, after
        `self.failed` has been set. The end of the watcher is logged
        in all cases.
    """
    try:
        while True:
            if not self.running:
                break
            for step in ('check', 'get_stats', 'register'):
                getattr(self, step)()
            sleep(self.check_interval)
    except Exception as err:
        self.logger.warn('ERROR in watcher "%s"', err)
        self.failed = True
        raise err
    finally:
        self.logger.info('watcher "%s" stopped', self.name)
def chunk_fetch(self, volume, limit=100, rebuild=False,
                container_id=None, max_attempts=3, **kwargs):
    """
    Fetch the list of chunks belonging to the specified volume.

    Pages of results are requested until an empty page is returned.
    A request failing with `OioNetworkException` is tried again,
    up to `max_attempts` times in total.

    :param volume: the volume to get chunks from
    :type volume: `str`
    :param limit: maximum number of results to return
    :type limit: `int`
    :param rebuild: when true, 'rebuild' is set in the request body
    :type rebuild: `bool`
    :keyword container_id: get only chunks belonging to
       the specified container
    :type container_id: `str`
    :returns: a generator yielding (container, content, chunk, value)
        tuples.
    """
    req_body = {'limit': limit}
    if rebuild:
        req_body['rebuild'] = True
    if container_id:
        req_body['container_id'] = container_id

    while True:
        for attempt in range(max_attempts):
            try:
                _resp, page = self._rdir_request(
                    volume, 'POST', 'fetch', json=req_body, **kwargs)
            except OioNetworkException:
                if attempt >= max_attempts - 1:
                    # Too many attempts
                    raise
                # Monotonic backoff
                sleep(attempt * 1.0)
            else:
                break
        if not len(page):
            return
        last_key = None
        for last_key, value in page:
            container, content, chunk = last_key.split('|')
            yield container, content, chunk, value
        if last_key is not None:
            # The next page starts after the last entry of this one
            req_body['start_after'] = last_key
def _send_task_event(self, task_event, reply_loc, next_worker): """ Send the event through a non-full sender. """ task_event['beanstalkd_reply'] = reply_loc workers = self.beanstalkd_workers.values() nb_workers = len(workers) while True: for _ in range(nb_workers): success = workers[next_worker].send_job(json.dumps(task_event)) next_worker = (next_worker + 1) % nb_workers if success: return next_worker self.logger.warn("All beanstalkd workers are full") sleep(5)
def run(self):
    """
    Fetch results and write logs until all jobs have finished.

    The results are polled every 0.1 second as long as the pool
    reports running or waiting jobs, and one last time after
    `self.pool.waitall()`.

    :returns: a generator yielding check results.
    """
    def _log_and_forward(results):
        for one_result in results:
            self.log_result(one_result)
            yield one_result

    while self.pool.running() + self.pool.waiting():
        for item in _log_and_forward(self.fetch_results()):
            yield item
        sleep(0.1)

    self.pool.waitall()
    for item in _log_and_forward(self.fetch_results()):
        yield item
def _read_retry_queue(self): if self.retry_queue is None: return while True: # Reschedule jobs we were not able to handle. item = self.retry_queue.get() if self.retryer: sent = False while not sent: sent = self.retryer.send_job(json.dumps( self.task_event_from_item(item)), delay=self.retry_delay) if not sent: sleep(1.0) self.retryer.job_done() self.retry_queue.task_done()
def finish(self, metachunk_size, metachunk_hash):
    """
    Send metachunk_size and metachunk_hash as trailers.

    The zero-sized last chunk is sent along with the trailers. The hash
    of the chunk is sent as an additional trailer when `self.checksum`
    is set.

    :param metachunk_size: value of the metachunk size trailer
    :param metachunk_hash: value of the metachunk hash trailer
    """
    trailer_format = '%s: %s\r\n'
    lines = ['0\r\n']
    for name, val in (('metachunk_size', metachunk_size),
                      ('metachunk_hash', metachunk_hash)):
        lines.append(trailer_format % (CHUNK_HEADERS[name], val))
    if self.checksum:
        lines.append(trailer_format % (CHUNK_HEADERS['chunk_hash'],
                                       self.checksum.hexdigest()))
    lines.append('\r\n')
    self.conn.send("".join(lines))
    # Last segment sent, disable TCP_CORK to flush buffers
    self.conn.set_cork(False)
    sleep(0)
def wait_until_empty(self, tube, timeout=float('inf'),
                     poll_interval=0.2, initial_delay=0.0):
    """
    Wait until the specified tube is empty, or the timeout expires.

    :param tube: the tube to watch
    :param timeout: maximum time to wait, in seconds, counted from
        the first look at the tube (after the initial delay)
    :param poll_interval: delay between two looks at the tube
    :param initial_delay: delay before the first look at the tube
    """
    # TODO(FVE): check tube stats to ensure some jobs have passed through
    # and then get rid of the initial_delay
    self.watch(tube)
    if initial_delay > 0.0:
        sleep(initial_delay)
    ready_id = self.peek_ready()[0]
    deadline = time() + timeout
    while True:
        if ready_id is None or not time() < deadline:
            break
        sleep(poll_interval)
        ready_id = self.peek_ready()[0]
def run(self, rate_limiter=None):
    """
    Fetch results and write logs until all jobs have finished.

    The results are polled every 0.1 second as long as the checker is
    running and the pool reports running or waiting jobs. The remaining
    results are then fetched without rate limiting, and
    `self.list_cache` is replaced by a new cache of the same size.

    :param rate_limiter: passed to `fetch_results` while polling
    :returns: a generator yielding check results.
    """
    def _log_and_forward(results):
        for one_result in results:
            self.log_result(one_result)
            yield one_result

    while self.running and (self.pool.running() + self.pool.waiting()):
        for item in _log_and_forward(self.fetch_results(rate_limiter)):
            yield item
        sleep(0.1)

    if self.running:
        self.pool.waitall()
    # No rate limiting
    for item in _log_and_forward(self.fetch_results()):
        yield item
    self.list_cache = CacheDict(self.list_cache.size)
def dispatch_tasks_batch(self, beanstalkd_workers,
                         job_id, job_type, job_config, tasks):
    """
    Try sending a batch of tasks until a worker accepts it.

    :param beanstalkd_workers: iterable of workers to try, which may
        also contain None items. The dispatch is given up when the
        31st item (or a later one) is None.
    :param job_id: ID of the job the tasks belong to
    :param job_type: type of the job
    :param job_config: configuration of the job
    :param tasks: the tasks to send
    :returns: True when the payload has been accepted by a worker,
        False when the orchestrator is no longer running or
        when the dispatch is given up.
    :raises ValueError: when the payload is longer than 2**16
    """
    payload = self.make_beanstalkd_payload(
        job_id, job_type, job_config, tasks)
    if len(payload) > 2**16:
        raise ValueError('Task payload is too big (length=%s)' %
                         len(payload))

    # max 2 minutes per task
    ttr = len(tasks) * DEFAULT_TTR

    for count, worker in enumerate(beanstalkd_workers, 1):
        if not self.running:
            return False
        if worker is None:
            # Try for at least 30 seconds
            if count > 30:
                return False
            continue

        try:
            worker.put(payload, ttr=ttr)
            self.logger.debug(
                '[job_id=%s] Tasks sent to %s: %s', job_id,
                worker.addr, str(tasks))
            return True
        except Exception as err:
            self.logger.warn(
                '[job_id=%s] Fail to send beanstalkd job: %s',
                job_id, err)
            # TODO(adu): We could be more lenient
            # and wait for a few errors in a row
            # to happen before marking it as broken.
            worker.is_broken = True
            sleep(1)
    return False
def run_forever(self):
    """
    Take jobs from the queue and spawn threads to dispatch them.

    Starts the thread refreshing the beanstalkd workers and the thread
    listening to the replies, resumes the unfinished jobs of this
    orchestrator, then asks the backend for a next job every second
    as long as the orchestrator is running. Returns early when
    the backend cannot be reached.
    """
    # gather beanstalkd info
    refresher = threading.Thread(
        target=self.refresh_beanstalkd_workers_forever)
    self.refresh_beanstalkd_workers_thread = refresher
    refresher.start()

    # start processing replies
    listener = threading.Thread(
        target=self.listen_beanstalkd_reply_forever)
    self.listen_beanstalkd_reply_thread = listener
    listener.start()

    if not self.running:
        return

    # restart running jobs
    self.logger.debug('Look for unfinished jobs')
    unfinished, err = self.handle_backend_errors(
        self.backend.list_orchestrator_jobs, self.orchestrator_id)
    if err is not None:
        self.logger.warn(
            'Unable to list running jobs for this orchestrator: %s',
            err)
        return
    for job_info in unfinished:
        if not self.running:
            return
        self.safe_handle_running_job(job_info)

    # run next jobs
    while self.running:
        sleep(1)
        next_job, err = self.handle_backend_errors(
            self.backend.run_next, self.orchestrator_id)
        if err is not None:
            self.logger.warn('Unable to run next job: %s', err)
            return
        if next_job:
            self.safe_handle_running_job(next_job)
def refresh_beanstalkd_workers_forever(self):
    """
    Refresh beanstalkd workers by looking at the score,
    existing tubes and tube statistics.

    As long as the orchestrator is running, the workers are searched
    again, the added and removed ones are logged, and
    `self.beanstalkd_workers` is replaced by the new dictionary. Each
    added worker is told to use `self.beanstalkd_workers_tube`. When
    the search fails, the new dictionary is empty. Two refreshes are
    separated by `self.refresh_time_beanstalkd_workers` pauses
    of one second.
    """
    while self.running:
        try:
            found = self._find_beanstalkd_workers()
        except Exception as err:
            self.logger.error(
                'Fail to find beanstalkd workers: %s', err)
            # TODO(adu): We could keep trying to send jobs
            # to the beanstalkd we already found.
            # But we need the score to know how to dispatch the tasks...
            found = dict()

        known_addrs = set(self.beanstalkd_workers.keys())
        found_addrs = set(found.keys())

        for addr in found_addrs - known_addrs:
            self.logger.info('Add beanstalkd %s' % addr)
            found[addr].use(self.beanstalkd_workers_tube)

        for addr in known_addrs - found_addrs:
            self.logger.info('Remove beanstalkd %s' % addr)

        self.logger.info('Refresh beanstalkd workers')
        self.beanstalkd_workers = found

        # Pause second after second, to stop quickly
        for _ in range(self.refresh_time_beanstalkd_workers):
            if not self.running:
                break
            sleep(1)

    self.logger.info('Exited beanstalkd workers thread')
def _reply_task_res(self, beanstalkd_reply, task_res): self.queue_reply.put(task_res) if beanstalkd_reply is None: return res_event = self.tool.res_event_from_task_res(task_res) if self.tool.beanstalkd is not None: res_event['beanstalkd_worker'] = \ { 'addr': self.tool.beanstalkd.addr, 'tube': self.tool.beanstalkd.tube } try: if self.beanstalkd_reply is None \ or self.beanstalkd_reply.addr != beanstalkd_reply['addr'] \ or self.beanstalkd_reply.tube != beanstalkd_reply['tube']: if self.beanstalkd_reply is not None: self.beanstalkd_reply.close() self.beanstalkd_reply = BeanstalkdSender( beanstalkd_reply['addr'], beanstalkd_reply['tube'], self.logger) sent = False event_json = json.dumps(res_event) # This will loop forever if there is a connection issue with the # beanstalkd server. We chose to let it loop until someone fixes # the problem (or the problem resolves by magic). while not sent: sent = self.beanstalkd_reply.send_job(event_json) if not sent: sleep(1.0) self.beanstalkd_reply.job_done() except Exception as exc: # pylint: disable=broad-except item, info, error = task_res self.logger.warn( 'Beanstalkd reply failed %s (info=%s error=%s): %s', self.tool.string_from_item(item), str(info), error, exc)
def get_beanstalkd_workers(self):
    """
    Yield beanstalkd workers following a loadbalancing strategy.

    This generator never ends. Each worker of `self.beanstalkd_workers`
    is listed as many times as its `occurrence` attribute, and the list
    is shuffled before each pass. Workers flagged as broken are skipped.
    None is yielded (after a pause of one second) when there is no
    worker at all, or when a whole pass did not yield any worker.
    """
    # Identity of the dictionary the local list was built from
    beanstalkd_workers_id = None
    beanstalkd_workers = list()
    while True:
        if not self.beanstalkd_workers:
            self.logger.info('No beanstalkd worker available')
            sleep(1)
            yield None
            continue

        # `self.beanstalkd_workers` is another object: rebuild the list
        if id(self.beanstalkd_workers) != beanstalkd_workers_id:
            beanstalkd_workers_id = id(self.beanstalkd_workers)
            beanstalkd_workers = list()
            for beanstalkd in self.beanstalkd_workers.values():
                for _ in range(beanstalkd.occurrence):
                    beanstalkd_workers.append(beanstalkd)

        # Shuffle to not have the same suite for all jobs
        random.shuffle(beanstalkd_workers)

        yielded = False
        for beanstalkd_worker in beanstalkd_workers:
            # The workers have changed during the pass: start a new pass
            if id(self.beanstalkd_workers) != beanstalkd_workers_id:
                break
            if beanstalkd_worker.is_broken:
                continue
            yield beanstalkd_worker
            yielded = True
        else:
            # Only reached when the pass was not interrupted by `break`
            if not yielded:
                self.logger.info(
                    'All beanstalkd workers available are broken')
                sleep(1)
                yield None