def _reply_task_res(self, beanstalkd_reply, task_res):
    """
    Publish a task result locally and, if requested, to beanstalkd.

    The result is always put in ``self.queue_reply``. When
    ``beanstalkd_reply`` is not None, the result is also converted
    to an event and sent to the tube it designates. Any failure during
    the beanstalkd part is logged and swallowed (best effort).

    :param beanstalkd_reply: dictionary with the ``addr`` and ``tube``
        keys telling where to send the reply, or None if no reply
        must be sent.
    :param task_res: the task result; it is unpacked as
        ``(item, info, error)`` when a failure has to be logged.
    """
    self.queue_reply.put(task_res)

    if beanstalkd_reply is None:
        return

    res_event = self.tool.res_event_from_task_res(task_res)
    if self.tool.beanstalkd is not None:
        # Tell the recipient which worker produced this result.
        res_event['beanstalkd_worker'] = \
            {
                'addr': self.tool.beanstalkd.addr,
                'tube': self.tool.beanstalkd.tube
            }

    try:
        # Reuse the current sender only if it targets the same
        # address and tube, otherwise replace it.
        if self.beanstalkd_reply is None \
                or self.beanstalkd_reply.addr != beanstalkd_reply['addr'] \
                or self.beanstalkd_reply.tube != beanstalkd_reply['tube']:
            if self.beanstalkd_reply is not None:
                self.beanstalkd_reply.close()
            self.beanstalkd_reply = BeanstalkdSender(
                beanstalkd_reply['addr'], beanstalkd_reply['tube'],
                self.logger)

        self.beanstalkd_reply.send_job(json.dumps(res_event))
    except Exception as exc:  # pylint: disable=broad-except
        item, info, error = task_res
        # Logger.warn() is a deprecated alias, use warning().
        self.logger.warning(
            'Beanstalkd reply failed %s (info=%s error=%s): %s',
            self.tool.string_from_item(item), str(info), error, exc)
def __init__(self, conf, logger, beanstalkd_addr, **kwargs):
    """
    Initialize the improver.

    A listener and a sender are created on the same beanstalkd address
    and tube. The tube is read from the 'beanstalkd_tube' configuration
    key and defaults to DEFAULT_IMPROVER_TUBE.

    :param conf: configuration, forwarded to the parent class.
    :param logger: logger, forwarded to the parent class.
    :param beanstalkd_addr: address of the beanstalkd to read events
        from, and to send rescheduled events to.
    :param kwargs: forwarded to the parent class, the listener
        and the sender.
    """
    super(BlobImprover, self).__init__(conf, logger, volume=None, **kwargs)
    self.content_factory = ContentFactory(self.conf, logger=self.logger)

    tube = self.conf.get('beanstalkd_tube', DEFAULT_IMPROVER_TUBE)
    # Both ends work on the same (address, tube) pair.
    endpoint = (beanstalkd_addr, tube, self.logger)
    self.listener = BeanstalkdListener(*endpoint, **kwargs)
    self.sender = BeanstalkdSender(*endpoint, **kwargs)

    configured_delay = self.conf.get('retry_delay')
    self.retry_delay = int_value(configured_delay, 30)
    self.reqid_prefix = 'blob-impr-'
def __init__(self, conf, beanstalkd_addr=None, logger=None):
    """
    Initialize the tool.

    :param conf: configuration dictionary; the 'namespace' key is
        mandatory, 'report_interval', 'beanstalkd_worker_tube' and
        'retry_delay' are optional.
    :param beanstalkd_addr: when set, a listener is created on this
        beanstalkd (input), along with a sender on the same address and
        tube and a queue (retry).
    :param logger: logger to use; one is built from the configuration
        when omitted.
    """
    self.conf = conf
    self.logger = logger or get_logger(self.conf)
    self.namespace = conf['namespace']
    self.success = True

    # exit gracefully
    self.running = True
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, self.exit_gracefully)

    # counters
    self.items_processed = 0
    self.total_items_processed = 0
    self.errors = 0
    self.total_errors = 0
    self.total_expected_items = None

    # report
    self.start_time = 0
    self.last_report = 0
    configured_interval = self.conf.get('report_interval')
    self.report_interval = int_value(
        configured_interval, self.DEFAULT_REPORT_INTERVAL)

    # dispatcher
    self.dispatcher = None

    # input
    self.beanstalkd = None
    if beanstalkd_addr:
        worker_tube = self.conf.get('beanstalkd_worker_tube') \
            or self.DEFAULT_BEANSTALKD_WORKER_TUBE
        self.beanstalkd = BeanstalkdListener(
            beanstalkd_addr, worker_tube, self.logger)

    # retry
    self.retryer = None
    self.retry_queue = None
    if self.beanstalkd:
        listener = self.beanstalkd
        self.retryer = BeanstalkdSender(
            listener.addr, listener.tube, self.logger)
        self.retry_queue = eventlet.Queue()
    configured_delay = self.conf.get('retry_delay')
    self.retry_delay = int_value(
        configured_delay, self.DEFAULT_RETRY_DELAY)
def _reply_task_res(self, beanstalkd_reply, task_res):
    """
    Publish a task result locally and, if requested, to beanstalkd.

    The result is always put in ``self.queue_reply``. When
    ``beanstalkd_reply`` is not None, the result is also converted
    to an event and sent to the tube it designates, retrying every
    second until the sender accepts the job. Exceptions raised during
    the beanstalkd part are logged and swallowed.

    :param beanstalkd_reply: dictionary with the ``addr`` and ``tube``
        keys telling where to send the reply, or None if no reply
        must be sent.
    :param task_res: the task result; it is unpacked as
        ``(item, info, error)`` when a failure has to be logged.
    """
    self.queue_reply.put(task_res)

    if beanstalkd_reply is None:
        return

    res_event = self.tool.res_event_from_task_res(task_res)
    if self.tool.beanstalkd is not None:
        # Tell the recipient which worker produced this result.
        res_event['beanstalkd_worker'] = \
            {
                'addr': self.tool.beanstalkd.addr,
                'tube': self.tool.beanstalkd.tube
            }

    try:
        # Reuse the current sender only if it targets the same
        # address and tube, otherwise replace it.
        if self.beanstalkd_reply is None \
                or self.beanstalkd_reply.addr != beanstalkd_reply['addr'] \
                or self.beanstalkd_reply.tube != beanstalkd_reply['tube']:
            if self.beanstalkd_reply is not None:
                self.beanstalkd_reply.close()
            self.beanstalkd_reply = BeanstalkdSender(
                beanstalkd_reply['addr'], beanstalkd_reply['tube'],
                self.logger)

        sent = False
        event_json = json.dumps(res_event)
        # This will loop forever if there is a connection issue with the
        # beanstalkd server. We chose to let it loop until someone fixes
        # the problem (or the problem resolves by magic).
        while not sent:
            sent = self.beanstalkd_reply.send_job(event_json)
            if not sent:
                sleep(1.0)
        self.beanstalkd_reply.job_done()
    except Exception as exc:  # pylint: disable=broad-except
        item, info, error = task_res
        # Logger.warn() is a deprecated alias, use warning().
        self.logger.warning(
            'Beanstalkd reply failed %s (info=%s error=%s): %s',
            self.tool.string_from_item(item), str(info), error, exc)
class BlobImprover(Rebuilder):
    """
    Move chunks of objects declared as "perfectible",
    if possible to improve them (increased distance between chunks or
    better hosting service).
    """

    supported_events = (EventTypes.CONTENT_PERFECTIBLE, )

    def __init__(self, conf, logger, beanstalkd_addr, **kwargs):
        """
        Initialize the improver.

        A listener and a sender are created on the same beanstalkd
        address and tube. The tube is read from the 'beanstalkd_tube'
        configuration key and defaults to DEFAULT_IMPROVER_TUBE.

        :param conf: configuration, forwarded to the parent class.
        :param logger: logger, forwarded to the parent class.
        :param beanstalkd_addr: address of the beanstalkd to read events
            from, and to send rescheduled events to.
        :param kwargs: forwarded to the parent class, the listener
            and the sender.
        """
        super(BlobImprover, self).__init__(conf, logger, volume=None,
                                           **kwargs)
        self.content_factory = ContentFactory(self.conf, logger=self.logger)
        beanstalkd_tube = self.conf.get('beanstalkd_tube',
                                        DEFAULT_IMPROVER_TUBE)
        self.listener = BeanstalkdListener(beanstalkd_addr, beanstalkd_tube,
                                           self.logger, **kwargs)
        self.sender = BeanstalkdSender(beanstalkd_addr, beanstalkd_tube,
                                       self.logger, **kwargs)
        self.retry_delay = int_value(self.conf.get('retry_delay'), 30)
        self.reqid_prefix = 'blob-impr-'

    def exit_gracefully(self, signum, frame):
        """Run the parent's handler, then ask the listener to stop."""
        super(BlobImprover, self).exit_gracefully(signum, frame)
        self.listener.running = False

    def _event_from_job(self, job_id, data, **kwargs):
        """
        Decode a JSON string into an event dictionary.

        This is a generator yielding exactly one event.

        :raises exceptions.ExplicitBury: if the type of the event is not
            one of `supported_events`.
        """
        # pylint: disable=no-member
        event = json.loads(data)
        type_ = event.get('event')
        # Bury events that should not be there
        if type_ not in self.__class__.supported_events:
            msg = 'Discarding event %s (type=%s)' % (event.get('job_id'),
                                                     type_)
            self.logger.info(msg)
            raise exceptions.ExplicitBury(msg)
        yield event

    def _create_worker(self, **kwargs):
        """Create a worker bound to this improver."""
        return BlobImproverWorker(self, **kwargs)

    def _fill_queue(self, queue, **kwargs):
        """
        Fetch events from the listener and put them in `queue`.

        Stop when the listener yields nothing more, when the improver
        is no more running, or when 'max_events' events have been
        queued.

        :param queue: the queue to put events in.
        :keyword max_events: maximum number of events to queue.
            Zero, negative, None or missing means no limit.
        """
        # A missing (or None) limit means "no limit". Comparing None
        # to an integer raises TypeError with Python 3.
        max_events = kwargs.get('max_events') or 0
        sent_events = 0
        # Do not block more than 2 seconds
        events = self.listener.fetch_jobs(self._event_from_job,
                                          reserve_timeout=2, **kwargs)
        try:
            for event in events:
                queue.put(event)
                sent_events += 1
                if max_events > 0 and sent_events >= max_events:
                    self.logger.info('Max events (%d) reached, exiting',
                                     max_events)
                    break
                if not self.running:
                    break
        finally:
            # Close the generator even if queue.put() fails.
            events.close()

    def _read_retry_queue(self, queue, **kwargs):
        """
        Forever take items from `queue` and send them again through
        the sender, delayed by `self.retry_delay`.
        """
        while True:
            # Reschedule jobs we were not able to handle.
            item = queue.get()
            # The payload does not change between attempts.
            job = json.dumps(item)
            sent = False
            while not sent:
                sent = self.sender.send_job(job, delay=self.retry_delay)
                if not sent:
                    sleep(1.0)
            self.sender.job_done()
            queue.task_done()

    def _item_to_string(self, item, **kwargs):
        """
        Build a human readable description of an item.

        :returns: a string mentioning the full path of the object, or
            a placeholder if the item misses a field or holds an
            invalid value (KeyError or ValueError).
        """
        try:
            url = item['url']
            fullpath = encode_fullpath(
                url['account'], url['user'], url['path'],
                url.get('version', 1), url['content'])
            # TODO(FVE): maybe tell some numbers about chunks
            if item.get('event') == EventTypes.CONTENT_PERFECTIBLE:
                return 'perfectible object %s' % (fullpath, )
            else:
                return 'object %s' % (fullpath, )
        except (KeyError, ValueError) as err:
            return '<unknown item> ({0})'.format(repr(err))

    def _get_report(self, status, end_time, counters, **kwargs):
        """
        Format a progress report.

        :param status: free text put at the beginning of the report.
        :param end_time: timestamp of the end of the reported period.
        :param counters: tuple of 4 numbers: items processed and errors
            since the last report, then total items processed and
            total errors.
        :returns: the report, as a string.
        """
        items_processed, errors, total_items_processed, total_errors = \
            counters
        # Avoid divisions by zero when rates are computed
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        return ('%(status)s volume=%(volume)s '
                'last_report=%(last_report)s %(time_since_last_report).2fs '
                'chunks=%(chunks)d %(chunks_rate).2f/s '
                'errors=%(errors)d %(errors_rate).2f%% '
                'start_time=%(start_time)s %(total_time).2fs '
                'total_chunks=%(total_chunks)d '
                '%(total_chunks_rate).2f/s '
                'total_errors=%(total_errors)d %(total_errors_rate).2f%%' % {
                    'status': status,
                    'volume': self.volume,
                    'last_report': datetime.fromtimestamp(
                        int(self.last_report)).isoformat(),
                    'time_since_last_report': time_since_last_report,
                    'chunks': items_processed,
                    'chunks_rate': items_processed / time_since_last_report,
                    'errors': errors,
                    'errors_rate':
                        100 * errors / float(items_processed or 1),
                    'start_time': datetime.fromtimestamp(
                        int(self.start_time)).isoformat(),
                    'total_time': total_time,
                    'total_chunks': total_items_processed,
                    'total_chunks_rate':
                        total_items_processed / total_time,
                    'total_errors': total_errors,
                    'total_errors_rate': 100 * total_errors /
                        float(total_items_processed or 1)
                })
def __init__(self, namespace, concurrency=50,
             error_file=None, rebuild_file=None, check_xattr=True,
             limit_listings=0, request_attempts=1,
             logger=None, verbose=False, check_hash=False,
             min_time_in_error=0.0, required_confirmations=0,
             beanstalkd_addr=None,
             beanstalkd_tube=BlobRebuilder.DEFAULT_BEANSTALKD_WORKER_TUBE,
             cache_size=2**24, **_kwargs):
    """
    Initialize the checker.

    :param namespace: namespace given to the API and rdir clients.
    :param concurrency: size of the green pool and of the result queue.
    :param error_file: if set, file opened in append mode, where errors
        are written (space-delimited CSV).
    :param rebuild_file: if set, file opened in append mode, where the
        rebuilder input is written ('|'-delimited CSV).
    :param limit_listings: 0 -> do not limit,
        1 -> limit account listings (list of containers),
        2 -> limit container listings (list of objects).
    :param request_attempts: number of attempts passed to the API
        (max_retries is one less).
    :param beanstalkd_addr: if set, a sender is created on this address
        and on `beanstalkd_tube`.
    :param cache_size: size of the cache of listings.
    :param min_time_in_error: minimum time in error before triggering
        a reconstruction action.
    :param required_confirmations: number of confirmations of the error
        before triggering a reconstruction action.
    """
    self.pool = GreenPool(concurrency)
    self.error_file = error_file
    self.error_sender = None
    self.check_xattr = bool(check_xattr)
    self.check_hash = bool(check_hash)
    if logger:
        self.logger = logger
    else:
        self.logger = get_logger(
            {'namespace': namespace}, name='integrity', verbose=verbose)

    # Optimisation for when we are only checking one object
    # or one container (see the docstring for the meaning of values).
    self.limit_listings = limit_listings

    # Outputs: error file, rebuilder input file, beanstalkd tube.
    if self.error_file:
        error_fd = open(self.error_file, 'a')
        self.error_writer = csv.writer(error_fd, delimiter=' ')
    self.rebuild_file = rebuild_file
    if self.rebuild_file:
        self.fd = open(self.rebuild_file, 'a')
        self.rebuild_writer = csv.writer(self.fd, delimiter='|')
    if beanstalkd_addr:
        self.error_sender = BeanstalkdSender(
            beanstalkd_addr, beanstalkd_tube, self.logger)

    # Clients
    self.api = ObjectStorageApi(
        namespace,
        logger=self.logger,
        max_retries=request_attempts - 1,
        request_attempts=request_attempts)
    self.rdir_client = RdirClient(
        {"namespace": namespace}, logger=self.logger)

    # Statistics, grouped by kind
    self.accounts_checked = self.containers_checked = 0
    self.objects_checked = self.chunks_checked = 0
    self.account_not_found = self.container_not_found = 0
    self.object_not_found = self.chunk_not_found = 0
    self.account_exceptions = self.container_exceptions = 0
    self.object_exceptions = self.chunk_exceptions = 0

    # Cache of listings, and bookkeeping of the running checks
    self.list_cache = CacheDict(cache_size)
    self.running_tasks = {}
    self.running_lock = Semaphore(1)
    self.result_queue = LightQueue(concurrency)

    self.running = True
    self.run_time = 0

    # Targets which must be checked again, to confirm
    # or deny the issues reported by previous passes.
    self.delayed_targets = {}
    # Thresholds to reach before triggering a reconstruction action.
    self.min_time_in_error = min_time_in_error
    self.required_confirmations = required_confirmations
class Checker(object):
    """
    Check the integrity of accounts, containers, objects and chunks.

    Checks are spawned on an internal green pool. Results are put in
    a queue, which is consumed by `run`. Listings are kept in a cache,
    and a per-key event prevents several tasks from checking
    the same item simultaneously.
    """

    def __init__(self, namespace, concurrency=50,
                 error_file=None, rebuild_file=None, check_xattr=True,
                 limit_listings=0, request_attempts=1,
                 logger=None, verbose=False, check_hash=False,
                 min_time_in_error=0.0, required_confirmations=0,
                 beanstalkd_addr=None,
                 beanstalkd_tube=BlobRebuilder.DEFAULT_BEANSTALKD_WORKER_TUBE,
                 cache_size=2**24, **_kwargs):
        """
        :param namespace: namespace given to the API and rdir clients.
        :param concurrency: size of the green pool and of
            the result queue.
        :param error_file: if set, file opened in append mode, where
            errors are written (space-delimited CSV).
        :param rebuild_file: if set, file opened in append mode, where
            the rebuilder input is written ('|'-delimited CSV).
        :param request_attempts: number of attempts passed to the API
            (max_retries is one less).
        :param beanstalkd_addr: if set, a sender is created on this
            address and on `beanstalkd_tube`.
        :param cache_size: size of the cache of listings.
        """
        self.pool = GreenPool(concurrency)
        self.error_file = error_file
        self.error_sender = None
        self.check_xattr = bool(check_xattr)
        self.check_hash = bool(check_hash)
        self.logger = logger or get_logger(
            {'namespace': namespace}, name='integrity', verbose=verbose)
        # Optimisation for when we are only checking one object
        # or one container.
        # 0 -> do not limit
        # 1 -> limit account listings (list of containers)
        # 2 -> limit container listings (list of objects)
        self.limit_listings = limit_listings
        if self.error_file:
            outfile = open(self.error_file, 'a')
            self.error_writer = csv.writer(outfile, delimiter=' ')

        self.rebuild_file = rebuild_file
        if self.rebuild_file:
            self.fd = open(self.rebuild_file, 'a')
            self.rebuild_writer = csv.writer(self.fd, delimiter='|')

        if beanstalkd_addr:
            self.error_sender = BeanstalkdSender(
                beanstalkd_addr, beanstalkd_tube, self.logger)

        self.api = ObjectStorageApi(
            namespace,
            logger=self.logger,
            max_retries=request_attempts - 1,
            request_attempts=request_attempts)
        self.rdir_client = RdirClient(
            {"namespace": namespace}, logger=self.logger)

        self.accounts_checked = 0
        self.containers_checked = 0
        self.objects_checked = 0
        self.chunks_checked = 0
        self.account_not_found = 0
        self.container_not_found = 0
        self.object_not_found = 0
        self.chunk_not_found = 0
        self.account_exceptions = 0
        self.container_exceptions = 0
        self.object_exceptions = 0
        self.chunk_exceptions = 0

        self.list_cache = CacheDict(cache_size)
        self.running_tasks = {}
        self.running_lock = Semaphore(1)
        self.result_queue = LightQueue(concurrency)

        self.running = True
        self.run_time = 0

        # Set of targets which must be checked again, to confirm
        # or deny the issues reported by previous passes.
        self.delayed_targets = dict()
        # Minimum time in error and number of confirmations of the error
        # before triggering a reconstruction action.
        self.min_time_in_error = min_time_in_error
        self.required_confirmations = required_confirmations

    def reset_stats(self):
        """Reset all the statistics counters to zero."""
        self.accounts_checked = 0
        self.containers_checked = 0
        self.objects_checked = 0
        self.chunks_checked = 0
        self.account_not_found = 0
        self.container_not_found = 0
        self.object_not_found = 0
        self.chunk_not_found = 0
        self.account_exceptions = 0
        self.container_exceptions = 0
        self.object_exceptions = 0
        self.chunk_exceptions = 0

    def _spawn(self, func, target, *args, **kwargs):
        """
        Spawn a task on the internal GreenPool.
        Discards the task if the pool is no more running.
        """
        if self.running:
            return self.pool.spawn(func, target, *args, **kwargs)
        self.logger.info("Discarding %s", target)
        return None

    def _spawn_n(self, func, target, *args, **kwargs):
        """
        Spawn a task on the internal GreenPool, do not wait for the result.
        Discards the task if the pool is no more running.
        """
        if self.running:
            return self.pool.spawn_n(func, target, *args, **kwargs)
        self.logger.info("Discarding %s", target)
        return None

    def complete_target_from_chunk_metadata(self, target, xattr_meta):
        """
        Complete a Target object from metadata found in chunk's extended
        attributes. In case the "fullpath" is not available, try to read
        legacy metadata, and maybe ask meta1 to resolve the CID into
        account and container names.
        """
        # pylint: disable=unbalanced-tuple-unpacking
        try:
            acct, ct, path, vers, content_id = \
                decode_fullpath(xattr_meta['full_path'])
            target.account = acct
            target.container = ct
            target.obj = path
            target.content_id = content_id
            target.version = vers
        except KeyError:
            # No fullpath header, try legacy headers
            if 'content_path' in xattr_meta:
                target.obj = xattr_meta['content_path']
            if 'content_id' in xattr_meta:
                target.content_id = xattr_meta['content_id']
            if 'content_version' in xattr_meta:
                target.version = xattr_meta['content_version']
            cid = xattr_meta.get('container_id')
            if cid:
                try:
                    md = self.api.directory.show(cid=cid)
                    acct = md.get('account')
                    ct = md.get('name')
                    if acct:
                        target.account = acct
                    if ct:
                        target.container = ct
                except Exception as err:
                    self.logger.warning(
                        "Failed to resolve CID %s into account "
                        "and container names: %s", cid, err)

    def recover_and_complete_object_meta(self, target, chunk):
        """
        Complete a Target object by looking for the chunk in the rdir
        service, then by loading the properties of the first object
        referencing it.

        :param target: the target to complete.
        :param chunk: URL of the chunk; the last two components of the
            path are read as the rawx service and the chunk ID.
        """
        _, rawx_service, chunk_id = chunk.rsplit('/', 2)
        # 1. Fetch chunk list from rdir (could be cached).
        # Unfortunately we cannot seek for a chunk ID.
        entries = [
            x for x in self.rdir_client.chunk_fetch(rawx_service, limit=-1)
            if x[2] == chunk_id
        ]
        if not entries:
            self.logger.warning('Chunk %s not found in rdir', chunk_id)
            return
        elif len(entries) > 1:
            self.logger.info('Chunk %s appears in %d objects',
                             chunk_id, len(entries))
        # 2. Find content and container IDs
        target.cid, target.content_id = entries[0][0:2]
        meta = self.api.object_get_properties(
            None, None, None, cid=target.cid, content=target.content_id)
        target.obj = meta['name']
        target.version = meta['version']
        target.account, target.container = self.api.resolve_cid(target.cid)

    def send_result(self, target, errors=None, irreparable=False):
        """
        Put an item in the result queue.
        """
        # TODO(FVE): send to an external queue.
        target.append_result(ItemResult(errors, irreparable))
        self.result_queue.put(target)

    def send_chunk_job(self, target, irreparable=False):
        """
        Send a "content broken" event, to trigger the
        reconstruction of the chunk.
        """
        item = (self.api.namespace, target.cid, target.content_id,
                target.chunk)
        ev_dict = BlobRebuilder.task_event_from_item(item)
        if irreparable:
            ev_dict['data']['irreparable'] = irreparable
        job = json.dumps(ev_dict)
        self.error_sender.send_job(job)
        self.error_sender.job_done()  # Don't expect any response

    def write_error(self, target, irreparable=False):
        """
        Write a row in the error file, if there is one: the account,
        then the container, object and chunk when they are set,
        prefixed by IRREPARABLE_PREFIX when `irreparable` is true.
        """
        if not self.error_file:
            return
        error = list()
        if irreparable:
            error.append(IRREPARABLE_PREFIX)
        error.append(target.account)
        if target.container:
            error.append(target.container)
        if target.obj:
            error.append(target.obj)
        if target.chunk:
            error.append(target.chunk)
        self.error_writer.writerow(error)

    def write_rebuilder_input(self, target, irreparable=False):
        """
        Write a row in the rebuilder input file: container ID, content
        ID (or object name) and chunk, prefixed by IRREPARABLE_PREFIX
        when `irreparable` is true.
        """
        error = list()
        if irreparable:
            error.append(IRREPARABLE_PREFIX)
        error.append(target.cid)
        # FIXME(FVE): ensure we always resolve content_id,
        # or pass object version along with object name.
        error.append(target.content_id or target.obj)
        error.append(target.chunk)
        self.rebuild_writer.writerow(error)

    def write_chunk_error(self, target, chunk=None, irreparable=False):
        """
        Dispatch a chunk error to all configured outputs: error file,
        rebuilder input file and beanstalkd tube.

        :param chunk: if not None, report this chunk instead of
            the one of the target (the target is copied first).
        """
        if chunk is not None:
            target = target.copy()
            target.chunk = chunk
        self.write_error(target, irreparable=irreparable)
        if self.rebuild_file:
            self.write_rebuilder_input(target, irreparable=irreparable)
        if self.error_sender:
            self.send_chunk_job(target, irreparable=irreparable)

    def _check_chunk_xattr(self, target, obj_meta, xattr_meta):
        """
        Check coherency of chunk extended attributes with object metadata.

        :returns: a list of errors
        """
        errors = list()
        # Composed position -> erasure coding
        attr_prefix = 'meta' if '.' in obj_meta['pos'] else ''

        attr_key = attr_prefix + 'chunk_size'
        if str(obj_meta['size']) != xattr_meta.get(attr_key):
            errors.append(
                "'%s' xattr (%s) differs from size in meta2 (%s)" %
                (attr_key, xattr_meta.get(attr_key), obj_meta['size']))

        attr_key = attr_prefix + 'chunk_hash'
        if obj_meta['hash'] != xattr_meta.get(attr_key):
            errors.append(
                "'%s' xattr (%s) differs from hash in meta2 (%s)" %
                (attr_key, xattr_meta.get(attr_key), obj_meta['hash']))
        return errors

    def _check_chunk(self, target):
        """
        Execute various checks on a chunk:
        - does it appear in object's chunk list?
        - is it reachable?
        - are its extended attributes coherent?

        :returns: the list of errors encountered,
            the chunk's owner object metadata, and a boolean telling
            if the result comes from the cache.
        """
        chunk = target.chunk
        errors = list()
        obj_meta = None
        xattr_meta = None

        cached = self._get_cached_or_lock(chunk)
        if cached is not None:
            return cached + (True, )

        self.logger.debug('Checking chunk "%s"', target)
        try:
            xattr_meta = self.api.blob_client.chunk_head(
                chunk, xattr=self.check_xattr, check_hash=self.check_hash)
        except exc.NotFound as err:
            self.chunk_not_found += 1
            errors.append('Not found: %s' % (err, ))
        except exc.FaultyChunk as err:
            self.chunk_exceptions += 1
            errors.append('Faulty: %r' % (err, ))
        except Exception as err:
            self.chunk_exceptions += 1
            errors.append('Check failed: %s' % (err, ))

        if not target.obj:
            if xattr_meta:
                self.complete_target_from_chunk_metadata(target, xattr_meta)
            else:
                self.recover_and_complete_object_meta(target, chunk)

        if target.obj:
            obj_listing, obj_meta = self.check_obj(target.copy_object())
            if chunk not in obj_listing:
                errors.append('Missing from object listing')
                db_meta = dict()
            else:
                db_meta = obj_listing[chunk]

            if db_meta and xattr_meta and self.check_xattr:
                errors.extend(
                    self._check_chunk_xattr(target, db_meta, xattr_meta))

        self.list_cache[chunk] = errors, obj_meta
        self._unlock(chunk)

        # Do not send errors directly, let the caller do it.
        # Indeed, it may want to check if the chunks can be repaired or not.
        self.chunks_checked += 1
        return errors, obj_meta, False

    def check_chunk(self, target):
        """
        Check a chunk, and report the result unless it was
        already known (and thus already reported).

        :returns: the list of errors encountered.
        """
        errors, _obj_meta, from_cache = self._check_chunk(target)
        # If the result comes from the cache, we already reported it.
        if not from_cache:
            self.send_result(target, errors, target.irreparable)
        return errors

    def _check_metachunk(self, target, stg_met, pos, chunks, recurse=0):
        """
        Check that a metachunk has the right number of chunks.

        :returns: the list of errors
        """
        required = stg_met.expected_chunks
        errors = list()
        chunk_results = list()

        if len(chunks) < required:
            missing_chunks = required - len(chunks)
            if stg_met.ec:
                subs = {x['num'] for x in chunks}
                for sub in range(required):
                    if sub not in subs:
                        chkt = target.copy()
                        chkt.chunk = '%d.%d' % (pos, sub)
                        err = "Missing chunk at position %s" % chkt.chunk
                        chunk_results.append((chkt, [err], False))
                        errors.append(err)
            else:
                for _ in range(missing_chunks):
                    chkt = target.copy()
                    # There is no sub-position outside of erasure
                    # coding (and no `sub` variable in this branch):
                    # identify the chunk by the metachunk position.
                    chkt.chunk = '%d' % pos
                    err = "Missing chunk at position %d" % pos
                    chunk_results.append((chkt, [err], False))
                    errors.append(err)

        if recurse > 0:
            for chunk in chunks:
                tcopy = target.copy()
                tcopy.chunk = chunk['url']
                chunk_errors, _, from_cache = self._check_chunk(tcopy)
                chunk_results.append((tcopy, chunk_errors, from_cache))
                if chunk_errors:
                    errors.append("Unusable chunk %s at position %s" % (
                        chunk['url'], chunk['pos']))

        irreparable = required - len(errors) < stg_met.min_chunks_to_read
        if irreparable:
            errors.append(
                "Unavailable metachunk at position %s "
                "(%d/%d chunks available, %d/%d required)" % (
                    pos, required - len(errors), stg_met.expected_chunks,
                    stg_met.min_chunks_to_read, stg_met.expected_chunks))
        for tgt, errs, from_cache in chunk_results:
            # If the result comes from the cache, we already reported it.
            if not from_cache:
                self.send_result(tgt, errs, irreparable)
        # Since the "metachunk" is not an official item type,
        # this method does not report errors itself. Errors will
        # be reported as object errors.
        return errors

    def _check_obj_policy(self, target, obj_meta, chunks, recurse=0):
        """
        Check that the list of chunks of an object matches
        the object's storage policy.

        :returns: the list of errors encountered
        """
        stg_met = STORAGE_METHODS.load(obj_meta['chunk_method'])
        chunks_by_pos = _sort_chunks(chunks, stg_met.ec)
        tasks = list()
        for pos, pchunks in iteritems(chunks_by_pos):
            tasks.append((pos, self._spawn(
                self._check_metachunk,
                target.copy(), stg_met, pos, pchunks, recurse=recurse)))
        errors = list()
        for pos, task in tasks:
            if not task and not self.running:
                errors.append("Pos %d skipped: checker is exiting" % pos)
                continue
            try:
                errors.extend(task.wait())
            except Exception as err:
                errors.append("Check failed: pos %d: %s" % (pos, err))
        return errors

    def check_obj_versions(self, target, versions, recurse=0):
        """
        Run checks of all versions of the targeted object in parallel.
        """
        tasks = list()
        for ov in versions:
            tcopy = target.copy_object()
            tcopy.content_id = ov['id']
            tcopy.version = str(ov['version'])
            tasks.append((tcopy.version,
                          self._spawn(self.check_obj, tcopy,
                                      recurse=recurse)))
        errors = list()
        for version, task in tasks:
            if not task and not self.running:
                errors.append("Version %s skipped: checker is exiting"
                              % version)
                continue
            try:
                task.wait()
            except Exception as err:
                errors.append("Check failed: version %s: %s" %
                              (version, err))
        if errors:
            # Send a result with the target without version to tell
            # we were not able to check all versions of the object.
            self.send_result(target, errors)

    def _load_obj_meta(self, target, errors):
        """
        Load object metadata and chunks.

        :param target: which object to check.
        :param errors: list of errors that will be appended
            in case any error occurs.
        :returns: a tuple with object metadata and a list of chunks.
        """
        try:
            return self.api.object_locate(
                target.account, target.container, target.obj,
                version=target.version, properties=False)
        except exc.NoSuchObject as err:
            self.object_not_found += 1
            errors.append('Not found: %s' % (err, ))
        except Exception as err:
            self.object_exceptions += 1
            errors.append('Check failed: %s' % (err, ))
        return None, []

    def _get_cached_or_lock(self, lock_key):
        """
        Get the cached result for `lock_key`, or register as the task
        in charge of computing it.

        :returns: the cached result, or None if the caller must
            do the check itself (and then call `_unlock`).
        """
        # If something is running, wait for it
        with self.running_lock:
            event = self.running_tasks.get(lock_key)
        if event:
            event.wait()
            event = None

        # Maybe get a cached result
        if lock_key in self.list_cache:
            return self.list_cache[lock_key]

        # No cached result, try to compute the thing ourselves
        while True:
            with self.running_lock:
                # Another check while locked
                if lock_key in self.list_cache:
                    return self.list_cache[lock_key]
                # Still nothing cached
                event = self.running_tasks.get(lock_key)
                if event is None:
                    self.running_tasks[lock_key] = Event()
                    return None
            event.wait()

    def _unlock(self, lock_key):
        """
        Tell the check of `lock_key` is over, and wake up the tasks
        waiting for it.

        :raises KeyError: if nothing is registered for `lock_key`.
        """
        with self.running_lock:
            event = self.running_tasks[lock_key]
            del self.running_tasks[lock_key]
            event.send(True)

    def check_obj(self, target, recurse=0):
        """
        Check one object version.
        If no version is specified, all versions of the object will be
        checked.

        :returns: the result of the check of the most recent version,
            or the one that is explicitly targeted.
        """
        account = target.account
        container = target.container
        obj = target.obj
        vers = target.version  # can be None

        # `vers` may be resolved below: remember the key we lock,
        # we must release this very same key.
        lock_key = (account, container, obj, vers)
        cached = self._get_cached_or_lock(lock_key)
        if cached is not None:
            return cached

        self.logger.info('Checking object "%s"', target)
        container_listing, _ = self.check_container(target.copy_container())
        errors = list()
        if obj not in container_listing:
            errors.append('Missing from container listing')
            # checksum = None
        else:
            versions = container_listing[obj]
            if vers is None:
                if target.content_id is None:
                    # No version specified, check all versions
                    self.check_obj_versions(target.copy_object(), versions,
                                            recurse=recurse)
                    # Now return the cached result of the most recent version
                    target.content_id = versions[0]['id']
                    target.version = str(versions[0]['version'])
                    res = self.check_obj(target, recurse=0)
                    self._unlock(lock_key)
                    return res
                else:
                    # Find the version matching the content ID
                    for ov in versions:
                        if ov['id'] == target.content_id:
                            vers = str(ov['version'])
                            target.version = vers
                            break
                    else:
                        errors.append('Missing from container listing')

            # TODO check checksum match
            # checksum = container_listing[obj]['hash']

        meta, chunks = self._load_obj_meta(target, errors)

        chunk_listing = {c['url']: c for c in chunks}
        if meta:
            if target.content_id is None:
                target.content_id = meta['id']
            if target.version is None:
                target.version = str(meta['version'])
            # The result is cached with the resolved version
            self.list_cache[(account, container, obj, vers)] = \
                (chunk_listing, meta)
            self.objects_checked += 1

        self._unlock(lock_key)

        # Skip the check if we could not locate the object
        if meta:
            errors.extend(
                self._check_obj_policy(target, meta, chunks,
                                       recurse=recurse))

        self.send_result(target, errors)
        return chunk_listing, meta

    def check_container(self, target, recurse=0):
        """
        Check one container, by listing its objects.

        :param recurse: if positive, also spawn a check
            for each object version listed.
        :returns: a tuple with a dictionary mapping object names to
            their list of versions (most recent first), and
            the container metadata.
        """
        account = target.account
        container = target.container

        cached = self._get_cached_or_lock((account, container))
        if cached is not None:
            return cached

        self.logger.info('Checking container "%s"', target)
        account_listing = self.check_account(target.copy_account())
        errors = list()
        if container not in account_listing:
            errors.append('Missing from account listing')

        marker = None
        results = []
        ct_meta = dict()
        extra_args = dict()
        if self.limit_listings > 1 and target.obj:
            # When we are explicitly checking one object, start the listing
            # where this object is supposed to be. Do not use a limit,
            # but an end marker, in order to fetch all versions of the object.
            extra_args['prefix'] = target.obj
            extra_args['end_marker'] = target.obj + '\x00'  # HACK
        while True:
            try:
                resp = self.api.object_list(
                    account, container, marker=marker, versions=True,
                    **extra_args)
            except exc.NoSuchContainer as err:
                self.container_not_found += 1
                errors.append('Not found: %s' % (err, ))
                break
            except Exception as err:
                self.container_exceptions += 1
                errors.append('Check failed: %s' % (err, ))
                break

            truncated = resp.get('truncated', False)
            if truncated:
                marker = resp['next_marker']

            if resp['objects']:
                # safeguard, probably useless
                if not marker:
                    marker = resp['objects'][-1]['name']
                results.extend(resp['objects'])
                if not truncated or self.limit_listings > 1:
                    break
            else:
                ct_meta = resp
                ct_meta.pop('objects')
                break

        container_listing = dict()
        # Save all object versions, with the most recent first
        for obj in results:
            container_listing.setdefault(obj['name'], list()).append(obj)
        for versions in container_listing.values():
            versions.sort(key=lambda o: o['version'], reverse=True)

        if self.limit_listings <= 1:
            # We just listed the whole container, keep the result in a cache
            self.containers_checked += 1
            self.list_cache[(account, container)] = \
                container_listing, ct_meta
        self._unlock((account, container))

        if recurse > 0:
            for obj_vers in container_listing.values():
                for obj in obj_vers:
                    tcopy = target.copy_object()
                    tcopy.obj = obj['name']
                    tcopy.content_id = obj['id']
                    tcopy.version = str(obj['version'])
                    self._spawn_n(self.check_obj, tcopy, recurse - 1)
        self.send_result(target, errors)
        return container_listing, ct_meta

    def check_account(self, target, recurse=0):
        """
        Check one account, by listing its containers.

        :param recurse: if positive, also spawn a check
            for each container listed.
        :returns: a dictionary mapping container names to tuples
            made of the second and third fields of the listing
            (number of objects, number of bytes).
        """
        account = target.account

        cached = self._get_cached_or_lock(account)
        if cached is not None:
            return cached

        self.logger.info('Checking account "%s"', target)
        errors = list()
        marker = None
        results = []
        extra_args = dict()
        if self.limit_listings > 0 and target.container:
            # When we are explicitly checking one container, start the listing
            # where this container is supposed to be, and list only one
            # container.
            extra_args['prefix'] = target.container
            extra_args['limit'] = 1
        while True:
            try:
                resp = self.api.container_list(
                    account, marker=marker, **extra_args)
            except Exception as err:
                self.account_exceptions += 1
                errors.append('Check failed: %s' % (err, ))
                break
            if resp:
                marker = resp[-1][0]
                results.extend(resp)
                if self.limit_listings > 0:
                    break
            else:
                break

        containers = dict()
        for container in results:
            # Name, number of objects, number of bytes
            containers[container[0]] = (container[1], container[2])

        if self.limit_listings <= 0:
            # We just listed the whole account, keep the result in a cache
            self.accounts_checked += 1
            self.list_cache[account] = containers
        self._unlock(account)

        if recurse > 0:
            for container in containers:
                tcopy = target.copy_account()
                tcopy.container = container
                self._spawn_n(self.check_container, tcopy, recurse - 1)

        self.send_result(target, errors)
        return containers

    def check(self, target, recurse=0):
        """
        Spawn the check matching the type of the target ('chunk',
        'object', 'container', anything else being treated as
        an account), without waiting for the result.
        """
        if target.type == 'chunk':
            self._spawn_n(self.check_chunk, target)
        elif target.type == 'object':
            self._spawn_n(self.check_obj, target, recurse)
        elif target.type == 'container':
            self._spawn_n(self.check_container, target, recurse)
        else:
            self._spawn_n(self.check_account, target, recurse)

    def check_all_accounts(self, recurse=0):
        """Spawn a check for each account listed by the API."""
        all_accounts = self.api.account_list()
        for acct in all_accounts:
            self.check(Target(acct), recurse=recurse)

    def fetch_results(self, rate_limiter=None):
        """
        Yield the results available in the result queue, as long as
        the checker is running.

        :param rate_limiter: if not None, callable called after each
            result with `self.run_time`, and whose return value
            replaces `self.run_time`.
        """
        while self.running and not self.result_queue.empty():
            res = self.result_queue.get(True)
            yield res
            # Rate limiting is done on the result queue for now.
            # Someday we could implement a submission queue instead of
            # letting each worker submit tasks to the pool, and do
            # the rate limiting on this queue.
            if rate_limiter is not None:
                self.run_time = rate_limiter(self.run_time)

    def merge_with_delayed_target(self, target):
        """
        Merge the specified target with a delayed one.

        :returns: the delayed target, if there is one, with an error
            log including the errors of the new target. Return the
            new target otherwise.
        """
        tkey = repr(target)
        prev_target = self.delayed_targets.get(tkey, target)
        if prev_target is not target:
            errors = dict(prev_target.error_log)
            errors.update(target.error_log)
            prev_target.error_log = sorted(errors.items())
        return prev_target

    def log_result(self, target):
        """
        Log a check result, if it shows errors. Dispatch the errors to the
        appropriate destinations (log files, queues, etc.).
        """
        # The result may come from a new target, or from an old target
        # we checked another time, or both.
        target = self.merge_with_delayed_target(target)
        if target.has_errors:
            time_in_error, confirmations = target.time_in_error()
            if (time_in_error < self.min_time_in_error or
                    confirmations < self.required_confirmations):
                self.logger.info("Delaying check for %s, %d/%d confirmations",
                                 target, confirmations,
                                 self.required_confirmations)
                self.delayed_targets[repr(target)] = target
            else:
                if target.type == 'chunk':
                    self.logger.info(
                        "Writing error for %s, %d/%d confirmations",
                        target, confirmations, self.required_confirmations)
                    self.write_chunk_error(target,
                                           irreparable=target.irreparable)
                else:
                    self.write_error(target, irreparable=target.irreparable)
                self.delayed_targets.pop(repr(target), None)
            self.logger.warning(
                '%s:%s\n%s',
                target, ' irreparable' if target.irreparable else '',
                target.latest_error_result().errors_to_str(err_format=' %s'))

    def run(self, rate_limiter=None):
        """
        Fetch results and write logs until all jobs have finished.

        :returns: a generator yielding check results.
        """
        while self.running and (self.pool.running() + self.pool.waiting()):
            for result in self.fetch_results(rate_limiter):
                self.log_result(result)
                yield result
            sleep(0.1)
        if self.running:
            self.pool.waitall()
        # No rate limiting
        for result in self.fetch_results():
            self.log_result(result)
            yield result
        # Start the next run with an empty cache
        self.list_cache = CacheDict(self.list_cache.size)

    def stop(self):
        """Ask the checker to stop: no new task will be spawned."""
        self.logger.info("Stopping")
        self.running = False

    def report(self):
        """
        Print the statistics on the standard output.

        :returns: False if any item was not found or any exception
            was counted, True otherwise.
        """
        success = True

        def _report_stat(name, stat):
            print("{0:18}: {1}".format(name, stat))

        print()
        print('Report')
        _report_stat("Accounts checked", self.accounts_checked)
        if self.account_not_found:
            success = False
            _report_stat("Missing accounts", self.account_not_found)
        if self.account_exceptions:
            success = False
            _report_stat("Exceptions", self.account_exceptions)

        print()
        _report_stat("Containers checked", self.containers_checked)
        if self.container_not_found:
            success = False
            _report_stat("Missing containers", self.container_not_found)
        if self.container_exceptions:
            success = False
            _report_stat("Exceptions", self.container_exceptions)

        print()
        _report_stat("Objects checked", self.objects_checked)
        if self.object_not_found:
            success = False
            _report_stat("Missing objects", self.object_not_found)
        if self.object_exceptions:
            success = False
            _report_stat("Exceptions", self.object_exceptions)

        print()
        _report_stat("Chunks checked", self.chunks_checked)
        if self.chunk_not_found:
            success = False
            _report_stat("Missing chunks", self.chunk_not_found)
        if self.chunk_exceptions:
            success = False
            _report_stat("Exceptions", self.chunk_exceptions)
        return success
def __init__(self, conf, tool):
    """
    Select the beanstalkd instances used to distribute the tasks
    and to receive the replies.

    :param conf: configuration dictionary; 'items_per_second' and
        'distributed_beanstalkd_worker_tube' are read here.
    :param tool: the tool whose tasks are dispatched (provides the
        default values).
    :raises OioException: if no beanstalkd has a positive score, if no
        worker is found, if no beanstalkd can receive the replies, or if
        the reply tube already exists.
    """
    super(_DistributedDispatcher, self).__init__(conf, tool)
    self.sending = None

    self.max_items_per_second = int_value(
        self.conf.get('items_per_second'),
        self.tool.DEFAULT_ITEM_PER_SECOND)

    # All available beanstalkd (services with a score <= 0 are ignored)
    conscience_client = ConscienceClient(self.conf)
    all_beanstalkd = conscience_client.all_services('beanstalkd')
    all_available_beanstalkd = dict()
    for beanstalkd in all_beanstalkd:
        if beanstalkd['score'] <= 0:
            continue
        all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
    if not all_available_beanstalkd:
        raise OioException('No beanstalkd available')

    # Beanstalkd workers
    # NOTE(review): locate_tube() presumably keeps the services on which
    # the tube already exists -- confirm against its definition.
    workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
        or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
    self.beanstalkd_workers = dict()
    for beanstalkd in locate_tube(all_available_beanstalkd.values(),
                                  workers_tube):
        beanstalkd_worker = BeanstalkdSender(beanstalkd['addr'],
                                             workers_tube, self.logger)
        self.beanstalkd_workers[beanstalkd['addr']] = beanstalkd_worker
        self.logger.info(
            'Beanstalkd %s using tube %s is selected as a worker',
            beanstalkd_worker.addr, beanstalkd_worker.tube)
    if not self.beanstalkd_workers:
        raise OioException('No beanstalkd worker available')

    nb_workers = len(self.beanstalkd_workers)
    if self.max_items_per_second > 0:
        # Max 2 seconds in advance
        queue_size_per_worker = self.max_items_per_second * 2 / nb_workers
    else:
        queue_size_per_worker = 64
    for beanstalkd_worker in self.beanstalkd_workers.values():
        beanstalkd_worker.low_limit = queue_size_per_worker / 2
        beanstalkd_worker.high_limit = queue_size_per_worker

    # Beanstalkd reply: prefer the local beanstalkd with the best score
    beanstalkd_reply = dict()
    try:
        local_services = conscience_client.local_services()
        for local_service in local_services:
            if local_service['type'] != 'beanstalkd':
                continue
            beanstalkd = all_available_beanstalkd.get(
                local_service['addr'])
            if beanstalkd is None:
                continue
            if beanstalkd_reply \
                    and beanstalkd_reply['score'] >= beanstalkd['score']:
                continue
            beanstalkd_reply = beanstalkd
    except Exception as exc:  # pylint: disable=broad-except
        self.logger.warning(
            'ERROR when searching for beanstalkd locally: %s', exc)
    if not beanstalkd_reply:
        self.logger.warning('No beanstalkd available locally')

        # Fall back on any instance proposed by the conscience
        try:
            beanstalkd = conscience_client.next_instance('beanstalkd')
            beanstalkd_reply = all_available_beanstalkd[beanstalkd['addr']]
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning('ERROR when searching for beanstalkd: %s',
                                exc)
    if not beanstalkd_reply:
        # Both searches failed: fail explicitly instead of raising
        # KeyError('addr') on the empty dictionary.
        raise OioException('No beanstalkd available for the replies')
    beanstalkd_reply_addr = beanstalkd_reply['addr']

    # If the tube exists, another service must have already used this tube
    tube_reply = workers_tube + '.reply.' + str(time.time())
    tubes = Beanstalk.from_url('beanstalk://' +
                               beanstalkd_reply_addr).tubes()
    if tube_reply in tubes:
        raise OioException(
            'Beanstalkd %s using tube %s is already used'
            % (beanstalkd_reply_addr, tube_reply))

    self.beanstalkd_reply = BeanstalkdListener(beanstalkd_reply_addr,
                                               tube_reply, self.logger)
    self.logger.info(
        'Beanstalkd %s using tube %s is selected for the replies',
        self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)
class Tool(object):
    """
    Process all found items.

    For the task_res variable, the following format must be respected:
    (item, info, error).
    """

    DEFAULT_BEANSTALKD_WORKER_TUBE = 'oio-process'
    DEFAULT_REPORT_INTERVAL = 3600
    DEFAULT_RETRY_DELAY = 3600
    DEFAULT_ITEM_PER_SECOND = 30
    DEFAULT_CONCURRENCY = 1
    DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE = 'oio-process'

    def __init__(self, conf, beanstalkd_addr=None, logger=None):
        """
        :param conf: configuration dictionary; must contain 'namespace'.
        :param beanstalkd_addr: address of the beanstalkd to read the
            tasks from. When set, a retry queue and a sender on the same
            address and tube are also created.
        :param logger: logger to use; one is built from `conf` otherwise.
        """
        self.conf = conf
        self.logger = logger or get_logger(self.conf)
        self.namespace = conf['namespace']
        self.success = True

        # exit gracefully
        self.running = True
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

        # counters
        self.items_processed = 0
        self.total_items_processed = 0
        self.errors = 0
        self.total_errors = 0
        self.total_expected_items = None

        # report
        self.start_time = 0
        self.last_report = 0
        self.report_interval = int_value(self.conf.get('report_interval'),
                                         self.DEFAULT_REPORT_INTERVAL)

        # dispatcher
        self.dispatcher = None

        # input
        self.beanstalkd = None
        if beanstalkd_addr:
            self.beanstalkd = BeanstalkdListener(
                beanstalkd_addr,
                self.conf.get('beanstalkd_worker_tube')
                or self.DEFAULT_BEANSTALKD_WORKER_TUBE,
                self.logger)

        # retry
        self.retryer = None
        self.retry_queue = None
        if self.beanstalkd:
            self.retryer = BeanstalkdSender(self.beanstalkd.addr,
                                            self.beanstalkd.tube,
                                            self.logger)
            self.retry_queue = eventlet.Queue()
        self.retry_delay = int_value(self.conf.get('retry_delay'),
                                     self.DEFAULT_RETRY_DELAY)

    @staticmethod
    def items_from_task_event(task_event):
        """
        Convert the task event into a list (generator) of items.
        """
        raise NotImplementedError()

    @staticmethod
    def task_event_from_item(item):
        """
        Convert the item into a task event.
        """
        raise NotImplementedError()

    @staticmethod
    def tasks_res_from_res_event(res_event):
        """
        Convert the result event into a list (generator) of tasks result.
        """
        raise NotImplementedError()

    @staticmethod
    def res_event_from_task_res(task_res):
        """
        Convert the task result into a result event.
        """
        raise NotImplementedError()

    @staticmethod
    def string_from_item(item):
        """
        Convert the item into a string.
        """
        raise NotImplementedError()

    def exit_gracefully(self, signum, frame):
        """
        Signal handler (SIGINT, SIGTERM): mark the run as failed and stop
        the tool and its beanstalkd listener, if any.
        """
        self.logger.info('Stop sending and wait for all results already sent')
        self.success = False
        self.running = False
        if self.beanstalkd:
            self.beanstalkd.running = False

    def _item_with_beanstalkd_reply_from_task_event(self, job_id, data):
        """
        Decode a JSON task event and yield one (item, beanstalkd_reply)
        pair per item it describes. `job_id` is not used.
        """
        task_event = json.loads(data)
        beanstalkd_reply = task_event.get('beanstalkd_reply')
        items = self.items_from_task_event(task_event)
        for item in items:
            yield (item, beanstalkd_reply)

    def _fetch_items_with_beanstalkd_reply_from_beanstalkd(self):
        """
        Fetch (item, beanstalkd_reply) pairs from the beanstalkd listener.
        """
        # Do not block more than 2 seconds
        return self.beanstalkd.fetch_jobs(
            self._item_with_beanstalkd_reply_from_task_event,
            reserve_timeout=2)

    def _fetch_items(self):
        """
        Fetch items from inputs (other than the beanstalkd).
        """
        raise NotImplementedError()

    def _fetch_items_with_beanstalkd_reply(self):
        """
        Yield the items of `_fetch_items`, each one paired with None
        since there is nobody to reply to.
        """
        items = self._fetch_items()
        for item in items:
            yield (item, None)

    def fetch_items_with_beanstalkd_reply(self):
        """
        Fetch items with beanstalkd reply (useful if the task is distributed).

        :returns: an iterable of (item, beanstalkd_reply) pairs, read from
            the beanstalkd when one is configured, from `_fetch_items`
            otherwise.
        """
        if self.beanstalkd:
            return self._fetch_items_with_beanstalkd_reply_from_beanstalkd()
        return self._fetch_items_with_beanstalkd_reply()

    def update_counters(self, task_res):
        """
        Update all counters of the tool.

        :param task_res: (item, info, error) tuple; the item counts as an
            error when `error` is not None.
        """
        _, _, error = task_res
        self.items_processed += 1
        if error is not None:
            self.errors += 1

    def _update_total_counters(self):
        """
        Add the counters of the current interval to the totals, then
        reset them.

        :returns: (items processed during the interval, total items
            processed, errors during the interval, total errors).
        """
        items_processed = self.items_processed
        self.items_processed = 0
        self.total_items_processed += items_processed
        errors = self.errors
        self.errors = 0
        self.total_errors += errors
        return items_processed, self.total_items_processed, \
            errors, self.total_errors

    def _get_report(self, status, end_time, counters):
        """
        Build the report to log, from the counters returned by
        `_update_total_counters`.
        """
        raise NotImplementedError()

    def log_report(self, status, force=False):
        """
        Log a report with a fixed interval.

        :param status: forwarded to `_get_report`.
        :param force: log the report even if `report_interval` seconds
            have not elapsed since the last one.
        """
        end_time = time.time()
        if force or (end_time - self.last_report >= self.report_interval):
            counters = self._update_total_counters()
            self.logger.info(self._get_report(status, end_time, counters))
            self.last_report = end_time

    def create_worker(self, queue_workers, queue_reply):
        """
        Create worker to process the items.
        """
        raise NotImplementedError()

    def prepare_local_dispatcher(self):
        """
        The tool will dispatch the tasks locally.
        """
        self.dispatcher = _LocalDispatcher(self.conf, self)

    def prepare_distributed_dispatcher(self):
        """
        The tool will dispatch the tasks on the platform.
        """
        self.dispatcher = _DistributedDispatcher(self.conf, self)

    def _load_total_expected_items(self):
        raise NotImplementedError()

    def _read_retry_queue(self):
        """
        Reschedule, forever, the items put in the retry queue.

        Each item is sent back as a task event, delayed by
        `self.retry_delay`; sending is attempted again every second
        until `send_job` returns a true value.
        """
        if self.retry_queue is None:
            return
        while True:
            # Reschedule jobs we were not able to handle.
            item = self.retry_queue.get()
            if self.retryer:
                sent = False
                while not sent:
                    sent = self.retryer.send_job(
                        json.dumps(self.task_event_from_item(item)),
                        delay=self.retry_delay)
                    if not sent:
                        sleep(1.0)
                self.retryer.job_done()
            self.retry_queue.task_done()

    def run(self):
        """
        Start processing all found items.

        :returns: a generator yielding the task results produced by the
            dispatcher.
        :raises ValueError: if no dispatcher has been prepared.
        """
        if self.dispatcher is None:
            raise ValueError('No dispatcher')

        eventlet.spawn_n(self._load_total_expected_items)

        # spawn one worker for the retry queue
        eventlet.spawn_n(self._read_retry_queue)

        for task_res in self.dispatcher.run():
            yield task_res

        # block until the retry queue is empty
        # (explicit None test: the queue may be empty but must still be
        # joined, whatever its truth value)
        if self.retry_queue is not None:
            self.retry_queue.join()

    def is_success(self):
        """
        Check if there are any errors.

        :returns: False if the tool was interrupted or if at least one
            error has been counted in the totals, True otherwise.
        """
        if not self.success:
            return False
        if self.total_items_processed == 0:
            self.logger.warning('No item to process')
        return self.total_errors == 0
class ToolWorker(object):
    """
    Process all items given by the tool.
    """

    def __init__(self, tool, queue_workers, queue_reply):
        """
        :param tool: the tool this worker belongs to (provides the
            configuration, the logger and the conversion helpers).
        :param queue_workers: queue of (item, beanstalkd_reply) pairs to
            process; None is the end signal.
        :param queue_reply: queue receiving one task result per item.
        """
        self.tool = tool
        self.conf = self.tool.conf
        self.logger = self.tool.logger
        self.queue_workers = queue_workers
        self.queue_reply = queue_reply

        # reply
        self.beanstalkd_reply = None

    def _process_item(self, item):
        """
        Process one item and return the `info` part of its task result.
        """
        raise NotImplementedError()

    def _reply_task_res(self, beanstalkd_reply, task_res):
        """
        Publish a task result on the reply queue and, when the sender of
        the task asked for it, on its reply tube.

        :param beanstalkd_reply: None, or a dictionary with the 'addr' and
            'tube' of the beanstalkd waiting for the result.
        :param task_res: (item, info, error) tuple.
        """
        self.queue_reply.put(task_res)

        if beanstalkd_reply is None:
            return

        res_event = self.tool.res_event_from_task_res(task_res)
        if self.tool.beanstalkd is not None:
            # Tell the receiver which worker produced the result.
            res_event['beanstalkd_worker'] = \
                {
                    'addr': self.tool.beanstalkd.addr,
                    'tube': self.tool.beanstalkd.tube
                }

        try:
            # The sender is kept between calls and replaced only when
            # the reply address or tube changes.
            if self.beanstalkd_reply is None \
                    or self.beanstalkd_reply.addr != beanstalkd_reply['addr'] \
                    or self.beanstalkd_reply.tube != beanstalkd_reply['tube']:
                if self.beanstalkd_reply is not None:
                    self.beanstalkd_reply.close()
                self.beanstalkd_reply = BeanstalkdSender(
                    beanstalkd_reply['addr'], beanstalkd_reply['tube'],
                    self.logger)

            sent = False
            event_json = json.dumps(res_event)
            # This will loop forever if there is a connection issue with the
            # beanstalkd server. We chose to let it loop until someone fixes
            # the problem (or the problem resolves by magic).
            while not sent:
                sent = self.beanstalkd_reply.send_job(event_json)
                if not sent:
                    sleep(1.0)
            self.beanstalkd_reply.job_done()
        except Exception as exc:  # pylint: disable=broad-except
            # Best effort: the result is already on the local reply queue.
            item, info, error = task_res
            self.logger.warning(
                'Beanstalkd reply failed %s (info=%s error=%s): %s',
                self.tool.string_from_item(item), str(info), error, exc)

    def run(self):
        """
        Starting processing all items given by the tool.

        Loop until None is read from the workers queue. Every item yields
        exactly one task result, even when its processing fails or is
        rescheduled.
        """
        while True:
            item_with_beanstalkd_reply = self.queue_workers.get()
            if item_with_beanstalkd_reply is None:  # end signal
                break
            item, beanstalkd_reply = item_with_beanstalkd_reply
            info = None
            error = None
            try:
                info = self._process_item(item)
            except RetryLater as exc:
                # Do not assume the exception carries an argument: an
                # IndexError raised here would kill the worker before
                # task_done() is called.
                reason = exc.args[0] if exc.args else exc
                # Schedule a retry only if the sender did not set reply address
                # (rebuild CLIs set reply address, meta2 does not).
                if self.tool.retry_queue is not None \
                        and not beanstalkd_reply:
                    self.logger.warning(
                        "Putting an item (%s) in the retry queue: %s",
                        self.tool.string_from_item(item), reason)
                    self.tool.retry_queue.put(item)
                else:
                    error = str(reason)
            except Exception as exc:  # pylint: disable=broad-except
                error = str(exc)
            task_res = (item, info, error)
            self._reply_task_res(beanstalkd_reply, task_res)
            self.queue_workers.task_done()
def __init__(self, conf, tool):
    """
    Select the beanstalkd instances used to distribute the tasks
    and to receive the replies.

    :param conf: configuration dictionary;
        'distributed_beanstalkd_worker_tube' is read here.
    :param tool: the tool whose tasks are dispatched (provides the
        default tube name).
    :raises OioException: if no beanstalkd has a positive score, if the
        workers tube exists on none of them, if no beanstalkd can receive
        the replies, or if the reply tube already exists.
    """
    super(_DistributedDispatcher, self).__init__(conf, tool)
    self.sending = False

    # All available beanstalkd (services with a score <= 0 are ignored)
    conscience_client = ConscienceClient(self.conf)
    all_beanstalkd = conscience_client.all_services('beanstalkd')
    all_available_beanstalkd = dict()
    for beanstalkd in all_beanstalkd:
        if beanstalkd['score'] <= 0:
            continue
        all_available_beanstalkd[beanstalkd['addr']] = beanstalkd
    if not all_available_beanstalkd:
        raise OioException('No beanstalkd available')

    # Beanstalkd workers
    workers_tube = self.conf.get('distributed_beanstalkd_worker_tube') \
        or self.tool.DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE
    self.beanstalkd_workers = dict()
    for beanstalkd in all_available_beanstalkd.values():
        beanstalkd_worker_addr = beanstalkd['addr']

        # If the tube exists,
        # there should be a service that listens to this tube
        tubes = Beanstalk.from_url('beanstalk://' +
                                   beanstalkd_worker_addr).tubes()
        if workers_tube not in tubes:
            continue

        beanstalkd_worker = BeanstalkdSender(beanstalkd_worker_addr,
                                             workers_tube, self.logger)
        self.beanstalkd_workers[beanstalkd_worker_addr] = beanstalkd_worker
        self.logger.info(
            'Beanstalkd %s using tube %s is selected as a worker',
            beanstalkd_worker.addr, beanstalkd_worker.tube)
    if not self.beanstalkd_workers:
        raise OioException('No beanstalkd worker available')

    # Beanstalkd reply: prefer the local beanstalkd with the best score
    beanstalkd_reply = dict()
    try:
        local_services = conscience_client.local_services()
        for local_service in local_services:
            if local_service['type'] != 'beanstalkd':
                continue
            beanstalkd = all_available_beanstalkd.get(
                local_service['addr'])
            if beanstalkd is None:
                continue
            if beanstalkd_reply \
                    and beanstalkd_reply['score'] >= beanstalkd['score']:
                continue
            beanstalkd_reply = beanstalkd
    except Exception as exc:  # pylint: disable=broad-except
        self.logger.warning(
            'ERROR when searching for beanstalkd locally: %s', exc)
    if not beanstalkd_reply:
        self.logger.warning('No beanstalkd available locally')

        # Fall back on any instance proposed by the conscience
        try:
            beanstalkd = conscience_client.next_instance('beanstalkd')
            beanstalkd_reply = all_available_beanstalkd[beanstalkd['addr']]
        except Exception as exc:  # pylint: disable=broad-except
            self.logger.warning('ERROR when searching for beanstalkd: %s',
                                exc)
    if not beanstalkd_reply:
        # Both searches failed: fail explicitly instead of raising
        # KeyError('addr') on the empty dictionary.
        raise OioException('No beanstalkd available for the replies')
    beanstalkd_reply_addr = beanstalkd_reply['addr']

    # If the tube exists, another service must have already used this tube
    tube_reply = workers_tube + '.reply.' + str(time.time())
    tubes = Beanstalk.from_url('beanstalk://' +
                               beanstalkd_reply_addr).tubes()
    if tube_reply in tubes:
        raise OioException(
            'Beanstalkd %s using tube %s is already used'
            % (beanstalkd_reply_addr, tube_reply))

    self.beanstalkd_reply = BeanstalkdListener(beanstalkd_reply_addr,
                                               tube_reply, self.logger)
    self.logger.info(
        'Beanstalkd %s using tube %s is selected for the replies',
        self.beanstalkd_reply.addr, self.beanstalkd_reply.tube)
class ToolWorker(object):
    """
    Worker consuming the items queued by the tool.

    One task result, ``(item, info, error)``, is published for every
    item read from the workers queue.
    """

    def __init__(self, tool, queue_workers, queue_reply):
        """
        :param tool: the owning tool (configuration, logger, helpers).
        :param queue_workers: queue of (item, beanstalkd_reply) pairs;
            None ends the worker.
        :param queue_reply: queue on which the task results are put.
        """
        self.tool = tool
        self.conf = tool.conf
        self.logger = tool.logger
        self.queue_workers = queue_workers
        self.queue_reply = queue_reply

        # Sender towards the last reply tube used, created on demand
        self.beanstalkd_reply = None

    def _process_item(self, item):
        """
        Process one item; the returned value is the `info` of its result.
        """
        raise NotImplementedError()

    def _reply_task_res(self, beanstalkd_reply, task_res):
        """
        Put the task result on the reply queue, then send it to the
        reply tube described by `beanstalkd_reply` unless it is None.
        Failures of this second step are only logged.
        """
        self.queue_reply.put(task_res)
        if beanstalkd_reply is None:
            return

        res_event = self.tool.res_event_from_task_res(task_res)
        worker = self.tool.beanstalkd
        if worker is not None:
            res_event['beanstalkd_worker'] = {
                'addr': worker.addr,
                'tube': worker.tube,
            }

        try:
            current = self.beanstalkd_reply
            if current is None:
                stale = True
            else:
                stale = (current.addr != beanstalkd_reply['addr']
                         or current.tube != beanstalkd_reply['tube'])
            if stale:
                # Another destination than last time: replace the sender
                if current is not None:
                    current.close()
                self.beanstalkd_reply = BeanstalkdSender(
                    beanstalkd_reply['addr'], beanstalkd_reply['tube'],
                    self.logger)

            self.beanstalkd_reply.send_job(json.dumps(res_event))
        except Exception as exc:  # pylint: disable=broad-except
            item, info, error = task_res
            self.logger.warn(
                'Beanstalkd reply failed %s (info=%s error=%s): %s',
                self.tool.string_from_item(item), str(info), error, exc)

    def run(self):
        """
        Process the queued items until the end signal (None) is read.
        """
        entry = self.queue_workers.get()
        while entry is not None:
            item, beanstalkd_reply = entry
            try:
                info, error = self._process_item(item), None
            except Exception as exc:  # pylint: disable=broad-except
                info, error = None, str(exc)
            self._reply_task_res(beanstalkd_reply, (item, info, error))
            self.queue_workers.task_done()
            entry = self.queue_workers.get()