class BlobRebuilderWorker(object):
    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(
            conf.get('dry_run', False))
        self.report_interval = int_value(
            conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(
            conf.get('chunks_per_second'), 30)
        self.max_bytes_per_second = int_value(
            conf.get('bytes_per_second'), 10000000)
        self.rdir_fetch_limit = int_value(
            conf.get('rdir_fetch_limit'), 100)
        self.rdir_client = RdirClient(conf)
        self.content_factory = ContentFactory(conf)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def rebuilder_pass(self):
        start_time = report_time = time.time()

        total_errors = 0
        rebuilder_time = 0

        chunks = self.rdir_client.chunk_fetch(self.volume,
                                              limit=self.rdir_fetch_limit,
                                              rebuild=True)
        for container_id, content_id, chunk_id, data in chunks:
            loop_time = time.time()

            if self.dry_run:
                self.dryrun_chunk_rebuild(container_id, content_id, chunk_id)
            else:
                self.safe_chunk_rebuild(container_id, content_id, chunk_id)

            self.chunks_run_time = ratelimit(
                self.chunks_run_time,
                self.max_chunks_per_second
            )
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    '%(start_time)s '
                    '%(passes)d '
                    '%(errors)d '
                    '%(c_rate).2f '
                    '%(b_rate).2f '
                    '%(total).2f '
                    '%(rebuilder_time).2f '
                    '%(rebuilder_rate).2f' % {
                        'start_time': time.ctime(report_time),
                        'passes': self.passes,
                        'errors': self.errors,
                        'c_rate': self.passes / (now - report_time),
                        'b_rate': self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time': rebuilder_time,
                        'rebuilder_rate': rebuilder_time / (now - start_time)
                    }
                )
                report_time = now
                total_errors += self.errors
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        elapsed = (time.time() - start_time) or 0.000001
        self.logger.info(
            '%(elapsed).02f '
            '%(errors)d '
            '%(chunk_rate).2f '
            '%(bytes_rate).2f '
            '%(rebuilder_time).2f '
            '%(rebuilder_rate).2f' % {
                'elapsed': elapsed,
                'errors': total_errors + self.errors,
                'chunk_rate': self.total_chunks_processed / elapsed,
                'bytes_rate': self.total_bytes_processed / elapsed,
                'rebuilder_time': rebuilder_time,
                'rebuilder_rate': rebuilder_time / elapsed
            }
        )

    def dryrun_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info("[dryrun] Rebuilding "
                         "container %s, content %s, chunk %s",
                         container_id, content_id, chunk_id)
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id):
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id, e)
        self.passes += 1

    def chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id)
        try:
            content = self.content_factory.get(container_id, content_id)
        except ContentNotFound:
            raise OrphanChunk('Content not found')
        chunk = content.chunks.filter(id=chunk_id).one()
        if chunk is None:
            raise OrphanChunk("Chunk not found in content")
        chunk_size = chunk.size
        content.rebuild_chunk(chunk_id)
        self.rdir_client.chunk_push(self.volume, container_id, content_id,
                                    chunk_id, rtime=int(time.time()))
        self.bytes_processed += chunk_size
        self.total_bytes_processed += chunk_size
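
# Usage sketch (illustrative, not from the source): drive one locked rebuild
# pass over a single rawx volume. The conf keys and the volume address below
# are assumptions; real deployments read them from the rebuilder's
# configuration file.
def example_single_pass():
    conf = {
        'dry_run': 'true',         # parsed by true_value(): log, do not rebuild
        'report_interval': '300',  # report every 5 minutes instead of hourly
    }
    worker = BlobRebuilderWorker(conf, logger=None, volume='127.0.0.1:6201')
    # Takes the rdir admin lock for the volume, rebuilds every chunk flagged
    # for rebuild, then releases the lock even if the pass raised.
    worker.rebuilder_pass_with_lock()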
class BlobRebuilder(Tool):
    """
    Rebuild chunks.
    """

    DEFAULT_BEANSTALKD_WORKER_TUBE = 'oio-rebuild'
    DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE = 'oio-rebuild'
    DEFAULT_RDIR_FETCH_LIMIT = 100
    DEFAULT_RDIR_TIMEOUT = 60.0
    DEFAULT_ALLOW_FROZEN_CT = False
    DEFAULT_ALLOW_SAME_RAWX = True
    DEFAULT_TRY_CHUNK_DELETE = False
    DEFAULT_DRY_RUN = False

    def __init__(self, conf, input_file=None, service_id=None, **kwargs):
        super(BlobRebuilder, self).__init__(conf, **kwargs)

        # counters
        self.bytes_processed = 0
        self.total_bytes_processed = 0

        # input
        self.input_file = input_file
        self.rawx_id = service_id

        # rawx/rdir
        self.rdir_client = RdirClient(self.conf, logger=self.logger)
        self.rdir_fetch_limit = int_value(self.conf.get('rdir_fetch_limit'),
                                          self.DEFAULT_RDIR_FETCH_LIMIT)
        self.rdir_shuffle_chunks = true_value(conf.get('rdir_shuffle_chunks'))
        self.rdir_timeout = float_value(conf.get('rdir_timeout'),
                                        self.DEFAULT_RDIR_TIMEOUT)

    @staticmethod
    def items_from_task_event(task_event):
        namespace = task_event['url']['ns']
        container_id = task_event['url']['id']
        content_id = task_event['url']['content']
        for chunk_id_or_pos in task_event['data']['missing_chunks']:
            yield namespace, container_id, content_id, str(chunk_id_or_pos)

    @staticmethod
    def task_event_from_item(item):
        namespace, container_id, content_id, chunk_id_or_pos = item
        return {
            'when': time.time(),
            'event': EventTypes.CONTENT_BROKEN,
            'url': {
                'ns': namespace,
                'id': container_id,
                'content': content_id
            },
            'data': {
                'missing_chunks': [
                    chunk_id_or_pos
                ]
            }
        }

    @staticmethod
    def tasks_res_from_res_event(res_event):
        namespace = res_event['url']['ns']
        container_id = res_event['url']['id']
        content_id = res_event['url']['content']
        for chunk_rebuilt in res_event['data']['chunks_rebuilt']:
            yield (namespace, container_id, content_id,
                   str(chunk_rebuilt['chunk_id_or_pos'])), \
                chunk_rebuilt['bytes_processed'], chunk_rebuilt['error']

    @staticmethod
    def res_event_from_task_res(task_res):
        item, bytes_processed, error = task_res
        namespace, container_id, content_id, chunk_id_or_pos = item
        return {
            'when': time.time(),
            'event': EventTypes.CONTENT_REBUILT,
            'url': {
                'ns': namespace,
                'id': container_id,
                'content': content_id
            },
            'data': {
                'chunks_rebuilt': [{
                    'chunk_id_or_pos': chunk_id_or_pos,
                    'bytes_processed': bytes_processed,
                    'error': error
                }]
            }
        }

    @staticmethod
    def string_from_item(item):
        namespace, container_id, content_id, chunk_id_or_pos = item
        return '%s|%s|%s|%s' % (namespace, container_id, content_id,
                                chunk_id_or_pos)

    def _fetch_items_from_input_file(self):
        with open(self.input_file, 'r') as ifile:
            for line in ifile:
                stripped = line.strip()
                if not stripped or stripped.startswith('#'):
                    continue
                container_id, content_id, chunk_id_or_pos = \
                    stripped.split('|', 3)[:3]
                yield self.namespace, container_id, content_id, \
                    chunk_id_or_pos

    def _fetch_items_from_rawx_id(self):
        lost_chunks = self.rdir_client.chunk_fetch(
            self.rawx_id, limit=self.rdir_fetch_limit, rebuild=True,
            shuffle=self.rdir_shuffle_chunks, timeout=self.rdir_timeout)
        for container_id, content_id, chunk_id, _ in lost_chunks:
            yield self.namespace, container_id, content_id, chunk_id

    def _fetch_items(self):
        if self.input_file:
            return self._fetch_items_from_input_file()
        if self.rawx_id:
            return self._fetch_items_from_rawx_id()

        def _empty_generator():
            return
            yield  # pylint: disable=unreachable
        return _empty_generator()

    def update_counters(self, task_res):
        super(BlobRebuilder, self).update_counters(task_res)
        _, bytes_processed, _ = task_res
        if bytes_processed is not None:
            self.bytes_processed += bytes_processed

    def _update_total_counters(self):
        chunks_processed, total_chunks_processed, errors, total_errors = \
            super(BlobRebuilder, self)._update_total_counters()
        bytes_processed = self.bytes_processed
        self.bytes_processed = 0
        self.total_bytes_processed += bytes_processed
        return chunks_processed, total_chunks_processed, \
            bytes_processed, self.total_bytes_processed, \
            errors, total_errors

    def _get_report(self, status, end_time, counters):
        chunks_processed, total_chunks_processed, \
            bytes_processed, total_bytes_processed, \
            errors, total_errors = counters
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        report = (
            '%(status)s '
            'last_report=%(last_report)s %(time_since_last_report).2fs '
            'chunks=%(chunks)d %(chunks_rate).2f/s '
            'bytes=%(bytes)d %(bytes_rate).2fB/s '
            'errors=%(errors)d %(errors_rate).2f%% '
            'start_time=%(start_time)s %(total_time).2fs '
            'total_chunks=%(total_chunks)d %(total_chunks_rate).2f/s '
            'total_bytes=%(total_bytes)d %(total_bytes_rate).2fB/s '
            'total_errors=%(total_errors)d %(total_errors_rate).2f%%' % {
                'status': status,
                'last_report': datetime.fromtimestamp(
                    int(self.last_report)).isoformat(),
                'time_since_last_report': time_since_last_report,
                'chunks': chunks_processed,
                'chunks_rate': chunks_processed / time_since_last_report,
                'bytes': bytes_processed,
                'bytes_rate': bytes_processed / time_since_last_report,
                'errors': errors,
                'errors_rate': 100 * errors / float(chunks_processed or 1),
                'start_time': datetime.fromtimestamp(
                    int(self.start_time)).isoformat(),
                'total_time': total_time,
                'total_chunks': total_chunks_processed,
                'total_chunks_rate': total_chunks_processed / total_time,
                'total_bytes': total_bytes_processed,
                'total_bytes_rate': total_bytes_processed / total_time,
                'total_errors': total_errors,
                'total_errors_rate':
                    100 * total_errors / float(total_chunks_processed or 1)
            })
        if self.total_expected_items is not None:
            progress = 100 * total_chunks_processed / \
                float(self.total_expected_items or 1)
            report += ' progress=%d/%d %.2f%%' % \
                (total_chunks_processed, self.total_expected_items, progress)
        return report

    def create_worker(self, queue_workers, queue_reply):
        return BlobRebuilderWorker(self, queue_workers, queue_reply)

    def _load_total_expected_items(self):
        if self.rawx_id:
            try:
                info = self.rdir_client.status(
                    self.rawx_id, read_timeout=self.rdir_timeout)
                self.total_expected_items = info.get('chunk', dict()).get(
                    'to_rebuild', None)
            except Exception as exc:
                self.logger.warn(
                    'Failed to fetch the total chunks to rebuild: %s', exc)

    def run(self):
        if self.rawx_id:
            self.rdir_client.admin_lock(self.rawx_id,
                                        "rebuilder on %s" % gethostname(),
                                        timeout=self.rdir_timeout)
        success = super(BlobRebuilder, self).run()
        if self.rawx_id:
            self.rdir_client.admin_unlock(self.rawx_id,
                                          timeout=self.rdir_timeout)
        return success
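
# Round-trip sketch (hypothetical ids): task_event_from_item() and
# items_from_task_event() are intended to be inverses, so an item tuple
# survives being serialized into a CONTENT_BROKEN event and parsed back.
def example_event_round_trip():
    item = ('OPENIO', 'CID' * 16, 'CONTENT0123456789', '2.1')
    event = BlobRebuilder.task_event_from_item(item)
    # 'when' and 'event' are ignored by the parser; only url/data matter.
    assert list(BlobRebuilder.items_from_task_event(event)) == [item]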
class BlobRebuilderWorker(object):
    def __init__(self, conf, logger, volume, input_file=None,
                 try_chunk_delete=False, beanstalkd_addr=None):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(conf.get('dry_run', False))
        self.report_interval = int_value(conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(conf.get('chunks_per_second'),
                                               30)
        self.max_bytes_per_second = int_value(conf.get('bytes_per_second'),
                                              10000000)
        self.rdir_fetch_limit = int_value(conf.get('rdir_fetch_limit'), 100)
        self.allow_same_rawx = true_value(conf.get('allow_same_rawx'))
        self.input_file = input_file
        self.rdir_client = RdirClient(conf, logger=self.logger)
        self.content_factory = ContentFactory(conf)
        self.try_chunk_delete = try_chunk_delete
        self.beanstalkd_addr = beanstalkd_addr
        self.beanstalkd_tube = conf.get('beanstalkd_tube', 'rebuild')
        self.beanstalk = None

    def _fetch_chunks_from_event(self, job_id, data):
        env = json.loads(data)
        for chunk_pos in env['data']['missing_chunks']:
            yield [env['url']['id'], env['url']['content'],
                   str(chunk_pos), None]

    def _connect_to_beanstalk(self):
        self.beanstalk = Beanstalk.from_url(self.beanstalkd_addr)
        self.beanstalk.use(self.beanstalkd_tube)
        self.beanstalk.watch(self.beanstalkd_tube)

    def _handle_beanstalk_event(self, conn_error):
        try:
            job_id, data = self.beanstalk.reserve()
            if conn_error:
                self.logger.warn("beanstalk reconnected")
        except ConnectionError:
            if not conn_error:
                self.logger.warn("beanstalk connection error")
            raise
        try:
            for chunk in self._fetch_chunks_from_event(job_id, data):
                yield chunk
            self.beanstalk.delete(job_id)
        except Exception:
            self.logger.exception("handling event %s (bury)", job_id)
            self.beanstalk.bury(job_id)

    def _fetch_chunks_from_beanstalk(self):
        conn_error = False
        while True:
            try:
                self._connect_to_beanstalk()
                for chunk in self._handle_beanstalk_event(conn_error):
                    conn_error = False
                    yield chunk
            except ConnectionError:
                conn_error = True
                time.sleep(1.0)

    def _fetch_chunks_from_file(self):
        with open(self.input_file, 'r') as ifile:
            for line in ifile:
                stripped = line.strip()
                if stripped and not stripped.startswith('#'):
                    yield stripped.split('|', 3)[:3] + [None]

    def _fetch_chunks(self):
        if self.input_file:
            return self._fetch_chunks_from_file()
        elif self.beanstalkd_addr:
            return self._fetch_chunks_from_beanstalk()
        else:
            return self.rdir_client.chunk_fetch(self.volume,
                                                limit=self.rdir_fetch_limit,
                                                rebuild=True)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def rebuilder_pass(self):
        start_time = report_time = time.time()

        rebuilder_time = 0

        chunks = self._fetch_chunks()
        for cid, content_id, chunk_id_or_pos, _ in chunks:
            loop_time = time.time()

            if self.dry_run:
                self.dryrun_chunk_rebuild(cid, content_id, chunk_id_or_pos)
            else:
                self.safe_chunk_rebuild(cid, content_id, chunk_id_or_pos)

            self.chunks_run_time = ratelimit(self.chunks_run_time,
                                             self.max_chunks_per_second)
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    'RUN %(volume)s '
                    'started=%(start_time)s '
                    'passes=%(passes)d '
                    'errors=%(errors)d '
                    'chunks=%(nb_chunks)d %(c_rate).2f/s '
                    'bytes=%(nb_bytes)d %(b_rate).2fB/s '
                    'elapsed=%(total).2f '
                    '(rebuilder: %(success_rate).2f%%)' % {
                        'volume': self.volume,
                        'start_time': datetime.fromtimestamp(
                            int(report_time)).isoformat(),
                        'passes': self.passes,
                        'errors': self.errors,
                        'nb_chunks': self.total_chunks_processed,
                        'nb_bytes': self.total_bytes_processed,
                        'c_rate': self.passes / (now - report_time),
                        'b_rate': self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time': rebuilder_time,
                        'success_rate':
                            100 * ((self.total_chunks_processed -
                                    self.errors) /
                                   float(self.total_chunks_processed))
                    })
                report_time = now
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        end_time = time.time()
        elapsed = (end_time - start_time) or 0.000001
        self.logger.info(
            'DONE %(volume)s '
            'started=%(start_time)s '
            'ended=%(end_time)s '
            'passes=%(passes)d '
            'elapsed=%(elapsed).02f '
            'errors=%(errors)d '
            'chunks=%(nb_chunks)d %(c_rate).2f/s '
            'bytes=%(nb_bytes)d %(b_rate).2fB/s '
            'rebuilder_time=%(rebuilder_time).2f '
            '(rebuilder: %(success_rate).2f%%)' % {
                'volume': self.volume,
                'start_time': datetime.fromtimestamp(
                    int(start_time)).isoformat(),
                'end_time': datetime.fromtimestamp(
                    int(end_time)).isoformat(),
                'passes': self.passes,
                'elapsed': elapsed,
                'errors': self.errors,
                'nb_chunks': self.total_chunks_processed,
                'nb_bytes': self.total_bytes_processed,
                'c_rate': self.total_chunks_processed / elapsed,
                'b_rate': self.total_bytes_processed / elapsed,
                'rebuilder_time': rebuilder_time,
                'success_rate':
                    100 * ((self.total_chunks_processed - self.errors) /
                           float(self.total_chunks_processed or 1))
            })

    def dryrun_chunk_rebuild(self, container_id, content_id,
                             chunk_id_or_pos):
        self.logger.info("[dryrun] Rebuilding "
                         "container %s, content %s, chunk %s",
                         container_id, content_id, chunk_id_or_pos)
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id_or_pos):
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id_or_pos)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id_or_pos, e)
        self.passes += 1

    def chunk_rebuild(self, container_id, content_id, chunk_id_or_pos):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id_or_pos)
        try:
            content = self.content_factory.get(container_id, content_id)
        except ContentNotFound:
            raise OrphanChunk('Content not found: possible orphan chunk')

        chunk_size = 0
        chunk_pos = None
        if len(chunk_id_or_pos) < 32:
            chunk_pos = chunk_id_or_pos
            chunk_id = None
            metapos = int(chunk_pos.split('.', 1)[0])
            chunk_size = content.chunks.filter(metapos=metapos).all()[0].size
        else:
            if '/' in chunk_id_or_pos:
                chunk_id = chunk_id_or_pos.rsplit('/', 1)[-1]
            else:
                chunk_id = chunk_id_or_pos
            chunk = content.chunks.filter(id=chunk_id).one()
            if chunk is None:
                raise OrphanChunk("Chunk not found in content: "
                                  "possible orphan chunk")
            elif self.volume and chunk.host != self.volume:
                raise ValueError("Chunk does not belong to this volume")
            chunk_size = chunk.size

        content.rebuild_chunk(chunk_id, allow_same_rawx=self.allow_same_rawx,
                              chunk_pos=chunk_pos)

        # `chunk` is only bound when a chunk id was given, so the optional
        # delete must not run for rebuilds requested by position.
        if self.try_chunk_delete and chunk_id is not None:
            try:
                content.blob_client.chunk_delete(chunk.url)
                self.logger.info("Chunk %s deleted", chunk.url)
            except NotFound as exc:
                self.logger.debug("Chunk %s: %s", chunk.url, exc)

        # This call does not raise an exception if the chunk is not
        # referenced.
        if chunk_id is not None:
            self.rdir_client.chunk_delete(chunk.host, container_id,
                                          content_id, chunk_id)

        self.bytes_processed += chunk_size
        self.total_bytes_processed += chunk_size
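
# Input-format sketch: chunk_rebuild() accepts either a chunk position or a
# chunk id. Anything shorter than 32 characters is read as a position such
# as '2' or '2.1' (metachunk index plus optional subposition); longer
# strings are chunk ids, optionally given as full chunk URLs. A standalone
# mirror of that parsing, with illustrative values:
def classify_chunk_ref(chunk_id_or_pos):
    if len(chunk_id_or_pos) < 32:
        return 'position', chunk_id_or_pos
    if '/' in chunk_id_or_pos:
        return 'chunk_id', chunk_id_or_pos.rsplit('/', 1)[-1]
    return 'chunk_id', chunk_id_or_pos

assert classify_chunk_ref('2.1') == ('position', '2.1')
assert classify_chunk_ref('AF02' * 16) == ('chunk_id', 'AF02' * 16)
assert classify_chunk_ref('http://127.0.0.1:6201/' + 'AF02' * 16) == \
    ('chunk_id', 'AF02' * 16)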
class BlobRebuilder(Rebuilder):
    def __init__(self, conf, logger, volume, try_chunk_delete=False,
                 beanstalkd_addr=None, **kwargs):
        super(BlobRebuilder, self).__init__(conf, logger, **kwargs)
        self.volume = volume
        self.rdir_client = RdirClient(conf, logger=self.logger)
        self.try_chunk_delete = try_chunk_delete
        self.beanstalkd_addr = beanstalkd_addr
        self.beanstalkd_tube = conf.get('beanstalkd_tube',
                                        DEFAULT_REBUILDER_TUBE)
        self.beanstalk = None
        self.rdir_fetch_limit = int_value(conf.get('rdir_fetch_limit'), 100)

    def _fetch_chunks_from_event(self, job_id, data):
        env = json.loads(data)
        for chunk_pos in env['data']['missing_chunks']:
            yield [env['url']['id'], env['url']['content'],
                   str(chunk_pos), None]

    def _connect_to_beanstalk(self):
        self.logger.debug('Connecting to %s', self.beanstalkd_addr)
        self.beanstalk = Beanstalk.from_url(self.beanstalkd_addr)
        self.logger.debug('Using tube %s', self.beanstalkd_tube)
        self.beanstalk.use(self.beanstalkd_tube)
        self.beanstalk.watch(self.beanstalkd_tube)

    def _handle_beanstalk_event(self, conn_error):
        try:
            job_id, data = self.beanstalk.reserve()
            if conn_error:
                self.logger.warn("beanstalk reconnected")
        except ConnectionError:
            if not conn_error:
                self.logger.warn("beanstalk connection error")
            raise
        try:
            for chunk in self._fetch_chunks_from_event(job_id, data):
                yield chunk
            self.beanstalk.delete(job_id)
        except Exception:
            self.logger.exception("handling event %s (bury)", job_id)
            self.beanstalk.bury(job_id)

    def _fetch_chunks_from_beanstalk(self):
        conn_error = False
        while True:
            try:
                self._connect_to_beanstalk()
                for chunk in self._handle_beanstalk_event(conn_error):
                    conn_error = False
                    yield chunk
            except ConnectionError as exc:
                self.logger.warn('Disconnected: %s', exc)
                if 'Invalid URL' in str(exc):
                    raise
                conn_error = True
                time.sleep(1.0)

    def _fetch_chunks_from_file(self):
        with open(self.input_file, 'r') as ifile:
            for line in ifile:
                stripped = line.strip()
                if stripped and not stripped.startswith('#'):
                    yield stripped.split('|', 3)[:3] + [None]

    def _fetch_chunks(self):
        if self.input_file:
            return self._fetch_chunks_from_file()
        elif self.beanstalkd_addr:
            return self._fetch_chunks_from_beanstalk()
        else:
            return self.rdir_client.chunk_fetch(self.volume,
                                                limit=self.rdir_fetch_limit,
                                                rebuild=True)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def _create_worker(self, **kwargs):
        return BlobRebuilderWorker(self.conf, self.logger, self.volume,
                                   self.try_chunk_delete)

    def _fill_queue(self, queue, **kwargs):
        chunks = self._fetch_chunks()
        for chunk in chunks:
            queue.put(chunk)

    def _init_info(self, **kwargs):
        return 0

    def _compute_info(self, worker, total_bytes_processed, **kwargs):
        total_bytes_processed += worker.total_bytes_processed
        return total_bytes_processed

    def _get_report(self, start_time, end_time, passes, errors,
                    waiting_time, rebuilder_time, elapsed,
                    total_chunks_processed, total_bytes_processed, **kwargs):
        return ('DONE %(volume)s '
                'started=%(start_time)s '
                'ended=%(end_time)s '
                'elapsed=%(elapsed).2f '
                'passes=%(passes)d '
                'errors=%(errors)d '
                'chunks=%(nb_chunks)d %(c_rate).2f/s '
                'bytes=%(nb_bytes)d %(b_rate).2fB/s '
                'waiting_time=%(waiting_time).2f '
                'rebuilder_time=%(rebuilder_time).2f '
                '(rebuilder: %(success_rate).2f%%)' % {
                    'volume': self.volume,
                    'start_time': datetime.fromtimestamp(
                        int(start_time)).isoformat(),
                    'end_time': datetime.fromtimestamp(
                        int(end_time)).isoformat(),
                    'elapsed': elapsed,
                    'passes': passes,
                    'errors': errors,
                    'nb_chunks': total_chunks_processed,
                    'nb_bytes': total_bytes_processed,
                    'c_rate': total_chunks_processed / elapsed,
                    'b_rate': total_bytes_processed / elapsed,
                    'rebuilder_time': rebuilder_time,
                    'waiting_time': waiting_time,
                    'success_rate':
                        100 * ((total_chunks_processed - errors) /
                               float(total_chunks_processed or 1))
                })
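
# Usage sketch (illustrative endpoint): _fetch_chunks() picks its source in
# order: input file, then beanstalkd, then the volume's rdir. Pointing the
# rebuilder at a beanstalkd tube turns it into a long-running consumer of
# rebuild events. The URL below is an assumption, and this sketch assumes
# the base Rebuilder leaves input_file unset.
def example_event_driven_rebuilder(conf, logger):
    rebuilder = BlobRebuilder(conf, logger, volume=None,
                              beanstalkd_addr='beanstalk://127.0.0.1:11300')
    # Reserves jobs from the tube forever, yielding one
    # [cid, content_id, chunk_id_or_pos, None] entry per missing chunk.
    for chunk in rebuilder._fetch_chunks():
        print(chunk)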
class BlobRebuilderWorker(object):
    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(conf.get('dry_run', False))
        self.report_interval = int_value(conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(conf.get('chunks_per_second'),
                                               30)
        self.max_bytes_per_second = int_value(conf.get('bytes_per_second'),
                                              10000000)
        self.rdir_fetch_limit = int_value(conf.get('rdir_fetch_limit'), 100)
        self.allow_same_rawx = true_value(conf.get('allow_same_rawx'))
        self.rdir_client = RdirClient(conf)
        self.content_factory = ContentFactory(conf)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def rebuilder_pass(self):
        start_time = report_time = time.time()

        total_errors = 0
        rebuilder_time = 0

        chunks = self.rdir_client.chunk_fetch(self.volume,
                                              limit=self.rdir_fetch_limit,
                                              rebuild=True)
        for container_id, content_id, chunk_id, data in chunks:
            loop_time = time.time()

            if self.dry_run:
                self.dryrun_chunk_rebuild(container_id, content_id, chunk_id)
            else:
                self.safe_chunk_rebuild(container_id, content_id, chunk_id)

            self.chunks_run_time = ratelimit(self.chunks_run_time,
                                             self.max_chunks_per_second)
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    'RUN %(volume)s '
                    'started=%(start_time)s '
                    'passes=%(passes)d '
                    'errors=%(errors)d '
                    'chunks=%(nb_chunks)d %(c_rate).2f/s '
                    'bytes=%(nb_bytes)d %(b_rate).2fB/s '
                    'elapsed=%(total).2f '
                    '(rebuilder: %(rebuilder_rate).2f%%)' % {
                        'volume': self.volume,
                        'start_time': datetime.fromtimestamp(
                            int(report_time)).isoformat(),
                        'passes': self.passes,
                        'errors': self.errors,
                        'nb_chunks': self.total_chunks_processed,
                        'nb_bytes': self.total_bytes_processed,
                        'c_rate': self.passes / (now - report_time),
                        'b_rate': self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time': rebuilder_time,
                        'rebuilder_rate':
                            100.0 * rebuilder_time / float(now - start_time)
                    })
                report_time = now
                total_errors += self.errors
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        end_time = time.time()
        elapsed = (end_time - start_time) or 0.000001
        self.logger.info(
            'DONE %(volume)s '
            'started=%(start_time)s '
            'ended=%(end_time)s '
            'elapsed=%(elapsed).02f '
            'errors=%(errors)d '
            'chunks=%(nb_chunks)d %(c_rate).2f/s '
            'bytes=%(nb_bytes)d %(b_rate).2fB/s '
            'rebuilder_time=%(rebuilder_time).2f '
            '(rebuilder: %(rebuilder_rate).2f%%)' % {
                'volume': self.volume,
                'start_time': datetime.fromtimestamp(
                    int(start_time)).isoformat(),
                'end_time': datetime.fromtimestamp(
                    int(end_time)).isoformat(),
                'elapsed': elapsed,
                'errors': total_errors + self.errors,
                'nb_chunks': self.total_chunks_processed,
                'nb_bytes': self.total_bytes_processed,
                'c_rate': self.total_chunks_processed / elapsed,
                'b_rate': self.total_bytes_processed / elapsed,
                'rebuilder_time': rebuilder_time,
                'rebuilder_rate': 100.0 * rebuilder_time / float(elapsed)
            })

    def dryrun_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info(
            "[dryrun] Rebuilding "
            "container %s, content %s, chunk %s",
            container_id, content_id, chunk_id)
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id):
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id, e)
        self.passes += 1

    def chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id)
        try:
            content = self.content_factory.get(container_id, content_id)
        except ContentNotFound:
            raise OrphanChunk('Content not found')
        chunk = content.chunks.filter(id=chunk_id).one()
        if chunk is None:
            raise OrphanChunk("Chunk not found in content")
        chunk_size = chunk.size
        content.rebuild_chunk(chunk_id, allow_same_rawx=self.allow_same_rawx)
        self.rdir_client.chunk_delete(self.volume, container_id, content_id,
                                      chunk_id)
        self.bytes_processed += chunk_size
        self.total_bytes_processed += chunk_size
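
# Configuration sketch: compared to the earlier revision, this worker
# honours 'allow_same_rawx' (permit rebuilding a chunk onto the rawx service
# it came from) and de-references the rebuilt chunk id from rdir instead of
# re-pushing it. The values below are illustrative only:
example_conf = {
    'allow_same_rawx': 'yes',    # parsed by true_value()
    'chunks_per_second': '10',   # throttle the pass via ratelimit()
    'rdir_fetch_limit': '200',   # chunks fetched per rdir request
}
# worker = BlobRebuilderWorker(example_conf, logger=None,
#                              volume='127.0.0.1:6201')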
class BlobRebuilder(Rebuilder):
    def __init__(self, conf, logger, volume, try_chunk_delete=False,
                 beanstalkd_addr=None, **kwargs):
        super(BlobRebuilder, self).__init__(conf, logger, volume, **kwargs)
        # rdir
        self.rdir_client = RdirClient(conf, logger=self.logger)
        self.rdir_fetch_limit = int_value(conf.get('rdir_fetch_limit'), 100)
        # rawx
        self.try_chunk_delete = try_chunk_delete
        # beanstalk
        if beanstalkd_addr:
            self.beanstalkd_listener = BeanstalkdListener(
                beanstalkd_addr,
                conf.get('beanstalkd_tube', DEFAULT_REBUILDER_TUBE),
                self.logger, **kwargs)
        else:
            self.beanstalkd_listener = None
        # counters
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_expected_chunks = None
        # distributed
        self.distributed = False

    def _create_worker(self, **kwargs):
        return BlobRebuilderWorker(
            self, try_chunk_delete=self.try_chunk_delete, **kwargs)

    def _fill_queue(self, queue, **kwargs):
        chunks = self._fetch_chunks(**kwargs)
        for chunk in chunks:
            queue.put(chunk)

    def _item_to_string(self, chunk, **kwargs):
        cid, content_id, chunk_id_or_pos, _ = chunk
        return 'chunk %s|%s|%s' % (cid, content_id, chunk_id_or_pos)

    def _get_report(self, status, end_time, counters, **kwargs):
        chunks_processed, bytes_processed, errors, total_chunks_processed, \
            total_bytes_processed, total_errors = counters
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        report = (
            '%(status)s volume=%(volume)s '
            'last_report=%(last_report)s %(time_since_last_report).2fs '
            'chunks=%(chunks)d %(chunks_rate).2f/s '
            'bytes=%(bytes)d %(bytes_rate).2fB/s '
            'errors=%(errors)d %(errors_rate).2f%% '
            'start_time=%(start_time)s %(total_time).2fs '
            'total_chunks=%(total_chunks)d %(total_chunks_rate).2f/s '
            'total_bytes=%(total_bytes)d %(total_bytes_rate).2fB/s '
            'total_errors=%(total_errors)d %(total_errors_rate).2f%%' % {
                'status': status,
                'volume': self.volume,
                'last_report': datetime.fromtimestamp(
                    int(self.last_report)).isoformat(),
                'time_since_last_report': time_since_last_report,
                'chunks': chunks_processed,
                'chunks_rate': chunks_processed / time_since_last_report,
                'bytes': bytes_processed,
                'bytes_rate': bytes_processed / time_since_last_report,
                'errors': errors,
                'errors_rate': 100 * errors / float(chunks_processed or 1),
                'start_time': datetime.fromtimestamp(
                    int(self.start_time)).isoformat(),
                'total_time': total_time,
                'total_chunks': total_chunks_processed,
                'total_chunks_rate': total_chunks_processed / total_time,
                'total_bytes': total_bytes_processed,
                'total_bytes_rate': total_bytes_processed / total_time,
                'total_errors': total_errors,
                'total_errors_rate':
                    100 * total_errors / float(total_chunks_processed or 1)
            })
        if self.total_expected_chunks is not None:
            progress = 100 * total_chunks_processed / \
                float(self.total_expected_chunks or 1)
            report += ' progress=%d/%d %.2f%%' % \
                (total_chunks_processed, self.total_expected_chunks, progress)
        return report

    def _update_processed_without_lock(self, bytes_processed, error=None,
                                       **kwargs):
        super(BlobRebuilder, self)._update_processed_without_lock(
            None, error=error, **kwargs)
        if bytes_processed is not None:
            self.bytes_processed += bytes_processed

    def _update_totals_without_lock(self, **kwargs):
        chunks_processed, errors, total_chunks_processed, total_errors = \
            super(BlobRebuilder, self)._update_totals_without_lock(**kwargs)
        bytes_processed = self.bytes_processed
        self.bytes_processed = 0
        self.total_bytes_processed += bytes_processed
        return chunks_processed, bytes_processed, errors, \
            total_chunks_processed, self.total_bytes_processed, total_errors

    def _rebuilder_pass(self, **kwargs):
        return super(BlobRebuilder, self).rebuilder_pass(**kwargs)

    def rebuilder_pass(self, **kwargs):
        success = False
        if self.volume:
            self.rdir_client.admin_lock(self.volume,
                                        "rebuilder on %s" % gethostname())
            info = self.rdir_client.status(self.volume)
            self.total_expected_chunks = info.get('chunk', dict()).get(
                'to_rebuild', None)
        try:
            success = self._rebuilder_pass(**kwargs)
        finally:
            if self.volume:
                self.rdir_client.admin_unlock(self.volume)
        return success

    def _event_from_broken_chunk(self, chunk, reply, **kwargs):
        cid, content_id, chunk_id_or_pos, _ = chunk
        event = {}
        event['when'] = time.time()
        event['event'] = 'storage.content.broken'
        event['data'] = {'missing_chunks': [chunk_id_or_pos]}
        event['url'] = {'ns': self.namespace,
                        'id': cid,
                        'content': content_id}
        event['reply'] = reply
        return json.dumps(event)

    def _chunks_from_event(self, job_id, data, **kwargs):
        decoded = json.loads(data)
        container_id = decoded['url']['id']
        content_id = decoded['url']['content']
        more = None
        reply = decoded.get('reply', None)
        if reply:
            more = {'reply': reply}
        for chunk_id_or_pos in decoded['data']['missing_chunks']:
            yield [container_id, content_id, str(chunk_id_or_pos), more]

    def _fetch_events_from_beanstalk(self, **kwargs):
        return self.beanstalkd_listener.fetch_events(
            self._chunks_from_event, **kwargs)

    def _fetch_chunks_from_file(self, **kwargs):
        with open(self.input_file, 'r') as ifile:
            for line in ifile:
                stripped = line.strip()
                if stripped and not stripped.startswith('#'):
                    yield stripped.split('|', 3)[:3] + [None]

    def _fetch_chunks(self, **kwargs):
        if self.input_file:
            return self._fetch_chunks_from_file(**kwargs)
        if self.beanstalkd_listener and not self.distributed:
            return self._fetch_events_from_beanstalk(**kwargs)
        if self.volume:
            return self.rdir_client.chunk_fetch(
                self.volume, limit=self.rdir_fetch_limit, rebuild=True,
                **kwargs)
        raise ConfigurationException('No source to fetch chunks from')
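
# Round-trip sketch (hypothetical ids): _event_from_broken_chunk() and
# _chunks_from_event() are inverses apart from the 'reply' routing block,
# which comes back wrapped in the trailing 'more' slot of each chunk entry.
def example_broken_chunk_event(rebuilder):
    chunk = ['CID' * 16, 'CONTENT0123', '2.1', None]
    reply = {'addr': 'beanstalk://127.0.0.1:11300', 'tube': 'oio-rebuilt'}
    data = rebuilder._event_from_broken_chunk(chunk, reply)
    chunks = list(rebuilder._chunks_from_event(None, data))
    assert chunks == [['CID' * 16, 'CONTENT0123', '2.1', {'reply': reply}]]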
class BlobRebuilderWorker(object):
    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(
            conf.get('dry_run', False))
        self.report_interval = int_value(
            conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(
            conf.get('chunks_per_second'), 30)
        self.max_bytes_per_second = int_value(
            conf.get('bytes_per_second'), 10000000)
        self.rdir_fetch_limit = int_value(
            conf.get('rdir_fetch_limit'), 100)
        self.blob_client = BlobClient()
        self.container_client = ContainerClient(conf)
        self.rdir_client = RdirClient(conf)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def rebuilder_pass(self):
        start_time = report_time = time.time()

        total_errors = 0
        rebuilder_time = 0

        chunks = self.rdir_client.chunk_fetch(self.volume,
                                              limit=self.rdir_fetch_limit,
                                              rebuild=True)
        for container_id, content_id, chunk_id, data in chunks:
            loop_time = time.time()

            if self.dry_run:
                self.dryrun_chunk_rebuild(container_id, content_id, chunk_id)
            else:
                self.safe_chunk_rebuild(container_id, content_id, chunk_id)

            self.chunks_run_time = ratelimit(
                self.chunks_run_time,
                self.max_chunks_per_second
            )
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    '%(start_time)s '
                    '%(passes)d '
                    '%(errors)d '
                    '%(c_rate).2f '
                    '%(b_rate).2f '
                    '%(total).2f '
                    '%(rebuilder_time).2f '
                    '%(rebuilder_rate).2f' % {
                        'start_time': time.ctime(report_time),
                        'passes': self.passes,
                        'errors': self.errors,
                        'c_rate': self.passes / (now - report_time),
                        'b_rate': self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time': rebuilder_time,
                        'rebuilder_rate': rebuilder_time / (now - start_time)
                    }
                )
                report_time = now
                total_errors += self.errors
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        elapsed = (time.time() - start_time) or 0.000001
        self.logger.info(
            '%(elapsed).02f '
            '%(errors)d '
            '%(chunk_rate).2f '
            '%(bytes_rate).2f '
            '%(rebuilder_time).2f '
            '%(rebuilder_rate).2f' % {
                'elapsed': elapsed,
                'errors': total_errors + self.errors,
                'chunk_rate': self.total_chunks_processed / elapsed,
                'bytes_rate': self.total_bytes_processed / elapsed,
                'rebuilder_time': rebuilder_time,
                'rebuilder_rate': rebuilder_time / elapsed
            }
        )

    def dryrun_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info("[dryrun] Rebuilding "
                         "container %s, content %s, chunk %s",
                         container_id, content_id, chunk_id)
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id)
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id, e)
        self.passes += 1

    def _meta2_get_chunks_at_pos(self, container_id, content_id, chunk_id):
        current_chunk_url = 'http://%s/%s' % (self.volume, chunk_id)

        try:
            data = self.container_client.content_show(
                cid=container_id, content=content_id)
        except exc.NotFound:
            raise exc.OrphanChunk('Content not found')

        current_chunk = None
        for c in data:
            if c['url'] == current_chunk_url:
                current_chunk = c
                break
        if not current_chunk:
            raise exc.OrphanChunk('Chunk not found in content')

        duplicate_chunks = []
        for c in data:
            if c['pos'] == current_chunk['pos'] \
                    and c['url'] != current_chunk['url']:
                duplicate_chunks.append(c)
        if not duplicate_chunks:
            raise exc.UnrecoverableContent('No copy of missing chunk')

        return current_chunk, duplicate_chunks

    def _meta2_get_spare_chunk(self, container_id, content_id, notin,
                               broken):
        spare_data = {'notin': notin, 'broken': [broken], 'size': 0}
        try:
            spare_resp = self.container_client.content_spare(
                cid=container_id, content=content_id, data=spare_data)
        except ClientException as e:
            raise exc.SpareChunkException('No spare chunk (%s)' % e.message)
        return spare_resp['chunks'][0]

    def _meta2_replace_chunk(self, container_id, content_id,
                             current_chunk, new_chunk):
        old = [{'type': 'chunk',
                'id': current_chunk['url'],
                'hash': current_chunk['hash'],
                'size': current_chunk['size'],
                'pos': current_chunk['pos'],
                'content': content_id}]
        new = [{'type': 'chunk',
                'id': new_chunk['id'],
                'hash': current_chunk['hash'],
                'size': current_chunk['size'],
                'pos': current_chunk['pos'],
                'content': content_id}]
        update_data = {'old': old, 'new': new}
        self.container_client.container_raw_update(
            cid=container_id, data=update_data)

    # TODO rain support
    def chunk_rebuild(self, container_id, content_id, chunk_id):
        current_chunk, duplicate_chunks = self._meta2_get_chunks_at_pos(
            container_id, content_id, chunk_id)

        spare_chunk = self._meta2_get_spare_chunk(
            container_id, content_id, duplicate_chunks, current_chunk)

        uploaded = False
        for src in duplicate_chunks:
            try:
                self.blob_client.chunk_copy(src['url'], spare_chunk['id'])
                self.logger.debug('copy chunk from %s to %s',
                                  src['url'], spare_chunk['id'])
                uploaded = True
                break
            except Exception as e:
                self.logger.debug('Failed to copy chunk from %s to %s: %s',
                                  src['url'], spare_chunk['id'], type(e))
        if not uploaded:
            raise exc.UnrecoverableContent('No copy available '
                                           'of missing chunk')

        self._meta2_replace_chunk(container_id, content_id,
                                  current_chunk, spare_chunk)
        self.rdir_client.chunk_push(self.volume, container_id, content_id,
                                    chunk_id, rtime=int(time.time()))
        self.bytes_processed += current_chunk['size']
        self.total_bytes_processed += current_chunk['size']
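
# Flow sketch (hypothetical ids): this early, duplication-only rebuild (no
# rain/EC yet, per the TODO above) is a four-step exchange with meta2, rawx
# and rdir:
def example_meta2_rebuild(conf, logger):
    worker = BlobRebuilderWorker(conf, logger, volume='127.0.0.1:6201')
    # 1. content_show   -> locate the broken chunk and same-position copies
    # 2. content_spare  -> ask meta2 for a spare location, excluding them
    # 3. chunk_copy     -> re-upload the data from a healthy duplicate
    # 4. raw_update + chunk_push -> swap the chunk in meta2, re-register
    #    it in rdir
    worker.chunk_rebuild('CID' * 16, 'CONTENT0123', 'CHNK' * 16)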