def _convert_and_check(self, chunk_volume, chunk_path, chunk_id_info,
                       expected_raw_meta=None, expected_errors=0):
    conf = self.conf
    conf['volume'] = self.rawx_volumes[chunk_volume]
    converter = BlobConverter(conf, logger=self.logger)
    converter.safe_convert_chunk(chunk_path)
    self.assertEqual(1, converter.total_chunks_processed)
    self.assertEqual(1, converter.passes)
    self.assertEqual(expected_errors, converter.errors)

    checker = Checker(self.ns)
    for chunk_id, info in chunk_id_info.items():
        account, container, path, version, content_id = info
        fullpath = encode_fullpath(account, container, path, version,
                                   content_id)
        cid = cid_from_name(account, container)
        meta, raw_meta = read_chunk_metadata(chunk_path, chunk_id)

        self.assertEqual(meta.get('chunk_id'), chunk_id)
        self.assertEqual(meta.get('container_id'), cid)
        self.assertEqual(meta.get('content_path'), path)
        self.assertEqual(meta.get('content_version'), version)
        self.assertEqual(meta.get('content_id'), content_id)
        self.assertEqual(meta.get('full_path'), fullpath)

        checker.check(Target(
            account, container=container, obj=path,
            chunk='http://' + converter.volume_id + '/' + chunk_id))
        for _ in checker.run():
            pass
        self.assertTrue(checker.report())

        if expected_raw_meta:
            self.assertDictEqual(expected_raw_meta, raw_meta)
            continue

        # After a successful conversion, none of the legacy attributes
        # should remain, only the fullpath attribute and the version.
        self.assertNotIn(CHUNK_XATTR_KEYS['chunk_id'], raw_meta)
        self.assertNotIn(CHUNK_XATTR_KEYS['container_id'], raw_meta)
        self.assertNotIn(CHUNK_XATTR_KEYS['content_path'], raw_meta)
        self.assertNotIn(CHUNK_XATTR_KEYS['content_version'], raw_meta)
        self.assertNotIn(CHUNK_XATTR_KEYS['content_id'], raw_meta)
        self.assertIn(CHUNK_XATTR_CONTENT_FULLPATH_PREFIX + chunk_id,
                      raw_meta)
        for key in raw_meta.keys():
            if key.startswith('oio:'):
                self.fail('Old-style attribute %s still exists' % key)
        self.assertEqual(raw_meta[CHUNK_XATTR_KEYS['oio_version']],
                         OIO_VERSION)
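# Hedged usage sketch of the helper above: all literal values are
# illustrative, not taken from a real test. 'chunk_id_info' maps each
# chunk ID to the (account, container, path, version, content_id) tuple
# unpacked inside the loop, and the first argument is a key into
# self.rawx_volumes.
def test_convert_example(self):  # hypothetical test name
    chunk_id = '0123456789ABCDEF' * 4  # 64 hex digits (STRLEN_CHUNKID)
    self._convert_and_check(
        'rawx-1',                       # key into self.rawx_volumes
        '/mnt/rawx-1/012/' + chunk_id,  # on-disk path of the chunk file
        {chunk_id: ('myaccount', 'mycontainer', 'obj',
                    1552493427000000, 'FEDCBA9876543210')},
        expected_errors=0)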
def test_recover_missing_fullpath_not_indexed(self):
    """
    Test what happens when the BlobConverter encounters a chunk with
    neither a fullpath extended attribute nor any of the legacy
    attributes, and the chunk is not indexed in rdir.
    """
    victim = random.choice(self.chunks)
    path = self._chunk_path(victim)
    remove_fullpath_xattr(path)
    self._deindex_chunk(victim)
    conf = dict(self.conf)
    conf['volume'] = self.rawx_volumes[self._chunk_volume_id(victim)]
    converter = BlobConverter(conf)
    self.assertRaises(KeyError, converter.recover_chunk_fullpath, path)
def test_recover_missing_fullpath_orphan_chunk(self):
    """
    Test what happens when the BlobConverter encounters a chunk with
    neither a fullpath extended attribute nor any of the legacy
    attributes, and the chunk does not appear in the object
    description.
    """
    victim = random.choice(self.chunks)
    path = self._chunk_path(victim)
    remove_fullpath_xattr(path)
    cbean = {
        'content': self.content_id,
        'hash': victim['hash'],
        'id': victim['url'],
        'size': victim['size'],
        'pos': victim['pos'],
        'type': 'chunk'
    }
    self.api.container.container_raw_delete(
        self.account, self.container, data=[cbean])
    conf = dict(self.conf)
    conf['volume'] = self.rawx_volumes[self._chunk_volume_id(victim)]
    converter = BlobConverter(conf)
    self.assertRaises(OrphanChunk, converter.recover_chunk_fullpath, path)
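# Hedged recap of the two failure modes exercised by the tests above,
# written as a caller of recover_chunk_fullpath() might handle them.
# 'converter' and 'path' are as in the tests; OrphanChunk is assumed to
# come from the project's exception module.
try:
    converter.recover_chunk_fullpath(path)
except KeyError:
    # The chunk is no longer indexed in rdir: no fullpath can be rebuilt.
    pass
except OrphanChunk:
    # The chunk is not referenced in any object description anymore.
    pass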
class BlobIndexer(Daemon):

    def __init__(self, conf, **kwargs):
        super(BlobIndexer, self).__init__(conf)
        self.logger = get_logger(conf)
        volume = conf.get('volume')
        if not volume:
            raise exc.ConfigurationException(
                'No volume specified for indexer')
        self.volume = volume
        self.passes = 0
        self.errors = 0
        self.successes = 0
        self.last_reported = 0
        self.total_since_last_reported = 0
        self.chunks_run_time = 0
        self.interval = int_value(conf.get('interval'), 300)
        self.report_interval = int_value(conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(
            conf.get('chunks_per_second'), 30)
        pm = get_pool_manager(pool_connections=10)
        self.index_client = RdirClient(conf, logger=self.logger,
                                       pool_manager=pm)
        self.namespace, self.volume_id = check_volume(self.volume)
        self.convert_chunks = true_value(conf.get('convert_chunks'))
        if self.convert_chunks:
            converter_conf = self.conf.copy()
            converter_conf['no_backup'] = True
            self.converter = BlobConverter(converter_conf,
                                           logger=self.logger,
                                           pool_manager=pm)
        else:
            self.converter = None

    def safe_recover_fullpath(self, path):
        try:
            return self.converter.recover_chunk_fullpath(path)
        except Exception as err:
            self.logger.error('Could not recover fullpath xattr of %s: %s',
                              path, err)
            return False

    def safe_update_index(self, path):
        chunk_id = path.rsplit('/', 1)[-1]
        if len(chunk_id) != STRLEN_CHUNKID:
            if chunk_id.endswith(CHUNK_SUFFIX_PENDING):
                self.logger.info('Skipping pending chunk %s', path)
            else:
                self.logger.warn('Not a chunk %s', path)
            return
        for char in chunk_id:
            if char not in hexdigits:
                self.logger.warn('Not a chunk %s', path)
                return
        try:
            self.update_index(path, chunk_id)
            self.successes += 1
            self.logger.debug('Updated %s', path)
        except exc.OioNetworkException as err:
            self.errors += 1
            self.logger.warn('Error while updating %s: %s', path, err)
        except exc.VolumeException as err:
            self.errors += 1
            self.logger.error('Cannot index %s: %s', path, err)
            # All chunks of this volume are indexed in the same service,
            # no need to try another chunk, it will generate the same
            # error. Let the upper level retry later.
            raise
        except (exc.ChunkException, exc.MissingAttribute) as err:
            if (self.convert_chunks and self.converter
                    and self.converter.is_fullpath_error(err)):
                self.logger.warn(
                    'Could not update %s: %s, will try to recover',
                    path, err)
                if self.safe_recover_fullpath(path):
                    self.successes += 1
                    self.logger.info('Fullpath xattr of %s was recovered',
                                     path)
                else:
                    self.errors += 1
                    # Logging already done by safe_recover_fullpath
            else:
                self.errors += 1
                self.logger.error('Error while updating %s: %s', path, err)
        except Exception as err:
            # We cannot compare errno in the 'except' line.
            # pylint: disable=no-member
            if isinstance(err, IOError) and err.errno == errno.ENOENT:
                self.logger.debug('Chunk %s disappeared before indexing',
                                  path)
                # Neither an error nor a success, do not touch counters.
            else:
                self.errors += 1
                self.logger.exception('Error while updating %s', path)
        self.total_since_last_reported += 1

    def report(self, tag, start_time):
        total = self.errors + self.successes
        now = time.time()
        elapsed = (now - start_time) or 0.000001
        # Guard against a division by zero when two reports are emitted
        # within the clock resolution.
        since_last_report = (now - self.last_reported) or 0.000001
        self.logger.info(
            '%(tag)s=%(current_time)s '
            'elapsed=%(elapsed).02f '
            'pass=%(pass)d '
            'errors=%(errors)d '
            'chunks=%(nb_chunks)d %(c_rate).2f/s' % {
                'tag': tag,
                'current_time': datetime.fromtimestamp(
                    int(now)).isoformat(),
                'pass': self.passes,
                'errors': self.errors,
                'nb_chunks': total,
                'c_rate': self.total_since_last_reported /
                    since_last_report,
                'elapsed': elapsed
            })
        self.last_reported = now
        self.total_since_last_reported = 0

    def index_pass(self):
        start_time = time.time()
        self.last_reported = start_time
        self.errors = 0
        self.successes = 0
        paths = paths_gen(self.volume)
        self.report('started', start_time)
        for path in paths:
            self.safe_update_index(path)
            self.chunks_run_time = ratelimit(
                self.chunks_run_time,
                self.max_chunks_per_second)
            now = time.time()
            if now - self.last_reported >= self.report_interval:
                self.report('running', start_time)
        self.report('ended', start_time)

    def update_index(self, path, chunk_id):
        with open(path) as file_:
            try:
                meta = None
                if self.convert_chunks and self.converter:
                    # Convert legacy extended attributes on the fly.
                    _, meta = self.converter.convert_chunk(file_, chunk_id)
                if meta is None:
                    meta, _ = read_chunk_metadata(file_, chunk_id)
            except exc.MissingAttribute as err:
                raise exc.FaultyChunk(err)
        data = {'mtime': int(time.time())}
        headers = {REQID_HEADER: request_id('blob-indexer-')}
        self.index_client.chunk_push(self.volume_id,
                                     meta['container_id'],
                                     meta['content_id'],
                                     meta['chunk_id'],
                                     headers=headers, **data)

    def run(self, *args, **kwargs):
        # Spread the start of the first pass so all indexers of the
        # platform do not start at the same time.
        time.sleep(random() * self.interval)
        while True:
            pre = time.time()
            try:
                self.index_pass()
            except exc.VolumeException as err:
                self.logger.error(
                    'Cannot index chunks, will retry later: %s', err)
            except Exception as err:
                self.logger.exception('Error during indexing: %s', err)
            else:
                self.passes += 1
            elapsed = (time.time() - pre) or 0.000001
            if elapsed < self.interval:
                time.sleep(self.interval - elapsed)
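# A minimal sketch of running the indexer standalone. The configuration
# keys are the ones read by __init__ above; the values and the module
# path are assumptions, not taken from the project's documentation.
from oio.blob.indexer import BlobIndexer  # assumed module path

conf = {
    'namespace': 'OPENIO',                            # assumed key
    'volume': '/var/lib/oio/sds/OPENIO/rawx-1/data',  # required
    'interval': '300',            # seconds between two passes
    'report_interval': '3600',    # seconds between two progress reports
    'chunks_per_second': '30',    # rate limit enforced by ratelimit()
    'convert_chunks': 'true',     # also convert legacy xattr on the fly
}
indexer = BlobIndexer(conf)
indexer.run()  # loops forever, one index_pass() every 'interval' seconds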