Example #1
class BlobRebuilderWorker(object):
    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(
            conf.get('dry_run', False))
        self.report_interval = int_value(
            conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(
            conf.get('chunks_per_second'), 30)
        self.max_bytes_per_second = int_value(
            conf.get('bytes_per_second'), 10000000)
        self.rdir_fetch_limit = int_value(
            conf.get('rdir_fetch_limit'), 100)
        self.rdir_client = RdirClient(conf)
        self.content_factory = ContentFactory(conf)

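    # Hold the rdir admin lock for the whole pass so that only one
    # rebuilder works on this volume at a time; the lock is always
    # released, even when the pass fails.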
    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

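    # Fetch the chunks flagged for rebuild from rdir, rebuild them one by
    # one under rate limiting, and log a progress report every
    # report_interval seconds.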
    def rebuilder_pass(self):
        start_time = report_time = time.time()

        total_errors = 0
        rebuilder_time = 0

        chunks = self.rdir_client.chunk_fetch(self.volume,
                                              limit=self.rdir_fetch_limit,
                                              rebuild=True)
        for container_id, content_id, chunk_id, data in chunks:
            loop_time = time.time()

            if self.dry_run:
                self.dryrun_chunk_rebuild(container_id, content_id, chunk_id)
            else:
                self.safe_chunk_rebuild(container_id, content_id, chunk_id)

            self.chunks_run_time = ratelimit(
                self.chunks_run_time,
                self.max_chunks_per_second
            )
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    '%(start_time)s '
                    '%(passes)d '
                    '%(errors)d '
                    '%(c_rate).2f '
                    '%(b_rate).2f '
                    '%(total).2f '
                    '%(rebuilder_time).2f '
                    '%(rebuilder_rate).2f' % {
                        'start_time': time.ctime(report_time),
                        'passes': self.passes,
                        'errors': self.errors,
                        'c_rate': self.passes / (now - report_time),
                        'b_rate': self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time': rebuilder_time,
                        'rebuilder_rate': rebuilder_time / (now - start_time)
                    }
                )
                report_time = now
                total_errors += self.errors
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        elapsed = (time.time() - start_time) or 0.000001
        self.logger.info(
            '%(elapsed).02f '
            '%(errors)d '
            '%(chunk_rate).2f '
            '%(bytes_rate).2f '
            '%(rebuilder_time).2f '
            '%(rebuilder_rate).2f' % {
                'elapsed': elapsed,
                'errors': total_errors + self.errors,
                'chunk_rate': self.total_chunks_processed / elapsed,
                'bytes_rate': self.total_bytes_processed / elapsed,
                'rebuilder_time': rebuilder_time,
                'rebuilder_rate': rebuilder_time / elapsed
            }
        )

    def dryrun_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info("[dryrun] Rebuilding "
                         "container %s, content %s, chunk %s",
                         container_id, content_id, chunk_id)
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id):
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id, e)

        self.passes += 1

    def chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id)

        try:
            content = self.content_factory.get(container_id, content_id)
        except ContentNotFound:
            raise OrphanChunk('Content not found')

        chunk = content.chunks.filter(id=chunk_id).one()
        if chunk is None:
            raise OrphanChunk("Chunk not found in content")
        chunk_size = chunk.size

        content.rebuild_chunk(chunk_id)

        self.rdir_client.chunk_push(self.volume, container_id, content_id,
                                    chunk_id, rtime=int(time.time()))

        self.bytes_processed += chunk_size
        self.total_bytes_processed += chunk_size
Example #2
class BlobRebuilder(Tool):
    """
    Rebuild chunks.
    """

    DEFAULT_BEANSTALKD_WORKER_TUBE = 'oio-rebuild'
    DEFAULT_DISTRIBUTED_BEANSTALKD_WORKER_TUBE = 'oio-rebuild'
    DEFAULT_RDIR_FETCH_LIMIT = 100
    DEFAULT_RDIR_TIMEOUT = 60.0
    DEFAULT_ALLOW_FROZEN_CT = False
    DEFAULT_ALLOW_SAME_RAWX = True
    DEFAULT_TRY_CHUNK_DELETE = False
    DEFAULT_DRY_RUN = False

    def __init__(self, conf, input_file=None, service_id=None, **kwargs):
        super(BlobRebuilder, self).__init__(conf, **kwargs)

        # counters
        self.bytes_processed = 0
        self.total_bytes_processed = 0

        # input
        self.input_file = input_file
        self.rawx_id = service_id

        # rawx/rdir
        self.rdir_client = RdirClient(self.conf, logger=self.logger)
        self.rdir_fetch_limit = int_value(self.conf.get('rdir_fetch_limit'),
                                          self.DEFAULT_RDIR_FETCH_LIMIT)
        self.rdir_shuffle_chunks = true_value(conf.get('rdir_shuffle_chunks'))
        self.rdir_timeout = float_value(conf.get('rdir_timeout'),
                                        self.DEFAULT_RDIR_TIMEOUT)

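    # An "item" is a (namespace, container_id, content_id, chunk_id_or_pos)
    # tuple; the static methods below convert between items and the
    # beanstalkd task/result event payloads.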
    @staticmethod
    def items_from_task_event(task_event):
        namespace = task_event['url']['ns']
        container_id = task_event['url']['id']
        content_id = task_event['url']['content']
        for chunk_id_or_pos in task_event['data']['missing_chunks']:
            yield namespace, container_id, content_id, str(chunk_id_or_pos)

    @staticmethod
    def task_event_from_item(item):
        namespace, container_id, content_id, chunk_id_or_pos = item
        return {
            'when': time.time(),
            'event': EventTypes.CONTENT_BROKEN,
            'url': {
                'ns': namespace,
                'id': container_id,
                'content': content_id
            },
            'data': {
                'missing_chunks': [chunk_id_or_pos]
            }
        }

    @staticmethod
    def tasks_res_from_res_event(res_event):
        namespace = res_event['url']['ns']
        container_id = res_event['url']['id']
        content_id = res_event['url']['content']
        for chunk_rebuilt in res_event['data']['chunks_rebuilt']:
            yield (namespace, container_id, content_id,
                   str(chunk_rebuilt['chunk_id_or_pos'])), \
                chunk_rebuilt['bytes_processed'], chunk_rebuilt['error']

    @staticmethod
    def res_event_from_task_res(task_res):
        item, bytes_processed, error = task_res
        namespace, container_id, content_id, chunk_id_or_pos = item
        return {
            'when': time.time(),
            'event': EventTypes.CONTENT_REBUILT,
            'url': {
                'ns': namespace,
                'id': container_id,
                'content': content_id
            },
            'data': {
                'chunks_rebuilt': [{
                    'chunk_id_or_pos': chunk_id_or_pos,
                    'bytes_processed': bytes_processed,
                    'error': error
                }]
            }
        }

    @staticmethod
    def string_from_item(item):
        namespace, container_id, content_id, chunk_id_or_pos = item
        return '%s|%s|%s|%s' % (namespace, container_id, content_id,
                                chunk_id_or_pos)

    def _fetch_items_from_input_file(self):
        with open(self.input_file, 'r') as ifile:
            for line in ifile:
                stripped = line.strip()
                if not stripped or stripped.startswith('#'):
                    continue

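                # Expected line format: container_id|content_id|chunk_id_or_pos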
                container_id, content_id, chunk_id_or_pos = \
                    stripped.split('|', 3)[:3]
                yield self.namespace, container_id, content_id, \
                    chunk_id_or_pos

    def _fetch_items_from_rawx_id(self):
        lost_chunks = self.rdir_client.chunk_fetch(
            self.rawx_id,
            limit=self.rdir_fetch_limit,
            rebuild=True,
            shuffle=self.rdir_shuffle_chunks,
            timeout=self.rdir_timeout)
        for container_id, content_id, chunk_id, _ in lost_chunks:
            yield self.namespace, container_id, content_id, chunk_id

    def _fetch_items(self):
        if self.input_file:
            return self._fetch_items_from_input_file()
        if self.rawx_id:
            return self._fetch_items_from_rawx_id()

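        # Neither an input file nor a rawx service ID was given: return a
        # generator that yields nothing.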
        def _empty_generator():
            return
            yield  # pylint: disable=unreachable

        return _empty_generator()

    def update_counters(self, task_res):
        super(BlobRebuilder, self).update_counters(task_res)
        _, bytes_processed, _ = task_res
        if bytes_processed is not None:
            self.bytes_processed += bytes_processed

    def _update_total_counters(self):
        chunks_processed, total_chunks_processed, errors, total_errors = \
            super(BlobRebuilder, self)._update_total_counters()
        bytes_processed = self.bytes_processed
        self.bytes_processed = 0
        self.total_bytes_processed += bytes_processed
        return chunks_processed, total_chunks_processed, \
            bytes_processed, self.total_bytes_processed, \
            errors, total_errors

    def _get_report(self, status, end_time, counters):
        chunks_processed, total_chunks_processed, \
            bytes_processed, total_bytes_processed, \
            errors, total_errors = counters
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        report = (
            '%(status)s '
            'last_report=%(last_report)s %(time_since_last_report).2fs '
            'chunks=%(chunks)d %(chunks_rate).2f/s '
            'bytes=%(bytes)d %(bytes_rate).2fB/s '
            'errors=%(errors)d %(errors_rate).2f%% '
            'start_time=%(start_time)s %(total_time).2fs '
            'total_chunks=%(total_chunks)d %(total_chunks_rate).2f/s '
            'total_bytes=%(total_bytes)d %(total_bytes_rate).2fB/s '
            'total_errors=%(total_errors)d %(total_errors_rate).2f%%' % {
                'status':
                status,
                'last_report':
                datetime.fromtimestamp(int(self.last_report)).isoformat(),
                'time_since_last_report':
                time_since_last_report,
                'chunks':
                chunks_processed,
                'chunks_rate':
                chunks_processed / time_since_last_report,
                'bytes':
                bytes_processed,
                'bytes_rate':
                bytes_processed / time_since_last_report,
                'errors':
                errors,
                'errors_rate':
                100 * errors / float(chunks_processed or 1),
                'start_time':
                datetime.fromtimestamp(int(self.start_time)).isoformat(),
                'total_time':
                total_time,
                'total_chunks':
                total_chunks_processed,
                'total_chunks_rate':
                total_chunks_processed / total_time,
                'total_bytes':
                total_bytes_processed,
                'total_bytes_rate':
                total_bytes_processed / total_time,
                'total_errors':
                total_errors,
                'total_errors_rate':
                100 * total_errors / float(total_chunks_processed or 1)
            })
        if self.total_expected_items is not None:
            progress = 100 * total_chunks_processed / \
                float(self.total_expected_items or 1)
            report += ' progress=%d/%d %.2f%%' % \
                (total_chunks_processed, self.total_expected_items, progress)
        return report

    def create_worker(self, queue_workers, queue_reply):
        return BlobRebuilderWorker(self, queue_workers, queue_reply)

    def _load_total_expected_items(self):
        if self.rawx_id:
            try:
                info = self.rdir_client.status(self.rawx_id,
                                               read_timeout=self.rdir_timeout)
                self.total_expected_items = info.get('chunk', dict()).get(
                    'to_rebuild', None)
            except Exception as exc:
                self.logger.warn(
                    'Failed to fetch the total chunks to rebuild: %s', exc)

    def run(self):
        if self.rawx_id:
            self.rdir_client.admin_lock(self.rawx_id,
                                        "rebuilder on %s" % gethostname(),
                                        timeout=self.rdir_timeout)
        success = super(BlobRebuilder, self).run()
        if self.rawx_id:
            self.rdir_client.admin_unlock(self.rawx_id,
                                          timeout=self.rdir_timeout)
        return success
Example #3
class BlobRebuilderWorker(object):
    def __init__(self,
                 conf,
                 logger,
                 volume,
                 input_file=None,
                 try_chunk_delete=False,
                 beanstalkd_addr=None):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(conf.get('dry_run', False))
        self.report_interval = int_value(conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(conf.get('chunks_per_second'),
                                               30)
        self.max_bytes_per_second = int_value(conf.get('bytes_per_second'),
                                              10000000)
        self.rdir_fetch_limit = int_value(conf.get('rdir_fetch_limit'), 100)
        self.allow_same_rawx = true_value(conf.get('allow_same_rawx'))
        self.input_file = input_file
        self.rdir_client = RdirClient(conf, logger=self.logger)
        self.content_factory = ContentFactory(conf)
        self.try_chunk_delete = try_chunk_delete
        self.beanstalkd_addr = beanstalkd_addr
        self.beanstalkd_tube = conf.get('beanstalkd_tube', 'rebuild')
        self.beanstalk = None

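    # Yield [container_id, content_id, chunk_id_or_pos, None] items, the
    # same shape as the tuples returned by rdir_client.chunk_fetch().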
    def _fetch_chunks_from_event(self, job_id, data):
        env = json.loads(data)
        for chunk_pos in env['data']['missing_chunks']:
            yield [
                env['url']['id'], env['url']['content'],
                str(chunk_pos), None
            ]

    def _connect_to_beanstalk(self):
        self.beanstalk = Beanstalk.from_url(self.beanstalkd_addr)
        self.beanstalk.use(self.beanstalkd_tube)
        self.beanstalk.watch(self.beanstalkd_tube)

    def _handle_beanstalk_event(self, conn_error):
        try:
            job_id, data = self.beanstalk.reserve()
            if conn_error:
                self.logger.warn("beanstalk reconnected")
        except ConnectionError:
            if not conn_error:
                self.logger.warn("beanstalk connection error")
            raise
        try:
            for chunk in self._fetch_chunks_from_event(job_id, data):
                yield chunk
            self.beanstalk.delete(job_id)
        except Exception:
            self.logger.exception("handling event %s (bury)", job_id)
            self.beanstalk.bury(job_id)

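    # Consume rebuild events forever, reconnecting with a short pause
    # after any beanstalkd connection error.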
    def _fetch_chunks_from_beanstalk(self):
        conn_error = False
        while True:
            try:
                self._connect_to_beanstalk()
                for chunk in self._handle_beanstalk_event(conn_error):
                    conn_error = False
                    yield chunk
            except ConnectionError:
                conn_error = True
                time.sleep(1.0)

    def _fetch_chunks_from_file(self):
        with open(self.input_file, 'r') as ifile:
            for line in ifile:
                stripped = line.strip()
                if stripped and not stripped.startswith('#'):
                    yield stripped.split('|', 3)[:3] + [None]

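    # Pick the chunk source: an input file takes precedence, then a
    # beanstalkd queue, and finally this volume's rdir index.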
    def _fetch_chunks(self):
        if self.input_file:
            return self._fetch_chunks_from_file()
        elif self.beanstalkd_addr:
            return self._fetch_chunks_from_beanstalk()
        else:
            return self.rdir_client.chunk_fetch(self.volume,
                                                limit=self.rdir_fetch_limit,
                                                rebuild=True)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def rebuilder_pass(self):
        start_time = report_time = time.time()

        rebuilder_time = 0

        chunks = self._fetch_chunks()
        for cid, content_id, chunk_id_or_pos, _ in chunks:
            loop_time = time.time()
            if self.dry_run:
                self.dryrun_chunk_rebuild(cid, content_id, chunk_id_or_pos)
            else:
                self.safe_chunk_rebuild(cid, content_id, chunk_id_or_pos)

            self.chunks_run_time = ratelimit(self.chunks_run_time,
                                             self.max_chunks_per_second)
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    'RUN  %(volume)s '
                    'started=%(start_time)s '
                    'passes=%(passes)d '
                    'errors=%(errors)d '
                    'chunks=%(nb_chunks)d %(c_rate).2f/s '
                    'bytes=%(nb_bytes)d %(b_rate).2fB/s '
                    'elapsed=%(total).2f '
                    '(rebuilder: %(success_rate).2f%%)' % {
                        'volume':
                        self.volume,
                        'start_time':
                        datetime.fromtimestamp(int(report_time)).isoformat(),
                        'passes':
                        self.passes,
                        'errors':
                        self.errors,
                        'nb_chunks':
                        self.total_chunks_processed,
                        'nb_bytes':
                        self.total_bytes_processed,
                        'c_rate':
                        self.passes / (now - report_time),
                        'b_rate':
                        self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time':
                        rebuilder_time,
                        'success_rate':
                        100 * ((self.total_chunks_processed - self.errors) /
                               float(self.total_chunks_processed))
                    })
                report_time = now
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        end_time = time.time()
        elapsed = (end_time - start_time) or 0.000001
        self.logger.info(
            'DONE %(volume)s '
            'started=%(start_time)s '
            'ended=%(end_time)s '
            'passes=%(passes)d '
            'elapsed=%(elapsed).02f '
            'errors=%(errors)d '
            'chunks=%(nb_chunks)d %(c_rate).2f/s '
            'bytes=%(nb_bytes)d %(b_rate).2fB/s '
            'rebuilder_time=%(rebuilder_time).2f '
            '(rebuilder: %(success_rate).2f%%)' % {
                'volume':
                self.volume,
                'start_time':
                datetime.fromtimestamp(int(start_time)).isoformat(),
                'end_time':
                datetime.fromtimestamp(int(end_time)).isoformat(),
                'passes':
                self.passes,
                'elapsed':
                elapsed,
                'errors':
                self.errors,
                'nb_chunks':
                self.total_chunks_processed,
                'nb_bytes':
                self.total_bytes_processed,
                'c_rate':
                self.total_chunks_processed / elapsed,
                'b_rate':
                self.total_bytes_processed / elapsed,
                'rebuilder_time':
                rebuilder_time,
                'success_rate':
                100 * ((self.total_chunks_processed - self.errors) /
                       float(self.total_chunks_processed or 1))
            })

    def dryrun_chunk_rebuild(self, container_id, content_id, chunk_id_or_pos):
        self.logger.info(
            "[dryrun] Rebuilding "
            "container %s, content %s, chunk %s", container_id, content_id,
            chunk_id_or_pos)
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id_or_pos):
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id_or_pos)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id_or_pos, e)

        self.passes += 1

    def chunk_rebuild(self, container_id, content_id, chunk_id_or_pos):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id_or_pos)
        try:
            content = self.content_factory.get(container_id, content_id)
        except ContentNotFound:
            raise OrphanChunk('Content not found: possible orphan chunk')

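        # A value of 32 characters or more is assumed to be a chunk ID (or a
        # full chunk URL); anything shorter is a position such as '0' or '2.1'.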
        chunk_size = 0
        chunk_pos = None
        if len(chunk_id_or_pos) < 32:
            chunk_pos = chunk_id_or_pos
            chunk_id = None
            metapos = int(chunk_pos.split('.', 1)[0])
            chunk_size = content.chunks.filter(metapos=metapos).all()[0].size
        else:
            if '/' in chunk_id_or_pos:
                chunk_id = chunk_id_or_pos.rsplit('/', 1)[-1]
            else:
                chunk_id = chunk_id_or_pos

            chunk = content.chunks.filter(id=chunk_id).one()
            if chunk is None:
                raise OrphanChunk(("Chunk not found in content:"
                                   "possible orphan chunk"))
            elif self.volume and chunk.host != self.volume:
                raise ValueError("Chunk does not belong to this volume")
            chunk_size = chunk.size

        content.rebuild_chunk(chunk_id,
                              allow_same_rawx=self.allow_same_rawx,
                              chunk_pos=chunk_pos)

        # 'chunk' is only resolved when rebuilding by chunk ID, so only
        # attempt the delete in that case.
        if self.try_chunk_delete and chunk_id is not None:
            try:
                content.blob_client.chunk_delete(chunk.url)
                self.logger.info("Chunk %s deleted", chunk.url)
            except NotFound as exc:
                self.logger.debug("Chunk %s: %s", chunk.url, exc)

        # This call does not raise exception if chunk is not referenced
        if chunk_id is not None:
            self.rdir_client.chunk_delete(chunk.host, container_id, content_id,
                                          chunk_id)

        self.bytes_processed += chunk_size
        self.total_bytes_processed += chunk_size
Example #4
class BlobRebuilder(Rebuilder):
    def __init__(self,
                 conf,
                 logger,
                 volume,
                 try_chunk_delete=False,
                 beanstalkd_addr=None,
                 **kwargs):
        super(BlobRebuilder, self).__init__(conf, logger, **kwargs)
        self.volume = volume
        self.rdir_client = RdirClient(conf, logger=self.logger)
        self.try_chunk_delete = try_chunk_delete
        self.beanstalkd_addr = beanstalkd_addr
        self.beanstalkd_tube = conf.get('beanstalkd_tube',
                                        DEFAULT_REBUILDER_TUBE)
        self.beanstalk = None
        self.rdir_fetch_limit = int_value(conf.get('rdir_fetch_limit'), 100)

    def _fetch_chunks_from_event(self, job_id, data):
        env = json.loads(data)
        for chunk_pos in env['data']['missing_chunks']:
            yield [
                env['url']['id'], env['url']['content'],
                str(chunk_pos), None
            ]

    def _connect_to_beanstalk(self):
        self.logger.debug('Connecting to %s', self.beanstalkd_addr)
        self.beanstalk = Beanstalk.from_url(self.beanstalkd_addr)
        self.logger.debug('Using tube %s', self.beanstalkd_tube)
        self.beanstalk.use(self.beanstalkd_tube)
        self.beanstalk.watch(self.beanstalkd_tube)

    def _handle_beanstalk_event(self, conn_error):
        try:
            job_id, data = self.beanstalk.reserve()
            if conn_error:
                self.logger.warn("beanstalk reconnected")
        except ConnectionError:
            if not conn_error:
                self.logger.warn("beanstalk connection error")
            raise
        try:
            for chunk in self._fetch_chunks_from_event(job_id, data):
                yield chunk
            self.beanstalk.delete(job_id)
        except Exception:
            self.logger.exception("handling event %s (bury)", job_id)
            self.beanstalk.bury(job_id)

    def _fetch_chunks_from_beanstalk(self):
        conn_error = False
        while True:
            try:
                self._connect_to_beanstalk()
                for chunk in self._handle_beanstalk_event(conn_error):
                    conn_error = False
                    yield chunk
            except ConnectionError as exc:
                self.logger.warn('Disconnected: %s', exc)
                if 'Invalid URL' in str(exc):
                    raise
                conn_error = True
                time.sleep(1.0)

    def _fetch_chunks_from_file(self):
        with open(self.input_file, 'r') as ifile:
            for line in ifile:
                stripped = line.strip()
                if stripped and not stripped.startswith('#'):
                    yield stripped.split('|', 3)[:3] + [None]

    def _fetch_chunks(self):
        if self.input_file:
            return self._fetch_chunks_from_file()
        elif self.beanstalkd_addr:
            return self._fetch_chunks_from_beanstalk()
        else:
            return self.rdir_client.chunk_fetch(self.volume,
                                                limit=self.rdir_fetch_limit,
                                                rebuild=True)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def _create_worker(self, **kwargs):
        return BlobRebuilderWorker(self.conf, self.logger, self.volume,
                                   try_chunk_delete=self.try_chunk_delete)

    def _fill_queue(self, queue, **kwargs):
        chunks = self._fetch_chunks()
        for chunk in chunks:
            queue.put(chunk)

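    # The "info" aggregated across workers is the total number of bytes
    # processed.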
    def _init_info(self, **kwargs):
        return 0

    def _compute_info(self, worker, total_bytes_processed, **kwargs):
        total_bytes_processed += worker.total_bytes_processed
        return total_bytes_processed

    def _get_report(self, start_time, end_time, passes, errors, waiting_time,
                    rebuilder_time, elapsed, total_chunks_processed,
                    total_bytes_processed, **kwargs):
        return ('DONE %(volume)s '
                'started=%(start_time)s '
                'ended=%(end_time)s '
                'elapsed=%(elapsed).2f '
                'passes=%(passes)d '
                'errors=%(errors)d '
                'chunks=%(nb_chunks)d %(c_rate).2f/s '
                'bytes=%(nb_bytes)d %(b_rate).2fB/s '
                'waiting_time=%(waiting_time).2f '
                'rebuilder_time=%(rebuilder_time).2f '
                '(rebuilder: %(success_rate).2f%%)' % {
                    'volume':
                    self.volume,
                    'start_time':
                    datetime.fromtimestamp(int(start_time)).isoformat(),
                    'end_time':
                    datetime.fromtimestamp(int(end_time)).isoformat(),
                    'elapsed':
                    elapsed,
                    'passes':
                    passes,
                    'errors':
                    errors,
                    'nb_chunks':
                    total_chunks_processed,
                    'nb_bytes':
                    total_bytes_processed,
                    'c_rate':
                    total_chunks_processed / elapsed,
                    'b_rate':
                    total_bytes_processed / elapsed,
                    'rebuilder_time':
                    rebuilder_time,
                    'waiting_time':
                    waiting_time,
                    'success_rate':
                    100 * ((total_chunks_processed - errors) /
                           float(total_chunks_processed or 1))
                })
Example #5
class BlobRebuilderWorker(object):
    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(conf.get('dry_run', False))
        self.report_interval = int_value(conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(conf.get('chunks_per_second'),
                                               30)
        self.max_bytes_per_second = int_value(conf.get('bytes_per_second'),
                                              10000000)
        self.rdir_fetch_limit = int_value(conf.get('rdir_fetch_limit'), 100)
        self.allow_same_rawx = true_value(conf.get('allow_same_rawx'))
        self.rdir_client = RdirClient(conf)
        self.content_factory = ContentFactory(conf)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def rebuilder_pass(self):
        start_time = report_time = time.time()

        total_errors = 0
        rebuilder_time = 0

        chunks = self.rdir_client.chunk_fetch(self.volume,
                                              limit=self.rdir_fetch_limit,
                                              rebuild=True)
        for container_id, content_id, chunk_id, data in chunks:
            loop_time = time.time()

            if self.dry_run:
                self.dryrun_chunk_rebuild(container_id, content_id, chunk_id)
            else:
                self.safe_chunk_rebuild(container_id, content_id, chunk_id)

            self.chunks_run_time = ratelimit(self.chunks_run_time,
                                             self.max_chunks_per_second)
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    'RUN  %(volume)s '
                    'started=%(start_time)s '
                    'passes=%(passes)d '
                    'errors=%(errors)d '
                    'chunks=%(nb_chunks)d %(c_rate).2f/s '
                    'bytes=%(nb_bytes)d %(b_rate).2fB/s '
                    'elapsed=%(total).2f '
                    '(rebuilder: %(rebuilder_rate).2f%%)' % {
                        'volume':
                        self.volume,
                        'start_time':
                        datetime.fromtimestamp(int(report_time)).isoformat(),
                        'passes':
                        self.passes,
                        'errors':
                        self.errors,
                        'nb_chunks':
                        self.total_chunks_processed,
                        'nb_bytes':
                        self.total_bytes_processed,
                        'c_rate':
                        self.passes / (now - report_time),
                        'b_rate':
                        self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time':
                        rebuilder_time,
                        'rebuilder_rate':
                        100.0 * rebuilder_time / float(now - start_time)
                    })
                report_time = now
                total_errors += self.errors
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        end_time = time.time()
        elapsed = (end_time - start_time) or 0.000001
        self.logger.info(
            'DONE %(volume)s '
            'started=%(start_time)s '
            'ended=%(end_time)s '
            'elapsed=%(elapsed).02f '
            'errors=%(errors)d '
            'chunks=%(nb_chunks)d %(c_rate).2f/s '
            'bytes=%(nb_bytes)d %(b_rate).2fB/s '
            'rebuilder_time=%(rebuilder_time).2f '
            '(rebuilder: %(rebuilder_rate).2f%%)' % {
                'volume': self.volume,
                'start_time': datetime.fromtimestamp(
                    int(start_time)).isoformat(),
                'end_time': datetime.fromtimestamp(int(end_time)).isoformat(),
                'elapsed': elapsed,
                'errors': total_errors + self.errors,
                'nb_chunks': self.total_chunks_processed,
                'nb_bytes': self.total_bytes_processed,
                'c_rate': self.total_chunks_processed / elapsed,
                'b_rate': self.total_bytes_processed / elapsed,
                'rebuilder_time': rebuilder_time,
                'rebuilder_rate': 100.0 * rebuilder_time / float(elapsed)
            })

    def dryrun_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info(
            "[dryrun] Rebuilding "
            "container %s, content %s, chunk %s", container_id, content_id,
            chunk_id)
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id):
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id, e)

        self.passes += 1

    def chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id)

        try:
            content = self.content_factory.get(container_id, content_id)
        except ContentNotFound:
            raise OrphanChunk('Content not found')

        chunk = content.chunks.filter(id=chunk_id).one()
        if chunk is None:
            raise OrphanChunk("Chunk not found in content")
        chunk_size = chunk.size

        content.rebuild_chunk(chunk_id, allow_same_rawx=self.allow_same_rawx)

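        # The chunk has been rebuilt on another rawx; drop the stale entry
        # from this volume's rdir index.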
        self.rdir_client.chunk_delete(self.volume, container_id, content_id,
                                      chunk_id)

        self.bytes_processed += chunk_size
        self.total_bytes_processed += chunk_size
Example #6
class BlobRebuilder(Rebuilder):
    def __init__(self,
                 conf,
                 logger,
                 volume,
                 try_chunk_delete=False,
                 beanstalkd_addr=None,
                 **kwargs):
        super(BlobRebuilder, self).__init__(conf, logger, volume, **kwargs)
        # rdir
        self.rdir_client = RdirClient(conf, logger=self.logger)
        self.rdir_fetch_limit = int_value(conf.get('rdir_fetch_limit'), 100)
        # rawx
        self.try_chunk_delete = try_chunk_delete
        # beanstalk
        if beanstalkd_addr:
            self.beanstalkd_listener = BeanstalkdListener(
                beanstalkd_addr,
                conf.get('beanstalkd_tube', DEFAULT_REBUILDER_TUBE),
                self.logger, **kwargs)
        else:
            self.beanstalkd_listener = None
        # counters
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_expected_chunks = None
        # distributed
        self.distributed = False

    def _create_worker(self, **kwargs):
        return BlobRebuilderWorker(self,
                                   try_chunk_delete=self.try_chunk_delete,
                                   **kwargs)

    def _fill_queue(self, queue, **kwargs):
        chunks = self._fetch_chunks(**kwargs)
        for chunk in chunks:
            queue.put(chunk)

    def _item_to_string(self, chunk, **kwargs):
        cid, content_id, chunk_id_or_pos, _ = chunk
        return 'chunk %s|%s|%s' % (cid, content_id, chunk_id_or_pos)

    def _get_report(self, status, end_time, counters, **kwargs):
        chunks_processed, bytes_processed, errors, total_chunks_processed, \
            total_bytes_processed, total_errors = counters
        time_since_last_report = (end_time - self.last_report) or 0.00001
        total_time = (end_time - self.start_time) or 0.00001
        report = (
            '%(status)s volume=%(volume)s '
            'last_report=%(last_report)s %(time_since_last_report).2fs '
            'chunks=%(chunks)d %(chunks_rate).2f/s '
            'bytes=%(bytes)d %(bytes_rate).2fB/s '
            'errors=%(errors)d %(errors_rate).2f%% '
            'start_time=%(start_time)s %(total_time).2fs '
            'total_chunks=%(total_chunks)d %(total_chunks_rate).2f/s '
            'total_bytes=%(total_bytes)d %(total_bytes_rate).2fB/s '
            'total_errors=%(total_errors)d %(total_errors_rate).2f%%' % {
                'status':
                status,
                'volume':
                self.volume,
                'last_report':
                datetime.fromtimestamp(int(self.last_report)).isoformat(),
                'time_since_last_report':
                time_since_last_report,
                'chunks':
                chunks_processed,
                'chunks_rate':
                chunks_processed / time_since_last_report,
                'bytes':
                bytes_processed,
                'bytes_rate':
                bytes_processed / time_since_last_report,
                'errors':
                errors,
                'errors_rate':
                100 * errors / float(chunks_processed or 1),
                'start_time':
                datetime.fromtimestamp(int(self.start_time)).isoformat(),
                'total_time':
                total_time,
                'total_chunks':
                total_chunks_processed,
                'total_chunks_rate':
                total_chunks_processed / total_time,
                'total_bytes':
                total_bytes_processed,
                'total_bytes_rate':
                total_bytes_processed / total_time,
                'total_errors':
                total_errors,
                'total_errors_rate':
                100 * total_errors / float(total_chunks_processed or 1)
            })
        if self.total_expected_chunks is not None:
            progress = 100 * total_chunks_processed / \
                float(self.total_expected_chunks or 1)
            report += ' progress=%d/%d %.2f%%' % \
                (total_chunks_processed, self.total_expected_chunks, progress)
        return report

    def _update_processed_without_lock(self,
                                       bytes_processed,
                                       error=None,
                                       **kwargs):
        super(BlobRebuilder, self)._update_processed_without_lock(None,
                                                                  error=error,
                                                                  **kwargs)
        if bytes_processed is not None:
            self.bytes_processed += bytes_processed

    def _update_totals_without_lock(self, **kwargs):
        chunks_processed, errors, total_chunks_processed, total_errors = \
            super(BlobRebuilder, self)._update_totals_without_lock(**kwargs)
        bytes_processed = self.bytes_processed
        self.bytes_processed = 0
        self.total_bytes_processed += bytes_processed
        return chunks_processed, bytes_processed, errors, \
            total_chunks_processed, self.total_bytes_processed, total_errors

    def _rebuilder_pass(self, **kwargs):
        return super(BlobRebuilder, self).rebuilder_pass(**kwargs)

    def rebuilder_pass(self, **kwargs):
        success = False
        if self.volume:
            self.rdir_client.admin_lock(self.volume,
                                        "rebuilder on %s" % gethostname())
            info = self.rdir_client.status(self.volume)
            self.total_expected_chunks = info.get('chunk', dict()).get(
                'to_rebuild', None)
        try:
            success = self._rebuilder_pass(**kwargs)
        finally:
            if self.volume:
                self.rdir_client.admin_unlock(self.volume)
        return success

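    # Build a 'storage.content.broken' event so that broken chunks can be
    # re-queued through beanstalkd; _chunks_from_event() decodes them back.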
    def _event_from_broken_chunk(self, chunk, reply, **kwargs):
        cid, content_id, chunk_id_or_pos, _ = chunk
        event = {}
        event['when'] = time.time()
        event['event'] = 'storage.content.broken'
        event['data'] = {'missing_chunks': [chunk_id_or_pos]}
        event['url'] = {'ns': self.namespace, 'id': cid, 'content': content_id}
        event['reply'] = reply
        return json.dumps(event)

    def _chunks_from_event(self, job_id, data, **kwargs):
        decoded = json.loads(data)
        container_id = decoded['url']['id']
        content_id = decoded['url']['content']
        more = None
        reply = decoded.get('reply', None)
        if reply:
            more = {'reply': reply}
        for chunk_id_or_pos in decoded['data']['missing_chunks']:
            yield [container_id, content_id, str(chunk_id_or_pos), more]

    def _fetch_events_from_beanstalk(self, **kwargs):
        return self.beanstalkd_listener.fetch_events(self._chunks_from_event,
                                                     **kwargs)

    def _fetch_chunks_from_file(self, **kwargs):
        with open(self.input_file, 'r') as ifile:
            for line in ifile:
                stripped = line.strip()
                if stripped and not stripped.startswith('#'):
                    yield stripped.split('|', 3)[:3] + [None]

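    # Pick the chunk source: input file first, then beanstalkd events
    # (unless running distributed), then the volume's rdir index; fail if
    # none is configured.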
    def _fetch_chunks(self, **kwargs):
        if self.input_file:
            return self._fetch_chunks_from_file(**kwargs)
        if self.beanstalkd_listener and not self.distributed:
            return self._fetch_events_from_beanstalk(**kwargs)
        if self.volume:
            return self.rdir_client.chunk_fetch(self.volume,
                                                limit=self.rdir_fetch_limit,
                                                rebuild=True,
                                                **kwargs)
        raise ConfigurationException('No source to fetch chunks from')
Example #7
class BlobRebuilderWorker(object):
    def __init__(self, conf, logger, volume):
        self.conf = conf
        self.logger = logger or get_logger(conf)
        self.volume = volume
        self.run_time = 0
        self.passes = 0
        self.errors = 0
        self.last_reported = 0
        self.chunks_run_time = 0
        self.bytes_running_time = 0
        self.bytes_processed = 0
        self.total_bytes_processed = 0
        self.total_chunks_processed = 0
        self.dry_run = true_value(
            conf.get('dry_run', False))
        self.report_interval = int_value(
            conf.get('report_interval'), 3600)
        self.max_chunks_per_second = int_value(
            conf.get('chunks_per_second'), 30)
        self.max_bytes_per_second = int_value(
            conf.get('bytes_per_second'), 10000000)
        self.rdir_fetch_limit = int_value(
            conf.get('rdir_fetch_limit'), 100)
        self.blob_client = BlobClient()
        self.container_client = ContainerClient(conf)
        self.rdir_client = RdirClient(conf)

    def rebuilder_pass_with_lock(self):
        self.rdir_client.admin_lock(self.volume,
                                    "rebuilder on %s" % gethostname())
        try:
            self.rebuilder_pass()
        finally:
            self.rdir_client.admin_unlock(self.volume)

    def rebuilder_pass(self):
        start_time = report_time = time.time()

        total_errors = 0
        rebuilder_time = 0

        chunks = self.rdir_client.chunk_fetch(self.volume,
                                              limit=self.rdir_fetch_limit,
                                              rebuild=True)
        for container_id, content_id, chunk_id, data in chunks:
            loop_time = time.time()

            if self.dry_run:
                self.dryrun_chunk_rebuild(container_id, content_id, chunk_id)
            else:
                self.safe_chunk_rebuild(container_id, content_id, chunk_id)

            self.chunks_run_time = ratelimit(
                self.chunks_run_time,
                self.max_chunks_per_second
            )
            self.total_chunks_processed += 1
            now = time.time()

            if now - self.last_reported >= self.report_interval:
                self.logger.info(
                    '%(start_time)s '
                    '%(passes)d '
                    '%(errors)d '
                    '%(c_rate).2f '
                    '%(b_rate).2f '
                    '%(total).2f '
                    '%(rebuilder_time).2f '
                    '%(rebuilder_rate).2f' % {
                        'start_time': time.ctime(report_time),
                        'passes': self.passes,
                        'errors': self.errors,
                        'c_rate': self.passes / (now - report_time),
                        'b_rate': self.bytes_processed / (now - report_time),
                        'total': (now - start_time),
                        'rebuilder_time': rebuilder_time,
                        'rebuilder_rate': rebuilder_time / (now - start_time)
                    }
                )
                report_time = now
                total_errors += self.errors
                self.passes = 0
                self.bytes_processed = 0
                self.last_reported = now
            rebuilder_time += (now - loop_time)
        elapsed = (time.time() - start_time) or 0.000001
        self.logger.info(
            '%(elapsed).02f '
            '%(errors)d '
            '%(chunk_rate).2f '
            '%(bytes_rate).2f '
            '%(rebuilder_time).2f '
            '%(rebuilder_rate).2f' % {
                'elapsed': elapsed,
                'errors': total_errors + self.errors,
                'chunk_rate': self.total_chunks_processed / elapsed,
                'bytes_rate': self.total_bytes_processed / elapsed,
                'rebuilder_time': rebuilder_time,
                'rebuilder_rate': rebuilder_time / elapsed
            }
        )

    def dryrun_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info("[dryrun] Rebuilding "
                         "container %s, content %s, chunk %s"
                         % (container_id, content_id, chunk_id))
        self.passes += 1

    def safe_chunk_rebuild(self, container_id, content_id, chunk_id):
        self.logger.info('Rebuilding (container %s, content %s, chunk %s)',
                         container_id, content_id, chunk_id)
        try:
            self.chunk_rebuild(container_id, content_id, chunk_id)
        except Exception as e:
            self.errors += 1
            self.logger.error('ERROR while rebuilding chunk %s|%s|%s: %s',
                              container_id, content_id, chunk_id, e)

        self.passes += 1

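    # Find the broken chunk in the content description and collect the
    # other copies stored at the same position.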
    def _meta2_get_chunks_at_pos(self, container_id, content_id, chunk_id):
        current_chunk_url = 'http://%s/%s' % (self.volume, chunk_id)

        try:
            data = self.container_client.content_show(
                cid=container_id, content=content_id)
        except exc.NotFound:
            raise exc.OrphanChunk('Content not found')

        current_chunk = None
        for c in data:
            if c['url'] == current_chunk_url:
                current_chunk = c
                break
        if not current_chunk:
            raise exc.OrphanChunk('Chunk not found in content')

        duplicate_chunks = []
        for c in data:
            if c['pos'] == current_chunk['pos'] \
                    and c['url'] != current_chunk['url']:
                duplicate_chunks.append(c)
        if len(duplicate_chunks) == 0:
            raise exc.UnrecoverableContent('No copy of missing chunk')

        return current_chunk, duplicate_chunks

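    # Ask meta2 for a spare chunk location, excluding the services that
    # already hold a copy at this position and flagging the broken one.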
    def _meta2_get_spare_chunk(self, container_id, content_id, notin, broken):
        spare_data = {'notin': notin,
                      'broken': [broken],
                      'size': 0}
        try:
            spare_resp = self.container_client.content_spare(
                cid=container_id, content=content_id, data=spare_data)
        except ClientException as e:
            raise exc.SpareChunkException('No spare chunk (%s)' % e)

        return spare_resp['chunks'][0]

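    # Update the chunk list in meta2, replacing the broken chunk's URL with
    # the newly uploaded spare.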
    def _meta2_replace_chunk(self, container_id, content_id,
                             current_chunk, new_chunk):
        old = [{'type': 'chunk',
                'id': current_chunk['url'],
                'hash': current_chunk['hash'],
                'size': current_chunk['size'],
                'pos': current_chunk['pos'],
                'content': content_id}]
        new = [{'type': 'chunk',
                'id': new_chunk['id'],
                'hash': current_chunk['hash'],
                'size': current_chunk['size'],
                'pos': current_chunk['pos'],
                'content': content_id}]
        update_data = {'old': old, 'new': new}

        self.container_client.container_raw_update(
            cid=container_id, data=update_data)

    # TODO rain support
    def chunk_rebuild(self, container_id, content_id, chunk_id):

        current_chunk, duplicate_chunks = self._meta2_get_chunks_at_pos(
            container_id, content_id, chunk_id)

        spare_chunk = self._meta2_get_spare_chunk(
            container_id, content_id, duplicate_chunks, current_chunk)

        uploaded = False
        for src in duplicate_chunks:
            try:
                self.blob_client.chunk_copy(src['url'], spare_chunk['id'])
                self.logger.debug('copy chunk from %s to %s',
                                  src['url'], spare_chunk['id'])
                uploaded = True
                break
            except Exception as e:
                self.logger.debug('Failed to copy chunk from %s to %s: %s',
                                  src['url'], spare_chunk['id'], type(e))
        if not uploaded:
            raise exc.UnrecoverableContent('No copy available '
                                           'of missing chunk')

        self._meta2_replace_chunk(container_id, content_id,
                                  current_chunk, spare_chunk)

        self.rdir_client.chunk_push(self.volume, container_id, content_id,
                                    chunk_id, rtime=int(time.time()))

        self.bytes_processed += current_chunk['size']
        self.total_bytes_processed += current_chunk['size']