Example #1
0
def recover(ctx):
    ret = True
    stats = ctx.monitor.stats['recover']

    stats.timer('recover', 'started')

    it = iterate_key(ctx.merged_filename, ctx.groups)

    node = elliptics_create_node(address=ctx.address,
                                 elog=ctx.elog,
                                 wait_timeout=ctx.wait_timeout,
                                 net_thread_num=4,
                                 io_thread_num=1,
                                 remotes=ctx.remotes)

    for batch_id, batch in groupby(enumerate(it),
                                   key=lambda x: x[0] / ctx.batch_size):
        recovers = []
        rs = RecoverStat()
        for _, val in batch:
            rec = KeyRecover(ctx, *val, node=node)
            recovers.append(rec)
        for r in recovers:
            r.wait()
            ret &= r.succeeded()
            rs += r.stats
        rs.apply(stats)
    stats.timer('recover', 'finished')
    return ret
Example #2
0
    def __init__(self, ctx, key, key_infos, missed_groups, node, callback):
        self.ctx = ctx
        self.complete = threading.Event()
        self.callback = callback
        self.stats = RecoverStat()
        self.stats_cmd = ctx.stats['commands']
        self.key = key
        self.key_flags = 0
        self.key_infos = key_infos
        self.diff_groups = []
        self.missed_groups = list(missed_groups)

        self.read_session = elliptics.newapi.Session(node)
        self.read_session.trace_id = ctx.trace_id
        self.read_session.set_filter(elliptics.filters.all)

        self.write_session = elliptics.newapi.Session(node)
        self.write_session.trace_id = ctx.trace_id
        self.write_session.set_checker(elliptics.checkers.all)
        self.write_session.ioflags |= elliptics.io_flags.cas_timestamp

        self.remove_session = elliptics.newapi.Session(node)
        self.remove_session.trace_id = ctx.trace_id
        self.remove_session.set_filter(elliptics.filters.all_final)
        self.remove_session.ioflags |= elliptics.io_flags.cas_timestamp
        self.remove_session.timestamp = ctx.prepare_timeout

        self.result = False
        self.attempt = 0
        self.pending_operations = 1
        self.pending_operations_lock = threading.Lock()

        log.debug("Recovering key: {0} from nonempty groups: {1} and missed groups: {2}"
                  .format(repr(self.key), [k.group_id for k in self.key_infos], self.missed_groups))
        self.run()
Example #3
0
def recover(ctx):
    ret = True
    stats = ctx.monitor.stats['recover']

    stats.timer('recover', 'started')

    it = iterate_key(ctx.merged_filename, ctx.groups)

    node = elliptics_create_node(address=ctx.address,
                                 elog=ctx.elog,
                                 wait_timeout=ctx.wait_timeout,
                                 net_thread_num=4,
                                 io_thread_num=1,
                                 remotes=ctx.remotes)

    for batch_id, batch in groupby(enumerate(it),
                                   key=lambda x: x[0] / ctx.batch_size):
        recovers = []
        rs = RecoverStat()
        for _, val in batch:
            rec = KeyRecover(ctx, *val, node=node)
            recovers.append(rec)
        for r in recovers:
            r.wait()
            ret &= r.succeeded()
            rs += r.stats
        rs.apply(stats)
    stats.timer('recover', 'finished')
    return ret
Example #4
0
    def __init__(self, ctx, key, key_infos, missed_groups, node, callback):
        self.ctx = ctx
        self.complete = threading.Event()
        self.callback = callback
        self.stats = RecoverStat()
        self.key = key
        self.key_flags = 0
        self.key_infos = key_infos
        self.diff_groups = []
        self.missed_groups = list(missed_groups)

        self.read_session = elliptics.Session(node)
        self.read_session.trace_id = ctx.trace_id
        self.read_session.set_filter(elliptics.filters.all)

        self.write_session = elliptics.Session(node)
        self.write_session.trace_id = ctx.trace_id
        self.write_session.set_checker(elliptics.checkers.all)

        self.remove_session = elliptics.Session(node)
        self.remove_session.trace_id = ctx.trace_id
        self.remove_session.set_checker(elliptics.checkers.all)

        self.result = False
        self.attempt = 0

        log.debug(
            "Recovering key: {0} from nonempty groups: {1} and missed groups: {2}"
            .format(repr(self.key), [k.group_id for k in self.key_infos],
                    self.missed_groups))
        self.run()
Example #5
0
def recover(ctx):
    from itertools import islice
    import time
    ret = True
    stats = ctx.stats['recover']

    stats.timer('recover', 'started')

    it = iterate_key(ctx.merged_filename, ctx.groups)

    elog = elliptics.Logger(ctx.log_file, int(ctx.log_level))
    node = elliptics_create_node(address=ctx.address,
                                 elog=elog,
                                 wait_timeout=ctx.wait_timeout,
                                 net_thread_num=4,
                                 io_thread_num=1,
                                 remotes=ctx.remotes)
    processed_keys = 0
    start = time.time()
    while 1:
        batch = tuple(islice(it, ctx.batch_size))
        if not batch:
            break
        recovers = []
        rs = RecoverStat()
        for val in batch:
            rec = KeyRecover(ctx, *val, node=node)
            recovers.append(rec)
        successes, failures = 0, 0
        for r in recovers:
            r.wait()
            ret &= r.succeeded()
            rs += r.stats
            if r.succeeded():
                successes += 1
            else:
                failures += 1
        processed_keys += successes + failures
        rs.apply(stats)
        stats.counter('recovered_keys', successes)
        ctx.stats.counter('recovered_keys', successes)
        stats.counter('recovered_keys', -failures)
        ctx.stats.counter('recovered_keys', -failures)
        stats.set_counter('recovery_speed',
                          processed_keys / (time.time() - start))
    stats.timer('recover', 'finished')
    return ret
Example #6
0
def recover(ctx):
    from itertools import islice
    import time
    ret = True
    stats = ctx.stats['recover']

    stats.timer('recover', 'started')

    it = iterate_key(ctx.merged_filename, ctx.groups)

    elog = elliptics.Logger(ctx.log_file, int(ctx.log_level))
    node = elliptics_create_node(address=ctx.address,
                                 elog=elog,
                                 wait_timeout=ctx.wait_timeout,
                                 net_thread_num=4,
                                 io_thread_num=1,
                                 remotes=ctx.remotes)
    processed_keys = 0
    start = time.time()
    while 1:
        batch = tuple(islice(it, ctx.batch_size))
        if not batch:
            break
        recovers = []
        rs = RecoverStat()
        for val in batch:
            rec = KeyRecover(ctx, *val, node=node)
            recovers.append(rec)
        successes, failures = 0, 0
        for r in recovers:
            r.wait()
            ret &= r.succeeded()
            rs += r.stats
            if r.succeeded():
                successes += 1
            else:
                failures += 1
        processed_keys += successes + failures
        rs.apply(stats)
        stats.counter('recovered_keys', successes)
        ctx.stats.counter('recovered_keys', successes)
        stats.counter('recovered_keys', -failures)
        ctx.stats.counter('recovered_keys', -failures)
        stats.set_counter('recovery_speed', processed_keys / (time.time() - start))
    stats.timer('recover', 'finished')
    return ret
Example #7
0
class KeyRecover(object):
    def __init__(self, ctx, key, key_infos, missed_groups, node, callback):
        self.ctx = ctx
        self.complete = threading.Event()
        self.callback = callback
        self.stats = RecoverStat()
        self.stats_cmd = ctx.stats['commands']
        self.key = key
        self.key_flags = 0
        self.key_infos = key_infos
        self.diff_groups = []
        self.missed_groups = list(missed_groups)

        self.read_session = elliptics.newapi.Session(node)
        self.read_session.trace_id = ctx.trace_id
        self.read_session.set_filter(elliptics.filters.all)

        self.write_session = elliptics.newapi.Session(node)
        self.write_session.trace_id = ctx.trace_id
        self.write_session.set_checker(elliptics.checkers.all)
        self.write_session.ioflags |= elliptics.io_flags.cas_timestamp

        self.remove_session = elliptics.newapi.Session(node)
        self.remove_session.trace_id = ctx.trace_id
        self.remove_session.set_filter(elliptics.filters.all_final)
        self.remove_session.ioflags |= elliptics.io_flags.cas_timestamp
        self.remove_session.timestamp = ctx.prepare_timeout

        self.result = False
        self.attempt = 0
        self.pending_operations = 1
        self.pending_operations_lock = threading.Lock()

        log.debug("Recovering key: {0} from nonempty groups: {1} and missed groups: {2}"
                  .format(repr(self.key), [k.group_id for k in self.key_infos], self.missed_groups))
        self.run()

    def run(self):
        self.total_size = self.key_infos[0].size
        self.chunked = self.total_size > self.ctx.chunk_size
        self.recovered_size = 0

        same_meta = lambda lhs, rhs: (lhs.timestamp, lhs.size, lhs.user_flags) == (rhs.timestamp, rhs.size, rhs.user_flags)
        same_infos = [info for info in self.key_infos if same_meta(info, self.key_infos[0])]
        self.key_flags = same_infos[0].flags

        self.same_groups = [info.group_id for info in same_infos]
        self.key_infos = [info for info in self.key_infos if info.group_id not in self.same_groups]
        self.diff_groups += [info.group_id for info in self.key_infos]
        self.diff_groups = list(set(self.diff_groups).difference(self.same_groups))

        if not self.diff_groups and not self.missed_groups:
            log.debug("Key: {0} already up-to-date in all groups: {1}".format(self.key, self.same_groups))
            self.stop(False)
            return

        log.debug("Try to recover key: {0} from groups: {1} to groups: {2}: diff groups: {3}, missed groups: {4}"
                  .format(self.key, self.same_groups, self.diff_groups + self.missed_groups,
                          self.diff_groups, self.missed_groups))

        self.read_session.groups = self.same_groups

        groups_for_write = []
        for group in self.diff_groups + self.missed_groups:
            if group in self.ctx.ro_groups:
                self.stats.counter('skip_write_to_ro_group', 1)
                continue

            groups_for_write.append(group)

        self.write_session.groups = groups_for_write
        self.read()

    def stop(self, result):
        self.result = result
        log.debug("Finished recovering key: {0} with result: {1}".format(self.key, self.result))
        self.on_complete()

    def on_complete(self):
        with self.pending_operations_lock:
            self.pending_operations -= 1
            if self.pending_operations:
                return

        # Remove all sessions to release links to elliptics node. Main thread must hold last reference to
        # elliptics node after recovery completion. Otherwise, node may be destroyed in i/o thread.
        del self.read_session, self.write_session, self.remove_session
        self.complete.set()
        self.callback(self.result, self.stats)

    def read(self):
        try:
            size = self.total_size
            log.debug("Reading key: {0} from groups: {1}, chunked: {2}"
                      .format(self.key, self.read_session.groups, self.chunked))
            if self.chunked:
                size = min(self.total_size - self.recovered_size, self.ctx.chunk_size)
            # do not check checksum for all but the first chunk
            if self.recovered_size != 0:
                if self.key_flags & elliptics.record_flags.chunked_csum:
                    # if record was checksummed by chunks there is no need to disable checksum verification
                    self.read_session.ioflags &= ~elliptics.io_flags.nocsum
                else:
                    self.read_session.ioflags |= elliptics.io_flags.nocsum
            else:
                size = min(self.total_size, size)
                self.read_session.ioflags = 0

            log.debug("Reading key: {0} from groups: {1}, chunked: {2}, offset: {3}, size: {4}, total_size: {5}"
                      .format(self.key, self.read_session.groups, self.chunked,
                              self.recovered_size, size, self.total_size))

            if self.recovered_size == 0:
                # read first chunk of data with json
                read_result = self.read_session.read(key=self.key,
                                                     offset=0,
                                                     size=size)
            else:
                # read none-first chunks of data without json
                read_result = self.read_session.read_data(key=self.key,
                                                          offset=self.recovered_size,
                                                          size=size)
            read_result.connect(self.onread)
        except Exception as e:
            log.error("Read key: {0} by offset: {1} and size: {2} raised exception: {3}, traceback: {4}"
                      .format(self.key, self.recovered_size, size, repr(e), traceback.format_exc()))
            self.stop(False)

    def write(self):
        try:
            log.debug('Writing key: %s to groups: %s',
                      repr(self.key), self.diff_groups + self.missed_groups)
            params = {'key': self.key,
                      'json': self.json,
                      'data': self.write_data,
                      'data_offset': self.recovered_size}
            if self.chunked:
                if self.recovered_size == 0:
                    params['json_capacity'] = self.json_capacity
                    params['data_capacity'] = self.total_size
                    log.debug('Write_prepare key: %s to groups: %s, json: %d, json_capacity: %d, '
                              'data: %d, data_offset: %d, data_capacity: %d',
                              params['key'], self.write_session.groups, len(params['json']),
                              params['json_capacity'], len(params['data']), params['data_offset'],
                              params['data_capacity'])
                    write_result = self.write_session.write_prepare(**params)
                elif self.recovered_size + len(params['data']) < self.total_size:
                    log.debug('Write_plain key: %s to groups: %s, json: %d, data: %d, data_offset: %d',
                              params['key'], self.write_session.groups,
                              len(params['json']), len(params['data']), params['data_offset'])
                    write_result = self.write_session.write_plain(**params)
                else:
                    params['data_commit_size'] = self.total_size
                    log.debug('Write_commit key: %s to groups: %s, json: %d, data: %d, data_offset: %d'
                              'data_commit_size: %d',
                              params['key'], self.write_session.groups,
                              len(params['json']), len(params['data']), params['data_offset'],
                              params['data_commit_size'])
                    write_result = self.write_session.write_commit(**params)
            else:
                del params['data_offset']
                params['json_capacity'] = self.json_capacity
                params['data_capacity'] = self.total_size
                log.debug('Write key: %s to groups: %s, json: %d, json_capacity: %d, '
                          'data: %d, data_capacity: %d',
                          params['key'], self.write_session.groups,
                          len(params['json']), params['json_capacity'], len(params['data']),
                          params['data_capacity'])
                write_result = self.write_session.write(**params)
            write_result.connect(self.onwrite)
        except Exception as e:
            log.error('Writing key: %s raised exception: %s, traceback: %s',
                      self.key, repr(e), traceback.format_exc())
            self.stop(False)

    def onread(self, results, error):
        try:
            corrupted_groups = []
            for result in results:
                if result.status != -errno.EILSEQ:
                    continue

                if result.group_id in self.ctx.ro_groups:
                    self.stats.counter('skip_remove_corrupted_key_from_ro_group', 1)
                    continue

                corrupted_groups.append(result.group_id)

            if corrupted_groups:
                with self.pending_operations_lock:
                    self.pending_operations += 1
                KeyRemover(self.key, self.total_size, self.remove_session, corrupted_groups,
                           self.ctx, self.stats, self.on_complete).remove()

            if error.code:
                log.error("Failed to read key: {0} from groups: {1}: {2}".format(self.key, self.same_groups, error))
                self.stats_cmd.counter('read.{0}'.format(error.code), 1)
                self.stats.read_failed += len(results)
                if error.code == -errno.ETIMEDOUT:
                    if self.attempt < self.ctx.attempts:
                        self.attempt += 1
                        old_timeout = self.read_session.timeout
                        self.read_session.timeout *= 2
                        log.error("Read has been timed out. Try to reread key: {0} from groups: {1}, attempt: {2}/{3} "
                                  "with increased timeout: {4}/{5}"
                                  .format(self.key, self.same_groups, self.attempt, self.ctx.attempts,
                                          self.read_session.timeout, old_timeout))
                        self.read()
                    else:
                        log.error("Read has been timed out {0} times, all {1} attempts are used. "
                                  "The key: {1} can't be recovery now. Skip it"
                                  .format(self.attempt, self.key))
                        self.stats.skipped += 1
                        self.stop(False)
                elif len(self.key_infos) > 1:
                    log.error("Key: {0} has available replicas in other groups. Try to recover the key from them"
                              .format(self.key))
                    self.diff_groups += self.read_session.groups
                    self.run()
                else:
                    log.error("Failed to read key: {0} from any available group. "
                              "This key can't be recovered now. Skip it"
                              .format(self.key))
                    self.stats.skipped += 1
                    self.stop(False)
                return

            self.stats.read_failed += len(results) - 1
            self.stats.read += 1
            self.stats.read_bytes += results[-1].size

            if self.recovered_size == 0:
                self.write_session.user_flags = results[-1].record_info.user_flags
                self.write_session.timestamp = results[-1].record_info.data_timestamp
                self.write_session.json_timestamp = results[-1].record_info.json_timestamp
                self.read_session.ioflags |= elliptics.io_flags.nocsum
                self.read_session.groups = [results[-1].group_id]
                self.key_flags = results[-1].record_info.record_flags
                self.json_capacity = results[-1].record_info.json_capacity
                if self.total_size != results[-1].record_info.data_size:
                    self.total_size = results[-1].record_info.data_size
                    self.chunked = self.total_size > self.ctx.chunk_size
            self.attempt = 0

            if self.chunked and len(results) > 1:
                self.missed_groups += [r.group_id for r in results if r.error.code]

            log.debug("Copy key: %s from groups: %s to groups: %s",
                      repr(self.key), self.same_groups, self.write_session.groups)
            self.write_data = results[-1].data
            self.json = results[-1].json
            self.write()
        except Exception as e:
            log.error("Failed to handle origin key: {0}, exception: {1}, traceback: {2}"
                      .format(self.key, repr(e), traceback.format_exc()))
            self.stop(False)

    def onwrite(self, results, error):
        try:
            if error.code:
                self.stats_cmd.counter('write.{0}'.format(error.code), 1)
                self.stats.write_failed += 1
                log.error("Failed to write key: {0}: to groups: {1}: {2}"
                          .format(self.key, self.write_session.groups, error))
                if self.attempt < self.ctx.attempts and error.code in (-errno.ETIMEDOUT, -errno.ENXIO):
                    old_timeout = self.write_session.timeout
                    self.write_session.timeout *= 2
                    self.attempt += 1
                    log.info("Retry to write key: {0} attempts: {1}/{2} increased timeout: {3}/{4}"
                             .format(repr(self.key), self.attempt, self.ctx.attempts,
                                     self.write_session.timeout, old_timeout))
                    self.stats.write_retries += 1
                    self.write()
                    return
                self.stop(False)
                return

            self.stats.write += len(results)
            self.recovered_size += len(self.write_data)
            self.stats.written_bytes += len(self.write_data) * len(results)
            self.attempt = 0
            if self.recovered_size < self.total_size:
                self.read()
            else:
                log.debug("Key: {0} has been successfully copied to groups: {1}"
                          .format(repr(self.key), [r.group_id for r in results]))
                self.stop(True)
        except Exception as e:
            log.error("Failed to handle write result key: {0}: {1}, traceback: {2}"
                      .format(self.key, repr(e), traceback.format_exc()))
            self.stop(False)

    def wait(self):
        if not self.complete.is_set():
            self.complete.wait()

    def succeeded(self):
        self.wait()
        return self.result