Esempio n. 1
0
 def __init__(self, etcd, data_source, partition_id, raw_data_options,
              batch_processor_options):
     """Initialise the fetcher for a single partition.

     The parent initialiser receives
     ``batch_processor_options.max_flying_item``. A ``RawDataVisitor`` is
     built from ``etcd``, ``data_source``, ``partition_id`` and
     ``raw_data_options``, and the batch size is read from
     ``batch_processor_options.batch_size``.
     """
     max_flying_item = batch_processor_options.max_flying_item
     super(ExampleIdBatchFetcher, self).__init__(max_flying_item)
     visitor = RawDataVisitor(
         etcd, data_source, partition_id, raw_data_options
     )
     self._raw_data_visitor = visitor
     self._batch_size = batch_processor_options.batch_size
     self._partition_id = partition_id
Esempio n. 2
0
 def __init__(self, etcd, data_source, partition_id, options):
     """Initialise per-partition dumper state.

     Creates the lock, a ``DataBlockManager`` and a ``RawDataVisitor`` for
     the partition, and starts the next expected data block index at the
     number of blocks the manager reports as already dumped.
     """
     self._lock = threading.Lock()
     self._data_source, self._partition_id = data_source, partition_id
     block_manager = DataBlockManager(data_source, partition_id)
     self._data_block_manager = block_manager
     self._raw_data_visitor = RawDataVisitor(
         etcd, data_source, partition_id, options
     )
     self._next_data_block_index = block_manager.get_dumped_data_block_num()
     # No metas queued yet; both status flags start cleared.
     self._fly_data_block_meta = []
     self._stale_with_dfs = False
     self._synced_data_block_meta_finished = False
Esempio n. 3
0
 def __init__(self, etcd, data_source, partition_id, raw_data_options,
              example_id_batch_options):
     """Initialise an empty example-id batch fetcher for one partition.

     Builds the ``RawDataVisitor`` for the partition and stores the batch
     options; the batch queue starts empty and all progress flags cleared.
     """
     self._lock = threading.Lock()
     self._partition_id = partition_id
     self._raw_data_visitor = RawDataVisitor(
         etcd, data_source, partition_id, raw_data_options
     )
     self._example_id_batch_options = example_id_batch_options
     # Nothing has been fetched yet.
     self._flying_example_id_count = 0
     self._batch_queue = []
     self._raw_data_finished = False
     self._fetch_finished = False
     # Set once fetching is finished; None until then.
     self._last_index = None
Esempio n. 4
0
    def __init__(self, etcd, data_source, partition_id, options):
        """Initialise the joiner for one partition and sync its state.

        The leader side is visited through an ``ExampleIdVisitor`` wrapping
        an ``ExampleIdManager``; the follower side through a
        ``RawDataVisitor``. ``_sync_state`` is invoked last, after every
        attribute it may read has been assigned.
        """
        self._data_source = data_source
        self._partition_id = partition_id
        example_id_manager = ExampleIdManager(data_source, partition_id)
        self._leader_visitor = ExampleIdVisitor(example_id_manager)
        self._follower_visitor = RawDataVisitor(
            etcd, data_source, partition_id, options
        )
        self._data_block_manager = DataBlockManager(data_source, partition_id)

        # No data block is being built yet.
        self._data_block_builder = None
        self._stale_with_dfs = False
        self._follower_restart_index = 0
        self._sync_state()
Esempio n. 5
0
 def __init__(self, etcd, data_source, partition_id, raw_data_options):
     """Initialise per-partition dumper state.

     Creates the lock, a ``DataBlockManager`` and a ``RawDataVisitor`` for
     the partition, and starts the next expected data block index at the
     manager's dumped data block count.
     """
     self._lock = threading.Lock()
     self._data_source, self._partition_id = data_source, partition_id
     block_manager = DataBlockManager(data_source, partition_id)
     self._data_block_manager = block_manager
     self._raw_data_visitor = RawDataVisitor(
         etcd, data_source, partition_id, raw_data_options
     )
     self._next_data_block_index = (
         block_manager.get_dumped_data_block_count()
     )
     # No metas queued yet; both status flags start cleared.
     self._fly_data_block_meta = []
     self._state_stale = False
     self._synced_data_block_meta_finished = False
Esempio n. 6
0
 def _allocate_sync_partition_fn(self):
     """Request a partition from the master for example-id syncing.

     Must be called while no manifest is being processed. Depending on the
     master's response this either clears ``self._state`` (response carries
     ``finished``), returns without change (no ``manifest`` in the
     response), or installs the new manifest together with a matching
     ``RawDataVisitor`` and wakes up the follower example-id syncer.

     Raises:
         RuntimeError: if the master replies with a non-zero status code.
     """
     assert self._processing_manifest is None
     # NOTE(review): partition_id=-1 presumably lets the master pick the
     # partition -- confirm against the master implementation.
     sync_request = dj_pb.SyncExampleIdRequest(partition_id=-1)
     request = dj_pb.RawDataRequest(
         data_source_meta=self._data_source.data_source_meta,
         rank_id=self._rank_id,
         sync_example_id=sync_request)
     response = self._master_client.RequestJoinPartition(request)
     if response.status.code != 0:
         error_message = response.status.error_message
         raise RuntimeError("Failed to Request partition for sync "\
                            "id to follower, error msg {}".format(
                                error_message))
     if response.HasField('finished'):
         with self._lock:
             self._state = None
         return
     if not response.HasField('manifest'):
         logging.warning("no manifest is at state %d, wait and retry",
                         dj_pb.RawDataState.UnAllocated)
         return
     manifest = response.manifest
     # Build the visitor before taking the lock so the lock is only held
     # for the state update itself.
     visitor = RawDataVisitor(self._etcd, self._data_source,
                              manifest.partition_id, self._options)
     with self._lock:
         self._processing_manifest = manifest
         self._raw_data_visitor = visitor
         self._check_manifest()
         self._wakeup_follower_example_id_syncer()
Esempio n. 7
0
 def __init__(self, etcd, data_source, partition_id,
              raw_data_options, data_block_builder_options):
     """Initialise per-partition dumper state and its metrics tags.

     Creates the lock, a ``DataBlockManager`` and a ``RawDataVisitor`` for
     the partition, stores the data block builder options, and starts the
     next expected data block index at the manager's dumped data block
     count.
     """
     self._lock = threading.Lock()
     self._data_source = data_source
     self._partition_id = partition_id
     self._data_block_manager = \
             DataBlockManager(data_source, partition_id)
     self._raw_data_visitor = \
             RawDataVisitor(etcd, data_source,
                            partition_id, raw_data_options)
     self._data_block_builder_options = data_block_builder_options
     self._next_data_block_index = \
             self._data_block_manager.get_dumped_data_block_count()
     # No metas queued yet; both status flags start cleared.
     self._fly_data_block_meta = []
     self._state_stale = False
     self._synced_data_block_meta_finished = False
     ds_name = self._data_source.data_source_meta.name
     # The tag key was previously misspelled 'partiton'; use the correctly
     # spelled 'partition' so this tag lines up with other components.
     self._metrics_tags = {'data_source_name': ds_name,
                           'partition': self._partition_id}
Esempio n. 8
0
 def __init__(self, example_joiner_options, raw_data_options,
              data_block_builder_options, kvstore, data_source,
              partition_id):
     """Initialise the joiner for one partition and sync its state.

     Builds the leader (``ExampleIdVisitor``) and follower
     (``RawDataVisitor``) visitors and the ``DataBlockManager``, restores
     the joiner statistics from the latest data block meta when one
     exists, prepares the metrics tags, and finally calls ``_sync_state``.
     """
     self._lock = threading.Lock()
     self._example_joiner_options = example_joiner_options
     self._raw_data_options = raw_data_options
     self._data_source = data_source
     self._partition_id = partition_id
     self._leader_visitor = ExampleIdVisitor(
         kvstore, self._data_source, self._partition_id)
     self._follower_visitor = RawDataVisitor(
         kvstore, self._data_source, self._partition_id, raw_data_options)
     self._data_block_manager = DataBlockManager(
         self._data_source, self._partition_id)

     # Resume the statistics from the latest dumped block, or start fresh.
     latest_meta = self._data_block_manager.get_lastest_data_block_meta()
     if latest_meta is not None:
         info = latest_meta.joiner_stats_info
         stats_args = (info.stats_cum_join_num,
                       info.leader_stats_index,
                       info.follower_stats_index)
     else:
         stats_args = (0, -1, -1)
     self._joiner_stats = JoinerStats(*stats_args)

     self._data_block_builder_options = data_block_builder_options
     self._data_block_builder = None
     self._state_stale = False
     self._follower_restart_index = 0
     self._sync_example_id_finished = False
     self._raw_data_finished = False
     self._join_finished = False

     ds_name = self._data_source.data_source_meta.name
     self._metrics_tags = {
         'data_source_name': ds_name,
         'partition': partition_id,
         'joiner_name': self.name()
     }
     self._optional_stats = OptionalStats(raw_data_options,
                                          self._metrics_tags)
     self._latest_dump_timestamp = time.time()
     self._sync_state()
Esempio n. 9
0
class ExampleJoiner(object):
    """Base class for joining leader example ids with follower raw data.

    Subclasses provide the joining algorithm by overriding
    ``_inner_joiner``. This class owns the two visitors, the data block
    manager and builder, the progress flags (read and written under
    ``self._lock``) and the bookkeeping shared by all joiners.
    """

    def __init__(self, example_joiner_options, raw_data_options,
                 data_block_builder_options, kvstore, data_source,
                 partition_id):
        """Build visitors and manager, restore statistics, sync state."""
        self._lock = threading.Lock()
        self._example_joiner_options = example_joiner_options
        self._raw_data_options = raw_data_options
        self._data_source = data_source
        self._partition_id = partition_id
        self._leader_visitor = ExampleIdVisitor(
            kvstore, self._data_source, self._partition_id)
        self._follower_visitor = RawDataVisitor(
            kvstore, self._data_source, self._partition_id, raw_data_options)
        self._data_block_manager = DataBlockManager(
            self._data_source, self._partition_id)

        # Resume the statistics from the latest dumped block, if any.
        latest_meta = self._data_block_manager.get_lastest_data_block_meta()
        if latest_meta is not None:
            info = latest_meta.joiner_stats_info
            stats_args = (info.stats_cum_join_num,
                          info.leader_stats_index,
                          info.follower_stats_index)
        else:
            stats_args = (0, -1, -1)
        self._joiner_stats = JoinerStats(*stats_args)

        self._data_block_builder_options = data_block_builder_options
        self._data_block_builder = None
        self._state_stale = False
        self._follower_restart_index = 0
        self._sync_example_id_finished = False
        self._raw_data_finished = False
        self._join_finished = False

        ds_name = self._data_source.data_source_meta.name
        self._metrics_tags = {
            'data_source_name': ds_name,
            'partition': partition_id,
            'joiner_name': self.name()
        }
        self._optional_stats = OptionalStats(raw_data_options,
                                             self._metrics_tags)
        self._latest_dump_timestamp = time.time()
        self._sync_state()

    @contextmanager
    def make_example_joiner(self):
        """Yield the subclass joiner with the stale flag set while in use.

        The value passed to ``_inner_joiner`` is the stale flag as it was
        before entering.
        """
        was_stale = self._is_state_stale()
        self._acuqire_state_stale()
        yield self._inner_joiner(was_stale)
        # Only reached when the with-body completes normally; if it raises,
        # the stale flag stays set.
        self._release_state_stale()

    @classmethod
    def name(cls):
        """Return the joiner name used in messages and metrics tags."""
        return 'BASE_EXAMPLE_JOINER'

    def get_data_block_meta_by_index(self, index):
        """Return ``(join_finished, meta)`` for data block ``index``."""
        with self._lock:
            join_finished = self._join_finished
            meta = self._data_block_manager.get_data_block_meta_by_index(
                index)
            return join_finished, meta

    def get_dumped_data_block_count(self):
        """Return the manager's count of dumped data blocks."""
        manager = self._data_block_manager
        return manager.get_dumped_data_block_count()

    def is_join_finished(self):
        """Return the join-finished flag."""
        with self._lock:
            return self._join_finished

    def set_sync_example_id_finished(self):
        """Record that example-id syncing is finished."""
        with self._lock:
            self._sync_example_id_finished = True

    def set_raw_data_finished(self):
        """Record that raw data is finished."""
        with self._lock:
            self._raw_data_finished = True

    def is_sync_example_id_finished(self):
        """Return the sync-example-id-finished flag."""
        with self._lock:
            return self._sync_example_id_finished

    def is_raw_data_finished(self):
        """Return the raw-data-finished flag."""
        with self._lock:
            return self._raw_data_finished

    def need_join(self):
        """Return whether a join round should be run now."""
        with self._lock:
            if self._join_finished:
                return False
            if self._state_stale or self._sync_example_id_finished:
                return True
            follower = self._follower_visitor
            leader = self._leader_visitor
            if follower.is_visitor_stale() or leader.is_visitor_stale():
                return True
            # Both sides still have data to visit.
            if not (follower.finished() or leader.finished()):
                return True
            return self._need_finish_data_block_since_interval()

    def _prepare_join(self, state_stale):
        """Re-sync if stale, activate visitors, return the finish flags.

        Returns:
            ``(sync_example_id_finished, raw_data_finished)`` as sampled
            before the visitors are activated.
        """
        if state_stale:
            self._sync_state()
            self._reset_data_block_builder()
        finish_flags = (self.is_sync_example_id_finished(),
                        self.is_raw_data_finished())
        self._active_visitors()
        return finish_flags

    def _inner_joiner(self, reset_state):
        """Joining algorithm hook; must be overridden by subclasses."""
        message = ("_inner_joiner not implement for base class: %s" %
                   ExampleJoiner.name())
        raise NotImplementedError(message)

    def _is_state_stale(self):
        """Return the stale flag."""
        with self._lock:
            return self._state_stale

    def _active_visitors(self):
        """Activate the leader visitor, then the follower visitor."""
        for visitor in (self._leader_visitor, self._follower_visitor):
            visitor.active_visitor()

    def _sync_state(self):
        """Position both visitors according to the latest data block meta.

        Without a meta both visitors are reset; otherwise the leader seeks
        to ``leader_end_index`` and the follower to
        ``follower_restart_index``. ``StopIteration`` from a seek is logged
        and otherwise ignored.
        """
        meta = self._data_block_manager.get_lastest_data_block_meta()
        if meta is None:
            self._leader_visitor.reset()
            self._follower_visitor.reset()
            return
        try:
            self._leader_visitor.seek(meta.leader_end_index)
        except StopIteration:
            logging.warning("leader visitor finished")
        try:
            self._follower_visitor.seek(meta.follower_restart_index)
        except StopIteration:
            logging.warning("follower visitor finished")

    def _get_data_block_builder(self, create_if_no_existed):
        """Return the current builder, optionally creating it on demand.

        Returns ``None`` when no builder exists and
        ``create_if_no_existed`` is false.
        """
        if self._data_block_builder is not None or not create_if_no_existed:
            return self._data_block_builder
        data_block_index = \
                self._data_block_manager.get_dumped_data_block_count()
        new_builder = DataBlockBuilder(
            common.data_source_data_block_dir(self._data_source),
            self._data_source.data_source_meta.name, self._partition_id,
            data_block_index, self._data_block_builder_options,
            self._example_joiner_options.data_block_dump_threshold)
        self._data_block_builder = new_builder
        new_builder.set_data_block_manager(self._data_block_manager)
        new_builder.set_follower_restart_index(self._follower_restart_index)
        return self._data_block_builder

    def _finish_data_block(self):
        """Finish the current data block and return its meta.

        Returns ``None`` when there is no builder.
        """
        builder = self._data_block_builder
        if builder is None:
            return None
        stats_info = self._create_join_stats_info()
        builder.set_join_stats_info(stats_info)
        meta = builder.finish_data_block(True, self._metrics_tags)
        self._optional_stats.emit_optional_stats()
        self._reset_data_block_builder()
        self._update_latest_dump_timestamp()
        return meta

    def _create_join_stats_info(self):
        """Build a ``JoinerStatsInfo`` from the current statistics.

        The actual cumulative join count is the current builder's example
        count (0 without a builder) plus the count stored in the latest
        data block meta, if one exists.
        """
        builder = self._get_data_block_builder(False)
        stats_cum_join_num = self._joiner_stats.calc_stats_joined_num()
        if builder is None:
            actual_cum_join_num = 0
        else:
            actual_cum_join_num = builder.example_count()
        latest_meta = self._data_block_manager.get_lastest_data_block_meta()
        if latest_meta is not None:
            actual_cum_join_num += \
                    latest_meta.joiner_stats_info.actual_cum_join_num
        return dj_pb.JoinerStatsInfo(
            stats_cum_join_num=stats_cum_join_num,
            actual_cum_join_num=actual_cum_join_num,
            leader_stats_index=self._joiner_stats.get_leader_stats_index(),
            follower_stats_index=self._joiner_stats.get_follower_stats_index())

    def _reset_data_block_builder(self):
        """Detach the current builder, if any."""
        with self._lock:
            builder, self._data_block_builder = self._data_block_builder, None
        # The local reference is dropped after the lock has been released.
        del builder

    def _update_latest_dump_timestamp(self):
        """Emit the time since the last dump and restart the dump clock."""
        elapsed = time.time() - self._latest_dump_timestamp
        metrics.emit_timer(name='data_block_dump_duration',
                           value=int(elapsed),
                           tags=self._metrics_tags)
        self._latest_dump_timestamp = time.time()

    def _acuqire_state_stale(self):
        """Set the stale flag."""
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        """Clear the stale flag."""
        with self._lock:
            self._state_stale = False

    def _set_join_finished(self):
        """Set the join-finished flag."""
        with self._lock:
            self._join_finished = True

    def _need_finish_data_block_since_interval(self):
        """Return whether the configured dump interval has elapsed.

        Always false when the interval is not positive.
        """
        dump_interval = self._example_joiner_options.data_block_dump_interval
        elapsed = time.time() - self._latest_dump_timestamp
        return dump_interval > 0 and dump_interval <= elapsed
Esempio n. 10
0
class ExampleJoiner(object):
    """Base class for joining leader example ids with follower raw data.

    Subclasses implement ``join_example``; this class owns the visitors,
    the data block manager and the data block builder life cycle.
    """

    def __init__(self, etcd, data_source, partition_id, options):
        """Build the visitors and the manager, then sync the state."""
        self._data_source = data_source
        self._partition_id = partition_id
        example_id_manager = ExampleIdManager(data_source, partition_id)
        self._leader_visitor = ExampleIdVisitor(example_id_manager)
        self._follower_visitor = RawDataVisitor(
            etcd, data_source, partition_id, options
        )
        self._data_block_manager = DataBlockManager(data_source, partition_id)

        # No data block is being built yet.
        self._data_block_builder = None
        self._stale_with_dfs = False
        self._follower_restart_index = 0
        self._sync_state()

    def join_example(self):
        """Joining entry point; must be overridden by subclasses."""
        message = ("join exampel not implement for base class: %s" %
                   ExampleJoiner.name())
        raise NotImplementedError(message)

    @classmethod
    def name(cls):
        """Return the joiner name."""
        return 'EXAMPLE_JOINER'

    def get_data_block_number(self):
        """Return the manager's number of dumped data blocks."""
        manager = self._data_block_manager
        return manager.num_dumped_data_block()

    def get_data_block_meta(self, index):
        """Return the data block meta stored at ``index``."""
        manager = self._data_block_manager
        return manager.get_data_block_meta_by_index(index)

    def join_finished(self):
        """Return whether the manager reports the join as finished."""
        manager = self._data_block_manager
        return manager.join_finished()

    def _sync_state(self):
        """Reposition both visitors from the last data block meta.

        ``self._stale_with_dfs`` is passed to the manager when fetching the
        last meta and is cleared afterwards. When a meta exists, each
        visitor seeks to its stored index (``StopIteration`` is logged and
        ignored) and the join is marked finished if either visitor is
        finished.
        """
        manager = self._data_block_manager
        last_meta = manager.get_last_data_block_meta(self._stale_with_dfs)
        if last_meta is not None:
            leader = self._leader_visitor
            follower = self._follower_visitor
            try:
                leader.seek(last_meta.leader_end_index)
            except StopIteration:
                logging.warning("leader visitor finished")
            try:
                follower.seek(last_meta.follower_restart_index)
            except StopIteration:
                logging.warning("follower visitor finished")
            if leader.finished() or follower.finished():
                manager.finish_join()
        self._stale_with_dfs = False

    def _get_data_block_builder(self):
        """Return the current builder, creating one if none exists."""
        if self._data_block_builder is None:
            next_index = self._data_block_manager.get_dumped_data_block_num()
            self._data_block_builder = DataBlockBuilder(
                self._data_source.data_block_dir, self._partition_id,
                next_index,
                self._data_source.data_source_meta.max_example_in_data_block)
        return self._data_block_builder

    def _finish_data_block(self):
        """Finish the current data block and register its meta.

        A builder must exist. The meta is added to the manager only when
        the builder returns one; the builder is dropped afterwards.
        """
        assert self._data_block_builder is not None
        builder = self._data_block_builder
        builder.set_follower_restart_index(self._follower_restart_index)
        builder.finish_data_block()
        finished_meta = builder.get_data_block_meta()
        if finished_meta is not None:
            self._data_block_manager.add_dumped_data_block_meta(finished_meta)
        self._data_block_builder = None
Esempio n. 11
0
class ExampleIdBatchFetcher(object):
    """Fetch example ids of one partition from raw data in ordered batches.

    Fetched batches are kept in ``self._batch_queue`` ordered by
    ``begin_index`` until they are evicted; ``self._lock`` guards the queue,
    the in-flight counter and the progress flags.
    """

    class ExampleIdBatch(object):
        """A consecutive run of example ids starting at ``begin_index``."""

        def __init__(self, partition_id, begin_index):
            self._lite_example_ids = dj_pb.LiteExampleIds(
                partition_id=partition_id, begin_index=begin_index)

        def append(self, example_id, event_time):
            """Append one example id together with its event time."""
            ids = self._lite_example_ids
            ids.example_id.append(example_id)
            ids.event_time.append(event_time)

        @property
        def begin_index(self):
            """Index of the first example id in this batch."""
            return self._lite_example_ids.begin_index

        @property
        def lite_example_ids(self):
            """The underlying ``LiteExampleIds`` message."""
            return self._lite_example_ids

        @property
        def partition_id(self):
            """Partition this batch belongs to."""
            return self._lite_example_ids.partition_id

        def __len__(self):
            return len(self._lite_example_ids.example_id)

        def __lt__(self, other):
            # Batches of the same partition are ordered by begin_index; this
            # is the ordering bisect relies on.
            assert isinstance(other, ExampleIdBatchFetcher.ExampleIdBatch)
            assert self.partition_id == other.partition_id
            return self.begin_index < other.begin_index

    def __init__(self, etcd, data_source, partition_id, raw_data_options,
                 example_id_batch_options):
        """Build the raw data visitor and start with an empty queue."""
        self._lock = threading.Lock()
        self._partition_id = partition_id
        self._raw_data_visitor = RawDataVisitor(
            etcd, data_source, partition_id, raw_data_options
        )
        self._example_id_batch_options = example_id_batch_options
        self._flying_example_id_count = 0
        self._batch_queue = []
        self._raw_data_finished = False
        self._fetch_finished = False
        # Set by _set_fetch_finished; None until then.
        self._last_index = None

    def need_fetch(self, next_index):
        """Return whether more example ids should be fetched.

        False for a ``None`` index or an index beyond the recorded last
        index; True when ``next_index`` does not line up with the queue;
        otherwise true only while the in-flight count is below
        ``max_flying_example_id``.
        """
        with self._lock:
            if next_index is None:
                return False
            last_index = self._last_index
            if last_index is not None and next_index > last_index:
                assert self._fetch_finished
                return False
            return self._check_index_rollback(next_index) or \
                    self._flying_example_id_count < \
                    self._example_id_batch_options.max_flying_example_id

    def set_raw_data_finished(self):
        """Record that raw data is finished."""
        with self._lock:
            self._raw_data_finished = True

    def is_raw_data_finished(self):
        """Return the raw-data-finished flag."""
        with self._lock:
            return self._raw_data_finished

    @contextmanager
    def make_fetcher(self, next_index):
        """Yield the batch generator starting from ``next_index``."""
        yield self._inner_fetcher(next_index)

    def _inner_fetcher(self, next_index):
        """Generate batches from the raw data visitor.

        The queue is cleared first when ``next_index`` does not line up
        with it; when batches are queued, fetching resumes right after the
        last one. Every produced batch is appended to the queue before it
        is yielded. A non-consecutive index from the visitor terminates the
        process.
        """
        with self._lock:
            if next_index is None:
                return
            if self._check_index_rollback(next_index):
                self._batch_queue = []
                self._flying_example_id_count = 0
            if self._batch_queue:
                tail = self._batch_queue[-1]
                next_index = tail.begin_index + len(tail)
            # Sampled once, before any fetching happens.
            raw_data_done = self._raw_data_finished
        assert next_index >= 0, "the next index should >= 0"
        visitor = self._raw_data_visitor
        visitor.active_visitor()
        if next_index == 0:
            visitor.reset()
        else:
            visitor.seek(next_index - 1)
        while not (visitor.finished() or self._fly_example_id_full()):
            batch = ExampleIdBatchFetcher.ExampleIdBatch(
                self._partition_id, next_index)
            for (index, item) in visitor:
                if index != next_index:
                    logging.fatal("index is for partition %d not consecutive, "\
                                  "%d != %d",
                                  self._partition_id, index, next_index)
                    os._exit(-1)  # pylint: disable=protected-access
                batch.append(item.example_id, item.event_time)
                next_index += 1
                if len(batch) > \
                        self._example_id_batch_options.example_id_batch_size:
                    break
            self._append_new_example_id_batch(batch)
            yield batch
        if raw_data_done and visitor.finished():
            self._set_fetch_finished(visitor.get_index())

    def fetch_example_id_batch_by_index(self, next_index, hit_idx=None):
        """Look up the queued batch that begins at ``next_index``.

        ``hit_idx`` is an optional queue position to try before falling
        back to binary search.

        Returns:
            ``(finished, batch, idx)``; ``batch`` is ``None`` when no
            queued batch begins at ``next_index``.
        """
        with self._lock:
            if next_index is None:
                return False, None, hit_idx
            last_index = self._last_index
            if last_index is not None and last_index < next_index:
                assert self._fetch_finished
                return True, None, None
            queue = self._batch_queue
            queue_len = len(queue)
            if queue_len == 0:
                return False, None, 0
            tail = queue[-1]
            tail_end = tail.begin_index + len(tail)
            # Fast path: try the position suggested by the caller.
            if hit_idx is not None:
                if hit_idx < queue_len:
                    hinted = queue[hit_idx]
                    if hinted.begin_index == next_index:
                        return False, hinted, hit_idx
                elif next_index >= tail_end:
                    return self._fetch_finished, None, hit_idx
            # Slow path: binary search by begin_index.
            probe = ExampleIdBatchFetcher.ExampleIdBatch(
                self._partition_id, next_index)
            pos = bisect.bisect_left(queue, probe)
            if pos == queue_len:
                if tail_end >= next_index:
                    return self._fetch_finished, None, queue_len
            elif queue[pos].begin_index == next_index:
                return False, queue[pos], pos
            logging.warning("next_index %d rollback! check it", next_index)
            return False, None, None

    def evict_staless_example_id_batch(self, dumped_index):
        """Drop leading batches whose last index is <= ``dumped_index``.

        Returns the number of evicted batches; nothing is evicted when
        ``dumped_index`` is ``None``.
        """
        with self._lock:
            evicted = 0
            if dumped_index is not None:
                for batch in self._batch_queue:
                    if batch.begin_index + len(batch) - 1 > dumped_index:
                        break
                    evicted += 1
                    self._flying_example_id_count -= len(batch)
            self._batch_queue = self._batch_queue[evicted:]
            return evicted

    def _append_new_example_id_batch(self, next_batch):
        """Queue ``next_batch``; terminate the process if it leaves a gap."""
        with self._lock:
            if self._batch_queue:
                tail = self._batch_queue[-1]
                expected_index = tail.begin_index + len(tail)
                if next_batch.begin_index != expected_index:
                    logging.fatal("next batch index is not consecutive!"\
                                  "%d(expected_index) != %d(supply_index)",
                                  expected_index, next_batch.begin_index)
                    os._exit(-1)  # pylint: disable=protected-access
            self._batch_queue.append(next_batch)
            self._flying_example_id_count += len(next_batch)

    def _check_index_rollback(self, next_index):
        """Return True when ``next_index`` does not line up with the queue.

        An empty queue counts as not lined up. The index lines up when it
        directly follows the last batch or equals the ``begin_index`` of a
        queued batch.
        """
        assert next_index is not None
        queue = self._batch_queue
        if not queue:
            return True
        tail = queue[-1]
        # Fast path: the index directly follows the last queued batch.
        if next_index == tail.begin_index + len(tail):
            return False
        # Slow path: binary search for a batch starting at next_index.
        probe = ExampleIdBatchFetcher.ExampleIdBatch(
            self._partition_id, next_index)
        pos = bisect.bisect_left(queue, probe)
        if pos == len(queue):
            return next_index != tail.begin_index + len(tail)
        return queue[pos].begin_index != next_index

    def _fly_example_id_full(self):
        """Return whether the in-flight count exceeds the configured max."""
        with self._lock:
            limit = self._example_id_batch_options.max_flying_example_id
            return self._flying_example_id_count > limit

    def _set_fetch_finished(self, last_index):
        """Mark fetching finished and record the last index."""
        with self._lock:
            self._fetch_finished = True
            self._last_index = last_index
Esempio n. 12
0
class DataBlockDumperManager(object):
    """Dump data blocks for one partition from synced data block metas.

    Metas are queued by ``append_synced_data_block_meta`` in strict
    ``data_block_index`` order and consumed by ``dump_data_blocks``, which
    rebuilds every block from the raw data visitor. ``self._lock`` guards
    the meta queue, the next expected index and the status flags.
    """

    def __init__(self, etcd, data_source, partition_id):
        """Build the manager and visitor; resume from the dumped count."""
        self._lock = threading.Lock()
        self._data_source = data_source
        self._partition_id = partition_id
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        self._raw_data_visitor = RawDataVisitor(
                etcd, data_source, partition_id
            )
        self._next_data_block_index = (
                self._data_block_manager.get_dumped_data_block_num()
            )
        self._fly_data_block_meta = []
        self._stale_with_dfs = False
        self._synced_data_block_meta_finished = False

    def get_partition_id(self):
        """Return the partition id this manager was created for."""
        return self._partition_id

    def get_next_data_block_index(self):
        """Return the data block index expected from the next synced meta."""
        with self._lock:
            return self._next_data_block_index

    def append_synced_data_block_meta(self, meta):
        """Queue ``meta`` if it carries the next expected index.

        Returns:
            ``(accepted, next_data_block_index)``.
        """
        with self._lock:
            if self._next_data_block_index != meta.data_block_index:
                return False, self._next_data_block_index
            self._fly_data_block_meta.append(meta)
            self._next_data_block_index += 1
            return True, self._next_data_block_index

    def finish_sync_data_block_meta(self):
        """Record that no more data block metas will be synced."""
        with self._lock:
            self._synced_data_block_meta_finished = True

    def need_dump(self):
        """Return whether metas are queued or the state is stale."""
        with self._lock:
            return bool(self._fly_data_block_meta) or self._stale_with_dfs

    def dump_data_blocks(self):
        """Dump every queued meta, in order, until the queue is empty.

        Any exception is logged and marks the state as stale so that
        ``need_dump`` reports True and the next call re-syncs first.
        """
        try:
            self._sync_with_dfs()
            while True:
                with self._lock:
                    _, meta = self._get_next_data_block_meta()
                if meta is None:
                    return
                self._create_data_block_by_meta(meta)
        except Exception as e: # pylint: disable=broad-except
            logging.error("Failed to dump data block for partition "\
                          "%d with expect %s", self._partition_id, e)
            with self._lock:
                self._stale_with_dfs = True

    def data_block_meta_sync_finished(self):
        """Return whether meta syncing has been marked finished."""
        with self._lock:
            return self._synced_data_block_meta_finished

    def _get_next_data_block_meta(self):
        """Return ``(finished, meta)`` for the head of the queue.

        Must be called with ``self._lock`` held. ``meta`` is ``None`` when
        the queue is empty; ``finished`` is True only when the queue is
        empty and syncing has been marked finished.
        """
        if not self._fly_data_block_meta:
            return self._synced_data_block_meta_finished, None
        return False, self._fly_data_block_meta[0]

    @contextmanager
    def _make_data_block_builder(self, meta):
        """Yield a ``DataBlockBuilder`` initialised from ``meta``.

        Failures are logged and re-raised. Swallowing them here would let
        ``dump_data_blocks`` retry the same meta in a tight loop without
        ever marking the state as stale.
        """
        assert self._data_block_manager is not None
        assert self._partition_id == meta.partition_id
        try:
            builder = DataBlockBuilder(
                    self._data_source.data_block_dir,
                    self._partition_id,
                    meta.data_block_index,
                )
            builder.init_by_meta(meta)
            yield builder
        except Exception as e: # pylint: disable=broad-except
            logging.warning(
                    "Failed make data block builder, reason %s", e
                )
            raise

    def _create_data_block_by_meta(self, meta):
        """Rebuild and dump the data block described by ``meta``.

        Raw data is scanned from ``meta.leader_start_index``. Items whose
        example id matches the next expected id are appended; example ids
        that were not found are filled in with examples carrying only the
        ``example_id`` feature. On success the meta is registered with the
        data block manager and removed from the queue.
        """
        if meta is None:
            return
        with self._make_data_block_builder(meta) as data_block_builder:
            try:
                if meta.leader_start_index == 0:
                    self._raw_data_visitor.reset()
                else:
                    assert meta.leader_start_index > 0
                    self._raw_data_visitor.seek(meta.leader_start_index-1)
            except StopIteration:
                logging.fatal("raw data finished before when seek to %d",
                              meta.leader_start_index-1)
                os._exit(-1) # pylint: disable=protected-access
            match_index = 0
            example_num = len(meta.example_ids)
            # With no example ids there is nothing to match, and indexing
            # example_ids[0] below would raise IndexError.
            if example_num > 0:
                for (index, item) in self._raw_data_visitor:
                    if item.example_id == meta.example_ids[match_index]:
                        data_block_builder.append_raw_example(item.record)
                        match_index += 1
                    if match_index >= example_num:
                        break
                    if index >= meta.leader_end_index:
                        break
            # Fill in placeholders for example ids missing from raw data.
            for idx in range(match_index, example_num):
                example_id = meta.example_ids[idx]
                feat = {
                    'example_id': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[example_id]))
                }
                empty_example = tf.train.Example(
                    features=tf.train.Features(feature=feat))
                data_block_builder.append_raw_example(
                        empty_example.SerializeToString()
                    )
            data_block_builder.finish_data_block()
            assert meta == data_block_builder.get_data_block_meta()
            self._data_block_manager.add_dumped_data_block_meta(meta)
            with self._lock:
                assert self._fly_data_block_meta[0] == meta
                self._fly_data_block_meta.pop(0)

    def _sync_with_dfs(self):
        """Drop queued metas that the manager reports as already dumped.

        The stale flag (not this method object, as before) is passed to the
        manager, and the flag is cleared once the queue has been trimmed.
        """
        with self._lock:
            stale = self._stale_with_dfs
        dumped_num = self._data_block_manager.get_dumped_data_block_num(stale)
        with self._lock:
            skip_count = 0
            for meta in self._fly_data_block_meta:
                if meta.data_block_index >= dumped_num:
                    break
                skip_count += 1
            self._fly_data_block_meta = self._fly_data_block_meta[skip_count:]
            self._stale_with_dfs = False
Esempio n. 13
0
class DataBlockDumperManager(object):
    """Queue data block metas and dump the raw data they describe.

    Metas are appended through ``add_synced_data_block_meta`` in strictly
    increasing ``data_block_index`` order and consumed from the front of
    ``_fly_data_block_meta`` by the dumper obtained from
    ``make_data_block_dumper``. All mutable bookkeeping is read and written
    while holding ``_lock``.
    """

    def __init__(self, kvstore, data_source, partition_id, raw_data_options,
                 data_block_builder_options):
        """Set up the visitor, the data block manager and the bookkeeping.

        The next expected data block index starts at the number of data
        blocks the data block manager reports as already dumped.
        """
        self._lock = threading.Lock()
        self._data_source = data_source
        self._partition_id = partition_id
        self._data_block_manager = DataBlockManager(data_source, partition_id)
        self._raw_data_visitor = RawDataVisitor(
            kvstore, data_source, partition_id, raw_data_options)
        self._data_block_builder_options = data_block_builder_options
        self._next_data_block_index = (
            self._data_block_manager.get_dumped_data_block_count())
        # Metas accepted but not yet dumped, in data_block_index order.
        self._fly_data_block_meta = []
        # True while a dumper handed out by make_data_block_dumper is in use.
        self._state_stale = False
        self._synced_data_block_meta_finished = False
        self._metrics_tags = {
            'data_source_name': self._data_source.data_source_meta.name,
            'partition': self._partition_id,
        }
        self._optional_stats = OptionalStats(raw_data_options,
                                             self._metrics_tags)

    def get_next_data_block_index(self):
        """Return the data block index the next added meta must carry."""
        with self._lock:
            return self._next_data_block_index

    def get_dumped_data_block_index(self):
        """Return the dumped data block count minus one."""
        dumped_count = self._data_block_manager.get_dumped_data_block_count()
        return dumped_count - 1

    def add_synced_data_block_meta(self, meta):
        """Append ``meta`` if it carries the expected data block index.

        Returns:
            ``(True, next_index)`` when the meta was queued, where
            ``next_index`` is the index expected after it; otherwise
            ``(False, expected_index)`` and nothing is queued.

        Raises:
            RuntimeError: if ``finish_sync_data_block_meta`` was called
                before.
        """
        with self._lock:
            if self._synced_data_block_meta_finished:
                raise RuntimeError("data block dumper manager has been mark as "
                                   "no more data block meta")
            expected_index = self._next_data_block_index
            if meta.data_block_index != expected_index:
                return False, expected_index
            self._fly_data_block_meta.append(meta)
            self._next_data_block_index = expected_index + 1
            return True, self._next_data_block_index

    def finish_sync_data_block_meta(self):
        """Mark that no further data block meta will be added."""
        with self._lock:
            self._synced_data_block_meta_finished = True

    def need_dump(self):
        """Return True when at least one queued meta awaits dumping."""
        with self._lock:
            return bool(self._fly_data_block_meta)

    def is_synced_data_block_meta_finished(self):
        """Return whether ``finish_sync_data_block_meta`` has been called."""
        with self._lock:
            return self._synced_data_block_meta_finished

    @contextmanager
    def make_data_block_dumper(self):
        """Context manager yielding the bound ``_dump_data_blocks`` callable.

        On entry, if the stale flag is still set, metas already counted as
        dumped by the data block manager are evicted; then the flag is set.
        The flag is cleared only when the ``with`` body finishes without
        raising, so after a failure the next entry performs the eviction
        (presumably intentional -- there is no try/finally here).
        """
        self._sync_with_data_block_manager()
        self._acquire_state_stale()
        yield self._dump_data_blocks
        self._release_state_stale()

    def _dump_data_blocks(self):
        """Dump queued metas one by one until the queue is empty.

        Emits a ``data_block_dump_duration`` timer, truncated to whole
        seconds, for every meta dumped.
        """
        while self.need_dump():
            head_meta = self._get_next_data_block_meta()
            if head_meta is None:
                continue
            begin_tm = time.time()
            self._raw_data_visitor.active_visitor()
            self._dump_data_block_by_meta(head_meta)
            elapsed = time.time() - begin_tm
            metrics.emit_timer(name='data_block_dump_duration',
                               value=int(elapsed),
                               tags=self._metrics_tags)

    def data_block_meta_sync_finished(self):
        """Return whether ``finish_sync_data_block_meta`` has been called."""
        with self._lock:
            return self._synced_data_block_meta_finished

    def _acquire_state_stale(self):
        """Set the stale flag."""
        with self._lock:
            self._state_stale = True

    def _release_state_stale(self):
        """Clear the stale flag."""
        with self._lock:
            self._state_stale = False

    def _get_next_data_block_meta(self):
        """Return the first queued meta without removing it, else None."""
        with self._lock:
            pending = self._fly_data_block_meta
            return pending[0] if pending else None

    @contextmanager
    def _make_data_block_builder(self, meta):
        """Context manager yielding a ``DataBlockBuilder`` set up for ``meta``.

        Any ``Exception`` raised while building, or by the ``with`` body, is
        logged as a warning and re-raised after the local builder reference
        has been dropped.
        """
        assert self._partition_id == meta.partition_id, \
            "partition id of building data block meta mismatch "\
            "{} != {}".format(self._partition_id, meta.partition_id)
        builder = None
        failure = None
        try:
            block_dir = common.data_source_data_block_dir(self._data_source)
            builder = DataBlockBuilder(
                block_dir, self._data_source.data_source_meta.name,
                self._partition_id, meta.data_block_index,
                self._data_block_builder_options)
            builder.init_by_meta(meta)
            builder.set_data_block_manager(self._data_block_manager)
            yield builder
        except Exception as err:  # pylint: disable=broad-except
            logging.warning("Failed make data block builder, reason %s", err)
            failure = err
        # Let go of the builder before the deferred re-raise below.
        builder = None
        if failure is not None:
            raise failure

    @staticmethod
    def _item_matches(meta, position, index, example_id, by_index):
        """Tell whether the visited item is the ``position``-th meta entry.

        With ``by_index`` the raw data index is compared against
        ``meta.indices``; otherwise the example id is compared against
        ``meta.example_ids``.
        """
        if by_index:
            return meta.indices[position] == index
        return example_id == meta.example_ids[position]

    def _dump_data_block_by_meta(self, meta):
        """Write the raw data items listed in ``meta`` into a data block.

        The raw data visitor is reset (start index 0) or sought to
        ``leader_start_index - 1``, then walked until every entry of the
        meta is matched or ``leader_end_index`` is reached. The process is
        terminated via ``os._exit(-1)`` when the seek hits the end of raw
        data or when not every entry could be matched. On success the meta
        is removed from the front of the queue.
        """
        assert meta is not None, "input data block must not be None"
        with self._make_data_block_builder(meta) as data_block_builder:
            visitor = self._raw_data_visitor
            try:
                if meta.leader_start_index == 0:
                    visitor.reset()
                else:
                    assert meta.leader_start_index > 0, \
                        "leader start index must be positive"
                    visitor.seek(meta.leader_start_index - 1)
            except StopIteration:
                logging.fatal("raw data finished before when seek to %d",
                              meta.leader_start_index - 1)
                traceback.print_stack()
                os._exit(-1)  # pylint: disable=protected-access
            matched = 0
            total = len(meta.example_ids)
            # Non-empty meta.indices switches matching to raw data indices.
            by_index = len(meta.indices) > 0

            for (index, item) in visitor:
                example_id = item.example_id
                matched_before = matched
                # The same item may satisfy several consecutive meta
                # entries, because entries in the meta may be duplicated.
                while matched < total and self._item_matches(
                        meta, matched, index, example_id, by_index):
                    if len(meta.joined) > 0:
                        item.add_extra_fields(
                            {'joined': meta.joined[matched]}, True)
                    data_block_builder.write_item(item)
                    self._optional_stats.update_stats(item, kind='joined')
                    matched += 1
                if matched == matched_before:
                    self._optional_stats.update_stats(item, kind='unjoined')
                if matched >= total or index >= meta.leader_end_index:
                    break
            if matched < total:
                logging.fatal(
                    "Data lose corrupt! only match %d/%d example "
                    "for data block %s", matched, total, meta.block_id)
                traceback.print_stack()
                os._exit(-1)  # pylint: disable=protected-access
            dumped_meta = data_block_builder.finish_data_block(True)
            self._optional_stats.emit_optional_stats()
            assert dumped_meta == meta, "the generated dumped meta should "\
                                        "be the same with input mata"
            with self._lock:
                assert self._fly_data_block_meta[0] == meta
                del self._fly_data_block_meta[0]

    def _is_state_stale(self):
        """Return the current value of the stale flag."""
        with self._lock:
            return self._state_stale

    def _sync_with_data_block_manager(self):
        """Evict already-dumped metas, but only when the stale flag is set."""
        if not self._is_state_stale():
            return
        self._evict_dumped_data_block_meta()

    def _evict_dumped_data_block_meta(self):
        """Drop leading queued metas the data block manager counts as dumped.

        A meta is dropped while its ``data_block_index`` is below the dumped
        data block count; scanning stops at the first one that is not.
        """
        dumped_count = self._data_block_manager.get_dumped_data_block_count()
        with self._lock:
            pending = self._fly_data_block_meta
            # Default: every pending meta is already dumped.
            first_kept = len(pending)
            for position, pending_meta in enumerate(pending):
                if pending_meta.data_block_index >= dumped_count:
                    first_kept = position
                    break
            self._fly_data_block_meta = pending[first_kept:]