def __init__(self, etcd, options):
    """Build the fetcher around a file-based mock raw data visitor.

    Args:
        etcd: handed to `FileBasedMockRawDataVisitor` as its first argument.
        options: partitioner options. The fields read here are
            `batch_processor_options` (`max_flying_item`, `batch_size`),
            `raw_data_options`, `partitioner_name`,
            `partitioner_rank_id` and `input_file_paths`.
    """
    batch_options = options.batch_processor_options
    super(RawDataBatchFetcher, self).__init__(batch_options.max_flying_item)
    # The mock data source name is derived from the partitioner name and
    # its zero-padded rank id.
    mock_source_name = '{}-partitioner-mock-data-source-{:04}'.format(
        options.partitioner_name, options.partitioner_rank_id
    )
    self._raw_data_visitor = FileBasedMockRawDataVisitor(
        etcd, options.raw_data_options, mock_source_name,
        options.input_file_paths
    )
    self._batch_size = batch_options.batch_size
    # set_input_finished() is called unconditionally at construction time.
    self.set_input_finished()
class RawDataBatchFetcher(ItemBatchSeqProcessor):
    """Batch processor fed by a `FileBasedMockRawDataVisitor`.

    Items read from the visitor are appended to `RawDataBatch` objects; a
    batch is closed once its length reaches the configured batch size.
    """

    def __init__(self, kvstore, options):
        """Create the visitor, the metric emitter and the batch settings.

        Args:
            kvstore: handed to `FileBasedMockRawDataVisitor` as its first
                argument.
            options: partitioner options. The fields read here are
                `batch_processor_options` (`max_flying_item`,
                `batch_size`), `raw_data_options`, `partitioner_name`,
                `partitioner_rank_id` and `input_file_paths`.
        """
        batch_options = options.batch_processor_options
        super(RawDataBatchFetcher, self).__init__(
            batch_options.max_flying_item
        )
        # Mock data source name: partitioner name + zero-padded rank id.
        mock_source_name = '{}-partitioner-mock-data-source-{:04}'.format(
            options.partitioner_name, options.partitioner_rank_id
        )
        self._raw_data_visitor = FileBasedMockRawDataVisitor(
            kvstore, options.raw_data_options, mock_source_name,
            options.input_file_paths
        )
        self._batch_size = batch_options.batch_size
        self._metrics_tags = {
            'partition_name': options.partitioner_name,
            'partition': options.partitioner_rank_id
        }
        self._metric_stats = MetricStats(
            options.raw_data_options, self._metrics_tags
        )
        # set_input_finished() is called unconditionally at construction.
        self.set_input_finished()

    @classmethod
    def name(cls):
        """Return the fixed name string of this processor."""
        return 'RawDataBatchFetcher'

    def _make_item_batch(self, begin_index):
        """Return a new `RawDataBatch` created with `begin_index`."""
        return RawDataBatch(begin_index)

    def _make_inner_generator(self, next_index):
        """Yield `(batch, visitor_finished)` pairs starting at `next_index`.

        The visitor is reset when `next_index` is 0, otherwise it is sought
        to `next_index - 1`. Batches are produced until the visitor is
        finished or `_fly_item_full()` reports true; one extra batch
        created at the current index is always yielded last.

        If the visitor hands back an index different from the expected
        one, the error is logged, the stack is printed and the process is
        terminated via `os._exit(-1)`.
        """
        assert next_index is not None
        expected_index = next_index
        if expected_index != 0:
            self._raw_data_visitor.seek(expected_index - 1)
        else:
            self._raw_data_visitor.reset()
        while True:
            if self._raw_data_visitor.finished() or self._fly_item_full():
                break
            batch = self._make_item_batch(expected_index)
            for item_index, item in self._raw_data_visitor:
                if item_index != expected_index:
                    # Indices must be consecutive; abort the whole process
                    # rather than continue with a gap.
                    logging.fatal(
                        "batch raw data visitor is not consecutive, "
                        "%d != %d", item_index, expected_index
                    )
                    traceback.print_stack()
                    os._exit(-1) # pylint: disable=protected-access
                self._metric_stats.emit_metric(item)
                batch.append(item)
                expected_index += 1
                if len(batch) >= self._batch_size:
                    break
            yield batch, self._raw_data_visitor.finished()
        # Trailing batch created at the index reached so far.
        tail_batch = self._make_item_batch(expected_index)
        yield tail_batch, self._raw_data_visitor.finished()

    def cleanup_visitor_meta_data(self):
        """Delegate to the visitor's `cleanup_meta_data()`."""
        self._raw_data_visitor.cleanup_meta_data()
 def __init__(self, kvstore, options):
     """Create the visitor, the metric emitter and the batch settings.

     Args:
         kvstore: handed to `FileBasedMockRawDataVisitor` as its first
             argument.
         options: partitioner options. The fields read here are
             `batch_processor_options` (`max_flying_item`, `batch_size`),
             `raw_data_options`, `partitioner_name`,
             `partitioner_rank_id` and `input_file_paths`.
     """
     batch_options = options.batch_processor_options
     super(RawDataBatchFetcher, self).__init__(
         batch_options.max_flying_item
     )
     # Mock data source name: partitioner name + zero-padded rank id.
     mock_source_name = '{}-partitioner-mock-data-source-{:04}'.format(
         options.partitioner_name, options.partitioner_rank_id
     )
     self._raw_data_visitor = FileBasedMockRawDataVisitor(
         kvstore, options.raw_data_options, mock_source_name,
         options.input_file_paths
     )
     self._batch_size = batch_options.batch_size
     self._metrics_tags = {
         'partition_name': options.partitioner_name,
         'partition': options.partitioner_rank_id
     }
     self._metric_stats = MetricStats(
         options.raw_data_options, self._metrics_tags
     )
     # set_input_finished() is called unconditionally at construction.
     self.set_input_finished()
Example #4
0
 def _create_file_based_mock_visitor(self):
     """Return a `FileBasedMockRawDataVisitor` over the input files.

     The raw data options are fixed here: the 'CSV_DICT' iterator with a
     read-ahead size of 134217728 (128 * 1024 * 1024; presumably bytes).
     """
     raw_data_options = dj_pb.RawDataOptions(
         raw_data_iter='CSV_DICT', read_ahead_size=134217728
     )
     # Mock data source name: preprocessor name + zero-padded partition id.
     mock_source_name = '{}-proprocessor-mock-data-source-{:04}'.format(
         self._options.preprocessor_name, self._options.partition_id
     )
     return FileBasedMockRawDataVisitor(
         self._etcd, raw_data_options, mock_source_name,
         self._options.input_file_paths
     )
 def _create_file_based_mock_visitor(self):
     """Return a `FileBasedMockRawDataVisitor` over the input files.

     The raw data options come from `self._options.input_raw_data`.
     """
     # Mock data source name: preprocessor name + zero-padded partition id.
     mock_source_name = '{}-proprocessor-mock-data-source-{:04}'.format(
         self._options.preprocessor_name, self._options.partition_id
     )
     return FileBasedMockRawDataVisitor(
         self._etcd, self._options.input_raw_data, mock_source_name,
         self._options.input_file_paths
     )