Esempio n. 1
0
 def _raw_data_batch_fetch_cond(self):
     next_part_index = self._get_next_part_index()
     fetcher = self._raw_data_batch_fetcher
     fly_item_cnt = fetcher.get_flying_item_count()
     return self._raw_data_batch_fetcher.need_process(next_part_index) and \
             not common.get_heap_mem_stats(None).CheckOomRisk(
                 fly_item_cnt, self._options.memory_limit_ratio)
Esempio n. 2
0
 def _raw_data_part_fn(self):
     if self._check_finished_tag():
         logging.warning("raw data has been parttedfor rank id of parti"\
                         "tioner %d", self._options.partitioner_rank_id)
         self._notify_part_finished()
         return
     self._sync_partitioner_state()
     assert self._dumped_process_index is not None
     assert len(self._flying_writers) == 0
     fetcher = self._raw_data_batch_fetcher
     fetch_finished = False
     next_index = self._get_next_part_index()
     hint_index = None
     bp_options = self._options.batch_processor_options
     round_dumped_item = 0
     while not fetch_finished:
         fetch_finished, batch, hint_index = \
                 fetcher.fetch_item_batch_by_index(next_index, hint_index)
         if batch is not None:
             for index, item in enumerate(batch):
                 raw_id = getattr(item, self._part_field)
                 partition_id = CityHash32(raw_id) % \
                         self._options.output_partition_num
                 writer = self._get_file_writer(partition_id)
                 writer.append_item(batch.begin_index+index, item)
             next_index += len(batch)
             round_dumped_item += len(batch)
             fly_item_cnt = fetcher.get_flying_item_count()
             if round_dumped_item // self._options.output_partition_num \
                     > (1<<21) or \
                     common.get_heap_mem_stats(None).CheckOomRisk(
                         fly_item_cnt,
                         self._options.memory_limit_ratio-0.05):
                 self._finish_file_writers()
                 self._set_next_part_index(next_index)
                 hint_index = self._evict_staless_batch(hint_index,
                                                        next_index-1)
                 logging.info("consumed %d items", next_index-1)
                 gc_cnt = gc.collect()
                 logging.warning("finish writer partition trigger "\
                                 "gc %d actively", gc_cnt)
                 round_dumped_item = 0
                 self._wakeup_raw_data_fetcher()
         elif not fetch_finished:
             with self._cond:
                 self._cond.wait(1)
     self._finish_file_writers()
     self._dump_finished_tag()
     for partition_id, metas in self._dumped_file_metas.items():
         logging.info("part %d output %d files by partitioner",
                       partition_id, len(metas))
         for meta in metas:
             logging.info("%s", meta.encode_meta_to_fname())
         logging.info("-----------------------------------")
     self._notify_part_finished()
Esempio n. 3
0
 def _data_producer_cond(self):
     with self._lock:
         oom_risk = False
         if self._impl_ctx is not None:
             self._worker_map[self._producer_name()].setup_args(
                 self._impl_ctx)
             fly_item_cnt = self._impl_ctx.get_flying_item_cnt()
             oom_risk = common.get_heap_mem_stats(None).CheckOomRisk(
                 fly_item_cnt, 0.60)
         return self._impl_ctx is not None and not oom_risk and \
                 not self._impl_ctx.is_produce_finished()
Esempio n. 4
0
 def _raw_data_batch_fetch_fn(self):
     next_part_index = self._get_next_part_index()
     fetcher = self._raw_data_batch_fetcher
     for batch in fetcher.make_processor(next_part_index):
         logging.debug("fetch batch begin at %d, len %d. wakeup "\
                       "partitioner", batch.begin_index, len(batch))
         self._wakeup_partitioner()
         fly_item_cnt = fetcher.get_flying_item_count()
         if common.get_heap_mem_stats(None).CheckOomRisk(fly_item_cnt, 0.70):
             logging.warning('early stop the raw data fetch '\
                             'since the oom risk')
             break
Esempio n. 5
0
 def _data_producer_fn(self, impl_ctx):
     """Drive the producer of ``impl_ctx`` and wake the data consumer.

     Does nothing if the context already reports that producing has
     finished. Otherwise, for every non-None item yielded by the context's
     producer, the data consumer is woken up. Producing stops early as
     soon as ``CheckOomRisk`` (called with the context's flying item count
     and 0.50) reports risk.

     Args:
         impl_ctx: must be a ``TransmitLeader.ImplContext`` instance.
     """
     assert isinstance(impl_ctx, TransmitLeader.ImplContext)
     if impl_ctx.is_produce_finished():
         return
     for produced in impl_ctx.make_producer():
         if produced is None:
             continue
         self._wakeup_data_consumer()
         flying_items = impl_ctx.get_flying_item_cnt()
         heap_stats = common.get_heap_mem_stats(None)
         if not heap_stats.CheckOomRisk(flying_items, 0.50):
             continue
         logging.warning(
             "%s early stop produce item since oom risk", self._repr_str
         )
         break
Esempio n. 6
0
 def _stop_fetch_id(self):
     total_flying_item = self._produce_item_cnt - self._comsume_item_cnt
     if total_flying_item >= 5 << 20:
         logging.warning("stop fetch id since flying item "\
                         "reach to %d > 5m, produce_item_cnt: %d; "\
                         "consume_item_cnt: %d", total_flying_item,
                         self._produce_item_cnt, self._comsume_item_cnt)
         return True
     potential_mem_incr = total_flying_item * \
                          self._psi_rsa_signer.additional_item_mem_usage()
     if get_heap_mem_stats(None).CheckOomRisk(total_flying_item, 0.80,
                                              potential_mem_incr):
         logging.warning("stop fetch id since has oom risk for 0.80, "\
                         "flying item reach to %d", total_flying_item)
         return True
     return False
Esempio n. 7
0
 def _data_producer_cond(self):
     with self._lock:
         oom_risk = False
         if self._impl_ctx is not None:
             self._process_producer_hook(self._impl_ctx)
             self._worker_map[self._producer_name()].setup_args(
                     self._impl_ctx
                 )
             fly_item_cnt = self._impl_ctx.get_flying_item_cnt()
             oom_risk = common.get_heap_mem_stats(None).CheckOomRisk(
                     fly_item_cnt, 0.60
                 )
         status = self._impl_ctx is not None and not oom_risk and \
                 not self._impl_ctx.is_produce_finished()
         logging.debug("%s producer condition return %s",
                       self.__class__.__name__, status)
         return status