def extract_stats_parallel(
        index: io.Index, pool: Optional[multiprocessing.pool.Pool] = None
) -> List[StationStat]:
    """
    Extracts StationStat information in parallel from packets stored in the provided index.

    :param index: Index of packets to extract information from.
    :param pool: Optional multiprocessing pool.
    :return: A list of StationStat objects.
    """
    # Partition the index entries by the number of available cores
    num_cores: int = multiprocessing.cpu_count()
    partitioned: List[List[io.IndexEntry]] = _partition_list(index.entries, num_cores)
    indices: List[io.Index] = list(map(lambda entries: io.Index(entries), partitioned))

    # Map the partitions in parallel, but only when the workload is large enough
    nested: Iterator[List[StationStat]] = maybe_parallel_map(
        pool,
        extract_stats_serial,
        iter(indices),
        lambda: len(indices) > 128,
        chunk_size=64,
    )
    # Flatten the per-partition results into a single list
    return [item for sublist in nested for item in sublist]
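For reference, here is a minimal sketch of what `maybe_parallel_map` could look like, reconstructed purely from how it is called above and exercised in the tests below; the SDK's actual implementation may differ. The `settings` stand-in and the `MappingType` values mirror the names used in the tests, and since this version is a generator, the `usage_out` bookkeeping happens when iteration begins.

import multiprocessing
import multiprocessing.pool
from enum import Enum
from typing import Callable, Iterable, Iterator, List, Optional, TypeVar

T = TypeVar("T")
R = TypeVar("R")


class settings:
    """Stand-in for the SDK's settings module (assumption)."""
    _enabled: bool = False

    @classmethod
    def set_parallelism_enabled(cls, enabled: bool) -> None:
        cls._enabled = enabled

    @classmethod
    def is_parallelism_enabled(cls) -> bool:
        return cls._enabled


class MappingType(Enum):
    Serial = 0             # plain builtin map
    ParallelManaged = 1    # pool created and closed by maybe_parallel_map
    ParallelUnmanaged = 2  # caller-provided pool, left open for the caller


def maybe_parallel_map(
        pool: Optional[multiprocessing.pool.Pool],
        fn: Callable[[T], R],
        values: Iterable[T],
        condition: Optional[Callable[[], bool]] = None,
        chunk_size: int = 64,
        usage_out: Optional[List[MappingType]] = None,
) -> Iterator[R]:
    """Maps fn over values in parallel when parallelism is globally enabled
    and the optional condition holds; otherwise falls back to a serial map."""
    if settings.is_parallelism_enabled() and (condition is None or condition()):
        # Manage a pool ourselves only if the caller did not provide one
        _pool = multiprocessing.Pool() if pool is None else pool
        if usage_out is not None:
            usage_out.append(MappingType.ParallelManaged if pool is None
                             else MappingType.ParallelUnmanaged)
        try:
            yield from _pool.imap(fn, values, chunksize=chunk_size)
        finally:
            # Only close a pool this function created
            if pool is None:
                _pool.close()
    else:
        if usage_out is not None:
            usage_out.append(MappingType.Serial)
        yield from map(fn, values)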
def test_normal_managed_pool(self):
    settings.set_parallelism_enabled(True)
    usage_out = []
    res = maybe_parallel_map(None, map_fn, self.data, usage_out=usage_out)
    self.assertEqual(self.res, list(res))
    self.assertEqual(MappingType.ParallelManaged, usage_out[0])
    settings.set_parallelism_enabled(False)
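The tests here reference `map_fn`, `self.data`, and `self.res`, which are defined elsewhere in the test module. A minimal fixture consistent with their use might look like the following; the class name and values are illustrative. Note that `self.data` must hold at most 10 items so that the `len(self.data) > 10` condition in `test_parallel_condition_bad` below evaluates to False.

import unittest


def map_fn(value: int) -> int:
    # Top-level function so it can be pickled by multiprocessing workers
    return value * 2


class MaybeParallelMapTests(unittest.TestCase):
    def setUp(self) -> None:
        # Small enough that lambda: len(self.data) > 10 is False,
        # which test_parallel_condition_bad relies on
        self.data = list(range(10))
        self.res = [map_fn(v) for v in self.data]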
def get_stations(
        self, pool: Optional[multiprocessing.pool.Pool] = None
) -> List[Station]:
    """
    :param pool: Optional multiprocessing pool.
    :return: A list of all stations represented by the data packets.
    """
    station_ids: List[str] = self.index_summary.station_ids()
    stations_opt: Iterator[Optional[Station]] = maybe_parallel_map(
        pool,
        self.get_station_by_id,
        iter(station_ids),
        lambda: len(station_ids) > 2,
        chunk_size=1,
    )
    # noinspection Mypy
    return list(filter(lambda station: station is not None, stations_opt))
def test_normal_provided_pool(self):
    settings.set_parallelism_enabled(True)
    pool = Pool()
    usage_out = []
    res = maybe_parallel_map(pool, map_fn, self.data, usage_out=usage_out)
    self.assertEqual(self.res, list(res))
    self.assertEqual(MappingType.ParallelUnmanaged, usage_out[0])
    pool.close()
    settings.set_parallelism_enabled(False)
def test_parallel_condition_bad(self):
    settings.set_parallelism_enabled(True)
    usage_out = []
    res = maybe_parallel_map(
        None,
        map_fn,
        self.data,
        usage_out=usage_out,
        condition=lambda: len(self.data) > 10,
    )
    self.assertEqual(self.res, list(res))
    self.assertEqual(MappingType.Serial, usage_out[0])
    settings.set_parallelism_enabled(False)
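Taken together, the three tests pin down the dispatch rules: with parallelism enabled and no pool supplied, work is spread over a pool that `maybe_parallel_map` creates and closes itself (`ParallelManaged`); a caller-supplied pool is used but left open for the caller (`ParallelUnmanaged`); and a `condition` that evaluates to False forces a plain serial map even when parallelism is globally enabled (`Serial`), avoiding pool overhead for small workloads.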
def get_stations(
        self, pool: Optional[multiprocessing.pool.Pool] = None
) -> List[Station]:
    """
    :param pool: Optional multiprocessing pool.
    :return: List of all stations in the ApiReader.
    """
    return list(
        maybe_parallel_map(pool, self._station_by_index, self.files_index, chunk_size=1)
    )
def _read_stations(
        self, pool: Optional[multiprocessing.pool.Pool] = None
) -> List[Station]:
    """
    :param pool: Optional multiprocessing pool.
    :return: List of all stations in the ApiReader, without building the data from parquet.
    """
    if settings.is_parallelism_enabled() and len(self.files_index) > 1:
        return list(
            maybe_parallel_map(pool, self._station_by_index, self.files_index, chunk_size=1)
        )
    return list(map(self._station_by_index, self.files_index))
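The explicit guard in `_read_stations` duplicates the global check that `maybe_parallel_map` performs internally, adding a size threshold on top. Assuming the semantics sketched earlier, the same behavior could be expressed through the `condition` hook (an illustrative alternative, not the SDK's code):

def _read_stations(
        self, pool: Optional[multiprocessing.pool.Pool] = None
) -> List[Station]:
    # maybe_parallel_map already consults the global setting; the condition
    # supplies the "more than one index entry" threshold from above.
    return list(
        maybe_parallel_map(pool, self._station_by_index, self.files_index,
                           lambda: len(self.files_index) > 1, chunk_size=1)
    )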
def create_data_window(self, pool: Optional[multiprocessing.pool.Pool] = None):
    """
    Updates the DataWindow to contain only the data within the window parameters.
    Stations without audio, or with all of their data outside the window, are removed.
    """
    # Create and manage a single pool of workers that can be used throughout
    # the instantiation of the data window.
    _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool

    r_f = io.ReadFilter()
    if self._config.start_datetime:
        r_f.with_start_dt(self._config.start_datetime)
    if self._config.end_datetime:
        r_f.with_end_dt(self._config.end_datetime)
    if self._config.station_ids:
        r_f.with_station_ids(self._config.station_ids)
    if self._config.extensions:
        r_f.with_extensions(self._config.extensions)
    else:
        self._config.extensions = r_f.extensions
    if self._config.api_versions:
        r_f.with_api_versions(self._config.api_versions)
    else:
        self._config.api_versions = r_f.api_versions
    r_f.with_start_dt_buf(self._config.start_buffer_td)
    r_f.with_end_dt_buf(self._config.end_buffer_td)

    if self.debug:
        print("Reading files from disk. This may take a few minutes to complete.")

    # Get the data to convert into a window
    a_r = ApiReaderDw(
        self._config.input_dir,
        self._config.structured_layout,
        r_f,
        correct_timestamps=self._config.apply_correction,
        use_model_correction=self._config.use_model_correction,
        dw_base_dir=self.save_dir(),
        dw_save_mode=self._fs_writer.save_mode(),
        debug=self.debug,
        pool=_pool,
    )

    self._errors.extend_error(a_r.errors)

    if self._fs_writer.is_use_mem() and a_r.dw_save_mode != self._fs_writer.save_mode():
        if self.debug:
            print("Estimated size of files exceeds available memory.")
            print("Automatically using temporary directory to store data.")
        self._fs_writer.set_use_temp(True)

    # Apply timing correction in parallel, one station per task
    sts = a_r.get_stations()
    if self.debug:
        print("num stations loaded: ", len(sts))
    for st in maybe_parallel_map(_pool, Station.update_timestamps, iter(sts), chunk_size=1):
        self.create_window_in_sensors(st, self._config.start_datetime, self._config.end_datetime)
        if self.debug:
            print("station processed: ", st.id())

    # Check for stations without data
    self._check_for_audio()
    self._check_valid_ids()

    # Update the default data window name if we have data and the default name exists
    if self.event_name == "dw" and len(self._stations) > 0:
        self.event_name = f"dw_{int(self.start_date())}_{len(self._stations)}"

    # The start and end must be set in order for the data to be saved;
    # update the remaining data window values if they're still default.
    if not self._config.start_datetime and len(self._stations) > 0:
        self._config.start_datetime = dtu.datetime_from_epoch_microseconds_utc(
            np.min([t.first_data_timestamp() for t in self._stations])
        )
    # end_datetime is non-inclusive, so it must be greater than our latest timestamp
    if not self._config.end_datetime and len(self._stations) > 0:
        self._config.end_datetime = dtu.datetime_from_epoch_microseconds_utc(
            np.max([t.last_data_timestamp() for t in self._stations]) + 1
        )

    # If the pool was created by this function, then it must also be closed by this function.
    if pool is None:
        _pool.close()
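The create-if-absent, close-if-created idiom that brackets `create_data_window` recurs across these functions. A small context manager (hypothetical, not part of the SDK) captures the same ownership rule in one place:

import multiprocessing
import multiprocessing.pool
from contextlib import contextmanager
from typing import Iterator, Optional


@contextmanager
def managed_pool(
        pool: Optional[multiprocessing.pool.Pool] = None,
) -> Iterator[multiprocessing.pool.Pool]:
    """Yields the given pool, or creates one and closes it on exit."""
    _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool
    try:
        yield _pool
    finally:
        # Only tear down a pool this context manager created;
        # a caller-provided pool remains the caller's responsibility.
        if pool is None:
            _pool.close()
            _pool.join()

With that helper, the body of `create_data_window` could run inside `with managed_pool(pool) as _pool:` and drop the explicit `if pool is None: _pool.close()` bookkeeping at the end.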