Example #1
# Note: io, StationStat, maybe_parallel_map, and _partition_list come from
# the surrounding package; only standard-library imports are shown here.
import multiprocessing
import multiprocessing.pool
from typing import Iterator, List, Optional


def extract_stats_parallel(
        index: io.Index,
        pool: Optional[multiprocessing.pool.Pool] = None) -> List[StationStat]:
    """
    Extracts StationStat information in parallel from packets stored in the provided index.

    :param index: Index of packets to extract information from.
    :param pool: optional multiprocessing pool.
    :return: A list of StationStat objects.
    """
    # Partition the index entries by number of cores
    num_cores: int = multiprocessing.cpu_count()
    partitioned: List[List[io.IndexEntry]] = _partition_list(
        index.entries, num_cores)
    indices: List[io.Index] = [io.Index(entries) for entries in partitioned]

    # Run the extractions in parallel, falling back to serial for small inputs
    nested: Iterator[List[StationStat]] = maybe_parallel_map(
        pool,
        extract_stats_serial,
        iter(indices),
        lambda: len(indices) > 128,
        chunk_size=64,
    )
    return [item for sublist in nested for item in sublist]
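A minimal usage sketch, assuming an io.Index has already been built elsewhere (build_index below is a hypothetical stand-in): passing a caller-owned pool lets repeated calls share the same workers, and the function leaves that pool open for the caller to manage.

import multiprocessing

index = build_index()  # hypothetical: produces an io.Index of packets

with multiprocessing.Pool() as shared_pool:
    # The pool is supplied by the caller, so extract_stats_parallel
    # will not close it.
    stats = extract_stats_parallel(index, pool=shared_pool)
print("station stats extracted:", len(stats))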
Example #2
 def test_normal_managed_pool(self):
     settings.set_parallelism_enabled(True)
     usage_out = []
     res = maybe_parallel_map(None, map_fn, self.data, usage_out=usage_out)
     self.assertEqual(self.res, list(res))
     self.assertEqual(MappingType.ParallelManaged, usage_out[0])
     settings.set_parallelism_enabled(False)
Example #3
 def get_stations(
     self, pool: Optional[multiprocessing.pool.Pool] = None
 ) -> List[Station]:
     """
     :param pool: optional multiprocessing pool
     :return: a list of all stations represented by the data packets
     """
     station_ids: List[str] = self.index_summary.station_ids()
     stations_opt: Iterator[Optional[Station]] = maybe_parallel_map(
         pool,
         self.get_station_by_id,
         iter(station_ids),
         lambda: len(station_ids) > 2,
         chunk_size=1
     )
     # noinspection Mypy
     return list(filter(lambda station: station is not None, stations_opt))
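A brief usage sketch (the reader variable is hypothetical, standing in for whatever instance exposes this method): supplying an explicit pool lets several calls share workers, and the method leaves a caller-supplied pool open.

import multiprocessing

# `reader` is assumed: any instance exposing get_stations().
with multiprocessing.Pool() as shared_pool:
    stations = reader.get_stations(pool=shared_pool)
    print("stations found:", [station.id() for station in stations])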
Example #4
 def test_normal_provided_pool(self):
     settings.set_parallelism_enabled(True)
     pool = Pool()
     usage_out = []
     res = maybe_parallel_map(pool, map_fn, self.data, usage_out=usage_out)
     self.assertEqual(self.res, list(res))
     self.assertEqual(MappingType.ParallelUnmanaged, usage_out[0])
     pool.close()
     settings.set_parallelism_enabled(False)
Example #5
 def test_parallel_condition_bad(self):
     settings.set_parallelism_enabled(True)
     usage_out = []
     res = maybe_parallel_map(None,
                              map_fn,
                              self.data,
                              usage_out=usage_out,
                              condition=lambda: len(self.data) > 10)
     self.assertEqual(self.res, list(res))
     self.assertEqual(MappingType.Serial, usage_out[0])
     settings.set_parallelism_enabled(False)
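Taken together, the three tests above pin down how maybe_parallel_map dispatches: serial when parallelism is disabled globally or the condition callable returns False, a self-managed pool when none is supplied, and an unmanaged, caller-owned pool otherwise. The following is a minimal sketch of that decision logic as inferred from the tests; the internals and defaults are assumptions, not the library's actual implementation.

import multiprocessing
# `settings` and `MappingType` are assumed to come from the library,
# as in the tests above.

def maybe_parallel_map_sketch(pool, fn, data,
                              condition=None, chunk_size=None, usage_out=None):
    items = list(data)
    # Serial path: parallelism disabled globally, or the condition vetoes it.
    if not settings.is_parallelism_enabled() or \
            (condition is not None and not condition()):
        if usage_out is not None:
            usage_out.append(MappingType.Serial)
        return iter([fn(item) for item in items])
    if pool is None:
        # Managed: create our own pool and tear it down when done.
        if usage_out is not None:
            usage_out.append(MappingType.ParallelManaged)
        with multiprocessing.Pool() as own_pool:
            return iter(own_pool.map(fn, items, chunksize=chunk_size))
    # Unmanaged: the caller owns the pool's lifecycle; never close it here.
    if usage_out is not None:
        usage_out.append(MappingType.ParallelUnmanaged)
    return iter(pool.map(fn, items, chunksize=chunk_size))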
Example #6
 def get_stations(
         self,
         pool: Optional[multiprocessing.pool.Pool] = None) -> List[Station]:
     """
     :param pool: optional multiprocessing pool
     :return: List of all stations in the ApiReader
     """
     return list(
         maybe_parallel_map(pool,
                            self._station_by_index,
                            self.files_index,
                            chunk_size=1))
Example #7
 def _read_stations(
         self,
         pool: Optional[multiprocessing.pool.Pool] = None) -> List[Station]:
     """
     :param pool: optional multiprocessing pool
     :return: List of all stations in the ApiReader, without building the data from parquet
     """
     if settings.is_parallelism_enabled() and len(self.files_index) > 1:
         return list(
             maybe_parallel_map(pool,
                                self._station_by_index,
                                self.files_index,
                                chunk_size=1))
     return list(map(self._station_by_index, self.files_index))
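Note that _read_stations checks is_parallelism_enabled() and the entry count itself before calling maybe_parallel_map. An equivalent formulation, assuming maybe_parallel_map applies the same serial fallback the tests above demonstrate, would delegate that decision via the condition argument. A sketch (hypothetical alternative, same class context assumed):

 def _read_stations_alt(
         self,
         pool: Optional[multiprocessing.pool.Pool] = None) -> List[Station]:
     # Let maybe_parallel_map decide, using the same size threshold
     # as the explicit branch above.
     return list(
         maybe_parallel_map(pool,
                            self._station_by_index,
                            self.files_index,
                            lambda: len(self.files_index) > 1,
                            chunk_size=1))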
Example #8
    def create_data_window(self, pool: Optional[multiprocessing.pool.Pool] = None):
        """
        Updates the DataWindow to contain only the data within the window parameters.
        Stations without audio or any data outside the window are removed.

        :param pool: optional multiprocessing pool
        """
        # Let's create and manage a single pool of workers that we can utilize throughout
        # the instantiation of the data window.
        _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool

        r_f = io.ReadFilter()
        if self._config.start_datetime:
            r_f.with_start_dt(self._config.start_datetime)
        if self._config.end_datetime:
            r_f.with_end_dt(self._config.end_datetime)
        if self._config.station_ids:
            r_f.with_station_ids(self._config.station_ids)
        if self._config.extensions:
            r_f.with_extensions(self._config.extensions)
        else:
            self._config.extensions = r_f.extensions
        if self._config.api_versions:
            r_f.with_api_versions(self._config.api_versions)
        else:
            self._config.api_versions = r_f.api_versions
        r_f.with_start_dt_buf(self._config.start_buffer_td)
        r_f.with_end_dt_buf(self._config.end_buffer_td)

        if self.debug:
            print("Reading files from disk.  This may take a few minutes to complete.")

        # get the data to convert into a window
        a_r = ApiReaderDw(self._config.input_dir, self._config.structured_layout, r_f,
                          correct_timestamps=self._config.apply_correction,
                          use_model_correction=self._config.use_model_correction,
                          dw_base_dir=self.save_dir(),
                          dw_save_mode=self._fs_writer.save_mode(),
                          debug=self.debug, pool=_pool)

        self._errors.extend_error(a_r.errors)

        if self._fs_writer.is_use_mem() and a_r.dw_save_mode != self._fs_writer.save_mode():
            if self.debug:
                print("Estimated size of files exceeds available memory.")
                print("Automatically using temporary directory to store data.")
            self._fs_writer.set_use_temp(True)

        # Parallel update
        # Apply timing correction in parallel by station
        sts = a_r.get_stations()
        if self.debug:
            print("num stations loaded: ", len(sts))
        for st in maybe_parallel_map(_pool, Station.update_timestamps, iter(sts), chunk_size=1):
            self.create_window_in_sensors(st, self._config.start_datetime, self._config.end_datetime)
            if self.debug:
                print("station processed: ", st.id())

        # check for stations without data
        self._check_for_audio()
        self._check_valid_ids()

        # update the default data window name if we have data and the default name exists
        if self.event_name == "dw" and len(self._stations) > 0:
            self.event_name = f"dw_{int(self.start_date())}_{len(self._stations)}"

        # must update the start and end in order for the data to be saved
        # update remaining data window values if they're still default
        if not self._config.start_datetime and len(self._stations) > 0:
            self._config.start_datetime = dtu.datetime_from_epoch_microseconds_utc(
                np.min([t.first_data_timestamp() for t in self._stations]))
        # end_datetime is non-inclusive, so it must be greater than our latest timestamp
        if not self._config.end_datetime and len(self._stations) > 0:
            self._config.end_datetime = dtu.datetime_from_epoch_microseconds_utc(
                np.max([t.last_data_timestamp() for t in self._stations]) + 1)

        # If the pool was created by this function, then it needs to be managed by this function.
        if pool is None:
            _pool.close()
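The close-only-if-created rule above suggests a simple caller-side pattern (a sketch; the windows list and DataWindow construction are assumed): when building several data windows in sequence, create one pool up front so worker startup costs are paid once.

import multiprocessing

# Hypothetical driver: `windows` holds DataWindow-like objects.
with multiprocessing.Pool() as shared_pool:
    for dw in windows:
        # We supplied the pool, so create_data_window leaves it open
        # and the same workers serve every window.
        dw.create_data_window(pool=shared_pool)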