def extract_stats_parallel(
    index: io.Index, pool: Optional[multiprocessing.pool.Pool] = None
) -> List[StationStat]:
    """
    Extracts StationStat information in parallel from packets stored in the provided index.

    :param index: Index of packets to extract information from.
    :param pool: Optional multiprocessing pool.
    :return: A list of StationStat objects.
    """
    # Partition the index entries evenly across the available cores.
    num_cores: int = multiprocessing.cpu_count()
    partitioned: List[List[io.IndexEntry]] = _partition_list(index.entries, num_cores)
    indices: List[io.Index] = list(map(lambda entries: io.Index(entries), partitioned))

    # Map the partitions in parallel, falling back to a serial map for small workloads.
    nested: Iterator[List[StationStat]] = maybe_parallel_map(
        pool,
        extract_stats_serial,
        iter(indices),
        lambda: len(indices) > 128,
        chunk_size=64,
    )
    return [item for sublist in nested for item in sublist]
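# A minimal, self-contained sketch (standard library only) of the
# partition / map / flatten flow used by extract_stats_parallel above.
# The names _demo_partition, _demo_work, and _demo_parallel_map are hypothetical
# stand-ins for illustration; the real code delegates to _partition_list,
# extract_stats_serial, and maybe_parallel_map.
import multiprocessing
import multiprocessing.pool
from typing import List, Optional


def _demo_partition(items: List[int], parts: int) -> List[List[int]]:
    # Round-robin split; the real _partition_list may partition differently.
    return [items[i::parts] for i in range(parts) if items[i::parts]]


def _demo_work(chunk: List[int]) -> List[int]:
    # Stand-in for extract_stats_serial: one result per input entry.
    return [x * x for x in chunk]


def _demo_parallel_map(
    items: List[int],
    pool: Optional[multiprocessing.pool.Pool] = None,
    threshold: int = 128,
) -> List[int]:
    chunks = _demo_partition(items, multiprocessing.cpu_count())
    # Mirror the maybe_parallel_map decision: only go parallel when a pool is
    # available and the workload is large enough to be worth it.
    if pool is not None and len(chunks) > threshold:
        nested = pool.map(_demo_work, chunks)
    else:
        nested = list(map(_demo_work, chunks))
    return [item for sublist in nested for item in sublist]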
def _get_all_files(
    self, pool: Optional[multiprocessing.pool.Pool] = None
) -> io.Index:
    """
    Get all files in the base dir of the ApiReader.

    :param pool: optional multiprocessing pool
    :return: index with all the files that match the filter
    """
    _pool: multiprocessing.pool.Pool = (
        multiprocessing.Pool() if pool is None else pool
    )
    index = io.Index()
    # This guarantees that all ids we search for are valid.
    all_index = self._apply_filter(pool=_pool)
    for station_id in all_index.summarize().station_ids():
        station_filter = self.filter.clone()
        # _check_station_stats returns a list of Indexes, one per contiguous
        # station configuration; merge the entries of each into the result.
        checked_indexes = self._check_station_stats(
            station_filter.with_station_ids({station_id}), pool=_pool
        )
        for checked_index in checked_indexes:
            index.append(checked_index.entries)
    if pool is None:
        _pool.close()
    return index
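# Self-contained sketch of the "borrow or own the pool" pattern used above:
# build a Pool only when the caller did not supply one, and close it only if
# this function created it. _demo_square and _demo_squares are hypothetical
# names used purely for illustration.
import multiprocessing
import multiprocessing.pool
from typing import List, Optional


def _demo_square(x: int) -> int:
    return x * x


def _demo_squares(
    values: List[int], pool: Optional[multiprocessing.pool.Pool] = None
) -> List[int]:
    _pool = multiprocessing.Pool() if pool is None else pool
    try:
        return _pool.map(_demo_square, values)
    finally:
        # Only tear down the pool we own; a borrowed pool stays open for the caller.
        if pool is None:
            _pool.close()
            _pool.join()


# Usage: _demo_squares([1, 2, 3]) owns a temporary pool, while
# "with multiprocessing.Pool(2) as shared: _demo_squares([4, 5, 6], pool=shared)"
# reuses the caller's pool across multiple calls.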
def _flatten_files_index(self):
    """
    :return: flattened version of files_index
    """
    result = io.Index()
    for i in self.files_index:
        result.append(i.entries)
    return result
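# A minimal, hypothetical sketch of the flattening step above, with plain lists
# standing in for io.Index objects and their entries.
from typing import List


def _demo_flatten(files_index: List[List[str]]) -> List[str]:
    # Equivalent to appending each index's entries onto one result container.
    return [entry for index_entries in files_index for entry in index_entries]


# _demo_flatten([["000:a.rdvxm"], ["000:b.rdvxm", "001:c.rdvxm"]])
# -> ["000:a.rdvxm", "000:b.rdvxm", "001:c.rdvxm"]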
def _split_workload(self, findex: io.Index) -> List[io.Index]:
    """
    Takes an index and splits it into chunks based on a size limit.

    While running_total + next_file_size < limit, adds files to a chunk (Index).
    If the limit is exceeded, adds the chunk to the result and puts the next file into a new chunk.

    :param findex: index of files to split
    :return: list of Index to process
    """
    packet_list = []
    chunk_queue = 0
    chunk_list = []
    for f in findex.entries:
        chunk_queue += f.file_size_bytes
        if chunk_queue > self.chunk_limit and chunk_list:
            packet_list.append(io.Index(chunk_list))
            # Start a new chunk; its running total begins with the file that exceeded the limit.
            chunk_queue = f.file_size_bytes
            chunk_list = []
        chunk_list.append(f)
    # Add the final, possibly partial, chunk if there is one.
    if chunk_list:
        packet_list.append(io.Index(chunk_list))
    return packet_list
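# Self-contained sketch of the size-based splitting rule above, with
# (name, size_in_bytes) tuples standing in for io.IndexEntry objects and a
# plain int standing in for self.chunk_limit. split_by_size is a hypothetical
# helper for illustration only.
from typing import List, Tuple

FileEntry = Tuple[str, int]


def split_by_size(entries: List[FileEntry], limit: int) -> List[List[FileEntry]]:
    chunks: List[List[FileEntry]] = []
    current: List[FileEntry] = []
    total = 0
    for name, size in entries:
        total += size
        if total > limit and current:
            # Close the current chunk; the overflowing file starts the next one.
            chunks.append(current)
            current = []
            total = size
        current.append((name, size))
    if current:
        chunks.append(current)
    return chunks


# split_by_size([("a", 40), ("b", 30), ("c", 50), ("d", 10)], limit=100)
# -> [[("a", 40), ("b", 30)], [("c", 50), ("d", 10)]]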
def _check_station_stats(
    self,
    station_index: io.Index,
    pool: Optional[multiprocessing.pool.Pool] = None,
) -> List[io.Index]:
    """
    Check the index's results; if it has enough information, return it, otherwise search for more data.
    The index should only request one station id.

    If the station was restarted during the request period, a new group of indexes will be created
    to represent the change in station metadata.

    :param station_index: index representing the requested information
    :param pool: optional multiprocessing pool
    :return: list of Indexes that includes as much information as possible that fits the request
    """
    # If we found nothing, return the index as-is.
    if len(station_index.entries) < 1:
        return [station_index]

    _pool: multiprocessing.pool.Pool = (
        multiprocessing.Pool() if pool is None else pool
    )
    stats = fs.extract_stats(station_index, pool=_pool)
    # Close the pool if it was created here.
    if pool is None:
        _pool.close()

    timing_offsets: Optional[offset_model.TimingOffsets] = offset_model.compute_offsets(
        stats
    )

    # Punt if duration or other important values are invalid or if the latency array was empty.
    if timing_offsets is None:
        return [station_index]

    diff_s = diff_e = timedelta(seconds=0)

    # If our filtered files do not encompass the request even when the packet times are updated,
    # try getting 1.5 times the difference between the expected start/end and the start/end of the data.
    insufficient_str = ""
    if self.filter.start_dt and timing_offsets.adjusted_start > self.filter.start_dt:
        insufficient_str += f" {self.filter.start_dt} (start)"
        new_end = self.filter.start_dt - self.filter.start_dt_buf
        new_start = new_end - 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt)
        new_index = self._apply_filter(
            io.ReadFilter()
            .with_start_dt(new_start)
            .with_end_dt(new_end)
            .with_extensions(self.filter.extensions)
            .with_api_versions(self.filter.api_versions)
            .with_station_ids(set(station_index.summarize().station_ids()))
            .with_start_dt_buf(diff_s)
            .with_end_dt_buf(diff_e)
        )
        if len(new_index.entries) > 0:
            station_index.append(new_index.entries)
            stats.extend(fs.extract_stats(new_index))

    if self.filter.end_dt and timing_offsets.adjusted_end < self.filter.end_dt:
        insufficient_str += f" {self.filter.end_dt} (end)"
        new_start = self.filter.end_dt + self.filter.end_dt_buf
        new_end = new_start + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end)
        new_index = self._apply_filter(
            io.ReadFilter()
            .with_start_dt(new_start)
            .with_end_dt(new_end)
            .with_extensions(self.filter.extensions)
            .with_api_versions(self.filter.api_versions)
            .with_station_ids(set(station_index.summarize().station_ids()))
            .with_start_dt_buf(diff_s)
            .with_end_dt_buf(diff_e)
        )
        if len(new_index.entries) > 0:
            station_index.append(new_index.entries)
            stats.extend(fs.extract_stats(new_index))

    if len(insufficient_str) > 0:
        self.errors.append(
            f"Data for {station_index.summarize().station_ids()} exists, "
            f"but not at:{insufficient_str}"
        )

    # Group entries by their packet's app start time; a new app start time indicates
    # a station restart, so each group becomes its own Index.
    results = {}
    for v, e in enumerate(stats):
        key = e.app_start_dt
        if key not in results:
            results[key] = io.Index()
        results[key].append(entries=[station_index.entries[v]])
    return list(results.values())
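# Worked sketch of the window-extension arithmetic above: when the earliest
# adjusted data time still falls after the requested start, a second query is
# issued for an earlier window sized at 1.5x the gap, placed just before the
# start buffer. _demo_window_extension and all datetimes below are made-up
# illustration values, not part of the ApiReader API.
from datetime import datetime, timedelta


def _demo_window_extension() -> None:
    requested_start = datetime(2021, 1, 1, 12, 0, 0)
    start_buffer = timedelta(minutes=2)              # stand-in for filter.start_dt_buf
    adjusted_start = datetime(2021, 1, 1, 12, 3, 0)  # data actually begins 3 minutes late

    gap = adjusted_start - requested_start           # 3 minutes of missing data
    new_end = requested_start - start_buffer         # 2021-01-01 11:58:00
    new_start = new_end - 1.5 * gap                  # 2021-01-01 11:53:30
    print(new_start, "->", new_end)
    # The mirrored computation at the end of the request extends forward instead:
    # new_start = requested_end + end_buffer; new_end = new_start + 1.5 * (requested_end - adjusted_end).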