def __init__(self, name: str = "event", schema: Optional[Dict[str, list]] = None,
             save_mode: FileSystemSaveMode = FileSystemSaveMode.MEM, base_dir: str = "."):
    """
    Initialize an EventStream for a station.

    :param name: name of the EventStream.  Default "event"
    :param schema: a structured dictionary of the data table schema.  The dictionary must look like:
                   {"string": [s_values], "numeric": [n_values], "boolean": [o_values], "byte": [b_values]}
                   where each [*_values] is a (possibly empty) list of strings.  Default None
    :param save_mode: FileSystemSaveMode that determines how data is saved.
                      Default FileSystemSaveMode.MEM (use RAM).  Other options are DISK (save to directory)
                      and TEMP (save to temporary directory)
    :param base_dir: the location of the parquet file that holds the data.  Only used when saving to disk.
                     Default current directory (".")
    """
    self.name = name
    self.timestamps_metadata = {}
    self.metadata = {}
    self._errors = RedVoxExceptions("EventStream")
    self._is_timestamps_corrected = False
    self._fs_writer = Fsw(f"event_{name}", "parquet", base_dir, save_mode)
    self._data = None
    self._schema = {"string": [], "numeric": [], "boolean": [], "byte": []}
    if schema is not None:
        self.set_schema(schema)
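
# Construction sketch for the schema shape described above.  The stream name, channel
# names, and values are illustrative; it assumes EventStream and FileSystemSaveMode are
# importable from this module's context.
example_schema = {
    "string": ["event_label"],   # string-valued channels
    "numeric": ["magnitude"],    # numeric-valued channels
    "boolean": [],               # no boolean channels
    "byte": [],                  # no byte channels
}
example_stream = EventStream(name="picks", schema=example_schema,
                             save_mode=FileSystemSaveMode.MEM)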
def validate_station_key_list(data_packets: List[api_m.RedvoxPacketM],
                              errors: RedVoxExceptions) -> bool:
    """
    Checks the data packets for consistency in the fields that make up a station key.
    Any discrepancies found are added to the given RedVoxExceptions object.

    :param data_packets: list of RedvoxPacketM to check
    :param errors: RedVoxExceptions to update with any errors found while validating
    :return: True if no discrepancies were found, False otherwise
    """
    my_errors = RedVoxExceptions("StationKeyValidation")
    if len(data_packets) < 2:
        return True
    j: np.ndarray = np.transpose([[
        t.station_information.id,
        t.station_information.uuid,
        t.timing_information.app_start_mach_timestamp,
        t.api,
        t.sub_api,
        t.station_information.make,
        t.station_information.model,
        t.station_information.os,
        t.station_information.os_version,
        t.station_information.app_version,
        t.station_information.is_private,
        len(t.sensors.audio.samples.values) / t.sensors.audio.sample_rate,
    ] for t in data_packets])
    k: Dict[str, np.ndarray] = {
        "ids": j[0],
        "uuids": j[1],
        "station_start_times": j[2],
        "apis": j[3],
        "sub_apis": j[4],
        "makes": j[5],
        "models": j[6],
        "os": j[7],
        "os_versions": j[8],
        "app_versions": j[9],
        "privates": j[10],
        "durations": j[11],
    }
    for key, value in k.items():
        result = np.unique(value)
        if len(result) > 1:
            my_errors.append(
                f"WARNING: {data_packets[0].station_information.id} "
                f"{key} contains multiple unique values: {result}.\n"
                "Please update your query to focus on one of these values.")
    if my_errors.get_num_errors() > 0:
        errors.extend_error(my_errors)
        return False
    return True  # if here, everything is consistent
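
# Usage sketch for the validator above.  `example_packets` stands in for a list of
# api_m.RedvoxPacketM read elsewhere; the error container API (.print()) is the one
# used throughout this module.
example_packets: List[api_m.RedvoxPacketM] = []  # placeholder: packets read elsewhere
shared_errors = RedVoxExceptions("StationKeyValidation")
if not validate_station_key_list(example_packets, shared_errors):
    shared_errors.print()  # shows which station-key fields held multiple values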
@dataclass  # required for the field() defaults used below
class AudioWithGaps:
    """
    Represents methods of reconstructing audio data with or without gaps in it.

    Properties:
        sample_interval_micros: microseconds between sample points

        metadata: list of (start time in microseconds since epoch UTC, data to add) pairs

        gaps: the list of start and end points of gaps (the start and end are actual data points)

        errors: the errors encountered while getting the data
    """
    sample_interval_micros: float
    metadata: Optional[List[Tuple[float, pa.Table]]] = None
    gaps: List[Tuple[float, float]] = field(default_factory=lambda: [])
    errors: RedVoxExceptions = field(default_factory=lambda: RedVoxExceptions("AudioWithGaps"))

    def create_timestamps(self) -> pa.Table:
        """
        :return: a pyarrow Table of timestamps and audio samples built from the metadata,
                 with NaN samples filling any gaps
        """
        result_array = [[], [], []]
        for m in self.metadata:
            timestamps = calc_evenly_sampled_timestamps(m[0], m[1].num_rows, self.sample_interval_micros)
            result_array[0].extend(timestamps)
            result_array[1].extend(timestamps)
            result_array[2].extend(m[1]["microphone"].to_numpy())
        for gs, ge in self.gaps:
            num_samples = int((ge - gs) / self.sample_interval_micros) - 1
            timestamps = calc_evenly_sampled_timestamps(
                gs + self.sample_interval_micros, num_samples, self.sample_interval_micros)
            gap_array = [timestamps, np.full(len(timestamps), np.nan)]
            result_array[0].extend(gap_array[0])
            result_array[1].extend(gap_array[0])
            result_array[2].extend(gap_array[1])
        ptable = pa.Table.from_pydict(dict(zip(AUDIO_DF_COLUMNS, result_array)))
        return pc.take(ptable, pc.sort_indices(ptable, sort_keys=[("timestamps", "ascending")]))

    def add_error(self, error: str):
        """
        add an error to the result

        :param error: error message to add
        """
        self.errors.append(error)
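
# Reconstruction sketch, assuming the dataclass form above: two 800 Hz audio packets
# with roughly one second of missing samples between them.  Timestamps are microseconds
# since epoch UTC and all values are illustrative; pa and np are the imports already
# used by this module.
interval_us = 1e6 / 800.0
packet_a = pa.Table.from_pydict({"microphone": np.random.rand(800)})
packet_b = pa.Table.from_pydict({"microphone": np.random.rand(800)})
with_gap = AudioWithGaps(
    sample_interval_micros=interval_us,
    metadata=[(0.0, packet_a), (2e6, packet_b)],  # second packet starts 2 s later
    gaps=[(799 * interval_us, 2e6)],              # last real sample of packet_a to first of packet_b
)
audio_table = with_gap.create_timestamps()        # gap rows are filled with NaN samples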
@dataclass  # required for the field() defaults used below
class GapPadResult:
    """
    The result of filling gaps or padding a time series
    """
    result_df: Optional[pd.DataFrame] = None
    gaps: List[Tuple[float, float]] = field(default_factory=lambda: [])
    errors: RedVoxExceptions = field(default_factory=lambda: RedVoxExceptions("GapPadResult"))

    def add_error(self, error: str):
        """
        add an error to the result

        :param error: error message to add
        """
        self.errors.append(error)
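
# Minimal sketch of recording a gap-filling outcome; the timestamps (microseconds since
# epoch UTC) and the gap are illustrative.
pad_result = GapPadResult(result_df=pd.DataFrame({"timestamps": [0.0, 1e6, 3e6]}),
                          gaps=[(1e6, 3e6)])
pad_result.add_error("2 second gap detected between the second and third samples.")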
def __init__(
    self,
    data_packets: Optional[List[api_m.RedvoxPacketM]] = None,
    station_id: str = None,
    uuid: str = None,
    start_time: float = np.nan,
    use_model_correction: bool = True,
):
    """
    Initialize the Station.

    :param data_packets: optional list of data packets representing the station, default None
    :param station_id: optional id if no data packets, default None
    :param uuid: optional uuid if no data packets, default None
    :param start_time: optional start time in microseconds since epoch UTC if no data packets, default np.nan
    :param use_model_correction: if True, use OffsetModel functions for time correction; otherwise add only
                                 the OffsetModel's best offset (the intercept value).  Default True
    """
    self.data = []
    self.packet_metadata: List[st_utils.StationPacketMetadata] = []
    self.is_timestamps_updated = False
    self._gaps: List[Tuple[float, float]] = []
    self.errors: RedVoxExceptions = RedVoxExceptions("Station")
    self.use_model_correction = use_model_correction
    if data_packets and st_utils.validate_station_key_list(data_packets, self.errors):
        # noinspection Mypy
        self._load_metadata_from_packet(data_packets[0])
        self.timesync_analysis = TimeSyncAnalysis(
            self.id, self.audio_sample_rate_nominal_hz, self.start_timestamp
        ).from_raw_packets(data_packets)
        if self.timesync_analysis.errors.get_num_errors() > 0:
            self.errors.extend_error(self.timesync_analysis.errors)
        self._set_all_sensors(data_packets)
        self._get_start_and_end_timestamps()
    else:
        self.id = station_id
        self.uuid = uuid
        self.metadata = st_utils.StationMetadata("None")
        self.start_timestamp = start_time
        self.first_data_timestamp = np.nan
        self.last_data_timestamp = np.nan
        self.audio_sample_rate_nominal_hz = np.nan
        self.is_audio_scrambled = False
        self.timesync_analysis = TimeSyncAnalysis()
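
# Construction sketch for the two paths above.  `station_packets`, the id, uuid, and
# start time are placeholders; an empty packet list falls through to the empty-station
# branch.
station_packets: List[api_m.RedvoxPacketM] = []  # placeholder: packets read elsewhere
station_with_data = Station(data_packets=station_packets)
empty_station = Station(station_id="1637610021", uuid="example-uuid",
                        start_time=1618896000000000.0)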
def __init__(self, sensor_name: str, sensor_data: pd.DataFrame,
             sensor_type: SensorType = SensorType.UNKNOWN_SENSOR,
             sample_rate_hz: float = np.nan,
             sample_interval_s: float = np.nan,
             sample_interval_std_s: float = np.nan,
             is_sample_rate_fixed: bool = False,
             are_timestamps_altered: bool = False,
             calculate_stats: bool = False):
    """
    Initialize the sensor data with the given parameters.

    :param sensor_name: name of the sensor
    :param sensor_data: dataframe with the timestamps and sensor data; the first column is always
                        the timestamps, the other columns are the data channels in the sensor
    :param sensor_type: enumerated type of the sensor, default SensorType.UNKNOWN_SENSOR
    :param sample_rate_hz: sample rate in hz of the data, default np.nan
    :param sample_interval_s: sample interval in seconds of the data, default np.nan
    :param sample_interval_std_s: std dev of the sample interval in seconds of the data, default np.nan
    :param is_sample_rate_fixed: if True, sample rate is constant for all data, default False
    :param are_timestamps_altered: if True, timestamps in the sensor have been altered from their
                                   original values, default False
    :param calculate_stats: if True, calculate sample_rate_hz, sample_interval_s, and
                            sample_interval_std_s.  Default False
    """
    if "timestamps" not in sensor_data.columns:
        raise AttributeError('SensorData requires the data frame to contain a column titled "timestamps"')
    self.name: str = sensor_name
    self.type: SensorType = sensor_type
    self.data_df: pd.DataFrame = sensor_data.infer_objects()
    self.sample_rate_hz: float = sample_rate_hz
    self.sample_interval_s: float = sample_interval_s
    self.sample_interval_std_s: float = sample_interval_std_s
    self.is_sample_rate_fixed: bool = is_sample_rate_fixed
    self.timestamps_altered: bool = are_timestamps_altered
    self.errors: RedVoxExceptions = RedVoxExceptions("Sensor")
    if calculate_stats:
        self.organize_and_update_stats()
    else:
        self.sort_by_data_timestamps()
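
# Wrapping sketch: a small audio series with a known, fixed sample rate.  Timestamps are
# microseconds since epoch UTC; the data values and sensor name are illustrative.
audio_df = pd.DataFrame({
    "timestamps": np.arange(10) * (1e6 / 800.0),  # ~800 Hz
    "microphone": np.random.rand(10),
})
audio_sensor = SensorData(sensor_name="microphone", sensor_data=audio_df,
                          sensor_type=SensorType.AUDIO, sample_rate_hz=800.0,
                          is_sample_rate_fixed=True)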
def from_json_file(file_dir: str, file_name: str) -> "EventStream":
    """
    :param file_dir: full path of the directory containing the file
    :param file_name: name of the file to load data from, without the .json extension
    :return: EventStream from json file
    """
    if file_name is None:
        file_name = io.get_json_file(file_dir)
        if file_name is None:
            result = EventStream("Empty")
            result.append_error("Could not find a JSON file to load the EventStream from.")
            return result
    json_data = io.json_file_to_dict(os.path.join(file_dir, f"{file_name}.json"))
    if "name" in json_data.keys():
        result = EventStream(json_data["name"], json_data["schema"], FileSystemSaveMode.DISK, file_dir)
        result.metadata = json_data["metadata"]
        result.timestamps_metadata = json_data["timestamps_metadata"]
        result.set_errors(RedVoxExceptions.from_dict(json_data["errors"]))
        result.read_from_dir(json_data["file_path"])
    else:
        result = EventStream("Empty")
        result.append_error(f"Loading from {file_name} failed; missing EventStream name.")
    return result
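
# Load sketch, assuming from_json_file is exposed as a static method of EventStream.
# The directory and file name are placeholders; file_name omits the ".json" extension,
# which the method appends.
loaded = EventStream.from_json_file("/path/to/event_dir", "event_picks")
if loaded.name == "Empty":
    print("EventStream load failed; check the recorded errors.")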
class AggregateSummary: """ aggregate of summaries properties: summaries: the summaries of sensors gaps: gaps in audio data as a list of tuples of start and end time """ summaries: List[PyarrowSummary] = field(default_factory=lambda: []) gaps: List[Tuple[float, float]] = field(default_factory=lambda: []) errors: RedVoxExceptions = RedVoxExceptions("AggregateSummary") def to_dict(self) -> dict: """ :return: dictionary representation of all summaries """ result = {} for ps in self.summaries: result[ps.stype.name] = ps.to_dict() return result @staticmethod def from_dict(summary_dict: dict) -> "AggregateSummary": """ :param summary_dict: dictionary to load data from :return: AggregateSummary from a dictionary """ result = AggregateSummary() for v in summary_dict.values(): result.summaries.append(PyarrowSummary(v["name"], SensorType[v["stype"]], v["start"], v["srate_hz"], v["fdir"], v["scount"], v["smint_s"], v["sstd_s"])) return result def add_aggregate_summary(self, agg_sum: 'AggregateSummary'): """ adds another aggregate summary to this one :param agg_sum: another aggregate summary to add """ self.summaries.extend(agg_sum.summaries) def add_summary(self, pya_sum: PyarrowSummary): """ adds a summary to the aggregate :param pya_sum: the summary to add """ self.summaries.append(pya_sum) def merge_audio_summaries(self): """ combines and replaces all Audio summaries into a single summary; also adds any gaps in the data """ pckt_info = [] audio_lst = self.get_audio() frst_audio = audio_lst[0] use_mem = frst_audio.check_data() for adl in audio_lst: pckt_info.append((int(adl.start), adl.data())) audio_data = gpu.fill_audio_gaps2(pckt_info, dtu.seconds_to_microseconds(1 / frst_audio.srate_hz) ) tbl = audio_data.create_timestamps() frst_audio = PyarrowSummary(frst_audio.name, frst_audio.stype, frst_audio.start, frst_audio.srate_hz, frst_audio.fdir, tbl.num_rows, frst_audio.smint_s, frst_audio.sstd_s, tbl) if not use_mem: frst_audio.write_data(True) self.gaps = audio_data.gaps self.summaries = self.get_non_audio_list() self.add_summary(frst_audio) def merge_non_audio_summaries(self): """ combines and replaces all summaries per type except for audio summaries """ smrs_dict = {} for smry in self.summaries: if smry.stype != SensorType.AUDIO: if smry.stype in smrs_dict.keys(): smrs_dict[smry.stype].append(smry) else: smrs_dict[smry.stype] = [smry] self.summaries = self.get_audio() for styp, smrys in smrs_dict.items(): first_summary = smrys.pop(0) tbl = first_summary.data() combined_mint = np.mean([smrs.smint_s for smrs in smrys]) combined_std = np.mean([smrs.sstd_s for smrs in smrys]) if not first_summary.check_data(): os.makedirs(first_summary.fdir, exist_ok=True) for smrs in smrys: tbl = pa.concat_tables([tbl, smrs.data()]) if not first_summary.check_data(): os.remove(smrs.file_name()) if first_summary.check_data(): first_summary._data = tbl else: pq.write_table(tbl, first_summary.file_name()) mnint = dtu.microseconds_to_seconds(float(np.mean(np.diff(tbl["timestamps"].to_numpy())))) stdint = dtu.microseconds_to_seconds(float(np.std(np.diff(tbl["timestamps"].to_numpy())))) if not combined_mint + combined_std > mnint > combined_mint - combined_std: self.errors.append(f"Mean interval s of combined {styp.name} sensor does not match the " f"compilation of individual mean interval s per packet. 
Will use compilation of " f"individual values.") mnint = combined_mint stdint = combined_std single_smry = PyarrowSummary(first_summary.name, styp, first_summary.start, 1 / mnint, first_summary.fdir, tbl.num_rows, mnint, stdint, first_summary.data() if first_summary.check_data() else None ) self.summaries.append(single_smry) def merge_summaries_of_type(self, stype: SensorType): """ combines and replaces multiple summaries of one SensorType into a single one *caution: using this on an audio sensor may cause data validation issues* :param stype: the type of sensor to combine """ smrs = [] other_smrs = [] for smry in self.summaries: if smry.stype == stype: smrs.append(smry) else: other_smrs.append(smry) first_summary = smrs.pop(0) tbl = first_summary.data() if not first_summary.check_data(): os.makedirs(first_summary.fdir, exist_ok=True) for smrys in smrs: tbl = pa.concat_tables([first_summary.data(), smrys.data()]) if first_summary.check_data(): first_summary._data = tbl else: pq.write_table(tbl, first_summary.file_name()) os.remove(smrys.file_name()) mnint = dtu.microseconds_to_seconds(float(np.mean(np.diff(tbl["timestamps"].to_numpy())))) stdint = dtu.microseconds_to_seconds(float(np.std(np.diff(tbl["timestamps"].to_numpy())))) single_smry = PyarrowSummary(first_summary.name, first_summary.stype, first_summary.start, 1 / mnint, first_summary.fdir, tbl.num_rows, mnint, stdint, first_summary.data() if first_summary.check_data() else None ) self.summaries = other_smrs self.summaries.append(single_smry) def merge_all_summaries(self): """ merge all PyarrowSummary with the same sensor type into single PyarrowSummary per type """ self.merge_audio_summaries() self.merge_non_audio_summaries() def get_audio(self) -> List[PyarrowSummary]: """ :return: a list of PyarrowSummary of only Audio data """ return [s for s in self.summaries if s.stype == srupa.SensorType.AUDIO] def get_non_audio(self) -> Dict[srupa.SensorType, List[PyarrowSummary]]: """ :return: a dictionary of non-Audio SensorType: PyarrowSummary """ result = {} for k in self.sensor_types(): if k != srupa.SensorType.AUDIO: result[k] = [s for s in self.summaries if s.stype == k] return result def get_non_audio_list(self) -> List[PyarrowSummary]: """ :return: a list of all non-Audio PyarrowSummary """ return [s for s in self.summaries if s.stype != srupa.SensorType.AUDIO] def get_sensor(self, stype: srupa.SensorType) -> List[PyarrowSummary]: """ :param stype: type of sensor to find :return: a list of all PyarrowSummary of the specified type """ return [s for s in self.summaries if s.stype == stype] def sensor_types(self) -> List[srupa.SensorType]: """ :return: a list of sensor types in self.summaries """ result = [] for s in self.summaries: if s.stype not in result: result.append(s.stype) return result
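
# Usage sketch for AggregateSummary, assuming it is a dataclass (it relies on field()
# defaults).  `per_packet_summaries` stands in for PyarrowSummary objects built elsewhere.
agg = AggregateSummary()
per_packet_summaries: List[PyarrowSummary] = []  # placeholder
for summary in per_packet_summaries:
    agg.add_summary(summary)
agg.merge_all_summaries()        # one audio summary (with gaps) plus one summary per other type
audio_summaries = agg.get_audio()
other_summaries = agg.get_non_audio()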
class ApiReader: """ Reads data from api 900 or api 1000 format, converting all data read into RedvoxPacketM for ease of comparison and use. Properties: filter: io.ReadFilter with the station ids, start and end time, start and end time padding, and types of files to read base_dir: str of the directory containing all the files to read structured_dir: bool, if True, the base_dir contains a specific directory structure used by the respective api formats. If False, base_dir only has the data files. Default False. files_index: io.Index of the files that match the filter that are in base_dir index_summary: io.IndexSummary of the filtered data debug: bool, if True, output additional information during function execution. Default False. """ def __init__( self, base_dir: str, structured_dir: bool = False, read_filter: io.ReadFilter = None, debug: bool = False, pool: Optional[multiprocessing.pool.Pool] = None, ): """ Initialize the ApiReader object :param base_dir: directory containing the files to read :param structured_dir: if True, base_dir contains a specific directory structure used by the respective api formats. If False, base_dir only has the data files. Default False. :param read_filter: ReadFilter for the data files, if None, get everything. Default None :param debug: if True, output program warnings/errors during function execution. Default False. """ _pool: multiprocessing.pool.Pool = (multiprocessing.Pool() if pool is None else pool) if read_filter: self.filter = read_filter if self.filter.station_ids: self.filter.station_ids = set(self.filter.station_ids) else: self.filter = io.ReadFilter() self.base_dir = base_dir self.structured_dir = structured_dir self.debug = debug self.errors = RedVoxExceptions("APIReader") self.files_index = self._get_all_files(_pool) self.index_summary = io.IndexSummary.from_index( self._flatten_files_index()) mem_split_factor = 1 if len(self.files_index) > 0: if settings.is_parallelism_enabled(): mem_split_factor = len(self.files_index) self.chunk_limit = psutil.virtual_memory( ).available * PERCENT_FREE_MEM_USE / mem_split_factor max_file_size = max([ fe.file_size_bytes for fi in self.files_index for fe in fi.entries ]) if max_file_size > self.chunk_limit: raise MemoryError( f"System requires {max_file_size} bytes of memory to process a file but only has " f"{self.chunk_limit} available. Please free or add more RAM." 
) if debug: print( f"{mem_split_factor} stations each have {int(self.chunk_limit)} bytes for loading files in " f"memory.") else: self.chunk_limit = 0 if debug: self.errors.print() if pool is None: _pool.close() def _flatten_files_index(self): """ :return: flattened version of files_index """ result = io.Index() for i in self.files_index: result.append(i.entries) return result def _get_all_files( self, pool: Optional[multiprocessing.pool.Pool] = None ) -> List[io.Index]: """ get all files in the base dir of the ApiReader :return: index with all the files that match the filter """ _pool: multiprocessing.pool.Pool = (multiprocessing.Pool() if pool is None else pool) index: List[io.Index] = [] # this guarantees that all ids we search for are valid all_index = self._apply_filter(pool=_pool) for station_id in all_index.summarize().station_ids(): id_index = all_index.get_index_for_station_id(station_id) checked_index = self._check_station_stats(id_index, pool=_pool) index.extend(checked_index) if pool is None: _pool.close() return index def _apply_filter( self, reader_filter: Optional[io.ReadFilter] = None, pool: Optional[multiprocessing.pool.Pool] = None, ) -> io.Index: """ apply the filter of the reader, or another filter if specified :param reader_filter: optional filter; if None, use the reader's filter, default None :return: index of the filtered files """ _pool: multiprocessing.pool.Pool = (multiprocessing.Pool() if pool is None else pool) if not reader_filter: reader_filter = self.filter if self.structured_dir: index = io.index_structured(self.base_dir, reader_filter, pool=_pool) else: index = io.index_unstructured(self.base_dir, reader_filter, pool=_pool) if pool is None: _pool.close() return index def _check_station_stats( self, station_index: io.Index, pool: Optional[multiprocessing.pool.Pool] = None, ) -> List[io.Index]: """ check the index's results; if it has enough information, return it, otherwise search for more data. The index should only request one station id If the station was restarted during the request period, a new group of indexes will be created to represent the change in station metadata. 
:param station_index: index representing the requested information :return: List of Indexes that includes as much information as possible that fits the request """ _pool: multiprocessing.pool.Pool = multiprocessing.Pool( ) if pool is None else pool # if we found nothing, return the index if len(station_index.entries) < 1: return [station_index] stats = fs.extract_stats(station_index, pool=_pool) # Close pool if created here if pool is None: _pool.close() timing_offsets: Optional[ offset_model.TimingOffsets] = offset_model.compute_offsets(stats) # punt if duration or other important values are invalid or if the latency array was empty if timing_offsets is None: return [station_index] diff_s = diff_e = timedelta(seconds=0) # if our filtered files do not encompass the request even when the packet times are updated # try getting 1.5 times the difference of the expected start/end and the start/end of the data insufficient_str = "" if self.filter.start_dt and timing_offsets.adjusted_start > self.filter.start_dt: insufficient_str += f" {self.filter.start_dt} (start)" # diff_s = self.filter.start_dt_buf + 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt) new_end = self.filter.start_dt - self.filter.start_dt_buf new_start = new_end - 1.5 * (timing_offsets.adjusted_start - self.filter.start_dt) new_index = self._apply_filter(io.ReadFilter().with_start_dt( new_start).with_end_dt(new_end).with_extensions( self.filter.extensions).with_api_versions( self.filter.api_versions).with_station_ids( set(station_index.summarize().station_ids()) ).with_start_dt_buf(diff_s).with_end_dt_buf(diff_e)) if len(new_index.entries) > 0: station_index.append(new_index.entries) stats.extend(fs.extract_stats(new_index)) if self.filter.end_dt and timing_offsets.adjusted_end < self.filter.end_dt: insufficient_str += f" {self.filter.end_dt} (end)" # diff_e = self.filter.end_dt_buf + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end) new_start = self.filter.end_dt + self.filter.end_dt_buf new_end = new_start + 1.5 * (self.filter.end_dt - timing_offsets.adjusted_end) new_index = self._apply_filter(io.ReadFilter().with_start_dt( new_start).with_end_dt(new_end).with_extensions( self.filter.extensions).with_api_versions( self.filter.api_versions).with_station_ids( set(station_index.summarize().station_ids()) ).with_start_dt_buf(diff_s).with_end_dt_buf(diff_e)) if len(new_index.entries) > 0: station_index.append(new_index.entries) stats.extend(fs.extract_stats(new_index)) if len(insufficient_str) > 0: self.errors.append( f"Data for {station_index.summarize().station_ids()} exists, " f"but not at:{insufficient_str}") results = {} keys = [] for v, e in enumerate(stats): key = e.app_start_dt if key not in keys: keys.append(key) results[key] = io.Index() results[key].append(entries=[station_index.entries[v]]) return list(results.values()) def _split_workload(self, findex: io.Index) -> List[io.Index]: """ takes an index and splits it into chunks based on a size limit while running_total + next_file_size < limit, adds files to a chunk (Index) if limit is exceeded, adds the chunk and puts the next file into a new chunk :param findex: index of files to split :return: list of Index to process """ packet_list = [] chunk_queue = 0 chunk_list = [] for f in findex.entries: chunk_queue += f.file_size_bytes if chunk_queue > self.chunk_limit: packet_list.append(io.Index(chunk_list)) chunk_queue = 0 chunk_list = [] chunk_list.append(f) packet_list.append(io.Index(chunk_list)) return packet_list @staticmethod def 
read_files_in_index(indexf: io.Index) -> List[api_m.RedvoxPacketM]: """ read all the files in the index :return: list of RedvoxPacketM, converted from API 900 if necessary """ result: List[api_m.RedvoxPacketM] = [] # Iterate over the API 900 packets in a memory efficient way # and convert to API 1000 # noinspection PyTypeChecker for packet_900 in indexf.stream_raw( io.ReadFilter.empty().with_api_versions( {io.ApiVersion.API_900})): # noinspection Mypy result.append(ac.convert_api_900_to_1000_raw(packet_900)) # Grab the API 1000 packets # noinspection PyTypeChecker for packet in indexf.stream_raw( io.ReadFilter.empty().with_api_versions( {io.ApiVersion.API_1000})): # noinspection Mypy result.append(packet) return result # noinspection PyTypeChecker def read_files_by_id( self, station_id: str) -> Optional[List[api_m.RedvoxPacketM]]: """ :param station_id: the id to filter on :return: the list of packets with the requested id, or None if the id can't be found """ result: List[api_m.RedvoxPacketM] = [] # Iterate over the API 900 packets in a memory efficient way # and convert to API 1000 for packet_900 in self._flatten_files_index().stream_raw( io.ReadFilter.empty().with_api_versions( {io.ApiVersion.API_900}).with_station_ids({station_id})): # noinspection Mypy result.append(ac.convert_api_900_to_1000_raw(packet_900)) # Grab the API 1000 packets for packet in self._flatten_files_index().stream_raw( io.ReadFilter.empty().with_api_versions( {io.ApiVersion.API_1000}).with_station_ids({station_id})): # noinspection Mypy result.append(packet) if len(result) == 0: return None return result def _station_by_index(self, findex: io.Index) -> Station: """ :param findex: index with files to build a station with :return: Station built from files in findex """ return Station.create_from_packets(self.read_files_in_index(findex)) def get_stations( self, pool: Optional[multiprocessing.pool.Pool] = None) -> List[Station]: """ :param pool: optional multiprocessing pool :return: List of all stations in the ApiReader """ return list( maybe_parallel_map(pool, self._station_by_index, self.files_index, chunk_size=1)) def get_station_by_id(self, get_id: str) -> Optional[List[Station]]: """ :param get_id: the id to filter on :return: list of all stations with the requested id or None if id can't be found """ result = [s for s in self.get_stations() if s.id() == get_id] if len(result) < 1: return None return result
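
# Usage sketch: read a structured RedVox directory into Station objects.  The base
# directory and station id are placeholders; the ReadFilter calls mirror those used
# inside the class.
reader = ApiReader("/data/redvox", structured_dir=True,
                   read_filter=io.ReadFilter().with_station_ids({"1637610021"}))
all_stations = reader.get_stations()
one_station_packets = reader.read_files_by_id("1637610021")  # None if the id is absent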
class TimeSyncAnalysis: """ Used for multiple TimeSyncData objects from a station properties: station_id: string, the station_id of the station being analyzed, default empty string best_latency_index: int, the index of the TimeSyncData object with the best latency, default np.nan latency_stats: StatsContainer, the statistics of the latencies offset_stats: StatsContainer, the statistics of the offsets offset_model: optional OffsetModel, used to calculate offset at a given point in time sample_rate_hz: float, the audio sample rate in hz of the station, default np.nan timesync_data: list of TimeSyncData, the TimeSyncData to analyze, default empty list station_start_timestamp: float, the timestamp of when the station became active, default np.nan """ def __init__( self, station_id: str = "", audio_sample_rate_hz: float = np.nan, station_start_timestamp: float = np.nan, time_sync_data: Optional[List[TimeSyncData]] = None, ): """ Initialize the object :param station_id: id of the station to analyze, default empty string :param audio_sample_rate_hz: audio sample rate in hz of the station, default np.nan :param station_start_timestamp: timestamp of when station started recording, default np.nan :param time_sync_data: the TimeSyncData objects created from the packets of the station, default None """ self.station_id: str = station_id self.sample_rate_hz: float = audio_sample_rate_hz self.station_start_timestamp: float = station_start_timestamp self.best_latency_index: int = np.nan self.latency_stats = sh.StatsContainer("latency") self.offset_stats = sh.StatsContainer("offset") self.errors = RedVoxExceptions("TimeSyncAnalysis") if time_sync_data: self.timesync_data: List[TimeSyncData] = time_sync_data self.evaluate_and_validate_data() else: self.timesync_data = [] self.offset_model = OffsetModel.empty_model() def evaluate_and_validate_data(self): """ check the data for errors and update the analysis statistics """ self.evaluate_latencies() self.validate_start_timestamp() self.validate_sample_rate() self._calc_timesync_stats() self.offset_model = self.get_offset_model() def get_offset_model(self) -> OffsetModel: """ :return: an OffsetModel based on the information in the timesync analysis """ return OffsetModel( self.get_latencies(), self.get_offsets(), np.array([ td.get_best_latency_timestamp() for td in self.timesync_data ]), self.timesync_data[0].packet_start_timestamp, self.timesync_data[-1].packet_end_timestamp) def _calc_timesync_stats(self): """ calculates the mean and std deviation for latencies and offsets """ if len(self.timesync_data) < 1: self.errors.append( "Nothing to calculate stats; length of timesync data is less than 1" ) else: for index in range(len(self.timesync_data)): # add the stats of the latency self.latency_stats.add( self.timesync_data[index].mean_latency, self.timesync_data[index].latency_std, self.timesync_data[index].num_tri_messages() * 2, ) # add the stats of the offset self.offset_stats.add( self.timesync_data[index].mean_offset, self.timesync_data[index].offset_std, self.timesync_data[index].num_tri_messages() * 2, ) self.latency_stats.best_value = self.get_best_latency() self.offset_stats.best_value = self.get_best_offset() def from_packets( self, packets: List[Union[WrappedRedvoxPacketM, WrappedRedvoxPacket]] ) -> 'TimeSyncAnalysis': """ converts packets into TimeSyncData objects, then performs analysis :param packets: list of WrappedRedvoxPacketM to convert :return: modified version of self """ self.timesync_data = [ TimeSyncData( self.station_id, 
self.sample_rate_hz, packet.get_sensors().get_audio().get_num_samples(), self.station_start_timestamp, packet.get_timing_information( ).get_server_acquisition_arrival_timestamp(), packet.get_timing_information(). get_packet_start_mach_timestamp(), packet.get_timing_information().get_packet_end_mach_timestamp( ), packet.get_timing_information().get_synch_exchange_array(), packet.get_timing_information().get_best_latency(), packet.get_timing_information().get_best_offset(), ) if isinstance(packet, WrappedRedvoxPacketM) else TimeSyncData( self.station_id, self.sample_rate_hz, packet.microphone_sensor().payload_values().size, self.station_start_timestamp, packet.server_timestamp_epoch_microseconds_utc(), packet.start_timestamp_us_utc(), packet.end_timestamp_us_utc(), list(packet.time_synchronization_sensor().payload_values()), packet.best_latency(), packet.best_offset(), ) for packet in packets ] if len(self.timesync_data) > 0: self.evaluate_and_validate_data() return self def from_raw_packets( self, packets: List[Union[RedvoxPacketM, RedvoxPacket]]) -> 'TimeSyncAnalysis': """ converts packets into TimeSyncData objects, then performs analysis :param packets: list of WrappedRedvoxPacketM to convert :return: modified version of self """ timesync_data: List[TimeSyncData] = [] packet: Union[RedvoxPacketM, RedvoxPacket] for packet in packets: tsd: TimeSyncData if isinstance(packet, RedvoxPacketM): exchanges: List[float] = reduce( lambda acc, ex: acc + [ex.a1, ex.a2, ex.a3, ex.b1, ex.b2, ex.b3], packet.timing_information.synch_exchanges, []) tsd = TimeSyncData( packet.station_information.id, packet.sensors.audio.sample_rate, len(packet.sensors.audio.samples.values), packet.timing_information.app_start_mach_timestamp, packet. timing_information.server_acquisition_arrival_timestamp, packet.timing_information.packet_start_mach_timestamp, packet.timing_information.packet_end_mach_timestamp, exchanges, packet.timing_information.best_latency, packet.timing_information.best_offset) else: mtz: float = np.nan best_latency: float = np.nan best_offset: float = np.nan for i, v in enumerate(packet.metadata): plus_1: int = i + 1 try: if v == "machTimeZero" and plus_1 < len( packet.metadata): mtz = float(packet.metadata[plus_1]) if v == "bestLatency" and plus_1 < len( packet.metadata): best_latency = float(packet.metadata[plus_1]) if v == "bestOffset" and plus_1 < len(packet.metadata): best_offset = float(packet.metadata[plus_1]) except (KeyError, ValueError): continue # Get synch exchanges exchanges: Optional[np.ndarray] = None ch: api900_pb2.UnevenlySampledChannel for ch in packet.unevenly_sampled_channels: if api900_pb2.TIME_SYNCHRONIZATION in ch.channel_types: exchanges = util_900.extract_payload(ch) tsd = TimeSyncData( packet.redvox_id, packet.evenly_sampled_channels[0].sample_rate_hz, util_900.payload_len(packet.evenly_sampled_channels[0]), mtz, packet.evenly_sampled_channels[0]. 
first_sample_timestamp_epoch_microseconds_utc, packet.server_timestamp_epoch_microseconds_utc, packet.app_file_start_timestamp_machine, list(exchanges), best_latency, best_offset, ) timesync_data.append(tsd) self.timesync_data = timesync_data if len(self.timesync_data) > 0: self.evaluate_and_validate_data() return self def add_timesync_data(self, timesync_data: TimeSyncData): """ adds a TimeSyncData object to the analysis :param timesync_data: TimeSyncData to add """ self.timesync_data.append(timesync_data) self.evaluate_and_validate_data() def get_num_packets(self) -> int: """ :return: number of packets analyzed """ return len(self.timesync_data) def get_best_latency(self) -> float: """ :return: the best latency """ if np.isnan(self.best_latency_index): return np.nan return self.timesync_data[self.best_latency_index].best_latency def get_latencies(self) -> np.array: """ :return: np.array containing all the latencies """ return np.array( [ts_data.best_latency for ts_data in self.timesync_data]) def get_mean_latency(self) -> float: """ :return: the mean of the latencies, or np.nan if it doesn't exist """ return self.latency_stats.mean_of_means() def get_latency_stdev(self) -> float: """ :return: the standard deviation of the latencies, or np.nan if it doesn't exist """ return self.latency_stats.total_std_dev() def get_best_offset(self) -> float: """ :return: offset associated with the best latency """ if np.isnan(self.best_latency_index): return np.nan return self.timesync_data[self.best_latency_index].best_offset def get_offsets(self) -> np.array: """ :return: np.array containing all the offsets """ return np.array( [ts_data.best_offset for ts_data in self.timesync_data]) def get_mean_offset(self) -> float: """ :return: the mean of the offsets, or np.nan if it doesn't exist """ return self.offset_stats.mean_of_means() def get_offset_stdev(self) -> float: """ :return: the standard deviation of the offsets, or np.nan if it doesn't exist """ return self.offset_stats.total_std_dev() def get_best_packet_latency_index(self) -> int: """ :return: the best latency's index in the packet with the best latency """ if np.isnan(self.best_latency_index): return np.nan return self.timesync_data[self.best_latency_index].best_latency_index def get_best_start_time(self) -> float: """ :return: start timestamp associated with the best latency """ if np.isnan(self.best_latency_index): return np.nan return self.timesync_data[ self.best_latency_index].packet_start_timestamp def get_start_times(self) -> np.array: """ :return: list of the start timestamps of each packet """ start_times = [] for ts_data in self.timesync_data: start_times.append(ts_data.packet_start_timestamp) return np.array(start_times) def get_bad_packets(self) -> List[int]: """ :return: list of all packets that contains invalid data """ bad_packets = [] for idx in range(self.get_num_packets() ): # mark bad indices (they have a 0 or less value) if self.get_latencies()[idx] <= 0 or np.isnan( self.get_latencies()[idx]): bad_packets.append(idx) return bad_packets def evaluate_latencies(self): """ finds the best latency outputs warnings if a change in timestamps is detected """ if self.get_num_packets() < 1: self.errors.append( "Latencies cannot be evaluated; length of timesync data is less than 1" ) else: self.best_latency_index = 0 # assume the first element has the best timesync values for now, then compare with the others for index in range(1, self.get_num_packets()): best_latency = self.get_best_latency() # find the best latency; in this case, 
the minimum # if new value exists and if the current best does not or new value is better than current best, update if (not np.isnan(self.timesync_data[index].best_latency) and (np.isnan(best_latency)) or self.timesync_data[index].best_latency < best_latency): self.best_latency_index = index def validate_start_timestamp(self, debug: bool = False) -> bool: """ confirms if station_start_timestamp differs in any of the timesync_data outputs warnings if a change in timestamps is detected :param debug: if True, output warning message, default False :return: True if no change """ for index in range(self.get_num_packets()): # compare station start timestamps; notify when they are different if (self.timesync_data[index].station_start_timestamp != self.station_start_timestamp): self.errors.append( f"Change in station start timestamp detected; " f"expected: {self.station_start_timestamp}, read: " f"{self.timesync_data[index].station_start_timestamp}") if debug: self.errors.print() return False # if here, all the sample timestamps are the same return True def validate_sample_rate(self, debug: bool = False) -> bool: """ confirms if sample rate is the same across all timesync_data outputs warning if a change in sample rate is detected :param debug: if True, output warning message, default False :return: True if no change """ for index in range(self.get_num_packets()): # compare station start timestamps; notify when they are different if (np.isnan(self.timesync_data[index].sample_rate_hz) or self.timesync_data[index].sample_rate_hz != self.sample_rate_hz): self.errors.append( f"Change in station sample rate detected; " f"expected: {self.sample_rate_hz}, read: {self.timesync_data[index].sample_rate_hz}" ) if debug: self.errors.print() return False # if here, all the sample rates are the same return True def validate_time_gaps(self, gap_duration_s: float, debug: bool = False) -> bool: """ confirms there are no data gaps between packets outputs warning if a gap is detected :param gap_duration_s: length of time in seconds to be detected as a gap :param debug: if True, output warning message, default False :return: True if no gap """ if self.get_num_packets() < 2: self.errors.append( "Less than 2 timesync data objects to evaluate gaps with") if debug: self.errors.print() else: for index in range(1, self.get_num_packets()): # compare last packet's end timestamp with current start timestamp if (dt.microseconds_to_seconds( self.timesync_data[index].packet_start_timestamp - self.timesync_data[index - 1].packet_end_timestamp) > gap_duration_s): self.errors.append( f"Gap detected at packet number: {index}") if debug: self.errors.print() return False # if here, no gaps return True def update_timestamps(self, use_model: bool = True): """ update timestamps by adding microseconds based on the OffsetModel. :param use_model: if True, use the model, otherwise use best offset """ if use_model and self.offset_model: self.station_start_timestamp += self.offset_model.get_offset_at_time( self.station_start_timestamp) for tsd in self.timesync_data: tsd.update_timestamps(self.offset_model) else: self.station_start_timestamp += self.get_best_offset() for tsd in self.timesync_data: tsd.update_timestamps()
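
# Usage sketch: analyze timing for one station's packets and read back the results.
# `raw_packets`, the id, sample rate, and start timestamp are placeholders.
raw_packets: List[RedvoxPacketM] = []  # placeholder: packets read elsewhere
tsa = TimeSyncAnalysis(station_id="1637610021", audio_sample_rate_hz=800.0,
                       station_start_timestamp=1618896000000000.0).from_raw_packets(raw_packets)
best_offset_us = tsa.get_best_offset()  # np.nan until packets are provided
offset_at_start = tsa.offset_model.get_offset_at_time(tsa.station_start_timestamp)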
class DataWindow: """ Holds the data for a given time window; adds interpolated timestamps to fill gaps and pad start and end values Properties: event_name: str, name of the DataWindow. defaults to "dw" event_origin: Optional EventOrigin which describes the physical location and radius of the origin event. Default empty EventOrigin (no valid data) config: optional DataWindowConfig with information on how to construct DataWindow from Redvox (.rdvx*) files. Default None sdk_version: str, the version of the Redvox SDK used to create the DataWindow debug: bool, if True, outputs additional information during initialization. Default False Protected: _fs_writer: DataWindowFileSystemWriter; includes event_name, output directory (Default "."), output type (options: "PARQUET", "LZ4", "NONE". Default NONE), and option to make a runme.py example file (Default False) _stations: List of Stations that belong to the DataWindow _errors: RedVoxExceptions; contains a list of all errors encountered by the DataWindow """ def __init__( self, event_name: str = "dw", event_origin: Optional[EventOrigin] = None, config: Optional[DataWindowConfig] = None, output_dir: str = ".", out_type: str = "NONE", make_runme: bool = False, debug: bool = False, ): """ Initialize the DataWindow :param event_name: name of the DataWindow. defaults to "dw" :param event_origin: Optional EventOrigin which describes the physical location and radius of the origin event. Default empty EventOrigin (no valid data) :param config: Optional DataWindowConfig which describes how to extract data from Redvox files. Default None :param output_dir: output directory for saving files. Default "." (current directory) :param out_type: type of file to save the DataWindow as. Options: "PARQUET", "LZ4", "NONE". Default "NONE" (no saving) :param make_runme: if True, saves an example runme.py file with the data. Default False :param debug: if True, outputs additional information during initialization. 
Default False """ self.event_name: str = event_name self.event_origin: EventOrigin = event_origin if event_origin else EventOrigin() self._fs_writer = dw_io.DataWindowFileSystemWriter(self.event_name, out_type, output_dir, make_runme) self.debug: bool = debug self._sdk_version: str = redvox.VERSION self._errors = RedVoxExceptions("DataWindow") self._stations: List[Station] = [] self._config = config if config: if config.start_datetime and config.end_datetime and (config.end_datetime <= config.start_datetime): self._errors.append("DataWindow will not work when end datetime is before or equal to start datetime.\n" f"Your times: {config.end_datetime} <= {config.start_datetime}") else: self.create_data_window() if self.debug: self.print_errors() # def __repr__(self): # # todo: use representations for the datetime and timedelta objects # # todo: use the dictionary function # return dw_io.dict_to_json({ # "event_name": self.event_name, # "event_origin": repr(self.event_origin), # "config": repr(self._config), # "base_dir": self.save_dir(), # "out_type": self._fs_writer.file_extension, # "make_runme": self._fs_writer.make_run_me, # "sdk_version": self._sdk_version, # "errors": repr(self._errors), # "debug": self.debug # }) # # def __str__(self): # # todo: use representations for the datetime and timedelta objects # # todo: use the dictionary function # return dw_io.dict_to_json( # {"event_name": self.event_name, # "event_origin": str(self.event_origin), # "config": str(self._config), # "base_dir": self.save_dir(), # "stations": [s.default_station_json_file_name() for s in self._stations], # "out_type": self._fs_writer.file_extension, # "make_runme": self._fs_writer.make_run_me, # "sdk_version": self._sdk_version, # "errors": str(self._errors), # "debug": self.debug # }) def save_dir(self) -> str: """ :return: directory data is saved to (empty string means saving to memory) """ return self._fs_writer.save_dir() def set_save_dir(self, new_save_dir: Optional[str] = "."): """ :param new_save_dir: directory to save data to; default current directory, or "." """ self._fs_writer.base_dir = new_save_dir def is_make_runme(self) -> bool: """ :return: if DataWindow will be saved with a runme file """ return self._fs_writer.make_run_me def set_make_runme(self, make_runme: bool = False): """ :param make_runme: if True, DataWindow will create a runme file when saved. Default False """ self._fs_writer.make_run_me = make_runme def fs_writer(self) -> dw_io.DataWindowFileSystemWriter: """ :return: DataWindowFileSystemWriter for DataWindow """ return self._fs_writer def out_type(self) -> str: """ :return: string of the output type of the DataWindow """ return self._fs_writer.file_extension def set_out_type(self, new_out_type: str): """ set the output type of the DataWindow. options are "NONE", "PARQUET" and "LZ4". 
invalid values become "NONE" :param new_out_type: new output type of the DataWindow """ self._fs_writer.set_extension(new_out_type) def as_dict(self) -> Dict: """ :return: DataWindow properties as dictionary """ return {"event_name": self.event_name, "event_origin": self.event_origin.as_dict(), "start_time": self.start_date(), "end_time": self.end_date(), "base_dir": self.save_dir(), "stations": [s.default_station_json_file_name() for s in self._stations], "config": self._config.as_dict(), "debug": self.debug, "errors": self._errors.as_dict(), "sdk_version": self._sdk_version, "out_type": self._fs_writer.file_extension, "make_runme": self._fs_writer.make_run_me } def pretty(self) -> str: """ :return: DataWindow as dictionary, but easier to read """ # noinspection Mypy return pprint.pformat(self.as_dict()) @staticmethod def from_config(config: DataWindowConfigFile) -> "DataWindow": """ Use a config file to create a DataWindow :param config: DataWindowConfigFile to load from :return: DataWindow """ event_origin = EventOrigin(config.origin_provider, config.origin_latitude, config.origin_latitude_std, config.origin_longitude, config.origin_longitude_std, config.origin_altitude, config.origin_altitude_std, config.origin_event_radius_m) dw_config = DataWindowConfig(config.input_directory, config.structured_layout, config.start_dt(), config.end_dt(), config.start_buffer_td(), config.end_buffer_td(), config.drop_time_seconds, config.station_ids, config.extensions, config.api_versions, config.apply_correction, config.use_model_correction, config.copy_edge_points()) return DataWindow(config.event_name, event_origin, dw_config, config.output_dir, config.output_type, config.make_runme, config.debug) @staticmethod def from_config_file(file: str) -> "DataWindow": """ Loads a configuration file to create the DataWindow :param file: full path to config file :return: DataWindow """ return DataWindow.from_config(DataWindowConfigFile.from_path(file)) @staticmethod def deserialize(path: str) -> "DataWindow": """ Decompresses and deserializes a DataWindow written to disk. :param path: Path to the serialized and compressed DataWindow. :return: An instance of a DataWindow. """ return dw_io.deserialize_data_window(path) def serialize(self, compression_factor: int = 4) -> Path: """ Serializes and compresses this DataWindow to a file. Uses the event_name and out_dir to name the file. :param compression_factor: A value between 1 and 12. Higher values provide better compression, but take longer. (default=4). :return: The path to the written file. """ return dw_io.serialize_data_window(self, self.save_dir(), f"{self.event_name}.pkl.lz4", compression_factor) def _to_json_file(self) -> Path: """ Converts the DataWindow metadata into a JSON file and compresses the DataWindow and writes it to disk. :return: The path to the written file """ return dw_io.data_window_to_json(self, self.save_dir()) def to_json(self) -> str: """ :return: The DataWindow metadata into a JSON string. """ return dw_io.data_window_as_json(self) @staticmethod def from_json(json_str: str) -> "DataWindow": """ Read the DataWindow from a JSON string. If file is improperly formatted, raises a ValueError. :param json_str: the JSON to read :return: The DataWindow as defined by the JSON """ return DataWindow.from_json_dict(dw_io.json_to_dict(json_str)) @staticmethod def from_json_dict(json_dict: Dict) -> "DataWindow": """ Reads a JSON dictionary and loads the data into the DataWindow. If dictionary is improperly formatted, raises a ValueError. 
:param json_dict: the dictionary to read :return: The DataWindow as defined by the JSON """ if "out_type" not in json_dict.keys() \ or json_dict["out_type"].upper() not in dw_io.DataWindowOutputType.list_names(): raise ValueError('Dictionary loading type is invalid or unknown. ' 'Check the value "out_type"; it must be one of: ' f'{dw_io.DataWindowOutputType.list_non_none_names()}') else: out_type = dw_io.DataWindowOutputType.str_to_type(json_dict["out_type"]) if out_type == dw_io.DataWindowOutputType.PARQUET: dwin = DataWindow(json_dict["event_name"], EventOrigin.from_dict(json_dict["event_origin"]), None, json_dict["base_dir"], json_dict["out_type"], json_dict["make_runme"], json_dict["debug"]) dwin._config = DataWindowConfig.from_dict(json_dict["config"]) dwin._errors = RedVoxExceptions.from_dict(json_dict["errors"]) dwin._sdk_version = json_dict["sdk_version"] for st in json_dict["stations"]: dwin.add_station(Station.from_json_file(os.path.join(json_dict["base_dir"], st), f"{st}.json")) elif out_type == dw_io.DataWindowOutputType.LZ4: dwin = DataWindow.deserialize(os.path.join(json_dict["base_dir"], f"{json_dict['event_name']}.pkl.lz4")) else: dwin = DataWindow() return dwin def save(self) -> Path: """ save the DataWindow to disk if saving is enabled if saving is not enabled, adds an error to the DataWindow and returns an empty path. :return: the path to where the files exist; an empty path means no files were saved """ if self._fs_writer.is_save_disk(): if self._fs_writer.is_use_disk() and self._fs_writer.make_run_me: shutil.copyfile(os.path.abspath(inspect.getfile(run_me)), os.path.join(self._fs_writer.save_dir(), "runme.py")) if self._fs_writer.file_extension == "parquet": return self._to_json_file() elif self._fs_writer.file_extension == "lz4": return self.serialize() else: self._errors.append("Saving not enabled.") print("WARNING: Cannot save data window without knowing extension.") return Path() @staticmethod def load(file_path: str) -> "DataWindow": """ load from json metadata and lz4 compressed file or directory of files :param file_path: full path of file to load :return: DataWindow from json metadata """ cur_path = os.getcwd() os.chdir(os.path.dirname(file_path)) result = DataWindow.from_json_dict(dw_io.json_file_to_data_window(file_path)) os.chdir(cur_path) return result def config(self) -> DataWindowConfig: """ :return: settings used to create the DataWindow """ return self._config def sdk_version(self) -> str: """ :return: sdk version used to create the DataWindow """ return self._sdk_version def set_sdk_version(self, version: str): """ :param version: the sdk version to set """ self._sdk_version = version def start_date(self) -> float: """ :return: minimum start timestamp of the data or np.nan if no data """ if len(self._stations) > 0: return np.min([s.first_data_timestamp() for s in self._stations]) return np.nan def end_date(self) -> float: """ :return: maximum end timestamp of the data or np.nan if no data """ if len(self._stations) > 0: return np.max([s.last_data_timestamp() for s in self._stations]) return np.nan def stations(self) -> List[Station]: """ :return: list of stations in the DataWindow """ return self._stations def station_ids(self) -> List[str]: """ :return: ids of stations in the DataWindow """ return [s.id() for s in self._stations] def add_station(self, station: Station): """ add a station to the DataWindow :param station: Station to add """ self._stations.append(station) def remove_station(self, station_id: Optional[str] = None, start_date: 
Optional[float] = None): """ remove the first station from the DataWindow, or a specific station if given the id and/or start date if an id is given, the first station with that id will be removed if a start date is given, the removed station will start at or after the start date start date is in microseconds since epoch UTC :param station_id: id of station to remove :param start_date: start date that is at or before the station to remove """ id_removals = [] sd_removals = [] if station_id is None and start_date is None: self._stations.pop() else: if station_id is not None: for s in range(len(self._stations)): if self._stations[s].id() == station_id: id_removals.append(s) if start_date is not None: for s in range(len(self._stations)): if self._stations[s].start_date() >= start_date: sd_removals.append(s) if len(id_removals) > 0 and start_date is None: self._stations.pop(id_removals.pop()) elif len(sd_removals) > 0 and station_id is None: self._stations.pop(sd_removals.pop()) elif len(id_removals) > 0 and len(sd_removals) > 0: for a in id_removals: for b in sd_removals: if a == b: self._stations.pop(a) return def first_station(self, station_id: Optional[str] = None) -> Optional[Station]: """ :param station_id: optional station id to filter on :return: first station matching params; if no params given, gets first station in list. returns None if no station with given station_id exists. """ if len(self._stations) < 1: self._errors.append(f"Attempted to get a station, but there are no stations in the data window!") if self.debug: print(f"Attempted to get a station, but there are no stations in the data window!") return None elif station_id: result = [s for s in self._stations if s.get_key().check_key(station_id, None, None)] if len(result) > 0: return result[0] self._errors.append(f"Attempted to get station {station_id}, but that station is not in this data window!") if self.debug: print(f"Attempted to get station {station_id}, but that station is not in this data window!") return None return self._stations[0] def get_station(self, station_id: str, station_uuid: Optional[str] = None, start_timestamp: Optional[float] = None) -> Optional[List[Station]]: """ Get stations from the DataWindow. Must give at least the station's id. Other parameters may be None, which means the value will be ignored when searching. Results will match all non-None parameters given. 
:param station_id: station id :param station_uuid: station uuid, default None :param start_timestamp: station start timestamp in microseconds since UTC epoch, default None :return: A list of valid stations or None if the station cannot be found """ result = [s for s in self._stations if s.get_key().check_key(station_id, station_uuid, start_timestamp)] if len(result) > 0: return result self._errors.append(f"Attempted to get station {station_id}, but that station is not in this data window!") if self.debug: print(f"Attempted to get station {station_id}, but that station is not in this data window!") return None # def _add_sensor_to_window(self, station: Station): # set the window start and end if they were specified, otherwise use the bounds of the data # self.create_window_in_sensors(station, self._config.start_datetime, self._config.end_datetime) def create_data_window(self, pool: Optional[multiprocessing.pool.Pool] = None): """ updates the DataWindow to contain only the data within the window parameters stations without audio or any data outside the window are removed """ # Let's create and manage a single pool of workers that we can utilize throughout # the instantiation of the data window. _pool: multiprocessing.pool.Pool = multiprocessing.Pool() if pool is None else pool r_f = io.ReadFilter() if self._config.start_datetime: r_f.with_start_dt(self._config.start_datetime) if self._config.end_datetime: r_f.with_end_dt(self._config.end_datetime) if self._config.station_ids: r_f.with_station_ids(self._config.station_ids) if self._config.extensions: r_f.with_extensions(self._config.extensions) else: self._config.extensions = r_f.extensions if self._config.api_versions: r_f.with_api_versions(self._config.api_versions) else: self._config.api_versions = r_f.api_versions r_f.with_start_dt_buf(self._config.start_buffer_td) r_f.with_end_dt_buf(self._config.end_buffer_td) if self.debug: print("Reading files from disk. 
This may take a few minutes to complete.") # get the data to convert into a window a_r = ApiReaderDw(self._config.input_dir, self._config.structured_layout, r_f, correct_timestamps=self._config.apply_correction, use_model_correction=self._config.use_model_correction, dw_base_dir=self.save_dir(), dw_save_mode=self._fs_writer.save_mode(), debug=self.debug, pool=_pool) self._errors.extend_error(a_r.errors) if self._fs_writer.is_use_mem() and a_r.dw_save_mode != self._fs_writer.save_mode(): if self.debug: print("Estimated size of files exceeds available memory.") print("Automatically using temporary directory to store data.") self._fs_writer.set_use_temp(True) # Parallel update # Apply timing correction in parallel by station sts = a_r.get_stations() if self.debug: print("num stations loaded: ", len(sts)) # if self._config.apply_correction: # for st in maybe_parallel_map(_pool, Station.update_timestamps, # iter(sts), chunk_size=1): # self._add_sensor_to_window(st) # if self.debug: # print("station processed: ", st.id()) for st in maybe_parallel_map(_pool, Station.update_timestamps, iter(sts), chunk_size=1): self.create_window_in_sensors(st, self._config.start_datetime, self._config.end_datetime) if self.debug: print("station processed: ", st.id()) # check for stations without data self._check_for_audio() self._check_valid_ids() # update the default data window name if we have data and the default name exists if self.event_name == "dw" and len(self._stations) > 0: self.event_name = f"dw_{int(self.start_date())}_{len(self._stations)}" # must update the start and end in order for the data to be saved # update remaining data window values if they're still default if not self._config.start_datetime and len(self._stations) > 0: self._config.start_datetime = dtu.datetime_from_epoch_microseconds_utc( np.min([t.first_data_timestamp() for t in self._stations])) # end_datetime is non-inclusive, so it must be greater than our latest timestamp if not self._config.end_datetime and len(self._stations) > 0: self._config.end_datetime = dtu.datetime_from_epoch_microseconds_utc( np.max([t.last_data_timestamp() for t in self._stations]) + 1) # If the pool was created by this function, then it needs to managed by this function. 
if pool is None: _pool.close() def _check_for_audio(self): """ removes any station without audio data from the DataWindow """ remove = [] for s in self._stations: if not s.has_audio_sensor(): remove.append(s.id()) if len(remove) > 0: self._stations = [s for s in self._stations if s.id() not in remove] def _check_valid_ids(self): """ if there are stations, searches the station_ids for any ids not in the data collected and creates an error message for each id requested but has no data if there are no stations, creates a single error message declaring no data found """ if len(self._stations) < 1 and self._config.station_ids: if len(self._config.station_ids) > 1: add_ids = f"for all stations {self._config.station_ids} " else: add_ids = "" self._errors.append(f"No data matching criteria {add_ids}in {self._config.input_dir}" f"\nPlease adjust parameters of DataWindow") elif len(self._stations) > 0 and self._config.station_ids: for ids in self._config.station_ids: if ids.zfill(10) not in [i.id() for i in self._stations]: self._errors.append( f"Requested {ids} but there is no data to read for that station" ) def create_window_in_sensors( self, station: Station, start_datetime: Optional[dtu.datetime] = None, end_datetime: Optional[dtu.datetime] = None ): """ truncate the sensors in the station to only contain data from start_date_timestamp to end_date_timestamp if the start and/or end are not specified, keeps all audio data that fits and uses it to truncate the other sensors. returns nothing, updates the station in place :param station: station object to truncate sensors of :param start_datetime: datetime of start of window, default None :param end_datetime: datetime of end of window, default None """ if start_datetime: start_datetime = dtu.datetime_to_epoch_microseconds_utc(start_datetime) else: start_datetime = 0 if end_datetime: end_datetime = dtu.datetime_to_epoch_microseconds_utc(end_datetime) else: end_datetime = dtu.datetime_to_epoch_microseconds_utc(dtu.datetime.max) self.process_sensor(station.audio_sensor(), station.id(), start_datetime, end_datetime) for sensor in [s for s in station.data() if s.type() != SensorType.AUDIO]: self.process_sensor(sensor, station.id(), station.audio_sensor().first_data_timestamp(), station.audio_sensor().last_data_timestamp()) # recalculate metadata station.update_first_and_last_data_timestamps() station.set_packet_metadata([meta for meta in station.packet_metadata() if meta.packet_start_mach_timestamp < station.last_data_timestamp() and meta.packet_end_mach_timestamp >= station.first_data_timestamp()]) if self._fs_writer.is_save_disk(): station.set_save_mode(io.FileSystemSaveMode.DISK) station.set_save_dir(self.save_dir() if self._fs_writer.is_use_disk() else self._fs_writer.get_temp()) self._stations.append(station) def process_sensor(self, sensor: SensorData, station_id: str, start_date_timestamp: float, end_date_timestamp: float): """ process a non audio sensor to fit within the DataWindow. Updates sensor in place, returns nothing. 
:param sensor: sensor to process :param station_id: station id :param start_date_timestamp: start of DataWindow :param end_date_timestamp: end of DataWindow """ if sensor.num_samples() > 0: # get only the timestamps between the start and end timestamps before_start = np.where(sensor.data_timestamps() < start_date_timestamp)[0] after_end = np.where(end_date_timestamp <= sensor.data_timestamps())[0] # start_index is inclusive of window start if len(before_start) > 0: last_before_start = before_start[-1] start_index = last_before_start + 1 else: last_before_start = None start_index = 0 # end_index is non-inclusive of window end if len(after_end) > 0: first_after_end = after_end[0] end_index = first_after_end else: first_after_end = None end_index = sensor.num_samples() # check if all the samples have been cut off is_audio = sensor.type() == SensorType.AUDIO if end_index <= start_index: if is_audio: self._errors.append(f"Data window for {station_id} " f"Audio sensor has truncated all data points") elif last_before_start is not None and first_after_end is None: first_entry = sensor.pyarrow_table().slice(last_before_start, 1).to_pydict() first_entry["timestamps"] = [start_date_timestamp] sensor.write_pyarrow_table(pa.Table.from_pydict(first_entry)) elif last_before_start is None and first_after_end is not None: last_entry = sensor.pyarrow_table().slice(first_after_end, 1).to_pydict() last_entry["timestamps"] = [start_date_timestamp] sensor.write_pyarrow_table(pa.Table.from_pydict(last_entry)) elif last_before_start is not None and first_after_end is not None: sensor.write_pyarrow_table( sensor.interpolate(start_date_timestamp, last_before_start, 1, self._config.copy_edge_points == gpu.DataPointCreationMode.COPY)) else: self._errors.append( f"Data window for {station_id} {sensor.type().name} " f"sensor has truncated all data points" ) else: _arrow = sensor.pyarrow_table().slice(start_index, end_index-start_index) # if sensor is audio or location, we want nan'd edge points if sensor.type() in [SensorType.LOCATION, SensorType.AUDIO]: new_point_mode = gpu.DataPointCreationMode.NAN else: new_point_mode = self._config.copy_edge_points # add in the data points at the edges of the window if there are defined start and/or end times slice_start = _arrow["timestamps"].to_numpy()[0] slice_end = _arrow["timestamps"].to_numpy()[-1] if not is_audio: end_sample_interval = end_date_timestamp - slice_end end_samples_to_add = 1 start_sample_interval = start_date_timestamp - slice_start start_samples_to_add = 1 else: end_sample_interval = dtu.seconds_to_microseconds(sensor.sample_interval_s()) start_sample_interval = -end_sample_interval if self._config.end_datetime: end_samples_to_add = int((dtu.datetime_to_epoch_microseconds_utc(self._config.end_datetime) - slice_end) / end_sample_interval) else: end_samples_to_add = 0 if self._config.start_datetime: start_samples_to_add = int((slice_start - dtu.datetime_to_epoch_microseconds_utc(self._config.start_datetime)) / end_sample_interval) else: start_samples_to_add = 0 # add to end _arrow = (gpu.add_data_points_to_df(data_table=_arrow, start_index=_arrow.num_rows - 1, sample_interval_micros=end_sample_interval, num_samples_to_add=end_samples_to_add, point_creation_mode=new_point_mode)) # add to begin _arrow = (gpu.add_data_points_to_df(data_table=_arrow, start_index=0, sample_interval_micros=start_sample_interval, num_samples_to_add=start_samples_to_add, point_creation_mode=new_point_mode)) sensor.sort_by_data_timestamps(_arrow) else: self._errors.append(f"Data window 
for {station_id} {sensor.type().name} " f"sensor has no data points!") def print_errors(self): """ prints errors to screen """ self._errors.print() for stn in self._stations: stn.print_errors()
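# The following is a minimal, illustrative sketch (not part of the SDK) showing how the DataWindow
# accessors defined above might be used once a window has been populated. "dw_config.json" is a
# placeholder path to a DataWindowConfigFile on disk; whether loading the config alone fully
# populates the window depends on DataWindow.__init__ (defined earlier in this module).
def _example_inspect_data_window(config_path: str = "dw_config.json") -> None:
    dw = DataWindow.from_config_file(config_path)
    # ids of the stations that ended up in the window
    print("station ids:", dw.station_ids())
    # window bounds in microseconds since epoch UTC (np.nan when the window is empty)
    print("window spans:", dw.start_date(), "to", dw.end_date())
    # first_station() returns None (and records an error) when no station matches
    first = dw.first_station()
    if first is not None:
        print("first station:", first.id())
    # save() writes the window to disk only if saving was enabled; otherwise it records an error
    saved_to = dw.save()
    print("saved to:", saved_to)
    # print any errors accumulated while building or querying the window
    dw.print_errors()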
class EventStream: """ stores event stream data gathered from a single station. ALL timestamps in microseconds since epoch UTC unless otherwise stated """ def __init__(self, name: str = "event", schema: Optional[Dict[str, list]] = None, save_mode: FileSystemSaveMode = FileSystemSaveMode.MEM, base_dir: str = "."): """ initialize EventStream for a station :param name: name of the EventStream. Default "event" :param schema: a structured dictionary of the data table schema. Dictionary must look like: {"string": [s_values], "numeric": [n_values], "boolean": [o_values], "byte": [b_values]} where [*_values] is a list of strings and can be empty. Default None :param save_mode: FileSystemSaveMode that determines how data is saved. Default FileSystemSaveMode.MEM (use RAM). Other options are DISK (save to directory) and TEMP (save to temporary directory) :param base_dir: the location of the parquet file that holds the data. Not used if save_data is False. Default current directory (".") """ self.name = name self.timestamps_metadata = {} self.metadata = {} self._errors = RedVoxExceptions("EventStream") self._is_timestamps_corrected = False self._fs_writer = Fsw(f"event_{name}", "parquet", base_dir, save_mode) self._data = None self._schema = {"string": [], "numeric": [], "boolean": [], "byte": []} if schema is not None: self.set_schema(schema) def as_dict(self) -> dict: """ :return: EventStream as a dictionary """ return { "name": self.name, "metadata": self.metadata, "timestamps_metadata": self.timestamps_metadata, "is_timestamps_corrected": self._is_timestamps_corrected, "schema": self._schema, "file_path": self.full_path(), "errors": self._errors.as_dict() } @staticmethod def __get_items(payload: Mapping[str]): return payload.get_metadata().items() @staticmethod def __get_items_raw(payload): return payload.items() @staticmethod def __get_keys(ptype: str, payload: Mapping[str]): return ptype, payload.get_metadata().keys() @staticmethod def __get_keys_raw(ptype: str, payload): return ptype, payload.keys() def __set_schema(self, name: str, value: str): self._schema[name].append(value) def _get_tbl_schema(self) -> Dict[str, list]: """ :return: the dictionary used to create the EventStream data object """ if self._data: result = {} for f in self._data.schema.names: result[f] = [] else: result = {"timestamps": [], "unaltered_timestamps": []} for t, s in self._schema.items(): for k in s: result[k] = [] return result def read_events(self, eventstream: es.EventStream): """ read the payloads of each event in the eventstream and separate the data by payload type :param eventstream: stream of events to process """ self.name = eventstream.get_name() self._fs_writer.file_name = f"event_{self.name}" num_events = eventstream.get_events().get_count() if num_events > 1: tbl = self._get_tbl_schema() self.timestamps_metadata = eventstream.get_timestamps().get_metadata() self.metadata = eventstream.get_metadata() first_event = eventstream.get_events().get_values()[0] for t, c in map(self.__get_keys, ["string", "numeric", "boolean", "byte"], [first_event.get_string_payload(), first_event.get_numeric_payload(), first_event.get_boolean_payload(), first_event.get_byte_payload()]): for k in c: self.add_to_schema(t, k) tbl[k] = [] for i in range(num_events): tbl["timestamps"].append(eventstream.get_timestamps().get_timestamps()[i]) tbl["unaltered_timestamps"].append(eventstream.get_timestamps().get_timestamps()[i]) evnt = eventstream.get_events().get_values()[i] for items in map(self.__get_items, [evnt.get_string_payload(), 
evnt.get_numeric_payload(), evnt.get_boolean_payload(), evnt.get_byte_payload()]): for c, st in items: tbl[c].append(st) self._data = pa.Table.from_pydict(tbl) def read_raw(self, stream: RedvoxPacketM.EventStream) -> 'EventStream': """ read the contents of a protobuf stream :param stream: the protobuf stream to read """ self.name = stream.name self._fs_writer.file_name = f"event_{self.name}" num_events = len(stream.events) if num_events > 1: tbl = self._get_tbl_schema() self.timestamps_metadata = stream.timestamps.metadata self.metadata = stream.metadata first_event = stream.events[0] for t, c in map(EventStream.__get_keys_raw, ["string", "numeric", "boolean", "byte"], [first_event.string_payload, first_event.numeric_payload, first_event.boolean_payload, first_event.byte_payload]): for k in c: self.add_to_schema(t, k) tbl[k] = [] for i in range(num_events): tbl["timestamps"].append(stream.timestamps.timestamps[i]) tbl["unaltered_timestamps"].append(stream.timestamps.timestamps[i]) evnt = stream.events[i] for items in map(EventStream.__get_items_raw, [evnt.string_payload, evnt.numeric_payload, evnt.boolean_payload, evnt.byte_payload]): for c, st in items: tbl[c].append(st) self._data = pa.Table.from_pydict(tbl) return self def read_from_dir(self, file: str): """ read a pyarrow table from a file on disk :param file: full path to the file to read """ try: tbl = pq.read_table(file) if tbl.schema.names == list(self._get_tbl_schema().keys()): self._data = tbl except FileNotFoundError: self._errors.append("No data file was found; this event is empty.") self._data = None def get_string_schema(self) -> List[str]: """ :return: the column names of string typed data as a list of strings """ return self._schema["string"] def get_numeric_schema(self) -> List[str]: """ :return: the column names of numeric typed data as a list of strings """ return self._schema["numeric"] def get_boolean_schema(self) -> List[str]: """ :return: the column names of boolean typed data as a list of strings """ return self._schema["boolean"] def get_byte_schema(self) -> List[str]: """ :return: the column names of byte typed data as a list of strings """ return self._schema["byte"] def get_schema(self) -> dict: """ :return: the schema of the EventStream """ return self._schema def get_string_values(self) -> pa.Table: """ :return: the string data as a pyarrow table """ return self._data.select(self.get_string_schema()) if self._data else pa.Table.from_pydict({}) def get_numeric_values(self) -> pa.Table: """ :return: the numeric data as a pyarrow table """ return self._data.select(self.get_numeric_schema()) if self._data else pa.Table.from_pydict({}) def get_boolean_values(self) -> pa.Table: """ :return: the boolean data as a pyarrow table """ return self._data.select(self.get_boolean_schema()) if self._data else pa.Table.from_pydict({}) def get_byte_values(self) -> pa.Table: """ :return: the byte data as a pyarrow table """ return self._data.select(self.get_byte_schema()) if self._data else pa.Table.from_pydict({}) def _check_for_name(self, column_name: str, schema: List[str]) -> bool: """ :param column_name: name of column to check for :param schema: list of allowed names :return: True if column_name is in schema, sets error and returns False if not """ if column_name not in schema: self._errors.append(f"WARNING: Column {column_name} does not exist; try one of {schema}") return False return True def __get_column_data(self, schema: List[str], column_name: str) -> np.array: """ :param schema: list of column names to search :param column_name: 
column name to get :return: the data as an np.array; if empty, column name or data doesn't exist """ return self._data[column_name].to_numpy() if self._check_for_name(column_name, schema) else np.array([]) def get_string_column(self, column_name: str) -> np.array: """ :param column_name: name of string payload to retrieve :return: string data from the column specified """ return self.__get_column_data(self.get_string_schema(), column_name) def get_numeric_column(self, column_name: str) -> np.array: """ :param column_name: name of numeric payload to retrieve :return: numeric data from the column specified """ return self.__get_column_data(self.get_numeric_schema(), column_name) def get_boolean_column(self, column_name: str) -> np.array: """ :param column_name: name of boolean payload to retrieve :return: boolean data from the column specified """ return self.__get_column_data(self.get_boolean_schema(), column_name) def get_byte_column(self, column_name: str) -> np.array: """ :param column_name: name of byte payload to retrieve :return: bytes data from the column specified """ return self.__get_column_data(self.get_byte_schema(), column_name) def set_schema(self, schema: Dict[str, list]): """ sets the schema of the EventStream using a specially structured dictionary. Structure is: {"string": [s_values], "numeric": [n_values], "boolean": [o_values], "byte": [b_values]} where [*_values] is a list of strings and can be empty :param schema: specially structured dictionary of data table schema """ if schema.keys() != self._schema.keys(): self._errors.append(f"Attempted to add invalid schema with keys {list(schema.keys())} to EventStreams.\n" f"Valid keys are: {list(self._schema.keys())}") else: self._schema = schema def add_to_schema(self, key: str, value: str): """ adds a value to the schema, under the specified key :param key: one of "string", "numeric", "boolean", or "byte" :param value: the name of the column to add to the schema """ if key not in self._schema.keys(): self._errors.append("Attempted to add an unknown key to the EventStream schema.\n" f"You must use one of {self._schema.keys()}.") elif value not in self._schema[key]: self._schema[key].append(value) def add(self, other_stream: es.EventStream): """ adds a Redvox Api1000 EventStream with the same name to the data :param other_stream: another EventStream with the same name """ if self.name != other_stream.get_name(): self._errors.append(f"Attempted to add a stream with a different name ({other_stream.get_name()})") else: self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.get_timestamps().get_metadata()} self.metadata = {**self.metadata, **other_stream.get_metadata()} num_events = other_stream.get_events().get_count() if num_events > 1: tbl = self._get_tbl_schema() for i in range(num_events): tbl["timestamps"].append(other_stream.get_timestamps().get_timestamps()[i]) tbl["unaltered_timestamps"].append(other_stream.get_timestamps().get_timestamps()[i]) evnt = other_stream.get_events().get_values()[i] for items in map(self.__get_items, [evnt.get_string_payload(), evnt.get_numeric_payload(), evnt.get_boolean_payload(), evnt.get_byte_payload()]): for c, st in items: tbl[c].append(st) self._data = pa.concat_tables([self._data, pa.Table.from_pydict(tbl)]) def add_raw(self, other_stream: RedvoxPacketM.EventStream): """ add a protobuf EventStream with the same name to the data :param other_stream: a protobuf EventStream to add """ if self.name != other_stream.name: self._errors.append(f"Attempted to add a stream with a 
different name ({other_stream.name})") else: self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.timestamps.metadata} self.metadata = {**self.metadata, **other_stream.metadata} num_events = len(other_stream.events) if num_events > 1: tbl = self._get_tbl_schema() for i in range(num_events): tbl["timestamps"].append(other_stream.timestamps.timestamps[i]) tbl["unaltered_timestamps"].append(other_stream.timestamps.timestamps[i]) evnt = other_stream.events[i] for items in map(EventStream.__get_items_raw, [evnt.string_payload, evnt.numeric_payload, evnt.boolean_payload, evnt.byte_payload]): for c, st in items: tbl[c].append(st) self._data = pa.concat_tables([self._data, pa.Table.from_pydict(tbl)]) def append(self, other_stream: "EventStream"): """ add another EventStream onto the calling one if they have the same name :param other_stream: other stream to add to current """ if other_stream.name == self.name: self._data = pa.concat_tables([self._data, other_stream._data]) self.timestamps_metadata = {**self.timestamps_metadata, **other_stream.timestamps_metadata} self.metadata = {**self.metadata, **other_stream.metadata} self._errors.extend_error(other_stream.errors()) def timestamps(self) -> np.array: """ :return: the timestamps as a numpy array; returns empty array if no timestamps exist """ if "timestamps" in self.data().schema.names: return self.data()["timestamps"].to_numpy() else: return np.array([]) def unaltered_timestamps(self) -> np.array: """ :return: the unaltered timestamps as a numpy array; returns empty array if no timestamps exist """ if "unaltered_timestamps" in self.data().schema.names: return self.data()["unaltered_timestamps"].to_numpy() else: return np.array([]) def update_timestamps(self, offset_model: om.OffsetModel, use_model_function: bool = False): """ updates the timestamps of the data points :param offset_model: model used to update the timestamps :param use_model_function: if True, use the model's slope function to update the timestamps. otherwise uses the best offset (model's intercept value). Default False """ if self._data is not None and self._data.num_rows > 0: timestamps = pa.array(offset_model.update_timestamps(self._data["timestamps"].to_numpy(), use_model_function)) self._data = self._data.set_column(0, "timestamps", timestamps) def default_json_file_name(self) -> str: """ :return: default event stream json file name (event_[event.name]): note there is no extension """ return f"event_{self.name}" def is_save_to_disk(self) -> bool: """ :return: True if sensor will be saved to disk """ return self._fs_writer.is_save_disk() def set_save_to_disk(self, save: bool): """ :param save: If True, save to disk """ self._fs_writer.save_to_disk = save def set_save_mode(self, save_mode: FileSystemSaveMode): """ set the save mode :param save_mode: new save mode """ self._fs_writer.set_save_mode(save_mode) def set_file_name(self, new_file: Optional[str] = None): """ * set the pyarrow file name or use the default: event_{EventStream.name} * Do not give an extension :param new_file: optional file name to change to; default None (use default name) """ self._fs_writer.file_name = new_file if new_file else f"event_{self.name}" def full_file_name(self) -> str: """ :return: full name of parquet file containing the data """ return self._fs_writer.full_name() def file_name(self) -> str: """ :return: file name without extension """ return self._fs_writer.file_name def set_save_dir(self, new_dir: Optional[str] = None): """ set the pyarrow directory or use the default: "." 
(current directory) :param new_dir: the directory to change to; default None (use current directory) """ self._fs_writer.base_dir = new_dir if new_dir else "." def save_dir(self) -> str: """ :return: directory containing parquet files for the sensor """ return self._fs_writer.save_dir() def full_path(self) -> str: """ :return: the full path to the data file """ return self._fs_writer.full_path() def fs_writer(self) -> Fsw: """ :return: FileSystemWriter object """ return self._fs_writer def write_table(self): """ writes the event stream data to disk. """ if self._data is not None: pq.write_table(self._data, self.full_path()) def has_data(self) -> bool: """ :return: True if EventStream contains at least one data point """ return self.data().num_rows > 0 def data(self) -> pa.Table: """ :return: the data as a pyarrow table """ if self._data is None: if self.is_save_to_disk(): self._data = pq.read_table(self.full_path()) else: return pa.Table.from_pydict({}) return self._data @staticmethod def from_json_file(file_dir: str, file_name: str) -> "EventStream": """ :param file_dir: full path to containing directory for the file :param file_name: name of file and extension to load data from :return: EventStream from json file """ if file_name is None: file_name = io.get_json_file(file_dir) if file_name is None: result = EventStream("Empty") result.append_error("JSON file to load EventStream from not found.") return result json_data = io.json_file_to_dict(os.path.join(file_dir, f"{file_name}.json")) if "name" in json_data.keys(): result = EventStream(json_data["name"], json_data["schema"], FileSystemSaveMode.DISK, file_dir) result.metadata = json_data["metadata"] result.timestamps_metadata = json_data["timestamps_metadata"] result.set_errors(RedVoxExceptions.from_dict(json_data["errors"])) result.read_from_dir(json_data["file_path"]) else: result = EventStream("Empty") result.append_error(f"Loading from {file_name} failed; missing EventStream name.") return result def to_json_file(self, file_name: Optional[str] = None) -> Path: """ saves the EventStream as a json file :param file_name: the optional base file name. Do not include a file extension. If None, a default file name is created using this format: event_[event.name].json :return: path to json file """ if self._fs_writer.file_extension == "parquet" and self._data is not None: self.write_table() return io.to_json_file(self, file_name) def errors(self) -> RedVoxExceptions: """ :return: errors of the sensor """ return self._errors def set_errors(self, errors: RedVoxExceptions): """ sets the errors of the Sensor :param errors: errors to set """ self._errors = errors def append_error(self, error: str): """ add an error to the Sensor :param error: error to add """ self._errors.append(error) def print_errors(self): """ print all errors to screen """ self._errors.print()
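# A small illustrative sketch (not part of the SDK) of the schema helpers defined above.
# The column names ("label", "magnitude", "machine_time") are placeholders; an EventStream
# normally gets its schema and data from read_events()/read_raw() rather than by hand.
def _example_event_stream_schema() -> None:
    stream = EventStream(
        name="example",
        schema={"string": ["label"], "numeric": ["magnitude"], "boolean": [], "byte": []},
    )
    # columns can also be registered one at a time under one of the four type keys
    stream.add_to_schema("numeric", "machine_time")
    print(stream.get_numeric_schema())      # ["magnitude", "machine_time"]
    print(stream.get_string_schema())       # ["label"]
    # nothing has been read yet, so there are no rows in the backing pyarrow table
    print(stream.has_data())                # False
    # default file name used when the stream is written to disk (no extension)
    print(stream.default_json_file_name())  # "event_example"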