def _func(time, ind, df):
    """Return waveforms from df of bulk parameters."""
    # NOTE: written as a nested helper; `self` is captured from the
    # enclosing method's scope.
    match_chars = {"*", "?", "[", "]"}
    t1, t2 = time[0], time[1]
    # filter index based on start/end times
    in_time = ~((ind["starttime"] > t2) | (ind["endtime"] < t1))
    ind = ind[in_time]
    # create indices used to load data
    ar = np.ones(len(ind))  # indices of ind to use to load data
    df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
    # determine which columns use any matching or other select features
    uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = df[match_ar]
    df_no_match = df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(ind, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(ind)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    return self._index2stream(ind[ar], t1, t2)

def _func(time, ind, df, st):
    """Return waveforms from df of bulk parameters."""
    match_chars = {"*", "?", "[", "]"}
    ar = np.ones(len(ind))  # indices of ind to use to load data
    _t1, _t2 = time[0], time[1]
    df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
    # determine which columns use any matching or other select features
    uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = df[match_ar]
    df_no_match = df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(ind, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(ind)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    # get a list of used traces, combine and trim
    st = obspy.Stream([x for x, y in zip(st, ar) if y])
    return st.slice(starttime=to_utc(_t1), endtime=to_utc(_t2))

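# The two `_func` variants above are written to be called once per unique
# (starttime, endtime) pair in a bulk request; each filters the bulk df down
# to its own time pair internally. A minimal, self-contained sketch of that
# dispatch pattern follows; the driver and its names are illustrative
# assumptions, not obsplus code.
def _apply_per_time_pair(func, index_df, bulk_df, *args):
    """Call func once for each unique (t1, t2) pair in bulk_df."""
    results = []
    for (t1, t2), _ in bulk_df.groupby(["t1", "t2"]):
        results.append(func((t1, t2), index_df, bulk_df, *args))
    return results
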
def get_waveforms(
    self,
    network=None,
    station=None,
    location=None,
    channel=None,
    starttime=None,
    endtime=None,
) -> obspy.Stream:
    """
    Get waveforms from the cache, read from disk and cache if needed.

    See obsplus.WaveBank.get_waveforms for parameter descriptions.
    """
    filt = filter_index(
        self.index, network, station, location, channel, starttime, endtime
    )
    ser = self.index[filt].set_index("unique_key")["st_call"]
    # drop duplicates
    ser = ser[~ser.index.duplicated()]
    # no waveforms found, return empty stream
    if not len(ser):
        return obspy.Stream()
    st = reduce(add, (x() for x in ser))
    if starttime is not None or endtime is not None:
        # use given start/endtime or fall back to far-out constants,
        # converting to UTCDateTime so trim is absolute rather than relative
        starttime = starttime or 0
        endtime = endtime or 32503680000
        return st.trim(starttime=to_utc(starttime), endtime=to_utc(endtime))
    else:
        return st

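# Hypothetical usage of the public API this cache method backs (the docstring
# points to obsplus.WaveBank.get_waveforms); the bank path and times below
# are assumptions for illustration.
import obspy
import obsplus

bank = obsplus.WaveBank("path/to/waveform/files")  # assumed local archive
st = bank.get_waveforms(
    network="UU", station="*", location="*", channel="HH?",
    starttime=obspy.UTCDateTime("2009-01-01"),
    endtime=obspy.UTCDateTime("2009-01-01") + 3600,
)
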
def _get_bulk_args(
    self, starttime=None, endtime=None, **kwargs
) -> bulk_waveform_arg_type:
    """
    Get the bulk waveform arguments based on given start/end times.

    This method also takes into account data availability as contained
    in the stations data.

    Parameters
    ----------
    starttime
        Start times for query.
    endtime
        End times for query.

    Returns
    -------
    List of tuples of the form:
        [(network, station, location, channel, starttime, endtime)]
    """
    station_df = self.station_df.copy()
    inv = station_df[filter_index(station_df, **kwargs)]
    # replace None/NaN end dates with a large datetime
    inv.loc[inv["end_date"].isnull(), "end_date"] = LARGEDT64
    inv["end_date"] = inv["end_date"].astype("datetime64[ns]")
    # get start/end of the inventory
    inv_start = inv["start_date"].min()
    inv_end = inv["end_date"].max()
    # remove stations/channels that don't have data for the requested time
    min_time = to_datetime64(starttime, default=inv_start).min()
    max_time = to_datetime64(endtime, default=inv_end).max()
    con1 = inv["start_date"] > max_time
    con2 = inv["end_date"] < min_time
    df = inv[~(con1 | con2)].set_index("seed_id")[list(NSLC)]
    if df.empty:  # return empty list if no data found
        return []
    if isinstance(starttime, pd.Series):
        # Have to get clever here to make sure only active stations get
        # used and indices are not duplicated.
        new_start = starttime.loc[set(starttime.index).intersection(df.index)]
        new_end = endtime.loc[set(endtime.index).intersection(df.index)]
        df["starttime"] = new_start.loc[~new_start.index.duplicated()]
        df["endtime"] = new_end.loc[~new_end.index.duplicated()]
    else:
        df["starttime"] = starttime
        df["endtime"] = endtime
    # remove any rows that don't have defined start/end times
    out = df[~(df["starttime"].isnull() | df["endtime"].isnull())]
    # ensure we have UTCDateTime objects
    out["starttime"] = [to_utc(x) for x in out["starttime"]]
    out["endtime"] = [to_utc(x) for x in out["endtime"]]
    # convert to a list of tuples and return
    return [tuple(x) for x in out.to_records(index=False)]

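# Self-contained illustration of the availability filter used above: channels
# whose [start_date, end_date] window does not overlap the requested window
# are dropped. Values are made up for the example; only pandas is required.
import pandas as pd

_inv = pd.DataFrame({
    "seed_id": ["UU.SRU..HHZ", "UU.NOQ..HHZ"],
    "start_date": pd.to_datetime(["2008-01-01", "2012-01-01"]),
    "end_date": pd.to_datetime(["2010-01-01", "2014-01-01"]),
})
_t1, _t2 = pd.Timestamp("2009-06-01"), pd.Timestamp("2009-06-02")
# keep rows unless they start after the window or end before it
_active = _inv[~((_inv["start_date"] > _t2) | (_inv["end_date"] < _t1))]
assert list(_active["seed_id"]) == ["UU.SRU..HHZ"]  # only the active channel
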
def test_filter_index(self, crandall_dataset):
    """Tests for filtering index with the filter_index function."""
    # This is mainly here to test the time filtering, because the bank
    # operations pass this off to the HDF5 kernel.
    index = crandall_dataset.waveform_client.read_index(network="UU")
    t1_ns = int(index["starttime"].astype(np.int64).mean())
    t1 = np.datetime64(t1_ns, "ns")
    t2 = index["endtime"].max()
    kwargs = dict(network="UU", station="*", location="*", channel="*")
    bool_ind = filter_index(index, starttime=t1, endtime=t2, **kwargs)
    # at least one row should fall within the requested time range
    assert bool_ind.any()

def stream_bulk_split(st: Stream, bulk: List[waveform_request_type],
                      fill_value: Any = None) -> List[Stream]:
    """
    Split a stream into a list of streams that meet requirements in bulk.

    This is similar to the get_waveforms_bulk methods of waveform_client,
    but rather than merging overlapping data into a single stream, a
    separate stream is returned for each request in bulk.

    Parameters
    ----------
    st
        A stream object.
    bulk
        A bulk request. Wildcards not currently supported on str params.
    fill_value
        If not None, fill any missing data in the time range with this value.

    Returns
    -------
    List of streams, each meeting the corresponding request in bulk.
    """
    # return nothing if empty bulk or stream args
    bulk = _get_bulk(bulk)
    if not bulk or len(st) == 0:
        return []
    # get dataframe of stream contents
    sdf = _stream_data_to_df(st)
    # iterate stream, return output
    out = []
    for barg in bulk:
        assert len(barg) == 6, f"{barg} is not a valid bulk arg, must have len 6"
        need = filter_index(sdf, *barg)
        traces = [tr for tr, bo in zip(st, need) if bo]
        new_st = obspy.Stream(traces)
        t1, t2 = to_utc(barg[-2]), to_utc(barg[-1])
        new = new_st.slice(starttime=t1, endtime=t2)
        # apply fill if needed
        if fill_value is not None:
            new = new.trim(starttime=t1, endtime=t2, fill_value=fill_value, pad=True)
        if new is None or not len(new):
            out.append(obspy.Stream())
            continue
        new = merge_traces(new)
        out.append(new)
    assert len(out) == len(bulk), "output is not the same length as the bulk request"
    return out

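# Hypothetical usage sketch with obspy's bundled example stream; the import
# path of stream_bulk_split is an assumption.
import obspy
from obsplus.utils import stream_bulk_split  # assumed import location

_st = obspy.read()  # three BW.RJOB..EH? traces starting 2009-08-24T00:20:03
_wt1 = obspy.UTCDateTime("2009-08-24T00:20:05")
_wt2 = _wt1 + 5
_bulk = [
    ("BW", "RJOB", "", "EHZ", _wt1, _wt2),
    ("BW", "RJOB", "", "EHN", _wt1, _wt2),
]
_streams = stream_bulk_split(_st, _bulk)
assert len(_streams) == len(_bulk)  # one stream per bulk request
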
def _filter_index_to_bulk(time, index_df, bulk_df) -> pd.DataFrame:
    """
    Apply the conditions in bulk_df to index_df.

    Returns the rows of index_df which meet the requested conditions
    for the given time pair.

    Parameters
    ----------
    time
        A tuple of (mintime, maxtime).
    index_df
        A dataframe indexing a waveform resource. Can be an index of
        traces in a stream or an index from a wavebank.
    bulk_df
        The dataframe containing bulk requests.
    """
    match_chars = {"*", "?", "[", "]"}
    # filter out any index times not in current time pair
    too_late = index_df["starttime"] > time[1]
    too_early = index_df["endtime"] < time[0]
    index_df = index_df[~(too_early | too_late)]
    ar = np.ones(len(index_df))  # indices of index_df to use to load data
    # filter out any request times which are not for the current time pair
    is_starttime = bulk_df["starttime"] == time[0]
    is_endtime = bulk_df["endtime"] == time[1]
    bulk_df = bulk_df[is_starttime & is_endtime]
    # determine which columns use matching; these must be handled separately
    uses_matches = [_column_contains(bulk_df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = bulk_df[match_ar]
    df_no_match = bulk_df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(index_df, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(index_df)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    # return the subset of the index meeting the bulk requirements
    return index_df[ar]

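# A plausible implementation of the _column_contains helper used in the
# matching logic above; this is an assumption for illustration, not
# necessarily the library's actual version.
import re
import numpy as np
import pandas as pd

def _column_contains(ser: pd.Series, chars: set) -> np.ndarray:
    """Return a bool array, True where the string contains any char in chars."""
    regex = "|".join(re.escape(x) for x in chars)
    return ser.astype(str).str.contains(regex).values
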
def read_index(
    self,
    network: Optional[str] = None,
    station: Optional[str] = None,
    location: Optional[str] = None,
    channel: Optional[str] = None,
    starttime: Optional[utc_time_type] = None,
    endtime: Optional[utc_time_type] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Return a dataframe of the index, optionally applying filters.

    Parameters
    ----------
    {waveform_params}
    kwargs
        kwargs are passed to the pandas.read_hdf function.
    """
    self.ensure_bank_path_exists()
    if starttime is not None and endtime is not None:
        if starttime > endtime:
            msg = "starttime cannot be greater than endtime."
            raise ValueError(msg)
    if not self.index_path.exists():
        self.update_index()
    # if no file was created (dealing with an empty bank) return empty index
    if not self.index_path.exists():
        return pd.DataFrame(columns=self.index_columns)
    # grab index from cache
    index = self._index_cache(starttime, endtime, buffer=self.buffer, **kwargs)
    # filter and return
    filt = filter_index(
        index, network=network, station=station, location=location, channel=channel
    )
    return index[filt]
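
# Hypothetical usage sketch of read_index on a WaveBank; the bank path is an
# assumption, and the printed columns assume the standard wavebank index.
import obsplus

_bank = obsplus.WaveBank("path/to/waveform/files")
_index = _bank.read_index(network="UU", channel="HH?")
print(_index[["network", "station", "channel", "starttime", "endtime"]].head())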