def get_waveforms(
    self,
    network=None,
    station=None,
    location=None,
    channel=None,
    starttime=None,
    endtime=None,
) -> obspy.Stream:
    """
    Get waveforms from the cache, read from disk and cache if needed.

    See obplus.WaveBank.get_waveforms for param descriptions.
    """
    # select matching index rows; one stream-loading callable per unique key
    mask = filter_index(
        self.index, network, station, location, channel, starttime, endtime
    )
    loaders = self.index[mask].set_index("unique_key")["st_call"]
    # drop duplicates
    loaders = loaders[~loaders.index.duplicated()]
    # no waveforms found, return empty waveforms
    if not len(loaders):
        return obspy.Stream()
    # call each loader and concatenate the resulting streams
    stream = reduce(add, (load() for load in loaders))
    if starttime is None and endtime is None:
        return stream
    # use start/endtime or fall back to far-out constants for trimming
    t1 = starttime or 0
    t2 = endtime or 32503680000
    return stream.trim(starttime=t1, endtime=t2)
def _func(time, ind, df):
    """
    Return waveforms from df of bulk parameters.

    Parameters
    ----------
    time
        Indexable pair of (starttime, endtime) for this chunk.
    ind
        Index dataframe of available traces (with starttime/endtime columns).
    df
        Dataframe of bulk requests (columns t1, t2 plus the NSLC codes).
    """
    match_chars = {"*", "?", "[", "]"}
    t1, t2 = time[0], time[1]
    # filter index based on start/end times
    in_time = ~((ind["starttime"] > t2) | (ind["endtime"] < t1))
    ind = ind[in_time]
    # indices of ind to use to load data; dtype=bool so this stays a valid
    # boolean mask even if neither branch below narrows it (float ones would
    # not be usable as a mask in ind[ar])
    ar = np.ones(len(ind), dtype=bool)
    # restrict bulk requests to this time chunk
    df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
    # determine which columns use any matching or other select features
    uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = df[match_ar]
    df_no_match = df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(ind, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches (fast seed-id membership test)
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(ind)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    return self._index2stream(ind[ar], t1, t2)
def _func(time, ind, df, st):
    """ return waveforms from df of bulk parameters """
    wildcard_chars = {"*", "?", "[", "]"}
    # marks which rows of ind (and traces of st) should be kept
    keep = np.ones(len(ind))
    t1, t2 = time[0], time[1]
    # only consider bulk rows belonging to this time chunk
    df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
    # figure out which requests contain wildcard/select characters
    has_wildcards = np.array(
        [_column_contains(df[col], wildcard_chars) for col in NSLC]
    ).any(axis=0)
    wild_df = df[has_wildcards]
    plain_df = df[~has_wildcards]
    # wildcard requests go through filter_index (more expensive)
    if not wild_df.empty:
        records = wild_df.to_records(index=False)
        hits = np.array([filter_index(ind, *tuple(rec)[:4]) for rec in records])
        keep = np.logical_and(keep, hits.any(axis=0))
    # plain requests use a simple seed-id membership check
    if not plain_df.empty:
        wanted = set(get_nslc_series(plain_df))
        keep = np.logical_and(keep, get_nslc_series(ind).isin(wanted))
    # assemble the selected traces, then trim to the requested window
    selected = obspy.Stream([tr for tr, use in zip(st, keep) if use])
    return selected.slice(starttime=UTC(t1), endtime=UTC(t2))
def test_filter_index(self, crandall_dataset):
    """ Tests for filtering index with filter index function. """
    # this is mainly here to test the time filtering, because the bank
    # operations pass this off to the HDF5 kernel.
    index = crandall_dataset.waveform_client.read_index(network="UU")
    t1 = index.starttime.mean()
    t2 = index.endtime.max()
    kwargs = dict(network="UU", station="*", location="*", channel="*")
    bool_ind = filter_index(index, starttime=t1, endtime=t2, **kwargs)
    # the original (~np.logical_not(x)).any() was a double negation equal to
    # x.any(); assert directly that at least one row passed the filter
    assert np.any(bool_ind)
def stream_bulk_split(st: Stream, bulk: List[waveform_request_type]) -> List[Stream]:
    """
    Split a stream into a list of streams that meet requirements in bulk.

    This is similar to the get_waveforms_bulk methods of waveform_client, but
    rather than merging any overlapping data it is returned in a list of traces.

    Parameters
    ----------
    st
        A stream object
    bulk
        A bulk request. Wildcards not currently supported on str params.

    Returns
    -------
    List of traces, each meeting the corresponding request in bulk.
    """
    # short-circuit on an empty request or an empty stream
    bulk = _get_bulk(bulk)
    if not bulk or len(st) == 0:
        return []
    # build a dataframe describing the stream contents
    sdf = _stream_data_to_df(st)
    out = []
    for barg in bulk:
        assert len(barg) == 6, f"{barg} is not a valid bulk arg, must have len 6"
        # pick out the traces matching this request
        matches = filter_index(sdf, *barg)
        sub = obspy.Stream([tr for tr, hit in zip(st, matches) if hit])
        # trim to the requested window
        start, stop = to_utc(barg[-2]), to_utc(barg[-1])
        sliced = sub.slice(starttime=start, endtime=stop)
        if sliced is None or not len(sliced):
            out.append(obspy.Stream())
        else:
            out.append(merge_traces(sliced))
    assert len(out) == len(bulk), "output is not the same len as stream list"
    return out
def read_index(
    self,
    network: Optional[str] = None,
    station: Optional[str] = None,
    location: Optional[str] = None,
    channel: Optional[str] = None,
    starttime: Optional[utc_time_type] = None,
    endtime: Optional[utc_time_type] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Return a dataframe of the index, optionally applying filters.

    Parameters
    ----------
    {waveform_params}
    kwargs
        kwargs are passed to pandas.read_hdf function

    Raises
    ------
    ValueError
        If both times are given and starttime is after endtime.
    """
    self.ensure_bank_path_exists()
    if starttime is not None and endtime is not None and starttime > endtime:
        # original message was an f-string with no placeholders; include
        # the offending values to make the error actionable
        msg = f"starttime ({starttime}) cannot be greater than endtime ({endtime})"
        raise ValueError(msg)
    if not os.path.exists(self.index_path):
        self.update_index()
    # if no file was created (dealing with empty bank) return empty index
    if not os.path.exists(self.index_path):
        return pd.DataFrame(columns=self.index_columns)
    # grab index from cache
    index = self._index_cache(starttime, endtime, buffer=self.buffer, **kwargs)
    # filter and return
    filt = filter_index(
        index,
        network=network,
        station=station,
        location=location,
        channel=channel,
    )
    return index[filt]
def _get_bulk_arg(self, starttime=None, endtime=None, **kwargs) -> list:
    """ get the argument passed to get_waveforms_bulk, see obspy.fdsn.client
    for more info """
    station_df = self.station_df.copy()
    inv = station_df[filter_index(station_df, **kwargs)]
    # replace None/Nan end dates with a far-future constant
    inv.loc[inv["end_date"].isnull(), "end_date"] = LARGEDT64
    inv["end_date"] = inv["end_date"].astype("datetime64[ns]")
    # remove station/channels that dont have data for requested time
    starttime = to_datetime64(starttime, default=inv["start_date"].min())
    endtime = to_datetime64(endtime, default=inv["end_date"].max())
    con1, con2 = (inv["start_date"] > endtime), (inv["end_date"] < starttime)
    inv = inv[~(con1 | con2)]
    # copy() so the time-column assignments below operate on an owned frame
    # rather than a view of inv (avoids SettingWithCopyWarning / lost writes)
    df = inv[list(NSLC)].copy()
    if df.empty:  # return empty list if no data found
        return []
    df.loc[:, "starttime"] = starttime
    df.loc[:, "endtime"] = endtime
    # remove any rows that don't have defined start/end times
    out = df[(~df["starttime"].isnull()) & (~df["endtime"].isnull())]
    # convert to list of tuples and return
    return [tuple(x) for x in out.to_records(index=False)]
def _get_bulk_arg(self, starttime=None, endtime=None, **kwargs) -> list:
    """ get the argument passed to get_waveforms_bulk, see obspy.fdsn.client
    for more info """
    station_df = self.station_df.copy()
    inv = station_df[filter_index(station_df, **kwargs)]
    # replace None/Nan with larger number
    null_inds = inv["end_date"].isnull()
    inv.loc[null_inds, "end_date"] = far_out_time
    # remove station/channels that dont have data for requested time
    starttime = starttime if starttime is not None else inv.start_date.min()
    endtime = endtime if endtime is not None else inv.end_date.max()
    con1, con2 = (inv["start_date"] > endtime), (inv["end_date"] < starttime)
    inv = inv[~(con1 | con2)]
    # copy() so the time-column assignments below operate on an owned frame
    # rather than a view of inv (avoids SettingWithCopyWarning / lost writes)
    df = inv[list(NSLC)].copy()
    if df.empty:  # return empty list if no data found
        return []
    df.loc[:, "starttime"] = starttime
    df.loc[:, "endtime"] = endtime
    # remove any rows that don't have defined start/end times
    df = df[(~df.starttime.isnull()) & (~df.endtime.isnull())]
    # convert to list of tuples and return
    return [tuple(x) for x in df.to_records(index=False)]