def _func(time, ind, df, st):
    """Return waveforms from df of bulk parameters."""
    match_chars = {"*", "?", "[", "]"}
    ar = np.ones(len(ind))  # indices of ind to use to load data
    _t1, _t2 = time[0], time[1]
    df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
    # determine which columns use any matching or other select features
    uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = df[match_ar]
    df_no_match = df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(ind, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(ind)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    # get a list of used traces, combine and trim
    st = obspy.Stream([x for x, y in zip(st, ar) if y])
    return st.slice(starttime=to_utc(_t1), endtime=to_utc(_t2))
def _func(time, ind, df):
    """Return waveforms from df of bulk parameters."""
    # note: ``self`` below is expected to come from the enclosing method's scope
    match_chars = {"*", "?", "[", "]"}
    t1, t2 = time[0], time[1]
    # filter index based on start/end times
    in_time = ~((ind["starttime"] > t2) | (ind["endtime"] < t1))
    ind = ind[in_time]
    ar = np.ones(len(ind))  # indices of ind to use to load data
    df = df[(df.t1 == time[0]) & (df.t2 == time[1])]
    # determine which columns use any matching or other select features
    uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = df[match_ar]
    df_no_match = df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(ind, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(ind)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    return self._index2stream(ind[ar], t1, t2)
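# Both variants of _func above route wildcard requests through a
# _column_contains helper. The following is a minimal sketch of what such a
# helper might look like (an assumption; the real obsplus implementation may
# differ): it reports, per row, whether a string column contains any of the
# given characters, so rows with "*", "?", "[", or "]" can be sent down the
# more expensive filter_index path.
import re

import pandas as pd


def _column_contains_sketch(ser: pd.Series, chars) -> pd.Series:
    """Return a boolean series: does each value contain any char in chars?"""
    pattern = "|".join(re.escape(c) for c in chars)
    return ser.astype(str).str.contains(pattern)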
def test_dataframe_missing_columns(self, pick_df):
    """Dataframe without required columns should raise ValueError."""
    new = pick_df.drop(columns=["network", "location"])
    with pytest.raises(ValueError):
        upd.get_seed_id_series(new)
    # but it should work if only the required subset is there
    out = upd.get_seed_id_series(new, subset=["station", "channel"])
    assert len(out) == len(pick_df)
    split = out.str.split(".", expand=True)
    assert (split[0] == pick_df["station"]).all()
    assert (split[1] == pick_df["channel"]).all()
def set_stations(self, stations: fetcher_station_type):
    """
    Set the station state in fetcher.

    Parameters
    ----------
    stations
        Data representing stations, from which a client or dataframe
        can be inferred.
    """
    try:
        self.station_client = get_station_client(stations)
    except TypeError:
        self.station_client = getattr(self, "station_client", None)
    try:
        # since it's common for inventories to have far-out enddates this
        # can raise a warning; these are safe to ignore.
        with suppress_warnings(category=TimeOverflowWarning):
            self.station_df = stations_to_df(stations)
    except TypeError:
        # if unable to get station info from stations use waveform client
        try:
            self.station_df = stations_to_df(self.waveform_client)
        except TypeError:
            # if no waveforms try events
            try:
                self.station_df = stations_to_df(self.event_client)
            except TypeError:
                self.station_df = None
    # make sure seed_id is set
    if self.station_df is not None:
        self.station_df["seed_id"] = get_seed_id_series(self.station_df)
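# The core of the fallback chain above is stations_to_df followed by
# get_seed_id_series. A standalone sketch using obspy's bundled example
# inventory; the get_seed_id_series import path is an assumption and may
# differ between obsplus versions.
import obspy
from obsplus import stations_to_df
from obsplus.utils.pd import get_seed_id_series  # import path is an assumption

station_df = stations_to_df(obspy.read_inventory())
station_df["seed_id"] = get_seed_id_series(station_df)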
def _index2stream(self, index, starttime=None, endtime=None, merge=True) -> Stream:
    """Return the waveforms in the index."""
    # get the absolute path to each data file
    files: pd.Series = (str(self.bank_path) + index.path).unique()
    # make sure start and endtimes are UTCDateTime
    starttime = to_utc(starttime) if starttime else None
    endtime = to_utc(endtime) if endtime else None
    # iterate the files to read and try to load into waveforms
    kwargs = dict(format=self.format, starttime=starttime, endtime=endtime)
    func = partial(_try_read_stream, **kwargs)
    stt = obspy.Stream()
    chunksize = (len(files) // self._max_workers) or 1
    for st in self._map(func, files, chunksize=chunksize):
        if st is not None:
            stt += st
    # sort out nullish nslc codes
    stt = replace_null_nlsc_codes(stt)
    # filter out any traces not in the index (this can happen when files
    # hold multiple traces)
    nslc = set(get_seed_id_series(index))
    stt.traces = [x for x in stt if x.id in nslc]
    # trim, merge, attach response
    stt = self._prep_output_stream(stt, starttime, endtime, merge=merge)
    return stt
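# _index2stream reads each file through _try_read_stream (bound with partial)
# and skips anything that comes back as None. A minimal sketch of such a
# reader, assuming it simply swallows read errors; the real helper may log
# or handle specific exceptions instead.
import obspy


def _try_read_stream_sketch(path, **kwargs):
    """Read a waveform file with obspy.read; return None on failure."""
    try:
        return obspy.read(path, **kwargs)
    except Exception:
        return None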
def archive_to_sds(
    bank: Union[Path, str, "obsplus.WaveBank"],
    sds_path: Union[Path, str],
    starttime: Optional[UTCDateTime] = None,
    endtime: Optional[UTCDateTime] = None,
    overlap: float = 30,
    type_code: str = "D",
    stream_processor: Optional[callable] = None,
):
    """
    Create a seiscomp data structure archive from a waveform source.

    Parameters
    ----------
    bank
        A wavebank or path to such.
    sds_path
        The path for the new sds archive to be created.
    starttime
        If not None, the starttime to convert data from bank.
    endtime
        If not None, the endtime to convert data from bank.
    overlap
        The overlap to use for each file.
    type_code
        The str indicating the datatype.
    stream_processor
        A callable that will take a single stream as input and return a
        single stream. May return an empty stream to skip a stream.

    Notes
    -----
    See: https://www.seiscomp3.org/doc/applications/slarchive/SDS.html
    """
    sds_path = Path(sds_path)
    # create a bank object for yielding continuous waveforms
    bank = obsplus.WaveBank(bank)
    bank.update_index()
    # get starttime/endtimes
    index = bank.read_index()
    ts1 = index.starttime.min() if not starttime else starttime
    t1 = _nearest_day(ts1)
    t2 = to_utc(index.endtime.max() if not endtime else endtime)
    nslcs = get_seed_id_series(index).unique()
    # iterate over nslc and get data for selected channel
    for nslc in nslcs:
        nslc_dict = {n: v for n, v in zip(NSLC, nslc.split("."))}
        # yield waveforms in desired chunks
        ykwargs = dict(starttime=t1, endtime=t2, overlap=overlap, duration=86400)
        ykwargs.update(nslc_dict)
        for st in bank.yield_waveforms(**ykwargs):
            if stream_processor:  # apply stream processor if needed
                st = stream_processor(st)
            if st:
                path = _get_sds_filename(st, sds_path, type_code, **nslc_dict)
                st.write(str(path), "mseed")
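# Hedged usage sketch for archive_to_sds: convert one month of an existing
# wavebank to an SDS archive. The bank path below is hypothetical.
import obspy

archive_to_sds(
    "my_wavebank",   # hypothetical path to an existing wavebank directory
    "sds_archive",   # destination for the new SDS structure
    starttime=obspy.UTCDateTime("2019-01-01"),
    endtime=obspy.UTCDateTime("2019-02-01"),
)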
def make_origins(
    events: catalog_or_event,
    inventory: obspy.Inventory,
    depth: float = 1.0,
    phase_hints: Optional[Iterable] = ("P", "p"),
) -> catalog_or_event:
    """
    Iterate a catalog or single event and ensure each event has an origin.

    If no origins are found for an event, create one with the time set to
    the earliest pick and the location set to the location of the first hit
    station. Events are modified in place.

    This may be useful for location codes that need a starting location.

    Parameters
    ----------
    events
        The events to scan and add origins to where necessary.
    inventory
        An inventory object which contains all the stations referenced in
        quakeml elements of events.
    depth
        The default depth for created origins. Should be in meters. See the
        obspy docs for Origin or the quakeml standard for more details.
    phase_hints
        List of acceptable phase hints to use for identifying the earliest
        pick. By default will only search for "P" or "p" phase hints.

    Returns
    -------
    Either a Catalog or Event object (same as input).
    """
    # ensure input is an iterable of events
    cat = [events] if isinstance(events, Event) else events
    # load inv dataframe and make sure it has a seed_id column
    df = obsplus.stations_to_df(inventory)
    nslc_series = get_seed_id_series(df)
    for event in cat:
        if not event.origins:  # make new origin
            picks = event.picks_to_df()
            picks = picks.loc[
                (~(picks["evaluation_status"] == "rejected"))
                & (picks["phase_hint"].isin(phase_hints))
            ]
            if not len(picks):
                msg = f"{event} has no acceptable picks to create origin"
                raise ValidationError(msg)
            # get first pick, determine time/station used
            first_pick = picks.loc[picks["time"].idxmin()]
            seed_id = first_pick["seed_id"]
            # find channel corresponding to pick
            df_chan = df[nslc_series == seed_id]
            if not len(df_chan):
                raise ValidationError(f"{seed_id} not found in inventory")
            ser = df_chan.iloc[0]
            # create origin
            ori = _create_first_pick_origin(first_pick, ser, depth=depth)
            event.origins.append(ori)
    return events
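# Hedged usage sketch for make_origins, using obspy's bundled example
# catalog and inventory. In real use the catalog's picks must reference
# stations present in the inventory; the example catalog already has
# origins, so this call would be a no-op.
import obspy

cat = obspy.read_events()
inv = obspy.read_inventory()
cat = make_origins(cat, inv, depth=0.0)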
def _filter_index_to_bulk(time, index_df, bulk_df) -> pd.DataFrame:
    """
    Apply the conditions in bulk_df to index_df and return the rows of
    index_df which meet the requested conditions.

    Parameters
    ----------
    time
        A tuple of (mintime, maxtime).
    index_df
        A dataframe indexing a waveform resource. Can be an index of traces
        in a stream or an index from a wavebank.
    bulk_df
        The dataframe containing bulk requests.
    """
    match_chars = {"*", "?", "[", "]"}
    # filter out any index times not in current time pair
    too_late = index_df["starttime"] > time[1]
    too_early = index_df["endtime"] < time[0]
    index_df = index_df[~(too_early | too_late)]
    ar = np.ones(len(index_df))  # indices of index_df to use to load data
    # filter out any request times which are not for the current time pair
    is_starttime = bulk_df["starttime"] == time[0]
    is_endtime = bulk_df["endtime"] == time[1]
    bulk_df = bulk_df[is_starttime & is_endtime]
    # determine which columns use matching; these must be handled separately
    uses_matches = [_column_contains(bulk_df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = bulk_df[match_ar]
    df_no_match = bulk_df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(index_df, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(index_df)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    # return the subset of the index meeting the bulk conditions
    return index_df[ar]
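# A toy demonstration of _filter_index_to_bulk. The column layout (NSLC
# columns plus starttime/endtime) is assumed to match the wavebank index,
# times are plain floats for simplicity, and the helpers it calls
# (_column_contains, filter_index) are assumed to be importable and behave
# as described above.
import pandas as pd

cols = ["network", "station", "location", "channel", "starttime", "endtime"]
index_df = pd.DataFrame(
    [
        ("UU", "TMU", "", "HHZ", 0.0, 100.0),
        ("UU", "CTU", "", "HHZ", 0.0, 100.0),
    ],
    columns=cols,
)
bulk_df = pd.DataFrame([("UU", "TMU", "", "HH?", 10.0, 20.0)], columns=cols)
out = _filter_index_to_bulk((10.0, 20.0), index_df, bulk_df)
# out should contain only the TMU row, matched via the "HH?" wildcard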
def _get_waveform_df(stream: wave_type) -> pd.DataFrame:
    """
    Convert a stream or sequence of traces into a dataframe.

    Parameters
    ----------
    stream
        The streams to index.

    Notes
    -----
    This is private because it is probably not quite polished enough to
    include in the public API. More thought is needed on how to do this
    properly.
    """
    stats_columns = list(NSLC) + ["starttime", "endtime", "sampling_rate"]
    trace_contents = [{i: tr.stats[i] for i in stats_columns} for tr in stream]
    df = pd.DataFrame(trace_contents, columns=stats_columns)
    # ensure time(y) columns have proper dtypes
    df["starttime"] = to_datetime64(df["starttime"])
    df["endtime"] = to_datetime64(df["endtime"])
    df["sampling_period"] = to_timedelta64(1 / df["sampling_rate"])
    df["seed_id"] = get_seed_id_series(df)
    df["trace"] = [ObjectWrapper(tr) for tr in stream]
    return df
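# Hedged usage sketch: index obspy's three-trace example stream, assuming
# the helper above and its obsplus dependencies (to_datetime64,
# to_timedelta64, ObjectWrapper) are importable.
import obspy

df = _get_waveform_df(obspy.read())
print(df[["seed_id", "starttime", "endtime", "sampling_rate"]])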
def test_one_subset_raises(self, pick_df):
    """At least two columns are required in subset."""
    with pytest.raises(ValueError):
        upd.get_seed_id_series(pick_df, subset=["network"])
def test_bad_subset(self, pick_df):
    """A bad subset should raise ValueError."""
    with pytest.raises(ValueError):
        upd.get_seed_id_series(pick_df, subset=["network", "monkey"])
def test_seed_id_basic(self, pick_df):
    """Standard usage."""
    seed = upd.get_seed_id_series(pick_df)
    assert (seed == pick_df["seed_id"]).all()
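# Pulling the tested behavior together: a minimal sketch of what
# get_seed_id_series might look like, inferred only from the tests and call
# sites above (the real obsplus implementation may differ). It joins the
# NSLC columns, or a user-supplied subset of at least two of them, with
# dots, raising ValueError for missing or unknown columns.
import pandas as pd

NSLC = ("network", "station", "location", "channel")


def get_seed_id_series_sketch(df: pd.DataFrame, subset=None) -> pd.Series:
    """Return a series of dot-joined seed ids from NSLC-like columns."""
    cols = tuple(subset) if subset is not None else NSLC
    if subset is not None and (len(cols) < 2 or not set(cols).issubset(NSLC)):
        raise ValueError(f"subset must be at least two of {NSLC}")
    if not set(cols).issubset(df.columns):
        missing = set(cols) - set(df.columns)
        raise ValueError(f"dataframe is missing columns: {missing}")
    return df[list(cols)].astype(str).apply(".".join, axis=1)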