def __call__(
    self,
    time_arg: event_time_type,
    time_before: Optional[float] = None,
    time_after: Optional[float] = None,
    *args,
    **kwargs,
) -> obspy.Stream:
    """
    Using a reference time, return a waveform stream that encompasses that time.

    Parameters
    ----------
    time_arg
        The argument that will indicate a start time. Can be a length-one
        Catalog, an Event, a float, or a UTCDateTime object.
    time_before
        The time before time_arg to include in the waveforms.
    time_after
        The time after time_arg to include in the waveforms.

    Returns
    -------
    obspy.Stream
    """
    tbefore = to_timedelta64(time_before, default=self.time_before)
    tafter = to_timedelta64(time_after, default=self.time_after)
    assert (tbefore is not None) and (tafter is not None)
    # get the reference time from the object
    time = to_datetime64(get_reference_time(time_arg))
    t1 = time - tbefore
    t2 = time + tafter
    return self.get_waveforms(starttime=to_utc(t1), endtime=to_utc(t2), **kwargs)

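# A minimal usage sketch for __call__ above (hedged): `fetcher` stands for any
# instance of the class defining this method (e.g. an obsplus Fetcher) and
# `event` for an obspy Event; both names are assumptions, not part of the API.
def _example_event_query(fetcher, event):
    # fetch from 2 s before to 15 s after the event's reference time;
    # get_reference_time resolves the event to a single timestamp
    return fetcher(event, time_before=2, time_after=15)
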
def _index2stream(self, index, starttime=None, endtime=None, merge=True) -> Stream:
    """Return the waveforms in the index."""
    # get the absolute path to each data file (unique returns an ndarray)
    files = (str(self.bank_path) + index.path).unique()
    # make sure start and end times are UTCDateTime
    starttime = to_utc(starttime) if starttime else None
    endtime = to_utc(endtime) if endtime else None
    # iterate the files to read and try to load into waveforms
    kwargs = dict(format=self.format, starttime=starttime, endtime=endtime)
    func = partial(_try_read_stream, **kwargs)
    stt = obspy.Stream()
    chunksize = (len(files) // self._max_workers) or 1
    for st in self._map(func, files, chunksize=chunksize):
        if st is not None:
            stt += st
    # sort out nullish nslc codes
    stt = replace_null_nlsc_codes(stt)
    # filter out any traces not in the index (this can happen when files hold
    # multiple traces)
    nslc = set(get_seed_id_series(index))
    stt.traces = [x for x in stt if x.id in nslc]
    # trim, merge, attach response
    stt = self._prep_output_stream(stt, starttime, endtime, merge=merge)
    return stt

def get_waveforms_bulk(stream: Stream, bulk: bulk_waveform_arg_type, **kwargs) -> Stream:
    """
    Get a large number of waveforms with a bulk request.

    Parameters
    ----------
    stream
        A stream object.
    bulk
        A list of any number of tuples containing the following:
        (network, station, location, channel, starttime, endtime).
    """
    # get a dataframe of stream contents
    index = _get_waveform_df(stream)
    # get a dataframe of the bulk arguments, convert times to datetime64
    request_df = get_waveform_bulk_df(bulk)
    if not len(request_df):  # return an empty stream if no bulk requests provided
        return obspy.Stream()
    # get unique time windows and select the matching traces for each
    unique_times = np.unique(request_df[["starttime", "endtime"]].values, axis=0)
    traces = []
    for t1, t2 in unique_times:
        sub = _filter_index_to_bulk((t1, t2), index_df=index, bulk_df=request_df)
        new = obspy.Stream(traces=[x.data for x in sub["trace"]]).slice(
            starttime=to_utc(t1), endtime=to_utc(t2)
        )
        traces.extend(new.traces)
    return merge_traces(obspy.Stream(traces=traces))

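# Usage sketch for get_waveforms_bulk (hedged): relies on the module-level
# obspy import used throughout this file. The obspy example stream (network
# BW, station RJOB) provides real seed ids; the window offsets are arbitrary.
def _example_bulk_query():
    st = obspy.read()  # three traces: BW.RJOB..EH{Z,N,E}
    t1, t2 = st[0].stats.starttime, st[0].stats.endtime
    bulk = [
        ("BW", "RJOB", "", "EHZ", t1, t2 - 10),
        ("BW", "RJOB", "", "EHN", t1 + 5, t2),
    ]
    return get_waveforms_bulk(st, bulk)
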
def create_stream(
    self,
    starttime: utc_able_type,
    endtime: utc_able_type,
    seed_ids: Optional[List[str]] = None,
    sampling_rate: Optional[Union[float, int]] = None,
) -> obspy.Stream:
    """Create a stream of random data between the given times."""
    t1 = to_utc(starttime)
    t2 = to_utc(endtime)
    sr = sampling_rate or self.sampling_rate
    ar_len = int((t2.timestamp - t1.timestamp) * sr)
    st = obspy.Stream()
    for seed in seed_ids or self.seed_ids:
        n, s, l, c = seed.split(".")
        meta = {
            "sampling_rate": sr,
            "starttime": t1,
            "network": n,
            "station": s,
            "location": l,
            "channel": c,
        }
        data = np.random.randn(ar_len)
        tr = obspy.Trace(data=data, header=meta)
        st.append(tr)
    return st

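# Hedged usage sketch for create_stream: `client` is a hypothetical instance
# of the class defining the method (one providing .sampling_rate and
# .seed_ids defaults), and the seed id below is made up for illustration.
def _example_create_stream(client):
    return client.create_stream(
        "2020-01-01T00:00:00",
        "2020-01-01T00:01:00",
        seed_ids=["UU.TMU.01.HHZ"],  # hypothetical channel
        sampling_rate=100,
    )
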
def _trim_stream(df, stream, required_len, trim_tolerance):
    """
    Trim the stream to a common start/end time, raising a ValueError if the
    stream is disjointed.
    """
    # check trim tolerance (spread of start times and spread of end times)
    if trim_tolerance is not None:
        con1 = (df.start.max() - df.start.min()) > trim_tolerance
        con2 = (df.end.max() - df.end.min()) > trim_tolerance
        if con1 or con2:
            msg = (
                "the following waveforms did not meet the required trim "
                f"tolerance:\n{stream}"
            )
            raise ValueError(msg)
    # check length requirements, pop out any traces that don't meet them
    if required_len is not None:
        req_len = np.round(required_len * df.duration.max(), 2)
        too_short = df.duration <= req_len
        if too_short.any():
            trace_str = "\n".join([str(x) for x in df[too_short].trace])
            msg = f"These traces are not at least {req_len} seconds long:\n"
            warnings.warn(msg + trace_str + "\nremoving them", UserWarning)
            stream.traces = list(df[~too_short].trace)
        df = df[~too_short]
        if not len(df):
            return Stream()
    # get trim times and trim
    t1, t2 = to_utc(df.start.max()), to_utc(df.end.min())
    if t2 < t1:
        msg = f"The following stream has traces with no overlap:\n{stream}"
        raise ValueError(msg)
    return stream.trim(starttime=t1, endtime=t2)

def _func(time, ind, df, st):
    """Return waveforms from a df of bulk parameters."""
    match_chars = {"*", "?", "[", "]"}
    ar = np.ones(len(ind))  # indices of ind to use to load data
    _t1, _t2 = time[0], time[1]
    df = df[(df.t1 == _t1) & (df.t2 == _t2)]
    # determine which columns use matching or other select features
    uses_matches = [_column_contains(df[x], match_chars) for x in NSLC]
    match_ar = np.array(uses_matches).any(axis=0)
    df_match = df[match_ar]
    df_no_match = df[~match_ar]
    # handle columns that need matches (more expensive)
    if not df_match.empty:
        match_bulk = df_match.to_records(index=False)
        mar = np.array([filter_index(ind, *tuple(b)[:4]) for b in match_bulk])
        ar = np.logical_and(ar, mar.any(axis=0))
    # handle columns that do not need matches
    if not df_no_match.empty:
        nslc1 = set(get_seed_id_series(df_no_match))
        nslc2 = get_seed_id_series(ind)
        ar = np.logical_and(ar, nslc2.isin(nslc1))
    # get a list of used traces, combine and trim
    st = obspy.Stream([x for x, y in zip(st, ar) if y])
    return st.slice(starttime=to_utc(_t1), endtime=to_utc(_t2))

def stream2contiguous(stream: Stream) -> Stream:
    """
    Yield trimmed streams for times in which all traces have data.

    Parameters
    ----------
    stream
        The input stream.

    Examples
    --------
    >>> import obspy
    >>> st = obspy.read()
    >>> t1, t2 = st[0].stats.starttime, st[0].stats.endtime
    >>> _ = st[0].trim(endtime=t2 - 2)  # remove data at end of one trace
    >>> out = list(stream2contiguous(st))
    >>> # each yielded stream is trimmed to a window where all traces have data
    >>> assert all(len(st_) for st_ in out)
    """
    # pre-process waveforms by combining overlaps then breaking up masks
    stream.merge(method=1)
    stream = stream.split()
    # get seed_ids, start times, end times, and gaps
    seed_ids = {tr.id for tr in stream}
    starts, ends = _get_start_end(stream)
    # iterate start/end times, skip gaps and yield chunks of the waveforms
    for t1, t2 in zip(starts, ends):
        if t1 > t2 and len(starts) == len(ends) == 1:
            return  # if disjointed, shut down the generator
        assert t1 < t2
        stream_out = stream.slice(starttime=to_utc(t1), endtime=to_utc(t2))
        stream_out.merge(method=1)
        if len({tr.id for tr in stream_out}) == len(seed_ids):
            yield stream_out

def ta_time_range(ta_wavebank):
    """Return a (start, end) tuple of times from the ta_test bank."""
    df = ta_wavebank.read_index()
    t1 = to_utc(df["starttime"].min()) + 3600
    # round down to the nearest hour
    start = to_utc(t1.timestamp - t1.timestamp % 3600)
    end = start + 3600 * 6
    return to_utc(start), to_utc(end)

def _get_bulk_args(self, starttime=None, endtime=None, **kwargs) -> bulk_waveform_arg_type:
    """
    Get the bulk waveform arguments based on given start/end times.

    This method also takes into account data availability as contained in
    the stations data.

    Parameters
    ----------
    starttime
        Start times for query.
    endtime
        End times for query.

    Returns
    -------
    List of tuples of the form:
        [(network, station, location, channel, starttime, endtime)]
    """
    station_df = self.station_df.copy()
    inv = station_df[filter_index(station_df, **kwargs)]
    # replace None/NaN end dates with a large datetime
    inv.loc[inv["end_date"].isnull(), "end_date"] = LARGEDT64
    inv["end_date"] = inv["end_date"].astype("datetime64[ns]")
    # get start/end of the inventory
    inv_start = inv["start_date"].min()
    inv_end = inv["end_date"].max()
    # remove stations/channels that don't have data for the requested time
    min_time = to_datetime64(starttime, default=inv_start).min()
    max_time = to_datetime64(endtime, default=inv_end).max()
    con1, con2 = (inv["start_date"] > max_time), (inv["end_date"] < min_time)
    df = inv[~(con1 | con2)].set_index("seed_id")[list(NSLC)]
    if df.empty:  # return an empty list if no data found
        return []
    if isinstance(starttime, pd.Series):
        # Have to get clever here to make sure only active stations get used
        # and indices are not duplicated.
        new_start = starttime.loc[set(starttime.index).intersection(df.index)]
        new_end = endtime.loc[set(endtime.index).intersection(df.index)]
        df["starttime"] = new_start.loc[~new_start.index.duplicated()]
        df["endtime"] = new_end.loc[~new_end.index.duplicated()]
    else:
        df["starttime"] = starttime
        df["endtime"] = endtime
    # remove any rows that don't have defined start/end times
    out = df[~(df["starttime"].isnull() | df["endtime"].isnull())]
    # ensure we have UTCDateTime objects
    out["starttime"] = [to_utc(x) for x in out["starttime"]]
    out["endtime"] = [to_utc(x) for x in out["endtime"]]
    # convert to a list of tuples and return
    return [tuple(x) for x in out.to_records(index=False)]

def _get_stream_start_end(stream, gap_df):
    """
    Return the latest start time of the initial chunk and the earliest
    endtime of the final chunk.
    """
    st1 = stream.slice(endtime=to_utc(gap_df.t1.min()))
    st2 = stream.slice(starttime=to_utc(gap_df.t2.max()))
    t1 = max(tr.stats.starttime.timestamp for tr in st1)
    t2 = min(tr.stats.endtime.timestamp for tr in st2)
    assert t1 < t2
    return t1, t2

def stream_bulk_split(
    st: Stream, bulk: List[waveform_request_type], fill_value: Any = None
) -> List[Stream]:
    """
    Split a stream into a list of streams that meet requirements in bulk.

    This is similar to the get_waveforms_bulk methods of waveform_client,
    but rather than merging any overlapping data it is returned in a list
    of streams.

    Parameters
    ----------
    st
        A stream object.
    bulk
        A bulk request. Wildcards not currently supported on str params.
    fill_value
        If not None, fill any missing data in the time range with this value.

    Returns
    -------
    List of streams, each meeting the corresponding request in bulk.
    """
    # return nothing if empty bulk or stream args
    bulk = _get_bulk(bulk)
    if not bulk or len(st) == 0:
        return []
    # get a dataframe of stream contents
    sdf = _stream_data_to_df(st)
    # iterate bulk, accumulate output
    out = []
    for barg in bulk:
        assert len(barg) == 6, f"{barg} is not a valid bulk arg, must have len 6"
        need = filter_index(sdf, *barg)
        traces = [tr for tr, bo in zip(st, need) if bo]
        new_st = obspy.Stream(traces)
        t1, t2 = to_utc(barg[-2]), to_utc(barg[-1])
        new = new_st.slice(starttime=t1, endtime=t2)
        # apply fill if needed
        if fill_value is not None:
            new = new.trim(starttime=t1, endtime=t2, fill_value=fill_value, pad=True)
        if new is None or not len(new):
            out.append(obspy.Stream())
            continue
        new = merge_traces(new)
        out.append(new)
    assert len(out) == len(bulk), "output is not the same length as bulk"
    return out

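# Usage sketch for stream_bulk_split: one request tuple yields one output
# stream. The obspy example stream supplies real seed ids; the second window
# is deliberately short to show per-request trimming.
def _example_bulk_split():
    st = obspy.read()
    t1, t2 = st[0].stats.starttime, st[0].stats.endtime
    bulk = [
        ("BW", "RJOB", "", "EHZ", t1, t2),
        ("BW", "RJOB", "", "EHN", t1, t1 + 5),
    ]
    st_list = stream_bulk_split(st, bulk, fill_value=0)
    assert len(st_list) == len(bulk)  # one (possibly empty) stream per request
    return st_list
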
def test_duplicate_stations(self, inv_df_duplicate_channels):
    """
    Ensure duplicate stations create Station objects with the correct
    time range.
    """
    df = inv_df_duplicate_channels
    fur_df = df[df["station"] == "FUR"]
    inv = df_to_inventory(fur_df).select(station="FUR")
    stations = inv.networks[0].stations
    assert len(stations) == 1
    fur = stations[0]
    assert fur.start_date == to_utc(fur_df["start_date"].min())
    assert fur.end_date == to_utc(fur_df["end_date"].max())

def incomplete_trace(self, node_stats_group, node_st) -> Stream:
    """Return a stream with part of the data missing for one of its traces."""
    # select a trace from a pick that is referenced in the stats group
    st = node_st.copy()
    pick = node_stats_group.data.iloc[0]
    seed_id = pick.name[-1]
    tr = st.select(id=seed_id)[0]
    # trim the trace so it ends in the middle of the desired time window
    pick_start = pick["starttime"]
    pick_end = pick["endtime"]
    new_end = to_utc(pick_end) - (to_utc(pick_end) - to_utc(pick_start)) / 2
    tr.trim(tr.stats.starttime, new_end)  # acts in place
    return st

def _prep_output_stream(self, st, starttime=None, endtime=None, merge=True) -> obspy.Stream:
    """
    Prepare the stream for output by trimming to the desired times and,
    optionally, merging channels.
    """
    if not len(st):
        return st
    starttime = starttime or min(x.stats.starttime for x in st)
    endtime = endtime or max(x.stats.endtime for x in st)
    # trim
    st.trim(starttime=to_utc(starttime), endtime=to_utc(endtime))
    if merge:
        st = merge_traces(st, inplace=True)
    return st.sort()

def archive_to_sds(
    bank: Union[Path, str, "obsplus.WaveBank"],
    sds_path: Union[Path, str],
    starttime: Optional[UTCDateTime] = None,
    endtime: Optional[UTCDateTime] = None,
    overlap: float = 30,
    type_code: str = "D",
    stream_processor: Optional[callable] = None,
):
    """
    Create a SeisComP Data Structure (SDS) archive from a waveform source.

    Parameters
    ----------
    bank
        A wavebank or path to such.
    sds_path
        The path for the new SDS archive to be created.
    starttime
        If not None, the starttime to convert data from bank.
    endtime
        If not None, the endtime to convert data from bank.
    overlap
        The overlap to use for each file.
    type_code
        The str indicating the datatype.
    stream_processor
        A callable that will take a single stream as input and return a
        single stream. May return an empty stream to skip a stream.

    Notes
    -----
    See: https://www.seiscomp3.org/doc/applications/slarchive/SDS.html
    """
    sds_path = Path(sds_path)
    # create a bank object for yielding continuous waveforms
    bank = obsplus.WaveBank(bank)
    bank.update_index()
    # get starttime/endtime
    index = bank.read_index()
    ts1 = index.starttime.min() if not starttime else starttime
    t1 = _nearest_day(ts1)
    t2 = to_utc(index.endtime.max() if not endtime else endtime)
    nslcs = get_seed_id_series(index).unique()
    # iterate over each nslc and get data for the selected channel
    for nslc in nslcs:
        nslc_dict = {n: v for n, v in zip(NSLC, nslc.split("."))}
        # yield waveforms in desired chunks
        ykwargs = dict(starttime=t1, endtime=t2, overlap=overlap, duration=86400)
        ykwargs.update(nslc_dict)
        for st in bank.yield_waveforms(**ykwargs):
            if stream_processor:  # apply stream processor if needed
                st = stream_processor(st)
            if st:
                path = _get_sds_filename(st, sds_path, type_code, **nslc_dict)
                st.write(str(path), "mseed")

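# Hedged usage sketch for archive_to_sds: the paths below are hypothetical
# placeholders, and the processor merely demonstrates the expected callable
# signature (Stream -> Stream, where an empty stream skips writing).
def _example_archive_to_sds():
    def _processor(st):
        # detrend each daily chunk before it is written
        return st.detrend("linear")

    archive_to_sds(
        "path/to/wavebank",  # existing bank of waveforms (hypothetical)
        "path/to/sds_archive",  # destination for the new SDS tree
        overlap=30,
        stream_processor=_processor,
    )
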
def yield_waveforms(
    self,
    network: Optional[str] = None,
    station: Optional[str] = None,
    location: Optional[str] = None,
    channel: Optional[str] = None,
    starttime: Optional[obspy.UTCDateTime] = None,
    endtime: Optional[obspy.UTCDateTime] = None,
    duration: float = 3600.0,
    overlap: Optional[float] = None,
) -> Stream:
    """
    Yield time-series segments from the waveform client.

    Parameters
    ----------
    {get_waveforms_params}
    duration : float
        The duration of the streams to yield. All selected channels will
        be included in the waveforms.
    overlap : float
        If duration is used, the amount of overlap in yielded streams,
        added to the end of the waveforms.

    Notes
    -----
    All string parameters can use posix-style matching with * and ? chars.

    Total duration of yielded streams = duration + overlap.

    If no starttime or endtime is provided the min/max indicated by the
    stations will be used.
    """
    # Note: although WaveBank has a yield_waveforms method, we want the
    # fetcher to work with any client, so we don't use its implementation.
    starttime = to_utc(starttime or self.station_df["start_date"].min())
    endtime = to_utc(endtime or self.station_df["end_date"].max())
    time_chunks = make_time_chunks(starttime, endtime, duration, overlap)
    for t1, t2 in time_chunks:
        kwargs = dict(network=network, station=station, location=location, channel=channel)
        yield self.get_waveforms(starttime=t1, endtime=t2, **kwargs)

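# Hedged usage sketch for yield_waveforms: `fetcher` is a hypothetical
# instance of the class defining the method; each yielded stream spans
# duration + overlap seconds.
def _example_yield_hour_chunks(fetcher):
    for st in fetcher.yield_waveforms(duration=3600, overlap=60, channel="HH?"):
        ...  # process each hour-long (plus 60 s overlap) stream here
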
def get_waveforms(
    stream: Stream,
    network: str = "*",
    station: str = "*",
    location: str = "*",
    channel: str = "*",
    starttime: Optional[UTC] = None,
    endtime: Optional[UTC] = None,
) -> obspy.Stream:
    """
    A subset of the Client.get_waveforms method.

    Simply makes successive calls to Stream.select and Stream.trim under
    the hood. Matching is available on all str parameters.

    Parameters
    ----------
    stream
        A stream object.
    network
        The network code.
    station
        The station code.
    location
        The location code.
    channel
        The channel code.
    starttime
        Starttime for the query.
    endtime
        Endtime for the query.
    """
    t1, t2 = to_utc(starttime or SMALLDT64), to_utc(endtime or LARGEDT64)
    kwargs = {c: v for c, v in zip(NSLC, [network, station, location, channel])}
    st = stream.select(**kwargs).slice(starttime=t1, endtime=t2).copy()
    return st

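# Usage sketch for the in-memory get_waveforms above: wildcards work on all
# string parameters, and omitted times default to SMALLDT64/LARGEDT64.
def _example_get_waveforms():
    st = obspy.read()
    t1 = st[0].stats.starttime
    # select only the vertical channel, trimmed to the first 10 seconds
    return get_waveforms(st, channel="EHZ", starttime=t1, endtime=t1 + 10)
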
def floatify_dict(some_dict):
    """
    Iterate a dict and convert all Timestamps/datetime64 to floats, then
    round all floats to the nearest 4 decimal places.
    """
    out = {}
    for i, v in some_dict.items():
        if isinstance(v, (pd.Timestamp, np.datetime64)):
            v = to_utc(v).timestamp
        if isinstance(v, float):
            v = np.round(v, 4)
        out[i] = v
    return out

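# A quick illustration of floatify_dict (relies on the module-level pd/np
# imports used above): times collapse to POSIX floats, floats get rounded,
# and everything else passes through untouched.
def _example_floatify():
    d = {"time": pd.Timestamp("2020-01-01"), "value": 1.23456789, "name": "UU"}
    out = floatify_dict(d)
    assert out == {"time": 1577836800.0, "value": 1.2346, "name": "UU"}
    return out
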
def _create_first_pick_origin(first_pick, channel_ser, depth):
    """Create an origin based on the first pick and a channel series."""
    msg = (
        "origin fixed to location and time of earliest pick by "
        f"obsplus version {obsplus.__last_version__}"
    )
    comment = ev.Comment(text=msg)
    odict = dict(
        time=to_utc(first_pick["time"]),
        latitude=channel_ser["latitude"],
        longitude=channel_ser["longitude"],
        depth=depth,
        time_fixed=True,
        comments=[comment],
    )
    return ev.Origin(**odict)

def test_only_p_phases(self, event_dict_p, subbing_fetcher_with_processor):
    """Make sure only stations that have P picks are returned."""
    stream = subbing_fetcher_with_processor.waveform_client.get_waveforms()
    df = subbing_fetcher_with_processor.picks_df
    for eve_id, st in event_dict_p.items():
        con1 = df["event_id"] == eve_id
        con2 = df["phase_hint"].str.upper() == "P"
        pick_df = df[con1 & con2]
        # iterate each pick, determine if it has data in the bank
        for ind, row in pick_df.iterrows():
            time = to_utc(row["time"])
            kwargs = dict(
                starttime=time - self.time_before,
                endtime=time + self.time_after,
                station=row["station"],
            )
            st1 = stream.get_waveforms(**kwargs)
            st2 = st.get_waveforms(**kwargs)
            assert_streams_almost_equal(st1, st2, allow_off_by_one=True)

def df_with_get_stations_kwargs(self):
    """
    Add response information to the dataframe using get_stations_kwargs.

    Add an additional station which will need to get all data from other
    columns.
    """
    _inv = obsplus.load_dataset("bingham_test").station_client.get_stations()
    inv = _inv.select(station="NOQ")
    with suppress_warnings():
        df = obsplus.stations_to_df(inv).reset_index()
    # set get_station_kwargs for the last two channels, leave the first empty
    kwargs_list = [""]
    for ind, row in df.iloc[1:].iterrows():
        kwargs = {x: row[x] for x in NSLC}
        kwargs["endafter"] = str(to_utc(row["start_date"]))
        kwargs_list.append(kwargs)
    # set the last kwargs to str to simulate reading from csv
    kwargs_list[-1] = str(kwargs_list[-1])
    df["get_station_kwargs"] = kwargs_list
    # set the first kwargs to a string to make sure it can be parsed;
    # this is important for e.g. reading data from a csv
    df.loc[0, "get_station_kwargs"] = str(df.loc[0, "get_station_kwargs"])
    # now add a row with an empty get_station_kwargs column
    old = dict(df.iloc[0])
    new = {
        "station": "CWU",
        "network": "UU",
        "channel": "EHZ",
        "location": "01",
        "seed_id": "UU.CWU.01.EHZ",
        "get_station_kwargs": "{}",
    }
    old.update(new)
    ser = pd.Series(old)
    return df.append(ser, ignore_index=True)

def gauss_trace_group(gauss_stat_group) -> TraceGroup:
    """Create a TraceGroup with a Gaussian pulse as the data."""
    # generate the data
    data = gauss(_t, _a, _b, _c)
    gauss_stat_group.data["sampling_rate"] = 1 / _dt
    # build a stream from the data
    tr = Trace(
        data=data,
        header={
            "starttime": to_utc(gauss_stat_group.data.iloc[0].starttime),
            "delta": _dt,
            "network": "UK",
            "station": "STA1",
            "channel": "HHZ",
        },
    )
    st = Stream()
    st.append(tr)
    # add a second trace with a substantial discontinuity caused by
    # zero-padding (same data, but the StatsGroup time window halves it)
    st.append(tr.copy())
    st[1].stats.station = "STA2"
    # make a TraceGroup
    return mopy.TraceGroup(gauss_stat_group, st, "displacement").fillna()

def test_single_value(self, value):
    """Test either a sequence or UTCDateTime is returned."""
    out = to_utc(value)
    assert isinstance(out, (Sequence, UTCDateTime, np.ndarray))

def new_time(self, bing_first_time):
    """Get a new time based on the first bingham_test event + 1 second."""
    return to_utc(bing_first_time + 1)

def test_time(self, new_event_stream, new_time):
    """Ensure the new time was returned."""
    assert len(new_event_stream)
    t1 = to_utc(new_event_stream[0].stats.starttime.timestamp)
    t2 = to_utc(new_event_stream[0].stats.endtime.timestamp)
    assert t1 < new_time < t2

def _nearest_day(time):
    """Round a time down to the nearest day."""
    ts = to_utc(time).timestamp
    ts_day = 3600 * 24
    return to_utc(ts - (ts % ts_day))

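# Illustration of _nearest_day: any to_utc-able input is floored to 00:00:00
# of its UTC day.
def _example_nearest_day():
    out = _nearest_day("2020-01-01T12:34:56")
    assert out == to_utc("2020-01-01T00:00:00")
    return out
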
def bing_first_time(bingham_dataset):
    """Return the time of the first event in the bingham_test dataset."""
    df = obsplus.events_to_df(bingham_dataset.event_client.get_events())
    return to_utc(df.iloc[0]["time"])

def _times_to_utc(df):
    """Convert the time columns of a dataframe to UTCDateTime."""
    df["starttime"] = to_utc(df["starttime"])
    df["endtime"] = to_utc(df["endtime"])
    return df

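# Small illustration of _times_to_utc (hedged: relies on to_utc accepting a
# pandas Series and converting it element-wise, as obsplus's converter does).
def _example_times_to_utc():
    df = pd.DataFrame({"starttime": ["2020-01-01"], "endtime": ["2020-01-02"]})
    return _times_to_utc(df)
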