def __call__(self, starttime, endtime, buffer, **kwargs):
    """ Get start and end times, perform in-kernel lookup. """
    # get defaults if starttime or endtime is None
    starttime = to_datetime64(starttime or SMALLDT64)
    endtime = to_datetime64(endtime or LARGEDT64)
    # find out if the query falls within one of the cached time ranges
    con1 = self.cache.t1 <= starttime
    con2 = self.cache.t2 >= endtime
    con3 = self.cache.kwargs == self._kwargs_to_str(kwargs)
    cached_index = self.cache[con1 & con2 & con3]
    if not len(cached_index):
        # query is not cached, get it from the hdf5 file
        where = get_kernel_query(int(starttime), int(endtime), int(buffer))
        raw_index = self._get_index(where, **kwargs)
        # replace "None" with None
        ic = self.bank.index_str
        raw_index.loc[:, ic] = raw_index.loc[:, ic].replace(["None"], [None])
        # convert data types used by bank back to those seen by user
        index = raw_index.astype(dict(self.bank._dtypes_output))
        self._set_cache(index, starttime, endtime, kwargs)
    else:
        index = cached_index.iloc[0]["cindex"]
    # trim index down to rows that overlap the requested (buffered) window
    con1 = index["starttime"] >= (endtime + buffer)
    con2 = index["endtime"] <= (starttime - buffer)
    return index[~(con1 | con2)]

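# Illustrative sketch of the caching idea above (all names and values here
# are hypothetical, not from the source): a query is a cache hit when a
# previously served time range fully contains it and the kwargs match.
import numpy as np
import pandas as pd

cache = pd.DataFrame(
    {
        "t1": [np.datetime64("2019-01-01T00:00:00")],
        "t2": [np.datetime64("2019-01-02T00:00:00")],
        "kwargs": ["{}"],
        "cindex": [pd.DataFrame(columns=["starttime", "endtime"])],
    }
)
t1 = np.datetime64("2019-01-01T06:00:00")
t2 = np.datetime64("2019-01-01T12:00:00")
hit = cache[(cache.t1 <= t1) & (cache.t2 >= t2) & (cache.kwargs == "{}")]
assert len(hit) == 1  # the narrower query can be served from the cache
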
def test_correct_endtime_in_index(self, default_wbank):
    """ ensure the index has times consistent with traces in waveforms """
    index = default_wbank.read_index()
    st = obspy.read()
    starttimes = [to_datetime64(tr.stats.starttime) for tr in st]
    endtimes = [to_datetime64(tr.stats.endtime) for tr in st]
    assert min(starttimes) == index.starttime.min().to_datetime64()
    assert max(endtimes) == index.endtime.max().to_datetime64()

def yield_waveforms(
    self,
    network: Optional[str] = None,
    station: Optional[str] = None,
    location: Optional[str] = None,
    channel: Optional[str] = None,
    starttime: Optional[obspy.UTCDateTime] = None,
    endtime: Optional[obspy.UTCDateTime] = None,
    attach_response: bool = False,
    duration: float = 3600.0,
    overlap: Optional[float] = None,
) -> Stream:
    """
    Yield waveforms from the bank.

    Parameters
    ----------
    {get_waveforms_params}
    attach_response : bool
        If True attach the response to the waveforms using the stations.
    duration : float
        The duration of the streams to yield. All selected channels
        will be included in the waveforms.
    overlap : float
        If duration is used, the amount of overlap in yielded streams,
        added to the end of the waveforms.

    Notes
    -----
    All string parameters can use posix-style matching with * and ? chars.
    """
    # get times in datetime64 format
    starttime = to_datetime64(starttime, 0.0)
    endtime = to_datetime64(endtime, "2999-01-01")
    # read in the whole index df
    index = self.read_index(
        network=network,
        station=station,
        location=location,
        channel=channel,
        starttime=starttime,
        endtime=endtime,
    )
    # adjust start/end times
    starttime = max(starttime, index.starttime.min())
    endtime = min(endtime, index.endtime.max())
    # chunk time and iterate over chunks
    time_chunks = make_time_chunks(starttime, endtime, duration, overlap)
    for t1, t2 in time_chunks:
        t1, t2 = to_datetime64(t1), to_datetime64(t2)
        con1 = (index.starttime - self.buffer) > t2
        con2 = (index.endtime + self.buffer) < t1
        ind = index[~(con1 | con2)]
        if not len(ind):
            continue
        yield self._index2stream(ind, t1, t2, attach_response)

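# Illustrative usage sketch of yield_waveforms (the bank path below is
# hypothetical): iterate hour-long streams with a minute of overlap.
import obsplus

bank = obsplus.WaveBank("path/to/waveform_files")
for st in bank.yield_waveforms(duration=3600.0, overlap=60.0):
    print(st)  # each st is an obspy.Stream spanning ~1 hour plus overlap
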
def test_with_nulls(self):
    """ Test for handling nulls. """
    test_input = (np.nan, None, "", 15)
    out = np.array(to_datetime64(test_input))
    # first make sure empty values worked
    assert pd.isnull(out[:3]).all()
    assert out[-1].astype(int) == obspy.UTCDateTime(15)._ns

def __call__(
    self,
    time_arg: event_time_type,
    time_before: Optional[float] = None,
    time_after: Optional[float] = None,
    *args,
    **kwargs,
) -> obspy.Stream:
    """
    Using a reference time, return a stream of waveforms that encompasses
    that time.

    Parameters
    ----------
    time_arg
        The argument that will indicate a start time. Can be a one-length
        events (catalog), an event, a float, or a UTCDateTime object.
    time_before
        The time before time_arg to include in waveforms.
    time_after
        The time after time_arg to include in waveforms.

    Returns
    -------
    obspy.Stream
    """
    tbefore = to_timedelta64(time_before, default=self.time_before)
    tafter = to_timedelta64(time_after, default=self.time_after)
    assert (tbefore is not None) and (tafter is not None)
    # get the reference time from the object
    time = to_datetime64(get_reference_time(time_arg))
    t1 = time - tbefore
    t2 = time + tafter
    return self.get_waveforms(starttime=t1, endtime=t2, **kwargs)

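# Illustrative sketch (the `fetcher` instance is an assumption): because
# __call__ is defined, the object itself can be invoked with any time-like
# argument to pull a window of waveforms around that time.
import obspy

t = obspy.UTCDateTime("2019-01-10T12:00:00")
st = fetcher(t, time_before=10, time_after=60)  # 70 s of data around t
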
def test_npdatetime64_as_input(self):
    """ This should also work on np.datetime64. """
    test_input = np.array((np.datetime64(1000, "s"), np.datetime64(100, "ns")))
    out = to_datetime64(test_input)
    assert isinstance(out, np.ndarray)
    assert (test_input == out).all()

def test_pandas_timestamp(self):
    """ Timestamps should also work. """
    kwargs = dict(year=2019, month=10, day=11, hour=12)
    ts = pd.Timestamp(**kwargs)
    out = to_datetime64((ts,))
    expected_out = (ts.to_datetime64(),)
    assert out == expected_out

def test_simple(self):
    """ Test converting simple UTCDateTime-able things. """
    test_input = ("2019-01-10 11-12", obspy.UTCDateTime("2019-01-10T12-12"), 100)
    expected = np.array([obspy.UTCDateTime(x)._ns for x in test_input])
    out = np.array(to_datetime64(test_input)).astype(int)
    assert np.equal(expected, out).all()

def test_starttime_origin_time_separation(self, dar_attached_picks):
    """ ensure the start of the trace and the event origin aren't too far off """
    dar = dar_attached_picks
    cat = dar.attrs["events"]
    for ev in cat:
        rid = ev.resource_id
        time = to_datetime64(ev.origins[-1].time)
        dd = dar[dar.stream_id == rid]
        assert dd.origin_time.values - time == EMPTYTD64
        assert ((dd.starttime.values - to_utc(time).timestamp) < 100).all()

def test_filter_index(self, crandall_dataset):
    """ Tests for filtering an index with the filter_index function. """
    # this is mainly here to test the time filtering, because the bank
    # operations pass this off to the HDF5 kernel.
    index = crandall_dataset.waveform_client.read_index(network="UU")
    mean_ns = index.starttime.astype(int).mean()
    t1 = to_datetime64(obspy.UTCDateTime(ns=int(mean_ns)))
    t2 = index.endtime.max()
    kwargs = dict(network="UU", station="*", location="*", channel="*")
    bool_ind = filter_index(index, starttime=t1, endtime=t2, **kwargs)
    assert bool_ind.any()  # at least some rows should pass the filter

def _get_bulk_arg(self, starttime=None, endtime=None, **kwargs) -> list:
    """
    Get the argument passed to get_waveforms_bulk, see obspy.fdsn.client
    for more info.
    """
    station_df = self.station_df.copy()
    inv = station_df[filter_index(station_df, **kwargs)]
    # replace None/NaN with a large time so open-ended channels are kept
    inv.loc[inv["end_date"].isnull(), "end_date"] = LARGEDT64
    inv["end_date"] = inv["end_date"].astype("datetime64[ns]")
    # remove stations/channels that don't have data for the requested time
    starttime = to_datetime64(starttime, default=inv["start_date"].min())
    endtime = to_datetime64(endtime, default=inv["end_date"].max())
    con1, con2 = (inv["start_date"] > endtime), (inv["end_date"] < starttime)
    inv = inv[~(con1 | con2)]
    df = inv[list(NSLC)]
    if df.empty:  # return empty list if no data found
        return []
    df.loc[:, "starttime"] = starttime
    df.loc[:, "endtime"] = endtime
    # remove any rows that don't have defined start/end times
    out = df[(~df["starttime"].isnull()) & (~df["endtime"].isnull())]
    # convert to list of tuples and return
    return [tuple(x) for x in out.to_records(index=False)]

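# Sketch of the bulk-argument shape this helper produces (the values are
# hypothetical): one (network, station, location, channel, starttime,
# endtime) tuple per requested channel, as consumed by get_waveforms_bulk.
import numpy as np

bulk = [
    (
        "UU", "SRU", "", "HHZ",
        np.datetime64("2019-01-01T00:00:00"),
        np.datetime64("2019-01-01T01:00:00"),
    ),
]
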
def get_phase_window_df(  # noqa: C901
    event: ev.Event,
    max_duration: Optional[Union[float, int, Mapping]] = None,
    min_duration: Optional[Union[float, int, Mapping]] = None,
    channel_codes: Optional[Union[Collection, pd.Series]] = None,
    buffer_ratio: Optional[float] = None,
    restrict_to_arrivals: bool = True,
) -> pd.DataFrame:
    """
    Return a dataframe of phase picks for the event.

    Does the following:
        1) Removes the rejected picks.
        2) Defines pick time windows using either:
            a) a corresponding amplitude whose type matches the phase hint
               of the pick and has a time window, or
            b) the start of the phase to the arrival time plus min_duration.

    Parameters
    ----------
    event
        The seismic event.
    max_duration
        The maximum duration (in seconds) of a phase. Can either be a
        scalar, or a mapping whose keys are seed_ids and values are
        applied to that specific channel.
    min_duration
        The minimum duration (in seconds) of a phase. Can either be a
        scalar, or a mapping whose keys are seed_ids and values are
        applied to that specific channel.
    channel_codes
        If provided, supplies the needed information to expand the
        dataframe to include an entry for each channel on a station for a
        given pick. For example, if this is used, a P pick that occurs on
        a HHZ channel will also have an entry on HHE and HHN (assuming
        they are in the list).
    buffer_ratio
        If not None, the ratio of the total duration of the phase windows
        which should be added to BOTH the start and end of the phase
        window.
    restrict_to_arrivals
        If True, only use picks for which there is an arrival on the
        preferred origin.
    """
    reftime = to_datetime64(get_reference_time(event))

    def _get_earliest_s_time(df):
        return df[df.phase_hint == "S"].time.min()

    def _get_extrema_like_df(df, extrema_arg):
        """
        Get a min or max argument with the same length as df.

        This is done so each row's duration can be compared to a
        row-specific value.
        """
        if isinstance(extrema_arg, pd.Series):
            return df["seed_id"].map(extrema_arg.droplevel("seed_id_less"))
        elif isinstance(extrema_arg, Mapping):
            return df["seed_id"].map(extrema_arg)
        else:
            return np.ones(len(df)) * extrema_arg

    def _get_picks_df(restrict_to_arrivals):
        """Get the picks dataframe, remove picks flagged as rejected."""
        pdf = obsplus.picks_to_df(event)
        # add the seed id column that drops the component from the channel
        pdf["seed_id_less"] = pdf["seed_id"].str[:-1]
        if restrict_to_arrivals:
            adf = obsplus.arrivals_to_df(event)
            pdf = pdf.loc[pdf["resource_id"].isin(adf["pick_id"])]
        # remove rejected picks
        pdf = pdf[pdf.evaluation_status != "rejected"]
        # toss any picks from stations that have S-picks earlier than P-picks
        if {"P", "S"}.issubset(pdf["phase_hint"]):
            phs = pdf.groupby("phase_hint")
            p_picks = phs.get_group("P")
            s_picks = phs.get_group("S")
            both = set(p_picks["seed_id_less"]).intersection(s_picks["seed_id_less"])
            p_picks = (
                p_picks.loc[p_picks["seed_id_less"].isin(both)]
                .set_index("seed_id_less")
                .sort_index()
            )
            s_picks = (
                s_picks.loc[s_picks["seed_id_less"].isin(both)]
                .set_index("seed_id_less")
                .sort_index()
            )
            mask = p_picks["time"] > s_picks["time"]
            bad_p = p_picks.loc[mask]
            bad_s = s_picks.loc[mask]
            if mask.any():
                warnings.warn(
                    "S-pick is earlier than P-pick for one or more picks. "
                    "Skipping phases."
                )
                pdf = pdf.loc[
                    ~pdf["resource_id"].isin(bad_s["resource_id"])
                    & ~pdf["resource_id"].isin(bad_p["resource_id"])
                ]
        if not len(pdf):
            raise NoPhaseInformationError(f"No valid phases for event:\n{event}")
        # rename the resource_id column for later merging
        pdf.rename(columns={"resource_id": "pick_id"}, inplace=True)
        return pdf

    def _add_amplitudes(df):
        """Add amplitudes to picks."""
        dtypes = {
            "pick_id": str,
            "twindow_start": "timedelta64[ns]",
            "twindow_end": "timedelta64[ns]",
            "twindow_ref": "datetime64[ns]",
        }
        amp_df = event.amplitudes_to_df()
        # drop rejected amplitudes
        amp_df = amp_df.loc[amp_df["evaluation_status"] != "rejected"]
        if amp_df.empty:
            # no data, init empty df with expected cols
            amp_df = pd.DataFrame(columns=list(dtypes)).astype(dtypes)
        else:
            # rename amplitude columns to match the pick-window names
            amp_df.rename(
                columns={
                    "time_begin": "twindow_start",
                    "time_end": "twindow_end",
                    "reference": "twindow_ref",
                },
                inplace=True,
            )
        # make sure time-related columns have the expected dtypes
        amp_df = amp_df[list(dtypes)].astype(dtypes)
        # Note: the amplitude list can be longer than the pick list because
        # of the logic for dropping picks earlier.
        merged = df.merge(
            amp_df, left_on="pick_id", right_on="pick_id", how="outer"
        ).dropna(subset=["time"])
        assert len(merged) == len(df)
        return _add_starttime_end(merged)

    def _add_starttime_end(df):
        """Add the time window start and end."""
        # fill references with start times of phases if empty
        df.loc[df["twindow_ref"].isnull(), "twindow_ref"] = df["time"]
        # fill NaTs with 0 second timedeltas
        twindow_start = df["twindow_start"].fillna(np.timedelta64(0, "ns"))
        twindow_end = df["twindow_end"].fillna(np.timedelta64(0, "ns"))
        # determine start/end times of phase windows
        df["starttime"] = df["twindow_ref"] - twindow_start
        df["endtime"] = df["twindow_ref"] + twindow_end
        # add travel time
        df["travel_time"] = df["time"] - reftime
        # get earliest S phase by station
        _s_start = df.groupby(list(NSLC[:2])).apply(_get_earliest_s_time)
        s_start = _s_start.rename("s_start").to_frame().reset_index()
        # merge back into the pick df; use either the defined window or the
        # S phase, whichever is smaller
        dd2 = df.merge(s_start, on=["network", "station"], how="left")
        # get dataframe indices for P
        p_inds = df[df.phase_hint == "P"].index
        # make sure P end times don't exceed S start times
        endtime_or_s_start = dd2[["s_start", "endtime"]].min(axis=1, skipna=True)
        df.loc[p_inds, "endtime"] = endtime_or_s_start[p_inds]
        duration = abs(df["endtime"] - df["starttime"])
        # make sure all values are under the max phase duration, else truncate
        if max_duration is not None:
            max_dur = to_timedelta64(_get_extrema_like_df(df, max_duration))
            larger_than_max = duration > max_dur
            df.loc[larger_than_max, "endtime"] = df["starttime"] + max_dur
        # make sure all values are at least min_duration, else expand them
        if min_duration is not None:
            min_dur = to_timedelta64(_get_extrema_like_df(df, min_duration))
            smaller_than_min = duration < min_dur
            df.loc[smaller_than_min, "endtime"] = df["starttime"] + min_dur
        # sanity checks
        assert (df["endtime"] >= df["starttime"]).all()
        assert not (df["starttime"].isnull()).any()
        return df

    def _duplicate_on_same_stations(df):
        """Duplicate all the entries to get the 3 components for each station."""
        # make a dict of channel[:-1] and matching channels
        assert channel_codes is not None
        code_less_1 = defaultdict(list)
        for code in channel_codes:
            code_less_1[code[0]].append(code[1])
        # create expanded df
        new_inds = [x for y in df["seed_id"].unique() for x in code_less_1[y[:-1]]]
        # get seed_id columns and merge back together
        df_new = pd.DataFrame(new_inds, columns=["seed_id"])
        df_new["seed_id_less"] = df_new["seed_id"].str[:-1]
        seed_id = expand_seed_id(df_new["seed_id"])
        df_new = df_new.join(seed_id)
        # now merge in the old dataframe for the full expansion
        right_cols = list(PHASE_WINDOW_INTERMEDIATE_DTYPES)
        out = pd.merge(df_new, df[right_cols], on="seed_id_less", how="left")
        return out.drop_duplicates()

    def _apply_buffer(df):
        """Add buffers on either end of the waveform for tapering."""
        buff = (df["endtime"] - df["starttime"]) * buffer_ratio
        df["starttime"] = df["starttime"] - buff
        df["endtime"] = df["endtime"] + buff
        return df

    # read picks in, filter out rejected picks, then attach amplitudes
    dd = _add_amplitudes(_get_picks_df(restrict_to_arrivals))
    # get the output columns
    out = dd[list(PHASE_WINDOW_DF_DTYPES)]
    # add buffer to window start/end
    if buffer_ratio is not None:
        out = _apply_buffer(out)
    # if channel codes are provided, make a duplicate of each phase window
    # row for each channel on the same station
    if channel_codes:
        out = _duplicate_on_same_stations(out)[list(PHASE_WINDOW_DF_DTYPES)]
    return out

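# Illustrative usage sketch (the dataset name and client calls are
# assumptions based on the obsplus dataset API): build phase windows for
# one event, capping windows at 60 s and adding a 5% taper buffer.
import obsplus

events = obsplus.load_dataset("crandall_test").event_client.get_events()
phase_df = get_phase_window_df(events[0], max_duration=60, buffer_ratio=0.05)
print(phase_df[["seed_id", "phase_hint", "starttime", "endtime"]])
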
class TestGetEventData:
    t1 = to_datetime64("2009-04-01")
    t2 = to_datetime64("2009-04-04")
    path = "eventwaveforms/{year}/{julday}"

    # fixtures
    @pytest.fixture(scope="class")
    def temp_dir_path(self):
        """ return a path to a temporary directory """
        with tempfile.TemporaryDirectory() as tempdir:
            out = os.path.join(tempdir, "temp")
            yield out

    @pytest.fixture(scope="class")
    def fetcher(self, kem_fetcher):
        """ make a copy of the kem_fetcher and restrict scope of events to
        when data are available. """
        fet = kem_fetcher.copy()
        df = fet.event_df
        fet.event_df = df[(df.time >= self.t1) & (df.time <= self.t2)]
        return fet

    @pytest.fixture(scope="class")
    def download_data(self, temp_dir_path, fetcher: Fetcher):
        """ download data from the kem fetcher into the tempdir, return
        path to tempdir """
        path = os.path.join(temp_dir_path, self.path)
        params = dict(time_before_origin=0, time_after_origin=10, path=path)
        fetcher.download_event_waveforms(**params)
        return temp_dir_path

    @pytest.fixture(scope="class")
    def event_sbank(self, download_data):
        """ return an sbank pointed at the temp_dir_path """
        sb = WaveBank(download_data)
        sb.update_index()
        return sb

    @pytest.fixture(scope="class")
    def event_fetcher(self, event_sbank, fetcher):
        """ init a fetcher using the old fetcher """
        fet = fetcher.copy()
        fet._download_client = event_sbank
        return fet

    @pytest.fixture(scope="class")
    def mseeds(self, download_data):
        """ return a list of all the files with the ext mseed """
        return glob.glob(os.path.join(download_data, "**", "*mseed"), recursive=True)

    @pytest.fixture(scope="class")
    def stream_dict(self, event_fetcher):
        """ return a dict of the events contained in the waveforms """
        return dict(event_fetcher.yield_event_waveforms(0, 10))

    # tests
    def test_directory_exists(self, download_data):
        """ ensure the directory was created """
        assert os.path.exists(download_data)

    def test_mseeds(self, mseeds):
        """ ensure some files with mseed ext were created """
        assert len(mseeds)

    def test_data_were_downloaded(self, stream_dict):
        """ ensure data from the events exists """
        for eveid, stream in stream_dict.items():
            assert isinstance(stream, obspy.Stream)
            assert len(stream)

def test_series_to_datetimes(self):
    """ Series should be convertible to datetimes. """
    ser = pd.Series([10, "2010-01-01"])
    out = to_datetime64(ser)
    assert isinstance(out, pd.Series)

def test_utc_to_large(self):
    """ Times too large for datetime64[ns] should warn and be clipped. """
    too_big = obspy.UTCDateTime("2600-01-01")
    with pytest.warns(UserWarning):
        out = to_datetime64(too_big)
    assert pd.Timestamp(out).year == 2262

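# Background note: datetime64[ns] stores an int64 count of nanoseconds from
# the epoch, so the latest representable instant falls in the year 2262;
# later times must be clipped, which is what the test above checks.
import pandas as pd

print(pd.Timestamp.max)  # 2262-04-11 23:47:16.854775807
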
def test_creation_time(self, amplitude, amp_series):
    """ ensure creation info makes it into the amplitude series """
    assert amp_series["creation_time"] == to_datetime64(
        amplitude.creation_info.creation_time
    )
    assert amp_series["author"] == amplitude.creation_info.author
    assert amp_series["agency_id"] == amplitude.creation_info.agency_id

def yield_event_waveforms(
    self,
    time_before: Optional[float] = None,
    time_after: Optional[float] = None,
    reference: Union[str, Callable] = "origin",
    raise_on_fail: bool = True,
) -> Tuple[str, Stream]:
    """
    Yield event_id and streams for each event in the events.

    Parameters
    ----------
    time_before
        The time before (in seconds) the reference that will be included
        in the waveforms if possible.
    time_after
        The time after (in seconds) the reference that will be included
        in the waveforms if possible.
    reference
        A str that indicates how the starttime of the trace should be
        determined. The following are supported:
            origin - use the origin time of the event for each channel
            p - use the first p times as the start of the station traces
            s - use the first s times as the start of the station traces
        If a station doesn't have p or s picks and "p" or "s" is used,
        its streams will not be returned.
    raise_on_fail
        If True, re-raise an exception if one is caught during waveform
        fetching, else continue to the next event.

    Yields
    ------
    obspy.Stream
    """
    assert reference.lower() in self.reference_funcs
    tb = to_timedelta64(time_before, default=self.time_before)
    ta = to_timedelta64(time_after, default=self.time_after)
    assert (tb is not None) and (ta is not None)
    # get reference times
    event_ids = self.event_df.event_id.values
    reftimes = {x: self.reference_funcs[reference](self, x) for x in event_ids}
    # if using a wavebank, preload the index over the entire time-span for
    # a speedup
    if isinstance(self.waveform_client, WaveBank):
        mt = min([x.min() if hasattr(x, "min") else x for x in reftimes.values()])
        mx = max([x.max() if hasattr(x, "max") else x for x in reftimes.values()])
        index = self.waveform_client.read_index(starttime=mt, endtime=mx)
        get_bulk_wf = partial(self._get_bulk_wf, index=index)
    else:
        get_bulk_wf = self._get_bulk_wf
    # iterate each event in the events and yield the waveform
    for event_id in event_ids:
        # make sure ser is either a single datetime or a series of datetimes
        ti_ = to_datetime64(reftimes[event_id])
        bulk_args = self._get_bulk_arg(starttime=ti_ - tb, endtime=ti_ + ta)
        try:
            yield EventStream(event_id, get_bulk_wf(bulk_args))
        except Exception:
            if raise_on_fail:
                raise

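# Illustrative usage sketch (assumes `fetcher` is an obsplus Fetcher wired
# to event and waveform clients): stream 10 s of data after each origin.
for event_id, st in fetcher.yield_event_waveforms(time_before=0, time_after=10):
    print(event_id, len(st))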