def __call__(self, starttime, endtime, buffer, **kwargs):
    """Get start and end times and perform an in-kernel lookup."""
    # get defaults if starttime or endtime is None
    starttime = None if pd.isnull(starttime) else starttime
    endtime = None if pd.isnull(endtime) else endtime
    starttime = to_datetime64(starttime or SMALLDT64)
    endtime = to_datetime64(endtime or LARGEDT64)
    # find out if the query falls within one of the cached time ranges
    con1 = self.cache.t1 <= starttime
    con2 = self.cache.t2 >= endtime
    con3 = self.cache.kwargs == self._kwargs_to_str(kwargs)
    cached_index = self.cache[con1 & con2 & con3]
    if not len(cached_index):  # query is not cached; get it from the hdf5 file
        where = _get_kernel_query(
            starttime.astype(np.int64), endtime.astype(np.int64), int(buffer)
        )
        raw_index = self._get_index(where, **kwargs)
        # replace "None" with None
        ic = self.bank.index_str
        raw_index.loc[:, ic] = raw_index.loc[:, ic].replace(["None"], [None])
        # convert data types used by the bank back to those seen by the user
        index = raw_index.astype(dict(self.bank._dtypes_output))
        self._set_cache(index, starttime, endtime, kwargs)
    else:
        index = cached_index.iloc[0]["cindex"]
    # trim down index
    con1 = index["starttime"] >= (endtime + buffer)
    con2 = index["endtime"] <= (starttime - buffer)
    return index[~(con1 | con2)]

def waveform_df():
    """Create a dataframe with the basic required columns."""
    st = obspy.read()
    cols = list(NSLC) + ["starttime", "endtime"]
    df = pd.DataFrame([tr.stats for tr in st])[cols]
    df["starttime"] = to_datetime64(df["starttime"])
    df["endtime"] = to_datetime64(df["endtime"])
    return df

def _get_bulk_args(
    self, starttime=None, endtime=None, **kwargs
) -> bulk_waveform_arg_type:
    """
    Get the bulk waveform arguments based on given start/end times.

    This method also takes into account data availability as contained
    in the stations data.

    Parameters
    ----------
    starttime
        Start times for query.
    endtime
        End times for query.

    Returns
    -------
    List of tuples of the form:
        [(network, station, location, channel, starttime, endtime)]
    """
    station_df = self.station_df.copy()
    inv = station_df[filter_index(station_df, **kwargs)]
    # replace None/NaN end dates with a very large date
    inv.loc[inv["end_date"].isnull(), "end_date"] = LARGEDT64
    inv["end_date"] = inv["end_date"].astype("datetime64[ns]")
    # get start/end of the inventory
    inv_start = inv["start_date"].min()
    inv_end = inv["end_date"].max()
    # remove stations/channels that don't have data for the requested time
    min_time = to_datetime64(starttime, default=inv_start).min()
    max_time = to_datetime64(endtime, default=inv_end).max()
    con1, con2 = (inv["start_date"] > max_time), (inv["end_date"] < min_time)
    df = inv[~(con1 | con2)].set_index("seed_id")[list(NSLC)]
    if df.empty:  # return an empty list if no data were found
        return []
    if isinstance(starttime, pd.Series):
        # Have to get clever here to make sure only active stations get used
        # and indices are not duplicated.
        new_start = starttime.loc[set(starttime.index).intersection(df.index)]
        new_end = endtime.loc[set(endtime.index).intersection(df.index)]
        df["starttime"] = new_start.loc[~new_start.index.duplicated()]
        df["endtime"] = new_end.loc[~new_end.index.duplicated()]
    else:
        df["starttime"] = starttime
        df["endtime"] = endtime
    # remove any rows that don't have defined start/end times
    out = df[~(df["starttime"].isnull() | df["endtime"].isnull())]
    # ensure we have UTCDateTime objects
    out["starttime"] = [to_utc(x) for x in out["starttime"]]
    out["endtime"] = [to_utc(x) for x in out["endtime"]]
    # convert to a list of tuples and return
    return [tuple(x) for x in out.to_records(index=False)]

def _filter_starttime_endtime(df, starttime=None, endtime=None):
    """Filter dataframe on starttime and endtime."""
    bool_index = np.ones(len(df), dtype=bool)
    t1 = to_datetime64(starttime) if starttime is not None else SMALLDT64
    t2 = to_datetime64(endtime) if endtime is not None else LARGEDT64
    # get time columns
    start_col = getattr(df, "starttime", getattr(df, "start_date", None))
    end_col = getattr(df, "endtime", getattr(df, "end_date", None))
    in_time = ~((end_col < t1) | (start_col > t2))
    return np.logical_and(bool_index, in_time.values)

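# Hypothetical usage sketch (not from the source): the same interval-overlap
# test used by _filter_starttime_endtime above, written with plain pandas so
# the masking logic can be run standalone. Column names mirror the ones the
# helper looks for.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "starttime": pd.to_datetime(["2020-01-01 00:00", "2020-01-02 00:00"]),
        "endtime": pd.to_datetime(["2020-01-01 01:00", "2020-01-02 01:00"]),
    }
)
t1, t2 = pd.Timestamp("2020-01-01 00:30"), pd.Timestamp("2020-01-01 12:00")
in_time = ~((df["endtime"] < t1) | (df["starttime"] > t2))
print(df[in_time])  # only the first row overlaps the query window
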
def yield_waveforms(
    self,
    network: Optional[str] = None,
    station: Optional[str] = None,
    location: Optional[str] = None,
    channel: Optional[str] = None,
    starttime: Optional[utc_able_type] = None,
    endtime: Optional[utc_able_type] = None,
    duration: float = 3600.0,
    overlap: Optional[float] = None,
) -> Stream:
    """
    Yield time-series segments.

    Parameters
    ----------
    {get_waveforms_params}
    duration : float
        The duration of the streams to yield. All selected channels will be
        included in the waveforms.
    overlap : float
        If duration is used, the amount of overlap in yielded streams, added
        to the end of the waveforms.

    Notes
    -----
    All string parameters can use posix-style matching with * and ? chars.

    Total duration of yielded streams = duration + overlap.
    """
    # get times in datetime64 format
    starttime = to_datetime64(starttime, 0.0)
    endtime = to_datetime64(endtime, "2999-01-01")
    # read in the whole index df
    index = self.read_index(
        network=network,
        station=station,
        location=location,
        channel=channel,
        starttime=starttime,
        endtime=endtime,
    )
    # adjust start/end times
    starttime = max(starttime, index.starttime.min())
    endtime = min(endtime, index.endtime.max())
    # chunk the time range and iterate over the chunks
    time_chunks = make_time_chunks(starttime, endtime, duration, overlap)
    for t1, t2 in time_chunks:
        t1, t2 = to_datetime64(t1), to_datetime64(t2)
        con1 = (index.starttime - self.buffer) > t2
        con2 = (index.endtime + self.buffer) < t1
        ind = index[~(con1 | con2)]
        if not len(ind):
            continue
        yield self._index2stream(ind, t1, t2)

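# Hypothetical usage sketch, assuming a WaveBank indexed over a directory of
# waveform files at "waveforms/" (the path is an assumption, not from the
# source): iterate 30-minute windows with one minute of overlap.
import obsplus

bank = obsplus.WaveBank("waveforms")
for st in bank.yield_waveforms(duration=1800, overlap=60):
    print(st)  # each stream spans duration + overlap seconds
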
def _get_times(starttime, endtime):
    """Return starttimes and endtimes."""
    # get defaults if starttime or endtime is None
    starttime = None if pd.isnull(starttime) else starttime
    endtime = None if pd.isnull(endtime) else endtime
    starttime = to_datetime64(starttime or SMALLDT64)
    endtime = to_datetime64(endtime or LARGEDT64)
    if starttime is not None and endtime is not None:
        if starttime > endtime:
            msg = "starttime cannot be greater than endtime."
            raise ValueError(msg)
    return starttime, endtime

def __call__(
    self,
    time_arg: event_time_type,
    time_before: Optional[float] = None,
    time_after: Optional[float] = None,
    *args,
    **kwargs,
) -> obspy.Stream:
    """
    Using a reference time, return a stream that encompasses that time.

    Parameters
    ----------
    time_arg
        The argument that will indicate a start time. Can be a catalog with
        one event, an event, a float, or a UTCDateTime object.
    time_before
        The time before time_arg to include in the waveforms.
    time_after
        The time after time_arg to include in the waveforms.

    Returns
    -------
    obspy.Stream
    """
    tbefore = to_timedelta64(time_before, default=self.time_before)
    tafter = to_timedelta64(time_after, default=self.time_after)
    assert (tbefore is not None) and (tafter is not None)
    # get the reference time from the object
    time = to_datetime64(get_reference_time(time_arg))
    t1 = time - tbefore
    t2 = time + tafter
    return self.get_waveforms(starttime=to_utc(t1), endtime=to_utc(t2), **kwargs)

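# Hypothetical usage sketch, assuming `fetcher` is an already-constructed
# obsplus Fetcher with waveform data available (both the object and the time
# are assumptions, not from the source): calling it returns a stream around
# the reference time.
import obspy

t0 = obspy.UTCDateTime("2020-01-01T00:00:00")
st = fetcher(t0, time_before=10, time_after=60)
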
def _index_from_iterable(self, iterable, update_time):
    """Iterate over an event iterable and dump to the database."""
    events, update_times, paths = [], [], []
    max_mem = self._max_events_in_memory  # avoids attribute (MRO) lookup each loop
    events_remain = False
    for cat, mtime, path in iterable:
        if cat is None:
            continue
        for event in cat:
            events.append(event)
            update_times.append(mtime)
            paths.append(path)
        if len(events) >= max_mem:  # max limit exceeded, dump to db
            events_remain = True
            break
    # add new events to the database
    df = obsplus.events.pd._default_cat_to_df(events)
    df["updated"] = to_datetime64(update_times)
    df["path"] = _remove_base_path(pd.Series(paths, dtype=object))
    if len(df):
        df = _time_cols_to_ints(df)
        df_to_write = self._prepare_dataframe(df, EVENT_TYPES_INPUT)
        self._write_update(df_to_write, update_time)
    return events_remain

def test_with_nulls(self):
    """Test for handling nulls."""
    test_input = (np.NaN, None, "", 15)
    out = np.array(to_datetime64(test_input))
    # first make sure the empty values worked
    assert pd.isnull(out[:3]).all()
    assert out[-1].astype(np.int64) == obspy.UTCDateTime(15)._ns

def _get_absolute_time(
    time: Union[time_types, np.ndarray], ref_time: np.ndarray
) -> np.ndarray:
    """
    Get an absolute time from a possible reference time.

    Parameters
    ----------
    time
        Can either be an absolute time, or a timedelta with respect to
        ref_time.
    ref_time
        The object time is referenced to.
    """

    def _is_time_delta(obj):
        """Return True if an object is a timedelta-like thing."""
        if isinstance(obj, (int, float)):
            return True
        dtype = getattr(obj, "dtype", None)
        if np.issubdtype(dtype, np.timedelta64):
            return True
        is_int = np.issubdtype(dtype, np.integer)
        is_float = np.issubdtype(dtype, np.floating)
        if is_int or is_float:
            return True
        return False

    # If the input looks like a timedelta, treat it as relative to ref_time;
    # otherwise convert it to an absolute datetime64.
    if _is_time_delta(time):
        dt = ref_time + to_timedelta64(time)
    else:
        dt = to_datetime64(time)
    return np.broadcast_to(dt, np.shape(ref_time))

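# Hypothetical illustration (not from the source) of the two branches above,
# using numpy directly: timedelta-like inputs are added to ref_time, while
# datetime-like inputs are treated as absolute and broadcast to ref_time's
# shape.
import numpy as np

ref_time = np.array(["2020-01-01", "2020-01-02"], dtype="datetime64[ns]")
# a timedelta-like input is treated as relative to ref_time
print(ref_time + np.timedelta64(10, "s"))
# a datetime-like input is treated as absolute and broadcast
print(np.broadcast_to(np.datetime64("2020-06-01", "ns"), ref_time.shape))
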
def test_npdatetime64_as_input(self):
    """This should also work on np.datetime64."""
    test_input = np.array((np.datetime64(1000, "s"), np.datetime64(100, "ns")))
    out = to_datetime64(test_input)
    assert isinstance(out, np.ndarray)
    assert (test_input == out).all()

def test_creation_time(self, amplitude, amp_series):
    """Ensure creation time was included."""
    assert amp_series["creation_time"] == to_datetime64(
        amplitude.creation_info.creation_time
    )
    assert amp_series["author"] == amplitude.creation_info.author
    assert amp_series["agency_id"] == amplitude.creation_info.agency_id

def test_trim_out_of_existence(self, stream_wf):
    """Tests for trimming out all data."""
    far_out = to_datetime64("2200-01-01")
    wf = stream_wf.trim(starttime=far_out)
    assert len(wf) == 0
    data, stats = wf.data, wf.stats
    assert len(data) == len(stats) == 0

def test_pandas_timestamp(self):
    """Timestamps should also work."""
    kwargs = dict(year=2019, month=10, day=11, hour=12)
    ts = pd.Timestamp(**kwargs)
    out = to_datetime64((ts,))
    expected_out = (ts.to_datetime64(),)
    assert out == expected_out

def test_simple(self):
    """Test converting simple UTCDateTimable things."""
    test_input = ("2019-01-10 11-12", obspy.UTCDateTime("2019-01-10T12-12"), 100)
    expected_ns = np.array([obspy.UTCDateTime(x)._ns for x in test_input])
    dt64s = to_datetime64(test_input)
    new_ns = np.array(dt64s).astype(np.int64)
    assert np.equal(expected_ns, new_ns).all()

def test_set_picks_dataframe(self, add_picks_from_df, pick_df):
    """Verify that it is possible to attach picks from a DataFrame/csv file."""
    assert len(add_picks_from_df) == len(pick_df)
    # make sure the resulting data is what you expect
    newest = add_picks_from_df.data.reset_index().set_index(PDF_IND).iloc[-1]
    assert isinstance(newest["time"], pd.Timestamp)
    pick_time = pick_df.set_index(PDF_IND).loc[newest.name]["time"]
    assert newest.time == to_datetime64(pick_time)
    assert_not_nan(newest.pick_id)

def _get_waveform_df(stream: wave_type) -> pd.DataFrame:
    """
    Convert a stream or sequence of traces into a dataframe.

    Parameters
    ----------
    stream
        The stream to index.

    Notes
    -----
    This is private because it is probably not quite polished enough to
    include in the public API. More thought is needed on how to do this
    properly.
    """
    stats_columns = list(NSLC) + ["starttime", "endtime", "sampling_rate"]
    trace_contents = [{i: tr.stats[i] for i in stats_columns} for tr in stream]
    df = pd.DataFrame(trace_contents, columns=stats_columns)
    # ensure time(y) columns have the proper dtype
    df["starttime"] = to_datetime64(df["starttime"])
    df["endtime"] = to_datetime64(df["endtime"])
    df["sampling_period"] = to_timedelta64(1 / df["sampling_rate"])
    df["seed_id"] = get_seed_id_series(df)
    df["trace"] = [ObjectWrapper(tr) for tr in stream]
    return df

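# Hypothetical sketch (not from the source) of the kind of dataframe this
# helper builds, using only obspy and pandas; the real helper also adds
# sampling_period, seed_id, and a wrapped trace column.
import obspy
import pandas as pd

st = obspy.read()  # obspy's three-trace example stream
cols = ["network", "station", "location", "channel", "starttime", "endtime"]
df = pd.DataFrame([{c: tr.stats[c] for c in cols} for tr in st])
df["starttime"] = pd.to_datetime([t.datetime for t in df["starttime"]])
df["endtime"] = pd.to_datetime([t.datetime for t in df["endtime"]])
print(df)
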
def test_multi_indexed_dataframe(self, add_picks_from_multi_df, pick_df_multi):
    """
    Verify that it is possible to use a multi-indexed dataframe with the
    pick information.
    """
    # Things may be starting to get more complicated than I
    # actually want to deal with here
    assert len(add_picks_from_multi_df) == len(pick_df_multi)
    # make sure the input data is what you expect
    newest = add_picks_from_multi_df.data.reset_index().set_index(PDF_IND).iloc[-1]
    pick_time = (
        pick_df_multi.reset_index().set_index(PDF_IND).loc[newest.name]["time"]
    )
    assert newest.time == to_datetime64(pick_time)
    assert_not_nan(newest.pick_id)

def test_npdatetime64_too_large(self):
    """Test np.datetime64s larger than can fit into int64."""
    too_big = np.array(
        [
            np.datetime64("2300-01-01"),
            np.datetime64("2020-01-01"),
            np.datetime64("2500-01-01"),
        ]
    )
    with pytest.warns(UserWarning):
        out = to_datetime64(too_big)
    years = out.astype("M8[Y]")
    assert np.array_equal(
        years,
        [
            np.datetime64("2262", "Y"),
            np.datetime64("2020", "Y"),
            np.datetime64("2262", "Y"),
        ],
    )

def test_df_overwrite(self, add_picks_from_df, pick_df):
    """
    Verify that my hack for preserving pick_ids when overwriting picks
    with a df works as expected.
    """
    pick_df = deepcopy(pick_df)
    resource_ids = add_picks_from_df.data.pick_id
    pick_df["time"] = to_datetime64(pick_df["time"]) + pd.to_timedelta(1, unit="s")
    with pytest.warns(UserWarning, match="existing"):
        out = add_picks_from_df.set_picks(pick_df)
    # make sure the existing picks were overwritten, not appended
    assert len(out) == len(pick_df)
    # make sure the times got updated
    pick_df["seed_id_less"] = pick_df["seed_id"].str[:-1]
    pick_df.set_index(list(_INDEX_NAMES), inplace=True)
    assert out.data["time"].sort_values().equals(pick_df["time"].sort_values())
    # make sure the resource_ids haven't changed
    assert (add_picks_from_df.data.pick_id == resource_ids).all()

def _get_phase_reference_time(fetcher: Fetcher, phase):
    """
    Get reference times for the specified phase, applied over all channels
    in a station.
    """
    pha = phase.upper()
    # ensure the pick_df and inventory df exist
    pick_df = fetcher.picks_df
    inv_df = fetcher.station_df
    assert pick_df is not None and inv_df is not None
    # filter dataframes for the phase of interest
    assert (
        pick_df["phase_hint"].str.upper() == pha
    ).any(), f"no {phase} picks found"
    pick_df = pick_df[pick_df["phase_hint"] == pha]
    # merge inventory and pick df together, ensure time is datetime64
    columns = ["time", "station", "event_id"]
    merge = pd.merge(inv_df, pick_df[columns], on="station", how="left")
    merge["time"] = to_datetime64(merge["time"])
    assert merge["seed_id"].astype(bool).all()
    return merge.set_index("seed_id")[["time", "event_id"]]

def test_time_dtype(self, time_df):
    """Test time dtype."""
    out1 = upd.cast_dtypes(time_df, {"time": "ops_datetime"})["time"]
    out2 = to_datetime64(time_df["time"])
    assert (out1 == out2).all()

def test_utc_too_large(self):
    """Test a time larger than can fit into int64."""
    too_big = obspy.UTCDateTime("2600-01-01")
    with pytest.warns(UserWarning):
        out = to_datetime64(too_big)
    assert pd.Timestamp(out).year == 2262

def last_updated(self) -> Optional[np.datetime64]:
    """Get the last time (UTC) that the bank was updated."""
    return to_datetime64(self.last_updated_timestamp)

def test_zero(self):
    """Tests for input values as 0 or 0.0."""
    dt1 = to_datetime64(0)
    dt2 = to_datetime64(0.0)
    assert dt1.astype(np.int64) == dt2.astype(np.int64) == 0

def yield_event_waveforms(
    self,
    time_before: Optional[float] = None,
    time_after: Optional[float] = None,
    reference: Union[str, Callable] = "origin",
    raise_on_fail: bool = True,
) -> Tuple[str, Stream]:
    """
    Yield event_id and streams for each event.

    Parameters
    ----------
    time_before
        The time before (in seconds) the reference that will be included
        in the waveforms if possible.
    time_after
        The time after (in seconds) the reference that will be included
        in the waveforms if possible.
    reference
        A str that indicates how the starttime of the trace should be
        determined. The following are supported:
            origin - use the origin time of the event
            p - use the first p time as the start for each station
            s - use the first s times as the start for each station
        If "p" or "s" is used, only streams corresponding to stations with
        the appropriate phase pick will be returned.
    raise_on_fail
        If True, re-raise an exception if one is caught during waveform
        fetching, else continue to the next event.

    Notes
    -----
    Streams will not be yielded for any event for which a reference time
    cannot be obtained. For example, if reference='S' only events with
    some S picks will be yielded.
    """

    def _check_yield_event_waveform_(reference, ta, tb):
        if reference.lower() not in self.reference_funcs:
            msg = (
                f"reference of {reference} is not supported. Supported "
                f"reference arguments are {list(self.reference_funcs)}"
            )
            raise ValueError(msg)
        if not (np.abs(tb) + np.abs(ta)) > np.timedelta64(0, "s"):
            msg = (
                "time_before and/or time_after must be specified in either "
                "Fetcher's init or the yield_event_waveforms call"
            )
            raise ValueError(msg)

    tb = to_timedelta64(time_before, default=self.time_before)
    ta = to_timedelta64(time_after, default=self.time_after)
    _check_yield_event_waveform_(reference, ta, tb)
    # get reference times
    ref_func = self.reference_funcs[reference.lower()]
    reftime_df = ref_func(self)
    # if using a wavebank, preload the index over the entire time-span for speedup
    if isinstance(self.waveform_client, WaveBank) and len(reftime_df):
        mt = reftime_df["time"].min() - tb
        mx = reftime_df["time"].max() + ta
        index = self.waveform_client.read_index(starttime=mt, endtime=mx)
        get_bulk_wf = partial(self._get_bulk_wf, index=index)
    else:
        get_bulk_wf = self._get_bulk_wf
    # iterate over each event and yield the waveforms
    for event_id, df in reftime_df.groupby("event_id"):
        # make sure the time is either a single datetime or a series of datetimes
        time = to_datetime64(df["time"])
        t1, t2 = time - tb, time + ta
        bulk_args = self._get_bulk_args(starttime=t1, endtime=t2)
        try:
            yield EventStream(event_id, get_bulk_wf(bulk_args))
        except Exception:
            if raise_on_fail:
                raise
            else:
                msg = f"Fetcher failed to get waveforms for {event_id}."
                warnings.warn(msg)

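# Hypothetical usage sketch, assuming `fetcher` is an obsplus Fetcher wired to
# an event catalog, station inventory, and waveform client (all assumptions,
# not from the source): iterate event ids and their streams.
for event_id, stream in fetcher.yield_event_waveforms(time_before=10, time_after=60):
    print(event_id, len(stream))
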
def test_nullish_values_returns_default(self):
    """Nullish values should return default values."""
    out1 = to_datetime64(None)
    assert pd.isnull(out1)
    assert out1 is not None

def test_tuple_and_list(self):
    """Tests for tuples and lists."""
    input1 = ["2020-01-03", obspy.UTCDateTime("2020-01-01").timestamp]
    out1 = to_datetime64(input1)
    out2 = to_datetime64(tuple(input1))
    assert np.all(out1 == out2)

def test_series_to_datetimes(self):
    """Series should be convertible to datetimes, return series."""
    ser = pd.Series([10, "2010-01-01"])
    out = to_datetime64(ser)
    assert isinstance(out, pd.Series)

def merge_traces(st: trace_sequence, inplace=False) -> obspy.Stream:
    """
    An efficient function to merge overlapping data for a stream.

    This function is equivalent to calling merge(1) and split() then
    returning the resulting traces. This means only traces that have
    overlaps or adjacent times will be merged, otherwise they will remain
    separate traces.

    Parameters
    ----------
    st
        The input stream to merge.
    inplace
        If True, st is modified in place.

    Returns
    -------
    A stream with merged traces.
    """

    def _make_trace_df(traces: trace_sequence) -> pd.DataFrame:
        """Create a dataframe from a sequence of traces."""
        # create a dataframe of traces and stats (use ns for all time values)
        sortby = ["seed_id", "sampling_rate", "starttime", "endtime"]
        index = _get_waveform_df(traces).sort_values(sortby).reset_index(drop=True)
        # create a column marking if a trace should be merged into the previous trace
        shifted_index = index.shift(1)  # shift all values forward one row
        seed_ids_match = index["seed_id"] == shifted_index["seed_id"]
        index_samp = index["sampling_period"]
        samps_match = index_samp == shifted_index["sampling_period"]
        # not sure why ~ is used rather than flipping the comparison; maybe
        # something to do with NaNs?
        overlap = ~(shifted_index["endtime"] < (index["starttime"] - index_samp))
        index["merge_group"] = (~(seed_ids_match & samps_match & overlap)).cumsum()
        return index

    # checks for early bail-out; no merging if one trace or all unique ids
    if len(st) < 2 or len({tr.id for tr in st}) == len(st):
        return st
    if not inplace:
        st = copy.deepcopy(st)
    df = _make_trace_df(st)
    # get a series of properties by group
    group = df.groupby("merge_group")
    t1, t2 = group["starttime"].min(), group["endtime"].max()
    sampling_periods = group["sampling_period"].max()
    gsize = group.size()  # number of traces in each group
    gnum_one = gsize[gsize == 1].index  # groups with 1 trace
    gnum_gt_one = gsize[gsize > 1].index  # group nums with > 1 trace
    # use this to avoid pandas groupbys
    merged_traces = [x.data for x in df[df["merge_group"].isin(gnum_one)]["trace"]]
    for gnum in gnum_gt_one:  # any groups w/ more than one trace
        ind = df.merge_group == gnum
        gtraces = [x.data for x in df.trace[ind]]  # unpack traces
        dtype = _get_dtype(gtraces)
        # create a list of times, y values, and a marker for when values are filled
        sampling_period = sampling_periods[gnum].to_timedelta64()
        start, stop = t1[gnum].to_datetime64(), t2[gnum].to_datetime64()
        t = np.arange(start=start, stop=stop + sampling_period, step=sampling_period)
        y = np.empty(np.shape(t), dtype=dtype)
        has_filled = np.zeros(len(t), dtype=np.int32)
        for tr in gtraces:
            start_ind = np.searchsorted(t, to_datetime64(tr.stats.starttime))
            y[start_ind : start_ind + len(tr.data)] = tr.data
            has_filled[start_ind : start_ind + len(tr.data)] = 1
        gtraces[0].data = y
        merged_traces.append(gtraces[0])
        assert np.all(has_filled), "some values not filled in!"
    return obspy.Stream(traces=merged_traces)

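# Hypothetical usage sketch: split obspy's example trace into two overlapping
# pieces and merge them back into one (assumes merge_traces above, and the
# helpers it relies on, are importable from the surrounding module).
import obspy

tr = obspy.read()[0]
t1, t2 = tr.stats.starttime, tr.stats.endtime
st = obspy.Stream([tr.copy().trim(endtime=t2 - 5), tr.copy().trim(starttime=t1 + 5)])
merged = merge_traces(st)
assert len(merged) == 1  # the overlapping pieces collapse to a single trace
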