Example #1
 def __call__(self, starttime, endtime, buffer, **kwargs):
     """ get start and end times, perform in kernel lookup """
     # get defaults if starttime or endtime is none
     starttime = to_datetime64(starttime or SMALLDT64)
     endtime = to_datetime64(endtime or LARGEDT64)
     # find out if the query falls within one of the cached time ranges
     con1 = self.cache.t1 <= starttime
     con2 = self.cache.t2 >= endtime
     con3 = self.cache.kwargs == self._kwargs_to_str(kwargs)
     cached_index = self.cache[con1 & con2 & con3]
     if not len(cached_index):  # query is not cached; get it from the HDF5 file
         where = get_kernel_query(int(starttime), int(endtime), int(buffer))
         raw_index = self._get_index(where, **kwargs)
         # replace "None" with None
         ic = self.bank.index_str
         raw_index.loc[:, ic] = raw_index.loc[:, ic].replace(["None"],
                                                             [None])
         # convert data types used by bank back to those seen by user
         index = raw_index.astype(dict(self.bank._dtypes_output))
         self._set_cache(index, starttime, endtime, kwargs)
     else:
         index = cached_index.iloc[0]["cindex"]
     # trim down index
     con1 = index["starttime"] >= (endtime + buffer)
     con2 = index["endtime"] <= (starttime - buffer)
     return index[~(con1 | con2)]
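
The final trim keeps only rows whose time span overlaps the buffered query window: a row is dropped when it starts after endtime + buffer or ends before starttime - buffer. A minimal sketch of that same overlap mask on a plain pandas DataFrame (the column names and sample values are illustrative, not taken from an actual bank index):

import numpy as np
import pandas as pd

# hypothetical three-row index with datetime64[ns] columns
index = pd.DataFrame({
    "starttime": pd.to_datetime(["2019-01-01", "2019-01-02", "2019-01-05"]),
    "endtime": pd.to_datetime(["2019-01-01 12:00", "2019-01-03", "2019-01-06"]),
})
starttime = np.datetime64("2019-01-02")
endtime = np.datetime64("2019-01-04")
buffer = np.timedelta64(0, "s")

con1 = index["starttime"] >= (endtime + buffer)   # starts after the window
con2 = index["endtime"] <= (starttime - buffer)   # ends before the window
trimmed = index[~(con1 | con2)]  # keeps only the middle row, which overlaps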
Example #2
 def test_correct_endtime_in_index(self, default_wbank):
     """ ensure the index has times consistent with traces in waveforms """
     index = default_wbank.read_index()
     st = obspy.read()
     starttimes = [to_datetime64(tr.stats.starttime) for tr in st]
     endtimes = [to_datetime64(tr.stats.endtime) for tr in st]
     assert min(starttimes) == index.starttime.min().to_datetime64()
     assert max(endtimes) == index.endtime.max().to_datetime64()
Example #3
    def yield_waveforms(
        self,
        network: Optional[str] = None,
        station: Optional[str] = None,
        location: Optional[str] = None,
        channel: Optional[str] = None,
        starttime: Optional[obspy.UTCDateTime] = None,
        endtime: Optional[obspy.UTCDateTime] = None,
        attach_response: bool = False,
        duration: float = 3600.0,
        overlap: Optional[float] = None,
    ) -> Stream:
        """
        Yield waveforms from the bank.

        Parameters
        ----------
        {get_waveforms_params}
        attach_response : bool
            If True, attach the instrument responses to the waveforms using
            the station data.
        duration : float
            The duration of the streams to yield. All selected channels will
            be included in the waveforms.
        overlap : float
            If duration is used, the amount of overlap between yielded
            streams, added to the end of the waveforms.


        Notes
        -----
        All string parameters can use posix style matching with * and ? chars.
        """
        # get start and end times as datetime64, applying defaults
        starttime = to_datetime64(starttime, 0.0)
        endtime = to_datetime64(endtime, "2999-01-01")
        # read in the whole index df
        index = self.read_index(
            network=network,
            station=station,
            location=location,
            channel=channel,
            starttime=starttime,
            endtime=endtime,
        )
        # adjust start/end times
        starttime = max(starttime, index.starttime.min())
        endtime = min(endtime, index.endtime.max())
        # chunk time and iterate over chunks
        time_chunks = make_time_chunks(starttime, endtime, duration, overlap)
        for t1, t2 in time_chunks:
            t1, t2 = to_datetime64(t1), to_datetime64(t2)
            con1 = (index.starttime - self.buffer) > t2
            con2 = (index.endtime + self.buffer) < t1
            ind = index[~(con1 | con2)]
            if not len(ind):
                continue
            yield self._index2stream(ind, t1, t2, attach_response)
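
A hedged usage sketch of yield_waveforms; the bank path and channel pattern below are made up for illustration, and each yielded item is an obspy Stream covering one duration-long window (plus any overlap):

import obsplus

bank = obsplus.WaveBank("path/to/waveform/files")  # hypothetical archive
for st in bank.yield_waveforms(network="UU", channel="HH?", duration=3600.0,
                               overlap=60.0):
    print(st)  # one obspy.Stream per hour-long chunk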
Example #4
 def test_with_nulls(self):
     """ Test for handling nulls. """
     test_input = (np.nan, None, "", 15)
     out = np.array(to_datetime64(test_input))
     # first make sure empty values worked
     assert pd.isnull(out[:3]).all()
     assert out[-1].astype(int) == obspy.UTCDateTime(15)._ns
Example #5
    def __call__(
        self,
        time_arg: event_time_type,
        time_before: Optional[float] = None,
        time_after: Optional[float] = None,
        *args,
        **kwargs,
    ) -> obspy.Stream:
        """
        Using a reference time, return waveforms that encompass that time.

        Parameters
        ----------
        time_arg
            The argument that will indicate a start time. Can be a catalog
            with a single event, an event, a float, or a UTCDateTime object.
        time_before
            The time before time_arg to include in waveforms
        time_after
            The time after time_arg to include in waveforms

        Returns
        -------
        obspy.Stream
        """
        tbefore = to_timedelta64(time_before, default=self.time_before)
        tafter = to_timedelta64(time_after, default=self.time_after)
        assert (tbefore is not None) and (tafter is not None)
        # get the reference time from the object
        time = to_datetime64(get_reference_time(time_arg))
        t1 = time - tbefore
        t2 = time + tafter
        return self.get_waveforms(starttime=t1, endtime=t2, **kwargs)
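
A short usage sketch, assuming the object exposing this __call__ is something like an obsplus Fetcher already wired to a waveform client; the catalog and padding values are illustrative:

import obspy

cat = obspy.read_events()  # example catalog bundled with obspy
# fetcher is assumed to be an object implementing the __call__ shown above
st = fetcher(cat[0], time_before=10, time_after=60)
print(type(st))  # an obspy.Stream spanning the event's reference time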
Example #6
 def test_npdatetime64_as_input(self):
     """ This should also work on np.datetime64. """
     test_input = np.array(
         (np.datetime64(1000, "s"), np.datetime64(100, "ns")))
     out = to_datetime64(test_input)
     assert isinstance(out, np.ndarray)
     assert (test_input == out).all()
Example #7
 def test_pandas_timestamp(self):
     """ Timestamps should also work. """
     kwargs = dict(year=2019, month=10, day=11, hour=12)
     ts = pd.Timestamp(**kwargs)
     out = to_datetime64((ts, ))
     expected_out = (ts.to_datetime64(), )
     assert out == expected_out
Example #8
 def test_simple(self):
     """ Test converting simple UTCDateTimable things """
     test_input = ("2019-01-10 11-12",
                   obspy.UTCDateTime("2019-01-10T12-12"), 100)
     expected = np.array([obspy.UTCDateTime(x)._ns for x in test_input])
     out = np.array(to_datetime64(test_input)).astype(int)
     assert np.equal(expected, out).all()
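
Taken together, these tests suggest the behavior of to_datetime64 that the sketch below assumes (the import path and the exact handling of defaults may differ between obsplus versions):

import numpy as np
import obspy
from obsplus.utils import to_datetime64  # import path may vary by version

single = to_datetime64(obspy.UTCDateTime("2019-01-10"))       # np.datetime64
several = to_datetime64(("2019-01-10", 100, np.datetime64(100, "ns")))
filled = to_datetime64(None, default=np.datetime64("1970-01-01", "ns"))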
Example #9
 def test_starttime_origin_time_seperation(self, dar_attached_picks):
     """ ensure the start of the trace and start of the events arent too
     far off """
     dar = dar_attached_picks
     cat = dar.attrs["events"]
     for ev in cat:
         rid = ev.resource_id
         time = to_datetime64(ev.origins[-1].time)
         dd = dar[dar.stream_id == rid]
         assert dd.origin_time.values - time == EMPTYTD64
         assert ((dd.starttime.values - to_utc(time).timestamp) < 100).all()
Example #10
 def test_filter_index(self, crandall_dataset):
     """ Tests for filtering index with filter index function. """
     # this is mainly here to test the time filtering, because the bank
     # operations pass this off to the HDF5 kernel.
     index = crandall_dataset.waveform_client.read_index(network="UU")
     mean_ns = index.starttime.astype(int).mean()
     t1 = to_datetime64(obspy.UTCDateTime(ns=int(mean_ns)))
     t2 = index.endtime.max()
     kwargs = dict(network="UU", station="*", location="*", channel="*")
     bool_ind = filter_index(index, starttime=t1, endtime=t2, **kwargs)
     assert bool_ind.any()  # at least some rows should pass the filter
Example #11
    def _get_bulk_arg(self, starttime=None, endtime=None, **kwargs) -> list:
        """ get the argument passed to get_waveforms_bulk, see
        obspy.fdsn.client for more info """
        station_df = self.station_df.copy()
        inv = station_df[filter_index(station_df, **kwargs)]
        # replace None/NaN end dates with a very large date
        inv.loc[inv["end_date"].isnull(), "end_date"] = LARGEDT64
        inv["end_date"] = inv["end_date"].astype("datetime64[ns]")
        # remove stations/channels that don't have data for the requested time
        starttime = to_datetime64(starttime, default=inv["start_date"].min())
        endtime = to_datetime64(endtime, default=inv["end_date"].max())
        con1, con2 = (inv["start_date"] > endtime), (inv["end_date"] <
                                                     starttime)

        inv = inv[~(con1 | con2)]
        df = inv[list(NSLC)]
        if df.empty:  # return empty list if no data found
            return []
        df.loc[:, "starttime"] = starttime
        df.loc[:, "endtime"] = endtime
        # remove any rows that don't have defined start/end times
        out = df[(~df["starttime"].isnull()) & (~df["endtime"].isnull())]
        # convert to list of tuples and return
        return [tuple(x) for x in out.to_records(index=False)]
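
The list returned above follows the bulk-request convention used by obspy clients and by WaveBank: one (network, station, location, channel, starttime, endtime) tuple per row. A minimal sketch of such a list and how a bulk-capable client might consume it (the NSLC values and the waveform_client object are illustrative):

import obspy

t1 = obspy.UTCDateTime("2019-01-01")
t2 = obspy.UTCDateTime("2019-01-02")
bulk = [
    ("UU", "SRU", "", "HHZ", t1, t2),
    ("UU", "NOQ", "01", "HHN", t1, t2),
]
st = waveform_client.get_waveforms_bulk(bulk)  # waveform_client is assumed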
Example #12
def get_phase_window_df(  # noqa: C901
    event: ev.Event,
    max_duration: Optional[Union[float, int, Mapping]] = None,
    min_duration: Optional[Union[float, int, Mapping]] = None,
    channel_codes: Optional[Union[Collection, pd.Series]] = None,
    buffer_ratio: Optional[float] = None,
    restrict_to_arrivals: bool = True,
) -> pd.DataFrame:
    """
    Return a dataframe of phase picks for the event. Does the following:

    1) Removes the rejected picks.
    2) Defines pick time windows using either
        a) a corresponding amplitude whose type matches the phase hint of the pick
           and has a time window
        b) the start of the phase to the arrival time plus min_duration.

    Parameters
    ----------
    event
        The seismic event
    max_duration
        The maximum duration (in seconds) of a phase. Can either be a scalar,
        or a mapping whose keys are seed_ids and values are applied to that
        specific channel.
    min_duration
        The minimum duration (in seconds) of a phase. Can either be a scalar,
        or a mapping whose keys are seed_ids and values are applied to that
        specific channel.
    channel_codes
        If provided, supplies the needed information to expand the dataframe
        to include an entry for each channel on a station for a given pick.
        For example, if this is used, a P pick that occurs on an HHZ channel
        will also have an entry for HHE and HHN (assuming they are in the
        list).
    buffer_ratio
        If not None, the ratio of the total duration of the phase windows
        which should be added to BOTH the start and end of the phase window.
    restrict_to_arrivals
        If True, only use picks for which there is an arrival on the preferred
        origin.
    """
    reftime = to_datetime64(get_reference_time(event))

    def _get_earliest_s_time(df):
        return df[df.phase_hint == "S"].time.min()

    def _get_extrema_like_df(df, extrema_arg):
        """
        get a min or max argument with the same length as df.
        This is done so each row's duration can be compared to some
        row-specific value.
        """

        if isinstance(extrema_arg, pd.Series):
            return df["seed_id"].map(extrema_arg.droplevel("seed_id_less"))
        elif isinstance(extrema_arg, Mapping):
            return df["seed_id"].map(extrema_arg)
        else:
            return np.ones(len(df)) * extrema_arg

    def _get_picks_df(restrict_to_arrivals):
        """Get the picks dataframe, remove picks flagged as rejected."""
        pdf = obsplus.picks_to_df(event)
        pdf["seed_id_less"] = pdf["seed_id"].str[:-1]
        if restrict_to_arrivals:
            adf = obsplus.arrivals_to_df(event)
            pdf = pdf.loc[pdf["resource_id"].isin(adf["pick_id"])]
        # remove rejected picks
        pdf = pdf[pdf.evaluation_status != "rejected"]
        # Toss any picks from stations that have S-picks that are earlier than P-picks
        if {"P", "S"}.issubset(pdf["phase_hint"]):
            phs = pdf.groupby("phase_hint")
            p_picks = phs.get_group("P")
            s_picks = phs.get_group("S")
            both = set(p_picks["seed_id_less"]).intersection(
                s_picks["seed_id_less"])
            p_picks = (p_picks.loc[p_picks["seed_id_less"].isin(
                both)].set_index("seed_id_less").sort_index())
            s_picks = (s_picks.loc[s_picks["seed_id_less"].isin(
                both)].set_index("seed_id_less").sort_index())
            mask = p_picks["time"] > s_picks["time"]
            bad_p = p_picks.loc[mask]
            bad_s = s_picks.loc[mask]
            if mask.any():
                warnings.warn(
                    "S-pick is earlier than P-pick for one or more picks."
                    "Skipping phases.")
            pdf = pdf.loc[~pdf["resource_id"].isin(bad_s["resource_id"])
                          & ~pdf["resource_id"].isin(bad_p["resource_id"])]
        if not len(pdf):
            raise NoPhaseInformationError(
                f"No valid phases for event:\n{event}")
        # rename the resource_id column for later merging
        pdf.rename(columns={"resource_id": "pick_id"}, inplace=True)
        return pdf

    def _add_amplitudes(df):
        """Add amplitudes to picks"""
        # expected_cols = ["pick_id", "twindow_start", "twindow_end", "twindow_ref"]
        dtypes = {
            "pick_id": str,
            "twindow_start": "timedelta64[ns]",
            "twindow_end": "timedelta64[ns]",
            "twindow_ref": "datetime64[ns]",
        }
        amp_df = event.amplitudes_to_df()
        # Drop rejected amplitudes
        amp_df = amp_df.loc[amp_df["evaluation_status"] != "rejected"]
        if amp_df.empty:  # no data, init empty df with expected cols
            amp_df = pd.DataFrame(columns=list(dtypes)).astype(dtypes)
        else:
            # merge picks/amps together and calculate windows
            amp_df.rename(
                columns={
                    "time_begin": "twindow_start",
                    "time_end": "twindow_end",
                    "reference": "twindow_ref",
                },
                inplace=True,
            )
        # merge and return
        # make sure time-related columns have the expected dtypes
        amp_df = amp_df[list(dtypes)].astype(dtypes)
        # Note: the amplitude list can be longer than the pick list because of
        # the logic for dropping picks earlier
        merged = df.merge(amp_df,
                          left_on="pick_id",
                          right_on="pick_id",
                          how="outer").dropna(subset=["time"])
        assert len(merged) == len(df)
        return _add_starttime_end(merged)

    def _add_starttime_end(df):
        """Add the time window start and end"""
        # fill references with start times of phases if empty
        df.loc[df["twindow_ref"].isnull(), "twindow_ref"] = df["time"]
        # Fill NaTs w/ 0 second timedelta
        twindow_start = df["twindow_start"].fillna(np.timedelta64(0, "ns"))
        twindow_end = df["twindow_end"].fillna(np.timedelta64(0, "ns"))
        # Determine start/end times of phase windows
        df["starttime"] = df["twindow_ref"] - twindow_start
        df["endtime"] = df["twindow_ref"] + twindow_end
        # add travel time
        df["travel_time"] = df["time"] - reftime
        # get earliest s phase by station
        _s_start = df.groupby(list(NSLC[:2])).apply(_get_earliest_s_time)
        s_start = _s_start.rename("s_start").to_frame().reset_index()
        # merge back into pick_df, use either defined window or S phase, whichever
        # is smaller.
        dd2 = df.merge(s_start, on=["network", "station"], how="left")
        # get dataframe indices for P
        p_inds = df[df.phase_hint == "P"].index
        # make sure P end times don't exceed s start times
        endtime_or_s_start = dd2[["s_start", "endtime"]].min(axis=1,
                                                             skipna=True)
        df.loc[p_inds, "endtime"] = endtime_or_s_start[p_inds]
        duration = abs(df["endtime"] - df["starttime"])
        # Make sure all durations are under the max duration, else truncate them
        if max_duration is not None:
            max_dur = to_timedelta64(_get_extrema_like_df(df, max_duration))
            larger_than_max = duration > max_dur
            df.loc[larger_than_max, "endtime"] = df["starttime"] + max_dur
        # Make sure all values are at least min_phase_duration, else expand them
        if min_duration is not None:
            min_dur = to_timedelta64(_get_extrema_like_df(df, min_duration))
            small_than_min = duration < min_dur
            df.loc[small_than_min, "endtime"] = df["starttime"] + min_dur
        # sanity checks
        assert (df["endtime"] >= df["starttime"]).all()
        assert not (df["starttime"].isnull()).any()
        return df

    def _duplicate_on_same_stations(df):
        """
        Duplicate all the entries to get the 3 components for each station
        """
        # make a dict of channel[:-1] and matching channels
        assert channel_codes is not None
        code_less_1 = defaultdict(list)
        for code in channel_codes:
            code_less_1[code[0]].append(code[1])
        # create expanded df
        new_inds = [
            x for y in df["seed_id"].unique() for x in code_less_1[y[:-1]]
        ]
        # get seed_id columns and merge back together
        df_new = pd.DataFrame(new_inds, columns=["seed_id"])
        df_new["seed_id_less"] = df_new["seed_id"].str[:-1]
        seed_id = expand_seed_id(df_new["seed_id"])
        df_new = df_new.join(seed_id)
        # now merge in old dataframe for full expand
        # df_new["temp"] = df_new["seed_id"].str[:-1]
        right_cols = list(PHASE_WINDOW_INTERMEDIATE_DTYPES)
        out = pd.merge(df_new, df[right_cols], on="seed_id_less", how="left")
        return out.drop_duplicates()

    def _apply_buffer(df):
        # add buffers on either end of waveform for tapering
        buff = (df["endtime"] - df["starttime"]) * buffer_ratio
        df["starttime"] = df["starttime"] - buff
        df["endtime"] = df["endtime"] + buff
        return df

    # read picks in and filter out rejected picks
    dd = _add_amplitudes(_get_picks_df(restrict_to_arrivals))
    # return columns
    out = dd[list(PHASE_WINDOW_DF_DTYPES)]
    # add buffer to window start/end
    if buffer_ratio is not None:
        out = _apply_buffer(out)
    # if channel codes are provided, make a duplicate of each phase window row
    # for each channel on the same station
    if channel_codes:
        out = _duplicate_on_same_stations(out)[list(PHASE_WINDOW_DF_DTYPES)]
    return out
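
A hedged usage sketch of get_phase_window_df as defined above; the event comes from obspy's bundled example catalog and the duration limits are arbitrary, so whether it yields windows depends on that event actually carrying non-rejected picks:

import obspy

event = obspy.read_events()[0]  # example event shipped with obspy
df = get_phase_window_df(event, max_duration=30.0, min_duration=1.0)
# the output is expected to contain one row per phase window, including
# seed_id, starttime, and endtime columns
print(df[["seed_id", "starttime", "endtime"]].head())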
Example #13
class TestGetEventData:
    t1 = to_datetime64("2009-04-01")
    t2 = to_datetime64("2009-04-04")

    path = "eventwaveforms/{year}/{julday}"

    # fixtures
    @pytest.fixture(scope="class")
    def temp_dir_path(self):
        """ return a path to a temporary directory """
        with tempfile.TemporaryDirectory() as tempdir:
            out = os.path.join(tempdir, "temp")
            yield out

    @pytest.fixture(scope="class")
    def fetcher(self, kem_fetcher):
        """ make a copy of the kem_fetcher and restrict scope of events to
        when data are available."""
        fet = kem_fetcher.copy()
        df = fet.event_df
        fet.event_df = df[(df.time >= self.t1) & (df.time <= self.t2)]
        return fet

    @pytest.fixture(scope="class")
    def download_data(self, temp_dir_path, fetcher: Fetcher):
        """ download data from the kem fetcher into the tempdir, return
        path to tempdir """
        path = os.path.join(temp_dir_path, self.path)
        params = dict(time_before_origin=0, time_after_origin=10, path=path)
        fetcher.download_event_waveforms(**params)
        return temp_dir_path

    @pytest.fixture(scope="class")
    def event_sbank(self, download_data):
        """ return an sbank pointed at the temp_dir_path """
        sb = WaveBank(download_data)
        sb.update_index()
        return sb

    @pytest.fixture(scope="class")
    def event_fetcher(self, event_sbank, fetcher):
        """ init a fetcher using the old fetcher """
        fet = fetcher.copy()
        fet._download_client = event_sbank
        return fet

    @pytest.fixture(scope="class")
    def mseeds(self, download_data):
        """ return a list of all the files with the ext mseed """
        return glob.glob(os.path.join(download_data, "**", "*mseed"), recursive=True)

    @pytest.fixture(scope="class")
    def stream_dict(self, event_fetcher):
        """ return a dict of the events contained in the waveforms """
        return dict(event_fetcher.yield_event_waveforms(0, 10))

    # tests
    def test_directory_exists(self, download_data):
        """ ensure the directory was created """
        assert os.path.exists(download_data)

    def test_mseeds(self, mseeds):
        """ ensure some files with mseed ext were created """
        assert len(mseeds)

    def test_data_were_downloaded(self, stream_dict):
        """ ensure data from the events exists """
        for eveid, stream in stream_dict.items():
            assert isinstance(stream, obspy.Stream)
            assert len(stream)
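
A hedged sketch of the download step these fixtures exercise; the fetcher object and destination directory are assumed, and the path template uses the same {year}/{julday} placeholders as the class attribute above:

# fetcher is assumed to be a configured obsplus Fetcher
path = "eventwaveforms/{year}/{julday}"
fetcher.download_event_waveforms(time_before_origin=0, time_after_origin=10,
                                 path=path)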
Example #14
 def test_series_to_datetimes(self):
     """ Series should be convertible to datetimes. """
     ser = pd.Series([10, "2010-01-01"])
     out = to_datetime64(ser)
     assert isinstance(out, pd.Series)
Example #15
 def test_utc_to_large(self):
     """ a UTCDateTime beyond the datetime64[ns] range should warn and clamp """
     too_big = obspy.UTCDateTime("2600-01-01")
     with pytest.warns(UserWarning):
         out = to_datetime64(too_big)
     assert pd.Timestamp(out).year == 2262
Example #16
 def test_creation_time(self, amplitude, amp_series):
     """ ensure creation_info values are carried into the amplitude series """
     assert amp_series["creation_time"] == to_datetime64(
         amplitude.creation_info.creation_time)
     assert amp_series["author"] == amplitude.creation_info.author
     assert amp_series["agency_id"] == amplitude.creation_info.agency_id
Example #17
    def yield_event_waveforms(
        self,
        time_before: Optional[float] = None,
        time_after: Optional[float] = None,
        reference: Union[str, Callable] = "origin",
        raise_on_fail: bool = True,
    ) -> Tuple[str, Stream]:
        """
        Yield event_id and streams for each event in the events.

        Parameters
        ----------
        time_before
            The time before (in seconds) the reference that will be included
            in the waveforms if possible.
        time_after
            The time after (in seconds) the reference that will be included
            in the waveforms if possible.
        reference
            A str that indicates how the starttime of the trace should be
            determined. The following are supported:
                origin - use the origin time of the event for each channel
                p - use the first p times as the start of the station traces
                s - use the first s times as the start of the station traces
            If a station doesn't have p or s picks and "p" or "s" is used,
            its streams will not be returned.
        raise_on_fail
            If True, re-raise any exception caught during waveform fetching,
            else continue to the next event.

        Yields
        ------
        Tuple of (event_id, obspy.Stream).
        """
        assert reference.lower() in self.reference_funcs
        tb = to_timedelta64(time_before, default=self.time_before)
        ta = to_timedelta64(time_after, default=self.time_after)
        assert (tb is not None) and (ta is not None)
        # get reference times
        event_ids = self.event_df.event_id.values
        reftimes = {
            x: self.reference_funcs[reference](self, x)
            for x in event_ids
        }
        # if using a wavebank preload index over entire time-span for speedup
        if isinstance(self.waveform_client, WaveBank):
            mt = min([
                x.min() if hasattr(x, "min") else x for x in reftimes.values()
            ])
            mx = max([
                x.max() if hasattr(x, "max") else x for x in reftimes.values()
            ])
            index = self.waveform_client.read_index(starttime=mt, endtime=mx)
            get_bulk_wf = partial(self._get_bulk_wf, index=index)
        else:
            get_bulk_wf = self._get_bulk_wf
        # iterate each event in the events and yield the waveform
        for event_id in event_ids:
            # the reference time may be a single datetime or a series of datetimes
            ti_ = to_datetime64(reftimes[event_id])
            bulk_args = self._get_bulk_arg(starttime=ti_ - tb,
                                           endtime=ti_ + ta)
            try:
                yield EventStream(event_id, get_bulk_wf(bulk_args))
            except Exception:
                if raise_on_fail:
                    raise
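
A short usage sketch for yield_event_waveforms, assuming a configured obsplus Fetcher with an event catalog and a waveform client attached; the padding values are illustrative:

# fetcher is assumed to be a configured obsplus Fetcher
for event_id, st in fetcher.yield_event_waveforms(time_before=5, time_after=30):
    print(event_id, len(st))  # one obspy.Stream per event, keyed by event id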