Exemple #1
0
def WindowData_with_duration(
    d,
    duration,
    t0shift=None,
    object_history=False,
    alg_name="scale",
    alg_id=None,
    dryrun=False,
):
    if duration < 0:
        detailline = "Window duration: {dur}  Data range:  {dst},{det}".format(
            dur=duration, dst=d.t0, det=d.endtime())
        d.elog.log_error(
            "WindowData",
            "Duration is a negative number.\n" + detailline,
            ErrorSeverity.Invalid,
        )
        d.kill()
        return d
    win_start = d.t0 + 1
    win_end = win_start + duration
    if d.dead():
        return d
    twcut = TimeWindow(win_start, win_end)
    if t0shift:
        twcut.shift(t0shift)
    try:
        # This handler duplicates an error test in the WindowData C code but
        # it will be more efficient to handle it here.
        if win_start < d.t0 or win_end > d.endtime():
            detailline = "Window range: {wst},{wet}  Data range:  {dst},{det}".format(
                wst=win_start, wet=win_end, dst=d.t0, det=d.endtime())
            d.elog.log_error(
                "WindowData",
                "Data range is smaller than window range\n" + detailline,
                ErrorSeverity.Invalid,
            )
            d.kill()
            return d
        if isinstance(d, TimeSeries):
            dcut = _WindowData(d, twcut)
            return dcut
        elif isinstance(d, Seismogram):
            dcut = _WindowData3C(d, twcut)
            return dcut
        else:
            raise RuntimeError(
                "WindowData:  Invalid input data type received=" +
                str(type(d)))
    except MsPASSError as err:
        d.log_error("WindowData", str(err), ErrorSeverity.Invalid)
        d.kill()
        return d
Exemple #2
0
 def nwin(self):
     if self.__uses_noise:
         tws = self.md.get_double("noise_window_start")
         twe = self.md.get_double("noise_window_end")
         return TimeWindow(tws, twe)
     else:
         return TimeWindow  # always initialize even if not used
Exemple #3
0
 def dwin(self):
     tws = self.md.get_double("deconvolution_data_window_start")
     twe = self.md.get_double("deconvolution_data_window_end")
     return TimeWindow(tws, twe)
Exemple #4
0
def scale(
    d,
    compute_from_window=False,
    window=None,
    method="peak",
    level=1.0,
    scale_by_section=False,
    use_mean=False,
    object_history=False,
    alg_name="scale",
    alg_id=None,
    dryrun=False,
    function_return_key=None,
):
    """
    Top level function interface to data scaling methods.

    This function can be used to scale seismic data contained in any of
    the four seismic data objects defined in mspass:  TimeSeries, Seismogram,
    TimeSeriesEnsemble, and SeismogramEnsemble.   An extensive set of
    amplitude estimation metrics are available by selecting one of the
    allowed values for the method parameter.   Ensembles can be scaled
    at the individual seismogram level or as a whole (scale_by_section=True).

    Note all methods preserve the original amplitude by updating the
    Metadata parameter calib to include the scaling.  i.e. as always the
    amplitude of the data in physical units can be restored by multiplying
    the data samples by calib.

    :param d:  is input data object.  If not one of the four mspass seismic
      data types noted above the function will throw a RuntimeError exception.
    :param compute_from_window: boolean used to compute amplitude and scale
      based on a windowed section of the input waveform.   By default (this
      boolan False) the amplitude for scaling is computed from the full
      waveform.  When True the window argument must contain a valid TimeWindow
      that spans a time range smaller than the data range.  In that situation
      if the window is inconsistent with the data range the return will be
      marked dead and messages will be found posted in elog.  For ensembles
      all or only  portion of the group will be killed if this happens.
      Note this parameter is also ignored when scale_by_section is true.
    :param window:  mspass TimeWindow object defining the time range
      over which the amplitude for scaling is to be computed.  (see the
      compute_from_window parameter description)
    :param method: string defining the gain method to use.  Currently supported
      method values are:  peak, RMS (rms accepted), perc, and MAD
      (also accepts mad or Mad).  Peak uses the largest amplitude for
      scaling.  For 3C data that means vector amplitude while for scalar data
      it is the largest absolute value. rms is the standard rms measure,
      although for 3C data is is rms vector amplitudes so scaling is by the
      number of vectors not the total number of samples (3 times number of
      vectors).  perc uses a sorted percentage clip level metric as used in
      seismic unix.  mad is a variant where the value returned is the
      median absolute deviation (mad) that is actual the same as perc=1/2.
      Default is peak.  WARNING:  if an invalid value for method is passed the
      data will be returned unaltered with a complaint message issue for
      very datum (indivually or in ensembles) run that way.
    :param level:   For all but perc this defines the scale to which the data
      are scaled.  For perc it is used to set the percent clip level.
      If the value passed is illegal (0 or negative for most methods while
      perc must also be positive but less or equal 1) a complain message will
      be posted to elog and the level adjusted to 1.0.
    :param scale_by_section:  is a boolean that controls the scaling
      behavior on ensembles only (It is silently ignored for atomic
      TimeSeries and Seismogram data).  When true a single gain factor is
      applied to all members of an ensemble.  When false each member is
      gained individually as if this function were applied in a loop to
      each member.
    :param use_mean:  boolean used only for ensembles and when scale_by_section is
      True.   The algorithm used in that case has an option to use the mean
      log amplitude for scaling the section instead of the default median
      amplitude.

    :return: Data scaled to specified level.  Note the scaling always preserves
      absolute amplitude by adjusting the value of the calib attribute of the
      return so calib*data is the same value before and after the scaling.
    :rtype: same as input

    """
    if isinstance(d, TimeSeries) or isinstance(d, Seismogram):
        if d.dead():
            return d
    # First validate arguments
    # The logic here would be much cleaner if ensembles had an elog attribute
    # may happen as group discussions have proposed that change.  this should
    # change to be cleaner of elog is added to ensmeble objects
    if (method != "peak" and method != "RMS" and method != "rms"
            and method != "perc" and method != "MAD" and method != "mad"):
        message = (
            "method parameter passed = " + method +
            " is not valid.  Should be peak, rms, perc, or mad\nContinuing with no change to data"
        )
        if isinstance(d, TimeSeriesEnsemble) or isinstance(
                d, SeismogramEnsemble):
            ensemble_error_post(d, alg_name, message, ErrorSeverity.Complaint)
        else:
            # This could cause an abort if the input is not one of the four stock data types
            # but that is ok as that is an obvious usage error and should be a fatal error
            d.elog.log_error(
                alg_name,
                "method parameter passed = " + method + " is not valid.  " +
                "Should be peak, rms, perc, or mad",
                ErrorSeverity.Complaint,
            )
        return d
    if method == "perc":
        if level <= 0 or level > 1.0:
            message = "perc scaling method given illegal value={plevel}\nDefaulted to 1.0".format(
                plevel=level)
            if isinstance(d, TimeSeriesEnsemble) or isinstance(
                    d, SeismogramEnsemble):
                ensemble_error_post(d, alg_name, message,
                                    ErrorSeverity.Complaint)
            else:
                d.elog.log_error(alg_name, message, ErrorSeverity.Complaint)
                level = 1.0
    else:
        if level <= 0.0:
            message = "{meth} scaling method given illegal value={slevel}\nDefaulted to 1.0".format(
                meth=method, slevel=level)
            if isinstance(d, TimeSeriesEnsemble) or isinstance(
                    d, SeismogramEnsemble):
                ensemble_error_post(d, alg_name, message,
                                    ErrorSeverity.Complaint)
            else:
                d.elog.log_error(alg_name, message, ErrorSeverity.Complaint)
            level = 1.0
    if compute_from_window:
        if isinstance(window, TimeWindow):
            ampwin = window
        else:
            message = "optional window parameter set but value is not a TimeWindow object\nReverting to unwindowed estimate"
            if isinstance(d, TimeSeriesEnsemble) or isinstance(
                    d, SeismogramEnsemble):
                ensemble_error_post(d, alg_name, message,
                                    ErrorSeverity.Complaint)
            else:
                d.elog.log_error(alg_name, message, ErrorSeverity.Complaint)
            # this is an invalid window because start>end is used as a signal
            # in WindowData to use the entire waveform.  It switches automatically
            # without logging an error (currently).  Definitely a little weird
            # but this hides that detail for python users
            ampwin = TimeWindow(0.0, -1.0)
    else:
        ampwin = TimeWindow(0.0, -1.0)
    # The pybind11 and C++ way of defining an enum class creates an
    # obnoxiously ugly syntax. We insulate the user from this oddity
    # by using a string arg to define this enum passed to _scale
    method_to_use = ScalingMethod.Peak
    if method == "rms" or method == "RMS":
        method_to_use = ScalingMethod.RMS
    elif method == "perc":
        method_to_use = ScalingMethod.perc
    elif method == "MAD" or method == "mad":
        method_to_use = ScalingMethod.MAD
    try:
        # Note this logic depends on an oddity of the C++ api in the
        # functions called with the _scale binding name.   When the TimeWindow
        # is invalid the entire data range is silently used - not viewed as
        # an error.  Hence, the following works only when the logic above to
        # handle the definition of the window parameter is set.
        if isinstance(d, TimeSeries) or isinstance(d, Seismogram):
            amp = _scale(d, method_to_use, level, ampwin)
            _post_amplitude(d, method_to_use, amp)
        elif isinstance(d, TimeSeriesEnsemble) or isinstance(
                d, SeismogramEnsemble):
            if len(d.member
                   ) <= 0:  # Silently return nothing if the ensemble is empy
                return d
            if scale_by_section:
                amp = _scale_ensemble(d, method_to_use, level, use_mean)
                # We post the amplitude the ensembe's metadata in this case
                _post_amplitude(d, method_to_use, amp)
            else:
                ampvec = _scale_ensemble_members(d, method_to_use, level,
                                                 ampwin)
                i = 0
                for x in d.member:
                    if x.live:
                        _post_amplitude(x, method_to_use, ampvec[i])
                    i += 1
        else:
            raise MsPASSError(
                "scale: input data is not a supported mspass seismic data type",
                "Fatal")
        return d
    except MsPASSError as err:
        if isinstance(d, Seismogram) or isinstance(d, TimeSeries):
            d.elog.log_error(alg_name, str(err), ErrorSeverity.Invalid)
            d.kill()
        # avoid further isinstance at the expense of a maintenance issue.
        # if we add any other supported data objects we could have a
        # problem here.  This assumes what lands here is an ensemble
        else:
            ensemble_error_post(d, alg_name, err)
            for x in d.member:
                x.kill()
        return d
    # this is needed to handle an oddity recommended on this
    # web site:  http://effbot.org/zone/stupid-exceptions-keyboardinterrupt.htm
    except KeyboardInterrupt:
        raise
    except:
        message = "Something threw an unexpected exception\nThat is a bug that needs to be fixed - contact authors"
        if isinstance(d, Seismogram) or isinstance(d, TimeSeries):
            d.elog.log_error(alg_name, message, ErrorSeverity.Invalid)
        else:
            ensemble_error_post(d, alg_name, message, ErrorSeverity.Invalid)
Exemple #5
0
def WindowData(
    d,
    win_start,
    win_end,
    t0shift=None,
    object_history=False,
    alg_name="scale",
    alg_id=None,
    dryrun=False,
):
    """
    Cut data defined by a TimeWindow object.

    Cutting a smaller waveform segment from a larger waveform segment
    is a very common seismic data processing task.   The function is
    a python wrapper to adapt C++ code that accomplishes that task
    to the MsPASS framework.

    Note this function uses one model for being bombproof with a map
    operation.  Any exception that makes the result invalid will cause
    an error message to be posted to the elog attribute of the input
    datum AND the data will be marked dead (killed).

    :param d: is the input data.  d must be either a :class:`mspasspy.ccore.seismic.TimeSeries` or :class:`mspasspy.ccore.seismic.Seismogram`
      object or the function will log an error to d and return a None.
    :param twin_start: defines the start of timeWindow to be cut
    :type twin_start: :class:`float`
    :param twin_end: defines the end of timeWindow to be cut
    :type twin_end: :class:`float`
    :param t0shift: is an optional time shift to apply to the time window.
      This parameter is convenient to avoid conversions to relative time.
      A typical example would be to set t0shift to an arrival time and let
      the window define time relative to that arrival time.  Default is None
      which cause the function to assume twin is to be used directly.
    :param object_history: boolean to enable or disable saving object
      level history.  Default is False.  Note this functionality is
      implemented via the mspass_func_wrapper decorator.
    :param alg_name:   When history is enabled this is the algorithm name
      assigned to the stamp for applying this algorithm.
      Default ("WindowData") should normally be just used.
      Note this functionality is implemented via the mspass_func_wrapper decorator.
    :param ald_id:  algorithm id to assign to history record (used only if
      object_history is set True.)
      Note this functionality is implemented via the mspass_func_wrapper decorator.
    :param dryrun:
      Note this functionality is implemented via the mspass_func_wrapper decorator.
    :param dryrun:
      Note this functionality is implemented via the mspass_func_wrapper decorator.

    :return: copy of d with sample range reduced to twin range.  Returns
      an empty version of the parent data type (default constructor) if
      the input is marked dead
    """
    if d.dead():
        return d
    twcut = TimeWindow(win_start, win_end)
    if t0shift:
        twcut.shift(t0shift)
    try:
        # This handler duplicates an error test in the WindowData C code but
        # it will be more efficient to handle it here.
        if twcut.start < d.t0 or twcut.end > d.endtime():
            detailline = "Window range: {wst},{wet}  Data range:  {dst},{det}".format(
                wst=twcut.start, wet=twcut.end, dst=d.t0, det=d.endtime())
            d.elog.log_error(
                "WindowData",
                "Data range is smaller than window range\n" + detailline,
                ErrorSeverity.Invalid,
            )
            d.kill()
            return d
        if isinstance(d, TimeSeries):
            dcut = _WindowData(d, twcut)
            return dcut
        elif isinstance(d, Seismogram):
            dcut = _WindowData3C(d, twcut)
            return dcut
        else:
            raise RuntimeError(
                "WindowData:  Invalid input data type received=" +
                str(type(d)))
    except MsPASSError as err:
        d.log_error("WindowData", str(err), ErrorSeverity.Invalid)
        d.kill()
        return d
Exemple #6
0
def arrival_snr_QC(
    data_object,
    noise_window=TimeWindow(-130.0, -5.0),
    noise_spectrum_engine=None,
    signal_window=TimeWindow(-5.0, 120.0),
    signal_spectrum_engine=None,
    band_cutoff_snr=2.0,
    # check these are reasonable - don't remember the formula when writing this
    tbp=5.0,
    ntapers=10,
    high_frequency_search_start=5.0,
    poles=3,
    perc=95.0,
    phase_name="P",
    metadata_key="Parrival",
    optional_metrics=[
        "snr_stats",
        "filtered_envelope",
        "filtered_L2",
        "filtered_Linf",
        "filtered_MAD",
        "filtered_perc",
    ],
    save_spectra=False,
    db=None,
    collection="arrival",
    use_measured_arrival_time=False,
    measured_arrival_time_key="Ptime",
    taup_model=None,
    update_mode=False,
    component=2,
    source_collection="source",
    receiver_collection=None,
):
    """
    Compute a series of metrics that can be used for quality control
    filtering of seismic phase data.

    This is the highest level function in this module for computing
    signal-to-noise ratio metrics for processing signals that can be
    defined by a computable or measurable "phase".  Features this
    function adds over lower level functions in this module are:
        1.  An option to save computed metrics to a MongoDB collection
            (defaults as "arrival").  If the update_mode argument is
            set True (default is False) the function expects the data_object
            to contain the attribute "arrival_id" that references the
            ObjectID of an existing entry in the the collection where the
            data this function computes is to be saved (default is"arrival").
        2.  Adds an option to use a computed or measured arrival as the
            time reference for all windowing.   The lower level snr
            functions in this module require the user do what this
            function does prior to calling the function.  Note one or the other is required
            (i.e. either computed or measured time will be define t0 of the
             processing)

    The input of arg 0 (data_object) can be either a TimeSeries or
    a Seismogram object.  If a Seismogram object is passed the "component"
    argument is used to extract the specified single channel from the Seismogram
    object and than component is used for processing.  That is necessary
    because all the algorithms used are single channel algorithms.  To
    use this function on all components use a loop over components BUT
    make sure you use a unique value for the argument "metadata_key" for
    each component.  Note this will also produce multiple documents per
    input datum.

    The type of the data_object also has a more subtle implication the
    user must be aware of.  That is, in the MsPASS schema we store receiver coordinates
    in one of two different collections:  "channel" for TimeSeries data and
    "site" for Seismogram data.  When such data are loaded the generic keys
    like lat are always converted to names like channel_lat or site_lat
    for TimeSeries and Seismogram data respectively.   This function uses
    the data type to set that naming.  i.e. if the input is TimeSeries
    it tries to fetch the latitude data as channel_lat while if it the input
    is a Seismogram it tries to fetch site_lat.   That is true of all coordinate
    data loaded by normalization from a source and receiver collection.

    The following args are passed directly to the function arrival_snr:
    noise_window, signal_window, band_cutoff_snr, tbp, ntapers, poles,
    perc, phase_name, metadata_key, and optional_metrics.  See the docstring
    for arrival_snr and FD_snr_estimator for descriptions of how these
    arguments should be used.  This top level function adds arguments
    decribed below.

    :param db:  mspass Database object that is used as a handle for to MongoDB.
    Default is None, which the function takes to mean you don't want to
    save the computed values to MongoDB.   In this mode the computed
    metrics will all be posted to a python dict that can be found under the
    key defined by the "metadata_key" argument.   When db is defined the
    contents of that same python dict will save to MongoDB is the
    collection defined by the "collection" argument.  If db is run as
    the default None the user is responsible for saving and managing the
    computed snr data.   Be aware a simple later call to db.save_data
    will not produce the same normalized data with the (default) arrival
    collection.

    :param collection:  MongoDB collection name where the results of this
    function will be saved.  If the "update_mode" argument is also set
    True the update section will reference this collection. Default is "arrival".

    :param use_measured_arrival_time:  boolean defining the method used to
    define the time reference for windowing used for snr calculations.
    When True the function will attempt to fetch a phase arrival time with
    the key defined by the "measured_arrival_time_key" argument.  In that
    mode if the fetch fails the data_object will be killed and an error
    posted to elog.   That somewhat brutal choice was intentional as the
    expectation is if you want to use measured arrival times you don't
    want data where there are no picks.   The default is True to make
    the defaults consistent.  The reason is that the tau-p calculator
    handle is passed to the function when using model-based travel times.
    There is no way to default that so it defaults to None.

    :param measured_arrival_time_key: is the key used to fetch a
    measured arrival time.   This parameter is ignored if use_measured_arrival_time
    is False.

    :param taup_model: when use_measured_arrival_time is False this argument
    is required.  It defaults as None because there is now way the author
    knows to initialize it to anything valid.  If set it MUST be an instance
    of the obspy class TauPyModel (https://docs.obspy.org/packages/autogen/obspy.taup.tau.TauPyModel.html#obspy.taup.tau.TauPyModel)
    Mistakes in use of this argument can cause a MsPASSError exception to
    be thrown (not logged thrown as a fatal error) in one of two ways:
    (1)  If use_measured_arrival_time is False this argument must be defined,
    and (2) if it is defined it MUST be an instance of TauPyModel.

    :param update_mode:   When True the function will attempt to extract
    a MongoDB ObjectID from data_object's Metadata using the (currently fixed)
    key "arrival_id".   If found it will add the computed data to an existing
    document in the collection defined by the collection argument.  Otherwise
    it will simply add a new entry and post the ObjectID of the new document
    with the (same fixed) key arrival_id.  When False no attempt to fetch
    the arrival id is made and we simply add a record.  This parameter is
    completely ignored unless the db argument defines a valid Database class.

    :param component: integer (0, 1, or 2) defining which component of a
    Seismogram object to use to compute the requested snr metrics.   This
    parameter is ignored if the input is a TimeSeries.

    :param source_collection:  normalization collection for source data.
    The default is the MsPASS name "source" which means the function will
    try to load the source hypocenter coordinates (when required) as
    source_lat, source_lon, source_depth, and source_time.

    :param receiver_collection:  when set this name will override the
    automatic setting of the expected normalization collection naming
    for receiver functions (see above).  The default is None which causes
    the automatic switching to be involked.  If it is any other string
    the automatic naming will be overridden.

    :return:  the data_object modified by insertion of the snr QC data
    in the object's Metadata
    """
    if data_object.dead():
        return data_object
    if isinstance(data_object, TimeSeries):
        # We need to make a copy of a TimeSeries object to assure the only
        # thing we change is the Metadata we add to the return
        data_to_process = TimeSeries(data_object)
        if receiver_collection:
            rcol = receiver_collection
        else:
            rcol = "channel"
    elif isinstance(data_object, Seismogram):
        if component < 0 or component > 2:
            raise MsPASSError(
                "arrival_snr_QC:  usage error.  " +
                "component parameter passed with illegal value={n}\n".format(
                    n=component) + "Must be 0, 1, or 2",
                ErrorSeverity.Fatal,
            )
        data_to_process = ExtractComponent(data_object, component)
        if receiver_collection:
            rcol = receiver_collection
        else:
            rcol = "site"
    else:
        raise MsPASSError(
            "arrival_snr_QC:   received invalid input data\n" +
            "Input must be either TimeSeries or a Seismogram object",
            ErrorSeverity.Fatal,
        )
    if use_measured_arrival_time:
        arrival_time = data_object[measured_arrival_time_key]
    else:
        # This test is essential or python will throw a more obscure,
        # generic exception
        if taup_model is None:
            raise MsPASSError(
                "arrival_snr_QC:  usage error.  " +
                "taup_model parameter is set None but use_measured_arrival_time is False\n"
                +
                "This gives no way to define processing windows.  See docstring",
                ErrorSeverity.Fatal,
            )
        source_lat = data_object[source_collection + "_lat"]
        source_lon = data_object[source_collection + "_lon"]
        source_depth = data_object[source_collection + "_depth"]
        source_time = data_object[source_collection + "_time"]
        receiver_lat = data_object[rcol + "_lat"]
        receiver_lon = data_object[rcol + "_lon"]
        delta = locations2degrees(source_lat, source_lon, receiver_lat,
                                  receiver_lon)
        arrival = taup_model.get_travel_times(
            source_depth_in_km=source_depth,
            distance_in_degree=delta,
            phase_list=[phase_name],
        )
        arrival_time = source_time + arrival[0].time
        taup_arrival_phase = arrival[0].phase.name
        # not sure if this will happen but worth trapping it as a warning if
        # it does
        if phase_name != taup_arrival_phase:
            data_object.elog.log_error(
                "arrival_snr_QC",
                "Requested phase name=" + phase_name +
                " does not match phase name tag returned by obpsy taup calculator="
                + taup_arrival_phase,
                "Complaint",
            )
    if data_to_process.time_is_UTC():
        data_to_process.ator(arrival_time)
    [snrdata, elog] = FD_snr_estimator(
        data_to_process,
        noise_window,
        noise_spectrum_engine,
        signal_window,
        signal_spectrum_engine,
        band_cutoff_snr,
        tbp,
        ntapers,
        high_frequency_search_start,
        poles,
        perc,
        optional_metrics,
        save_spectra=save_spectra,
    )
    if elog.size() > 0:
        data_object.elog += elog
    snrdata["phase"] = phase_name
    snrdata["snr_arrival_time"] = arrival_time
    snrdata["snr_signal_window_start"] = arrival_time + signal_window.start
    snrdata["snr_signal_window_end"] = arrival_time + signal_window.end
    snrdata["snr_noise_window_start"] = arrival_time + noise_window.start
    snrdata["snr_noise_window_end"] = arrival_time + noise_window.end

    # These cross-referencing keys may not always be defined when a phase
    # time is based on a pick so we add these cautiously
    scol_id_key = source_collection + "_id"
    rcol_id_key = rcol + "_id"
    if data_object.is_defined(scol_id_key):
        snrdata[scol_id_key] = data_object[scol_id_key]
    if data_object.is_defined(rcol_id_key):
        snrdata[rcol_id_key] = data_object[rcol_id_key]
    # Note we add this result to data_object NOT data_to_process because that
    # is not always the same thing - for a TimeSeries input it is a copy of
    # the original but it may have been altered while for a Seismogram it is
    # an extracted component
    data_object[metadata_key] = snrdata
    if db:
        arrival_id_key = collection + "_id"
        dbcol = db[collection]
        if update_mode:
            if data_object.is_defined(arrival_id_key):
                arrival_id = data_object[arrival_id_key]
                filt = {"_id": arrival_id}
                update_clause = {"$set": snrdata}
                dbcol.update_one(filt, update_clause)
            else:
                data_object.elog.log_error(
                    "arrival_snr_QC",
                    "Running in update mode but arrival id key=" +
                    arrival_id_key + " is not defined\n" +
                    "Inserting computed snr data as a new document in collection="
                    + collection,
                    "Complaint",
                )
                arrival_id = dbcol.insert_one(snrdata).inserted_id
                data_object[arrival_id_key] = arrival_id
        else:
            arrival_id = dbcol.insert_one(snrdata).inserted_id
            data_object[arrival_id_key] = arrival_id
    return data_object
Exemple #7
0
def arrival_snr(
    data_object,
    noise_window=TimeWindow(-130.0, -5.0),
    noise_spectrum_engine=None,
    signal_window=TimeWindow(-5.0, 120.0),
    signal_spectrum_engine=None,
    band_cutoff_snr=2.0,
    # check these are reasonable - don't remember the formula when writing this
    tbp=5.0,
    ntapers=10,
    high_frequency_search_start=5.0,
    poles=3,
    perc=95.0,
    save_spectra=False,
    phase_name="P",
    metadata_key="Parrival",
    optional_metrics=[
        "snr_stats",
        "filtered_envelope",
        "filtered_L2",
        "filtered_Linf",
        "filtered_MAD",
        "filtered_perc",
    ],
):
    """
    Specialization of FD_snr_estimator.   A common situation where snr
    data is a critical thing to estimate is data windowed around a given
    seismic phase.   FD_snr_estimator is a bit more generic.  This function
    removes some of the options from the more generic function and
    has a frozen structure appropriate for measuring snr of a particular phase.
    In particular it always stores the results as a subdocument (python dict)
    keyed by the name defined in the metadata_key argument.   The idea of
    that sturcture is the contents of the subdocument are readily extracted
    and saved to a MongoDB "arrival" collection with a normalization key.
    Arrival driven workflows can then use queries to reduce the number of
    data actually retrieved for final processing.  i.e. the arrival
    collection data should be viewed as a useful initial quality control
    feature.

    Most parameters for this function are described in detail in the
    docstring for FD_snr_estimator.  The user is referred there to
    see the usage.   The following are added for this specialization:

    :param phase_name:  Name tag for the seismic phase being analyzed.
    This string is saved to the output subdocument with the key "phase".
    The default is "P"

    :param metadata_key:  is a string used as a key under which the
    subdocument (python dict) created internally is stored.  Default is
    "Parrival".   The idea is if multiple phases are being analyzed
    each phase should have a different key set by this argument
    (e.g. if PP were also being analyzed in the same workflow you
     might use a key like "PParrival").

    :return:  a copy of data_object with the the results stored under
    the key defined by the metadata_key argument.
    """
    if data_object.dead():
        return data_object
    [snrdata, elog] = FD_snr_estimator(
        data_object,
        noise_window,
        noise_spectrum_engine,
        signal_window,
        signal_spectrum_engine,
        band_cutoff_snr,
        tbp,
        ntapers,
        high_frequency_search_start,
        poles,
        perc,
        optional_metrics,
        save_spectra=save_spectra,
    )
    if elog.size() > 0:
        data_object.elog += elog
    snrdata["phase"] = phase_name
    data_object[metadata_key] = snrdata
    return data_object
Exemple #8
0
def snr(
    data_object,
    noise_window=TimeWindow(-130.0, -5.0),
    signal_window=TimeWindow(-5.0, 120.0),
    noise_metric="mad",
    signal_metric="mad",
    perc=95.0,
):
    """
    Compute time-domain based signal-to-noise ratio with a specified metric.

    Signal-to-noise ratio is a fundamental measurement in all forms of
    seismic data processing.   There is, however, not a single unified metric
    that ideal for all types of signals one may want to analyze.  One class
    of metrics used time-domain metrics to use some measure of amplitude in
    a signal and noise window cut from a single waveform segment.  A type
    example is snr of some particular "seismic phase" (P, S, PP, ScS, etc)
    relative to some measure of background noise.  e.g. for P phases it is
    nearly universal to try to estimate snr from some window defined by the
    arrival time of P and a noise window before the time P arrives (pre-event noise).

    This function provides a generic api to measure a large range of metrics
    using one of four choices for measuring the norm of the data in the
    signal and noise windows:
        1.  rms - L2 norm
        2.  mad - median absolute difference, which is essentially the median amplitude in this context
        3.  perc - percentage norm ala seismic unix.  perc is defined at as the
            amplitude level were perc percentage of the data have an amplitude
            smaller than this value.  It is computed by ranking (sorting) the
            data, computing the count of that perctage relative to the number of
            amplitude samples, and returning the amplitude of the nearest value
            to that position in the ranked data.
        4.  peak - is the peak value which in linear algebra is the L infinity norm

    Note the user can specify a different norm for the signal and noise windows.
    The perc metric requires specifying what percentage level to use.

    This function will throw a MsPASSError exception if the window parameters
    do not define a time period inside the range of the data_object. You will
    need a custom function if the model of windows insider a larger waveform
    segment does not match your data.

    There is one final detail about an snr calculation that we handle carefully.
    With simulation data it is very common to have error free simulations where
    the "noise" window one would use with real data is all zeros.  An snr calculated
    with this function in that situation would either return inf or NaN depending
    on some picky details.  Neither is good as either can cause downstream
    problems.  For that reason we trap any condition where the noise amplitude
    measure is computed as zero.  If the signal amplitude is also zero we return
    a -1.0.  Otherwise we return a large, constant, positive number.  Neither
    condition will cause an exception to be thrown as that condition is considered
    somewhat to be anticipated.

    :param data_object:  MsPASS atomic data object (TimeSeries or Seismogram)
      to use for computing the snr.  Note that for Seismogram objects the
      metrix always use L2 measures of amplitude of each sample (i.e. vector amplitudes)
      If snr for components of a Seismogram are desired use ExtractComponent and
      apply this function to each component separately.
    :param noise_window: TimeWindow objects defining the time range to extract
      from data_object to define the part of the signal considered noise.
      Times can be absolute or relative.  Default the range -5 to 120 which
      is makes sense only as time relative to some phase arrival time.
    :param signal_window:  TimeWindow object defining the time range to
      extract from data_object to define the part of the signal defines as
      signal to use for the required amplitude measure.  Default of -130 to
      -5 is consistent with the default noise window (in terms of length) and
      is assumes a time relative to a phase arrival time.  For absolute times
      each call to this function may need its own time window.
    :param noise_metric:  string defining one of the four metrics defined above
      ('mad','peak','perc' or 'rms') to use for noise window measurement.
    :param signal_metric:  string defining one of the four metrics defined above
      ('mad','peak','perc' or 'rms') to use for signal window measurement.
    :return: estimated signal-to-noise ratio as a single float.  Note the
      special returns noted above for any situation where the noise window
      amplitude is 0
    """
    if _window_invalid(data_object, noise_window):
        raise MsPASSError(
            "snr:  noise_window []{wstart} - {wend}] is outside input data range"
            .format(wstart=noise_window.start, wend=noise_window.end),
            ErrorSeverity.Invalid,
        )
    if _window_invalid(data_object, signal_window):
        raise MsPASSError(
            "snr:  noise_window []{wstart} - {wend}] is outside input data range"
            .format(wstart=noise_window.start, wend=noise_window.end),
            ErrorSeverity.Invalid,
        )
    n = WindowData(data_object, noise_window.start, noise_window.end)
    s = WindowData(data_object, signal_window.start, signal_window.end)
    if noise_metric == "rms":
        namp = RMSAmplitude(n)
    elif noise_metric == "mad":
        namp = MADAmplitude(n)
    elif noise_metric == "peak":
        namp = PeakAmplitude(n)
    elif noise_metric == "perc":
        namp = PercAmplitude(n, perc)
    else:
        raise MsPASSError(
            "snr:  Illegal noise_metric argument = " + noise_metric,
            ErrorSeverity.Invalid,
        )

    if signal_metric == "rms":
        samp = RMSAmplitude(s)
    elif signal_metric == "mad":
        samp = MADAmplitude(s)
    elif signal_metric == "peak":
        samp = PeakAmplitude(s)
    elif signal_metric == "perc":
        samp = PercAmplitude(s, perc)
    else:
        raise MsPASSError(
            "snr:  Illegal signal_metric argument = " + signal_metric,
            ErrorSeverity.Invalid,
        )

    return _safe_snr_calculation(samp, namp)
Exemple #9
0
def FD_snr_estimator(
    data_object,
    noise_window=TimeWindow(-130.0, -5.0),
    noise_spectrum_engine=None,
    signal_window=TimeWindow(-5.0, 120.0),
    signal_spectrum_engine=None,
    band_cutoff_snr=2.0,
    # check these are reasonable - don't remember the formula when writing this
    tbp=2.5,
    ntapers=4,
    high_frequency_search_start=5.0,
    poles=3,
    perc=95.0,
    optional_metrics=None,
    save_spectra=False,
):
    # optional_metrics=['snr_stats','filtered_envelope','filtered_L2','filtered_Linf','filtered_MAD','filtered_perc']):
    """
    Estimates one or more metrics of signal-to-noise from a TimeSeries object.
    An implicit assumption is that the analysis is centered on a timeable "phase"
    like P, PP, etc.

    This is a python function that can be used to compute one or several
    signal-to-noise ratio estimates based on an estimated bandwidth using
    the C++ function EstimateBandwidth.  The function has a fair number of
    options, but the core metrics computed are the bandwidth estimates
    computed by that function.  It uses a fairly simple search algorithm
    that functions well for most earthquake sources.  For the low end the
    algorithm searches from the first frequency indistinguishable from DC to
    find the lowest frequency for which the snr exceeds a threshold specified
    by the input parameter 'band_cutoff_snr'.   It does a similar search
    from the high end from a point 80% of Nyquist - a good choice for all
    modern digital data that use FIR antialias filters.   Both searches are
    not just defined with just the first frequency to satisfy the snr
    threshold criteria.  Only when a group of frequencies more than 2 times
    the time-bandwidth product exceed the threshold is the band edge
    defined.   The actual band edge is then defined as the first frequency
    exceeding the threshold.  That more elaborate algorithm was used to
    prevent pure lines in either the signal or noise spectrum from
    corrupting the estimates.

    A set of optional metrics can be computed.  All optional metrics use
    the bandwidth estimates in one way or another.   Optional metrics are
    defined by the following keywords passed through a list (actually
    any iterable container will work) of strings defining one or more
    of the keywords. The metrics and a brief description of each follow:

    *snr_stats* computes what are commonly plotted in box plots for the
    snr estimates within the estimated bandwidth:  minimum, maximum,
    0.25 (1/4) point, 0.75 (3/4) point, and the median.   These are set
    with following dict keys:   'snr_band_maximum','snr_band_minimum',
    'snr_band_1/4', 'srn_band_3/4', and 'snr_band_median' respectively.

    *filtered_envelope*, *filtered_L2*, *filtered_Linf*, *filtered_perc*, and *filtered_MAD*:
    All of these optional metrics first copy the data_object and then
    filter the copy with a Butterworth bandpass filter with the number of
    poles specified by the npoles argument and corners at the estimated
    band edge by the EstimateBandwidth function.   The metrics computed
    are time domain snr estimates computed with he filtered data.  They are
    actually computed from functions in this same module that can be
    used independently and have their own docstring description. The
    functions called have the following names in order of the keyword
    list above:  *snr_envelope*, *snr_L2*, *snr_Linv*, and *snr_MAD*.
    When the computed they are set in the output dictionary with the
    following (again in order) keys:  'snr_envelope','snr_L2', 'srn_Linf',
    and 'snr_MAD'.

    :param data_object:  TimeSeries object to be processed. For Seismogram
    objects the assumption is algorithm would be used for a single
    component (e.g longitudinal or vertical for a P phase)

    :param noise_window: defines the time window to use for computing the
    spectrum considered noise. The time span can be either relative or
    UTC (absolute) time but we do not check for consistency.  This low
    level function assumes they are consistent.  If not, the calculations
    are nearly guaranteed to fail.  Type must be mspasspy.ccore.TimeWindow.

    :param signal_window: defines the time window to use that defines what
    you consider "the signal".  The time span can be either relative or
    UTC (absolute) time but we do not check for consistency.  This low
    level function assumes they are consistent.  If not, the calculations
    are nearly guaranteed to fail.  Type must be mspasspy.ccore.TimeWindow.

    :param noise_spectrum_engine: is expected to either by a None type
    or an instance of a ccore object called an MTPowerSpectralEngine.
    When None an instance of MTPowerSpectralEngine is computed for
    each call to this function.   That is a convenience for small
    jobs or when called with data from mixed sample rates and/or variable
    length time windows.   It is very inefficient to use the default
    approach for processing large data sets and really for any use in a
    map operation with dask or spark.  Normal use should be for the user to
    predefine an MtPowerSpectralEngine from the expected window size
    for a given data sample rate and include it in the function call.

    :param signal_spectrum_engine:  is the comparable MTPowerSpectralEngine
    to use to compute the signal power spectrum.   Default is None with the
    same caveat as above for the noise_spectrum_engine.

    :param tbp:  time-bandwidth product to use for computing the set of
    Slepian functions used for the multitaper estimator.  This parameter is
    used only if the noise_spectrum_engine or signal_spectrum_engine
    arguments are set as None.  The default is 2.5

    :param ntapers:  is the number of Slepian functions (tapers) to compute
    for the multitaper estimators. Like tbp it is referenced only if
    noise_spectrum_engine or signal_spectrum_engine are set to None.
    Note the function will throw an exception if the ntaper parameter is
    not consistent with the time-bandwidth product.  That is, the
    maximum number of tapers is round(2*tbp-1).   Default is 4 which is
    consistent with default tbp=2.5

    :param high_frequency_search_start: Used to specify the upper frequency
      used to start the search for the upper end of the bandwidth by
      the function EstimateBandwidth.  Default is 4.0 which reasonable for
      teleseismic P wave data.  Should be change for usage other than
      analysis of teleseimic P phases or you the bandwidth may be
      grossly underestimated.

    :param npoles:   defines number of poles to us for the Butterworth
    bandpass applied for the "filtered" metrics (see above).  Default is 3.

    :param perc:   used only if 'filtered_perc' is in the optional metrics list.
    Specifies the perc parameter as used in seismic unix.  Uses the percentage
    point specified of the sorted abs of all amplitudes.  (Not perc=50.0 is
    identical to MAD)  Default is 95.0 which is 2 sigma for Gaussian noise.

    :param optional_metrics: is an iterable container containing one or more
    of the optional snr metrics discussed above.

    :param store_as_subdocument:  This parameter is included for
    flexibility but should not normally be changed by the user.  As noted
    earlier the outputs of this function are best abstracted as Metadata.
    When this parameter is False the Metadata members are all posted with
    directly to data_object's Metadata container.  If set True the
    internally generated python dict is copied and stored with a key
    defined through the subdocument_key argument.  See use below in
    function arrival_snr.

    :param subdocument_key:  key for storing results as a subdocument.
    This parameter is ignored unless store_as_subdocument is True.
    Default is "snr_data"

    :param save_spectra:   If set True (default is False) the function
    will pickle the computed noise and signal spectra and save the
    strings created along with a set of related metadata defining the
    time range to the output python dict (these will be saved in MongoDB
    when db is defined - see below).   This option should ONLY be used
    for spot checking, discovery of why an snr metric has unexpected
    results using graphics, or a research topic where the spectra would
    be of interest.  It is a very bad idea to turn this option on if
    you are processing a large quantity of data and saving the results
    to MongoDB as it will bloat the arrival collection.  Consider a
    different strategy if that essential for your work.

    :return:  python tuple with two components.  0 is a python dict with
    the computed metrics associated with keys defined above.  1 is a
    mspass.ccore.ErrorLogger object. Any errors in computng any of the
    metrics will be posted to this logger.  Users should then test this
    object using it's size() method and if it the log is not empty (size >0)
    the caller should handle that condition.   For normal use that means
    pushing any messages the log contains to the original data object's
    error log.
    """
    algname = "FN_snr_estimator"
    my_logger = ErrorLogger()
    # For this algorithm we dogmatically demand the input be a TimeSeries
    if not isinstance(data_object, TimeSeries):
        raise MsPASSError(
            "FD_snr_estimator:  Received invalid data object - arg0 data must be a TimeSeries",
            ErrorSeverity.Invalid,
        )
    # MTPowerSpectrum at the moment has an issue with how it handles
    # a user error in specifying time-band product and number of tapers.
    # We put in an explicit trap here and abort if the user makes a mistake
    # to avoid a huge spray of error message
    if ntapers > round(2 * tbp):
        message = (
            algname +
            "(Fatal Error):  ntapers={ntapers} inconsistent with tbp={tbp}\n".
            format(ntapers=ntapers, tbp=tbp))
        message += "ntapers must be >= round(2*tbp)"
        raise MsPASSError(message, ErrorSeverity.Fatal)
    if data_object.dead():
        my_logger.log_error(
            algname,
            "Datum received was set dead - cannot compute anything",
            ErrorSeverity.Invalid,
        )
        return [dict(), my_logger]
    # We enclose all the main code here in a try block and cat any MsPASSErrors
    # they will be posted as log message. Others will not be handled
    # intentionally letting python's error mechanism handle them as
    # unexpected exceptions - MsPASSError can be anticipated for data problems
    snrdata = dict()
    try:
        # First extract the required windows and compute the power spectra
        n = WindowData(data_object, noise_window.start, noise_window.end)
        s = WindowData(data_object, signal_window.start, signal_window.end)
        if noise_spectrum_engine:
            nengine = noise_spectrum_engine
        else:
            nengine = MTPowerSpectrumEngine(n.npts, tbp, ntapers)
        if signal_spectrum_engine:
            sengine = signal_spectrum_engine
        else:
            sengine = MTPowerSpectrumEngine(n.npts, tbp, ntapers)
        N = nengine.apply(n)
        S = sengine.apply(s)
        bwd = EstimateBandwidth(S.df, S, N, band_cutoff_snr, tbp,
                                high_frequency_search_start)
        # These estimates are always computed and posted
        snrdata["low_f_band_edge"] = bwd.low_edge_f
        snrdata["high_f_band_edge"] = bwd.high_edge_f
        snrdata["low_f_band_edge_snr"] = bwd.low_edge_snr
        snrdata["high_f_band_edge_snr"] = bwd.high_edge_snr
        snrdata["spectrum_frequency_range"] = bwd.f_range
        snrdata["bandwidth_fraction"] = bwd.bandwidth_fraction()
        snrdata["bandwidth"] = bwd.bandwidth()
        if save_spectra:
            snrdata["signal_spectrum"] = pickle.dumps(S)
            snrdata["noise_spectrum"] = pickle.dumps(N)
            snrdata["signal_window_start_time"] = signal_window.start
            snrdata["signal_window_end_time"] = signal_window.end
            snrdata["noise_window_start_time"] = noise_window.start
            snrdata["noise_window_end_time"] = noise_window.end

    except MsPASSError as err:
        newmessage = _reformat_mspass_error(
            err,
            "Spectrum calculation and EstimateBandwidth function section failed with the following message\n",
            "No SNR metrics can be computed for this datum",
        )
        my_logger.log_error(algname, newmessage, ErrorSeverity.Invalid)
        return [snrdata, my_logger]

    # For current implementation all the optional metrics require
    # computed a filtered version of the data.  If a new option is
    # desired that does not require filtering the data the logic
    # here will need to be changed to create a more exclusive test

    if len(optional_metrics) > 0:
        # use the mspass butterworth filter for speed - obspy
        # version requires a conversion to Trace objects
        BWfilt = Butterworth(
            False,
            True,
            True,
            poles,
            bwd.low_edge_f,
            poles,
            bwd.high_edge_f,
            data_object.dt,
        )
        filtered_data = TimeSeries(data_object)
        BWfilt.apply(filtered_data)
        nfilt = WindowData(filtered_data, noise_window.start, noise_window.end)
        sfilt = WindowData(filtered_data, signal_window.start,
                           signal_window.end)
        # In this implementation we don't need this any longer so we
        # delete it here.  If options are added beware
        del filtered_data
        # Some minor efficiency would be possible if we avoided
        # duplication of computations when multiple optional metrics
        # are requested, but the fragility that adds to maintenance
        # is not justified
        for metric in optional_metrics:
            if metric == "snr_stats":
                try:
                    stats = BandwidthStatistics(S, N, bwd)
                    # stats is a Metadata container - copy to snrdata
                    for k in stats.keys():
                        snrdata[k] = stats[k]
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "BandwithStatistics throw the following error\n",
                        "Five snr_stats attributes were not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)
            if metric == "filtered_envelope":
                try:
                    analytic_nfilt = hilbert(nfilt.data)
                    analytic_sfilt = hilbert(sfilt.data)
                    nampvector = np.abs(analytic_nfilt)
                    sampvector = np.abs(analytic_sfilt)
                    namp = np.median(nampvector)
                    samp = np.max(sampvector)
                    snrdata[
                        "snr_envelope_Linf_over_L1"] = _safe_snr_calculation(
                            samp, namp)
                except:
                    my_logger.log_erro(
                        algname,
                        "Error computing filtered_envelope metrics:  snr_envelope_Linf_over_L1 not computed",
                        ErrorSeverity.Complaint,
                    )
            if metric == "filtered_L2":
                try:
                    namp = RMSAmplitude(nfilt)
                    samp = RMSAmplitude(sfilt)
                    snrvalue = _safe_snr_calculation(samp, namp)
                    snrdata["snr_L2"] = snrvalue
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "Error computing filtered_L2 metric",
                        "snr_L2 attribute was not compouted",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)

            if metric == "filtered_MAD":
                try:
                    namp = MADAmplitude(nfilt)
                    samp = MADAmplitude(sfilt)
                    snrvalue = _safe_snr_calculation(samp, namp)
                    snrdata["snr_MAD"] = snrvalue
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "Error computing filtered_MAD metric",
                        "snr_MAD attribute was not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)

            if metric == "filtered_Linf":
                try:
                    # the C function expects a fraction - for users a percentage
                    # is clearer
                    namp = PercAmplitude(nfilt, perc / 100.0)
                    samp = PeakAmplitude(sfilt)
                    snrvalue = _safe_snr_calculation(samp, namp)
                    snrdata["snr_Linf"] = snrvalue
                    snrdata["snr_perc"] = perc
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "Error computing filtered_Linf metric",
                        "snr_Linf attribute was not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)

            if metric == "filtered_perc":
                try:
                    namp = MADAmplitude(nfilt)
                    samp = PercAmplitude(sfilt, perc / 100.0)
                    snrvalue = _safe_snr_calculation(samp, namp)
                    snrdata["snr_perc"] = snrvalue
                    snrdata[
                        "snr_perc"] = perc  # redundant if filter_Linf is also run but tiny cost
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "Error computing filtered_perc metric",
                        "snr_perf metric was not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)
            else:
                message = "Illegal optional_metrics keyword=" + metric + "\n"
                message += (
                    "If that is a typo expect some metrics will be missing from output"
                )
                my_logger.log_error(algname, message, ErrorSeverity.Complaint)
    return [snrdata, my_logger]