def WindowData_with_duration(
    d,
    duration,
    t0shift=None,
    object_history=False,
    alg_name="WindowData",
    alg_id=None,
    dryrun=False,
):
    """
    Cut a window of fixed duration starting at the data start time.

    Variant of :func:`WindowData` where the window is defined by a duration
    measured from the start time of the datum (``d.t0``) rather than by
    explicit start/end times.  Any failure kills the datum and posts a
    message to its elog rather than raising, so the function is safe in a
    map operation.

    :param d: input datum; must be a TimeSeries or Seismogram or a
      RuntimeError is raised from inside the try block and handled as an
      error condition.
    :param duration: length of the window in seconds measured from d.t0.
      Negative values are treated as an error; d is killed and returned.
    :param t0shift: optional time shift applied to the cut window
      (e.g. an arrival time to window relative times).  Default None
      means no shift.
    :param object_history: enable object-level history (handled by the
      mspass_func_wrapper decorator).
    :param alg_name: algorithm name for history records.  NOTE: the
      previous default of "scale" looked like a copy-paste slip; changed
      to "WindowData" to match the sibling function's documented default.
    :param alg_id: algorithm id for history records.
    :param dryrun: decorator-handled dry-run flag.
    :return: windowed copy of d, or d killed with elog entries on error.
    """
    if d.dead():
        return d
    if duration < 0:
        detailline = "Window duration: {dur} Data range: {dst},{det}".format(
            dur=duration, dst=d.t0, det=d.endtime()
        )
        d.elog.log_error(
            "WindowData",
            "Duration is a negative number.\n" + detailline,
            ErrorSeverity.Invalid,
        )
        d.kill()
        return d
    # Window spans [t0, t0 + duration].  An earlier version added an
    # unexplained 1 s offset to win_start which also made the range test
    # below unreachable on the low side.
    win_start = d.t0
    win_end = win_start + duration
    twcut = TimeWindow(win_start, win_end)
    if t0shift:
        twcut.shift(t0shift)
    try:
        # This handler duplicates an error test in the WindowData C code but
        # it will be more efficient to handle it here.  Note the test must
        # use the shifted window (twcut), not the pre-shift win_start/win_end,
        # to stay consistent with WindowData.
        if twcut.start < d.t0 or twcut.end > d.endtime():
            detailline = "Window range: {wst},{wet} Data range: {dst},{det}".format(
                wst=twcut.start, wet=twcut.end, dst=d.t0, det=d.endtime()
            )
            d.elog.log_error(
                "WindowData",
                "Data range is smaller than window range\n" + detailline,
                ErrorSeverity.Invalid,
            )
            d.kill()
            return d
        if isinstance(d, TimeSeries):
            return _WindowData(d, twcut)
        elif isinstance(d, Seismogram):
            return _WindowData3C(d, twcut)
        else:
            raise RuntimeError(
                "WindowData: Invalid input data type received=" + str(type(d))
            )
    except MsPASSError as err:
        # post through elog for consistency with the rest of this module
        # (d.log_error is presumably not defined on the data objects)
        d.elog.log_error("WindowData", str(err), ErrorSeverity.Invalid)
        d.kill()
        return d
def nwin(self):
    """
    Return the noise TimeWindow defined in this object's Metadata.

    Reads the window limits from the keys "noise_window_start" and
    "noise_window_end" in self.md.  When noise processing is disabled
    (self.__uses_noise is False) a default-constructed TimeWindow is
    returned so callers always receive an instance.

    :return: TimeWindow instance (always initialized, even if unused).
    """
    if self.__uses_noise:
        tws = self.md.get_double("noise_window_start")
        twe = self.md.get_double("noise_window_end")
        return TimeWindow(tws, twe)
    else:
        # bug fix: original returned the TimeWindow class object itself,
        # not an instance, contradicting the intent of the comment
        # "always initialize even if not used"
        return TimeWindow()
def dwin(self):
    """
    Return the deconvolution data TimeWindow stored in this object's Metadata.

    The window limits are fetched from the keys
    "deconvolution_data_window_start" and "deconvolution_data_window_end".
    """
    return TimeWindow(
        self.md.get_double("deconvolution_data_window_start"),
        self.md.get_double("deconvolution_data_window_end"),
    )
def scale(
    d,
    compute_from_window=False,
    window=None,
    method="peak",
    level=1.0,
    scale_by_section=False,
    use_mean=False,
    object_history=False,
    alg_name="scale",
    alg_id=None,
    dryrun=False,
    function_return_key=None,
):
    """
    Top level function interface to data scaling methods.

    This function can be used to scale seismic data contained in any of the
    four seismic data objects defined in mspass:  TimeSeries, Seismogram,
    TimeSeriesEnsemble, and SeismogramEnsemble.  An extensive set of
    amplitude estimation metrics are available by selecting one of the
    allowed values for the method parameter.  Ensembles can be scaled at
    the individual seismogram level or as a whole (scale_by_section=True).
    Note all methods preserve the original amplitude by updating the
    Metadata parameter calib to include the scaling.  i.e. as always the
    amplitude of the data in physical units can be restored by multiplying
    the data samples by calib.

    :param d: is input data object.  If not one of the four mspass seismic
      data types noted above the function will throw a RuntimeError exception.
    :param compute_from_window: boolean used to compute amplitude and scale
      based on a windowed section of the input waveform.  By default (this
      boolean False) the amplitude for scaling is computed from the full
      waveform.  When True the window argument must contain a valid
      TimeWindow that spans a time range smaller than the data range.  If
      the window is inconsistent with the data range the return will be
      marked dead and messages posted to elog.  For ensembles all or only a
      portion of the group will be killed if this happens.  This parameter
      is also ignored when scale_by_section is true.
    :param window: mspass TimeWindow object defining the time range over
      which the amplitude for scaling is computed (see compute_from_window).
    :param method: string defining the gain method to use.  Currently
      supported values are:  peak, RMS (rms accepted), perc, and MAD (also
      accepts mad or Mad).  peak uses the largest amplitude for scaling
      (vector amplitude for 3C data, largest absolute value for scalar
      data).  rms is the standard rms measure, although for 3C data it is
      rms vector amplitude so scaling is by the number of vectors not the
      total number of samples.  perc uses a sorted percentage clip level
      metric as used in seismic unix.  mad is the median absolute deviation,
      which is actually the same as perc=1/2.  Default is peak.  WARNING:
      if an invalid value for method is passed the data will be returned
      unaltered with a complaint message posted for every datum run that way.
    :param level: For all but perc this defines the scale to which the data
      are scaled.  For perc it is used to set the percent clip level.  If
      the value passed is illegal (0 or negative, and for perc also > 1)
      a complaint message is posted to elog and the level is reset to 1.0.
    :param scale_by_section: boolean that controls the scaling behavior on
      ensembles only (silently ignored for atomic TimeSeries and Seismogram
      data).  When true a single gain factor is applied to all members of an
      ensemble.  When false each member is gained individually as if this
      function were applied in a loop to each member.
    :param use_mean: boolean used only for ensembles and when
      scale_by_section is True.  The section-scaling algorithm can use the
      mean log amplitude instead of the default median amplitude.
    :return: Data scaled to specified level.  The scaling always preserves
      absolute amplitude by adjusting the value of the calib attribute of
      the return so calib*data is the same before and after the scaling.
    :rtype: same as input
    """
    if isinstance(d, TimeSeries) or isinstance(d, Seismogram):
        if d.dead():
            return d
    # First validate arguments.
    # The logic here would be much cleaner if ensembles had an elog
    # attribute; that may happen as group discussions have proposed that
    # change.  This should be simplified if elog is added to ensemble objects.
    if method not in ("peak", "RMS", "rms", "perc", "MAD", "mad"):
        message = (
            "method parameter passed = "
            + method
            + " is not valid. Should be peak, rms, perc, or mad\nContinuing with no change to data"
        )
        if isinstance(d, (TimeSeriesEnsemble, SeismogramEnsemble)):
            ensemble_error_post(d, alg_name, message, ErrorSeverity.Complaint)
        else:
            # This could cause an abort if the input is not one of the four
            # stock data types but that is ok as that is an obvious usage
            # error and should be a fatal error
            d.elog.log_error(
                alg_name,
                "method parameter passed = "
                + method
                + " is not valid. "
                + "Should be peak, rms, perc, or mad",
                ErrorSeverity.Complaint,
            )
        return d
    if method == "perc":
        if level <= 0 or level > 1.0:
            message = "perc scaling method given illegal value={plevel}\nDefaulted to 1.0".format(
                plevel=level
            )
            if isinstance(d, (TimeSeriesEnsemble, SeismogramEnsemble)):
                ensemble_error_post(d, alg_name, message, ErrorSeverity.Complaint)
            else:
                d.elog.log_error(alg_name, message, ErrorSeverity.Complaint)
            level = 1.0
    else:
        if level <= 0.0:
            message = "{meth} scaling method given illegal value={slevel}\nDefaulted to 1.0".format(
                meth=method, slevel=level
            )
            if isinstance(d, (TimeSeriesEnsemble, SeismogramEnsemble)):
                ensemble_error_post(d, alg_name, message, ErrorSeverity.Complaint)
            else:
                d.elog.log_error(alg_name, message, ErrorSeverity.Complaint)
            level = 1.0
    if compute_from_window:
        if isinstance(window, TimeWindow):
            ampwin = window
        else:
            message = "optional window parameter set but value is not a TimeWindow object\nReverting to unwindowed estimate"
            if isinstance(d, (TimeSeriesEnsemble, SeismogramEnsemble)):
                ensemble_error_post(d, alg_name, message, ErrorSeverity.Complaint)
            else:
                d.elog.log_error(alg_name, message, ErrorSeverity.Complaint)
            # this is an invalid window because start>end is used as a signal
            # in WindowData to use the entire waveform.  It switches
            # automatically without logging an error (currently).  Definitely
            # a little weird but this hides that detail for python users
            ampwin = TimeWindow(0.0, -1.0)
    else:
        ampwin = TimeWindow(0.0, -1.0)
    # The pybind11 and C++ way of defining an enum class creates an
    # obnoxiously ugly syntax.  We insulate the user from this oddity by
    # mapping the string arg to the enum passed to _scale
    method_to_use = ScalingMethod.Peak
    if method == "rms" or method == "RMS":
        method_to_use = ScalingMethod.RMS
    elif method == "perc":
        method_to_use = ScalingMethod.perc
    elif method == "MAD" or method == "mad":
        method_to_use = ScalingMethod.MAD
    try:
        # Note this logic depends on an oddity of the C++ api in the
        # functions called with the _scale binding name.  When the TimeWindow
        # is invalid the entire data range is silently used - not viewed as
        # an error.  Hence, the following works only because the logic above
        # set ampwin appropriately.
        if isinstance(d, TimeSeries) or isinstance(d, Seismogram):
            amp = _scale(d, method_to_use, level, ampwin)
            _post_amplitude(d, method_to_use, amp)
        elif isinstance(d, (TimeSeriesEnsemble, SeismogramEnsemble)):
            if len(d.member) <= 0:
                # Silently return unchanged if the ensemble is empty
                return d
            if scale_by_section:
                amp = _scale_ensemble(d, method_to_use, level, use_mean)
                # We post the amplitude to the ensemble's metadata in this case
                _post_amplitude(d, method_to_use, amp)
            else:
                ampvec = _scale_ensemble_members(d, method_to_use, level, ampwin)
                for i, x in enumerate(d.member):
                    if x.live:
                        _post_amplitude(x, method_to_use, ampvec[i])
        else:
            raise MsPASSError(
                "scale: input data is not a supported mspass seismic data type",
                "Fatal",
            )
        return d
    except MsPASSError as err:
        if isinstance(d, Seismogram) or isinstance(d, TimeSeries):
            d.elog.log_error(alg_name, str(err), ErrorSeverity.Invalid)
            d.kill()
        # avoid further isinstance at the expense of a maintenance issue.
        # if we add any other supported data objects we could have a
        # problem here.  This assumes what lands here is an ensemble
        else:
            # bug fix: original passed the raw exception with no severity
            # argument, inconsistent with every other ensemble_error_post call
            ensemble_error_post(d, alg_name, str(err), ErrorSeverity.Invalid)
            for x in d.member:
                x.kill()
        return d
    # this is needed to handle an oddity recommended on this web site:
    # http://effbot.org/zone/stupid-exceptions-keyboardinterrupt.htm
    except KeyboardInterrupt:
        raise
    except:
        message = "Something threw an unexpected exception\nThat is a bug that needs to be fixed - contact authors"
        if isinstance(d, Seismogram) or isinstance(d, TimeSeries):
            d.elog.log_error(alg_name, message, ErrorSeverity.Invalid)
        else:
            ensemble_error_post(d, alg_name, message, ErrorSeverity.Invalid)
        # bug fix: the original fell off the end of this branch and
        # implicitly returned None, which would break map workflows
        return d
def WindowData(
    d,
    win_start,
    win_end,
    t0shift=None,
    object_history=False,
    alg_name="WindowData",
    alg_id=None,
    dryrun=False,
):
    """
    Cut data defined by a TimeWindow object.

    Cutting a smaller waveform segment from a larger waveform segment is a
    very common seismic data processing task.  The function is a python
    wrapper to adapt C++ code that accomplishes that task to the MsPASS
    framework.

    Note this function uses one model for being bombproof with a map
    operation.  Any exception that makes the result invalid will cause an
    error message to be posted to the elog attribute of the input datum AND
    the data will be marked dead (killed).

    :param d: is the input data.  d must be either a
      :class:`mspasspy.ccore.seismic.TimeSeries` or
      :class:`mspasspy.ccore.seismic.Seismogram` object or the function will
      log an error to d and return the datum killed.
    :param win_start: defines the start of the TimeWindow to be cut
    :type win_start: :class:`float`
    :param win_end: defines the end of the TimeWindow to be cut
    :type win_end: :class:`float`
    :param t0shift: is an optional time shift to apply to the time window.
      This parameter is convenient to avoid conversions to relative time.
      A typical example would be to set t0shift to an arrival time and let
      the window define time relative to that arrival time.  Default is
      None which causes the function to use the window values directly.
    :param object_history: boolean to enable or disable saving object
      level history.  Default is False.  Note this functionality is
      implemented via the mspass_func_wrapper decorator.
    :param alg_name: When history is enabled this is the algorithm name
      assigned to the stamp for applying this algorithm.  Default
      ("WindowData") should normally be used.  (The previous code default
      of "scale" contradicted this documentation and looked like a
      copy-paste slip; it has been corrected.)  Note this functionality is
      implemented via the mspass_func_wrapper decorator.
    :param alg_id: algorithm id to assign to history record (used only if
      object_history is set True.)  Note this functionality is implemented
      via the mspass_func_wrapper decorator.
    :param dryrun: Note this functionality is implemented via the
      mspass_func_wrapper decorator.
    :return: copy of d with sample range reduced to the window range.
      Returns the input killed with elog entries if the window is invalid.
    """
    if d.dead():
        return d
    twcut = TimeWindow(win_start, win_end)
    if t0shift:
        twcut.shift(t0shift)
    try:
        # This handler duplicates an error test in the WindowData C code but
        # it will be more efficient to handle it here.
        if twcut.start < d.t0 or twcut.end > d.endtime():
            detailline = "Window range: {wst},{wet} Data range: {dst},{det}".format(
                wst=twcut.start, wet=twcut.end, dst=d.t0, det=d.endtime()
            )
            d.elog.log_error(
                "WindowData",
                "Data range is smaller than window range\n" + detailline,
                ErrorSeverity.Invalid,
            )
            d.kill()
            return d
        if isinstance(d, TimeSeries):
            return _WindowData(d, twcut)
        elif isinstance(d, Seismogram):
            return _WindowData3C(d, twcut)
        else:
            raise RuntimeError(
                "WindowData: Invalid input data type received=" + str(type(d))
            )
    except MsPASSError as err:
        # post through elog for consistency with the rest of this module
        # (the original called d.log_error, which is presumably not defined
        # on the data objects - every other handler uses d.elog.log_error)
        d.elog.log_error("WindowData", str(err), ErrorSeverity.Invalid)
        d.kill()
        return d
def arrival_snr_QC(
    data_object,
    noise_window=TimeWindow(-130.0, -5.0),
    noise_spectrum_engine=None,
    signal_window=TimeWindow(-5.0, 120.0),
    signal_spectrum_engine=None,
    band_cutoff_snr=2.0,
    # check these are reasonable - don't remember the formula when writing this
    tbp=5.0,
    ntapers=10,
    high_frequency_search_start=5.0,
    poles=3,
    perc=95.0,
    phase_name="P",
    metadata_key="Parrival",
    optional_metrics=[
        "snr_stats",
        "filtered_envelope",
        "filtered_L2",
        "filtered_Linf",
        "filtered_MAD",
        "filtered_perc",
    ],
    save_spectra=False,
    db=None,
    collection="arrival",
    use_measured_arrival_time=False,
    measured_arrival_time_key="Ptime",
    taup_model=None,
    update_mode=False,
    component=2,
    source_collection="source",
    receiver_collection=None,
):
    """
    Compute a series of metrics that can be used for quality control
    filtering of seismic phase data.

    This is the highest level function in this module for computing
    signal-to-noise ratio metrics for processing signals that can be
    defined by a computable or measurable "phase".  Features this function
    adds over lower level functions in this module are:

    1.  An option to save computed metrics to a MongoDB collection
        (defaults as "arrival").  If the update_mode argument is set True
        (default is False) the function expects the data_object to contain
        the attribute "arrival_id" that references the ObjectID of an
        existing entry in the collection where the data this function
        computes is to be saved (default is "arrival").
    2.  An option to use a computed or measured arrival as the time
        reference for all windowing.  The lower level snr functions in this
        module require the user to do what this function does prior to
        calling them.  Note one or the other is required (i.e. either a
        computed or measured time will define t0 of the processing).

    The input of arg 0 (data_object) can be either a TimeSeries or a
    Seismogram object.  If a Seismogram object is passed the "component"
    argument is used to extract the specified single channel and that
    component is used for processing, because all the algorithms used are
    single channel algorithms.  To use this function on all components use
    a loop over components BUT make sure you use a unique value for the
    argument "metadata_key" for each component.  Note this will also
    produce multiple documents per input datum.

    The type of the data_object also has a more subtle implication the
    user must be aware of:  in the MsPASS schema receiver coordinates live
    in "channel" for TimeSeries data and "site" for Seismogram data, so
    coordinate keys are fetched as channel_lat or site_lat (etc.)
    depending on the input type.

    The following args are passed directly to the function arrival_snr:
    noise_window, signal_window, band_cutoff_snr, tbp, ntapers, poles,
    perc, phase_name, metadata_key, and optional_metrics.  See the
    docstrings for arrival_snr and FD_snr_estimator for descriptions of
    how these arguments should be used.  This top level function adds the
    arguments described below.

    :param db: mspass Database object that is used as a handle to MongoDB.
      Default is None, which the function takes to mean you don't want to
      save the computed values to MongoDB.  In this mode the computed
      metrics will all be posted to a python dict stored under the key
      defined by the "metadata_key" argument.  When db is defined the
      contents of that same python dict will be saved to MongoDB in the
      collection defined by the "collection" argument.  If db is left as
      the default None the user is responsible for saving and managing the
      computed snr data.
    :param collection: MongoDB collection name where the results of this
      function will be saved.  If the "update_mode" argument is also set
      True the update section will reference this collection.
      Default is "arrival".
    :param use_measured_arrival_time: boolean defining the method used to
      define the time reference for windowing used for snr calculations.
      When True the function will attempt to fetch a phase arrival time
      with the key defined by the "measured_arrival_time_key" argument.
      In that mode if the fetch fails the data_object will be killed and
      an error posted to elog.  That somewhat brutal choice was
      intentional:  if you want to use measured arrival times you don't
      want data where there are no picks.  The default is False, in which
      case the taup_model argument MUST be defined (the docstring
      previously said the default was True; the signature default is and
      remains False).
    :param measured_arrival_time_key: is the key used to fetch a measured
      arrival time.  Ignored if use_measured_arrival_time is False.
    :param taup_model: when use_measured_arrival_time is False this
      argument is required.  It defaults as None because there is no way
      the author knows to initialize it to anything valid.  If set it MUST
      be an instance of the obspy class TauPyModel.  Mistakes in use of
      this argument can cause a MsPASSError exception to be thrown (not
      logged - thrown as a fatal error) in one of two ways:  (1) if
      use_measured_arrival_time is False this argument must be defined,
      and (2) if it is defined it MUST be an instance of TauPyModel.
    :param update_mode: When True the function will attempt to extract a
      MongoDB ObjectID from data_object's Metadata using the (currently
      fixed) key "arrival_id".  If found it will add the computed data to
      the existing document in the collection defined by the collection
      argument.  Otherwise it will simply add a new entry and post the
      ObjectID of the new document with the (same fixed) key arrival_id.
      When False no attempt to fetch the arrival id is made and we simply
      add a record.  Completely ignored unless db defines a valid
      Database class.
    :param component: integer (0, 1, or 2) defining which component of a
      Seismogram object to use to compute the requested snr metrics.
      Ignored if the input is a TimeSeries.
    :param source_collection: normalization collection for source data.
      The default is the MsPASS name "source" which means the function
      will try to load the source hypocenter coordinates (when required)
      as source_lat, source_lon, source_depth, and source_time.
    :param receiver_collection: when set this name will override the
      automatic setting of the expected normalization collection naming
      for receiver coordinates (see above).  The default is None which
      causes the automatic switching to be invoked.
    :return: the data_object modified by insertion of the snr QC data in
      the object's Metadata
    """
    if data_object.dead():
        return data_object
    if isinstance(data_object, TimeSeries):
        # We need to make a copy of a TimeSeries object to assure the only
        # thing we change is the Metadata we add to the return
        data_to_process = TimeSeries(data_object)
        if receiver_collection:
            rcol = receiver_collection
        else:
            rcol = "channel"
    elif isinstance(data_object, Seismogram):
        if component < 0 or component > 2:
            raise MsPASSError(
                "arrival_snr_QC: usage error. "
                + "component parameter passed with illegal value={n}\n".format(
                    n=component
                )
                + "Must be 0, 1, or 2",
                ErrorSeverity.Fatal,
            )
        data_to_process = ExtractComponent(data_object, component)
        if receiver_collection:
            rcol = receiver_collection
        else:
            rcol = "site"
    else:
        raise MsPASSError(
            "arrival_snr_QC: received invalid input data\n"
            + "Input must be either TimeSeries or a Seismogram object",
            ErrorSeverity.Fatal,
        )
    if use_measured_arrival_time:
        arrival_time = data_object[measured_arrival_time_key]
    else:
        # This test is essential or python will throw a more obscure,
        # generic exception
        if taup_model is None:
            raise MsPASSError(
                "arrival_snr_QC: usage error. "
                + "taup_model parameter is set None but use_measured_arrival_time is False\n"
                + "This gives no way to define processing windows. See docstring",
                ErrorSeverity.Fatal,
            )
        source_lat = data_object[source_collection + "_lat"]
        source_lon = data_object[source_collection + "_lon"]
        source_depth = data_object[source_collection + "_depth"]
        source_time = data_object[source_collection + "_time"]
        receiver_lat = data_object[rcol + "_lat"]
        receiver_lon = data_object[rcol + "_lon"]
        delta = locations2degrees(source_lat, source_lon, receiver_lat, receiver_lon)
        arrival = taup_model.get_travel_times(
            source_depth_in_km=source_depth,
            distance_in_degree=delta,
            phase_list=[phase_name],
        )
        arrival_time = source_time + arrival[0].time
        taup_arrival_phase = arrival[0].phase.name
        # not sure if this will happen but worth trapping it as a warning
        # if it does
        if phase_name != taup_arrival_phase:
            data_object.elog.log_error(
                "arrival_snr_QC",
                "Requested phase name="
                + phase_name
                + " does not match phase name tag returned by obpsy taup calculator="
                + taup_arrival_phase,
                # bug fix: was the string "Complaint"; log_error expects
                # an ErrorSeverity enum value
                ErrorSeverity.Complaint,
            )
    if data_to_process.time_is_UTC():
        data_to_process.ator(arrival_time)
    [snrdata, elog] = FD_snr_estimator(
        data_to_process,
        noise_window,
        noise_spectrum_engine,
        signal_window,
        signal_spectrum_engine,
        band_cutoff_snr,
        tbp,
        ntapers,
        high_frequency_search_start,
        poles,
        perc,
        optional_metrics,
        save_spectra=save_spectra,
    )
    if elog.size() > 0:
        data_object.elog += elog
    snrdata["phase"] = phase_name
    snrdata["snr_arrival_time"] = arrival_time
    snrdata["snr_signal_window_start"] = arrival_time + signal_window.start
    snrdata["snr_signal_window_end"] = arrival_time + signal_window.end
    snrdata["snr_noise_window_start"] = arrival_time + noise_window.start
    snrdata["snr_noise_window_end"] = arrival_time + noise_window.end
    # These cross-referencing keys may not always be defined when a phase
    # time is based on a pick so we add these cautiously
    scol_id_key = source_collection + "_id"
    rcol_id_key = rcol + "_id"
    if data_object.is_defined(scol_id_key):
        snrdata[scol_id_key] = data_object[scol_id_key]
    if data_object.is_defined(rcol_id_key):
        snrdata[rcol_id_key] = data_object[rcol_id_key]
    # Note we add this result to data_object NOT data_to_process because
    # that is not always the same thing - for a TimeSeries input it is a
    # copy of the original but it may have been altered while for a
    # Seismogram it is an extracted component
    data_object[metadata_key] = snrdata
    if db:
        arrival_id_key = collection + "_id"
        dbcol = db[collection]
        if update_mode:
            if data_object.is_defined(arrival_id_key):
                arrival_id = data_object[arrival_id_key]
                filt = {"_id": arrival_id}
                update_clause = {"$set": snrdata}
                dbcol.update_one(filt, update_clause)
            else:
                data_object.elog.log_error(
                    "arrival_snr_QC",
                    "Running in update mode but arrival id key="
                    + arrival_id_key
                    + " is not defined\n"
                    + "Inserting computed snr data as a new document in collection="
                    + collection,
                    # bug fix: was the string "Complaint"; log_error expects
                    # an ErrorSeverity enum value
                    ErrorSeverity.Complaint,
                )
                arrival_id = dbcol.insert_one(snrdata).inserted_id
                data_object[arrival_id_key] = arrival_id
        else:
            arrival_id = dbcol.insert_one(snrdata).inserted_id
            data_object[arrival_id_key] = arrival_id
    return data_object
def arrival_snr(
    data_object,
    noise_window=TimeWindow(-130.0, -5.0),
    noise_spectrum_engine=None,
    signal_window=TimeWindow(-5.0, 120.0),
    signal_spectrum_engine=None,
    band_cutoff_snr=2.0,
    # check these are reasonable - don't remember the formula when writing this
    tbp=5.0,
    ntapers=10,
    high_frequency_search_start=5.0,
    poles=3,
    perc=95.0,
    save_spectra=False,
    phase_name="P",
    metadata_key="Parrival",
    optional_metrics=[
        "snr_stats",
        "filtered_envelope",
        "filtered_L2",
        "filtered_Linf",
        "filtered_MAD",
        "filtered_perc",
    ],
):
    """
    Specialization of FD_snr_estimator for phase-windowed snr estimates.

    A common situation where snr is a critical thing to estimate is data
    windowed around a given seismic phase.  This function freezes the more
    generic FD_snr_estimator into a structure appropriate for measuring
    snr of a particular phase:  it always stores the results as a
    subdocument (python dict) keyed by the name given in the metadata_key
    argument.  The idea is that the subdocument contents are readily
    extracted and saved to a MongoDB "arrival" collection with a
    normalization key, so arrival-driven workflows can use queries to
    reduce the number of data actually retrieved for final processing -
    i.e. the arrival collection data should be viewed as a useful initial
    quality control feature.

    Most parameters are described in detail in the docstring for
    FD_snr_estimator; the following are added by this specialization:

    :param phase_name: name tag for the seismic phase being analyzed.
      Saved to the output subdocument with the key "phase".  Default "P".
    :param metadata_key: string key under which the subdocument (python
      dict) created internally is stored.  Default is "Parrival".  If
      multiple phases are analyzed in one workflow, give each a distinct
      key (e.g. "PParrival" for PP).
    :return: a copy of data_object with the results stored under the key
      defined by the metadata_key argument.
    """
    if data_object.dead():
        return data_object
    metrics, err_log = FD_snr_estimator(
        data_object,
        noise_window,
        noise_spectrum_engine,
        signal_window,
        signal_spectrum_engine,
        band_cutoff_snr,
        tbp,
        ntapers,
        high_frequency_search_start,
        poles,
        perc,
        optional_metrics,
        save_spectra=save_spectra,
    )
    if err_log.size() > 0:
        data_object.elog += err_log
    metrics["phase"] = phase_name
    data_object[metadata_key] = metrics
    return data_object
def snr(
    data_object,
    noise_window=TimeWindow(-130.0, -5.0),
    signal_window=TimeWindow(-5.0, 120.0),
    noise_metric="mad",
    signal_metric="mad",
    perc=95.0,
):
    """
    Compute time-domain based signal-to-noise ratio with a specified metric.

    Signal-to-noise ratio is a fundamental measurement in all forms of
    seismic data processing.  There is, however, not a single unified
    metric that is ideal for all types of signals one may want to analyze.
    One class of metrics uses time-domain measures of amplitude in a
    signal and noise window cut from a single waveform segment.  A type
    example is snr of some particular "seismic phase" (P, S, PP, ScS,
    etc.) relative to some measure of background noise, e.g. for P phases
    it is nearly universal to estimate snr from a window defined by the
    arrival time of P and a noise window before P arrives (pre-event
    noise).

    This function provides a generic api to measure a large range of
    metrics using one of four choices for measuring the norm of the data
    in the signal and noise windows:

    1.  rms - L2 norm
    2.  mad - median absolute difference, which is essentially the median
        amplitude in this context
    3.  perc - percentage norm ala seismic unix.  perc is defined as the
        amplitude level where perc percent of the data have an amplitude
        smaller than this value.  It is computed by ranking (sorting) the
        data, computing the count of that percentage relative to the
        number of amplitude samples, and returning the amplitude of the
        nearest value to that position in the ranked data.
    4.  peak - is the peak value which in linear algebra is the
        L infinity norm

    Note the user can specify a different norm for the signal and noise
    windows.  The perc metric requires specifying what percentage level
    to use.

    This function will throw a MsPASSError exception if the window
    parameters do not define a time period inside the range of the
    data_object.  You will need a custom function if the model of windows
    inside a larger waveform segment does not match your data.

    There is one final detail about an snr calculation that we handle
    carefully.  With simulation data it is very common to have error free
    simulations where the "noise" window one would use with real data is
    all zeros.  An snr calculated naively in that situation would return
    inf or NaN depending on some picky details; neither is good as either
    can cause downstream problems.  For that reason we trap any condition
    where the noise amplitude measure is computed as zero.  If the signal
    amplitude is also zero we return -1.0; otherwise we return a large,
    constant, positive number.  Neither condition throws, as that
    condition is considered somewhat to be anticipated.

    :param data_object: MsPASS atomic data object (TimeSeries or
      Seismogram) to use for computing the snr.  Note that for Seismogram
      objects the metrics always use L2 measures of amplitude of each
      sample (i.e. vector amplitudes).  If snr for components of a
      Seismogram are desired use ExtractComponent and apply this function
      to each component separately.
    :param noise_window: TimeWindow object defining the time range to
      extract from data_object as the noise segment.  Times can be
      absolute or relative.  Default range is -130 to -5, which makes
      sense only as time relative to some phase arrival time.
      (The previous docstring had the noise and signal defaults swapped.)
    :param signal_window: TimeWindow object defining the time range to
      extract from data_object as the signal segment for the required
      amplitude measure.  Default of -5 to 120 is consistent with the
      default noise window and assumes time relative to a phase arrival
      time.  For absolute times each call to this function may need its
      own time window.
    :param noise_metric: string defining one of the four metrics defined
      above ('mad', 'peak', 'perc' or 'rms') to use for the noise window
      measurement.
    :param signal_metric: string defining one of the four metrics defined
      above ('mad', 'peak', 'perc' or 'rms') to use for the signal window
      measurement.
    :return: estimated signal-to-noise ratio as a single float.  Note the
      special returns noted above for any situation where the noise
      window amplitude is 0.
    """
    if _window_invalid(data_object, noise_window):
        raise MsPASSError(
            "snr: noise_window [{wstart} - {wend}] is outside input data range".format(
                wstart=noise_window.start, wend=noise_window.end
            ),
            ErrorSeverity.Invalid,
        )
    if _window_invalid(data_object, signal_window):
        # bug fix: the original message here said "noise_window" and
        # formatted with noise_window.start/end - a copy-paste error that
        # reported the wrong window when the signal window was invalid
        raise MsPASSError(
            "snr: signal_window [{wstart} - {wend}] is outside input data range".format(
                wstart=signal_window.start, wend=signal_window.end
            ),
            ErrorSeverity.Invalid,
        )
    n = WindowData(data_object, noise_window.start, noise_window.end)
    s = WindowData(data_object, signal_window.start, signal_window.end)
    namp = _window_amplitude(n, noise_metric, "noise_metric", perc)
    samp = _window_amplitude(s, signal_metric, "signal_metric", perc)
    return _safe_snr_calculation(samp, namp)


def _window_amplitude(segment, metric, argname, perc):
    # Dispatch the named amplitude metric for a windowed segment; argname
    # is only used to build the error message when metric is invalid.
    if metric == "rms":
        return RMSAmplitude(segment)
    if metric == "mad":
        return MADAmplitude(segment)
    if metric == "peak":
        return PeakAmplitude(segment)
    if metric == "perc":
        return PercAmplitude(segment, perc)
    raise MsPASSError(
        "snr: Illegal " + argname + " argument = " + metric,
        ErrorSeverity.Invalid,
    )
def FD_snr_estimator(
    data_object,
    noise_window=TimeWindow(-130.0, -5.0),
    noise_spectrum_engine=None,
    signal_window=TimeWindow(-5.0, 120.0),
    signal_spectrum_engine=None,
    band_cutoff_snr=2.0,
    # check these are reasonable - don't remember the formula when writing this
    tbp=2.5,
    ntapers=4,
    high_frequency_search_start=5.0,
    poles=3,
    perc=95.0,
    optional_metrics=None,
    save_spectra=False,
):  # optional_metrics=['snr_stats','filtered_envelope','filtered_L2','filtered_Linf','filtered_MAD','filtered_perc']
    """
    Estimates one or more metrics of signal-to-noise from a TimeSeries object.

    An implicit assumption is that the analysis is centered on a timeable
    "phase" like P, PP, etc.

    This is a python function that can be used to compute one or several
    signal-to-noise ratio estimates based on an estimated bandwidth using
    the C++ function EstimateBandwidth.  The function has a fair number of
    options, but the core metrics computed are the bandwidth estimates
    computed by that function.  It uses a fairly simple search algorithm
    that functions well for most earthquake sources.  For the low end the
    algorithm searches from the first frequency indistinguishable from DC
    to find the lowest frequency for which the snr exceeds a threshold
    specified by the input parameter 'band_cutoff_snr'.  It does a similar
    search from the high end from a point 80% of Nyquist - a good choice
    for all modern digital data that use FIR antialias filters.  Both
    searches are not defined with just the first frequency to satisfy the
    snr threshold criteria.  Only when a group of frequencies more than
    2 times the time-bandwidth product exceed the threshold is the band
    edge defined.  The actual band edge is then defined as the first
    frequency exceeding the threshold.  That more elaborate algorithm was
    used to prevent pure lines in either the signal or noise spectrum from
    corrupting the estimates.

    A set of optional metrics can be computed.  All optional metrics use
    the bandwidth estimates in one way or another.  Optional metrics are
    defined by the following keywords passed through a list (actually any
    iterable container will work) of strings defining one or more of the
    keywords.  The metrics and a brief description of each follow:

    *snr_stats* computes what are commonly plotted in box plots for the
    snr estimates within the estimated bandwidth:  minimum, maximum, 0.25
    (1/4) point, 0.75 (3/4) point, and the median.  These are set with the
    following dict keys:  'snr_band_maximum', 'snr_band_minimum',
    'snr_band_1/4', 'srn_band_3/4', and 'snr_band_median' respectively.

    *filtered_envelope*, *filtered_L2*, *filtered_Linf*, *filtered_perc*,
    and *filtered_MAD*:  All of these optional metrics first copy the
    data_object and then filter the copy with a Butterworth bandpass
    filter with the number of poles specified by the poles argument and
    corners at the band edges estimated by the EstimateBandwidth function.
    The metrics computed are time domain snr estimates computed with the
    filtered data.  They are actually computed from functions in this same
    module that can be used independently and have their own docstring
    description.  The functions called have the following names in order
    of the keyword list above:  *snr_envelope*, *snr_L2*, *snr_Linf*, and
    *snr_MAD*.  When computed they are set in the output dictionary with
    the following (again in order) keys:  'snr_envelope', 'snr_L2',
    'snr_Linf', and 'snr_MAD'.

    :param data_object: TimeSeries object to be processed. For Seismogram
      objects the assumption is the algorithm would be used for a single
      component (e.g. longitudinal or vertical for a P phase)

    :param noise_window: defines the time window to use for computing the
      spectrum considered noise. The time span can be either relative or
      UTC (absolute) time but we do not check for consistency.  This low
      level function assumes they are consistent.  If not, the calculations
      are nearly guaranteed to fail.  Type must be mspasspy.ccore.TimeWindow.

    :param signal_window: defines the time window to use that defines what
      you consider "the signal".  The time span can be either relative or
      UTC (absolute) time but we do not check for consistency.  This low
      level function assumes they are consistent.  If not, the calculations
      are nearly guaranteed to fail.  Type must be mspasspy.ccore.TimeWindow.

    :param noise_spectrum_engine: is expected to either be a None type or an
      instance of a ccore object called an MTPowerSpectralEngine.  When None
      an instance of MTPowerSpectralEngine is computed for each call to this
      function.  That is a convenience for small jobs or when called with
      data from mixed sample rates and/or variable length time windows.  It
      is very inefficient to use the default approach for processing large
      data sets and really for any use in a map operation with dask or
      spark.  Normal use should be for the user to predefine an
      MTPowerSpectralEngine from the expected window size for a given data
      sample rate and include it in the function call.

    :param signal_spectrum_engine: is the comparable MTPowerSpectralEngine
      to use to compute the signal power spectrum.  Default is None with the
      same caveat as above for the noise_spectrum_engine.

    :param band_cutoff_snr: snr threshold used by EstimateBandwidth to
      define the edges of the usable band.  Default 2.0.

    :param tbp: time-bandwidth product to use for computing the set of
      Slepian functions used for the multitaper estimator.  This parameter
      is used only if the noise_spectrum_engine or signal_spectrum_engine
      arguments are set as None.  The default is 2.5.

    :param ntapers: is the number of Slepian functions (tapers) to compute
      for the multitaper estimators.  Like tbp it is referenced only if
      noise_spectrum_engine or signal_spectrum_engine are set to None.
      Note the function will throw an exception if the ntapers parameter is
      not consistent with the time-bandwidth product.  That is, the maximum
      number of tapers is round(2*tbp-1).  Default is 4 which is consistent
      with default tbp=2.5.

    :param high_frequency_search_start: Used to specify the upper frequency
      used to start the search for the upper end of the bandwidth by the
      function EstimateBandwidth.  Default is 5.0 which is reasonable for
      teleseismic P wave data.  Should be changed for usage other than
      analysis of teleseismic P phases or the bandwidth may be grossly
      underestimated.

    :param poles: defines number of poles to use for the Butterworth
      bandpass applied for the "filtered" metrics (see above).  Default is 3.

    :param perc: used only if 'filtered_perc' is in the optional metrics
      list.  Specifies the perc parameter as used in seismic unix.  Uses the
      percentage point specified of the sorted abs of all amplitudes.
      (Note perc=50.0 is identical to MAD)  Default is 95.0 which is 2 sigma
      for Gaussian noise.

    :param optional_metrics: is an iterable container containing one or
      more of the optional snr metrics discussed above.  Default None
      computes only the always-on bandwidth attributes.

    :param save_spectra: If set True (default is False) the function will
      pickle the computed noise and signal spectra and save the strings
      created along with a set of related metadata defining the time range
      to the output python dict (these will be saved in MongoDB when db is
      defined - see below).  This option should ONLY be used for spot
      checking, discovery of why an snr metric has unexpected results using
      graphics, or a research topic where the spectra would be of interest.
      It is a very bad idea to turn this option on if you are processing a
      large quantity of data and saving the results to MongoDB as it will
      bloat the arrival collection.  Consider a different strategy if that
      is essential for your work.

    :return: 2-element python list.  Element 0 is a python dict with the
      computed metrics associated with keys defined above.  Element 1 is a
      mspass.ccore.ErrorLogger object.  Any errors in computing any of the
      metrics will be posted to this logger.  Users should then test this
      object using its size() method and if the log is not empty (size > 0)
      the caller should handle that condition.  For normal use that means
      pushing any messages the log contains to the original data object's
      error log.

    :raises MsPASSError: if data_object is not a TimeSeries (Invalid) or
      if ntapers is inconsistent with tbp (Fatal).
    """
    # fixed typo:  was "FN_snr_estimator", which mislabeled every elog entry
    algname = "FD_snr_estimator"
    my_logger = ErrorLogger()
    # For this algorithm we dogmatically demand the input be a TimeSeries
    if not isinstance(data_object, TimeSeries):
        raise MsPASSError(
            "FD_snr_estimator: Received invalid data object - arg0 data must be a TimeSeries",
            ErrorSeverity.Invalid,
        )
    # MTPowerSpectrum at the moment has an issue with how it handles
    # a user error in specifying time-bandwidth product and number of tapers.
    # We put in an explicit trap here and abort if the user makes a mistake
    # to avoid a huge spray of error messages
    if ntapers > round(2 * tbp):
        message = (
            algname
            + "(Fatal Error): ntapers={ntapers} inconsistent with tbp={tbp}\n".format(
                ntapers=ntapers, tbp=tbp
            )
        )
        # fixed message:  the test above rejects ntapers > round(2*tbp),
        # so the constraint is "<=", not ">=" as the original text claimed
        message += "ntapers must be <= round(2*tbp)"
        raise MsPASSError(message, ErrorSeverity.Fatal)
    if data_object.dead():
        my_logger.log_error(
            algname,
            "Datum received was set dead - cannot compute anything",
            ErrorSeverity.Invalid,
        )
        return [dict(), my_logger]
    # We enclose all the main code here in a try block and catch any
    # MsPASSErrors; they will be posted as log messages.  Others will not be
    # handled, intentionally letting python's error mechanism handle them as
    # unexpected exceptions - MsPASSError can be anticipated for data problems
    snrdata = dict()
    try:
        # First extract the required windows and compute the power spectra
        n = WindowData(data_object, noise_window.start, noise_window.end)
        s = WindowData(data_object, signal_window.start, signal_window.end)
        if noise_spectrum_engine:
            nengine = noise_spectrum_engine
        else:
            nengine = MTPowerSpectrumEngine(n.npts, tbp, ntapers)
        if signal_spectrum_engine:
            sengine = signal_spectrum_engine
        else:
            # BUG FIX:  the original sized this engine with n.npts; the
            # signal engine must match the signal window length (s.npts)
            sengine = MTPowerSpectrumEngine(s.npts, tbp, ntapers)
        N = nengine.apply(n)
        S = sengine.apply(s)
        bwd = EstimateBandwidth(
            S.df, S, N, band_cutoff_snr, tbp, high_frequency_search_start
        )
        # These estimates are always computed and posted
        snrdata["low_f_band_edge"] = bwd.low_edge_f
        snrdata["high_f_band_edge"] = bwd.high_edge_f
        snrdata["low_f_band_edge_snr"] = bwd.low_edge_snr
        snrdata["high_f_band_edge_snr"] = bwd.high_edge_snr
        snrdata["spectrum_frequency_range"] = bwd.f_range
        snrdata["bandwidth_fraction"] = bwd.bandwidth_fraction()
        snrdata["bandwidth"] = bwd.bandwidth()
        if save_spectra:
            snrdata["signal_spectrum"] = pickle.dumps(S)
            snrdata["noise_spectrum"] = pickle.dumps(N)
            snrdata["signal_window_start_time"] = signal_window.start
            snrdata["signal_window_end_time"] = signal_window.end
            snrdata["noise_window_start_time"] = noise_window.start
            snrdata["noise_window_end_time"] = noise_window.end
    except MsPASSError as err:
        newmessage = _reformat_mspass_error(
            err,
            "Spectrum calculation and EstimateBandwidth function section failed with the following message\n",
            "No SNR metrics can be computed for this datum",
        )
        my_logger.log_error(algname, newmessage, ErrorSeverity.Invalid)
        return [snrdata, my_logger]

    # For the current implementation all the optional metrics require
    # computing a filtered version of the data.  If a new option is
    # desired that does not require filtering the data the logic
    # here will need to be changed to create a more exclusive test.
    # BUG FIX:  the default optional_metrics is None, so the original
    # len(optional_metrics) > 0 raised TypeError on a default call; the
    # truthiness test below treats None and an empty container the same.
    if optional_metrics:
        # use the mspass butterworth filter for speed - obspy
        # version requires a conversion to Trace objects
        BWfilt = Butterworth(
            False,
            True,
            True,
            poles,
            bwd.low_edge_f,
            poles,
            bwd.high_edge_f,
            data_object.dt,
        )
        filtered_data = TimeSeries(data_object)
        BWfilt.apply(filtered_data)
        nfilt = WindowData(filtered_data, noise_window.start, noise_window.end)
        sfilt = WindowData(filtered_data, signal_window.start, signal_window.end)
        # In this implementation we don't need this any longer so we
        # delete it here.  If options are added beware
        del filtered_data
        # Some minor efficiency would be possible if we avoided
        # duplication of computations when multiple optional metrics
        # are requested, but the fragility that adds to maintenance
        # is not justified
        for metric in optional_metrics:
            # BUG FIX:  this dispatch was a series of independent if
            # statements with the final else attached only to the last one,
            # so every valid keyword except "filtered_perc" also logged a
            # spurious "Illegal optional_metrics keyword" complaint.  An
            # if/elif chain gives the intended one-branch dispatch.
            if metric == "snr_stats":
                try:
                    stats = BandwidthStatistics(S, N, bwd)
                    # stats is a Metadata container - copy to snrdata
                    for k in stats.keys():
                        snrdata[k] = stats[k]
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "BandwidthStatistics threw the following error\n",
                        "Five snr_stats attributes were not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)
            elif metric == "filtered_envelope":
                try:
                    analytic_nfilt = hilbert(nfilt.data)
                    analytic_sfilt = hilbert(sfilt.data)
                    nampvector = np.abs(analytic_nfilt)
                    sampvector = np.abs(analytic_sfilt)
                    namp = np.median(nampvector)
                    samp = np.max(sampvector)
                    snrdata["snr_envelope_Linf_over_L1"] = _safe_snr_calculation(
                        samp, namp
                    )
                # narrowed from a bare except; also fixed the log_erro ->
                # log_error typo that made this handler raise AttributeError
                except Exception:
                    my_logger.log_error(
                        algname,
                        "Error computing filtered_envelope metrics: snr_envelope_Linf_over_L1 not computed",
                        ErrorSeverity.Complaint,
                    )
            elif metric == "filtered_L2":
                try:
                    namp = RMSAmplitude(nfilt)
                    samp = RMSAmplitude(sfilt)
                    snrvalue = _safe_snr_calculation(samp, namp)
                    snrdata["snr_L2"] = snrvalue
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "Error computing filtered_L2 metric",
                        "snr_L2 attribute was not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)
            elif metric == "filtered_MAD":
                try:
                    namp = MADAmplitude(nfilt)
                    samp = MADAmplitude(sfilt)
                    snrvalue = _safe_snr_calculation(samp, namp)
                    snrdata["snr_MAD"] = snrvalue
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "Error computing filtered_MAD metric",
                        "snr_MAD attribute was not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)
            elif metric == "filtered_Linf":
                try:
                    # the C function expects a fraction - for users a
                    # percentage is clearer
                    namp = PercAmplitude(nfilt, perc / 100.0)
                    samp = PeakAmplitude(sfilt)
                    snrvalue = _safe_snr_calculation(samp, namp)
                    snrdata["snr_Linf"] = snrvalue
                    snrdata["snr_perc"] = perc
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "Error computing filtered_Linf metric",
                        "snr_Linf attribute was not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)
            elif metric == "filtered_perc":
                try:
                    namp = MADAmplitude(nfilt)
                    samp = PercAmplitude(sfilt, perc / 100.0)
                    snrvalue = _safe_snr_calculation(samp, namp)
                    # BUG FIX:  the original stored snrvalue under "snr_perc"
                    # and then immediately overwrote it with the perc
                    # parameter, silently discarding the computed ratio.
                    # The ratio now gets its own key; "snr_perc" keeps the
                    # percentage value for consistency with filtered_Linf.
                    snrdata["snr_filtered_perc"] = snrvalue
                    snrdata[
                        "snr_perc"
                    ] = perc  # redundant if filtered_Linf is also run but tiny cost
                except MsPASSError as err:
                    newmessage = _reformat_mspass_error(
                        err,
                        "Error computing filtered_perc metric",
                        "snr_perc metric was not computed",
                    )
                    my_logger.log_error(algname, newmessage, err.severity)
            else:
                message = "Illegal optional_metrics keyword=" + metric + "\n"
                message += (
                    "If that is a typo expect some metrics will be missing from output"
                )
                my_logger.log_error(algname, message, ErrorSeverity.Complaint)
    return [snrdata, my_logger]