def test_process_streams():
    """Process the GeoNet 'us1000778i' records and check the results.

    The processed collection must hold three streams of three traces each,
    and the sorted peak absolute amplitudes of station HSES must match the
    reference values.
    """
    # NOTE(review): the previous comment here read "Loma Prieta test station
    # (nc216859)", but the records loaded below are GeoNet event us1000778i;
    # confirm which one is intended.
    data_files, origin = read_data_dir('geonet', 'us1000778i', '*.V1A')
    streams = []
    for data_file in data_files:
        streams.extend(read_data(data_file))
    collection = StreamCollection(streams)
    collection.describe()

    config_path = os.path.join(datadir, 'config_min_freq_0p2.yml')
    config = update_config(config_path)
    test = process_streams(collection, origin, config=config)

    logging.info('Testing trace: %s' % test[0][1])

    assert len(test) == 3
    for stream_index in range(3):
        assert len(test[stream_index]) == 3

    # Trace order differs between platforms (Travis linux container vs. a
    # local mac), so compare the sorted peaks instead of individual traces.
    hses_stream = test.select(station='HSES')[0]
    peaks = [np.max(np.abs(trace.data)) for trace in hses_stream]
    trace_maxes = np.sort(peaks)
    expected_maxes = np.array([157.81975508, 240.33718094, 263.67804256])
    np.testing.assert_allclose(trace_maxes, expected_maxes, rtol=1e-5)
def test_metrics2():
    """Check that the config given to calcMetrics selects the output tables.

    Metrics are computed twice for the processed KNET 'usb000syza' streams:
    first without an explicit config (no 'ARITHMETIC_MEAN' table expected),
    then, after deleting the stored metrics, with a config that requests the
    'arithmetic_mean' IMC and the 'Arias' IMT (both must then be present).
    """
    eventid = 'usb000syza'
    datafiles, event = read_data_dir('knet', eventid, '*')
    datadir = os.path.split(datafiles[0])[0]
    raw_streams = StreamCollection.from_directory(datadir)
    config = update_config(os.path.join(datadir, 'config_min_freq_0p2.yml'))
    config['metrics']['output_imts'].append('Arias')
    config['metrics']['output_imcs'].append('arithmetic_mean')
    # turn off sta/lta check and snr checks
    newconfig = drop_processing(config, ['check_sta_lta', 'compute_snr'])
    processed_streams = process_streams(raw_streams, event, config=newconfig)

    tdir = tempfile.mkdtemp()
    workspace = None
    try:
        tfile = os.path.join(tdir, 'test.hdf')
        workspace = StreamWorkspace(tfile)
        workspace.addEvent(event)
        workspace.addStreams(event, processed_streams, label='processed')

        # First pass: no config supplied to calcMetrics.
        workspace.calcMetrics(event.id, labels=['processed'])
        _, imc_tables1, readmes1 = workspace.getTables('processed')
        assert 'ARITHMETIC_MEAN' not in imc_tables1
        assert 'ARITHMETIC_MEAN' not in readmes1

        # Drop the stored metrics so the second pass recomputes them.
        del workspace.dataset.auxiliary_data.WaveFormMetrics
        del workspace.dataset.auxiliary_data.StationMetrics

        # Second pass: the modified config requests the extra IMC/IMT.
        workspace.calcMetrics(event.id, labels=['processed'], config=config)
        _, imc_tables2, readmes2 = workspace.getTables('processed')
        assert 'ARITHMETIC_MEAN' in imc_tables2
        assert 'ARITHMETIC_MEAN' in readmes2
        assert 'ARIAS' in imc_tables2['ARITHMETIC_MEAN']
        testarray = readmes2['ARITHMETIC_MEAN']['Column header'].to_numpy()
        assert 'ARIAS' in testarray
    finally:
        # Close the workspace file before deleting its directory, and make
        # sure the directory is removed even if closing fails.
        try:
            if workspace is not None:
                workspace.close()
        finally:
            shutil.rmtree(tdir)
def generate_workspace():
    """Generate simple HDF5 with ASDF layout for testing.

    Raw GeoNet records for event 'us1000778i' are written under the label
    'unprocessed'; they are then read back, processed, stored under the
    label 'ptest', and metrics are computed for that label.

    Returns:
        str: Path to the workspace file, located in a fresh temporary
        directory that the caller is responsible for removing.
    """
    EVENTID = 'us1000778i'
    LABEL = 'ptest'
    datafiles, event = read_data_dir('geonet', EVENTID, '*.V1A')

    tdir = tempfile.mkdtemp()
    tfilename = os.path.join(tdir, 'workspace.h5')

    raw_data = []
    for dfile in datafiles:
        raw_data += read_data(dfile)
    write_asdf(tfilename, raw_data, event, label="unprocessed")
    del raw_data

    config = update_config(os.path.join(datadir, 'config_min_freq_0p2.yml'))

    workspace = StreamWorkspace.open(tfilename)
    try:
        raw_streams = workspace.getStreams(EVENTID, labels=['unprocessed'])
        pstreams = process_streams(raw_streams, event, config=config)
        workspace.addStreams(event, pstreams, label=LABEL)
        workspace.calcMetrics(event.id, labels=[LABEL], config=config)
    finally:
        # Only the file name is returned, so release the handle here; the
        # caller is expected to reopen the file itself.
        workspace.close()

    return tfilename
def test_metrics():
    """Compare metrics from StationSummary with those stored in a workspace.

    For the first raw KNET 'usb000syza' stream, the 'Result' column produced
    by StationSummary.from_config must match, within tolerance, the values
    returned by StreamWorkspace.getStreamMetrics after calcMetrics has run
    on the 'raw' label.
    """
    eventid = 'usb000syza'
    datafiles, event = read_data_dir('knet', eventid, '*')
    datadir = os.path.split(datafiles[0])[0]
    raw_streams = StreamCollection.from_directory(datadir)
    config = update_config(os.path.join(datadir, 'config_min_freq_0p2.yml'))

    # Append a neural-network QA step to the processing list.
    # NOTE(review): if config is a plain dict, copy() is shallow, so this
    # append also changes config['processing']; config is not used again
    # below, so it does not matter here.
    newconfig = config.copy()
    newconfig['processing'].append(
        {'NNet_QA': {
            'acceptance_threshold': 0.5,
            'model_name': 'CantWell'
        }})

    processed_streams = process_streams(raw_streams.copy(), event,
                                        config=newconfig)

    tdir = tempfile.mkdtemp()
    workspace = None
    try:
        tfile = os.path.join(tdir, 'test.hdf')
        workspace = StreamWorkspace(tfile)
        workspace.addEvent(event)
        workspace.addStreams(event, raw_streams, label='raw')
        workspace.addStreams(event, processed_streams, label='processed')
        stream1 = raw_streams[0]

        # Get metrics from station summary for raw streams
        summary1 = StationSummary.from_config(stream1)
        s1_df_in = summary1.pgms.sort_values(['IMT', 'IMC'])
        array1 = s1_df_in['Result'].to_numpy()

        # Compare to metrics from getStreamMetrics for raw streams
        workspace.calcMetrics(eventid, labels=['raw'])
        summary1_a = workspace.getStreamMetrics(event.id,
                                                stream1[0].stats.network,
                                                stream1[0].stats.station,
                                                'raw')
        s1_df_out = summary1_a.pgms.sort_values(['IMT', 'IMC'])
        array2 = s1_df_out['Result'].to_numpy()

        np.testing.assert_allclose(array1, array2, atol=1e-6, rtol=1e-6)
    finally:
        # Close the workspace even when an assertion fails, then always
        # remove the temporary directory.
        try:
            if workspace is not None:
                workspace.close()
        finally:
            shutil.rmtree(tdir)
def test_check_instrument():
    """Verify the pass/fail flag set by process_streams for three stations.

    The FDSN 'nc51194936' miniSEED records are processed with the
    'config_test_check_instr.yml' configuration; stations CVS and GASB are
    expected to pass, while SBT is expected to fail.
    """
    data_files, origin = read_data_dir('fdsn', 'nc51194936', '*.mseed')
    streams = []
    for data_file in data_files:
        streams.extend(read_data(data_file))
    collection = StreamCollection(streams)
    collection.describe()

    config_path = os.path.join(datadir, 'config_test_check_instr.yml')
    config = update_config(config_path)
    test = process_streams(collection, origin, config=config)

    expected_status = (('CVS', True), ('GASB', True), ('SBT', False))
    for station, should_pass in expected_status:
        stream = test.select(station=station)[0]
        logging.info('Testing stream: %s' % stream)
        assert stream.passed == should_pass
def _test_workspace():
    """Exercise StreamWorkspace end to end with two GeoNet events.

    Covers adding raw and processed streams, event/station/inventory
    lookups, label summaries, provenance rows, and reopening the file to
    add a second event. The leading underscore keeps pytest from
    collecting this function.
    """
    eventid = 'us1000778i'
    datafiles, event = read_data_dir('geonet', eventid, '*.V1A')
    tdir = tempfile.mkdtemp()
    # Tracks the currently open workspace so the finally clause can close it
    # if the test fails part-way through.
    workspace = None
    try:
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=H5pyDeprecationWarning)
            warnings.filterwarnings("ignore", category=YAMLLoadWarning)
            warnings.filterwarnings("ignore", category=FutureWarning)
            config = update_config(
                os.path.join(datadir, 'config_min_freq_0p2.yml'))
            tfile = os.path.join(tdir, 'test.hdf')
            raw_streams = []
            for dfile in datafiles:
                raw_streams += read_data(dfile)

            workspace = StreamWorkspace(tfile)
            t1 = time.time()
            workspace.addStreams(event, raw_streams, label='raw')
            t2 = time.time()
            print('Adding %i streams took %.2f seconds' %
                  (len(raw_streams), (t2 - t1)))

            str_repr = repr(workspace)
            assert str_repr == 'Events: 1 Stations: 3 Streams: 3'

            eventobj = workspace.getEvent(eventid)
            assert eventobj.origins[0].latitude == event.origins[0].latitude
            assert eventobj.magnitudes[0].mag == event.magnitudes[0].mag

            stations = workspace.getStations()
            assert sorted(stations) == ['HSES', 'THZ', 'WTMC']

            stations = workspace.getStations(eventid=eventid)
            assert sorted(stations) == ['HSES', 'THZ', 'WTMC']

            # test retrieving event that doesn't exist
            with pytest.raises(KeyError):
                workspace.getEvent('foo')

            instream = None
            for stream in raw_streams:
                if stream[0].stats.station.lower() == 'hses':
                    instream = stream
                    break
            if instream is None:
                raise ValueError('Instream should not be none.')
            outstream = workspace.getStreams(eventid,
                                             stations=['HSES'],
                                             labels=['raw'])[0]
            compare_streams(instream, outstream)

            label_summary = workspace.summarizeLabels()
            assert label_summary.iloc[0]['Label'] == 'raw'
            assert label_summary.iloc[0]['Software'] == 'gmprocess'

            sc = StreamCollection(raw_streams)
            processed_streams = process_streams(sc, event, config=config)
            workspace.addStreams(event, processed_streams, 'processed')

            idlist = workspace.getEventIds()
            assert idlist[0] == eventid

            outstream = workspace.getStreams(eventid,
                                             stations=['HSES'],
                                             labels=['processed'])[0]

            provenance = workspace.getProvenance(eventid,
                                                 labels=['processed'])
            first_row = pd.Series({
                'Record': 'NZ.HSES.--.HN1_us1000778i_processed',
                'Processing Step': 'Remove Response',
                'Step Attribute': 'input_units',
                'Attribute Value': 'counts'
            })
            last_row = pd.Series({
                'Record': 'NZ.WTMC.--.HNZ_us1000778i_processed',
                'Processing Step': 'Lowpass Filter',
                'Step Attribute': 'number_of_passes',
                'Attribute Value': 2
            })
            assert provenance.iloc[0].equals(first_row)
            assert provenance.iloc[-1].equals(last_row)

            # compare the parameters from the input processed stream
            # to its output equivalent
            instream = None
            for stream in processed_streams:
                if stream[0].stats.station.lower() == 'hses':
                    instream = stream
                    break
            if instream is None:
                raise ValueError('Instream should not be none.')
            compare_streams(instream, outstream)
            workspace.close()
            workspace = None

            # read in data from a second event and stash it in the workspace
            eventid = 'nz2018p115908'
            datafiles, event = read_data_dir('geonet', eventid, '*.V2A')
            raw_streams = []
            for dfile in datafiles:
                raw_streams += read_data(dfile)

            workspace = StreamWorkspace.open(tfile)
            workspace.addStreams(event, raw_streams, label='foo')

            # The return value is not checked; the call only verifies that
            # the lookup works for the second event.
            workspace.getStations(eventid)

            eventids = workspace.getEventIds()
            assert eventids == ['us1000778i', 'nz2018p115908']
            instation = raw_streams[0][0].stats.station
            this_stream = workspace.getStreams(eventid,
                                               stations=[instation],
                                               labels=['foo'])[0]
            assert instation == this_stream[0].stats.station
            usid = 'us1000778i'
            inventory = workspace.getInventory(usid)
            workspace.close()
            workspace = None

            codes = [
                station.code for station in inventory.networks[0].stations
            ]
            assert sorted(set(codes)) == ['HSES', 'THZ', 'WPWS', 'WTMC']
    finally:
        # Close a workspace that is still open (i.e. the test failed before
        # reaching its explicit close), then always remove the directory.
        try:
            if workspace is not None:
                workspace.close()
        finally:
            shutil.rmtree(tdir)
def _test_vs30_dist_metrics():
    """Check distance, back-azimuth and Vs30 station metrics for 'ci38457511'.

    Metrics are computed for the processed streams with a rupture file and
    a Vs30 grid configured, then compared (1% relative tolerance) with
    reference values for station CI.CLC. The exported tables must contain
    the distance, Vs30 and back-azimuth columns. The leading underscore
    keeps pytest from collecting this function.
    """
    KNOWN_DISTANCES = {
        'epicentral': 5.1,
        'hypocentral': 10.2,
        'rupture': 2.21,
        'rupture_var': np.nan,
        'joyner_boore': 2.21,
        'joyner_boore_var': np.nan,
        'gc2_rx': 2.66,
        'gc2_ry': 3.49,
        'gc2_ry0': 0.00,
        'gc2_U': 34.34,
        'gc2_T': 2.66
    }
    KNOWN_BAZ = 239.46
    KNOWN_VS30 = 331.47

    eventid = 'ci38457511'
    datafiles, event = read_data_dir('fdsn', eventid, '*')
    datadir = os.path.split(datafiles[0])[0]
    raw_streams = StreamCollection.from_directory(datadir)
    config = update_config(os.path.join(datadir, 'config_min_freq_0p2.yml'))
    processed_streams = process_streams(raw_streams, event, config=config)
    rupture_file = get_rupture_file(datadir)
    grid_file = os.path.join(datadir, 'test_grid.grd')
    config['metrics']['vs30'] = {
        'vs30': {
            'file': grid_file,
            'column_header': 'GlobalVs30',
            'readme_entry': 'GlobalVs30',
            'units': 'm/s'
        }
    }

    tdir = tempfile.mkdtemp()
    ws = None
    try:
        tfile = os.path.join(tdir, 'test.hdf')
        ws = StreamWorkspace(tfile)
        ws.addEvent(event)
        ws.addStreams(event, raw_streams, label='raw')
        ws.addStreams(event, processed_streams, label='processed')
        ws.calcMetrics(event.id, rupture_file=rupture_file,
                       labels=['processed'], config=config)
        sta_sum = ws.getStreamMetrics(event.id, 'CI', 'CLC', 'processed')

        # assert_allclose treats NaN == NaN as equal by default, which is
        # what lets the np.nan reference entries compare successfully.
        for dist in sta_sum.distances:
            np.testing.assert_allclose(sta_sum.distances[dist],
                                       KNOWN_DISTANCES[dist], rtol=0.01)
        np.testing.assert_allclose(sta_sum._back_azimuth, KNOWN_BAZ,
                                   rtol=0.01)
        np.testing.assert_allclose(sta_sum._vs30['vs30']['value'],
                                   KNOWN_VS30, rtol=0.01)

        _, imc_tables, readme_tables = ws.getTables('processed')

        check_cols = {
            'EpicentralDistance', 'HypocentralDistance', 'RuptureDistance',
            'RuptureDistanceVar', 'JoynerBooreDistance',
            'JoynerBooreDistanceVar', 'GC2_rx', 'GC2_ry', 'GC2_ry0',
            'GC2_U', 'GC2_T', 'GlobalVs30', 'BackAzimuth'
        }
        assert check_cols.issubset(set(readme_tables['Z']['Column header']))
        assert check_cols.issubset(set(imc_tables['Z'].columns))
    finally:
        # Close the workspace even when an assertion fails, then always
        # remove the temporary directory.
        try:
            if ws is not None:
                ws.close()
        finally:
            shutil.rmtree(tdir)
def main():
    """Command-line entry point of the deprecated ``gmprocess2`` program.

    Steps, in order:

    1. Parse the command line and resolve the configuration and events.
    2. Run the selected commands for every event, either serially or
       through a dask client when ``--num-processes`` is non-zero.
    3. Replay the ``gmprocess_batch_log_*`` files found in the output
       directory through the root logger and delete them.
    4. With ``--export``: read the exported IMC tables and draw one
       multi-event regression plot.
    5. With ``--status``: merge the per-event ``status.csv`` files into
       ``complete_status.csv``.
    6. Print the created files and the elapsed time.

    Exits with status 1 when the custom config is invalid, no event
    information is found, or the dask client cannot be created.
    """
    logging.warning("gmprocess2 (formerly gmprocess) is deprecated "
                    "and will be removed soon.")
    logging.warning("Please use gmrecords instead.")

    description = '''
Download, process, and extract metrics from raw ground motion data.

This program will allow the user to:
   - download raw data from a number of sources, including:
   - Any FDSN provider which serves waveform data
   - Japan's KNET/KikNet repository (requires login info)
   - ...
'''
    parser = argparse.ArgumentParser(description=description,
                                     formatter_class=MyFormatter)

    # ***** Required arguments
    parser.add_argument('-o', '--output-directory', help='Output directory',
                        metavar="DIRECTORY", action='store', type=str,
                        required=True, dest='outdir')

    # ***** Command arguments
    help_assemble = format_helptext(
        'Download data from all available online sources, or load raw data '
        'from files if --directory is selected.')
    parser.add_argument('--assemble', help=help_assemble,
                        action='store_true', dest='assemble')

    help_process = format_helptext(
        'Process data using steps defined in configuration file.')
    parser.add_argument('--process', help=help_process,
                        action='store_true', dest='process')

    help_report = format_helptext(
        'Create a summary report for each event specified.')
    parser.add_argument('--report', help=help_report,
                        action='store_true', dest='report')

    help_provenance = format_helptext(
        'Generate provenance table in --format format.')
    parser.add_argument('--provenance', help=help_provenance,
                        action='store_true', dest='provenance')

    help_export = format_helptext(
        'Generate metrics tables (NGA-style "flat" files) for all events '
        'and IMCs.')
    parser.add_argument('--export', help=help_export,
                        action='store_true', dest='export')

    help_export_dir = format_helptext('Specify an alternate directory for the '
                                      'export files, which defaults to the '
                                      'directory above event directory.')
    parser.add_argument('--export-dir', help=help_export_dir)

    help_shakemap = format_helptext(
        'Generate ShakeMap-friendly peak ground motions table.')
    parser.add_argument('--shakemap', help=help_shakemap,
                        action='store_true', dest='shakemap')

    # # ***** Optional arguments
    group = parser.add_mutually_exclusive_group(required=False)
    help_eventids = format_helptext('ComCat Event IDs')
    group.add_argument('--eventids', help=help_eventids, nargs='+')

    help_textfile = format_helptext(
        'Text file containing lines of ComCat Event IDs or event '
        'information (ID TIME LAT LON DEPTH MAG)')
    group.add_argument('--textfile', help=help_textfile,
                       action='store', dest='textfile')

    # NOTE(review): the help text lists six fields while nargs/metavar
    # require seven (MAG_TYPE is the extra one) -- confirm the help string.
    help_event = format_helptext(
        'Single event information as ID TIME(YYYY-MM-DDTHH:MM:SS) '
        'LAT LON DEP MAG')
    group.add_argument('--eventinfo', help=help_event, type=str, nargs=7,
                       metavar=('ID', 'TIME', 'LAT', 'LON', 'DEPTH', 'MAG',
                                'MAG_TYPE'))

    help_dir = format_helptext(
        'Sidestep online data retrieval and get from local directory instead. '
        'This is the path where data already exists. Must organized in a '
        '\'raw\' directory, within directories with names as the event IDs. '
        'For example, if `--directory` is \'proj_dir\' and you have data for '
        'event id \'abc123\' then the raw data to be read in should be '
        'located in `proj_dir/abc123/raw/`.')
    parser.add_argument('--directory', help=help_dir,
                        action='store', dest='directory')

    help_format = format_helptext('Output format for tabular information')
    parser.add_argument('--format', help=help_format,
                        choices=['excel', 'csv'], default='csv',
                        dest='format')

    help_tag = format_helptext(
        'Processing label (single word, no spaces) to attach to processed '
        'files. Defaults to the current time in YYYYMMDDHHMMSS format.')
    parser.add_argument('--process-tag', help=help_tag, action='store',
                        type=str, dest='process_tag')

    help_config = format_helptext('Supply custom configuration file')
    parser.add_argument('--config', help=help_config, action='store',
                        type=str, dest='config')

    help_recompute = format_helptext(
        'Recompute metrics (i.e. from new config)')
    parser.add_argument('--recompute-metrics', help=help_recompute,
                        action='store_true', dest='recompute_metrics')

    help_logfile = format_helptext(
        'Supply file name to store processing log info.')
    parser.add_argument('--log-file', help=help_logfile, action='store',
                        dest='log_file')

    nhelpstr = 'Number of parallel processes to run over events.'
    parser.add_argument('-n', '--num-processes', default=0, type=int,
                        help=nhelpstr)

    help_status = format_helptext(
        'Output failure information, either in short form ("short"), '
        'long form ("long"), or network form ("net"). '
        'short: Two column table, where the columns are "failure reason" and '
        '"number of records". net: Three column table where the columns are '
        '"network", "number passed", and "number failed". long: Two column '
        'table, where columns are "station ID" and "failure reason".')
    parser.add_argument('--status', choices=['short', 'long', 'net'],
                        dest='status', help=help_status)

    # ***** Shared arguments
    parser = add_shared_args(parser)
    args = parser.parse_args()

    tstart = datetime.now()

    # get the process tag from the user or define by current datetime
    process_tag = args.process_tag or datetime.utcnow().strftime(TAG_FMT)

    # config handling
    configfile = args.config
    if configfile is not None:
        config = update_config(configfile)
        if config is None:
            print('\nCustom config file %s is invalid. Exiting.' % configfile)
            sys.exit(1)
    else:
        config = get_config()

    outdir = args.outdir
    eventids = args.eventids
    textfile = args.textfile
    eventinfo = args.eventinfo
    input_directory = args.directory

    # get a list of ScalarEvent objects from one of the inputs
    events = get_events(eventids, textfile, eventinfo, input_directory,
                        outdir)
    if not events:
        print('No event information was found. Exiting.')
        sys.exit(1)

    if not os.path.isdir(outdir):
        os.makedirs(outdir)

    workspace_files = []
    files_created = {}

    # Prefix of the log files that are merged into the main logger below.
    logbase = 'gmprocess_batch_log_'

    # compare list of all commands with list of actual commands
    process_commands = set(
        ['assemble', 'process', 'report', 'shakemap', 'provenance', 'export'])
    pcommands = []
    if args.assemble:
        pcommands.append('assemble')
    if args.process:
        pcommands.append('process')
    if args.provenance:
        pcommands.append('provenance')
    if args.report:
        pcommands.append('report')
    if args.shakemap:
        pcommands.append('shakemap')
    if args.export:
        pcommands.append('export')

    if len(process_commands.intersection(set(pcommands))) > 0:
        if args.num_processes:
            # parallelize processing on events using forked processes
            try:
                client = Client(n_workers=args.num_processes)
            except OSError:
                sys.stderr.write("Could not create a dask client.\n")
                sys.exit(1)

            # Need a dict holding all args that do not change across calls
            _argdict_ = {
                'outdir': outdir,
                'pcommands': pcommands,
                'config': config,
                'input_directory': input_directory,
                'process_tag': process_tag,
                'files_created': files_created,
                'output_format': args.format,
                'status': args.status,
                'recompute_metrics': args.recompute_metrics,
                'export_dir': args.export_dir
            }

            def dask_process_event(event):
                """
                Wrapper function for multiprocessing of process_event method.
                """
                workname = process_event(event, **_argdict_)
                return event, workname

            try:
                futures = client.map(dask_process_event, events)
                for _, result in as_completed(futures, with_results=True):
                    done_event, workname = result
                    # Record the workspace so the summary count below is
                    # correct for parallel runs as well.
                    workspace_files.append(workname)
                    print('Completed event: %s, %s'
                          % (done_event.id, str(workname)))
            finally:
                # Release the workers before the post-processing steps.
                client.close()
            # NOTE(review): files_created is handed to the workers inside
            # _argdict_; entries added in worker processes presumably do not
            # reach this dict -- confirm.
        else:
            for event in events:
                workname = process_event(
                    event, outdir, pcommands, config, input_directory,
                    process_tag, files_created, args.format, args.status,
                    args.recompute_metrics, export_dir=args.export_dir)
                workspace_files.append(workname)
                print('Completed event: %s, %s' % (event.id, str(workname)))

    # logging
    logger = None
    setup_logger(args)
    if args.log_file:
        # Swap the first installed handler for a file handler.
        logger = logging.getLogger()
        stream_handler = logger.handlers[0]
        fhandler = logging.FileHandler(args.log_file)
        logger.removeHandler(stream_handler)
        logger.addHandler(fhandler)

    # transfer the logfile contents into our global logger
    # first get the handler
    if logger is None:
        logger = logging.getLogger()
    handler = logger.handlers[0]

    # then get the current formatter; the lines are replayed with a bare
    # '%(message)s' format so they are not prefixed a second time
    old_format = handler.formatter
    handler.setFormatter(logging.Formatter('%(message)s'))
    logfiles = glob.glob(os.path.join(outdir, logbase + '*'))
    for logfile in logfiles:
        with open(logfile, 'rt', encoding='utf-8') as logobj:
            for line in logobj.readlines():
                logging.info(line.strip())
        os.remove(logfile)

    # reset handler back to original formatter
    handler.setFormatter(old_format)

    logging.info('%i workspace files created' % len(workspace_files))

    if 'export' in pcommands:
        # Every '*README*' file in outdir names a companion table file.
        imc_table_names = [
            fname.replace('_README', '')
            for fname in os.listdir(outdir) if 'README' in fname
        ]
        # NOTE(review): the tables are always read with read_csv, also when
        # --format is 'excel' -- confirm that this is intended.
        imc_tables = {}
        for fname in imc_table_names:
            table_key = fname.replace('.%s' % args.format, '')
            imc_tables[table_key] = pd.read_csv(os.path.join(outdir, fname))
        if 'fit_spectra_parameters' in imc_tables:
            del imc_tables['fit_spectra_parameters']

        # TODO - where is this being written? Is it a requirement?
        event_file = os.path.join(outdir, 'events.csv')
        if os.path.isfile(event_file):
            event_table = pd.read_csv(event_file)
        else:
            # Build the table from all events. Relying on a leftover loop
            # variable covered only the last event and was unbound when the
            # events were processed through dask.
            data = [{'id': evt.id, 'magnitude': evt.magnitude}
                    for evt in events]
            event_table = pd.DataFrame(data=data)

        # make a regression plot of the most common imc/imt combination we
        # can find
        if not imc_tables:
            msg = ('No IMC tables found. It is likely that no streams '
                   'passed checks. If you created reports for the events '
                   'you have been processing, check those to see if this '
                   'is the case, then adjust your configuration as '
                   'necessary to process the data. ')
            logging.warning(msg)
        else:
            pref_imcs = [
                'rotd50.0',
                'greater_of_two_horizontals',
                'h1',
                'h2',
            ]
            pref_imts = ['PGA', 'PGV', 'SA(1.0)']
            found_imc = None
            found_imt = None
            for imc in pref_imcs:
                if imc in imc_tables:
                    for imt in pref_imts:
                        if imt in imc_tables[imc].columns:
                            found_imt = imt
                            found_imc = imc
                            break
                    if found_imc:
                        break

            # now look for whatever IMC/IMT combination we can find
            if not found_imc:
                found_imc = next(iter(imc_tables))
                table_cols = set(imc_tables[found_imc].columns)
                # Sorted so the pick is repeatable; a table without any IMT
                # column simply produces no plot.
                imtlist = sorted(table_cols - NON_IMT_COLS)
                found_imt = imtlist[0] if imtlist else None

            if found_imc and found_imt:
                pngfile = '%s_%s.png' % (found_imc, found_imt)
                regression_file = os.path.join(outdir, pngfile)
                plot_regression(event_table, found_imc,
                                imc_tables[found_imc],
                                found_imt,
                                regression_file,
                                distance_metric='EpicentralDistance',
                                colormap='viridis_r')
                append_file(files_created,
                            'Multi-event regression plot', regression_file)

    if args.status:
        # Index column of the per-event status.csv for each status form.
        index_cols = {
            'short': 'Failure reason',
            'long': 'Station ID',
            'net': 'Network',
        }
        index_col = index_cols[args.status]
        statuses = []
        for event in events:
            status_path = os.path.join(outdir, event.id, 'status.csv')
            if os.path.exists(status_path):
                status = pd.read_csv(status_path, index_col=index_col)
                if args.status == 'long':
                    status['Event ID'] = event.id
                statuses.append(status)
        if statuses:
            comp_status_path = os.path.join(outdir, 'complete_status.csv')
            if args.status == 'long':
                # Long form: append the tables, writing the header once.
                for idx, status in enumerate(statuses):
                    if idx == 0:
                        status.to_csv(comp_status_path, mode='w')
                    else:
                        status.to_csv(comp_status_path, mode='a',
                                      header=False)
            else:
                # Short/net form: sum the counts per index value.
                df_status = pd.concat(statuses)
                df_status = df_status.groupby(df_status.index).sum()
                df_status.to_csv(comp_status_path)
            append_file(files_created, 'Complete status', comp_status_path)

    print('\nThe following files have been created:')
    for file_type, file_list in files_created.items():
        print('File type: %s' % file_type)
        for fname in file_list:
            print('\t%s' % fname)

    tend = datetime.now()
    dt = (tend - tstart).total_seconds()
    minutes = dt // 60
    seconds = dt % 60
    fmt = '\nElapsed processing time: %i minutes, %i seconds.'
    print(fmt % (minutes, seconds))

    print('\nProcessing is complete.\n')