def main(self, gmrecords):
    """
    Download data and organize it in the project data directory.

    Args:
        gmrecords: GMrecordsApp instance.
    """
    logging.info('Running subcommand \'%s\'' % self.command_name)

    self.gmrecords = gmrecords
    self._check_arguments()
    self._get_events()

    logging.info('Number of events to download: %s' % len(self.events))

    for event in self.events:
        logging.info('Starting event: %s' % event.id)
        event_dir = os.path.join(gmrecords.data_path, event.id)
        if not os.path.exists(event_dir):
            os.makedirs(event_dir)

        _ = download(event=event,
                     event_dir=event_dir,
                     config=gmrecords.conf,
                     directory=None,
                     create_workspace=False,
                     stream_collection=False)
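
# --- Hypothetical example (not part of the original module) ---
# A minimal sketch of fetching raw data for a single event outside of the
# subcommand, reusing only the keyword arguments shown in main() above. The
# `event`, `conf`, and `data_path` inputs are assumed to be supplied by the
# caller; this is an illustration, not the project's actual API.
def example_download_one_event(event, conf, data_path):
    event_dir = os.path.join(data_path, event.id)
    if not os.path.exists(event_dir):
        os.makedirs(event_dir)
    # Same call pattern as in main(): fetch raw files only, without creating
    # an ASDF workspace or a stream collection.
    download(event=event,
             event_dir=event_dir,
             config=conf,
             directory=None,
             create_workspace=False,
             stream_collection=False)
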
def _assemble_event(self, event):
    """Assemble raw data for a single event into an ASDF workspace file."""
    logging.info('Starting event: %s' % event.id)
    event_dir = os.path.join(self.gmrecords.data_path, event.id)
    if not os.path.exists(event_dir):
        os.makedirs(event_dir)

    workname = os.path.join(event_dir, WORKSPACE_NAME)
    workspace_exists = os.path.isfile(workname)
    if workspace_exists:
        logging.info("ASDF exists: %s" % workname)
        if not self.gmrecords.args.overwrite:
            logging.info("The --overwrite argument was not selected.")
            logging.info("No action taken for %s." % event.id)
            return event.id
        else:
            logging.info("Removing existing ASDF file: %s" % workname)
            os.remove(workname)

    # Todo: probably want to break up `download` into finer steps to call
    # here. Also, there are files created besides the workspace that are not
    # getting tracked (e.g., raw data plots, event.json).
    workspace, _, _, _ = download(
        event=event,
        event_dir=event_dir,
        config=self.gmrecords.conf,
        directory=self.gmrecords.data_path
    )
    workspace.close()

    self.append_file('Workspace', workname)
    return event.id
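
# --- Hypothetical example (not part of the original module) ---
# A sketch of how a main() method might drive _assemble_event() over the
# events gathered by _get_events(). Attribute and method names simply mirror
# those used above; this is not the project's actual assemble subcommand.
def example_assemble_main(self, gmrecords):
    self.gmrecords = gmrecords
    self._check_arguments()
    self._get_events()
    for event in self.events:
        event_id = self._assemble_event(event)
        logging.info('Finished assembling event: %s' % event_id)
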
def process_event(event, outdir, pcommands, config, input_directory,
                  process_tag, files_created, output_format, status,
                  recompute_metrics, export_dir=None):
    """Run the requested processing steps (pcommands) for a single event.

    Returns the path to the event's ASDF workspace file.
    """
    # Set up logging to write to the event's logfile.
    argthing = namedtuple('args', ['debug', 'quiet'])
    args = argthing(debug=True, quiet=False)
    setup_logger(args)

    logger = logging.getLogger()
    stream_handler = logger.handlers[0]
    logfile = os.path.join(outdir, '%s.log' % event.id)
    fhandler = logging.FileHandler(logfile)
    logger.removeHandler(stream_handler)
    logger.addHandler(fhandler)

    event_dir = os.path.join(outdir, event.id)
    if not os.path.exists(event_dir):
        os.makedirs(event_dir)

    workname = os.path.join(event_dir, WORKSPACE_NAME)
    workspace_exists = os.path.isfile(workname)
    workspace_has_processed = False
    workspace = None
    processing_done = False

    if workspace_exists:
        workspace = StreamWorkspace.open(workname)
        labels = workspace.getLabels()
        if len(labels):
            labels.remove('unprocessed')
        elif 'assemble' not in pcommands:
            print('No data in workspace. Please run assemble.')
            sys.exit(1)

        if len(labels) == 1:
            process_tag = labels[0]
            workspace_has_processed = True
        else:
            if 'process' not in pcommands:
                fmt = '\nThere are %i sets of processed data in %s.'
                tpl = (len(labels), workname)
                print(fmt % tpl)
                print('This software can only handle one set of '
                      'processed data. Exiting.\n')
                sys.exit(1)

    download_done = False

    # Need to initialize rstreams/pstreams.
    rstreams = []
    pstreams = []

    rupture_file = None
    if 'assemble' in pcommands:
        logging.info('Downloading/loading raw streams...')
        workspace, workspace_file, rstreams, rupture_file = download(
            event, event_dir, config, input_directory)
        download_done = True
        append_file(files_created, 'Workspace', workname)
    else:
        if not workspace_exists:
            print('\nYou opted not to download or process from input.')
            print('No previous HDF workspace file could be found.')
            print('Try re-running with the assemble command with or')
            print('without the --directory option.\n')
            sys.exit(1)

        if 'process' in pcommands:
            logging.info('Getting raw streams from workspace...')
            with warnings.catch_warnings():
                warnings.simplefilter(
                    "ignore", category=H5pyDeprecationWarning)
                rstreams = workspace.getStreams(
                    event.id, labels=['unprocessed'])
            download_done = True
        else:
            need_processed = set(['report', 'shakemap'])
            need_pstreams = len(need_processed.intersection(pcommands))
            if workspace_has_processed:
                if need_pstreams:
                    logging.info(
                        'Getting processed streams from workspace...')
                    with warnings.catch_warnings():
                        warnings.simplefilter(
                            "ignore", category=H5pyDeprecationWarning)
                        pstreams = workspace.getStreams(
                            event.id, labels=[process_tag])
                download_done = True
                processing_done = True

    if ('process' in pcommands
            and download_done
            and not processing_done
            and len(rstreams)):
        logging.info('Processing raw streams for event %s...' % event.id)
        pstreams = process_streams(rstreams, event, config=config)

        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=H5pyDeprecationWarning)
            workspace.addStreams(event, pstreams, label=process_tag)
            workspace.calcMetrics(event.id, labels=[process_tag],
                                  config=config, streams=pstreams,
                                  stream_label=process_tag,
                                  rupture_file=rupture_file)
        processing_done = True

    if 'export' in pcommands:
        if export_dir is not None:
            if not os.path.isdir(export_dir):
                os.makedirs(export_dir)
            outdir = export_dir

        labels = workspace.getLabels()
        if 'unprocessed' not in labels:
            fmt = ('Workspace file "%s" appears to have no unprocessed '
                   'data. Skipping.')
            logging.info(fmt % workspace_file)
        else:
            labels.remove('unprocessed')
            if not labels:
                fmt = ('Workspace file "%s" appears to have no processed '
                       'data. Skipping.')
                logging.info(fmt % workspace_file)
            else:
                logging.info('Creating tables for event %s...', event.id)
                with warnings.catch_warnings():
                    warnings.simplefilter(
                        "ignore", category=H5pyDeprecationWarning)
                    if recompute_metrics:
                        del workspace.dataset.auxiliary_data.WaveFormMetrics
                        del workspace.dataset.auxiliary_data.StationMetrics
                        workspace.calcMetrics(event.id, labels=labels,
                                              config=config,
                                              rupture_file=rupture_file)
                    event_table, imc_tables, readmes = workspace.getTables(
                        labels[0], streams=pstreams,
                        stream_label=process_tag)
                    ev_fit_spec, fit_readme = workspace.getFitSpectraTable(
                        event.id, labels[0], pstreams)

                # Set the precisions for the imc tables, event table, and
                # fit_spectra table before writing.
                imc_tables_formatted = {}
                for imc, imc_table in imc_tables.items():
                    imc_tables_formatted[imc] = set_precisions(imc_table)
                event_table_formatted = set_precisions(event_table)
                df_fit_spectra_formatted = set_precisions(ev_fit_spec)

                if not os.path.isdir(outdir):
                    os.makedirs(outdir)

                filenames = ['events'] + \
                    [imc.lower() for imc in imc_tables_formatted.keys()] + \
                    [imc.lower() + '_README' for imc in readmes.keys()] + \
                    ['fit_spectra_parameters',
                     'fit_spectra_parameters_README']

                files = [event_table_formatted] + \
                    list(imc_tables_formatted.values()) + \
                    list(readmes.values()) + \
                    [df_fit_spectra_formatted, fit_readme]

                if output_format != 'csv':
                    output_format = 'xlsx'

                for filename, df in dict(zip(filenames, files)).items():
                    filepath = os.path.join(
                        outdir, filename + '.%s' % output_format)
                    if os.path.exists(filepath):
                        if 'README' in filename:
                            continue
                        else:
                            mode = 'a'
                            header = False
                    else:
                        mode = 'w'
                        header = True
                        append_file(files_created, 'Tables', filepath)
                    if output_format == 'csv':
                        df.to_csv(filepath, index=False,
                                  float_format=DEFAULT_FLOAT_FORMAT,
                                  na_rep=DEFAULT_NA_REP,
                                  mode=mode, header=header)
                    else:
                        # DataFrame.to_excel() has no append mode, so any
                        # existing xlsx file is simply overwritten here.
                        df.to_excel(filepath, index=False,
                                    float_format=DEFAULT_FLOAT_FORMAT,
                                    na_rep=DEFAULT_NA_REP,
                                    header=header)

    if ('report' in pcommands
            and processing_done
            and len(pstreams)):
        logging.info('Creating diagnostic plots for event %s...' % event.id)
        plot_dir = os.path.join(event_dir, 'plots')
        if not os.path.isdir(plot_dir):
            os.makedirs(plot_dir)
        for stream in pstreams:
            summary_plots(stream, plot_dir, event)

        mapfile = draw_stations_map(pstreams, event, event_dir)
        plot_moveout(pstreams, event.latitude, event.longitude,
                     file=os.path.join(event_dir, 'moveout_plot.png'))
        append_file(files_created, 'Station map', mapfile)
        append_file(files_created, 'Moveout plot', 'moveout_plot.png')

        logging.info('Creating diagnostic report for event %s...' % event.id)

        # Build the summary report?
        build_conf = config['build_report']
        report_format = build_conf['format']
        if report_format == 'latex':
            report_file, success = build_report_latex(
                pstreams, event_dir, event, config=config)
        else:
            report_file = ''
            success = False
        if os.path.isfile(report_file) and success:
            append_file(files_created, 'Summary report', report_file)

    if 'provenance' in pcommands and processing_done and len(pstreams):
        logging.info('Creating provenance table for event %s...' % event.id)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=H5pyDeprecationWarning)
            provdata = workspace.getProvenance(event.id,
                                               labels=[process_tag])
        if output_format == 'csv':
            csvfile = os.path.join(event_dir, 'provenance.csv')
            append_file(files_created, 'Provenance', csvfile)
            provdata.to_csv(csvfile)
        else:
            excelfile = os.path.join(event_dir, 'provenance.xlsx')
            append_file(files_created, 'Provenance', excelfile)
            provdata.to_excel(excelfile, index=False)

    if 'shakemap' in pcommands and processing_done and len(pstreams):
        logging.info('Creating shakemap table for event %s...' % event.id)
        shakemap_file, jsonfile = save_shakemap_amps(pstreams, event,
                                                     event_dir)
        append_file(files_created, 'shakemap', shakemap_file)
        append_file(files_created, 'shakemap', jsonfile)

    if status and processing_done and len(pstreams):
        if status == 'short':
            index = 'Failure reason'
            col = ['Number of records']
        elif status == 'long':
            index = 'Station ID'
            col = ['Failure reason']
        elif status == 'net':
            index = 'Network'
            col = ['Number of passed records', 'Number of failed records']
        status_info = pstreams.get_status(status)
        status_info.to_csv(os.path.join(event_dir, 'status.csv'),
                           header=col, index_label=index)

    # Since we don't know how many events users will be processing, let's
    # guard against memory issues by clearing out the big data structures.
    workspace.close()

    logging.info('Finishing event %s' % event.id)

    return workname
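
# --- Hypothetical example (not part of the original module) ---
# A sketch of driving process_event() over a list of events. Every keyword
# mirrors a parameter in the signature above; the `events`, `conf`, and
# `outdir` inputs, the chosen pcommands, and the 'default' tag are
# assumptions for illustration only.
def example_process_all(events, outdir, conf):
    files_created = {}
    workspace_files = []
    for event in events:
        workname = process_event(
            event, outdir,
            pcommands=['assemble', 'process', 'report', 'export'],
            config=conf,
            input_directory=None,
            process_tag='default',
            files_created=files_created,
            output_format='csv',
            status=None,
            recompute_metrics=False)
        workspace_files.append(workname)
    return workspace_files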