def send_products_to_server(job_id):
    """
    Send the products of an existing job to the server.

    Loads the system configuration and the job description stored in
    <workspace_path>/<job_id>/job.json and passes the products directory,
    manifest file name and description to send_product_to_server.

    Exits the process with status 1 if the job description cannot be loaded.

    :param job_id: the job identifier, also the name of the job directory in the workspace
    """
    args = load_sys_cfg()
    jobfile = osp.abspath(osp.join(args.workspace_path, job_id, 'job.json'))
    logging.info('send_products_to_server: loading job description from %s' % jobfile)
    try:
        # the context manager closes the file even if parsing fails
        with open(jobfile, 'r') as f:
            js = Dict(json.load(f))
    except Exception as e:
        logging.error('Cannot load the job description file %s' % jobfile)
        logging.error('%s' % e)
        sys.exit(1)

    # the description defaults to the job id
    desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
    # values missing from job.json fall back to the standard locations/names
    pp_dir = js.get('pp_dir',
                    osp.abspath(osp.join(args.workspace_path, job_id, "products")))
    manifest_filename = js.get('manifest_filename', 'wfc-' + js.grid_code + '.json')
    send_product_to_server(args, pp_dir, job_id, job_id, manifest_filename, desc)
def execute(args):
    """
    Executes a weather/fire simulation.

    The function clones and runs WPS (geogrid and ungrib in parallel, then
    metgrid), runs real.exe, optionally performs fuel moisture data
    assimilation, submits wrf.exe to the queueing system and then follows
    rsl.error.0000, postprocessing each history write until WRF reports
    successful completion.

    The args dictionary contains

    :param args: a dictionary with the following keys
    :param grid_code: the (unique) code of the grid that is used
    :param sys_install_path: system installation directory
    :param start_utc: start time of simulation in UTC
    :param end_utc: end time of simulation in UTC
    :param workspace_path: workspace directory
    :param wps_install_path: installation directory of WPS that will be used
    :param wrf_install_path: installation directory of WRF that will be used
    :param grib_source: a string identifying a valid GRIB2 source
    :param wps_namelist_path: the path to the namelist.wps file that will be used as template
    :param wrf_namelist_path: the path to the namelist.input file that will be used as template
    :param fire_namelist_path: the path to the namelist.fire file that will be used as template
    :param wps_geog_path: the path to the geogrid data directory providing terrain/fuel data
    :param email_notification: dictionary containing keys address and events indicating when a mail should be fired off

    The optional keys emissions_namelist_path and ignitions are honored when present.
    """
    logging.basicConfig(level=logging.INFO)

    # initialize the job state from the arguments
    js = JobState(args)

    logging.info("job %s starting [%d hours to forecast]." % (js.job_id, js.fc_hrs))
    send_email(js, 'start', 'Job %s started.' % js.job_id)

    # read in all namelists
    js.wps_nml = f90nml.read(args['wps_namelist_path'])
    js.wrf_nml = f90nml.read(args['wrf_namelist_path'])
    js.fire_nml = f90nml.read(args['fire_namelist_path'])
    js.ems_nml = None
    if 'emissions_namelist_path' in args:
        js.ems_nml = f90nml.read(args['emissions_namelist_path'])

    # Parse and setup the domain configuration
    js.domain_conf = WPSDomainConf(js.domains)

    num_doms = len(js.domain_conf)
    js.wps_nml['share']['start_date'] = [utc_to_esmf(js.start_utc)] * num_doms
    js.wps_nml['share']['end_date'] = [utc_to_esmf(js.end_utc)] * num_doms
    js.wps_nml['share']['interval_seconds'] = 3600

    logging.info("number of domains defined is %d." % num_doms)

    # build directories in workspace
    js.wps_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wps'))
    js.wrf_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wrf'))

    logging.info("cloning WPS into %s" % js.wps_dir)

    # step 1: clone WPS and WRF directories
    cln = WRFCloner(args)
    cln.clone_wps(js.wps_dir, js.grib_source.vtables(), [])

    # step 2: process domain information and patch namelist for geogrid
    js.wps_nml['geogrid']['geog_data_path'] = args['wps_geog_path']
    js.domain_conf.prepare_for_geogrid(js.wps_nml, js.wrf_nml, js.wrfxpy_dir, js.wps_dir)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    # do steps 2 & 3 & 4 in parallel (two execution streams)
    #  -> GEOGRID ->
    #  -> GRIB2 download ->  UNGRIB ->
    proc_q = Queue()
    geogrid_proc = Process(target=run_geogrid, args=(js, proc_q))
    grib_proc = Process(target=retrieve_gribs_and_run_ungrib, args=(js, proc_q))

    geogrid_proc.start()
    grib_proc.start()

    # wait until both tasks are done
    geogrid_proc.join()
    grib_proc.join()

    # each stream is expected to report its status on the queue; stop on any failure
    if proc_q.get() != 'SUCCESS':
        return
    if proc_q.get() != 'SUCCESS':
        return
    proc_q.close()

    # step 5: execute metgrid after ensuring all grids will be processed
    js.domain_conf.prepare_for_metgrid(js.wps_nml)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    logging.info("running METGRID")
    Metgrid(js.wps_dir).execute().check_output()

    send_email(js, 'metgrid', 'Job %s - metgrid complete.' % js.job_id)

    logging.info("cloning WRF into %s" % js.wrf_dir)

    # step 6: clone wrf directory, symlink all met_em* files
    cln.clone_wrf(js.wrf_dir, [])
    symlink_matching_files(js.wrf_dir, js.wps_dir, "met_em*")

    logging.info("running REAL")

    # step 7: patch input namelist, fire namelist, emissions namelist (if required)
    #         and execute real.exe
    time_ctrl = update_time_control(js.start_utc, js.end_utc, num_doms)
    js.wrf_nml['time_control'].update(time_ctrl)
    update_namelist(js.wrf_nml, js.grib_source.namelist_keys())
    if 'ignitions' in args:
        update_namelist(js.wrf_nml, render_ignitions(js, num_doms))

    # if we have an emissions namelist, automatically turn on the tracers
    if js.ems_nml is not None:
        f90nml.write(js.ems_nml, osp.join(js.wrf_dir, 'namelist.fire_emissions'), force=True)
        js.wrf_nml['dynamics']['tracer_opt'] = [2] * num_doms

    f90nml.write(js.wrf_nml, osp.join(js.wrf_dir, 'namelist.input'), force=True)
    f90nml.write(js.fire_nml, osp.join(js.wrf_dir, 'namelist.fire'), force=True)

    # try to run Real twice as it sometimes fails the first time
    # it's not clear why this error happens
    try:
        Real(js.wrf_dir).execute().check_output()
    except Exception as e:
        logging.error('Real step failed with exception %s, retrying ...' % str(e))
        Real(js.wrf_dir).execute().check_output()

    # step 8: if requested, do fuel moisture DA
    if js.fmda is not None:
        logging.info('running fuel moisture data assimilation')
        for dom in js.fmda.domains:
            # the WRF directory lives on the job state; a bare wrf_dir name does not exist here
            assimilate_fm10_observations(osp.join(js.wrf_dir, 'wrfinput_d%02d' % dom),
                                         None, js.fmda.token)

    logging.info('submitting WRF job')
    send_email(js, 'wrf_submit', 'Job %s - wrf job submitted.' % js.job_id)

    # step 9: execute wrf.exe on parallel backend
    js.task_id = "sim-" + js.grid_code + "-" + utc_to_esmf(js.start_utc)[:10]
    WRF(js.wrf_dir, js.qsys).submit(js.task_id, js.num_nodes, js.ppn, js.wall_time_hrs)

    send_email(js, 'wrf_exec',
               'Job %s - wrf job starting now with id %s.' % (js.job_id, js.task_id))
    logging.info("WRF job submitted with id %s, waiting for rsl.error.0000" % js.task_id)

    # step 10: wait for appearance of rsl.error.0000 and open it
    wrf_out = None
    while wrf_out is None:
        try:
            wrf_out = open(osp.join(js.wrf_dir, 'rsl.error.0000'))
        except IOError:
            logging.info('forecast: waiting 5 seconds for rsl.error.0000 file')
            time.sleep(5)

    logging.info('Detected rsl.error.0000')

    # step 11: track log output and check for history writes from WRF
    pp = None
    already_sent_files, max_pp_dom = [], -1
    if js.postproc is not None:
        js.pp_dir = osp.join(js.workspace_path, js.job_id, "products")
        make_dir(js.pp_dir)
        pp = Postprocessor(js.pp_dir, 'wfc-' + js.grid_code)
        # single-character keys of postproc are the domain numbers to postprocess
        pp_doms = [int(x) for x in js.postproc if len(x) == 1]
        if pp_doms:
            max_pp_dom = max(pp_doms)

    # the log file is closed when tracking ends, normally or by an exception
    with wrf_out:
        while True:
            raw_line = wrf_out.readline()
            if not raw_line:
                # end of file: WRF has not written anything new yet
                time.sleep(0.2)
                continue

            line = raw_line.strip()
            if not line:
                # a blank line in the log carries no information
                continue

            if "SUCCESS COMPLETE WRF" in line:
                send_email(js, 'complete', 'Job %s - wrf job complete SUCCESS.' % js.job_id)
                logging.info("WRF completion detected.")
                break

            if "Timing for Writing wrfout" in line:
                timing = re.match(r'.*wrfout_d.._([0-9_\-:]{19}) for domain\ +(\d+):', line)
                if timing is None:
                    # do not abort the tracking because of one malformed line
                    logging.warning('Cannot parse history write line: %s' % line)
                    continue
                esmf_time, domain_str = timing.groups()
                dom_id = int(domain_str)
                logging.info("Detected history write for domain %d for time %s." % (dom_id, esmf_time))
                if js.postproc is not None and str(dom_id) in js.postproc:
                    var_list = [str(x) for x in js.postproc[str(dom_id)]]
                    logging.info("Executing postproc instructions for vars %s for domain %d." % (str(var_list), dom_id))
                    wrfout_path = find_fresh_wrfout(js.wrf_dir, dom_id)
                    try:
                        pp.process_vars(wrfout_path, dom_id, esmf_time, var_list)
                    except Exception as e:
                        logging.warning('Failed to postprocess for time %s with error %s.' % (esmf_time, str(e)))

                # if this is the last processed domain for this timestamp in incremental mode, upload to server
                # (max_pp_dom stays -1 without postprocessing, so postproc is not None here)
                if dom_id == max_pp_dom and js.postproc.get('shuttle', None) == 'incremental':
                    desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
                    sent_files_1 = send_product_to_server(args, js.pp_dir, js.job_id, js.job_id, desc, already_sent_files)
                    logging.info('sent %d files to visualization server.' % len(sent_files_1))
                    # keep a real list (not a lazy filter object) so that it can be
                    # concatenated again on the next upload; json files are always re-sent
                    already_sent_files = [x for x in already_sent_files + sent_files_1
                                          if not x.endswith('json')]

    # if we are to send out the postprocessed files after completion, this is the time
    if js.postproc is not None and js.postproc.get('shuttle', None) == 'on_completion':
        desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
        send_product_to_server(args, js.pp_dir, js.job_id, js.job_id, desc)
def process_output(job_id):
    """
    Postprocess the WRF output of a job and record the progress in job.json.

    The job description is loaded from <workspace_path>/<job_id>/job.json.
    If postproc['from'] is 'wrfout', all wrfout files found in the WRF directory
    are postprocessed and the function returns. Otherwise rsl.error.0000 is
    followed and each detected history write is postprocessed until WRF reports
    successful completion or the parallel job is no longer running.

    Products are uploaded after each processed time step when postproc['shuttle']
    is 'incremental', or once at the end when it is 'on_completion'.

    Exits the process with status 1 if the job description cannot be loaded.

    :param job_id: the job identifier, also the name of the job directory in the workspace
    """
    args = load_sys_cfg()
    jobfile = osp.abspath(osp.join(args.workspace_path, job_id, 'job.json'))
    logging.info('process_output: loading job description from %s' % jobfile)
    try:
        with open(jobfile, 'r') as f:
            js = Dict(json.load(f))
    except Exception as e:
        logging.error('Cannot load the job description file %s' % jobfile)
        logging.error('%s' % e)
        sys.exit(1)

    def save_job_state():
        # rewrite job.json with the current job state and close the file right away
        with open(jobfile, 'w') as f:
            json.dump(js, f, indent=4, separators=(',', ': '))

    def upload_increment(sent_so_far):
        # upload the products not sent yet and return the updated list of sent files;
        # json files are dropped from the list so that they are sent again next time
        desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
        sent_files_1 = send_product_to_server(args, js.pp_dir, js.job_id, js.job_id,
                                              js.manifest_filename, desc, sent_so_far)
        # a real list (not a lazy filter object) can be concatenated again later
        return [x for x in sent_so_far + sent_files_1 if not x.endswith('json')]

    js.old_pid = js.pid
    js.pid = os.getpid()
    js.state = 'Processing'
    save_job_state()

    js.wrf_dir = osp.abspath(osp.join(args.workspace_path, js.job_id, 'wrf'))

    pp = None
    already_sent_files, max_pp_dom = [], -1
    if js.postproc is None:
        logging.info('No postprocessing specified, exiting.')
        return

    # set up postprocessing
    delete_visualization(js.job_id)
    js.pp_dir = osp.join(args.workspace_path, js.job_id, "products")
    make_clean_dir(js.pp_dir)
    pp = Postprocessor(js.pp_dir, 'wfc-' + js.grid_code)
    js.manifest_filename = 'wfc-' + js.grid_code + '.json'
    logging.debug('Postprocessor created manifest %s', js.manifest_filename)
    # single-character keys of postproc are the domain numbers to postprocess
    pp_doms = [int(x) for x in js.postproc if len(x) == 1]
    if pp_doms:
        max_pp_dom = max(pp_doms)

    if js.postproc.get('from', None) == 'wrfout':
        logging.info('Postprocessing all wrfout files.')
        # postprocess all wrfouts
        for wrfout_path in sorted(
                glob.glob(osp.join(js.wrf_dir, 'wrfout_d??_????-??-??_??:??:??'))):
            logging.info("Found %s" % wrfout_path)
            domain_str, wrfout_esmf_time = re.match(
                r'.*wrfout_d(0[0-9])_([0-9_\-:]{19})', wrfout_path).groups()
            dom_id = int(domain_str)
            d = nc4.Dataset(wrfout_path)
            try:
                # extract ESMF string times
                times = [''.join(x) for x in d.variables['Times'][:]]
            finally:
                d.close()
            for esmf_time in sorted(times):
                logging.info("Processing domain %d for time %s." % (dom_id, esmf_time))
                if js.postproc is not None and str(dom_id) in js.postproc:
                    var_list = [str(x) for x in js.postproc[str(dom_id)]]
                    logging.info(
                        "Executing postproc instructions for vars %s for domain %d."
                        % (str(var_list), dom_id))
                    try:
                        pp.process_vars(osp.join(js.wrf_dir, wrfout_path), dom_id,
                                        esmf_time, var_list)
                        # in incremental mode, upload to server
                        if js.postproc.get('shuttle', None) == 'incremental':
                            already_sent_files = upload_increment(already_sent_files)
                    except Exception as e:
                        logging.warning(
                            'Failed to postprocess for time %s with error %s.'
                            % (esmf_time, str(e)))

        # if we are to send out the postprocessed files after completion, this is the time
        if js.postproc.get('shuttle', None) == 'on_completion':
            desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
            send_product_to_server(args, js.pp_dir, js.job_id, js.job_id,
                                   js.manifest_filename, desc)

        save_job_state()
        return

    # step 9: wait for appearance of rsl.error.0000 and open it
    wrf_out = None
    rsl_path = osp.join(js.wrf_dir, 'rsl.error.0000')
    while wrf_out is None:
        try:
            wrf_out = open(rsl_path)
        except IOError:
            logging.info('process_output: waiting 5 seconds for rsl.error.0000 file')
            time.sleep(5)

    logging.info('process_output: Detected rsl.error.0000')
    # NOTE(review): time.ctime formats in local time although the key says utc - confirm intent
    js.run_utc = time.ctime(os.path.getmtime(rsl_path))
    js.processed_utc = time.asctime(time.gmtime())

    # step 10: track log output and check for history writes from WRF
    wait_lines = 0
    wait_wrfout = 0
    # the log file is closed when tracking ends, normally or by an exception
    with wrf_out:
        while True:
            raw_line = wrf_out.readline()
            if not raw_line:
                # end of file (readline returns '' only there): either WRF is still
                # running and will write more, or it stopped without completing
                if not parallel_job_running(js):
                    logging.warning('WRF did not run to completion.')
                    break
                if not wait_lines:
                    logging.info('Waiting for more output lines')
                wait_lines = wait_lines + 1
                time.sleep(5)
                continue
            wait_lines = 0

            line = raw_line.strip()
            if not line:
                # a blank line in the log is not the end of the output
                continue

            if "SUCCESS COMPLETE WRF" in line:
                # send_email(js, 'complete', 'Job %s - wrf job complete SUCCESS.' % js.job_id)
                logging.info("WRF completion detected.")
                js.old_job_num = js.job_num
                js.job_num = None
                save_job_state()
                break

            if "Timing for Writing wrfout" in line:
                wait_wrfout = 0
                timing = re.match(r'.*wrfout_d.._([0-9_\-:]{19}) for domain\ +(\d+):', line)
                if timing is None:
                    # do not abort the tracking because of one malformed line
                    logging.warning('Cannot parse history write line: %s' % line)
                    continue
                esmf_time, domain_str = timing.groups()
                dom_id = int(domain_str)
                logging.info("Detected history write for domain %d for time %s." % (dom_id, esmf_time))
                if js.postproc is not None and str(dom_id) in js.postproc:
                    var_list = [str(x) for x in js.postproc[str(dom_id)]]
                    logging.info(
                        "Executing postproc instructions for vars %s for domain %d."
                        % (str(var_list), dom_id))
                    wrfout_path = find_wrfout(js.wrf_dir, dom_id, esmf_time)
                    try:
                        pp.process_vars(osp.join(js.wrf_dir, wrfout_path), dom_id,
                                        esmf_time, var_list)
                    except Exception as e:
                        logging.warning(
                            'Failed to postprocess for time %s with error %s.'
                            % (esmf_time, str(e)))
                    else:
                        # in incremental mode, upload to server
                        if js.postproc.get('shuttle', None) == 'incremental':
                            already_sent_files = upload_increment(already_sent_files)
            else:
                if not wait_wrfout:
                    logging.info('Waiting for wrfout')
                wait_wrfout = wait_wrfout + 1

    # if we are to send out the postprocessed files after completion, this is the time
    if js.postproc.get('shuttle', None) == 'on_completion':
        desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
        send_product_to_server(args, js.pp_dir, js.job_id, js.job_id,
                               js.manifest_filename, desc)

    if js.postproc.get('shuttle', None) is not None:
        make_kmz(js.job_id)  # arguments can be added to the job id string

    js.old_pid = js.pid
    js.pid = None
    js.state = 'Completed'
    save_job_state()
dont_have_vars, have_vars = rtma.retrieve_rtma(cycle) if dont_have_vars: logging.info('RTMA variables %s not yet available for cycle %s.' % (str(dont_have_vars), str(cycle))) cycle -= timedelta(hours=1) lookback_length -= 1 else: break if dont_have_vars: logging.error('CYCLER could not find useable cycle, exiting.') sys.exit(1) logging.info('Have RTMA data for cycle %s.' % str(cycle)) # check for each region, if we are up to date w.r.t. RTMA data available for region_id,region_cfg in cfg.regions.iteritems(): wrapped_cfg = Dict(region_cfg) #if 1: # to run every time for debugging if not is_cycle_computed(cycle, wrapped_cfg, cfg.workspace_path): logging.info('CYCLER processing region %s for cycle %s' % (region_id, str(cycle))) fmda_advance_region(cycle, wrapped_cfg, rtma, cfg.workspace_path, lookback_length, meso_token) pp_path = postprocess_cycle(cycle, wrapped_cfg, cfg.workspace_path) if 'shuttle_remote_host' in sys_cfg: sim_code = 'fmda-' + wrapped_cfg.code send_product_to_server(sys_cfg, pp_path, sim_code, sim_code, region_id + ' FM') else: logging.info('CYCLER the cycle %s for region %s is already complete, skipping ...' % (str(cycle), str(region_id))) # done logging.info('CYCLER cycle %s complete.' % str(cycle))
def fmda_advance_region(cycle, cfg, rtma, wksp_path, lookback_length, meso_token):
    """
    Advance the fuel moisture estimates in the region specified by the configuration.
    The function assumes that the fuel moisture model has not been advanced to this
    cycle yet and will overwrite any previous computations.

    Control flow:

    1) read in RTMA variables
    2) check if there is a stored FM model for previous cycle
    2a) yes -> load it, advance one time-step, perform DA
    2b) no -> compute equilibrium, use background covariance to do DA
    3) store model

    If the model of the previous cycle is missing and lookback_length > 0, the
    function first calls itself for the previous cycle. The process exits with
    status 1 if retrieving the RTMA variables raises ValueError.

    :param cycle: the datetime indicating the processed cycle in UTC
    :param cfg: the configuration dictionary specifying the region
    :param rtma: the RTMA object that can be used to retrieve variables for this cycle
    :param wksp_path: the workspace path for the cycler
    :param lookback_length: number of cycles to search before we find a computed cycle
    :param meso_token: the mesowest API access token or a list of them
    :return: the model advanced and assimilated at the current cycle
    """
    logging.info("rtma_cycler.fmda_advance_region: %s" % str(cycle))
    model = None
    prev_cycle = cycle - timedelta(hours=1)
    prev_model_path = compute_model_path(prev_cycle, cfg.code, wksp_path)
    if not osp.exists(prev_model_path):
        logging.info('CYCLER cannot find model from previous cycle %s' % str(prev_cycle))
        if lookback_length > 0:
            # compute the previous cycle first, going back at most lookback_length cycles
            model = fmda_advance_region(prev_cycle, cfg, rtma, wksp_path,
                                        lookback_length - 1, meso_token)
    else:
        logging.info('CYCLER found previous model for cycle %s.' % str(prev_cycle))
        model = FuelMoistureModel.from_netcdf(prev_model_path)

    # retrieve the variables and make sure they are available (we should not be here if they are not)
    try:
        dont_have_vars, have_vars = rtma.retrieve_rtma(cycle)
    except ValueError as e:
        logging.error(e)
        sys.exit(1)
    assert not dont_have_vars

    logging.info('CYCLER loading RTMA data for cycle %s.' % str(cycle))
    TD, T2, RH, precipa, hgt, lats, lons = load_rtma_data(have_vars, cfg.bbox)
    Ed, Ew = compute_equilibria(T2, RH)

    # work on a copy so that the original precipitation is stored unchanged below
    rain = precipa[:, :] + 0
    # remove rain that is too small to make any difference
    rain[rain < 0.01] = 0
    # remove bogus rain that is too large
    rain[rain > 1e10] = 0

    dom_shape = T2.shape

    # store the lons/lats for this domain
    geo_path = osp.join(wksp_path, '%s-geo.nc' % cfg.code)
    if not osp.isfile(geo_path):
        logging.info('CYCLER initializing new file %s.' % (geo_path))
        d = netCDF4.Dataset(geo_path, 'w', format='NETCDF4')
        try:
            d.createDimension('south_north', dom_shape[0])
            d.createDimension('west_east', dom_shape[1])
            xlat = d.createVariable('XLAT', 'f4', ('south_north', 'west_east'))
            xlat[:, :] = lats
            xlong = d.createVariable('XLONG', 'f4', ('south_north', 'west_east'))
            xlong[:, :] = lons
        finally:
            # close the dataset even if writing a variable fails
            d.close()
    else:
        logging.info('CYCLER file already exists: %s.' % (geo_path))

    # the process noise matrix
    Q = np.diag([1e-4, 5e-5, 1e-5, 1e-6, 1e-6])

    # background covariance
    P0 = np.diag([0.01, 0.01, 0.01, 0.001, 0.001])

    # check if we must start from equilibrium
    if model is None:
        logging.info('CYCLER initializing from equilibrium for cycle %s.' % (str(cycle)))
        # setup model parameters
        Tk = np.array([1.0, 10.0, 100.0])
        # the initial state is the mean of the two equilibria, replicated for the three fuel classes
        m0 = np.expand_dims(0.5 * (Ed + Ew), axis=2)
        model = FuelMoistureModel(m0[:, :, [0, 0, 0]], Tk, P0)
    else:
        logging.info('CYCLER advancing model one hour to cycle %s.' % (str(cycle)))
        dt = 3600  # always 1 hr step in RTMA
        model.advance_model(Ed, Ew, rain, dt, Q)

    logging.info('CYCLER retrieving fm-10 observations for cycle %s.' % (str(cycle)))

    # perform assimilation with mesowest observations
    tm_start = cycle - timedelta(minutes=30)
    tm_end = cycle + timedelta(minutes=30)
    fm10 = retrieve_mesowest_observations(meso_token, tm_start, tm_end, lats, lons, hgt)
    # flatten the observation values only to report statistics
    fm10v = [obs.get_value() for fm10_obs in fm10.values() for obs in fm10_obs]

    if fm10v:
        logging.info(
            'CYCLER retrieved %d valid observations, min/mean/max [%g/%g/%g].'
            % (len(fm10v), np.amin(fm10v), np.mean(fm10v), np.amax(fm10v)))
    else:
        # the numpy reductions above raise ValueError on an empty sequence
        logging.info('CYCLER retrieved 0 valid observations.')

    # run the data assimilation step
    covs = [np.ones(dom_shape), hgt / 2000.0]
    covs_names = ['const', 'hgt/2000']
    if np.any(rain > 0.01):
        covs.append(rain)
        covs_names.append('rain')
    execute_da_step(model, cycle, covs, covs_names, fm10)

    # make geogrid files for WPS; datasets and lines to add to GEOGRID.TBL
    geo_path = compute_model_path(cycle, cfg.code, wksp_path, ext="geo")
    index = rtma.geogrid_index()
    print('index', index)
    model.to_geogrid(geo_path, index, lats, lons)

    # make wps format files for WPS
    fmda_path = osp.join(wksp_path, cfg.code,
                         '{:04d}{:02d}'.format(cycle.year, cycle.month))
    time_tag = '{:04d}-{:02d}-{:02d}_{:02d}'.format(cycle.year, cycle.month,
                                                    cycle.day, cycle.hour)
    model.to_wps_format(fmda_path, index, lats, lons, time_tag)

    # store the new model
    model_path = compute_model_path(cycle, cfg.code, wksp_path)
    logging.info('CYCLER writing model variables to:  %s.' % model_path)
    model.to_netcdf(
        ensure_dir(model_path), {
            'EQUILd FM': Ed,
            'EQUILw FM': Ew,
            'TD': TD,
            'T2': T2,
            'RH': RH,
            'PRECIPA': precipa,
            'PRECIP': rain,
            'HGT': hgt
        })

    # create visualization and send results
    bounds = (lons.min(), lons.max(), lats.min(), lats.max())
    pp_path = postprocess_cycle(cycle, cfg, wksp_path, bounds)
    if pp_path is not None and 'shuttle_remote_host' in sys_cfg:
        sim_code = 'fmda-' + cfg.code
        send_product_to_server(sys_cfg, pp_path, sim_code, sim_code,
                               sim_code + '.json', cfg.region_id + ' FM')

    return model
if dont_have_vars: logging.warning('CYCLER could not find useable cycle.') logging.warning('CYCLER copying previous post-processing.') for region_id, region_cfg in six.iteritems(cfg.regions): wrapped_cfg = Dict(region_cfg) wrapped_cfg.update({'region_id': region_id}) try: bounds = compute_rtma_bounds(wrapped_cfg.bbox) pp_path = postprocess_cycle(cycle, wrapped_cfg, cfg.workspace_path, bounds) if pp_path != None: if 'shuttle_remote_host' in sys_cfg: sim_code = 'fmda-' + wrapped_cfg.code send_product_to_server(sys_cfg, pp_path, sim_code, sim_code, sim_code + '.json', region_id + ' FM') except Exception as e: logging.warning('CYCLER exception {}'.format(e)) logging.error('CYCLER skipping region {} for cycle {}'.format( region_id, str(cycle))) sys.exit(1) logging.info('Have RTMA data for cycle %s.' % str(cycle)) # check for each region, if we are up to date w.r.t. RTMA data available for region_id, region_cfg in six.iteritems(cfg.regions): wrapped_cfg = Dict(region_cfg) wrapped_cfg.update({'region_id': region_id}) #if 1: # to run every time for debugging if not is_cycle_computed(cycle, wrapped_cfg, cfg.workspace_path):
def process_output(job_id):
    """
    Follow the WRF log of a job, postprocess history writes and record progress in job.json.

    The job description is loaded from <workspace_path>/<job_id>/job.json. The
    function waits for rsl.error.0000 in the WRF directory and reads it until
    WRF reports successful completion or the parallel job is no longer running.
    For each detected history write of a domain listed in postproc, the requested
    variables are postprocessed.

    Products are uploaded after each processed time step when postproc['shuttle']
    is 'incremental', or once at the end when it is 'on_completion'.

    Exits the process with status 1 if the job description cannot be loaded.

    :param job_id: the job identifier, also the name of the job directory in the workspace
    """
    args = load_sys_cfg()
    jobfile = osp.abspath(osp.join(args.workspace_path, job_id, 'job.json'))
    logging.info('process_output: loading job description from %s' % jobfile)
    try:
        with open(jobfile, 'r') as f:
            js = Dict(json.load(f))
    except Exception as e:
        logging.error('Cannot load the job description file %s' % jobfile)
        logging.error('%s' % e)
        sys.exit(1)

    def save_job_state():
        # rewrite job.json with the current job state and close the file right away
        with open(jobfile, 'w') as f:
            json.dump(js, f, indent=4, separators=(',', ': '))

    js.old_pid = js.pid
    js.pid = os.getpid()
    js.state = 'Processing'
    save_job_state()

    js.wrf_dir = osp.abspath(osp.join(args.workspace_path, js.job_id, 'wrf'))

    # step 9: wait for appearance of rsl.error.0000 and open it
    wrf_out = None
    while wrf_out is None:
        try:
            wrf_out = open(osp.join(js.wrf_dir, 'rsl.error.0000'))
        except IOError:
            logging.info('process_output: waiting 5 seconds for rsl.error.0000 file')
            time.sleep(5)

    logging.info('process_output: Detected rsl.error.0000')

    # step 10: track log output and check for history writes from WRF
    pp = None
    already_sent_files, max_pp_dom = [], -1
    if js.postproc is not None:
        js.pp_dir = osp.join(args.workspace_path, js.job_id, "products")
        make_clean_dir(js.pp_dir)
        pp = Postprocessor(js.pp_dir, 'wfc-' + js.grid_code)
        # single-character keys of postproc are the domain numbers to postprocess
        pp_doms = [int(x) for x in js.postproc if len(x) == 1]
        if pp_doms:
            max_pp_dom = max(pp_doms)

    wait_lines = 0
    wait_wrfout = 0
    # the log file is closed when tracking ends, normally or by an exception
    with wrf_out:
        while True:
            raw_line = wrf_out.readline()
            if not raw_line:
                # end of file (readline returns '' only there): either WRF is still
                # running and will write more, or it stopped without completing
                if not parallel_job_running(js):
                    logging.warning('WRF did not run to completion.')
                    break
                if not wait_lines:
                    logging.info('Waiting for more output lines')
                wait_lines = wait_lines + 1
                time.sleep(0.5)
                continue
            wait_lines = 0

            line = raw_line.strip()
            if not line:
                # a blank line in the log is not the end of the output
                continue

            if "SUCCESS COMPLETE WRF" in line:
                # send_email(js, 'complete', 'Job %s - wrf job complete SUCCESS.' % js.job_id)
                logging.info("WRF completion detected.")
                js.old_job_num = js.job_num
                js.job_num = None
                save_job_state()
                break

            if "Timing for Writing wrfout" in line:
                timing = re.match(r'.*wrfout_d.._([0-9_\-:]{19}) for domain\ +(\d+):', line)
                if timing is None:
                    # do not abort the tracking because of one malformed line
                    logging.warning('Cannot parse history write line: %s' % line)
                    continue
                esmf_time, domain_str = timing.groups()
                dom_id = int(domain_str)
                logging.info("Detected history write for domain %d for time %s." % (dom_id, esmf_time))
                if js.postproc is not None and str(dom_id) in js.postproc:
                    var_list = [str(x) for x in js.postproc[str(dom_id)]]
                    logging.info("Executing postproc instructions for vars %s for domain %d." % (str(var_list), dom_id))
                    wrfout_path = find_wrfout(js.wrf_dir, dom_id, esmf_time)
                    try:
                        pp.process_vars(osp.join(js.wrf_dir, wrfout_path), dom_id, esmf_time, var_list)
                    except Exception as e:
                        logging.warning('Failed to postprocess for time %s with error %s.' % (esmf_time, str(e)))

                    # in incremental mode, upload to server
                    if js.postproc.get('shuttle', None) == 'incremental':
                        desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
                        sent_files_1 = send_product_to_server(args, js.pp_dir, js.job_id, js.job_id, desc, already_sent_files)
                        # keep a real list (not a lazy filter object) so that it can be
                        # concatenated again on the next upload; json files are always re-sent
                        already_sent_files = [x for x in already_sent_files + sent_files_1
                                              if not x.endswith('json')]
                wait_wrfout = 0
            else:
                if not wait_wrfout:
                    logging.info('Waiting for wrfout')
                wait_wrfout = wait_wrfout + 1

    # if we are to send out the postprocessed files after completion, this is the time
    if js.postproc is not None and js.postproc.get('shuttle', None) == 'on_completion':
        desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
        send_product_to_server(args, js.pp_dir, js.job_id, js.job_id, desc)

    js.old_pid = js.pid
    js.pid = None
    js.state = 'Completed'
    save_job_state()
def execute(args):
    """
    Executes a weather/fire simulation.

    The function clones and runs WPS (geogrid and ungrib in parallel, then
    metgrid), runs real.exe, optionally performs fuel moisture data
    assimilation, submits wrf.exe to the queueing system and then follows
    rsl.error.0000, postprocessing each history write until WRF reports
    successful completion.

    The args dictionary contains

    :param args: a dictionary with the following keys
    :param grid_code: the (unique) code of the grid that is used
    :param sys_install_path: system installation directory
    :param start_utc: start time of simulation in UTC
    :param end_utc: end time of simulation in UTC
    :param workspace_path: workspace directory
    :param wps_install_path: installation directory of WPS that will be used
    :param wrf_install_path: installation directory of WRF that will be used
    :param grib_source: a string identifying a valid GRIB2 source
    :param wps_namelist_path: the path to the namelist.wps file that will be used as template
    :param wrf_namelist_path: the path to the namelist.input file that will be used as template
    :param fire_namelist_path: the path to the namelist.fire file that will be used as template
    :param wps_geog_path: the path to the geogrid data directory providing terrain/fuel data
    :param email_notification: dictionary containing keys address and events indicating when a mail should be fired off

    The optional keys emissions_namelist_path and ignitions are honored when present.
    """
    logging.basicConfig(level=logging.INFO)

    # initialize the job state from the arguments
    js = JobState(args)

    logging.info("job %s starting [%d hours to forecast]." % (js.job_id, js.fc_hrs))
    send_email(js, 'start', 'Job %s started.' % js.job_id)

    # read in all namelists
    js.wps_nml = f90nml.read(args['wps_namelist_path'])
    js.wrf_nml = f90nml.read(args['wrf_namelist_path'])
    js.fire_nml = f90nml.read(args['fire_namelist_path'])
    js.ems_nml = None
    if 'emissions_namelist_path' in args:
        js.ems_nml = f90nml.read(args['emissions_namelist_path'])

    # Parse and setup the domain configuration
    js.domain_conf = WPSDomainConf(js.domains)

    num_doms = len(js.domain_conf)
    js.wps_nml['share']['start_date'] = [utc_to_esmf(js.start_utc)] * num_doms
    js.wps_nml['share']['end_date'] = [utc_to_esmf(js.end_utc)] * num_doms
    js.wps_nml['share']['interval_seconds'] = 3600

    logging.info("number of domains defined is %d." % num_doms)

    # build directories in workspace
    js.wps_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wps'))
    js.wrf_dir = osp.abspath(osp.join(js.workspace_path, js.job_id, 'wrf'))

    logging.info("cloning WPS into %s" % js.wps_dir)

    # step 1: clone WPS and WRF directories
    cln = WRFCloner(args)
    cln.clone_wps(js.wps_dir, js.grib_source.vtables(), [])

    # step 2: process domain information and patch namelist for geogrid
    js.wps_nml['geogrid']['geog_data_path'] = args['wps_geog_path']
    js.domain_conf.prepare_for_geogrid(js.wps_nml, js.wrf_nml, js.wrfxpy_dir, js.wps_dir)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    # do steps 2 & 3 & 4 in parallel (two execution streams)
    #  -> GEOGRID ->
    #  -> GRIB2 download ->  UNGRIB ->
    proc_q = Queue()
    geogrid_proc = Process(target=run_geogrid, args=(js, proc_q))
    grib_proc = Process(target=retrieve_gribs_and_run_ungrib, args=(js, proc_q))

    geogrid_proc.start()
    grib_proc.start()

    # wait until both tasks are done
    geogrid_proc.join()
    grib_proc.join()

    # each stream is expected to report its status on the queue; stop on any failure
    if proc_q.get() != 'SUCCESS':
        return
    if proc_q.get() != 'SUCCESS':
        return
    proc_q.close()

    # step 5: execute metgrid after ensuring all grids will be processed
    js.domain_conf.prepare_for_metgrid(js.wps_nml)
    f90nml.write(js.wps_nml, osp.join(js.wps_dir, 'namelist.wps'), force=True)

    logging.info("running METGRID")
    Metgrid(js.wps_dir).execute().check_output()

    send_email(js, 'metgrid', 'Job %s - metgrid complete.' % js.job_id)

    logging.info("cloning WRF into %s" % js.wrf_dir)

    # step 6: clone wrf directory, symlink all met_em* files
    cln.clone_wrf(js.wrf_dir, [])
    symlink_matching_files(js.wrf_dir, js.wps_dir, "met_em*")

    logging.info("running REAL")

    # step 7: patch input namelist, fire namelist, emissions namelist (if required)
    #         and execute real.exe
    time_ctrl = update_time_control(js.start_utc, js.end_utc, num_doms)
    js.wrf_nml['time_control'].update(time_ctrl)
    update_namelist(js.wrf_nml, js.grib_source.namelist_keys())
    if 'ignitions' in args:
        update_namelist(js.wrf_nml, render_ignitions(js, num_doms))

    # if we have an emissions namelist, automatically turn on the tracers
    if js.ems_nml is not None:
        f90nml.write(js.ems_nml, osp.join(js.wrf_dir, 'namelist.fire_emissions'), force=True)
        js.wrf_nml['dynamics']['tracer_opt'] = [2] * num_doms

    f90nml.write(js.wrf_nml, osp.join(js.wrf_dir, 'namelist.input'), force=True)
    f90nml.write(js.fire_nml, osp.join(js.wrf_dir, 'namelist.fire'), force=True)

    # try to run Real twice as it sometimes fails the first time
    # it's not clear why this error happens
    try:
        Real(js.wrf_dir).execute().check_output()
    except Exception as e:
        logging.error('Real step failed with exception %s, retrying ...' % str(e))
        Real(js.wrf_dir).execute().check_output()

    # step 8: if requested, do fuel moisture DA
    if js.fmda is not None:
        logging.info('running fuel moisture data assimilation')
        for dom in js.fmda.domains:
            # the WRF directory lives on the job state; a bare wrf_dir name does not exist here
            assimilate_fm10_observations(osp.join(js.wrf_dir, 'wrfinput_d%02d' % dom),
                                         None, js.fmda.token)

    logging.info('submitting WRF job')
    send_email(js, 'wrf_submit', 'Job %s - wrf job submitted.' % js.job_id)

    # step 9: execute wrf.exe on parallel backend
    js.task_id = "sim-" + js.grid_code + "-" + utc_to_esmf(js.start_utc)[:10]
    WRF(js.wrf_dir, js.qsys).submit(js.task_id, js.num_nodes, js.ppn, js.wall_time_hrs)

    send_email(js, 'wrf_exec',
               'Job %s - wrf job starting now with id %s.' % (js.job_id, js.task_id))
    logging.info("WRF job submitted with id %s, waiting for rsl.error.0000" % js.task_id)

    # step 10: wait for appearance of rsl.error.0000 and open it
    wrf_out = None
    while wrf_out is None:
        try:
            wrf_out = open(osp.join(js.wrf_dir, 'rsl.error.0000'))
        except IOError:
            logging.info('forecast: waiting 5 seconds for rsl.error.0000 file')
            time.sleep(5)

    logging.info('Detected rsl.error.0000')

    # step 11: track log output and check for history writes from WRF
    pp = None
    already_sent_files, max_pp_dom = [], -1
    if js.postproc is not None:
        js.pp_dir = osp.join(js.workspace_path, js.job_id, "products")
        make_dir(js.pp_dir)
        pp = Postprocessor(js.pp_dir, 'wfc-' + js.grid_code)
        # single-character keys of postproc are the domain numbers to postprocess
        pp_doms = [int(x) for x in js.postproc if len(x) == 1]
        if pp_doms:
            max_pp_dom = max(pp_doms)

    # the log file is closed when tracking ends, normally or by an exception
    with wrf_out:
        while True:
            raw_line = wrf_out.readline()
            if not raw_line:
                # end of file: WRF has not written anything new yet
                time.sleep(0.2)
                continue

            line = raw_line.strip()
            if not line:
                # a blank line in the log carries no information
                continue

            if "SUCCESS COMPLETE WRF" in line:
                send_email(js, 'complete', 'Job %s - wrf job complete SUCCESS.' % js.job_id)
                logging.info("WRF completion detected.")
                break

            if "Timing for Writing wrfout" in line:
                timing = re.match(r'.*wrfout_d.._([0-9_\-:]{19}) for domain\ +(\d+):', line)
                if timing is None:
                    # do not abort the tracking because of one malformed line
                    logging.warning('Cannot parse history write line: %s' % line)
                    continue
                esmf_time, domain_str = timing.groups()
                dom_id = int(domain_str)
                logging.info("Detected history write for domain %d for time %s." % (dom_id, esmf_time))
                if js.postproc is not None and str(dom_id) in js.postproc:
                    var_list = [str(x) for x in js.postproc[str(dom_id)]]
                    logging.info(
                        "Executing postproc instructions for vars %s for domain %d."
                        % (str(var_list), dom_id))
                    wrfout_path = find_fresh_wrfout(js.wrf_dir, dom_id)
                    try:
                        pp.process_vars(wrfout_path, dom_id, esmf_time, var_list)
                    except Exception as e:
                        logging.warning(
                            'Failed to postprocess for time %s with error %s.'
                            % (esmf_time, str(e)))

                # if this is the last processed domain for this timestamp in incremental mode, upload to server
                # (max_pp_dom stays -1 without postprocessing, so postproc is not None here)
                if dom_id == max_pp_dom and js.postproc.get('shuttle', None) == 'incremental':
                    desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
                    sent_files_1 = send_product_to_server(args, js.pp_dir, js.job_id,
                                                          js.job_id, desc, already_sent_files)
                    logging.info('sent %d files to visualization server.' % len(sent_files_1))
                    # keep a real list (not a lazy filter object) so that it can be
                    # concatenated again on the next upload; json files are always re-sent
                    already_sent_files = [x for x in already_sent_files + sent_files_1
                                          if not x.endswith('json')]

    # if we are to send out the postprocessed files after completion, this is the time
    if js.postproc is not None and js.postproc.get('shuttle', None) == 'on_completion':
        desc = js.postproc['description'] if 'description' in js.postproc else js.job_id
        send_product_to_server(args, js.pp_dir, js.job_id, js.job_id, desc)