def lambda_handler(event, context):
    '''
    This lambda handler calls submit_job with the job type info
    and product id from the sns message
    '''
    print("Got event of type: %s" % type(event))
    print("Got event: %s" % json.dumps(event))
    print("Got context: %s" % context)

    # parse sns message
    message = json.loads(event["Records"][0]["Sns"]["Message"])

    # parse s3 event
    s3_info = message['Records'][0]['s3']

    # parse signal and dataset files and urls
    bucket = s3_info['bucket']['name']
    trigger_file = s3_info['object']['key']
    print("Trigger file: {}".format(trigger_file))
    if signal_file_suffix:
        ds_file = trigger_file.replace(signal_file_suffix, '')
    else:
        ds_file = trigger_file
    ds_url = "s3://%s/%s/%s" % (os.environ['DATASET_S3_ENDPOINT'], bucket, ds_file)

    # read in metadata
    md = {}
    if signal_file_suffix:
        s3 = boto3.resource('s3')
        obj = s3.Object(bucket, trigger_file)
        md = json.loads(obj.get()['Body'].read())
        print("Got signal metadata: %s" % json.dumps(md, indent=2))

    # data file
    id = data_file = os.path.basename(ds_url)

    # submit mozart jobs to update ES
    default_job_type = os.environ['JOB_TYPE']  # e.g. "INGEST_L0A_LR_RAW"
    default_job_release = os.environ['JOB_RELEASE']  # e.g. "gman-dev"
    default_queue = os.environ['JOB_QUEUE']
    job_types = {}
    if 'JOB_TYPES' in os.environ:
        job_types = json.loads(os.environ['JOB_TYPES'])
    job_type, job_release, queue = __get_job_type_info(
        data_file, job_types, default_job_type, default_job_release, default_queue)
    job_spec = "job-%s:%s" % (job_type, job_release)
    job_params = {
        "id": id,
        "data_url": ds_url,
        "data_file": data_file,
        "prod_met": md,
    }
    tags = ["data-staged"]

    # submit mozart job
    submit_job(job_spec, job_params, queue, tags)
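# A minimal sketch, assuming a standard SNS notification wrapping an S3 event
# notification, of the doubly nested event this handler parses. Bucket and
# key names below are hypothetical placeholders.
import json

_example_event = {
    "Records": [{
        "Sns": {
            "Message": json.dumps({
                "Records": [{
                    "s3": {
                        "bucket": {"name": "my-staging-bucket"},
                        "object": {"key": "product-001/product-001.signal"}
                    }
                }]
            })
        }
    }]
}
# lambda_handler(_example_event, None) would then submit a single mozart job.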
def create_and_submit_job(self, batch_client: batch.BatchExtensionsClient):
    """
    Creates the Job that will be submitted to the batch service

    :param batch_client: The batch client to use.
    :type batch_client: `azure.batch.BatchExtensionsClient`
    """
    logger.info('Creating Job [{}]... job will run on [{}]'.format(
        self.job_id, self.pool_id))

    # load the template and parameters file
    template = ctm.load_file(self.template_file)
    parameters = ctm.load_file(self.parameters_file)

    # updates any placeholder parameter values with the values from
    # keyVault, if required
    utils.update_params_with_values_from_keyvault(
        parameters, self.keyvault_client_with_url)

    # overrides some of the parameters needed in the file, container SAS
    # tokens need to be generated for the container
    ctm.set_parameter_name(parameters, self.job_id)
    ctm.set_parameter_storage_info(parameters, self.storage_info)
    ctm.set_template_pool_id(parameters, self.pool_id)
    ctm.set_job_resource_file_urls_to_branch(
        template, self.repository_branch_name)

    # Submits the job
    utils.submit_job(batch_client, template, parameters, self.raw_job_id)
def run_wps_geogrid(work_root, wps_root, config, args):
    wps_work_dir = os.path.abspath(work_root) + '/wps'
    if not os.path.isdir(wps_work_dir):
        os.mkdir(wps_work_dir)
    os.chdir(wps_work_dir)

    cli.notice(f'Run geogrid.exe at {wps_work_dir} ...')
    if not os.path.isfile('GEOGRID.TBL'):
        run(f'ln -sf {wps_root}/geogrid/GEOGRID.TBL.ARW {wps_work_dir}/GEOGRID.TBL')
    expected_files = ['geo_em.d{:02d}.nc'.format(i + 1)
                      for i in range(config['domains']['max_dom'])]
    if not check_files(expected_files):
        run('rm -f geo_em.d*.nc')
        submit_job(f'{wps_root}/geogrid/src/geogrid.exe', args.np, config, args,
                   logfile='geogrid.log.0000', wait=True)
        if not check_files(expected_files):
            cli.error(f'Failed! Check output {os.path.abspath(wps_work_dir)}/geogrid.out.0000')
        cli.notice('Succeeded.')
    else:
        cli.notice('Files geo_em.*.nc already exist.')
    run(f'ls -l {wps_work_dir}/geo_em.*.nc')
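# run, check_files, and submit_job above are project helpers not shown in
# this file. A minimal sketch of plausible stand-ins, assuming run shells out
# and check_files merely tests existence; the real helpers likely do more
# (logging, error handling, scheduler interaction):
import os
import subprocess

def run(cmd):
    # Hypothetical stand-in: echo and execute a shell command.
    print(f'=> {cmd}')
    subprocess.run(cmd, shell=True, check=False)

def check_files(paths):
    # Hypothetical stand-in: true only if every expected file exists.
    if isinstance(paths, str):
        paths = [paths]
    return all(os.path.isfile(p) for p in paths)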
def run_wrfplus_ad(work_root, wrfplus_root, config, args):
    start_time = config['custom']['start_time']
    end_time = config['custom']['end_time']
    datetime_fmt = 'YYYY-MM-DD_HH:mm:ss'
    start_time_str = start_time.format(datetime_fmt)
    max_dom = config['domains']['max_dom']

    wrf_work_dir = os.path.abspath(work_root) + '/wrf'
    if not os.path.isdir(wrf_work_dir):
        cli.error(f'WRF work directory {wrf_work_dir} does not exist!')

    wrfplus_work_dir = os.path.abspath(work_root) + '/wrfplus'
    if not os.path.isdir(wrfplus_work_dir):
        cli.error('WRFPLUS has not been configured! Run config_wrfplus.py first.')
    os.chdir(wrfplus_work_dir)

    if os.path.isfile(f'{wrf_work_dir}/wrfinput_d01_{start_time_str}'):
        run(f'ln -sf {wrf_work_dir}/wrfinput_d01 .')
    elif os.path.isfile(f'{wrf_work_dir}/wrfout_d01_{start_time_str}'):
        run(f'ln -sf {wrf_work_dir}/wrfout_d01_{start_time_str} wrfinput_d01')
    run(f'ln -sf {wrf_work_dir}/wrfbdy_d01 .')

    if not os.path.isfile('final_sens_d01'):
        cli.error('There is no final_sens_d01 file!')

    version = wrf_version(wrfplus_root)

    cli.stage(f'Run WRFPLUS at {wrfplus_work_dir} ...')
    expected_files = ['wrfout_d{:02d}_{}'.format(i + 1, start_time_str)
                      for i in range(max_dom)]
    expected_files.append(f'init_sens_d01_{start_time_str}')
    if not check_files(expected_files) or args.force:
        run('rm -f wrfout_*')
        run(f'ln -sf {wrfplus_root}/run/LANDUSE.TBL .')
        run(f'ln -sf {wrfplus_root}/run/VEGPARM.TBL .')
        run(f'ln -sf {wrfplus_root}/run/SOILPARM.TBL .')
        run(f'ln -sf {wrfplus_root}/run/GENPARM.TBL .')
        run(f'ln -sf {wrfplus_root}/run/RRTM_DATA_DBL RRTM_DATA')
        run(f'ln -sf {wrfplus_root}/run/ETAMPNEW_DATA_DBL ETAMPNEW_DATA')
        if version >= Version('4.0'):
            cmd = f'{wrfplus_root}/run/wrfplus.exe'
        else:
            cmd = f'{wrfplus_root}/run/wrf.exe'
        retries = 0
        while True:
            submit_job(cmd, args.np, config, args, wait=True)
            if os.path.isfile(f'gradient_wrfplus_d01_{start_time_str}'):
                run(f'mv gradient_wrfplus_d01_{start_time_str} init_sens_d01_{start_time_str}')
            if not check_files(expected_files):
                if retries == 10:
                    cli.error(f'Failed! Check output {os.path.abspath(wrfplus_work_dir)}/rsl.error.0000.')
                retries = retries + 1
                cli.warning('Failed to run wrfplus, retry it!')
            else:
                break
        cli.notice('Succeeded.')
    else:
        cli.notice('Files wrfout_* already exist.')
    run(f'ls -l {wrfplus_work_dir}/wrfout_*')
def test_cni_labels():
    driver_task_id = utils.submit_job(
        app_url=SPARK_EXAMPLES,
        app_args="3000",  # Long enough to examine the Driver's & Executor's task infos
        app_name="/spark",
        args=["--conf", "spark.mesos.network.name=dcos",
              "--conf", "spark.mesos.network.labels=key1:val1,key2:val2",
              "--conf", "spark.cores.max={}".format(CNI_TEST_NUM_EXECUTORS),
              "--class", "org.apache.spark.examples.SparkPi"])

    # Wait until executors are running
    utils.wait_for_executors_running(SPARK_PI_FW_NAME, CNI_TEST_NUM_EXECUTORS)

    # Check for network name / labels in Driver task info
    driver_task = shakedown.get_task(driver_task_id, completed=False)
    _check_task_network_info(driver_task)

    # Check for network name / labels in Executor task info
    executor_task = shakedown.get_service_tasks(SPARK_PI_FW_NAME)[0]
    _check_task_network_info(executor_task)

    # Check job output
    utils.check_job_output(driver_task_id, "Pi is roughly 3")
def submit_example_aws_batch_job():
    boto3.setup_default_session(region_name='us-east-1')
    set_batch_client(boto3.client('batch'))
    set_cloudwatch_client(boto3.client('logs'))

    job_queue = "my_queue_here"
    # AWS Batch job names only allow letters, numbers, hyphens, and
    # underscores, so avoid the colons that isoformat() would produce.
    job_name = f"example_job-{datetime.now().strftime('%Y-%m-%dT%H-%M-%S')}"
    print(f"JobName: {job_name}")

    command_args = {'-arg1': "value1", '-arg2': "value2"}
    command = []
    for k, v in command_args.items():
        command.append(k)
        command.append(v)
    # actual command is a flat list: ['-arg1', 'value1', '-arg2', 'value2']

    job_def_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                "example_batch_job_definition.json")
    job_def = update_job_definition_from_file(job_def_file)
    job_id = submit_job(command,
                        job_definition_arn=job_def['jobDefinitionArn'],
                        job_name=job_name,
                        job_queue=job_queue)
    track_job(job_id)
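# submit_job above is a project helper; a minimal sketch of what it plausibly
# wraps, assuming the command list is passed through containerOverrides. The
# helper name and keyword arguments mirror the call above; the underlying
# batch.submit_job call is the real boto3 API.
import boto3

def _submit_job_sketch(command, job_definition_arn, job_name, job_queue):
    client = boto3.client('batch')
    response = client.submit_job(
        jobName=job_name,
        jobQueue=job_queue,
        jobDefinition=job_definition_arn,
        containerOverrides={'command': command})
    return response['jobId']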
def test_disconnect_from_master():
    python_script_path = os.path.join(THIS_DIR, 'jobs', 'python', 'long_running.py')
    python_script_url = utils.upload_file(python_script_path)
    task_id = utils.submit_job(
        python_script_url,
        "{} {}".format(LONG_RUNNING_FW_NUM_TASKS, LONG_RUNNING_RUN_TIME_SEC),
        ["--conf", "spark.mesos.driver.failoverTimeout=1800",
         "--conf", "spark.cores.max=1"])

    # Wait until executor is running
    utils.wait_for_executors_running(LONG_RUNNING_FW_NAME, LONG_RUNNING_FW_NUM_TASKS)

    # Block the driver's connection to Mesos master
    framework_info = shakedown.get_service(LONG_RUNNING_FW_NAME)
    (driver_host, port) = _parse_fw_pid_host_port(framework_info["pid"])
    _block_master_connection(driver_host, port)

    # The connection will timeout after 15 minutes of inactivity.
    # Add 5 minutes to make sure the master has detected the disconnection.
    # The framework will be considered disconnected => failover_timeout kicks in.
    LOGGER.info("Waiting {} seconds for connection with master to timeout...".format(
        MASTER_CONNECTION_TIMEOUT_SEC))
    time.sleep(MASTER_CONNECTION_TIMEOUT_SEC + 5 * 60)

    # Restore the connection. The driver should reconnect.
    _unblock_master_connection(driver_host)

    # The executor and driver should finish.
    utils.check_job_output(task_id, "Job completed successfully")
def lambda_handler(event, context):
    '''
    This lambda handler calls submit_job with the job type info
    and product id from the sns message
    '''
    print("Got event of type: %s" % type(event))
    print("Got event: %s" % json.dumps(event))
    print("Got context: %s" % context)

    # parse sns message
    message = json.loads(event["Records"][0]["Sns"]["Message"])

    # parse s3 event
    s3_info = message['Records'][0]['s3']

    # parse met and dataset files and urls
    bucket = s3_info['bucket']['name']
    met_file = s3_info['object']['key']
    ds_file = met_file.replace('.met.json', '')
    ds_url = "s3://%s/%s/%s" % (os.environ['DATASET_S3_ENDPOINT'], bucket, ds_file)

    # read in metadata
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket, met_file)
    md = json.loads(obj.get()['Body'].read())
    print("Got metadata: %s" % json.dumps(md, indent=2))

    # dataset id
    id = md['id']

    # submit mozart jobs to update ES
    job_type = os.environ['JOB_TYPE']  # e.g. "INGEST_L0A_LR_RAW"
    job_release = os.environ['JOB_RELEASE']  # e.g. "gman-dev"
    job_spec = "job-%s:%s" % (job_type, job_release)
    job_params = {
        "id": id,
        "raw_url": ds_url,
        "raw_file": os.path.basename(ds_url),
        "prod_met": md,
    }
    queue = os.environ['JOB_QUEUE']  # e.g. "factotum-job_worker-large"
    tags = ["data-staged"]

    # submit mozart job
    submit_job(job_spec, job_params, queue, tags)
def main():
    id = utils.submit_job(min=args.min, max=args.max, N=args.N, O=args.O,
                          res_up=args.res_up, res_dw=args.res_dw,
                          path=args.path)
    print("Job submitted! GUID:", str(id))
def start_job(self, run_type, runs, run_name, meas_spec, dep_jobs, asic_set):
    global BATCH_JOB_DIR

    if self.use_xfel:
        if type(runs) == list:
            runs = "-".join(list(map(str, runs)))
        job_name = ("{}_{}_{}_ch{}".format(run_type, self.measurement,
                                           runs, self.panel))
    else:
        short_temperature = self.temperature[len("temperature_"):]
        job_name = ("{}_{}_{}_{}".format(run_type, self.measurement,
                                         self.panel, run_name))

    if asic_set is not None:
        print("Starting job for asics {}\n".format(asic_set))
        #asic_str = "asics" + "-".join(list(map(str, asic_set)))
        job_name = job_name + "_{}".format(asic_set)

    log_name = "{}_%j".format(job_name)
    self.sbatch_params += [
        "--job-name", job_name,
        "--output", log_name + ".out",
        "--error", log_name + ".err"
    ]

    self.script_params["asic_list"] = asic_set
    script_params = json.dumps(self.script_params)

    shell_script = os.path.join(BATCH_JOB_DIR, "start_analyse.sh")
    cmd = [shell_script, BATCH_JOB_DIR, script_params]

    jobnum = None
    if not self.no_slurm:
        if dep_jobs != "":
            self.sbatch_params += [
                "--depend=afterok:{}".format(dep_jobs),
                "--kill-on-invalid-dep=yes"
            ]
        cmd = ["sbatch"] + self.sbatch_params + cmd
        # print("submitting job with command:", cmd)
        jobnum = utils.submit_job(cmd, "{} job".format(run_type))
    else:
        print("cmd {}".format(cmd))
        try:
            subprocess.call(cmd)
        except:
            raise

    return jobnum
def generate_mail_body(self):
    header = ["Panel", "Run type", "Runs", "JobID", "State"]
    sep = " "

    overview = self.get()
    time.sleep(3)

    # get status of jobs
    status = {}
    for panel in overview:
        for run_type in overview[panel]:
            for runs in overview[panel][run_type]:
                d_o = overview[panel][run_type][runs]

                if d_o["jobnum"] is not None:
                    cmd = ["sacct", "--brief", "-p", "-j", d_o["jobnum"]]
                    # os.system("squeue --user $USER")
                    result = utils.submit_job(cmd, jobname="sacct")
                    result = result.split()

                    # the line ends with a separator
                    # -> remove last character
                    # status_header = result[0][:-1].split("|")
                    # print(status_header)
                    for res in result[1:]:
                        # the line ends with a separator
                        status_result = res[:-1].split("|")
                        # print("status_result", status_result)

                        # sacct gives two results for every job:
                        # <jobid> and <jobid>.batch
                        if status_result[0] == d_o["jobnum"]:
                            # status results contain the entries
                            # 'JobID', 'State', 'ExitCode'
                            status[d_o["jobnum"]] = status_result[1]

    max_key_len, header_str, sorted_keys = self.overview.prepare(header, sep)

    print("\nMail Body")
    print(header_str)

    # print overview
    for panel in sorted_keys:
        for run_type in overview[panel]:
            for runs in overview[panel][run_type]:
                d_o = overview[panel][run_type][runs]

                row = []
                row.append(panel.ljust(max_key_len[header[0]]))
                row.append(run_type.ljust(max_key_len[header[1]]))
                row.append(runs.ljust(max_key_len[header[2]]))
                row.append(str(d_o["jobnum"]).ljust(max_key_len[header[3]]))
                row.append(status[d_o["jobnum"]])

                print(sep.join(row))
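# For reference, sacct --brief -p prints pipe-delimited lines with a trailing
# separator, which is why each line is sliced with [:-1] before splitting.
# A hypothetical job 123456 would produce output along these lines:
#
#   JobID|State|ExitCode|
#   123456|COMPLETED|0:0|
#   123456.batch|COMPLETED|0:0|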
def test_supervise():
    def streaming_job_registered():
        return shakedown.get_service("HdfsWordCount") is not None

    def streaming_job_is_not_running():
        return not streaming_job_registered()

    def has_running_executors():
        f = shakedown.get_service("HdfsWordCount")
        if f is None:
            return False
        else:
            return len([x for x in f.dict()["tasks"]
                        if x["state"] == "TASK_RUNNING"]) > 0

    driver_id = utils.submit_job(
        app_url=SPARK_EXAMPLES,
        app_args="file:///mnt/mesos/sandbox/",
        app_name="/spark",
        args=["--supervise",
              "--class", "org.apache.spark.examples.streaming.HdfsWordCount",
              "--conf", "spark.cores.max=8",
              "--conf", "spark.executors.cores=4"])
    LOGGER.info("Started supervised driver {}".format(driver_id))

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has registered")

    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has running executors")

    host = shakedown.get_service("HdfsWordCount").dict()["hostname"]
    id = shakedown.get_service("HdfsWordCount").dict()["id"]
    driver_regex = "spark.mesos.driver.frameworkId={}".format(id)
    shakedown.kill_process_on_host(hostname=host, pattern=driver_regex)

    shakedown.wait_for(lambda: streaming_job_registered(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-registered")

    shakedown.wait_for(lambda: has_running_executors(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
    LOGGER.info("Job has re-started")

    out = utils.kill_driver(driver_id, "/spark")
    LOGGER.info("{}".format(out))
    out = json.loads(out)
    assert out["success"], "Failed to kill spark streaming job"

    shakedown.wait_for(lambda: streaming_job_is_not_running(),
                       ignore_exceptions=False,
                       timeout_seconds=600)
def run_openpose(vid_path, op_output_dir, face=True, hand=True,
                 overwrite=None, **kwargs):
    """run_openpose: submit sbatch job to run OpenPose on given video.

    :param vid_path: path to video file.
    :param op_output_dir: directory containing OpenPose output folders.
    :param face: outputs face keypoints if True.
    :param hand: outputs hand keypoints if True.
    :param **kwargs: additional command-line arguments to pass to OpenPose
        (see OpenPose demo documentation).
    """
    os.makedirs(op_output_dir, exist_ok=True)
    vid_name = ntpath.basename(vid_path)[:-4]
    vid_output_dir = os.path.join(op_output_dir, vid_name)
    if os.path.exists(vid_output_dir):
        # Prompt if no overwrite preference was given; abort unless the
        # answer (or the overwrite flag) says yes.
        if overwrite is None:
            overwrite = input(
                f'overwrite existing directory {vid_output_dir}? (yes/no)') == 'yes'
        if not overwrite:
            print(f'aborting on video {vid_path}.')
            return
    os.makedirs(vid_output_dir, exist_ok=True)

    # this could also be openpose_latest.sif, instead of openpose-latest.img.
    cmd = 'singularity exec --nv $SINGULARITY_CACHEDIR/openpose-latest.img bash -c \''
    cmd += 'cd /openpose-master && ./build/examples/openpose/openpose.bin '
    cmd += f'--video {vid_path} '
    for opt, optval in kwargs.items():
        cmd += f'--{opt} {optval} '
    if face:
        cmd += '--face '
    if hand:
        cmd += '--hand '
    cmd += f'--write_keypoint_json {vid_output_dir}\''
    msg = submit_job(cmd, job_name=f'{vid_name}', p='gpu', t=5.0,
                     mem='8G', gres='gpu:1')
    print(msg)
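# Hypothetical usage: queue OpenPose on one video, skipping face keypoints
# and passing an extra OpenPose flag through **kwargs. Paths and the flag
# value are placeholders, not values from this project.
# run_openpose('/data/videos/session01.mp4', '/data/openpose_out',
#              face=False, overwrite=True, net_resolution='-1x368')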
def packSubmit(self, job_obj):
    def get_mpi_ranks(runtime_data, inh5, pat_per_job, max_jobs):
        pat_num = 0
        try:
            for ind, f in enumerate(runtime_data):
                fp = h5py.File(f, 'r')
                if len(fp[inh5].shape) != 3:
                    return None
                pat_num += fp[inh5].shape[0]
                fp.close()
        except:
            return None
        # integer division: the rank count is passed to a %d format below
        return min(pat_num // pat_per_job + 1, max_jobs)

    proj_conf = utils.read_config(
        os.path.join(self.rootdir, self.namespace['ini']))
    pat_per_job = int(proj_conf.get(self.namespace['project_ini'][0],
                                    self.namespace['project_ini'][1].split(',')[4]))
    max_jobs = int(proj_conf.get(self.namespace['project_ini'][0],
                                 self.namespace['project_ini'][1].split(',')[5]))
    del proj_conf

    # runtime json
    runtime = {}
    # config file
    config_file = None
    # jss
    sub_jss = self.jss
    # queue
    sub_submit_queue = self.submit_queue
    # write run shell to workdir
    sub_prepare_sub_script = os.path.join(
        os.path.split(os.path.realpath(__file__))[0], "scripts/submission.sh")
    # python script
    sub_python_exec = None
    # number of processes
    sub_nproc = 0
    # workdir
    sub_workdir = None

    ############ These are irrelevant to the assignments' type
    runtime['run_name'] = job_obj.run_name
    runtime['dataset'] = job_obj.datafile
    runtime['savepath'] = job_obj.savepath
    config_file = job_obj.config
    sub_workdir = job_obj.savepath
    # get inh5
    try:
        inh5 = utils.read_config(
            config_file, [self.namespace['config_head'], 'data-path in cxi/h5'])
    except:
        inh5 = utils.read_config(config_file, ['darkcal', 'inh5'])
    inh5 = utils.compile_h5loc(inh5, job_obj.run_name)
    ############

    '''
    choose correct code blocks to run according to the given job type
    '''
    if job_obj.assignments == self.namespace['process_HF']:
        # hit-finding
        if str(self.ui.lineEdit_2.text()).lower() == "none":
            runtime['darkcal'] = None
        else:
            runtime['darkcal'] = str(self.ui.lineEdit_2.text())
        sub_python_exec = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], "scripts",
            self.python_scripts["hf"])
        # decide mpi rank size
        # actually I want to be free from inh5, but cannot find any good way
        if self.data_format.lower() in ["cxi", "h5"]:
            sub_nproc = get_mpi_ranks(runtime['dataset'], inh5,
                                      pat_per_job, max_jobs)
            if sub_nproc is None:
                utils.show_message(
                    "The data file '%s' seems to have some problems. "
                    "Check whether the 'Data-path in cxi/h5' is correct."
                    % runtime['dataset'])
                return None
        else:
            # TODO
            pass
    elif job_obj.assignments == self.namespace['process_AP']:
        # adu2photon
        sub_python_exec = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], "scripts",
            self.python_scripts["ap"])
        # get data source path
        this_datadir = utils.read_config(
            config_file, [self.namespace['config_head'], 'Data Dir'])
        # do not use raw dataset
        if not job_obj.datafile[0].startswith(this_datadir):
            assignments = os.path.split(this_datadir)[-1]
            try:
                tag_remarks = self.main_gui.tag_buffer[assignments][runtime['run_name']]
                tag_remarks = self.main_gui.split_tag_remarks(tag_remarks)
                if len(tag_remarks) != 2:
                    raise ValueError("invalid tag remarks")
            except:
                utils.show_message(
                    "%s:\nI cannot find the data source, please check the parameters again."
                    % runtime['run_name'])
                return None
            datafile = os.path.join(
                this_datadir,
                utils.fmt_job_dir(runtime['run_name'], tag_remarks[0], tag_remarks[1]),
                '*.h5')
            datafile = glob.glob(datafile)
            if len(datafile) == 0:
                utils.show_message(
                    "%s:\nI cannot find the data source, please check the parameters again."
                    % runtime['run_name'])
                return None
            runtime['dataset'] = datafile
        # decide mpi rank size
        sub_nproc = get_mpi_ranks(runtime['dataset'], inh5, pat_per_job, max_jobs)
        if sub_nproc is None:
            utils.show_message(
                "The data files in '%s' seem to have some problems.\n" % this_datadir +
                "Check : data location, 'Data-path in cxi/h5' are correct;\n" +
                "Check : data files are multi-pattern HDF5 format.")
            return None
    else:
        return None

    '''
    prepare and submit jobs :
    job_dir/status stores all status_xxx.txt of every process
    and there will be a status.txt if the job finished
    '''
    try:
        # make workdir if not exists
        if not os.path.isdir(runtime['savepath']):
            os.mkdir(runtime['savepath'])
            os.mkdir(os.path.join(runtime['savepath'], 'status'))
        else:
            if not self.force_overwrite:
                re = utils.show_warning(
                    "project %s already exists, overwrite it?" % runtime['savepath'])
                if re == 0:
                    utils.show_message(
                        "Don't overwrite project %s, EXIT." % runtime['savepath'])
                    return None
            shutil.rmtree(runtime['savepath'])
            os.mkdir(runtime['savepath'])
            os.mkdir(os.path.join(runtime['savepath'], 'status'))
        # write runtime json and config to workdir
        with open(os.path.join(runtime['savepath'], 'runtime.json'), 'w') as rjson:
            json.dump(runtime, rjson)
        shutil.copyfile(config_file,
                        os.path.join(runtime['savepath'], 'config.ini'))
        # write submit script to workdir
        SUB_cmd = subprocess.check_output(
            "bash %s -s %s -t %d -p %s -q %s -y %s"
            % (sub_prepare_sub_script, sub_python_exec, sub_nproc,
               sub_workdir, sub_submit_queue, sub_jss),
            shell=True)
        # check_output returns bytes on Python 3
        SUB_cmd = SUB_cmd.decode().strip("\n")
        if len(SUB_cmd) == 0:
            return None
        # submit
        pobj = utils.submit_job(sub_workdir, SUB_cmd, self.jss)
        # return
        if pobj is None:
            return None
        else:
            return pobj
    except:
        return None
def run_wrfda_3dvar(work_root, wrfda_root, config, args, wrf_work_dir=None,
                    force=False, tag=None, fg=None):
    start_time = config['custom']['start_time']
    datetime_fmt = 'YYYY-MM-DD_HH:mm:ss'
    start_time_str = start_time.format(datetime_fmt)
    max_dom = config['domains']['max_dom']

    if not wrf_work_dir:
        if tag != None:
            wrf_work_dir = f'{work_root}/wrf_{tag}'
        else:
            wrf_work_dir = f'{work_root}/wrf'

    if tag != None:
        obsproc_work_dir = f'{work_root}/wrfda_{tag}/obsproc'
    else:
        obsproc_work_dir = f'{work_root}/wrfda/obsproc'

    if max_dom > 1:
        dom_str = 'd' + str(config['custom']['wrfda']['dom'] + 1).zfill(2)
        if tag != None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}/{dom_str}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda/{dom_str}'
    else:
        dom_str = 'd01'
        if tag != None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda'
    if not os.path.isdir(wrfda_work_dir):
        os.mkdir(wrfda_work_dir)
    os.chdir(wrfda_work_dir)

    cli.stage(f'Run da_wrfvar.exe at {wrfda_work_dir} ...')

    if os.path.isfile(f'wrfvar_output_{start_time_str}') and not args.force and not force:
        run(f'ls -l wrfvar_output_{start_time_str}')
        cli.notice(f'wrfvar_output_{start_time_str} already exist.')
        return

    run(f'ln -sf {wrfda_root}/run/LANDUSE.TBL {wrfda_work_dir}')

    if not os.path.isfile('namelist.input'):
        cli.error('namelist.input has not been generated! Run config_wrfda.py.')

    # BE matrix
    if 'cv_options' in config['wrfvar7']:
        be_work_dir = os.path.dirname(os.path.abspath(work_root)) + '/be/' + dom_str
        if not os.path.isdir(be_work_dir):
            be_work_dir = os.path.dirname(os.path.abspath(work_root)) + '/../be/' + dom_str

        if config['wrfvar7']['cv_options'] == 5:
            if not os.path.isfile(f'{be_work_dir}/be.dat.cv5'):
                cli.error(f'BE matrix {be_work_dir}/be.dat.cv5 does not exist!')
            run(f'ln -sf {be_work_dir}/be.dat.cv5 be.dat')
        elif config['wrfvar7']['cv_options'] == 6:
            if not os.path.isfile(f'{be_work_dir}/be.dat.cv6'):
                cli.error(f'BE matrix {be_work_dir}/be.dat.cv6 does not exist!')
            run(f'ln -sf {be_work_dir}/be.dat.cv6 be.dat')
        elif config['wrfvar7']['cv_options'] == 7:
            if not os.path.isfile(f'{be_work_dir}/be.dat.cv7'):
                cli.error(f'BE matrix {be_work_dir}/be.dat.cv7 does not exist!')
            run(f'ln -sf {be_work_dir}/be.dat.cv7 be.dat')
    if not os.path.exists('./be.dat'):
        run(f'ln -sf {wrfda_root}/var/run/be.dat.cv3 be.dat')

    # First guess
    # TODO: Assume there is only one domain to be assimilated.
    if fg != None:
        run(f'ln -sf {fg} {wrfda_work_dir}/fg')
    else:
        expected_files = ['{}/wrfout_d{:02d}_{}'.format(wrf_work_dir, i + 1, start_time_str)
                          for i in range(max_dom)]
        if check_files(expected_files):
            run(f'ln -sf {wrf_work_dir}/wrfout_{dom_str}_{start_time_str} {wrfda_work_dir}/fg')
        else:
            expected_files = ['{}/wrfinput_d{:02d}_{}'.format(wrf_work_dir, i + 1, start_time_str)
                              for i in range(max_dom)]
            if not check_files(expected_files):
                cli.error('real.exe or da_update_bc.exe wasn\'t executed successfully!')
            run(f'ln -sf {wrf_work_dir}/wrfinput_{dom_str}_{start_time_str} {wrfda_work_dir}/fg')

    # Observation data
    if config['custom']['wrfda']['type'] == '3dvar':
        if 'use_radarobs' in config['wrfvar4'] and config['wrfvar4']['use_radarobs']:
            # Radar data
            run('rm -f ob.*')
            for obs_radar_file in glob(f'{args.littler_root}/{start_time.format("YYYYMMDD")}/obs.radar.*'):
                radar_time = pendulum.from_format(
                    os.path.basename(obs_radar_file).split('.')[2], 'YYYYMMDDHHmm')
                if radar_time == start_time:
                    run(f'ln -sf {obs_radar_file} ob.radar')
            if os.path.isfile(f'wrfvar_output_{start_time_str}'):
                cli.notice('Use previous analysis data as the background.')
                run(f'mv wrfvar_output_{start_time_str} wrfvar_output_conv_{start_time_str}')
                run(f'ln -sf wrfvar_output_conv_{start_time_str} fg')
        elif 'conv_obs' in config['custom']:
            if 'dir_pattern' in config['custom']['conv_obs']:
                obs_dir = Template(config['custom']['conv_obs']['dir_pattern']).render(obs_time=start_time)
            if 'file_pattern' in config['custom']['conv_obs']:
                obs_file = Template(config['custom']['conv_obs']['file_pattern']).render(obs_time=start_time)
            if config['wrfvar3']['ob_format'] == 1:
                run(f'ln -sf {args.prepbufr_root}/{obs_dir}/{obs_file} ob.bufr')
            elif config['wrfvar3']['ob_format'] == 2:
                run(f'ln -sf {args.prepbufr_root}/{obs_dir}/{obs_file} ob.ascii')
        elif config['wrfvar3']['ob_format'] == 2 and os.path.isfile(
                f'{obsproc_work_dir}/obs_gts_{start_time.format(datetime_fmt)}.3DVAR'):
            # LITTLE_R conventional data
            run(f'ln -sf {obsproc_work_dir}/obs_gts_{start_time.format(datetime_fmt)}.3DVAR ob.ascii')
        elif config['wrfvar3']['ob_format'] == 1 and config['custom']['wrfda']['prepbufr_source'] == 'gdas':
            # PREPBUFR conventional data
            gdas_file_path = f'{args.prepbufr_root}/gdas.{start_time.format("YYYYMMDD")}/gdas.t{start_time.hour:02}z.prepbufr.nr'
            if not os.path.isfile(gdas_file_path):
                cli.error(f'{gdas_file_path} does not exist!')
            run(f'ln -sf {gdas_file_path} ob.bufr')

    if os.path.isfile(f'{wrfda_work_dir}/wrfvar_output_{start_time_str}') and not args.force:
        cli.notice(f'{wrfda_work_dir}/wrfvar_output_{start_time_str} already exists.')
        return

    submit_job(f'{wrfda_root}/var/build/da_wrfvar.exe', min(20, args.np),
               config, args, wait=True)

    expected_files = ['wrfvar_output', 'statistics']
    if not check_files(expected_files):
        # Is the failure caused by parallel computing, e.g. cv_options
        # becoming zero in some process?
        if search_files('rsl.error.*', 'Invalid CV option chosen: cv_options = 0'):
            cli.warning('Failed to run da_wrfvar.exe in parallel. Try to run in serial.')
            submit_job(f'{wrfda_root}/var/build/da_wrfvar.exe', 1, config, args, wait=True)
            if not check_files(expected_files):
                cli.error(f'Still failed! See {wrfda_work_dir}/rsl.error.0000.')
        else:
            cli.error(f'Failed! See {wrfda_work_dir}/rsl.error.0000.')
    else:
        print(open('statistics').read())
        run(f'ncl -Q {scripts_root}/../plots/plot_cost_grad_fn.ncl')
        run(f'cp wrfvar_output wrfvar_output_{start_time_str}')
        cli.notice('Succeeded.')
def run_wrf(work_root, wrf_root, config, args, wrfda_work_dir=None, tag=None):
    start_time = config['custom']['start_time']
    end_time = config['custom']['end_time']
    datetime_fmt = 'YYYY-MM-DD_HH:mm:ss'
    start_time_str = start_time.format(datetime_fmt)
    end_time_str = end_time.format(datetime_fmt)
    max_dom = config['domains']['max_dom']

    if not wrfda_work_dir:
        if tag != None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda'
    elif not os.path.isdir(wrfda_work_dir):
        cli.error(f'run_wrf: {wrfda_work_dir} does not exist!')

    if tag != None:
        wrf_work_dir = f'{work_root}/wrf_{tag}'
    else:
        wrf_work_dir = f'{work_root}/wrf'
    if not os.path.isdir(wrf_work_dir):
        cli.error(f'run_wrf: {wrf_work_dir} does not exist!')
    os.chdir(wrf_work_dir)

    all_wrfda_ok = True
    for dom_idx in range(max_dom):
        dom_str = 'd' + str(dom_idx + 1).zfill(2)
        if not copy_wrfda_output(dom_str, start_time_str, wrfda_work_dir):
            all_wrfda_ok = False
            break
    if not all_wrfda_ok:
        cli.warning('Do not use data assimilation.')
        expected_files = ['wrfinput_d{:02d}_{}'.format(i + 1, start_time_str)
                          for i in range(max_dom)]
        expected_files.append(f'wrfbdy_d01_{start_time_str}')
        if not check_files(expected_files):
            cli.error('real.exe wasn\'t executed successfully!')
        for i in range(max_dom):
            run('ln -sf wrfinput_d{0:02d}_{1} wrfinput_d{0:02d}'.format(i + 1, start_time_str))
        run(f'ln -sf wrfbdy_d01_{start_time_str} wrfbdy_d01')

    cli.stage(f'Run wrf.exe at {wrf_work_dir} ...')
    expected_files = ['wrfout_d{:02d}_{}'.format(i + 1, end_time_str)
                      for i in range(max_dom)]
    if not check_files(expected_files) or args.force:
        run('rm -f wrfout_*')
        run(f'ln -sf {wrf_root}/run/LANDUSE.TBL .')
        run(f'ln -sf {wrf_root}/run/ozone_plev.formatted .')
        run(f'ln -sf {wrf_root}/run/ozone_lat.formatted .')
        run(f'ln -sf {wrf_root}/run/ozone.formatted .')
        run(f'ln -sf {wrf_root}/run/RRTM_DATA_DBL RRTM_DATA')
        run(f'ln -sf {wrf_root}/run/RRTMG_LW_DATA .')
        run(f'ln -sf {wrf_root}/run/RRTMG_SW_DATA .')
        run(f'ln -sf {wrf_root}/run/VEGPARM.TBL .')
        run(f'ln -sf {wrf_root}/run/SOILPARM.TBL .')
        run(f'ln -sf {wrf_root}/run/GENPARM.TBL .')
        retries = 0
        while True:
            submit_job(f'{wrf_root}/run/wrf.exe', args.np, config, args, wait=True)
            if not check_files(expected_files):
                if retries == 0:
                    cli.error(f'Failed! Check output {os.path.abspath(wrf_work_dir)}/rsl.error.0000.')
                retries = retries + 1
                cli.warning(f'Failed to run wrf, retry it! {retries}')
            else:
                break
        cli.notice('Succeeded.')
    else:
        cli.notice('Files wrfout_* already exist.')
    run(f'ls -l {wrf_work_dir}/wrfout_*')
def run_real(work_root, wps_work_dir, wrf_root, config, args, tag=None):
    start_time = config['custom']['start_time']
    datetime_fmt = 'YYYY-MM-DD_HH:mm:ss'
    start_time_str = start_time.format(datetime_fmt)
    max_dom = config['domains']['max_dom']

    if not os.path.isdir(wps_work_dir):
        cli.error(f'WPS work directory {wps_work_dir} does not exist!')

    if tag != None:
        wrf_work_dir = f'{work_root}/wrf_{tag}'
    else:
        wrf_work_dir = f'{work_root}/wrf'
    if not os.path.isdir(wrf_work_dir):
        os.mkdir(wrf_work_dir)
    os.chdir(wrf_work_dir)

    cli.stage(f'Run real.exe at {wrf_work_dir} ...')
    expected_files = ['wrfinput_d{:02d}_{}'.format(i + 1, start_time_str)
                      for i in range(max_dom)]
    expected_files.append('wrfbdy_d01')
    if not check_files(expected_files) or args.force:
        run('rm -f wrfinput_* met_em.*.nc')
        run(f'ln -sf {wps_work_dir}/met_em.*.nc .')
        try:
            dataset = Dataset(glob('met_em.*.nc')[0])
        except:
            cli.error('Failed to open one of the met_em.*.nc files!')
        # Check met_em file.
        if 'num_st_layers' not in dataset.dimensions or dataset.dimensions['num_st_layers'].size == 0:
            cli.error('Failed to run ungrib and metgrid due to num_metgrid_soil_levels being zero!')
        namelist_input = f90nml.read('./namelist.input')
        namelist_input['domains']['num_metgrid_levels'] = dataset.dimensions['num_metgrid_levels'].size
        namelist_input['physics']['num_land_cat'] = dataset.getncattr('NUM_LAND_CAT')
        if 'num_st_layers' in dataset.dimensions:
            namelist_input['domains']['num_metgrid_soil_levels'] = dataset.dimensions['num_st_layers'].size
        else:
            cli.warning(f'Dimension num_st_layers is not in {dataset.filepath()}! Set num_metgrid_soil_levels to 0.')
            namelist_input['domains']['num_metgrid_soil_levels'] = 0
        dataset.close()
        namelist_input.write('./namelist.input', force=True)

        submit_job(f'{wrf_root}/run/real.exe', args.np, config, args, wait=True)

        for i in range(max_dom):
            if not os.path.isfile('wrfinput_d{0:02d}'.format(i + 1)):
                # Is the failure caused by parallel computing?
                cli.warning('Failed to run real.exe in parallel. Try to run in serial.')
                submit_job(f'{wrf_root}/run/real.exe', 1, config, args, wait=True)
                if not os.path.isfile('wrfinput_d{0:02d}'.format(i + 1)):
                    cli.error('Still failed to generate wrfinput_d{0:02d}! See {1}/rsl.error.0000.'
                              .format(i + 1, wrf_work_dir))
            run('ln -sf wrfinput_d{0:02d} wrfinput_d{0:02d}_{1}'.format(i + 1, start_time_str))

        if os.path.isfile('wrfbdy_d01'):
            run(f'ln -sf wrfbdy_d01 wrfbdy_d01_{start_time_str}')
        cli.notice('Succeeded.')
    else:
        run('ls -l wrfinput_* wrfbdy_*')
        cli.notice('Files wrfinput_* already exist.')
def run_wrfda_update_bc(work_root, wrfda_root, update_lowbc, config, args,
                        wrf_work_dir=None, wrfbdy=None, tag=None):
    start_time = config['custom']['start_time']
    datetime_fmt = 'YYYY-MM-DD_HH:mm:ss'
    start_time_str = start_time.format(datetime_fmt)
    max_dom = config['domains']['max_dom']

    if not wrf_work_dir:
        if tag != None:
            wrf_work_dir = f'{work_root}/wrf_{tag}'
        else:
            wrf_work_dir = f'{work_root}/wrf'

    if max_dom > 1:
        dom_str = 'd' + str(config['custom']['wrfda']['dom'] + 1).zfill(2)
        if tag != None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}/{dom_str}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda/{dom_str}'
    else:
        dom_str = 'd01'
        if tag != None:
            wrfda_work_dir = f'{work_root}/wrfda_{tag}'
        else:
            wrfda_work_dir = f'{work_root}/wrfda'
    if not os.path.isdir(wrfda_work_dir):
        os.mkdir(wrfda_work_dir)
    os.chdir(wrfda_work_dir)

    if not wrfbdy:
        wrfbdy = f'{wrf_work_dir}/wrfbdy_{dom_str}'

    cli.stage(f'Run WRFDA update_bc at {wrfda_work_dir} ...')

    expected_files = [wrfbdy, f'wrfvar_output_{start_time_str}', 'fg']
    if not check_files(expected_files):
        print(expected_files)
        cli.error('run_wrfda_update_bc: da_wrfvar.exe or real.exe wasn\'t executed successfully!')
    run(f'ln -sf {wrfbdy} wrfbdy_{dom_str}')
    run(f'ln -sf wrfvar_output_{start_time_str} wrfvar_output')

    parame_in = f90nml.read(f'{wrfda_root}/var/test/update_bc/parame.in')
    parame_in['control_param']['wrf_input'] = './fg'
    if update_lowbc:
        cli.notice('Update only low boundary condition.')
        parame_in['control_param']['low_bdy_only'] = True
    parame_in.write(f'{wrfda_work_dir}/parame.in', force=True)

    if update_lowbc:
        expected_file = f'wrfbdy_{dom_str}_{start_time_str}.low_updated'
    else:
        expected_file = f'wrfbdy_{dom_str}_{start_time_str}.lateral_updated'

    if not check_files(expected_file) or args.force:
        submit_job(f'{wrfda_root}/var/build/da_update_bc.exe', 1, config, args, wait=True)
        run(f'cp wrfbdy_{dom_str} {expected_file}')
    else:
        run(f'ls -l {expected_file}')

    cli.notice('Succeeded.')
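# For orientation: parame.in is a Fortran namelist that f90nml exposes as a
# nested dict-like object. A minimal sketch of writing an equivalent file
# from scratch (only the control_param keys set above are shown; WRFDA's
# template file contains more):
import f90nml

nml = f90nml.Namelist({'control_param': {
    'wrf_input': './fg',    # first guess used as the update_bc input
    'low_bdy_only': True,   # update only the low boundary condition
}})
nml.write('parame.in', force=True)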
def main(log, job_date, mail_receiver, base_dir, download_url, git_dir,
         job_dir, logging_conf, slurm_account, slurm_log_dir, slurm_mail,
         export_dir, export_offset):
    log.trace("Arguments(updated paths): \ndate: {}\nmailreceiver: {}\n"
              "basedir: {}\ngitdir: {}\njobdir: {}\nloggingconf: {}\n"
              "slurmaccount: {}\nslurmlogdir: {}\nslurmmail: {}".format(
                  job_date, mail_receiver, base_dir, git_dir, job_dir,
                  logging_conf, slurm_account, slurm_log_dir, slurm_mail))

    # Define sample_id
    log.info("Date for this Job: {}".format(job_date))
    start_date = datetime.date(2020, 1, 28)
    sample_id = ((job_date - datetime.timedelta(days=25)) - start_date).days
    log.info("Sample_ID for this Job: {}".format(sample_id))

    # Setup directories
    log.info("Store temporary files for this job in {}".format(job_dir))
    log.trace("Create {}".format(job_dir))
    Path(job_dir).mkdir(parents=True, exist_ok=True)
    log.trace("Create {}".format(slurm_log_dir))
    Path(slurm_log_dir).mkdir(parents=True, exist_ok=True)
    git_dir_data = os.path.join(git_dir, "data")
    git_dir_src = os.path.join(git_dir, "src")
    log.trace("Create {}".format(os.path.join(job_dir, 'data', 'raw')))
    Path(os.path.join(job_dir, 'data', 'raw')).mkdir(parents=True, exist_ok=True)
    log.trace("Create {}".format(os.path.join(job_dir, 'data', 'diseases')))
    Path(os.path.join(job_dir, 'data', 'diseases')).mkdir(parents=True, exist_ok=True)
    raw_csv_fpath = os.path.join(job_dir, 'data', 'raw', 'covid19.csv')
    data_csv_fpath = os.path.join(job_dir, 'data', 'diseases', 'covid19.csv')
    output_dpath = os.path.join(job_dir, 'csv')
    Path(output_dpath).mkdir(parents=True, exist_ok=True)

    # download csv data
    utils.download_csv(log, download_url, raw_csv_fpath)
    log.debug("Download completed")
    utils.preprocess_table(log, raw_csv_fpath, data_csv_fpath, git_dir_data, job_dir)
    log.debug("Preprocess completed")

    # Create files for Slurm Job
    slurm_dir = os.path.join(job_dir, 'slurm')
    log.trace("Create {}".format(slurm_dir))
    Path(slurm_dir).mkdir(parents=True, exist_ok=True)
    slurm_sh_file = os.path.join(slurm_dir, "sample_window.slurm.sh")
    slurm_file = os.path.join(slurm_dir, "sample_window.slurm")
    utils.create_slurm_sh(log, slurm_sh_file, data_csv_fpath, git_dir_src,
                          output_dpath, sample_id, export_dir, export_offset)
    log.debug("Slurm_sh file created")
    utils.create_slurm(log, slurm_file, slurm_sh_file, sample_id,
                       slurm_account, slurm_log_dir, slurm_mail)
    log.debug("Slurm file created")
    slurm_jobid = utils.submit_job(log, slurm_file, slurm_dir, '-vv')
    log.debug("Slurm job submitted. JobId: {}".format(slurm_jobid))
    slurm_status = utils.status_job(log, slurm_jobid)
    log.debug("Slurm job status: {}".format(slurm_status))
def run_wrfda_obsproc(work_root, wrfda_root, littler_root, config, args,
                      wrf_work_dir=None, tag=None):
    start_time = config['custom']['start_time']
    datetime_fmt = 'YYYY-MM-DD_HH:mm:ss'
    start_time_str = start_time.format(datetime_fmt)

    if not wrf_work_dir:
        if tag != None:
            wrf_work_dir = f'{work_root}/wrf_{tag}'
        else:
            wrf_work_dir = f'{work_root}/wrf'

    if tag != None:
        wrfda_work_dir = f'{work_root}/wrfda_{tag}/obsproc'
    else:
        wrfda_work_dir = f'{work_root}/wrfda/obsproc'
    if not os.path.isdir(wrfda_work_dir):
        os.mkdir(wrfda_work_dir)
    os.chdir(wrfda_work_dir)

    cli.notice('Use builtin obserr.')
    run(f'ln -sf {wrfda_root}/var/obsproc/obserr.txt {wrfda_work_dir}')

    # Use d01 domain extent.
    if check_files([f'{wrf_work_dir}/wrfinput_d01_{start_time_str}']):
        ncfile = Dataset(f'{wrf_work_dir}/wrfinput_d01_{start_time_str}', 'r')
        iproj = ncfile.getncattr('MAP_PROJ')
        phic = ncfile.getncattr('CEN_LAT')
        xlonc = ncfile.getncattr('CEN_LON')
        moad_cen_lat = ncfile.getncattr('MOAD_CEN_LAT')
        standard_lon = ncfile.getncattr('STAND_LON')
        ncfile.close()
    else:
        iproj = config['geogrid']['map_proj']
        phic = config['geogrid']['ref_lat']
        xlonc = config['geogrid']['ref_lon']
        moad_cen_lat = config['geogrid']['ref_lat']
        standard_lon = config['geogrid']['ref_lon']

    output_format = get_value(config, ['custom', 'obsproc', 'output_format'], default=2)
    time_window = get_value(config, ['custom', 'wrfda', 'time_window'], default=360)

    if has_key(config, ('custom', 'da', 'type')):
        if config['custom']['da']['type'] == '3dvar':
            namelist_obsproc = f90nml.read(
                f'{wrfda_root}/var/obsproc/namelist.obsproc.3dvar.wrfvar-tut')
        else:
            cli.error('Currently, we only support 3DVar...')
    else:
        namelist_obsproc = f90nml.read(
            f'{wrfda_root}/var/obsproc/namelist.obsproc.3dvar.wrfvar-tut')

    namelist_obsproc['record1']['obs_gts_filename'] = f'obs.gts.{start_time.format("YYYYMMDDHHmm")}'
    namelist_obsproc['record2']['time_window_min'] = start_time.subtract(
        minutes=time_window / 2).format('YYYY-MM-DD_HH:mm:ss')
    namelist_obsproc['record2']['time_analysis'] = start_time.format('YYYY-MM-DD_HH:mm:ss')
    namelist_obsproc['record2']['time_window_max'] = start_time.add(
        minutes=time_window / 2).format('YYYY-MM-DD_HH:mm:ss')
    namelist_obsproc['record3']['max_number_of_obs'] = 1200000
    namelist_obsproc['record7']['PHIC'] = phic
    namelist_obsproc['record7']['XLONC'] = xlonc
    namelist_obsproc['record7']['MOAD_CEN_LAT'] = moad_cen_lat
    namelist_obsproc['record7']['STANDARD_LON'] = standard_lon
    namelist_obsproc['record8']['NESTIX'] = config['geogrid']['e_sn']
    namelist_obsproc['record8']['NESTJX'] = config['geogrid']['e_we']
    namelist_obsproc['record8']['DIS'] = config['geogrid']['dx']
    namelist_obsproc['record9']['OUTPUT_OB_FORMAT'] = output_format
    namelist_obsproc.write('./namelist.obsproc', force=True)

    cli.stage(f'Run obsproc.exe at {wrfda_work_dir} ...')
    expected_files = [f'obs_gts_{start_time.format("YYYY-MM-DD_HH:mm:ss")}.3DVAR']
    if not check_files(expected_files) or args.force:
        run('rm -f obs_gts_*')
        if has_key(config, ('custom', 'littler')):
            if 'dir_pattern' in config['custom']['littler'] and 'file_pattern' in config['custom']['littler']:
                dir_name = Template(config['custom']['littler']['dir_pattern']).render(time=start_time)
                file_name = Template(config['custom']['littler']['file_pattern']).render(time=start_time)
                littler_path = f'{littler_root}/{dir_name}/{file_name}'
            else:
                cli.error('No dir_pattern and file_pattern in custom->littler section!')
        else:
            littler_path = f'{littler_root}/{start_time.format("YYYYMMDD")}/obs.gts.{start_time.format("YYYYMMDDHHmm")}'
        if os.path.exists(littler_path):
            run(f'ln -sf {littler_path} {wrfda_work_dir}/obs.gts.{start_time.format("YYYYMMDDHHmm")}')
        else:
            cli.error(f'Failed! {littler_path} Not Found.')
        submit_job(f'{wrfda_root}/var/obsproc/obsproc.exe', 1, config, args, wait=True)
        if not check_files(expected_files):
            cli.error('Failed!')
        cli.notice('Succeeded.')
    else:
        cli.notice('Files obs_gts_* already exist.')
    run('ls -l obs_gts_*')