def check_mgmt_db(self):
    """Create errors for all entries in the management db that did not complete"""
    base_proc_types = [
        const.ProcessType.EMOD3D,
        const.ProcessType.HF,
        const.ProcessType.BB,
        const.ProcessType.IM_calculation,
    ]
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

    entries = db.command_builder(
        allowed_tasks=base_proc_types,
        allowed_states=[const.Status.unknown, const.Status.failed],
        blocked_ids=self.canceled_running,
    )

    for entry in entries:
        self.errors.append(
            Error(
                "Slurm task",
                "Run {} did not complete task {} "
                "(Status {}, JobId {})".format(
                    entry.run_name,
                    const.ProcessType(entry.proc_type),
                    const.Status(entry.status),
                    entry.job_id,
                ),
            )
        )
def cancel_running(self, proc_types: List[const.ProcessType]):
    """Looks for any running task of the specified process types
    and attempts to cancel one of each.
    """
    # Get all running jobs in the mgmt db
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
    entries = db.command_builder(
        allowed_tasks=proc_types, allowed_states=[const.Status.running]
    )

    # Cancel one for each process type
    for entry in entries:
        if entry.proc_type in proc_types:
            print(
                f"Checkpoint testing: Cancelling job-id {entry.job_id} "
                f"for {entry.run_name} and process type {entry.proc_type}"
            )
            out, err = Scheduler.get_scheduler().cancel_job(entry.job_id)

            print("Scancel out: ", out, err)
            if "error" not in out.lower() and "error" not in err.lower():
                self.canceled_running.append(str(entry.job_id))
                proc_types.remove(entry.proc_type)
                print("Cancelled job-id {}".format(entry.job_id))

    return proc_types
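# Hedged usage sketch, not part of the original code: cancel_running returns the
# process types it could not find and cancel a running job for, so a checkpoint
# test can poll until every requested type has been cancelled. The `checker`
# instance and the poll interval below are assumptions for illustration only.
def _example_cancel_all_running(checker, poll_seconds: int = 5):
    """Keep calling cancel_running until every requested process type is cancelled."""
    import time

    remaining = [const.ProcessType.EMOD3D, const.ProcessType.HF]
    while remaining:
        # cancel_running removes each process type it successfully cancelled
        remaining = checker.cancel_running(remaining)
        if remaining:
            time.sleep(poll_seconds)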
def install(self):
    print("Installing database")
    create_mgmt_db.create_mgmt_db(
        [
            "{}_REL{:0>2}".format(self.realisation_name, i)
            for i in range(1, 1 + self.realisations)
        ],
        sim_struct.get_mgmt_db(self.stage_dir),
    )
def check_mgmt_db_progress(self):
    """Checks auto submit progress in the management db"""
    with connect_db_ctx(sim_struct.get_mgmt_db(self.stage_dir)) as cur:
        comp_count = [
            cur.execute(
                "SELECT COUNT(*) "
                "FROM state "
                "WHERE status = ? "
                "AND proc_type in (?{})".format(",?" * (len(self.tasks) - 1)),
                (i, *self.tasks),
            ).fetchone()[0]
            for i in range(1, 5)
        ]
        total_count = cur.execute(
            "SELECT COUNT(*) FROM state "
            "WHERE proc_type in (?{})".format(",?" * (len(self.tasks) - 1)),
            (*self.tasks,),
        ).fetchone()[0]

    return comp_count, total_count
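# Hedged sketch, not part of the original code: check_mgmt_db_progress builds its
# SQL IN clause by repeating "?" placeholders to match len(self.tasks). This
# standalone illustration of the same pattern uses an in-memory sqlite table; the
# table and column names mirror the query above, but the rows are made up.
def _example_placeholder_expansion():
    import sqlite3

    tasks = [1, 2, 4]  # assumed proc_type ids, e.g. EMOD3D, HF, IM_calculation
    in_clause = "(?{})".format(",?" * (len(tasks) - 1))  # -> "(?,?,?)"

    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()
    cur.execute("CREATE TABLE state (proc_type INTEGER, status INTEGER)")
    cur.executemany(
        "INSERT INTO state VALUES (?, ?)", [(1, 4), (2, 3), (4, 4), (3, 4)]
    )
    count = cur.execute(
        "SELECT COUNT(*) FROM state WHERE status = ? AND proc_type in " + in_clause,
        (4, *tasks),
    ).fetchone()[0]
    conn.close()
    return count  # 2: only proc_types 1 and 4 have status 4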
def check_completed(self):
    """Checks all simulations that have completed"""
    base_proc_types = [const.ProcessType.IM_calculation]
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))
    entries = db.command_builder(
        allowed_tasks=base_proc_types, allowed_states=[const.Status.completed]
    )

    completed_sims = [sim_t.run_name for sim_t in entries]

    # Only check the ones that haven't been checked already
    completed_new = set(completed_sims) - (self._sim_passed | self._sim_failed)

    for sim in completed_new:
        result = self.check_sim_result(
            os.path.join(
                self.runs_dir, sim_struct.get_fault_from_realisation(sim), sim
            )
        )

        if not result:
            self._sim_failed.add(sim)

            if self._stop_on_error:
                print("Quitting as the following errors occurred: ")
                self.print_errors()
                return False
            else:
                print("The following error occurred for simulation {}:".format(sim))
                print(
                    "ERROR: {}, {}\n".format(
                        self.errors[-1].location, self.errors[-1].error
                    )
                )
        else:
            self._sim_passed.add(sim)

    print(
        "Passed/Failed/Total simulations: {}/{}/{}, ".format(
            len(self._sim_passed), len(self._sim_failed), len(self.sim_dirs)
        )
    )

    return True
def check_mgmt_db_progress(self):
    """Checks auto submit progress in the management db"""
    base_proc_types = [
        const.ProcessType.EMOD3D,
        const.ProcessType.HF,
        const.ProcessType.BB,
        const.ProcessType.IM_calculation,
    ]
    db = MgmtDB(sim_struct.get_mgmt_db(self.stage_dir))

    total_count = len(db.command_builder(allowed_tasks=base_proc_types))

    comp_count = len(
        db.command_builder(
            allowed_tasks=base_proc_types,
            allowed_states=[const.Status.completed],
        )
    )

    failed_count = len(
        db.command_builder(
            allowed_tasks=base_proc_types,
            allowed_states=[const.Status.failed, const.Status.unknown],
        )
    )

    return total_count, comp_count, failed_count
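# Hedged usage sketch, not part of the original code: one way the
# (total, completed, failed) counts returned above could be reported. The
# `checker` instance name is an assumption for illustration only.
def _example_report_progress(checker):
    total, completed, failed = checker.check_mgmt_db_progress()
    outstanding = total - completed - failed
    print(
        "Tasks completed/failed/outstanding/total: {}/{}/{}/{}".format(
            completed, failed, outstanding, total
        )
    )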
def install_fault(
    fault_name,
    n_rel,
    root_folder,
    version,
    stat_file_path,
    seed=HF_DEFAULT_SEED,
    extended_period=False,
    vm_perturbations=False,
    ignore_vm_perturbations=False,
    vm_qpqs_files=False,
    ignore_vm_qpqs_files=False,
    keep_dup_station=True,
    components=None,
    logger: Logger = get_basic_logger(),
):
    config_dict = utils.load_yaml(
        os.path.join(
            platform_config[PLATFORM_CONFIG.TEMPLATES_DIR.name],
            "gmsim",
            version,
            ROOT_DEFAULTS_FILE_NAME,
        )
    )
    # Load variables from cybershake config
    v1d_full_path = os.path.join(
        platform_config[PLATFORM_CONFIG.VELOCITY_MODEL_DIR.name],
        "Mod-1D",
        config_dict.get("v_1d_mod"),
    )
    site_v1d_dir = config_dict.get("site_v1d_dir")
    hf_stat_vs_ref = config_dict.get("hf_stat_vs_ref")

    vs30_file_path = stat_file_path.replace(".ll", ".vs30")
    vs30ref_file_path = stat_file_path.replace(".ll", ".vs30ref")

    # this variable has to be empty
    # TODO: fix this legacy issue, very low priority
    event_name = ""

    # get all srf from source
    srf_dir = simulation_structure.get_srf_dir(root_folder, fault_name)
    list_srf = glob.glob(os.path.join(srf_dir, "*_REL*.srf"))
    if len(list_srf) == 0:
        list_srf = glob.glob(os.path.join(srf_dir, "*.srf"))

    list_srf.sort()
    if n_rel is not None and len(list_srf) != n_rel:
        message = (
            "Error: fault {} failed. Number of realisations does "
            "not match the number of SRF files".format(fault_name)
        )
        logger.log(NOPRINTCRITICAL, message)
        raise RuntimeError(message)

    # Get & validate velocity model directory
    vel_mod_dir = simulation_structure.get_fault_VM_dir(root_folder, fault_name)
    valid_vm, message = validate_vm.validate_vm(vel_mod_dir, srf=list_srf[0])
    if not valid_vm:
        message = "Error: VM {} failed {}".format(fault_name, message)
        logger.log(NOPRINTCRITICAL, message)
        raise RuntimeError(message)

    # Load the variables from vm_params.yaml
    vm_params_path = os.path.join(vel_mod_dir, VM_PARAMS_FILE_NAME)
    vm_params_dict = utils.load_yaml(vm_params_path)
    yes_model_params = (
        False  # statgrid should normally be already generated with Velocity Model
    )

    sim_root_dir = simulation_structure.get_runs_dir(root_folder)
    fault_yaml_path = simulation_structure.get_fault_yaml_path(
        sim_root_dir, fault_name
    )
    root_yaml_path = simulation_structure.get_root_yaml_path(sim_root_dir)

    for srf in list_srf:
        logger.info("Installing {}".format(srf))
        # try to find the stoch file with the same basename
        realisation_name = os.path.splitext(os.path.basename(srf))[0]
        stoch_file_path = simulation_structure.get_stoch_path(
            root_folder, realisation_name
        )
        sim_params_file = simulation_structure.get_source_params_path(
            root_folder, realisation_name
        )

        if not os.path.isfile(stoch_file_path):
            message = "Error: Corresponding Stoch file is not found: {}".format(
                stoch_file_path
            )
            logger.log(NOPRINTCRITICAL, message)
            raise RuntimeError(message)

        # install pairs one by one to fit the new structure
        sim_dir = simulation_structure.get_sim_dir(root_folder, realisation_name)

        (root_params_dict, fault_params_dict, sim_params_dict) = install_simulation(
            version=version,
            sim_dir=sim_dir,
            rel_name=realisation_name,
            run_dir=sim_root_dir,
            vel_mod_dir=vel_mod_dir,
            srf_file=srf,
            stoch_file=stoch_file_path,
            stat_file_path=stat_file_path,
            vs30_file_path=vs30_file_path,
            vs30ref_file_path=vs30ref_file_path,
            yes_statcords=False,
            fault_yaml_path=fault_yaml_path,
            root_yaml_path=root_yaml_path,
            cybershake_root=root_folder,
            site_v1d_dir=site_v1d_dir,
            hf_stat_vs_ref=hf_stat_vs_ref,
            v1d_full_path=v1d_full_path,
            sim_params_file=sim_params_file,
            seed=seed,
            logger=logger,
            extended_period=extended_period,
            vm_perturbations=vm_perturbations,
            ignore_vm_perturbations=ignore_vm_perturbations,
            vm_qpqs_files=vm_qpqs_files,
            ignore_vm_qpqs_files=ignore_vm_qpqs_files,
            components=components,
        )

        if (
            root_params_dict is None
            or fault_params_dict is None
            or sim_params_dict is None
        ):
            # Something has gone wrong, returning without saving anything
            logger.critical("Critical error: some of the params dictionaries are None")
            return

        if root_params_dict is not None and not isclose(
            vm_params_dict["flo"], root_params_dict["flo"]
        ):
            logger.critical(
                "The parameter 'flo' does not match in the VM params and root params files. "
                "Please ensure you are installing the correct gmsim version"
            )
            return

        create_mgmt_db.create_mgmt_db(
            [], simulation_structure.get_mgmt_db(root_folder), srf_files=srf
        )
        utils.setup_dir(os.path.join(root_folder, "mgmt_db_queue"))
        root_params_dict["mgmt_db_location"] = root_folder

        # Generate the fd files, create these at the fault level
        fd_statcords, fd_statlist = generate_fd_files(
            simulation_structure.get_fault_dir(root_folder, fault_name),
            vm_params_dict,
            stat_file=stat_file_path,
            logger=logger,
            keep_dup_station=keep_dup_station,
        )

        fault_params_dict[FaultParams.stat_coords.value] = fd_statcords
        fault_params_dict[FaultParams.FD_STATLIST.value] = fd_statlist
        # root_params_dict['hf_stat_vs_ref'] = cybershake_cfg['hf_stat_vs_ref']
        dump_all_yamls(sim_dir, root_params_dict, fault_params_dict, sim_params_dict)

        # test if the params are accepted by steps HF and BB
        sim_params = utils.load_sim_params(os.path.join(sim_dir, "sim_params.yaml"))

        # check hf
        # temporarily change the script name to hf_sim, due to how error messages are shown
        main_script_name = sys.argv[0]
        sys.argv[0] = "hf_sim.py"

        command_template, add_args = hf_gen_command_template(
            sim_params, list(HPC)[0].name, seed
        )
        run_command = gen_args_cmd(
            ProcessType.HF.command_template, command_template, add_args
        )
        hf_args_parser(cmd=run_command)

        # check bb
        sys.argv[0] = "bb_sim.py"
        command_template, add_args = bb_gen_command_template(sim_params)
        run_command = gen_args_cmd(
            ProcessType.BB.command_template, command_template, add_args
        )
        bb_args_parser(cmd=run_command)

        # change back, to prevent unexpected errors
        sys.argv[0] = main_script_name
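# Hedged usage sketch, not part of the original code: a minimal install_fault call.
# The fault name, realisation count, paths and version below are hypothetical
# placeholders; only the keyword names come from the signature above.
def _example_install_single_fault():
    install_fault(
        fault_name="AlpineF2K",  # hypothetical fault name
        n_rel=10,  # expected number of *_REL*.srf files for the fault
        root_folder="/path/to/cybershake_root",  # hypothetical cybershake root
        version="16.1",  # hypothetical gmsim template version
        stat_file_path="/path/to/stations.ll",  # hypothetical station .ll file
        keep_dup_station=True,
    )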
def queue_monitor_loop(
    root_folder: str,
    sleep_time: int,
    max_retries: int,
    queue_logger: Logger = qclogging.get_basic_logger(),
    alert_url=None,
):
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    queue_folder = sim_struct.get_mgmt_db_queue(root_folder)

    queue_logger.info("Running queue-monitor, exit with Ctrl-C.")

    mgmt_db.add_retries(max_retries)

    sqlite_tmpdir = "/tmp/cer"
    while keepAlive:
        complete_data = True
        if not os.path.exists(sqlite_tmpdir):
            os.makedirs(sqlite_tmpdir)
            queue_logger.debug("Set up the sqlite_tmpdir")

        # For each HPC get a list of (job id, status) pairs and save them in a dictionary
        queued_tasks = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=False, target_machine=hpc
                )
            except EnvironmentError as e:
                queue_logger.critical(e)
                queue_logger.critical(
                    f"An error was encountered when attempting to check {Scheduler.get_scheduler().QUEUE_NAME} for HPC {hpc}. "
                    "Tasks will not be submitted to this HPC until the issue is resolved"
                )
                complete_data = False
            else:
                for task in squeued_tasks:
                    queued_tasks[task.split()[0]] = task.split()[1]

        if len(queued_tasks) > 0:
            if len(queued_tasks) > 200:
                queue_logger.log(
                    VERYVERBOSE,
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}",
                )
                queue_logger.info(
                    "Over 200 tasks were found in the queue. Check the log for an exact listing of them"
                )
            else:
                queue_logger.info(
                    f"{Scheduler.get_scheduler().QUEUE_NAME} tasks: {', '.join([' '.join(task) for task in queued_tasks.items()])}"
                )
        else:
            queue_logger.debug(f"No {Scheduler.get_scheduler().QUEUE_NAME} tasks")

        db_in_progress_tasks = mgmt_db.get_submitted_tasks()
        if len(db_in_progress_tasks) > 0:
            queue_logger.info(
                "In progress tasks in mgmt db:"
                + ", ".join(
                    [
                        "{}-{}-{}-{}".format(
                            entry.run_name,
                            const.ProcessType(entry.proc_type).str_value,
                            entry.job_id,
                            const.Status(entry.status).str_value,
                        )
                        for entry in db_in_progress_tasks
                    ]
                )
            )

        entry_files = os.listdir(queue_folder)
        entry_files.sort()

        entries = []

        for file_name in entry_files[::-1]:
            queue_logger.debug(
                "Checking {} to see if it is a valid update file".format(file_name)
            )
            entry = get_queue_entry(os.path.join(queue_folder, file_name), queue_logger)
            if entry is None:
                queue_logger.debug(
                    "Removing {} from the list of update files".format(file_name)
                )
                entry_files.remove(file_name)
            else:
                if str(entry.job_id) in queued_tasks.keys() and entry.status > 3:
                    # This prevents race conditions where the failure/completion state file
                    # is made and picked up before the job actually finishes.
                    # Most notably happens on Kisti.
                    # The queued and running states are allowed.
                    queue_logger.debug(
                        "Job {} is still running on the HPC, skipping this iteration".format(
                            entry
                        )
                    )
                    entry_files.remove(file_name)
                else:
                    queue_logger.debug("Adding {} to the list of updates".format(entry))
                    entries.insert(0, entry)

        entries.extend(
            update_tasks(
                entry_files,
                queued_tasks,
                db_in_progress_tasks,
                complete_data,
                queue_logger,
                root_folder,
            )
        )

        if len(entries) > 0:
            queue_logger.info("Updating {} mgmt db tasks.".format(len(entries)))
            if mgmt_db.update_entries_live(entries, max_retries, queue_logger):
                for file_name in entry_files:
                    os.remove(os.path.join(queue_folder, file_name))
                # check for jobs that match the alert criteria
                if alert_url is not None:
                    for entry in entries:
                        if entry.status == const.Status.failed.value:
                            entry_retries = mgmt_db.get_retries(
                                entry.proc_type, entry.run_name
                            )
                            if entry_retries < max_retries:
                                msg = f"fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error}"
                            elif entry_retries >= max_retries:
                                msg = f"@here fault:{entry.run_name} step:{entry.proc_type} has failed with error:{entry.error} and met the retry cap"
                            send_alert(msg, alert_url)
            else:
                queue_logger.error(
                    "Failed to update the current entries in the mgmt db queue. "
                    "Please investigate and fix. If this is a repeating error, then this "
                    "will block all other entries from updating."
                )
        else:
            queue_logger.info("No entries in the mgmt db queue.")

        # Nap time
        queue_logger.debug("Sleeping for {}".format(sleep_time))
        time.sleep(sleep_time)
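# Hedged sketch, not part of the original code: queue_monitor_loop spins on the
# module-level keepAlive flag and logs "exit with Ctrl-C", which suggests a SIGINT
# handler clears the flag elsewhere. The helper below is an assumption about how
# that wiring could look, not the module's actual implementation.
def _example_install_sigint_handler():
    """Register a handler that clears keepAlive so the monitor loop exits cleanly."""
    import signal

    def _handler(signum, frame):
        global keepAlive
        keepAlive = False

    signal.signal(signal.SIGINT, _handler)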
def run_main_submit_loop(
    root_folder: str,
    n_runs: Dict[str, int],
    rels_to_run: str,
    given_tasks_to_run: List[const.ProcessType],
    sleep_time: int,
    models_tuple: Tuple[est.EstModel],
    main_logger: Logger = qclogging.get_basic_logger(),
    cycle_timeout=1,
):
    mgmt_queue_folder = sim_struct.get_mgmt_db_queue(root_folder)
    mgmt_db = MgmtDB(sim_struct.get_mgmt_db(root_folder))
    root_params_file = os.path.join(
        sim_struct.get_runs_dir(root_folder), "root_params.yaml"
    )
    config = utils.load_yaml(root_params_file)
    main_logger.info("Loaded root params file: {}".format(root_params_file))

    # Default values
    hf_seed = config["hf"].get(const.RootParams.seed.value, const.HF_DEFAULT_SEED)
    main_logger.debug("hf_seed set to {}".format(hf_seed))
    main_logger.debug(f"extended_period set to {config['ims']['extended_period']}")

    time_since_something_happened = cycle_timeout

    while time_since_something_happened > 0:
        main_logger.debug(
            "time_since_something_happened is now {}".format(
                time_since_something_happened
            )
        )
        time_since_something_happened -= 1

        # Get items in the mgmt queue, have to get a snapshot instead of
        # checking the directory real-time to prevent timing issues,
        # which can result in dual-submission
        mgmt_queue_entries = os.listdir(mgmt_queue_folder)

        # Get in progress tasks in the db and the HPC queue
        n_tasks_to_run = {}
        for hpc in HPC:
            try:
                squeued_tasks = Scheduler.get_scheduler().check_queues(
                    user=True, target_machine=hpc
                )
            except EnvironmentError as e:
                main_logger.critical(e)
                n_tasks_to_run[hpc] = 0
            else:
                n_tasks_to_run[hpc] = n_runs[hpc] - len(squeued_tasks)
                if len(squeued_tasks) > 0:
                    main_logger.debug(
                        "There was at least one job in squeue, resetting timeout"
                    )
                    time_since_something_happened = cycle_timeout

        # Gets all runnable tasks based on mgmt db state
        runnable_tasks = mgmt_db.get_runnable_tasks(
            rels_to_run,
            sum(n_runs.values()),
            os.listdir(sim_struct.get_mgmt_db_queue(root_folder)),
            given_tasks_to_run,
            main_logger,
        )
        if len(runnable_tasks) > 0:
            time_since_something_happened = cycle_timeout
            main_logger.info("Number of runnable tasks: {}".format(len(runnable_tasks)))
            main_logger.debug("There was at least one runnable task, resetting timeout")
        else:
            main_logger.debug("No runnable_tasks")

        # Select the first n_tasks_to_run tasks that are not waiting
        # for mgmt db updates (i.e. items in the queue)
        tasks_to_run, task_counter = [], {key: 0 for key in HPC}
        for cur_proc_type, cur_run_name, retries in runnable_tasks:
            cur_hpc = get_target_machine(cur_proc_type)
            # Add task if the limit has not been reached and there are no
            # outstanding mgmt db updates
            if (
                not shared_automated_workflow.check_mgmt_queue(
                    mgmt_queue_entries, cur_run_name, cur_proc_type
                )
                and task_counter.get(cur_hpc, 0) < n_tasks_to_run[cur_hpc]
            ):
                tasks_to_run.append((cur_proc_type, cur_run_name, retries))
                task_counter[cur_hpc] += 1

            # Open to better suggestions
            # Break if enough tasks for each HPC have been added
            if np.all(
                [
                    task_counter.get(hpc, 0) >= n_tasks_to_run[hpc]
                    for hpc in n_tasks_to_run.keys()
                ]
            ):
                break

        if len(tasks_to_run) > 0:
            main_logger.info(
                "Tasks to run this iteration: "
                + ", ".join(
                    [
                        "{}-{}".format(
                            entry[1], const.ProcessType(entry[0]).str_value
                        )
                        for entry in tasks_to_run
                    ]
                )
            )
        else:
            main_logger.debug("No tasks to run this iteration")

        # Submit the runnable tasks
        for proc_type, run_name, retries in tasks_to_run:

            # Special handling for merge-ts
            if proc_type == const.ProcessType.merge_ts.value:
                # Check if clean up has already run
                if mgmt_db.is_task_complete(
                    [
                        const.ProcessType.clean_up.value,
                        run_name,
                        const.Status.completed.str_value,
                    ]
                ):
                    # If clean_up has already run, then we should set it to
                    # be run again after merge_ts has run
                    shared_automated_workflow.add_to_queue(
                        mgmt_queue_folder,
                        run_name,
                        const.ProcessType.clean_up.value,
                        const.Status.created.value,
                        logger=main_logger,
                    )

            # submit the job
            submit_task(
                sim_struct.get_sim_dir(root_folder, run_name),
                proc_type,
                run_name,
                root_folder,
                main_logger,
                retries=retries,
                hf_seed=hf_seed,
                models=models_tuple,
            )

        main_logger.debug("Sleeping for {} second(s)".format(sleep_time))
        time.sleep(sleep_time)

    main_logger.info("Nothing was running or ready to run last cycle, exiting now")
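# Hedged usage sketch, not part of the original code: a minimal invocation of
# run_main_submit_loop. Although n_runs is annotated Dict[str, int], the loop body
# indexes it with HPC members (n_runs[hpc]), so this sketch keys it by HPC. Every
# concrete value here, and the contents of models_tuple, are hypothetical.
def _example_start_submit_loop(models_tuple):
    run_main_submit_loop(
        root_folder="/path/to/cybershake_root",  # hypothetical root folder
        n_runs={hpc: 20 for hpc in HPC},  # assumed per-HPC submission limits
        rels_to_run="AlpineF2K_REL01",  # hypothetical realisation selection
        given_tasks_to_run=[
            const.ProcessType.EMOD3D,
            const.ProcessType.HF,
            const.ProcessType.BB,
            const.ProcessType.IM_calculation,
        ],
        sleep_time=5,
        models_tuple=models_tuple,
        cycle_timeout=1,
    )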