def resub(directory='in place'):  # Takes a directory, resubmits errors, scf failures, and spin contaminated cases
    configure_dict = manager_io.read_configure(directory, None)
    max_resub = configure_dict['max_resub']
    max_jobs = configure_dict['max_jobs']
    hard_job_limit = configure_dict['hard_job_limit']
    hit_queue_limit = False  # Describes if this run has limited the number of jobs submitted to work well with the queue

    # Get the state of all jobs being managed by this instance of the job manager
    completeness = moltools.check_completeness(directory, max_resub, configure_dict=configure_dict)
    print("completeness: ", completeness)
    errors = completeness['Error']  # These are calculations which failed to complete
    scf_errors = completeness['SCF_Error']  # These are calculations which failed to complete, appear to have an scf error, and hit wall time
    oscillating_scf_errors = completeness['oscillating_scf_errors']  # These are calculations which failed to complete and appear to have an oscillating scf error
    need_resub = completeness['Needs_resub']  # These are calculations with level shifts changed or hfx exchange changed
    spin_contaminated = completeness['Spin_contaminated']  # These are finished jobs with spin contaminated solutions
    active = completeness['Active']  # These are jobs which are currently running
    thermo_grad_error = completeness['Thermo_grad_error']  # These are thermo jobs encountering the thermo grad error
    waiting = completeness['Waiting']  # These are jobs which are or were waiting for another job to finish before continuing.
    bad_geos = completeness['Bad_geos']  # These are jobs which finished, but converged to a bad geometry.
    finished = completeness['Finished']
    molscontrol_kills = completeness['molscontrol_kills']
    nactive = tools.get_number_active()  # number of active jobs, counting bundled jobs as a single job

    # Kill SCF errors in progress, which are wasting computational resources
    all_scf_errors = completeness['SCF_Errors_Including_Active']  # These are all jobs which appear to have an scf error, including active ones
    scf_errors_to_kill = [scf_err for scf_err in all_scf_errors if scf_err not in scf_errors]
    names_to_kill = [os.path.split(scf_err)[-1].rsplit('.', 1)[0] for scf_err in scf_errors_to_kill]
    kill_jobs(names_to_kill,
              message1='Job: ',
              message2=' appears to have an scf error. Killing this job early')

    # Prep derivative jobs such as thermo single points, vertical IP, and ligand dissociation energies
    needs_derivative_jobs = list(filter(tools.check_original, finished))
    print("needs_derivative_jobs: ", needs_derivative_jobs)
    prep_derivative_jobs(directory, needs_derivative_jobs)

    resubmitted = []  # Resubmitted list gets True if the job is submitted or False if not. Contains booleans, not job identifiers.
    for job in molscontrol_kills:
        print("killed by molscontrol: ", job)

    # Resub unidentified errors
    for error in errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        resub_tmp = recovery.simple_resub(error)
        if resub_tmp:
            print(('Unidentified error in job: ' + os.path.split(error)[-1] + ' -Resubmitting'))
            print('')
        resubmitted.append(resub_tmp)

    # Resub oscillating_scf convergence errors
    for error in oscillating_scf_errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'scf' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_oscillating_scf(error)
            if resub_tmp:
                print(('Oscillating SCF error identified in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting with adjusted precision and grid.'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub scf convergence errors
    for error in scf_errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'scf' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_scf(error)
            if resub_tmp:
                print(('SCF error identified in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting with adjusted levelshifts'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub jobs which converged to bad geometries with additional constraints
    for error in bad_geos:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'bad_geo' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_bad_geo(error, directory)
            if resub_tmp:
                print(('Bad final geometry in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting from initial structure with additional constraints'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub spin contaminated cases
    for error in spin_contaminated:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'spin_contaminated' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_spin(error)
            if resub_tmp:
                print(('Spin contamination identified in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting with adjusted HFX'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub jobs with atypical parameters used to aid convergence
    for error in need_resub:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        resub_tmp = recovery.clean_resub(error)
        if resub_tmp:
            print(('Job ' + os.path.split(error)[-1] + ' needs to be rerun with typical parameters. -Resubmitting'))
            print('')
        resubmitted.append(resub_tmp)

    # Create a job with a tighter convergence threshold for failed thermo jobs
    for error in thermo_grad_error:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'thermo_grad_error' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_tighter(error)
            if resub_tmp:
                print(('Job ' + os.path.split(error)[-1]
                       + ' needs a better initial geo. Creating a geometry run with tighter convergence criteria'))
                print('')
            resubmitted.append(resub_tmp)

    # Look at jobs in "waiting," resume them if the job they were waiting for is finished
    # Currently, this should only ever be thermo jobs waiting for an ultratight job
    for waiting_dict in waiting:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or (
                (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        if len(list(waiting_dict.keys())) > 1:
            raise Exception('Waiting job list improperly constructed')
        job = list(waiting_dict.keys())[0]
        waiting_for = waiting_dict[job]
        if waiting_for in finished:
            history = recovery.load_history(job)
            history.waiting = None
            history.save()
            results_for_this_job = manager_io.read_outfile(job)
            if results_for_this_job['thermo_grad_error']:
                resubmitted.append(recovery.resub_thermo(job))
            else:
                raise Exception('A method for resuming job: ' + job + ' is not defined')
        else:
            resubmitted.append(False)

    # Submit jobs which haven't yet been submitted
    if not (((nactive + np.sum(resubmitted)) >= max_jobs) or (
            (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit)):
        to_submit = []
        jobscripts = tools.find('*_jobscript')
        active_jobs = tools.list_active_jobs(home_directory=directory, parse_bundles=True)
        for job in jobscripts:
            if not os.path.isfile(job.rsplit('_', 1)[0] + '.out') and not os.path.split(
                    job.rsplit('_', 1)[0])[-1] in active_jobs:
                to_submit.append(job)

        short_jobs_to_submit = [i for i in to_submit if tools.check_short_single_point(i)]
        long_jobs_to_submit = [i for i in to_submit if i not in short_jobs_to_submit]
        if len(short_jobs_to_submit) > 0:
            bundled_jobscripts = tools.bundle_jobscripts(os.getcwd(), short_jobs_to_submit)
        else:
            bundled_jobscripts = []
        to_submit = bundled_jobscripts + long_jobs_to_submit

        submitted = []
        for job in to_submit:
            if ((len(submitted) + nactive + np.sum(resubmitted)) >= max_jobs) or (
                    (tools.get_total_queue_usage() + len(submitted) + np.sum(resubmitted)) >= hard_job_limit):
                hit_queue_limit = True
                continue
            print(('Initial submission for job: ' + os.path.split(job)[-1]))
            tools.qsub(job)
            submitted.append(True)
    else:
        hit_queue_limit = True
        submitted = []

    number_resubmitted = np.sum(np.array(resubmitted + submitted))
    # ~ print str(number_resubmitted)+' Jobs submitted'
    return int(number_resubmitted), int(len(completeness['Active'])), hit_queue_limit
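
# Illustrative driver loop (a minimal sketch, not part of this module): resub() returns
# the number of jobs (re)submitted, the number still active, and whether the queue limit
# was hit, so a caller can poll it until nothing is running or waiting. The loop
# condition and sleep interval below are assumptions for illustration only.
#
#     while True:
#         number_resubmitted, number_active, hit_queue_limit = resub()
#         if number_resubmitted == 0 and number_active == 0 and not hit_queue_limit:
#             break
#         time.sleep(3600)  # hypothetical polling interval (seconds)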
def resume_run(args):
    print('**********************************************')
    print(args)
    ## change to run directory
    with switch_to_rundir(args.resume):
        print((os.getcwd()))
        if args.reps:
            reps = args.reps
        else:
            reps = 1
        its = 0
        while its < reps:
            GA_run = GA_run_definition()
            GA_run.deserialize(os.getcwd() + '/.madconfig')
            if args.post_all:
                GA_run.config['post_all'] = True
                print('NB: ALL post on')
                GA_run.serialize()
            else:
                GA_run.config['post_all'] = False
                print('NB: ALL post OFF')
                GA_run.serialize()
            path_dictionary = setup_paths()
            print((GA_run.config))
            if isKeyword('DFT'):
                print('DFT ON')
                ## update which jobs are live
                logger(path_dictionary['state_path'], '************************************************')
                logger(path_dictionary['state_path'], str(datetime.datetime.now()) + ' waking up...yawn')
                if isKeyword('job_manager'):
                    live_job_count = tools.get_total_queue_usage()
                else:
                    live_job_count = check_queue_for_live_jobs()
                logger(path_dictionary['state_path'],
                       str(datetime.datetime.now()) + ' monitoring, number of live jobs ' + str(live_job_count))
            ## wake the run
            logger(path_dictionary['state_path'], str(datetime.datetime.now()) + ' resuming MAD')
            wake_up_routine()
            if isKeyword('DFT'):
                ## send off outstanding jobs
                if isKeyword('job_manager'):
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' Enlisted job manager for help.')
                    os.chdir(path_dictionary['job_manager'])
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' Changed to the path of all job manager jobs.')
                    resub.main()
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' Job manager has terminated.')
                    os.chdir(isKeyword('rundir'))
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' Switching back to the base rundir.')
                else:
                    joblist = submit_outstanding_jobs()
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' tracking a total of ' + str(len(joblist)) + ' jobs')
                    live_job_count = check_queue_for_live_jobs()  # final check on live jobs
                    logger(path_dictionary['state_path'],
                           str(datetime.datetime.now()) + ' going back to sleep, number of live jobs ' + str(live_job_count))
                logger(path_dictionary['state_path'], str(datetime.datetime.now()) + ' going back to sleep')
                logger(path_dictionary['state_path'], '************************************************')
            else:
                print('------- DONE NOW (ANN VERSION)-----')
                if not isKeyword('runtype') == "redox":
                    try:
                        print('-----format_frequencies and format_distances being carried out')
                        format_frequencies()
                        format_distances()
                        print('Done with distances now.')
                    except:
                        print('Passed in ga_resume_run.')
                        pass
            its += 1
            if args.sleep:
                print(('sleeping for ' + str(args.sleep)))
                time.sleep(args.sleep)
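
# Illustrative invocation (a minimal sketch): resume_run() expects an argparse-style
# namespace. The flags below are assumptions that simply mirror the attributes read
# above (args.resume, args.reps, args.post_all, args.sleep); the package's real CLI
# parser is defined elsewhere.
#
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--resume', help='run directory to resume')
#     parser.add_argument('--reps', type=int, default=None, help='number of wake/resubmit cycles')
#     parser.add_argument('--post_all', action='store_true', help='force post-processing of all jobs')
#     parser.add_argument('--sleep', type=int, default=None, help='seconds to sleep after each cycle')
#     resume_run(parser.parse_args())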