def simple_resub(outfile_path):
    """Resubmits a job without changing parameters. Particularly useful for CUDA errors.

    Parameters
    ----------
    outfile_path : str
        The name of an output file.

    Returns
    -------
    Resub_flag : bool
        True if resubmitted.
    """
    save_run(outfile_path, rewrite_inscr=False)
    history = resub_history()
    history.read(outfile_path)
    history.resub_number += 1
    history.notes.append('Resubbed for unknown error')
    history.save()

    root = outfile_path.rsplit('.', 1)[0]
    tools.qsub(root + '_jobscript')
    return True

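# Usage sketch (the path is hypothetical): the recovery functions in this module
# all follow the pattern shown in simple_resub() -- archive the failed attempt
# with save_run(), record the event in the job's resub_history state file, then
# qsub the existing jobscript:
#
#     simple_resub('fe_complexes/fe_co_6/fe_co_6.out')
#     # -> fe_complexes/fe_co_6/fe_co_6_jobscript is submitted again, unchanged
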
def resub_scf(outfile_path):
    """Resubmits a job that's having trouble converging the scf with different level shifts (1.0 and 0.1).

    Parameters
    ----------
    outfile_path : str
        The name of an output file.

    Returns
    -------
    Resub_flag : bool
        True if resubmitted.
    """
    history = resub_history()
    history.read(outfile_path)
    resubbed_before = False
    if 'SCF convergence error, level shifts adjusted to aid convergence' in history.notes:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + ' has been submitted with levels shifted and is still encountering an scf error'
        history.save()
    if 'Needs clean resub' in history.notes:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + ' job recovery has failed - requesting resub_scf() after clean resubmission round'
        history.save()

    if not resubbed_before:
        save_run(outfile_path, rewrite_inscr=False)
        history = resub_history()
        history.read(outfile_path)
        history.resub_number += 1
        history.status = 'Level shifts adjusted to assist convergence'
        history.needs_resub = True
        history.notes.append('SCF convergence error, level shifts adjusted to aid convergence')
        history.save()

        machine = tools.get_machine()
        root = outfile_path.rsplit('.', 1)[0]
        name = os.path.split(root)[-1]
        directory = os.path.split(outfile_path)[0]
        infile_dict = manager_io.read_infile(outfile_path)

        home = os.getcwd()
        if len(directory) > 0:  # if the string is blank, we're already in the correct directory
            os.chdir(directory)

        infile_dict['levelshifta'], infile_dict['levelshiftb'] = 1.0, 0.1
        infile_dict['machine'] = machine
        manager_io.write_input(infile_dict)
        manager_io.write_jobscript(name, machine=machine)
        os.chdir(home)
        tools.qsub(root + '_jobscript')
        return True
    else:
        return False

def resub_thermo(outfile_path):
    """Similar to simple_resub(), but specific to addressing thermo gradient errors.
    Checks for the existence of an ultratight version of this run and, if one exists,
    uses its most up-to-date geometry for the new thermo run.

    Parameters
    ----------
    outfile_path : str
        The name of an output file.

    Returns
    -------
    Resub_flag : bool
        True if resubmitted.
    """
    save_run(outfile_path, rewrite_inscr=False)
    history = resub_history()
    history.read(outfile_path)
    history.resub_number += 1
    history.status = 'Normal'
    history.notes.append('Resubmitting thermo, possibly with a better initial geo')
    history.needs_resub = False
    history.save()

    name = os.path.split(outfile_path)[-1]
    name = name.rsplit('.', 1)[0]
    directory = os.path.split(outfile_path)[0]
    parent_name = name.rsplit('_', 1)[0]
    parent_directory = os.path.split(os.path.split(outfile_path)[0])[0]
    ultratight_dir = os.path.join(parent_directory, parent_name + '_ultratight')

    infile_dict = manager_io.read_infile(outfile_path)

    if os.path.exists(ultratight_dir):
        if os.path.exists(os.path.join(ultratight_dir, 'scr', 'optim.xyz')):
            tools.extract_optimized_geo(os.path.join(ultratight_dir, 'scr', 'optim.xyz'))
            shutil.copy(os.path.join(ultratight_dir, 'scr', 'optimized.xyz'),
                        outfile_path.rsplit('.', 1)[0] + '.xyz')
        else:
            raise Exception('Unable to identify the ultratight geometry for run: ' + outfile_path)

        if infile_dict['spinmult'] == 1 and os.path.exists(os.path.join(ultratight_dir, 'scr', 'c0')):
            shutil.copy(os.path.join(ultratight_dir, 'scr', 'c0'),
                        os.path.join(directory, 'c0'))
        elif infile_dict['spinmult'] != 1 and os.path.exists(os.path.join(ultratight_dir, 'scr', 'ca0')) \
                and os.path.exists(os.path.join(ultratight_dir, 'scr', 'cb0')):
            shutil.copy(os.path.join(ultratight_dir, 'scr', 'ca0'),
                        os.path.join(directory, 'ca0'))
            shutil.copy(os.path.join(ultratight_dir, 'scr', 'cb0'),
                        os.path.join(directory, 'cb0'))
        else:
            raise Exception('Unable to find wavefunction files for ultratight geometry for run: ' + outfile_path)
    else:
        raise Exception('An ultratight run does not exist for this thermo file. '
                        'Consider calling simple_resub() or resub_tighter() instead of resub_thermo()')

    jobscript = outfile_path.rsplit('.', 1)[0] + '_jobscript'
    tools.qsub(jobscript)
    return True

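# The path arithmetic in resub_thermo() implies a directory layout like the
# following (inferred from how parent_name and ultratight_dir are built; the
# run name "complex1" is illustrative):
#
#     parent_directory/
#         complex1.out                             <- parent geometry run
#         complex1_thermo/complex1_thermo.out      <- this thermo run (outfile_path)
#         complex1_ultratight/scr/optim.xyz        <- tightened geometry reused here
#         complex1_ultratight/scr/c0 (or ca0/cb0)  <- wavefunction guess reused here
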
def resub_spin(outfile_path):
    """Resubmits a spin contaminated job with blyp to help convergence to a non-spin-contaminated solution.

    Parameters
    ----------
    outfile_path : str
        The name of an output file.

    Returns
    -------
    Resub_flag : bool
        True if resubmitted.
    """
    history = resub_history()
    history.read(outfile_path)
    resubbed_before = False
    if 'Spin contaminated, lowering HFX to aid convergence' in history.notes:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + ' has been submitted with lower HFX and still converges to a spin contaminated solution'
        history.save()
    if 'Needs clean resub' in history.notes:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + ' job recovery has failed - requesting resub_spin() after clean resubmission round'
        history.save()
    if 'HFXresampling' in outfile_path:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + ' is spin contaminated, but submitting with lower HFX does not make sense for HFX resampling jobs'
        history.save()

    if not resubbed_before:
        save_run(outfile_path, rewrite_inscr=False)
        history = resub_history()
        history.read(outfile_path)
        history.resub_number += 1
        history.status = 'HFX altered to assist convergence'
        history.needs_resub = True
        history.notes.append('Spin contaminated, lowering HFX to aid convergence')
        history.save()

        machine = tools.get_machine()
        root = outfile_path.rsplit('.', 1)[0]
        name = os.path.split(root)[-1]
        directory = os.path.split(outfile_path)[0]
        infile_dict = manager_io.read_infile(outfile_path)

        home = os.getcwd()
        if len(directory) > 0:  # if the string is blank, we're already in the correct directory
            os.chdir(directory)

        infile_dict['method'] = 'blyp'
        infile_dict['machine'] = machine
        manager_io.write_input(infile_dict)
        manager_io.write_jobscript(name, machine=machine)
        os.chdir(home)
        tools.qsub(root + '_jobscript')
        return True
    else:
        return False

def clean_resub(outfile_path):
    """Resubmits a job with default parameters, useful for undoing level shift or HFX alterations.

    Parameters
    ----------
    outfile_path : str
        The name of an output file.

    Returns
    -------
    Resub_flag : bool
        True if resubmitted.
    """
    save_run(outfile_path)
    history = resub_history()
    history.read(outfile_path)
    history.resub_number += 1
    history.status = 'Normal'
    history.notes.append('Needs clean resub')
    history.needs_resub = False
    history.save()

    machine = tools.get_machine()
    root = outfile_path.rsplit('.', 1)[0]
    name = os.path.split(root)[-1]
    directory = os.path.split(outfile_path)[0]
    infile_dict = manager_io.read_infile(outfile_path)

    home = os.getcwd()
    if len(directory) > 0:  # if the string is blank, we're already in the correct directory
        os.chdir(directory)

    if os.path.isfile('inscr/optimized.xyz'):
        coordinates = 'inscr/optimized.xyz'  # Should trigger for optimization runs
    elif os.path.isfile(name + '.xyz'):
        coordinates = name + '.xyz'  # Should trigger for single point runs
    else:
        raise ValueError('No coordinates identified for clean resubmission in directory ' + os.getcwd())

    configure_dict = manager_io.read_configure('in_place', outfile_path)

    infile_dict['coordinates'] = coordinates
    infile_dict['method'] = configure_dict['method']
    infile_dict['levelshifta'], infile_dict['levelshiftb'] = configure_dict['levela'], configure_dict['levelb']
    infile_dict['dispersion'] = configure_dict['dispersion']
    infile_dict['constraints'] = False
    infile_dict['machine'] = machine

    if infile_dict['spinmult'] == 1:
        infile_dict['guess'] = 'inscr/c0'
        manager_io.write_input(infile_dict)
    else:
        infile_dict['guess'] = 'inscr/ca0 inscr/cb0'
        manager_io.write_input(infile_dict)

    manager_io.write_jobscript(name, custom_line='# -fin inscr/', machine=machine)
    os.chdir(home)
    tools.qsub(root + '_jobscript')
    return True

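# clean_resub() restores defaults from the user's configure file. A minimal
# sketch of the keys it reads (the values shown are illustrative, not defaults
# shipped with the code):
#
#     configure_dict = {'method': 'b3lyp',     # functional to restore
#                       'levela': 0.25,        # default alpha level shift
#                       'levelb': 0.25,        # default beta level shift
#                       'dispersion': 'no'}    # dispersion correction setting
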
def resub_tighter(outfile_path):
    """Resubmits a thermo job with the gradient error problem. Finds the parent job
    and resubmits it with tighter scf convergence criteria.

    Parameters
    ----------
    outfile_path : str
        The name of an output file.

    Returns
    -------
    Resub_flag : bool
        True if resubmitted.
    """
    machine = tools.get_machine()
    name = os.path.split(outfile_path)[-1].rsplit('.', 1)[0]
    parent_name = name.rsplit('_', 1)[0]
    parent_directory = os.path.split(os.path.split(outfile_path)[0])[0]
    parent_path = os.path.join(parent_directory, parent_name + '.out')
    ultratight_path = os.path.join(parent_directory, parent_name + '_ultratight',
                                   parent_name + '_ultratight.out')

    scr_needs_to_be_saved = False
    if os.path.exists(ultratight_path):
        # This ultratight resubmission has happened before; archive the results
        save_run(ultratight_path, rewrite_inscr=False, save_scr_flag=False)
        # The scr must be saved AFTER prepping the new ultratight run. This keeps compatibility with other functions
        scr_needs_to_be_saved = True

        history = resub_history()
        history.read(ultratight_path)
        history.resub_number += 1
        history.status = 'Running with tightened convergence thresholds'
        history.needs_resub = False
        history.notes.append('Further tightening convergence thresholds')
        history.save()

    jobscript = tools.prep_ultratight(parent_path)  # Prep tighter convergence run
    if scr_needs_to_be_saved:
        save_scr(ultratight_path, rewrite_inscr=False)
    tools.qsub(jobscript)  # Submit tighter convergence run

    # Set the original thermo run to wait for the ultratight run to finish
    history = resub_history()
    history.read(outfile_path)
    history.waiting = ultratight_path
    history.save()

    return True

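# Note on the hand-off: resub_tighter() sets history.waiting on the original
# thermo job to the ultratight outfile path. The resub() driver below polls
# that field and, once the ultratight job shows up in the finished list, calls
# resub_thermo() to rerun the thermo job from the tightened geometry.
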
def resub_oscillating_scf(outfile_path):
    """Resubmits a job with an oscillating scf error, with tightened precision and a finer DFT grid.

    Parameters
    ----------
    outfile_path : str
        The name of an output file.

    Returns
    -------
    Resub_flag : bool
        True if resubmitted.
    """
    history = resub_history()
    history.read(outfile_path)
    resubbed_before = False
    if 'SCF convergence error, precision and grid adjusted to aid convergence' in history.notes:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + ' has been submitted with higher precision and grid and is still encountering an scf error'
        history.save()
    if 'Needs clean resub' in history.notes:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + ' job recovery has failed - requesting resub_oscillating_scf() after clean resubmission round'
        history.save()

    if not resubbed_before:
        save_run(outfile_path, rewrite_inscr=False)
        history = resub_history()
        history.read(outfile_path)
        history.resub_number += 1
        history.status = 'precision and grid adjusted to assist convergence'
        history.notes.append('SCF convergence error, precision and grid adjusted to aid convergence')
        history.save()

        machine = tools.get_machine()
        root = outfile_path.rsplit('.', 1)[0]
        name = os.path.split(root)[-1]
        directory = os.path.split(outfile_path)[0]
        infile_dict = manager_io.read_infile(outfile_path)

        home = os.getcwd()
        if len(directory) > 0:  # if the string is blank, we're already in the correct directory
            os.chdir(directory)

        infile_dict['precision'], infile_dict['dftgrid'], infile_dict['dynamicgrid'] = "double", 5, "no"
        infile_dict['machine'] = machine
        manager_io.write_input(infile_dict)
        manager_io.write_jobscript(name, machine=machine)
        os.chdir(home)
        tools.qsub(root + '_jobscript')
        return True
    else:
        return False

def resub_bad_geo(outfile_path, home_directory):
    """Resubmits a job that's converged to a bad geometry with additional constraints.

    Parameters
    ----------
    outfile_path : str
        The name of an output file.
    home_directory : str
        Path to the base directory of the run.

    Returns
    -------
    Resub_flag : bool
        True if resubmitted.
    """
    history = resub_history()
    history.read(outfile_path)
    resubbed_before = False
    if 'Bad geometry detected, adding constraints and trying again' in history.notes:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + " has been submitted with additional constraints and still isn't a good geometry"
        history.save()
    if 'Needs clean resub' in history.notes:
        resubbed_before = True
        history.status = os.path.split(outfile_path)[-1] + ' job recovery has failed - requesting resub_bad_geo after clean resubmission round'
        history.save()

    if not resubbed_before:
        save_run(outfile_path, rewrite_inscr=True)
        history = resub_history()
        history.read(outfile_path)
        history.resub_number += 1
        history.status = 'Constraints added to help convergence'
        history.needs_resub = True
        history.notes.append('Bad geometry detected, adding constraints and trying again')
        history.save()

        machine = tools.get_machine()
        root = outfile_path.rsplit('.', 1)[0]
        name = os.path.split(root)[-1]
        directory = os.path.split(outfile_path)[0]
        infile_dict = manager_io.read_infile(outfile_path)

        if infile_dict['constraints']:
            raise Exception('resub.py does not currently support the use of external atom constraints. '
                            'These will be overwritten by clean_resub() during job recovery')

        goal_geo = manager_io.read_configure(home_directory, outfile_path)['geo_check']
        if not goal_geo:
            raise Exception('Goal geometry not specified, job ' + outfile_path + ' should not have been labelled bad geo!')
        else:
            metal_index, bonded_atom_indices = moltools.get_metal_and_bonded_atoms(outfile_path, goal_geo)
            # Convert indices from zero-indexed to one-indexed
            metal_index += 1
            bonded_atom_indices = [index + 1 for index in bonded_atom_indices]
            # Convert to TeraChem input syntax
            constraints = ['bond ' + str(metal_index) + '_' + str(index) + '\n' for index in bonded_atom_indices]

        home = os.getcwd()
        if len(directory) > 0:  # if the string is blank, we're already in the correct directory
            os.chdir(directory)

        infile_dict['constraints'] = constraints
        infile_dict['machine'] = machine
        manager_io.write_input(infile_dict)
        manager_io.write_jobscript(name, machine=machine)
        os.chdir(home)
        tools.qsub(root + '_jobscript')
        return True
    else:
        return False

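# Worked example of the constraint construction above (indices are
# illustrative): if the metal is atom 0 (zero-indexed) and it bonds atoms
# 1, 5, and 12, the one-indexed TeraChem-style strings become:
#
#     metal_index = 1
#     bonded_atom_indices = [2, 6, 13]
#     constraints = ['bond 1_2\n', 'bond 1_6\n', 'bond 1_13\n']
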
def resub(directory='in place'):
    """Takes a directory and resubmits errors, scf failures, and spin contaminated cases.

    Parameters
    ----------
    directory : str, optional
        The directory being managed. Default is 'in place'.

    Returns
    -------
    number_resubmitted : int
        Number of jobs submitted or resubmitted during this call.
    number_active : int
        Number of jobs currently active.
    hit_queue_limit : bool
        True if submission was throttled by max_jobs or hard_job_limit.
    """
    configure_dict = manager_io.read_configure(directory, None)
    max_resub = configure_dict['max_resub']
    max_jobs = configure_dict['max_jobs']
    hard_job_limit = configure_dict['hard_job_limit']
    hit_queue_limit = False  # True if this run has limited the number of jobs submitted to work well with the queue

    # Get the state of all jobs being managed by this instance of the job manager
    completeness = moltools.check_completeness(directory, max_resub, configure_dict=configure_dict)
    print("completeness: ", completeness)

    errors = completeness['Error']  # Calculations which failed to complete
    scf_errors = completeness['SCF_Error']  # Calculations which failed to complete, appear to have an scf error, and hit wall time
    oscillating_scf_errors = completeness['oscillating_scf_errors']  # Calculations which failed to complete and appear to have an oscillating scf error
    need_resub = completeness['Needs_resub']  # Calculations with level shifts or hfx exchange changed
    spin_contaminated = completeness['Spin_contaminated']  # Finished jobs with spin contaminated solutions
    active = completeness['Active']  # Jobs which are currently running
    thermo_grad_error = completeness['Thermo_grad_error']  # Thermo jobs encountering the thermo grad error
    waiting = completeness['Waiting']  # Jobs which are or were waiting for another job to finish before continuing
    bad_geos = completeness['Bad_geos']  # Jobs which finished but converged to a bad geometry
    finished = completeness['Finished']
    molscontrol_kills = completeness['molscontrol_kills']
    nactive = tools.get_number_active()  # Number of active jobs, counting bundled jobs as a single job

    # Kill SCF errors in progress, which are wasting computational resources
    all_scf_errors = completeness['SCF_Errors_Including_Active']  # All jobs which appear to have an scf error, including active ones
    scf_errors_to_kill = [scf_err for scf_err in all_scf_errors if scf_err not in scf_errors]
    names_to_kill = [os.path.split(scf_err)[-1].rsplit('.', 1)[0] for scf_err in scf_errors_to_kill]
    kill_jobs(names_to_kill, message1='Job: ', message2=' appears to have an scf error. Killing this job early')

    # Prep derivative jobs such as thermo single points, vertical IP, and ligand dissociation energies
    needs_derivative_jobs = list(filter(tools.check_original, finished))
    print("needs_derivative_jobs: ", needs_derivative_jobs)
    prep_derivative_jobs(directory, needs_derivative_jobs)

    resubmitted = []  # Gets True if a job is submitted, False if not. Contains booleans, not job identifiers.

    for job in molscontrol_kills:
        print("killed by molscontrol: ", job)

    # Resub unidentified errors
    for error in errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or ((tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        resub_tmp = recovery.simple_resub(error)
        if resub_tmp:
            print(('Unidentified error in job: ' + os.path.split(error)[-1] + ' -Resubmitting'))
            print('')
        resubmitted.append(resub_tmp)

    # Resub oscillating scf convergence errors
    for error in oscillating_scf_errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or ((tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'scf' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_oscillating_scf(error)
            if resub_tmp:
                print(('Oscillating SCF error identified in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting with adjusted precision and grid.'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub scf convergence errors
    for error in scf_errors:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or ((tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'scf' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_scf(error)
            if resub_tmp:
                print(('SCF error identified in job: ' + os.path.split(error)[-1] + ' -Resubmitting with adjusted levelshifts'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub jobs which converged to bad geometries with additional constraints
    for error in bad_geos:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or ((tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'bad_geo' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_bad_geo(error, directory)
            if resub_tmp:
                print(('Bad final geometry in job: ' + os.path.split(error)[-1]
                       + ' -Resubmitting from initial structure with additional constraints'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub spin contaminated cases
    for error in spin_contaminated:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or ((tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'spin_contaminated' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_spin(error)
            if resub_tmp:
                print(('Spin contamination identified in job: ' + os.path.split(error)[-1] + ' -Resubmitting with adjusted HFX'))
                print('')
            resubmitted.append(resub_tmp)

    # Resub jobs with atypical parameters used to aid convergence
    for error in need_resub:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or ((tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        resub_tmp = recovery.clean_resub(error)
        if resub_tmp:
            print(('Job ' + os.path.split(error)[-1] + ' needs to be rerun with typical parameters. -Resubmitting'))
            print('')
        resubmitted.append(resub_tmp)

    # Create a job with a tighter convergence threshold for failed thermo jobs
    for error in thermo_grad_error:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or ((tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        local_configure = manager_io.read_configure(directory, None)
        if 'thermo_grad_error' in local_configure['job_recovery']:
            resub_tmp = recovery.resub_tighter(error)
            if resub_tmp:
                print(('Job ' + os.path.split(error)[-1]
                       + ' needs a better initial geo. Creating a geometry run with tighter convergence criteria'))
                print('')
            resubmitted.append(resub_tmp)

    # Look at jobs in "waiting"; resume them if the job they were waiting for is finished.
    # Currently, this should only ever be thermo jobs waiting for an ultratight job.
    for waiting_dict in waiting:
        if ((nactive + np.sum(resubmitted)) >= max_jobs) or ((tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
            hit_queue_limit = True
            continue
        if len(list(waiting_dict.keys())) > 1:
            raise Exception('Waiting job list improperly constructed')
        job = list(waiting_dict.keys())[0]
        waiting_for = waiting_dict[job]
        if waiting_for in finished:
            history = recovery.load_history(job)
            history.waiting = None
            history.save()
            results_for_this_job = manager_io.read_outfile(job)
            if results_for_this_job['thermo_grad_error']:
                resubmitted.append(recovery.resub_thermo(job))
            else:
                raise Exception('A method for resuming job: ' + job + ' is not defined')
        else:
            resubmitted.append(False)

    # Submit jobs which haven't yet been submitted
    if not ((nactive + np.sum(resubmitted)) >= max_jobs
            or (tools.get_total_queue_usage() + np.sum(resubmitted)) >= hard_job_limit):
        to_submit = []
        jobscripts = tools.find('*_jobscript')
        active_jobs = tools.list_active_jobs(home_directory=directory, parse_bundles=True)
        for job in jobscripts:
            if not os.path.isfile(job.rsplit('_', 1)[0] + '.out') \
                    and os.path.split(job.rsplit('_', 1)[0])[-1] not in active_jobs:
                to_submit.append(job)

        short_jobs_to_submit = [i for i in to_submit if tools.check_short_single_point(i)]
        long_jobs_to_submit = [i for i in to_submit if i not in short_jobs_to_submit]
        if len(short_jobs_to_submit) > 0:
            bundled_jobscripts = tools.bundle_jobscripts(os.getcwd(), short_jobs_to_submit)
        else:
            bundled_jobscripts = []
        to_submit = bundled_jobscripts + long_jobs_to_submit

        submitted = []
        for job in to_submit:
            if ((len(submitted) + nactive + np.sum(resubmitted)) >= max_jobs) \
                    or ((tools.get_total_queue_usage() + len(submitted) + np.sum(resubmitted)) >= hard_job_limit):
                hit_queue_limit = True
                continue
            print(('Initial submission for job: ' + os.path.split(job)[-1]))
            tools.qsub(job)
            submitted.append(True)
    else:
        hit_queue_limit = True
        submitted = []

    number_resubmitted = np.sum(np.array(resubmitted + submitted))

    return int(number_resubmitted), int(len(completeness['Active'])), hit_queue_limit

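# A minimal driver sketch, assuming resub() is invoked periodically from a
# management loop (the interval and stopping rule here are hypothetical, not
# part of this module):
#
#     import time
#     while True:
#         number_resubmitted, number_active, hit_queue_limit = resub('in place')
#         if number_resubmitted == 0 and number_active == 0 and not hit_queue_limit:
#             break  # nothing left to run or recover
#         time.sleep(3600)  # check back in an hour
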