def resubmit_unconverged_geometry(self, calc):
    """Resubmit a calculation if it is not converged, but can be recovered.

    Inspect the retrieved CP2K output file: if the run terminated prematurely
    (normal-end marker absent, or the walltime was exceeded) and at least one
    geometry-optimisation step completed, restart from the remote folder after
    making sure the restart keywords are present in the input parameters.
    If not even one step completed, signal the base work chain to stop.

    :param calc: the finished `CalcJob` node to inspect.
    :returns: a `ProcessHandlerReport` when action was taken, ``None`` when
        the geometry is considered converged.
    """
    self.report("Checking the geometry convergence.")
    content_string = calc.outputs.retrieved.get_object_content(calc.get_attribute('output_filename'))

    time_not_exceeded = "PROGRAM ENDED AT"
    time_exceeded = "exceeded requested execution time"
    one_step_done = "Max. gradient ="

    # Restart from the remote folder of the failed calculation.
    self.ctx.inputs.parent_calc_folder = calc.outputs.remote_folder
    params = self.ctx.inputs.parameters

    # True when CP2K did not end normally or ran out of walltime (shared by both branches below).
    terminated_prematurely = time_not_exceeded not in content_string or time_exceeded in content_string

    # If the problem is recoverable then do restart
    if terminated_prematurely and one_step_done in content_string:
        try:
            # First check if all the restart keys are present in the input dictionary
            wf_rest_fname_pointer = params['FORCE_EVAL']['DFT']['RESTART_FILE_NAME']
            scf_guess_pointer = params['FORCE_EVAL']['DFT']['SCF']['SCF_GUESS']
            restart_fname_pointer = params['EXT_RESTART']['RESTART_FILE_NAME']
            # Also check if they all have the right value
            if not (wf_rest_fname_pointer == './parent_calc/aiida-RESTART.wfn' and
                    scf_guess_pointer == 'RESTART' and
                    restart_fname_pointer == './parent_calc/aiida-1.restart'):
                # If some values are incorrect add them to the input dictionary
                params = add_restart_sections(params)
        # If not all the restart keys are present, adding them to the input dictionary
        except (AttributeError, KeyError):
            params = add_restart_sections(params)

        # Store the parameters (new or old ones) that for sure include the necessary
        # restart key-value pairs. Might be able to solve the problem.
        self.ctx.inputs.parameters = params
        self.report("The CP2K calculation wasn't completed. The restart of the calculation might be able to "
                    "fix the problem.")
        return ProcessHandlerReport(False)

    # If the problem is not recoverable
    if terminated_prematurely and one_step_done not in content_string:
        self.report("It seems that the restart of CP2K calculation wouldn't be able to fix the problem as the "
                    "geometry optimization couldn't complete a single step. Sending a signal to stop the Base "
                    "work chain.")
        # Signaling to the base work chain that the problem could not be recovered.
        return ProcessHandlerReport(True, ExitCode(1))

    self.report("The geometry seem to be converged.")
    # If everything is alright
    return None
def handle_not_converged(self, calculation):
    """Lower threshold and restart calculation that finished ok, but did not reach convergence because of min threshold parameters.

    :param calculation: the finished calculation whose `output_parameters`
        carry the convergence report.
    :returns: a `ProcessHandlerReport` when the convergence tests failed,
        ``None`` otherwise.
    """
    param = calculation.outputs.output_parameters
    # NOTE(review): `is_converged` is set False unconditionally and never reset to True
    # in this handler, even when `Tests_passed` is True — confirm this is intended.
    self.ctx.is_converged = False

    if not param['Tests_passed']:
        report = param['convergence_report']

        # Convergence across a line failed: unrecoverable with the current settings.
        if len(report['PosCheck']['FAILED']):
            return ProcessHandlerReport(
                True, self.exit_codes.ERROR_POS_TOL_CONVERGENCE_FAILED)

        gcheck = len(report['GapCheck']['FAILED'])
        mcheck = len(report['MoveCheck']['FAILED'])
        if mcheck or gcheck:
            # Shrink the minimum neighbour distance and retry, unless already at the floor.
            self.ctx.current_MND /= self.ctx.MND_scale_factor
            if self.ctx.current_MND < self.ctx.MND_threshold:
                self.report_error_handled(
                    calculation,
                    'Convergence between lines failed. `min_neighbour_dist` already at minimum value.'
                )
                # Pick the exit code matching which check(s) failed.
                if gcheck:
                    if mcheck:
                        error = self.exit_codes.ERROR_MOVE_GAP_TOL_CONVERGENCE_FAILED
                    else:
                        error = self.exit_codes.ERROR_GAP_TOL_CONVERGENCE_FAILED
                else:
                    error = self.exit_codes.ERROR_MOVE_TOL_CONVERGENCE_FAILED
                return ProcessHandlerReport(True, error)

            self.report_error_handled(
                calculation,
                'Convergence between lines failed. Reducing `min_neighbour_dist` and rerunning calculation.'
            )
            return ProcessHandlerReport(True)
def sanity_check_insufficient_bands(self, calculation):
    """Perform a sanity check on the band occupations of a successfully converged calculation.

    Verify that the occupation of the last band is below a certain threshold, unless `occupations` was explicitly
    set to `fixed` in the input parameters. If this is violated, the calculation used too few bands and cannot be
    trusted. The number of bands is increased and the calculation is restarted, starting from the last.
    """
    from aiida_quantumespresso.utils.bands import get_highest_occupied_band

    # Only skip the check on the highest band occupation if `occupations` was explicitly set to `fixed`.
    if calculation.inputs.parameters.get_attribute('SYSTEM', {}).get('occupations', None) == 'fixed':
        return

    try:
        bands = calculation.outputs.output_band
    except AttributeError:
        # The calculation produced no `output_band` output. Previously this attribute access
        # raised out of the handler (only `ValueError` was caught); there is nothing to
        # check, so skip the sanity check instead of crashing.
        return

    try:
        get_highest_occupied_band(bands)
    except ValueError as exception:
        args = [self._process_class.__name__, calculation.pk]
        self.report('{}<{}> run with smearing and highest band is occupied'.format(*args))
        self.report('BandsData<{}> has invalid occupations: {}'.format(bands.pk, exception))
        self.report('{}<{}> had insufficient bands'.format(calculation.process_label, calculation.pk))

        # Grow the band count by a relative factor, but at least by the minimum increment.
        nbnd_cur = calculation.outputs.output_parameters.get_dict()['number_of_bands']
        nbnd_new = nbnd_cur + max(int(nbnd_cur * self.defaults.delta_factor_nbnd), self.defaults.delta_minimum_nbnd)
        self.ctx.inputs.parameters.setdefault('SYSTEM', {})['nbnd'] = nbnd_new

        self.report('Action taken: increased number of bands to {} and restarting from scratch'.format(nbnd_new))
        return ProcessHandlerReport(True)
def handle_failed(self, calculation):
    """Handle calculation that did not produce an output.

    If the `output_parameters` output is missing the failure is considered
    unrecoverable; otherwise no action is taken (returns ``None``).
    """
    try:
        calculation.outputs.output_parameters
    except Exception:  # narrowed from a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
        return ProcessHandlerReport(
            True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE)
def handle_unrecoverable_failure(self, calculation):
    """Abort the work chain for calculations that failed with an exit status below 400 (unrecoverable)."""
    if not (calculation.is_failed and calculation.exit_status < 400):
        return None
    self.report_error_handled(calculation, 'unrecoverable error, aborting...')
    return ProcessHandlerReport(
        True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE)
def resubmit_random_gpu_error(self, calc):
    """Inspect the scheduler stdout for a spurious GPU OpKernel error and decide how to proceed.

    :param calc: the finished calculation whose retrieved scheduler stdout is inspected.
    :returns: `ProcessHandlerReport(False, ExitCode(500))` on the known GPU error (retry),
        `ProcessHandlerReport(True, ExitCode(1))` when the run neither hit the GPU error
        nor completed (wall-time marker absent), ``None`` otherwise.
    """
    content_string = calc.outputs.retrieved.get_object_content(
        calc.get_attribute('scheduler_stdout'))
    gpu_error = "Invalid argument: No OpKernel was registered to support " \
        "Op 'DescrptSeA' with these attrs."
    time_exceeded = "Total wall time:"

    if gpu_error in content_string:
        self.report("Inspect GPU Error")
        return ProcessHandlerReport(False, ExitCode(500))

    # At this point the GPU error is known to be absent; the first sub-condition is kept
    # from the original for clarity of intent.
    if (gpu_error not in content_string and time_exceeded not in content_string):
        # Typo fixed: the message previously read "moodel deviation".
        self.report("Something wrong during model deviation")
        return ProcessHandlerReport(True, ExitCode(1))

    return None
def handle_misc_failure(self, node):
    """Do not restart on unhandled errors.

    The BaseRestartWorkChain restarts any unhandled error once by default;
    this handler disables that feature for the out-of-time exit code.
    """
    exit_code = self.exit_codes.ERROR_UNRECOVERABLE_TERMINATION  # pylint: disable=no-member
    return ProcessHandlerReport(False, exit_code)
def _handle_electronic_convergence_not_achieved(self, calculation):
    """In the case of `UNCONVERGED_SCF`, decrease the function mixing and restart from the last recorded configuration."""
    scf_params = self.ctx.inputs.parameters["scf"]
    old_fmixing = scf_params.get("numerical", {}).get("FMIXING", self.defaults.fmixing)
    new_fmixing = int(old_fmixing * self.defaults.delta_factor_fmixing)

    self.ctx.restart_calc = calculation
    self.ctx.use_fort9_restart = True
    scf_params.setdefault("numerical", {})["FMIXING"] = new_fmixing

    self.report_error_handled(
        calculation,
        "reduced fmixing from {} to {} and restarting from last calculation".format(old_fmixing, new_fmixing),
    )
    return ProcessHandlerReport(True)
def handle_known_unrecoverable_failure(self, calculation):
    """Abort on exit statuses that correspond to known, unrecoverable failure modes.

    These failures may always be unrecoverable or at some point a handler may be devised.
    """
    message = 'known unrecoverable failure detected, aborting...'
    self.report_error_handled(calculation, message)
    return ProcessHandlerReport(True, self.exit_codes.ERROR_KNOWN_UNRECOVERABLE_FAILURE)
def _handle_out_of_walltime(self, calculation):
    """In the case of `ERROR_OUT_OF_WALLTIME`, restart from the last recorded configuration."""
    if self.ctx.is_optimisation:
        # An optimisation can resume from the last recorded configuration.
        self.ctx.restart_calc = calculation
        self.ctx.use_fort9_restart = False  # the fort.9 is wiped in-between SCF
        self.report_error_handled(
            calculation, "simply restart from the last calculation"
        )
        return ProcessHandlerReport(True)

    # A plain SCF run cannot currently be resumed after being killed.
    self.report_error_handled(
        calculation,
        "there is currently no restart facility for a killed scf calculation",
    )
    return ProcessHandlerReport(
        True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE
    )
def _handle_unrecoverable_failure(self, calculation):
    """Calculations with an exit status below 400 are unrecoverable, so abort the work chain."""
    if calculation.is_finished_ok or calculation.exit_status >= 400:
        return None
    self.report_error_handled(calculation, "unrecoverable error, aborting...")
    return ProcessHandlerReport(
        True, self.exit_codes.ERROR_UNRECOVERABLE_FAILURE
    )
def handle_no_save_file(self, calculation):
    """Try to relaunch calculation that did not produce a save file once. Exit if it fails twice."""
    if 'restart_no_save' in self.ctx:
        # Second occurrence: give up.
        self.report_error_handled(
            calculation,
            self.exit_codes.ERROR_FAILED_SAVEFILE_TWICE.message + ' Aborting...')
        return ProcessHandlerReport(
            True, self.exit_codes.ERROR_FAILED_SAVEFILE_TWICE)

    # First occurrence: remember it and retry from scratch.
    self.ctx.restart_no_save = True
    self.ctx.inputs.z2pack_settings['restart_mode'] = False
    self.report_error_handled(
        calculation,
        'The calculation died before the savefile for a restart was produced, trying to restart it from scratch.'
    )
    return ProcessHandlerReport(True)
def sanity_check_insufficient_bands(self, calculation):
    """Perform a sanity check on the band occupations of a successfully converged calculation.

    Verify that the occupation of the last band is below a certain threshold, unless `occupations` was explicitly
    set to `fixed` in the input parameters. If this is violated, the calculation used too few bands and cannot be
    trusted. The number of bands is increased and the calculation is restarted, starting from the last.
    """
    from aiida_quantumespresso.utils.bands import get_highest_occupied_band

    occupations = calculation.inputs.parameters.get_attribute('SYSTEM', {}).get('occupations', None)

    if occupations is None:
        self.report(
            '`SYSTEM.occupations` parameter is not defined: performing band occupation check. '
            'If you want to disable this, explicitly set `SYSTEM.occupations` to `fixed`.'
        )

    # Only skip the check on the highest band occupation if `occupations` was explicitly set to `fixed`.
    if occupations == 'fixed':
        return

    try:
        bands = calculation.outputs.output_band
    except AttributeError:
        self.report(
            f'{self.ctx.process_name}<{calculation.pk}> does not have `output_band` output, skipping sanity check.'
        )
        return

    try:
        get_highest_occupied_band(bands)
    except ValueError as exception:
        self.report(f'{self.ctx.process_name}<{calculation.pk}> run with smearing and highest band is occupied')
        self.report(f'BandsData<{bands.pk}> has invalid occupations: {exception}')
        self.report(f'{calculation.process_label}<{calculation.pk}> had insufficient bands')

        # Increase the band count by a relative factor, bounded below by the minimum increment.
        nbnd_cur = calculation.outputs.output_parameters.get_dict()['number_of_bands']
        nbnd_new = nbnd_cur + max(int(nbnd_cur * self.defaults.delta_factor_nbnd), self.defaults.delta_minimum_nbnd)
        self.ctx.inputs.parameters.setdefault('SYSTEM', {})['nbnd'] = nbnd_new

        self.report(f'Action taken: increased number of bands to {nbnd_new} and restarting from scratch')
        return ProcessHandlerReport(True)
def _handle_geometric_convergence_not_achieved(self, calculation):
    """In the case of `UNCONVERGED_GEOMETRY`, restart from the last recorded configuration."""
    # Resume from the failed calculation, reusing its fort.9 wavefunction file.
    self.ctx.restart_calc = calculation
    self.ctx.use_fort9_restart = True
    self.report_error_handled(calculation, "simply restart from the last calculation")
    return ProcessHandlerReport(True)
def handle_relax_recoverable_ionic_convergence_error(self, calculation):
    """Handle various exit codes for recoverable `relax` calculations with failed ionic convergence.

    These exit codes signify that the ionic convergence thresholds were not met, but the output structure is
    usable, so the solution is to simply restart from scratch but from the output structure.
    """
    self.ctx.restart_calc = None
    self.ctx.inputs.structure = calculation.outputs.output_structure
    self.report_error_handled(
        calculation,
        'no ionic convergence but clean shutdown: restarting from scratch but using output structure.'
    )
    return ProcessHandlerReport(True)
def handle_vcrelax_converged_except_final_scf(self, calculation):
    """Handle `ERROR_IONIC_CONVERGENCE_REACHED_EXCEPT_IN_FINAL_SCF` exit code.

    Convergence reached in `vc-relax` except thresholds exceeded in final scf: consider as converged.
    """
    self.ctx.is_finished = True
    self.ctx.restart_calc = calculation
    self.report_error_handled(
        calculation,
        'ionic convergence thresholds met except in final scf: consider structure relaxed.'
    )
    # Call the results method to attach the output nodes before returning the exit code.
    self.results()
    return ProcessHandlerReport(True, self.exit_codes.ERROR_IONIC_CONVERGENCE_REACHED_EXCEPT_IN_FINAL_SCF)
def handle_convergence_not_achieved(self, node):
    """Handle `ERROR_CONVERGENCE_NOT_REACHED` exit code: decrease the mixing beta and restart from scratch."""
    old_alpha_mix = self.ctx.inputs.parameters.get('INPUTPH', {}).get('alpha_mix(1)', self.defaults.alpha_mix)
    new_alpha_mix = old_alpha_mix * self.defaults.delta_factor_alpha_mix

    self.ctx.restart_calc = node
    self.ctx.inputs.parameters.setdefault('INPUTPH', {})['alpha_mix(1)'] = new_alpha_mix

    self.report_error_handled(
        node, 'reduced alpha_mix from {} to {} and restarting'.format(old_alpha_mix, new_alpha_mix))
    return ProcessHandlerReport(True)
def handle_out_of_walltime(self, calculation):
    """Handle `ERROR_OUT_OF_WALLTIME` exit code: calculation shut down neatly and we can simply restart."""
    try:
        structure = calculation.outputs.output_structure
    except exceptions.NotExistent:
        # No output structure was produced: resume from the last calculation.
        self.ctx.restart_calc = calculation
        self.report_error_handled(calculation, 'simply restart from the last calculation')
    else:
        # The structure changed: restart from scratch using the new structure.
        self.ctx.inputs.structure = structure
        self.ctx.restart_calc = None
        self.report_error_handled(calculation, 'out of walltime: structure changed so restarting from scratch')
    return ProcessHandlerReport(True)
def handle_error_bands(self, node):  # pylint: disable = unused-argument
    """Expose the outputs produced before the bands-parsing failure and stop the work chain.

    If an error occurs in the parsing of bands in the SiestaCalculation (``node`` here), all the output
    ports that the node produced are exposed (SiestaCalculation is designed to produce the
    output_parameters and stress/forces ports before the check on the bands outputs) and then the
    workchain is stopped with a specific error code. The "retrieved" output port is excluded as it
    refers only to the underlying calculation, not to the WorkChain itself.
    """
    for port_name in node.outputs:
        if port_name == "retrieved":
            continue
        self.out(port_name, node.get_outgoing(link_label_filter=port_name).one().node)
    return ProcessHandlerReport(True, self.exit_codes.ERROR_BANDS_PARSING)
def handle_electronic_convergence_not_achieved(self, calculation):
    """Handle `ERROR_ELECTRONIC_CONVERGENCE_NOT_REACHED`: decrease the mixing beta and restart from scratch."""
    old_beta = self.ctx.inputs.parameters.get('ELECTRONS', {}).get(
        'mixing_beta', self.defaults.qe.mixing_beta)
    new_beta = old_beta * self.defaults.delta_factor_mixing_beta

    self.ctx.restart_calc = None
    self.ctx.inputs.parameters.setdefault('ELECTRONS', {})['mixing_beta'] = new_beta

    self.report_error_handled(
        calculation,
        f'reduced beta mixing from {old_beta} to {new_beta} and restarting from the last calculation')
    return ProcessHandlerReport(True)
def handle_scf_failure(self, node):
    """Retry a failed SCF: first with ``scf=(yqc)``, and if that fails too, with ``scf=(xqc)``."""
    params = dict(self.ctx.inputs.parameters)
    route_params = params['route_parameters']
    scf_options = route_params.setdefault('scf', {})

    if 'xqc' in scf_options:
        # Both YQC and XQC have already been tried.
        self.report("SCF failed with YQC and XQC, giving up...")
        return ProcessHandlerReport(
            True, self.exit_codes.ERROR_UNRECOVERABLE_SCF_FAILURE)  # pylint: disable=no-member

    replacement = {}
    # Keep the user-set convergence criterion; replace the rest.
    if 'conver' in scf_options:
        replacement['conver'] = scf_options['conver']

    if 'yqc' in scf_options:
        self.report("SCF=(YQC) failed, retrying with SCF=(XQC)")
        replacement['xqc'] = None
    else:
        self.report("SCF failed, retrying with SCF=(YQC)")
        replacement['yqc'] = None

    # Update the params Dict
    route_params['scf'] = replacement
    self.ctx.inputs.parameters = Dict(dict=params)
    return ProcessHandlerReport(True)
def handle_error_geom_not_conv(self, node):
    """Restart from the previous calculation when, at the end of the scf cycle, geometry convergence was not reached."""
    self.report(
        f'SiestaCalculation<{node.pk}> did not reach geometry convergence')

    # Pass the output geometry of the old calculation on to the new one.
    if node.outputs.output_parameters.attributes["variable_geometry"]:
        self.ctx.inputs['structure'] = node.outputs.output_structure

    # The presence of `parent_calc_folder` triggers the real restart, so we add it.
    self.ctx.inputs['parent_calc_folder'] = node.outputs.remote_folder
    return ProcessHandlerReport(do_break=True)
def handle_relax_recoverable_electronic_convergence_error(self, calculation):
    """Handle various exit codes for recoverable `relax` calculations with failed electronic convergence.

    These exit codes signify that the electronic convergence thresholds were not met, but the output structure is
    usable, so the solution is to simply restart from scratch but from the output structure.
    """
    old_beta = self.ctx.inputs.parameters.get('ELECTRONS', {}).get('mixing_beta', self.defaults.qe.mixing_beta)
    new_beta = old_beta * self.defaults.delta_factor_mixing_beta

    self.ctx.restart_calc = None
    self.ctx.inputs.parameters.setdefault('ELECTRONS', {})['mixing_beta'] = new_beta
    self.ctx.inputs.structure = calculation.outputs.output_structure

    self.report_error_handled(
        calculation,
        'no electronic convergence but clean shutdown: reduced beta mixing from {} to {} restarting from '
        'scratch but using output structure.'.format(old_beta, new_beta))
    return ProcessHandlerReport(True)
def handle_error_scf_not_conv(self, node):
    """Restart from the previous calculation, without changing any input parameters, when scf convergence failed."""
    self.report(
        f'SiestaCalculation<{node.pk}> did not achieve scf convergence.')

    # Pass the output geometry of the old calculation on to the new one.
    if node.outputs.output_parameters.attributes["variable_geometry"]:
        self.ctx.inputs['structure'] = node.outputs.output_structure

    # The presence of `parent_calc_folder` triggers the real restart, so we add it.
    self.ctx.inputs['parent_calc_folder'] = node.outputs.remote_folder

    # Should we also increase the number of scf max iterations?
    return ProcessHandlerReport(do_break=True)
def handle_error_split_norm(self, node):
    """Relaunch with a corrected split_norm when the original value was too small.

    The minimum allowed split_norm is stored in the logs of the old calculation. This error
    happens only at the beginning of the run, therefore no real restart is needed — just a
    new calculation with a new split_norm.
    """
    self.report(
        f'SiestaCalculation<{node.pk}> crashed with split_norm issue.')

    # Retrieve the minimum split norm from the logs of the failed calculation.
    # NOTE(review): if no matching log line exists, `new_split_norm` stays unbound and the
    # assignment below would raise NameError — same as the original behaviour.
    for log in orm.Log.objects.get_logs_for(node):
        if "Error in split_norm option" in log.message:
            new_split_norm = float(log.message.split()[-1]) + 0.001

    # We want to understand whether "pao-split-norm" is present in the input and:
    # 1) if present, change its value to the minimum allowed
    # 2) if not present, activate pao-SplitTailNorm
    # Since we don't know in which syntax the user passed "pao-split-norm" (every fdf variant
    # is allowed), we translate the original dict to an FDFDict that is aware of equivalent keywords.
    translated_basis = FDFDict(self.ctx.inputs["basis"].get_dict())
    split_norm_is_global = any(key == "paosplitnorm" for key in translated_basis)

    if split_norm_is_global:
        self.report('Resetting the pao-split-norm global value')
        translated_basis["pao-split-norm"] = new_split_norm
    else:
        self.report(
            'Adding pao-SplitTailNorm to solve the split_norm problem')
        translated_basis["pao-SplitTailNorm"] = True

    self.ctx.inputs["basis"] = orm.Dict(dict=translated_basis)
    return ProcessHandlerReport(do_break=True)
def error_negative_sum(self, node):
    """Recover from a negative sum by replacing both inputs with their absolute values."""
    abs_x = abs(node.inputs.x.value)
    abs_y = abs(node.inputs.y.value)
    self.ctx.inputs.x = Int(abs_x)
    self.ctx.inputs.y = Int(abs_y)
    return ProcessHandlerReport(True)
def disabled_handler(self, node):
    """Handler that is disabled by default and so should never fire, whatever the sub process exit code."""
    doom = self.exit_codes.ERROR_ENABLED_DOOM
    return ProcessHandlerReport(True, doom)
def sanity_check_not_too_big(self, node):
    """My puny brain cannot deal with numbers that I cannot count on my hand."""
    if not node.is_finished_ok:
        return None
    if node.outputs.sum > 10:
        return ProcessHandlerReport(True, self.exit_codes.ERROR_TOO_BIG)
    return None
def handle_out_of_walltime(self, node):
    """Handle `ERROR_OUT_OF_WALLTIME` exit code: calculation shut down neatly and we can simply restart."""
    self.ctx.restart_calc = node
    message = 'simply restart from the last calculation'
    self.report_error_handled(node, message)
    return ProcessHandlerReport(True)
def disabled_handler(self, node):  # pylint: disable=unused-argument
    """Handler that is disabled by default and so should never fire, whatever the sub process exit code."""
    doom = self.exit_codes.ERROR_ENABLED_DOOM  # pylint: disable=no-member
    return ProcessHandlerReport(True, doom)