Example #1
0
def _handle_time_limits(self, calculation):
    """
    Resubmit a calculation that was stopped because it hit the time limit.

    The restart uses the remote folder of the failed calculation as the new
    ``parent_folder`` and drops ``fleurinpdata`` from the inputs. If the calculation
    that produced the ``parent_folder`` of ``calculation`` also ended with a
    time-limit exit status, no further restart is attempted and the workchain is
    marked as finished.

    :param calculation: the failed calculation node to inspect
    :return: ``ErrorHandlerReport(True, True)`` if ``calculation`` has a time-limit
             exit status, ``None`` otherwise
    """
    from aiida.common.exceptions import NotExistent

    time_limit_statuses = FleurProcess.get_exit_statuses(['ERROR_TIME_LIMIT'])
    if calculation.exit_status not in time_limit_statuses:
        return None

    # If the previous calculation failed for the same reason, do not restart.
    # The previous calculation is the creator of this calculation's parent folder;
    # the calculation's own remote folder must not be used for this lookup because
    # its creator is presumably the calculation that is being handled right now.
    try:
        parent_remote = calculation.get_incoming().get_node_by_label(
            'parent_folder')
    except NotExistent:
        # first run in the chain: there is no parent folder, hence nothing to compare
        parent_remote = None

    if parent_remote is not None:
        incoming_links = parent_remote.get_incoming().all()
        # each entry is a link record; the process node is stored in its ``node`` field
        if incoming_links and incoming_links[
                -1].node.exit_status in time_limit_statuses:
            self.ctx.is_finished = True
            return ErrorHandlerReport(True, True)

    self.report(
        'FleurCalculation failed due to time limits, I restart it from where it ended'
    )

    remote = calculation.get_outgoing().get_node_by_label('remote_folder')

    # however, if it is the first time, resubmit providing inp.xml and cdn from the remote folder
    self.ctx.is_finished = False
    self.ctx.inputs.parent_folder = remote
    if 'fleurinpdata' in self.ctx.inputs:
        del self.ctx.inputs.fleurinpdata

    return ErrorHandlerReport(True, True)
Example #2
0
    def check_failure(self):
        """
        Return an exit code if the SCF sub-workchain failed.

        :return: ``ERROR_NO_SCF_OUTPUT`` when no SCF result is stored in the context,
                 ``ERROR_VACUUM_SPILL_RELAX`` or ``ERROR_MT_RADII_RELAX`` when the last
                 FLEUR calculation of the SCF workchain ended with the matching exit
                 status, ``ERROR_SCF_FAILED`` for any other SCF failure and ``None``
                 when the SCF workchain finished successfully
        """
        try:
            scf_workchain = self.ctx.scf_res
        except AttributeError:
            self.control_end_wc(
                'ERROR: Something went wrong I do not have new atom positions calculation'
            )
            return self.exit_codes.ERROR_NO_SCF_OUTPUT

        if scf_workchain.is_finished_ok:
            return None

        calc_failed_status = FleurScfWorkChain.get_exit_statuses(
            ['ERROR_FLEUR_CALCULATION_FAILED'])[0]
        if scf_workchain.exit_status == calc_failed_status:
            # look at the last FLEUR calculation to find out why the SCF failed
            scf_parameters = scf_workchain.outputs.output_scf_wc_para.get_dict()
            fleur_calc = load_node(scf_parameters['last_calc_uuid'])
            calc_status = fleur_calc.exit_status

            # exit code label of the calculation -> message passed to control_end_wc;
            # the workchain exit code carries the same label
            relax_failures = (
                ('ERROR_VACUUM_SPILL_RELAX',
                 'ERROR: Failed due to atom and vacuum overlap'),
                ('ERROR_MT_RADII_RELAX', 'ERROR: Failed due to MT overlap'),
            )
            for label, message in relax_failures:
                if calc_status == FleurCalc.get_exit_statuses([label])[0]:
                    self.control_end_wc(message)
                    return getattr(self.exit_codes, label)

        return self.exit_codes.ERROR_SCF_FAILED
Example #3
0
def _handle_time_limits(self, calculation):
    """
    Restart a calculation that ran into the time limit, asking for more resources.

    The wallclock time and the number of machines are doubled (clamped to the queue
    maxima stored in the context) and, where it makes sense, the remote folder of the
    failed calculation is used as the new parent folder. If the calculation that
    created the parent folder already failed with a time-limit exit status, the
    workchain is marked as finished instead of restarting again.

    :param calculation: the failed calculation node to inspect
    :return: ``ErrorHandlerReport(True, True)`` if ``calculation`` has a time-limit
             exit status, ``None`` otherwise
    """
    from aiida.common.exceptions import NotExistent

    status = calculation.exit_status
    time_limit_statuses = FleurProcess.get_exit_statuses(['ERROR_TIME_LIMIT'])
    if status not in time_limit_statuses:
        return None

    # a second time-limit failure in a row means restarting does not help
    try:
        parent_remote = calculation.get_incoming().get_node_by_label(
            'parent_folder')
        last_link = parent_remote.get_incoming().all()[-1]
        failed_twice = last_link.node.exit_status in time_limit_statuses
    except NotExistent:
        failed_twice = False

    if failed_twice:
        self.ctx.is_finished = True
        return ErrorHandlerReport(True, True)

    self.report(
        'FleurCalculation failed due to time limits, I restart it from where it ended'
    )

    # double the wallclock time, but never exceed the queue maximum
    doubled_wallclock = self.ctx.inputs.metadata.options[
        'max_wallclock_seconds'] * 2
    self.ctx.inputs.metadata.options['max_wallclock_seconds'] = min(
        doubled_wallclock, self.ctx.max_queue_wallclock_sec)

    # double the number of nodes, but never exceed the queue maximum
    self.ctx.num_machines = min(self.ctx.num_machines * 2,
                                self.ctx.max_queue_nodes)

    remote = calculation.get_outgoing().get_node_by_label('remote_folder')

    # resubmit providing inp.xml and cdn from the remote folder
    self.ctx.is_finished = False

    has_fleurinp = 'fleurinpdata' in self.ctx.inputs
    if has_fleurinp:
        modes = self.ctx.inputs.fleurinpdata.get_fleur_modes()
        # in the modes listed here it makes no sense copying cdn.hdf
        skip_remote = any(modes[key]
                          for key in ('force_theorem', 'dos', 'band'))
    else:
        # it is harder to extract modes in this case - simply try to reuse cdn.hdf and hope it works
        skip_remote = False

    if not skip_remote:
        self.ctx.inputs.parent_folder = remote
        if has_fleurinp:
            del self.ctx.inputs.fleurinpdata

    return ErrorHandlerReport(True, True)
Example #4
0
def _handle_not_enough_memory(self, calculation):
    """
    Handle a calculation that failed due to lack of memory.

    If the workchain is allowed to optimise the parallelisation
    (``self.ctx.can_be_optimised``), the calculation is resubmitted with twice as many
    machines and half the suggested MPI/OMP ratio, and ``mixing_history*`` is added to
    the ``remove_from_remotecopy_list`` setting. Otherwise the workchain is finished
    with the ``ERROR_MEMORY_ISSUE_NO_SOLUTION`` exit code.

    Probably works for JURECA only, has to be tested for other systems.

    :param calculation: the failed calculation node to inspect
    :return: an ``ErrorHandlerReport`` if ``calculation`` has the not-enough-memory
             exit status, ``None`` otherwise
    """

    if calculation.exit_status in FleurProcess.get_exit_statuses(
        ['ERROR_NOT_ENOUGH_MEMORY']):
        if self.ctx.can_be_optimised:
            self.ctx.restart_calc = None
            self.ctx.is_finished = False
            self.report(
                'Calculation failed due to lack of memory, I resubmit it with twice larger'
                ' amount of computational nodes and smaller MPI/OMP ratio')
            self.ctx.num_machines = self.ctx.num_machines * 2
            self.ctx.suggest_mpi_omp_ratio = self.ctx.suggest_mpi_omp_ratio / 2
            self.check_kpts()

            if 'settings' not in self.ctx.inputs:
                self.ctx.inputs.settings = {}
            elif 'settings' in self.inputs:
                self.ctx.inputs.settings = self.inputs.settings.get_dict()
            # otherwise the settings in the context were not provided as a workchain
            # input (e.g. created by an earlier handler call): keep them, because
            # reading self.inputs.settings would raise AttributeError

            remove_list = self.ctx.inputs.settings.setdefault(
                'remove_from_remotecopy_list', [])
            if 'mixing_history*' not in remove_list:
                remove_list.append('mixing_history*')

            return ErrorHandlerReport(True, True)
        else:
            self.ctx.restart_calc = calculation
            self.ctx.is_finished = True
            # note the trailing space: adjacent string literals are joined verbatim
            self.report(
                'I am not allowed to optimize your settings. Consider providing at least '
                'num_machines and num_mpiprocs_per_machine')
            self.results()
            return ErrorHandlerReport(
                True, True, self.exit_codes.ERROR_MEMORY_ISSUE_NO_SOLUTION)
Example #5
0
def _handle_dirac_equation(self, calculation):
    """
    Sometimes a relaxation calculation fails with a Dirac equation problem, which is
    usually caused by problems with reusing the charge density. In this case we resubmit
    the calculation, dropping the input cdn.

    The parent folder is dropped only if the inputs also contain a ``fleurinpdata``
    holding ``relax.xml``; otherwise the workchain is finished with the
    ``ERROR_SOMETHING_WENT_WRONG`` exit code.

    :param calculation: the failed calculation node to inspect
    :return: an ``ErrorHandlerReport`` if ``calculation`` has the ``ERROR_DROP_CDN``
             exit status, ``None`` otherwise
    """

    if calculation.exit_status in FleurProcess.get_exit_statuses(
        ['ERROR_DROP_CDN']):

        # try to drop remote folder and see if it helps
        is_fleurinp_from_relax = (
            'fleurinpdata' in self.ctx.inputs
            and 'relax.xml' in self.ctx.inputs.fleurinpdata.files)

        if 'parent_folder' in self.ctx.inputs and is_fleurinp_from_relax:
            del self.ctx.inputs.parent_folder
            self.ctx.restart_calc = None
            self.ctx.is_finished = False
            # note the trailing space: adjacent string literals are joined verbatim
            self.report(
                'Calculation seems to fail due to corrupted charge density (can happen '
                'during relaxation). I drop cdn from previous step')
            return ErrorHandlerReport(True, True)

        self.ctx.restart_calc = calculation
        self.ctx.is_finished = True
        self.report(
            'Can not drop charge density. If I drop the remote folder, there will be '
            'no inp.xml')
        self.results()
        return ErrorHandlerReport(True, True,
                                  self.exit_codes.ERROR_SOMETHING_WENT_WRONG)
Example #6
0
def _handle_mt_relax_error(self, calculation):
    """
    Finish the workchain when the calculation failed with ``ERROR_MT_RADII_RELAX``.

    No restart is attempted here; the workchain is finished with the
    ``ERROR_MT_RADII_RELAX`` exit code.

    :param calculation: the failed calculation node to inspect
    :return: an ``ErrorHandlerReport`` if ``calculation`` has the
             ``ERROR_MT_RADII_RELAX`` exit status, ``None`` otherwise
    """
    status = calculation.exit_status
    mt_overlap_statuses = FleurProcess.get_exit_statuses(
        ['ERROR_MT_RADII_RELAX'])
    if status not in mt_overlap_statuses:
        return None

    self.ctx.restart_calc = calculation
    self.ctx.is_finished = True
    self.report(
        'FLEUR calculation failed due to MT overlap. Can be fixed via RelaxBaseWorkChain'
    )
    self.results()

    exit_code = self.exit_codes.ERROR_MT_RADII_RELAX
    return ErrorHandlerReport(True, True, exit_code)
Example #7
0
def _handle_vacuum_spill_error(self, calculation):
    """
    Finish the workchain when the calculation failed with ``ERROR_VACUUM_SPILL_RELAX``,
    i.e. an atom spilled to the vacuum during relaxation.

    No restart is attempted here; the workchain is finished with the
    ``ERROR_VACUUM_SPILL_RELAX`` exit code.

    :param calculation: the failed calculation node to inspect
    :return: an ``ErrorHandlerReport`` if ``calculation`` has the
             ``ERROR_VACUUM_SPILL_RELAX`` exit status, ``None`` otherwise
    """
    if calculation.exit_status in FleurProcess.get_exit_statuses(
        ['ERROR_VACUUM_SPILL_RELAX']):
        self.ctx.restart_calc = calculation
        self.ctx.is_finished = True
        # note the trailing space: adjacent string literals are joined verbatim
        self.report(
            'FLEUR calculation failed because an atom spilled to the vacuum during '
            'relaxation. Can be fixed via RelaxBaseWorkChain.')
        self.results()
        return ErrorHandlerReport(True, True,
                                  self.exit_codes.ERROR_VACUUM_SPILL_RELAX)
Example #8
0
def _handle_invalid_elements_mmpmat(self, calculation):
    """
    Calculation failed due to invalid elements in the LDA+U density matrix.
    The mixing history is reset by adding ``mixing_history*`` to the
    ``remove_from_remotecopy_list`` setting before the calculation is restarted.

    TODO: decide how many consecutive errors of this kind should be tolerated;
    currently the calculation is restarted every time.

    :param calculation: the failed calculation node to inspect
    :return: ``ErrorHandlerReport(True, True)`` if ``calculation`` has the
             ``ERROR_INVALID_ELEMENTS_MMPMAT`` exit status, ``None`` otherwise
    """
    if calculation.exit_status in FleurProcess.get_exit_statuses(
        ['ERROR_INVALID_ELEMENTS_MMPMAT']):
        self.ctx.restart_calc = None
        self.ctx.is_finished = False
        self.report(
            'FLEUR calculation failed due to invalid elements in mmpmat. Resetting mixing_history'
        )

        if 'settings' not in self.ctx.inputs:
            self.ctx.inputs.settings = {}
        elif 'settings' in self.inputs:
            self.ctx.inputs.settings = self.inputs.settings.get_dict()
        # otherwise the settings in the context were not provided as a workchain
        # input (e.g. created by an earlier handler call): keep them, because
        # reading self.inputs.settings would raise AttributeError

        remove_list = self.ctx.inputs.settings.setdefault(
            'remove_from_remotecopy_list', [])
        if 'mixing_history*' not in remove_list:
            remove_list.append('mixing_history*')
        return ErrorHandlerReport(True, True)
Example #9
0
def _handle_general_error(self, calculation):
    """
    Finish the workchain for failures that can not be resolved automatically.

    :param calculation: the failed calculation node to inspect
    :return: an ``ErrorHandlerReport`` carrying ``ERROR_SOMETHING_WENT_WRONG``
    :raises ValueError: if the exit status of ``calculation`` is not one of the
                        exit codes registered in this handler
    """
    status = calculation.exit_status
    unrecoverable_statuses = FleurProcess.get_exit_statuses([
        'ERROR_FLEUR_CALC_FAILED',
        'ERROR_MT_RADII',
        'ERROR_NO_RETRIEVED_FOLDER',
        'ERROR_OPENING_OUTPUTS',
        'ERROR_NO_OUTXML',
        'ERROR_XMLOUT_PARSING_FAILED',
        'ERROR_RELAX_PARSING_FAILED',
    ])

    if status not in unrecoverable_statuses:
        raise ValueError(
            'Calculation failed for unknown reason, please register the '
            'corresponding exit code in this error handler')

    self.ctx.restart_calc = calculation
    self.ctx.is_finished = True
    self.report(
        'Calculation failed for a reason that can not be resolved automatically'
    )
    self.results()

    exit_code = self.exit_codes.ERROR_SOMETHING_WENT_WRONG
    return ErrorHandlerReport(True, True, exit_code)