Beispiel #1
0
def task_upload_job(process, transport_queue, cancellable):
    """Transport task that will attempt to upload the files of a job calculation to the remote.

    The task will first request a transport from the queue. Once the transport is yielded, the calculation is prepared
    in a sandbox folder through `process.presubmit` and uploaded with `execmanager.upload_calculation`. The operation
    is wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will retry after an
    interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param process: the job calculation process; its `node` attribute is the node that represents the calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the tasks was successfully completed
    :raises: PreSubmitException if the `presubmit` call of the process raised an exception
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    node = process.node

    # The node is already past the upload step, so there is nothing left to do for this task.
    if node.get_state() == CalcJobState.SUBMITTING:
        logger.warning('CalcJob<{}> already marked as SUBMITTING, skipping task_upload_job'.format(node.pk))
        raise Return

    initial_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    max_attempts = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_upload():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)

            with SandboxFolder() as folder:
                # Any exception thrown in `presubmit` call is not transient so we circumvent the exponential backoff
                try:
                    calc_info = process.presubmit(folder)
                except Exception as exception:  # pylint: disable=broad-except
                    raise PreSubmitException('exception occurred in presubmit call') from exception
                else:
                    execmanager.upload_calculation(node, transport, calc_info, folder)

            raise Return

    try:
        logger.info('scheduled request to upload CalcJob<{}>'.format(node.pk))
        # These exception types are handed to the retry helper as `ignore_exceptions` and handled explicitly below.
        ignore_exceptions = (plumpy.CancelledError, PreSubmitException)
        result = yield exponential_backoff_retry(
            do_upload, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
        )
    except PreSubmitException:
        raise
    except plumpy.CancelledError:
        # Cancellation is deliberately not propagated: the task ends without a result and the node state is untouched.
        pass
    except Exception as exception:
        logger.warning('uploading CalcJob<{}> failed'.format(node.pk))
        # Chain the original error so the underlying cause remains visible in the traceback.
        raise TransportTaskException(
            'upload_calculation failed {} times consecutively'.format(max_attempts)
        ) from exception
    else:
        logger.info('uploading CalcJob<{}> successful'.format(node.pk))
        node.set_state(CalcJobState.SUBMITTING)
        raise Return(result)
Beispiel #2
0
def task_update_job(node, job_manager, cancellable):
    """Transport task that refreshes the scheduler status of a job calculation.

    A job info update is requested from the job manager and awaited through the cancellable. The request is wrapped in
    the exponential_backoff_retry coroutine, which, in case of a caught exception, will retry after an interval that
    increases exponentially with the number of retries, for a maximum number of retries. If all retries fail, the task
    will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :type node: :class:`aiida.orm.nodes.process.calculation.calcjob.CalcJobNode`
    :param job_manager: The job manager
    :type job_manager: :class:`aiida.engine.processes.calcjobs.manager.JobManager`
    :param cancellable: A cancel flag
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return containing True if the job is done (or the node is already RETRIEVING), False otherwise
    :raises: TransportTaskException if after the maximum number of retries the update still excepted
    """
    # Nothing to update once the node has already moved on to retrieval.
    if node.get_state() == CalcJobState.RETRIEVING:
        logger.warning('CalcJob<{}> already marked as RETRIEVING, skipping task_update_job'.format(node.pk))
        raise Return(True)

    retry_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    attempts_limit = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)
    job_id = node.get_job_id()

    @coroutine
    def do_update():
        # Ask the job manager for fresh job info and wait for it, honouring interruptions.
        with job_manager.request_job_info_update(authinfo, job_id) as update_request:
            info = yield cancellable.with_interrupt(update_request)

        if info is not None:
            node.set_last_job_info(info)
            node.set_scheduler_state(info.job_state)
            raise Return(info.job_state == JobState.DONE)

        # If the job is computed or not found assume it's done
        node.set_scheduler_state(JobState.DONE)
        raise Return(True)

    try:
        logger.info('scheduled request to update CalcJob<{}>'.format(node.pk))
        finished = yield exponential_backoff_retry(
            do_update, retry_interval, attempts_limit, logger=node.logger, ignore_exceptions=plumpy.Interruption
        )
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('updating CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('update_calculation failed {} times consecutively'.format(attempts_limit))

    # Every handler above raises, so this point is only reached when the update succeeded.
    logger.info('updating CalcJob<{}> successful'.format(node.pk))
    if finished:
        node.set_state(CalcJobState.RETRIEVING)

    raise Return(finished)
Beispiel #3
0
def task_retrieve_job(node, transport_queue, retrieved_temporary_folder, cancellable):
    """Transport task that will attempt to retrieve all files of a completed job calculation.

    A transport is requested from the queue; once it is available the detailed job info is recorded on the node and
    `execmanager.retrieve_calculation` is called. The operation is wrapped in the exponential_backoff_retry coroutine,
    which, in case of a caught exception, will retry after an interval that increases exponentially with the number of
    retries, for a maximum number of retries. If all retries fail, the task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param retrieved_temporary_folder: passed through to `execmanager.retrieve_calculation`
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the tasks was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    # The node already reached the parsing step: retrieval must not run again.
    if node.get_state() == CalcJobState.PARSING:
        logger.warning('CalcJob<{}> already marked as PARSING, skipping task_retrieve_job'.format(node.pk))
        raise Return

    retry_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    attempts_limit = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_retrieve():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)

            scheduler = node.computer.get_scheduler()
            scheduler.set_transport(transport)

            # Perform the job accounting and set it on the node if successful. If the scheduler does not implement this
            # still set the attribute but set it to `None`. This way we can distinguish calculation jobs for which the
            # accounting was called but could not be set.
            try:
                accounting = scheduler.get_detailed_job_info(node.get_job_id())
            except FeatureNotAvailable:
                logger.info('detailed job info not available for scheduler of CalcJob<{}>'.format(node.pk))
                accounting = None

            node.set_detailed_job_info(accounting)

            raise Return(execmanager.retrieve_calculation(node, transport, retrieved_temporary_folder))

    try:
        logger.info('scheduled request to retrieve CalcJob<{}>'.format(node.pk))
        yield exponential_backoff_retry(
            do_retrieve, retry_interval, attempts_limit, logger=node.logger, ignore_exceptions=plumpy.Interruption
        )
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('retrieving CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('retrieve_calculation failed {} times consecutively'.format(attempts_limit))

    # Every handler above raises, so this point is only reached when the retrieval succeeded.
    node.set_state(CalcJobState.PARSING)
    logger.info('retrieving CalcJob<{}> successful'.format(node.pk))
    raise Return
Beispiel #4
0
def task_submit_job(node, transport_queue, calc_info, script_filename,
                    cancellable):
    """Transport task that will attempt to submit a job calculation.

    A transport is requested from the queue; once it is available `execmanager.submit_calculation` is called. The
    operation is wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will retry
    after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param calc_info: the calculation info datastructure returned by `CalcJobNode._presubmit`
    :param script_filename: the job launch script returned by `CalcJobNode._presubmit`
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return with the result of the submission, or with the existing job id if already WITHSCHEDULER
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    # Already handed to the scheduler: report the job id that is stored on the node.
    if node.get_state() == CalcJobState.WITHSCHEDULER:
        assert node.get_job_id() is not None, 'job is WITHSCHEDULER, however, it does not have a job id'
        logger.warning('CalcJob<{}> already marked as WITHSCHEDULER, skipping task_submit_job'.format(node.pk))
        raise Return(node.get_job_id())

    retry_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    attempts_limit = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_submit():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)
            outcome = execmanager.submit_calculation(node, transport, calc_info, script_filename)
            raise Return(outcome)

    try:
        logger.info('scheduled request to submit CalcJob<{}>'.format(node.pk))
        outcome = yield exponential_backoff_retry(
            do_submit, retry_interval, attempts_limit, logger=node.logger, ignore_exceptions=plumpy.Interruption
        )
    except plumpy.Interruption:
        # An interruption ends the task quietly: no result and no state change on the node.
        return
    except Exception:
        logger.warning('submitting CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('submit_calculation failed {} times consecutively'.format(attempts_limit))

    # Only reached when the submission succeeded.
    logger.info('submitting CalcJob<{}> successful'.format(node.pk))
    node.set_state(CalcJobState.WITHSCHEDULER)
    raise Return(outcome)
Beispiel #5
0
async def task_submit_job(node: CalcJobNode, transport_queue: TransportQueue,
                          cancellable: InterruptableFuture):
    """Transport task that will attempt to submit a job calculation.

    A transport is requested from the queue; once it is available `execmanager.submit_calculation` is called. The
    operation is wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will retry
    after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled

    :return: the result of `execmanager.submit_calculation`, or the stored job id if the node is already WITHSCHEDULER
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    # Already handed to the scheduler: report the job id that is stored on the node.
    if node.get_state() == CalcJobState.WITHSCHEDULER:
        assert node.get_job_id() is not None, 'job is WITHSCHEDULER, however, it does not have a job id'
        logger.warning(f'CalcJob<{node.pk}> already marked as WITHSCHEDULER, skipping task_submit_job')
        return node.get_job_id()

    retry_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()

    async def do_submit():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)
            outcome = execmanager.submit_calculation(node, transport)
            return outcome

    try:
        logger.info(f'scheduled request to submit CalcJob<{node.pk}>')
        # Cancellation and interruption are handed to the retry helper as `ignore_exceptions` and re-raised below.
        passthrough = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
        outcome = await exponential_backoff_retry(
            do_submit, retry_interval, max_attempts, logger=node.logger, ignore_exceptions=passthrough
        )
    except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):  # pylint: disable=try-except-raise
        raise
    except Exception as exception:
        logger.warning(f'submitting CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'submit_calculation failed {max_attempts} times consecutively') from exception

    # Every handler above raises, so this point is only reached when the submission succeeded.
    logger.info(f'submitting CalcJob<{node.pk}> successful')
    node.set_state(CalcJobState.WITHSCHEDULER)
    return outcome
Beispiel #6
0
async def task_stash_job(node: CalcJobNode, transport_queue: TransportQueue,
                         cancellable: InterruptableFuture):
    """Transport task that will optionally stash files of a completed job calculation on the remote.

    A transport is requested from the queue; once it is available `execmanager.stash_calculation` is called. The
    operation is wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will retry
    after an interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :return: None, both when the stashing succeeded and when it was skipped
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    # The node already moved on to retrieval, so stashing is skipped.
    if node.get_state() == CalcJobState.RETRIEVING:
        logger.warning(f'calculation<{node.pk}> already marked as RETRIEVING, skipping task_stash_job')
        return

    retry_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()

    async def do_stash():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)

            logger.info(f'stashing calculation<{node.pk}>')
            outcome = execmanager.stash_calculation(node, transport)
            return outcome

    try:
        await exponential_backoff_retry(
            do_stash,
            retry_interval,
            max_attempts,
            logger=node.logger,
            ignore_exceptions=plumpy.process_states.Interruption,
        )
    except plumpy.process_states.Interruption:
        raise
    except Exception as exception:
        logger.warning(f'stashing calculation<{node.pk}> failed')
        raise TransportTaskException(f'stash_calculation failed {max_attempts} times consecutively') from exception

    # Every handler above raises, so this point is only reached when the stashing succeeded.
    node.set_state(CalcJobState.RETRIEVING)
    logger.info(f'stashing calculation<{node.pk}> successful')
Beispiel #7
0
def task_kill_job(node, transport_queue, cancel_flag):
    """
    Transport task that will attempt to kill a job calculation

    If the calculation is still in the NEW or TOSUBMIT state it is marked as FAILED directly, without requesting a
    transport. Otherwise the task will first request a transport from the queue. Once the transport is yielded, the
    relevant execmanager function is called, wrapped in the exponential_backoff_retry coroutine, which, in case of a
    caught exception, will retry after an interval that increases exponentially with the number of retries, for a
    maximum number of retries. If all retries fail, the node is marked as FAILED and the task will raise a
    TransportTaskException

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancel_flag: the cancelled flag that will be queried to determine whether the task was cancelled
    :raises: Return if the tasks was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    initial_interval = 1
    max_attempts = 5

    # Read the state once, before it is overwritten, so that the warning reports the state the node was in.
    previous_state = node.get_state()

    if previous_state in [calc_states.NEW, calc_states.TOSUBMIT]:
        node._set_state(calc_states.FAILED)
        logger.warning('calculation<{}> killed, it was in the {} state'.format(node.pk, previous_state))
        raise Return(True)

    authinfo = node.get_computer().get_authinfo(node.get_user())

    @coroutine
    def do_kill():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield request

            # It may have taken time to get the transport, check if we've been cancelled
            if cancel_flag.is_cancelled:
                raise plumpy.CancelledError('task_kill_job for calculation<{}> cancelled'.format(node.pk))

            logger.info('killing calculation<{}>'.format(node.pk))

            raise Return(execmanager.kill_calculation(node, transport))

    try:
        result = yield exponential_backoff_retry(do_kill, initial_interval, max_attempts, logger=node.logger)
    except plumpy.CancelledError:
        # Cancellation is deliberately not propagated: the task ends without a result.
        pass
    except Exception:
        logger.warning('killing calculation<{}> failed:\n{}'.format(node.pk, traceback.format_exc()))
        node._set_state(calc_states.FAILED)
        raise TransportTaskException('kill_calculation failed {} times consecutively'.format(max_attempts))
    else:
        logger.info('killing calculation<{}> successful'.format(node.pk))
        raise Return(result)
Beispiel #8
0
def task_kill_job(node, transport_queue, cancellable):
    """Transport task that will attempt to kill a job calculation.

    If the node is still UPLOADING or SUBMITTING the task finishes immediately without requesting a transport.
    Otherwise a transport is requested from the queue and `execmanager.kill_calculation` is called, wrapped in the
    exponential_backoff_retry coroutine, which, in case of a caught exception, will retry after an interval that
    increases exponentially with the number of retries, for a maximum number of retries. If all retries fail, the task
    will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :raises: Return if the tasks was successfully completed
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    retry_interval = TRANSPORT_TASK_RETRY_INITIAL_INTERVAL
    attempts_limit = TRANSPORT_TASK_MAXIMUM_ATTEMTPS

    # In these states the kill is reported as done right away, without contacting the remote.
    if node.get_state() in [CalcJobState.UPLOADING, CalcJobState.SUBMITTING]:
        logger.warning('CalcJob<{}> killed, it was in the {} state'.format(node.pk, node.get_state()))
        raise Return(True)

    authinfo = node.computer.get_authinfo(node.user)

    @coroutine
    def do_kill():
        with transport_queue.request_transport(authinfo) as request:
            transport = yield cancellable.with_interrupt(request)
            outcome = execmanager.kill_calculation(node, transport)
            raise Return(outcome)

    try:
        logger.info('scheduled request to kill CalcJob<{}>'.format(node.pk))
        outcome = yield exponential_backoff_retry(do_kill, retry_interval, attempts_limit, logger=node.logger)
    except plumpy.Interruption:
        raise
    except Exception:
        logger.warning('killing CalcJob<{}> failed'.format(node.pk))
        raise TransportTaskException('kill_calculation failed {} times consecutively'.format(attempts_limit))

    # Every handler above raises, so this point is only reached when the kill succeeded.
    logger.info('killing CalcJob<{}> successful'.format(node.pk))
    node.set_scheduler_state(JobState.DONE)
    raise Return(outcome)
Beispiel #9
0
async def task_kill_job(node: CalcJobNode, transport_queue: TransportQueue,
                        cancellable: InterruptableFuture):
    """Transport task that will attempt to kill a job calculation.

    If the node is still UPLOADING or SUBMITTING the task finishes immediately without requesting a transport.
    Otherwise a transport is requested from the queue and `execmanager.kill_calculation` is called, wrapped in the
    exponential_backoff_retry coroutine, which, in case of a caught exception, will retry after an interval that
    increases exponentially with the number of retries, for a maximum number of retries. If all retries fail, the task
    will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled

    :return: True if the kill was short-circuited, otherwise the result of `execmanager.kill_calculation`
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    retry_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    # In these states the kill is reported as done right away, without contacting the remote.
    if node.get_state() in [CalcJobState.UPLOADING, CalcJobState.SUBMITTING]:
        logger.warning(f'CalcJob<{node.pk}> killed, it was in the {node.get_state()} state')
        return True

    authinfo = node.get_authinfo()

    async def do_kill():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)
            outcome = execmanager.kill_calculation(node, transport)
            return outcome

    try:
        logger.info(f'scheduled request to kill CalcJob<{node.pk}>')
        outcome = await exponential_backoff_retry(do_kill, retry_interval, max_attempts, logger=node.logger)
    except plumpy.process_states.Interruption:
        raise
    except Exception as exception:
        logger.warning(f'killing CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'kill_calculation failed {max_attempts} times consecutively') from exception

    # Every handler above raises, so this point is only reached when the kill succeeded.
    logger.info(f'killing CalcJob<{node.pk}> successful')
    node.set_scheduler_state(JobState.DONE)
    return outcome
Beispiel #10
0
async def task_upload_job(process: 'CalcJob', transport_queue: TransportQueue,
                          cancellable: InterruptableFuture):
    """Transport task that will attempt to upload the files of a job calculation to the remote.

    The task will first request a transport from the queue. Once the transport is yielded, the calculation is prepared
    in a sandbox folder through `process.presubmit` and uploaded with `execmanager.upload_calculation`. The operation
    is wrapped in the exponential_backoff_retry coroutine, which, in case of a caught exception, will retry after an
    interval that increases exponentially with the number of retries, for a maximum number of retries.
    If all retries fail, the task will raise a TransportTaskException

    :param process: the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled

    :return: the `skip_submit` value of the calculation info returned by `presubmit` (`False` if it is falsy), or
        `None` when the upload is skipped because the node is already marked as SUBMITTING
    :raises: PreSubmitException if the `presubmit` call of the process raised an exception
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    node = process.node

    # The node is already past the upload step, so there is nothing left to do for this task.
    if node.get_state() == CalcJobState.SUBMITTING:
        logger.warning(f'CalcJob<{node.pk}> already marked as SUBMITTING, skipping task_upload_job')
        return

    initial_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()

    async def do_upload():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)

            with SandboxFolder() as folder:
                # Any exception thrown in `presubmit` call is not transient so we circumvent the exponential backoff
                try:
                    calc_info = process.presubmit(folder)
                except Exception as exception:  # pylint: disable=broad-except
                    raise PreSubmitException('exception occurred in presubmit call') from exception
                else:
                    execmanager.upload_calculation(node, transport, calc_info, folder)
                    skip_submit = calc_info.skip_submit or False

            return skip_submit

    # These exception types are handed to the retry helper as `ignore_exceptions` and re-raised unchanged below.
    ignore_exceptions = (plumpy.futures.CancelledError, PreSubmitException, plumpy.process_states.Interruption)

    try:
        logger.info(f'scheduled request to upload CalcJob<{node.pk}>')
        skip_submit = await exponential_backoff_retry(
            do_upload, initial_interval, max_attempts, logger=node.logger, ignore_exceptions=ignore_exceptions
        )
    except ignore_exceptions:  # pylint: disable=try-except-raise
        raise
    except Exception as exception:
        logger.warning(f'uploading CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'upload_calculation failed {max_attempts} times consecutively') from exception
    else:
        logger.info(f'uploading CalcJob<{node.pk}> successful')
        node.set_state(CalcJobState.SUBMITTING)
        return skip_submit
Beispiel #11
0
async def task_retrieve_job(node: CalcJobNode, transport_queue: TransportQueue,
                            retrieved_temporary_folder: str,
                            cancellable: InterruptableFuture):
    """Transport task that will attempt to retrieve all files of a completed job calculation.

    A transport is requested from the queue; once it is available the detailed job info is recorded on the node and
    `execmanager.retrieve_calculation` is called. The operation is wrapped in the exponential_backoff_retry coroutine,
    which, in case of a caught exception, will retry after an interval that increases exponentially with the number of
    retries, for a maximum number of retries. If all retries fail, the task will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :param transport_queue: the TransportQueue from which to request a Transport
    :param retrieved_temporary_folder: the absolute path to a directory to store files
    :param cancellable: the cancelled flag that will be queried to determine whether the task was cancelled

    :return: the result of `execmanager.retrieve_calculation`, or None if the node is already marked as PARSING
    :raises: TransportTaskException if after the maximum number of retries the transport task still excepted
    """
    # The node already reached the parsing step: retrieval must not run again.
    if node.get_state() == CalcJobState.PARSING:
        logger.warning(f'CalcJob<{node.pk}> already marked as PARSING, skipping task_retrieve_job')
        return

    retry_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()

    async def do_retrieve():
        with transport_queue.request_transport(authinfo) as request:
            transport = await cancellable.with_interrupt(request)

            scheduler = node.computer.get_scheduler()  # type: ignore[union-attr]
            scheduler.set_transport(transport)

            # Perform the job accounting and set it on the node if successful. If the scheduler does not implement this
            # still set the attribute but set it to `None`. This way we can distinguish calculation jobs for which the
            # accounting was called but could not be set.
            try:
                accounting = scheduler.get_detailed_job_info(node.get_job_id())
            except FeatureNotAvailable:
                logger.info(f'detailed job info not available for scheduler of CalcJob<{node.pk}>')
                accounting = None

            node.set_detailed_job_info(accounting)

            return execmanager.retrieve_calculation(node, transport, retrieved_temporary_folder)

    try:
        logger.info(f'scheduled request to retrieve CalcJob<{node.pk}>')
        # Cancellation and interruption are handed to the retry helper as `ignore_exceptions` and re-raised below.
        passthrough = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
        retrieved = await exponential_backoff_retry(
            do_retrieve, retry_interval, max_attempts, logger=node.logger, ignore_exceptions=passthrough
        )
    except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):  # pylint: disable=try-except-raise
        raise
    except Exception as exception:
        logger.warning(f'retrieving CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'retrieve_calculation failed {max_attempts} times consecutively') from exception

    # Every handler above raises, so this point is only reached when the retrieval succeeded.
    node.set_state(CalcJobState.PARSING)
    logger.info(f'retrieving CalcJob<{node.pk}> successful')
    return retrieved
Beispiel #12
0
async def task_update_job(node: CalcJobNode, job_manager,
                          cancellable: InterruptableFuture):
    """Transport task that refreshes the scheduler status of a job calculation.

    A job info update is requested from the job manager and awaited through the cancellable. The request is wrapped in
    the exponential_backoff_retry coroutine, which, in case of a caught exception, will retry after an interval that
    increases exponentially with the number of retries, for a maximum number of retries. If all retries fail, the task
    will raise a TransportTaskException.

    :param node: the node that represents the job calculation
    :type node: :class:`aiida.orm.nodes.process.calculation.calcjob.CalcJobNode`
    :param job_manager: The job manager
    :type job_manager: :class:`aiida.engine.processes.calcjobs.manager.JobManager`
    :param cancellable: A cancel flag
    :type cancellable: :class:`aiida.engine.utils.InterruptableFuture`
    :return: True if the job is done (or the node is already RETRIEVING or STASHING), False otherwise
    :raises: TransportTaskException if after the maximum number of retries the update still excepted
    """
    state = node.get_state()

    # Nothing to update once the node has already moved past the scheduler step.
    if state in (CalcJobState.RETRIEVING, CalcJobState.STASHING):
        logger.warning(f'CalcJob<{node.pk}> already marked as `{state}`, skipping task_update_job')
        return True

    retry_interval = get_config_option(RETRY_INTERVAL_OPTION)
    max_attempts = get_config_option(MAX_ATTEMPTS_OPTION)

    authinfo = node.get_authinfo()
    job_id = node.get_job_id()

    async def do_update():
        # Ask the job manager for fresh job info and wait for it, honouring interruptions.
        with job_manager.request_job_info_update(authinfo, job_id) as update_request:
            info = await cancellable.with_interrupt(update_request)

        if info is not None:
            node.set_last_job_info(info)
            node.set_scheduler_state(info.job_state)
            return info.job_state == JobState.DONE

        # If the job is computed or not found assume it's done
        node.set_scheduler_state(JobState.DONE)
        return True

    try:
        logger.info(f'scheduled request to update CalcJob<{node.pk}>')
        # Cancellation and interruption are handed to the retry helper as `ignore_exceptions` and re-raised below.
        passthrough = (plumpy.futures.CancelledError, plumpy.process_states.Interruption)
        finished = await exponential_backoff_retry(
            do_update, retry_interval, max_attempts, logger=node.logger, ignore_exceptions=passthrough
        )
    except (plumpy.futures.CancelledError, plumpy.process_states.Interruption):  # pylint: disable=try-except-raise
        raise
    except Exception as exception:
        logger.warning(f'updating CalcJob<{node.pk}> failed')
        raise TransportTaskException(f'update_calculation failed {max_attempts} times consecutively') from exception

    # Every handler above raises, so this point is only reached when the update succeeded.
    logger.info(f'updating CalcJob<{node.pk}> successful')
    if finished:
        node.set_state(CalcJobState.STASHING)

    return finished