Example 1
def copytool_out(queues, traces, args):
    """
    Main stage-out thread.
    Perform stage-out as soon as a job object can be extracted from the data_out queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    cont = True
    logger.debug('entering copytool_out loop')
    if args.graceful_stop.is_set():
        logger.debug('graceful_stop already set')

    processed_jobs = []
    while cont:

        time.sleep(0.5)

        # abort if the kill signal arrived too long ago, i.e. the loop is stuck
        current_time = int(time.time())
        if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
            logger.warning(
                'loop has run for too long after the first kill signal - will abort'
            )
            break

        # check for abort, print useful messages and include a 1 s sleep
        abort = should_abort(args, label='data:copytool_out')
        try:
            job = queues.data_out.get(block=True, timeout=1)
            if job:
                # hack to prevent stage-out from being called more than once for the same job object (can apparently
                # happen in multi-output jobs)
                # should not be necessary unless the job object is added to queues.data_out more than once - check
                # this for multiple output files
                if processed_jobs:
                    if is_already_processed(queues, processed_jobs):
                        continue

                logger.info('will perform stage-out for job id=%s', job.jobid)

                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_out detected a set abort_job pre stage-out (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_out,
                                           args.signal)
                    break

                if _stage_out_new(job, args):
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        logger.warning(
                            'copytool_out detected a set abort_job post stage-out (due to a kill signal)'
                        )
                        #declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                        break

                    #queues.finished_data_out.put(job)
                    processed_jobs.append(job.jobid)
                    put_in_queue(job, queues.finished_data_out)
                    logger.debug('job object added to finished_data_out queue')
                else:
                    #queues.failed_data_out.put(job)
                    put_in_queue(job, queues.failed_data_out)
                    logger.debug('job object added to failed_data_out queue')
            else:
                logger.debug('no returned job - why no exception?')
        except queue.Empty:
            if abort:
                cont = False
                break
            continue

        if abort:
            cont = False
            break

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_out thread has finished')
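
The helpers is_already_processed() and put_in_queue() are called above but not shown. A minimal sketch of what they could look like, assuming plain queue.Queue objects and the processed_jobs list of job ids maintained by the loop; the bodies are illustrative guesses, not the project's actual implementations:

import queue

def is_already_processed(queues, processed_jobs):
    """Sketch: return True if a job sitting in the stage-out result queues
    has already been handled, judged by its job id (hypothetical logic)."""
    # queue.Queue exposes its backing deque as .queue; reading it is a
    # non-destructive peek, good enough for an illustrative duplicate check
    snapshot = list(queues.finished_data_out.queue) + list(queues.failed_data_out.queue)
    return any(job.jobid in processed_jobs for job in snapshot)

def put_in_queue(obj, _queue):
    """Sketch: put obj in _queue unless an identical object is already there."""
    if obj not in list(_queue.queue):
        _queue.put(obj)

# usage sketch
finished = queue.Queue()
put_in_queue('job-object', finished)
put_in_queue('job-object', finished)  # second call is a no-op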
Example 2
File: data.py Project: ptrlv/pilot2
def queue_monitoring(queues, traces, args):
    """
    Monitoring of Data queues.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while True:  # will abort when graceful_stop has been set
        if traces.pilot['command'] == 'abort':
            logger.warning('data queue monitor saw the abort instruction')

        # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set)
        # (abort at the end of the loop)
        abort = should_abort(args, label='data:queue_monitoring')

        # monitor the failed_data_in queue
        try:
            job = queues.failed_data_in.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # stage-out log file then add the job to the failed_jobs queue
            job.stageout = "log"

            # TODO: put in data_out queue instead?

            if not _stage_out_new(job, args):
                log.info("job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs "
                         "queue" % job.jobid)
                #queues.failed_data_out.put(job)
                put_in_queue(job, queues.failed_data_out)
            else:
                log.info("job %s failed during stage-in, adding job object to failed_jobs queue" % job.jobid)
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the finished_data_out queue
        try:
            job = queues.finished_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # use the payload/transform exitCode from the job report if it exists
            if job.transexitcode == 0 and job.exitcode == 0 and job.piloterrorcodes == []:
                log.info('finished stage-out for finished payload, adding job to finished_jobs queue')
                #queues.finished_jobs.put(job)
                put_in_queue(job, queues.finished_jobs)
            else:
                log.info('finished stage-out (of log) for failed payload')
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the failed_data_out queue
        try:
            job = queues.failed_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            log = get_logger(job.jobid)

            # attempt to upload the log in case the previous stage-out failure was not an SE error
            job.stageout = "log"
            set_pilot_state(job=job, state="failed")
            if not _stage_out_new(job, args):
                log.info("job %s failed during stage-out of data file(s) as well as during stage-out of log, "
                         "adding job object to failed_jobs queue" % job.jobid)
            else:
                log.info("job %s failed during stage-out of data file(s) - stage-out of log succeeded, adding job "
                         "object to failed_jobs queue" % job.jobid)

            #queues.failed_jobs.put(job)
            put_in_queue(job, queues.failed_jobs)

        if abort:
            break

    logger.debug('[data] queue_monitor thread has finished')
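
All three monitoring blocks above repeat the same non-blocking read pattern: get() with a 1 s timeout, then pass on queue.Empty. A small helper along these lines (hypothetical, not part of data.py) would remove the repetition:

import queue

def get_job(_queue, timeout=1):
    """Sketch: return the next job from _queue, or None if it is empty."""
    try:
        return _queue.get(block=True, timeout=timeout)
    except queue.Empty:
        return None

# each monitoring block then collapses to:
#   job = get_job(queues.failed_data_in)
#   if job:
#       job.stageout = "log"
#       ...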
Example 3
def queue_monitoring(queues, traces, args):
    """
    Monitoring of Data queues.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while True:  # will abort when graceful_stop has been set
        time.sleep(0.5)
        if traces.pilot['command'] == 'abort':
            logger.warning('data queue monitor saw the abort instruction')
            args.graceful_stop.set()

        # abort in case graceful_stop has been set, and less than 30 s has passed since MAXTIME was reached (if set)
        # (abort at the end of the loop)
        abort = should_abort(args, label='data:queue_monitoring')

        # monitor the failed_data_in queue
        try:
            job = queues.failed_data_in.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # stage-out log file then add the job to the failed_jobs queue
            job.stageout = "log"

            # TODO: put in data_out queue instead?

            if not _stage_out_new(job, args):
                logger.info(
                    "job %s failed during stage-in and stage-out of log, adding job object to failed_data_outs queue",
                    job.jobid)
                put_in_queue(job, queues.failed_data_out)
            else:
                logger.info(
                    "job %s failed during stage-in, adding job object to failed_jobs queue",
                    job.jobid)
                put_in_queue(job, queues.failed_jobs)

        # monitor the finished_data_out queue
        try:
            job = queues.finished_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # use the payload/transform exitCode from the job report if it exists
            if job.transexitcode == 0 and job.exitcode == 0 and job.piloterrorcodes == []:
                logger.info(
                    'finished stage-out for finished payload, adding job to finished_jobs queue'
                )
                #queues.finished_jobs.put(job)
                put_in_queue(job, queues.finished_jobs)
            else:
                logger.info('finished stage-out (of log) for failed payload')
                #queues.failed_jobs.put(job)
                put_in_queue(job, queues.failed_jobs)

        # monitor the failed_data_out queue
        try:
            job = queues.failed_data_out.get(block=True, timeout=1)
        except queue.Empty:
            pass
        else:
            # attempt to upload the log in case the previous stage-out failure was not an SE error
            job.stageout = "log"
            set_pilot_state(job=job, state="failed")
            if not _stage_out_new(job, args):
                logger.info("job %s failed during stage-out", job.jobid)

            put_in_queue(job, queues.failed_jobs)

        if abort:
            break

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] queue_monitor thread has finished')
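
For context, a rough sketch of how a loop like queue_monitoring() can be wired up as a thread. The real pilot starts its threads through its own machinery, so the SimpleNamespace stand-ins for queues, traces and args below are assumptions made purely for illustration:

import queue
import threading
from types import SimpleNamespace

# hypothetical stand-ins for the pilot's shared objects
queues = SimpleNamespace(failed_data_in=queue.Queue(),
                         finished_data_out=queue.Queue(),
                         failed_data_out=queue.Queue(),
                         finished_jobs=queue.Queue(),
                         failed_jobs=queue.Queue())
traces = SimpleNamespace(pilot={'command': ''})
args = SimpleNamespace(graceful_stop=threading.Event(),
                       job_aborted=threading.Event())

monitor = threading.Thread(target=queue_monitoring,
                           args=(queues, traces, args),
                           name='queue_monitoring')
monitor.start()
# ... later, request shutdown and wait for the loop to notice it
args.graceful_stop.set()
monitor.join()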
Example 4
File: data.py Project: ptrlv/pilot2
def copytool_out(queues, traces, args):
    """
    Main stage-out thread.
    Perform stage-out as soon as a job object can be extracted from the data_out queue.

    :param queues: pilot queues object.
    :param traces: pilot traces object.
    :param args: pilot args object.
    :return:
    """

    cont = True
    logger.debug('entering copytool_out loop')
    if args.graceful_stop.is_set():
        logger.debug('graceful_stop already set')
    first = True
#    while not args.graceful_stop.is_set() and cont:
    while cont:

        if first:
            first = False
            logger.debug('inside copytool_out() loop')

        # check for abort, print useful messages and include a 1 s sleep
        abort = should_abort(args, label='data:copytool_out')
        if abort:
            logger.debug('will abort')
        try:
            job = queues.data_out.get(block=True, timeout=1)
            if job:
                log = get_logger(job.jobid)
                log.info('will perform stage-out')

                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_out detected a set abort_job pre stage-out (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                    break

                if _stage_out_new(job, args):
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        log.warning('copytool_out detected a set abort_job post stage-out (due to a kill signal)')
                        #declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                        break

                    #queues.finished_data_out.put(job)
                    put_in_queue(job, queues.finished_data_out)
                    log.debug('job object added to finished_data_out queue')
                else:
                    #queues.failed_data_out.put(job)
                    put_in_queue(job, queues.failed_data_out)
                    log.debug('job object added to failed_data_out queue')
            else:
                logger.debug('no returned job - why no exception?')
        except queue.Empty:
            if abort:
                logger.debug('aborting')
                cont = False
                break
            continue

        if abort:
            logger.debug('aborting')
            cont = False
            break

    logger.debug('[data] copytool_out thread has finished')
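
declare_failed_by_kill() is invoked in both copytool_out() versions but never shown. A plausible sketch, assuming it reuses the set_pilot_state() and put_in_queue() helpers seen earlier; the error bookkeeping below is a hypothetical stand-in for whichever error code the pilot actually records for kill signals:

def declare_failed_by_kill(job, failed_queue, sig):
    """Sketch: mark a job as failed because of a kill signal and queue it."""
    set_pilot_state(job=job, state="failed")
    # hypothetical bookkeeping - the real pilot records a proper error code here
    job.piloterrorcodes.append('killed by signal %s' % str(sig))
    put_in_queue(job, failed_queue)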