Beispiel #1
0
def copytool_out(queues, traces, args):
    """
    Main stage-out thread.
    Perform stage-out as soon as a job object can be extracted from the data_out queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    cont = True
    logger.debug('entering copytool_out loop')
    if args.graceful_stop.is_set():
        logger.debug('graceful_stop already set')

    processed_jobs = []
    while cont:

        time.sleep(0.5)

        # abort if kill signal arrived too long time ago, ie loop is stuck
        current_time = int(time.time())
        if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
            logger.warning(
                'loop has run for too long time after first kill signal - will abort'
            )
            break

        # check for abort, print useful messages and include a 1 s sleep
        abort = should_abort(args, label='data:copytool_out')
        try:
            job = queues.data_out.get(block=True, timeout=1)
            if job:
                # hack to prevent stage-out to be called more than once for same job object (can apparently happen
                # in multi-output jobs)
                # should not be necessary unless job object is added to queues.data_out more than once - check this
                # for multiple output files
                if processed_jobs:
                    if is_already_processed(queues, processed_jobs):
                        continue

                logger.info('will perform stage-out for job id=%s', job.jobid)

                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_out detected a set abort_job pre stage-out (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_out,
                                           args.signal)
                    break

                if _stage_out_new(job, args):
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        logger.warning(
                            'copytool_out detected a set abort_job post stage-out (due to a kill signal)'
                        )
                        #declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                        break

                    #queues.finished_data_out.put(job)
                    processed_jobs.append(job.jobid)
                    put_in_queue(job, queues.finished_data_out)
                    logger.debug('job object added to finished_data_out queue')
                else:
                    #queues.failed_data_out.put(job)
                    put_in_queue(job, queues.failed_data_out)
                    logger.debug('job object added to failed_data_out queue')
            else:
                logger.debug('no returned job - why no exception?')
        except queue.Empty:
            if abort:
                cont = False
                break
            continue

        if abort:
            cont = False
            break

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_out thread has finished')
Beispiel #2
0
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if kill signal arrived too long time ago, ie loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning(
                    'loop has run for too long time after first kill signal - will abort'
                )
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # does the user want to execute any special commands before stage-in?
            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
            user = __import__('pilot.user.%s.common' % pilot_user, globals(),
                              locals(), [pilot_user], 0)  # Python 2/3
            cmd = user.get_utility_commands(job=job,
                                            order=UTILITY_BEFORE_STAGEIN)
            if cmd:
                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before xcache start] stdout=%s', _stdout)
                #logger.debug('[before xcache start] stderr=%s', _stderr)

                _, stdout, stderr = execute(cmd.get('command'))
                logger.debug('stdout=%s', stdout)
                logger.debug('stderr=%s', stderr)

                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after xcache start] stdout=%s', _stdout)
                #logger.debug('[after xcache start] stderr=%s', _stderr)

                # perform any action necessary after command execution (e.g. stdout processing)
                kwargs = {
                    'label': cmd.get('label', 'utility'),
                    'output': stdout
                }
                user.post_prestagein_utility_command(**kwargs)

                # write output to log files
                write_utility_output(job.workdir, cmd.get('label', 'utility'),
                                     stdout, stderr)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - order log transfer and abort copytool_in()'
                )
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning(
                    'copytool_in detected a set abort_job pre stage-in (due to a kill signal)'
                )
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_in detected a set abort_job post stage-in (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_in,
                                           args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue',
                        _job.jobid)

                # now create input file metadata if required by the payload
                if os.environ.get('PILOT_ES_EXECUTOR_TYPE',
                                  'generic') == 'generic':
                    pilot_user = os.environ.get('PILOT_USER',
                                                'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user,
                                      globals(), locals(), [pilot_user],
                                      0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(
                        file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s', xml)
            else:
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue',
                        _job.jobid)
                logger.warning(
                    'stage-in failed, adding job object to failed_data_in queue'
                )
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')
Beispiel #3
0
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues:
    :param traces:
    :param args:
    :return:
    """

    while not args.graceful_stop.is_set():
        try:
            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)
            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')
            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING
            log = get_logger(job.jobid)

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                log.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                #queues.finished_data_in.put(job)
                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    log.debug('job %s has been removed from the current_data_in queue' % _job.jobid)

                # now create input file metadata if required by the payload
                try:
                    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], -1)
                    _dir = '/srv' if job.usecontainer else job.workdir
                    file_dictionary = get_input_file_dictionary(job.indata, _dir)
                    #file_dictionary = get_input_file_dictionary(job.indata, job.workdir)
                    log.debug('file_dictionary=%s' % str(file_dictionary))
                    xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                    log.info('created input file metadata:\n%s' % xml)
                except Exception as e:
                    pass
            else:
                log.warning('stage-in failed, adding job object to failed_data_in queue')
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                #queues.failed_data_in.put(job)
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                # send_state(job, args, 'failed')

        except queue.Empty:
            continue

    logger.debug('[data] copytool_in thread has finished')
Beispiel #4
0
def copytool_out(queues, traces, args):
    """
    Main stage-out thread.
    Perform stage-out as soon as a job object can be extracted from the data_out queue.

    :param queues: pilot queues object.
    :param traces: pilot traces object.
    :param args: pilot args object.
    :return:
    """

    cont = True
    logger.debug('entering copytool_out loop')
    if args.graceful_stop.is_set():
        logger.debug('graceful_stop already set')
    first = True
#    while not args.graceful_stop.is_set() and cont:
    while cont:

        if first:
            first = False
            logger.debug('inside copytool_out() loop')

        # check for abort, print useful messages and include a 1 s sleep
        abort = should_abort(args, label='data:copytool_out')
        if abort:
            logger.debug('will abort ')
        try:
            job = queues.data_out.get(block=True, timeout=1)
            if job:
                log = get_logger(job.jobid)
                log.info('will perform stage-out')

                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_out detected a set abort_job pre stage-out (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                    break

                if _stage_out_new(job, args):
                    if args.abort_job.is_set():
                        traces.pilot['command'] = 'abort'
                        log.warning('copytool_out detected a set abort_job post stage-out (due to a kill signal)')
                        #declare_failed_by_kill(job, queues.failed_data_out, args.signal)
                        break

                    #queues.finished_data_out.put(job)
                    put_in_queue(job, queues.finished_data_out)
                    log.debug('job object added to finished_data_out queue')
                else:
                    #queues.failed_data_out.put(job)
                    put_in_queue(job, queues.failed_data_out)
                    log.debug('job object added to failed_data_out queue')
            else:
                log.debug('no returned job - why no exception?')
        except queue.Empty:
            if abort:
                logger.debug('aborting')
                cont = False
                break
            continue

        if abort:
            logger.debug('aborting')
            cont = False
            break

    logger.debug('[data] copytool_out thread has finished')
Beispiel #5
0
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if kill signal arrived too long time ago, ie loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning(
                    'loop has run for too long time after first kill signal - will abort'
                )
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)
            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - order log transfer and abort copytool_in()'
                )
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning(
                    'copytool_in detected a set abort_job pre stage-in (due to a kill signal)'
                )
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_in detected a set abort_job post stage-in (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_in,
                                           args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue'
                        % _job.jobid)

                # now create input file metadata if required by the payload
                if config.Payload.executor_type.lower() != 'raythena':
                    pilot_user = os.environ.get('PILOT_USER',
                                                'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user,
                                      globals(), locals(), [pilot_user],
                                      0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(
                        file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s' % xml)
            else:
                logger.warning(
                    'stage-in failed, adding job object to failed_data_in queue'
                )
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')