Example 1
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if the kill signal arrived too long ago, i.e. the loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning(
                    'loop has run for too long after the first kill signal - will abort'
                )
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)

            # does the user want to execute any special commands before stage-in?
            pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
            user = __import__('pilot.user.%s.common' % pilot_user, globals(),
                              locals(), [pilot_user], 0)  # Python 2/3
            cmd = user.get_utility_commands(job=job,
                                            order=UTILITY_BEFORE_STAGEIN)
            if cmd:
                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[before xcache start] stdout=%s', _stdout)
                #logger.debug('[before xcache start] stderr=%s', _stderr)

                _, stdout, stderr = execute(cmd.get('command'))
                logger.debug('stdout=%s', stdout)
                logger.debug('stderr=%s', stderr)

                # xcache debug
                #_, _stdout, _stderr = execute('pgrep -x xrootd | awk \'{print \"ps -p \"$1\" -o args --no-headers --cols 300\"}\' | sh')
                #logger.debug('[after xcache start] stdout=%s', _stdout)
                #logger.debug('[after xcache start] stderr=%s', _stderr)

                # perform any action necessary after command execution (e.g. stdout processing)
                kwargs = {
                    'label': cmd.get('label', 'utility'),
                    'output': stdout
                }
                user.post_prestagein_utility_command(**kwargs)

                # write output to log files
                write_utility_output(job.workdir, cmd.get('label', 'utility'),
                                     stdout, stderr)

            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - order log transfer and abort copytool_in()'
                )
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning(
                    'copytool_in detected a set abort_job pre stage-in (due to a kill signal)'
                )
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_in detected a set abort_job post stage-in (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_in,
                                           args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue',
                        _job.jobid)

                # now create input file metadata if required by the payload
                if os.environ.get('PILOT_ES_EXECUTOR_TYPE',
                                  'generic') == 'generic':
                    pilot_user = os.environ.get('PILOT_USER',
                                                'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user,
                                      globals(), locals(), [pilot_user],
                                      0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(
                        file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s', xml)
            else:
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue',
                        _job.jobid)
                logger.warning(
                    'stage-in failed, adding job object to failed_data_in queue'
                )
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')
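
The loop above is a standard consumer pattern: it polls queues.data_in with a one-second timeout and exits once args.graceful_stop is set. Below is a minimal, self-contained sketch of that pattern only; the worker function and the SimpleNamespace-based args object are illustrative stand-ins, not part of the pilot code.

import queue
import threading
import time
import types

def worker(jobs, args):
    # same idiom as copytool_in(): poll the queue with a timeout, stop when graceful_stop is set
    while not args.graceful_stop.is_set():
        try:
            job = jobs.get(block=True, timeout=1)
        except queue.Empty:
            continue  # nothing queued yet, poll again
        print('staging in input for job %s' % job)

args = types.SimpleNamespace(graceful_stop=threading.Event())
jobs = queue.Queue()
thread = threading.Thread(target=worker, name='copytool_in-sketch', args=(jobs, args))
thread.start()

jobs.put('1234')          # hand a "job" to the stage-in thread
time.sleep(2)             # give the worker a chance to drain the queue
args.graceful_stop.set()  # ask the loop to finish, as the pilot does on shutdown
thread.join()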
Example 2
File: data.py  Project: ptrlv/pilot2
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        try:
            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)
            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')
            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING
            log = get_logger(job.jobid)

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                log.warning('copytool_in detected a set abort_job pre stage-in (due to a kill signal)')
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    log.warning('copytool_in detected a set abort_job post stage-in (due to a kill signal)')
                    declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                    break

                #queues.finished_data_in.put(job)
                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    log.debug('job %s has been removed from the current_data_in queue' % _job.jobid)

                # now create input file metadata if required by the payload
                try:
                    pilot_user = os.environ.get('PILOT_USER', 'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], -1)
                    _dir = '/srv' if job.usecontainer else job.workdir
                    file_dictionary = get_input_file_dictionary(job.indata, _dir)
                    #file_dictionary = get_input_file_dictionary(job.indata, job.workdir)
                    log.debug('file_dictionary=%s' % str(file_dictionary))
                    xml = user.create_input_file_metadata(file_dictionary, job.workdir)
                    log.info('created input file metadata:\n%s' % xml)
                except Exception as exc:
                    log.warning('failed to create input file metadata: %s' % exc)
            else:
                log.warning('stage-in failed, adding job object to failed_data_in queue')
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                #queues.failed_data_in.put(job)
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                # send_state(job, args, 'failed')

        except queue.Empty:
            continue

    logger.debug('[data] copytool_in thread has finished')
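
The __import__('pilot.user.%s.metadata' % pilot_user, globals(), locals(), [pilot_user], 0) calls in these examples load a user/experiment-specific plugin module chosen at run time. A rough sketch of the same idiom with importlib, shown against a standard-library package so it runs anywhere; the load_plugin name and the json.decoder target are illustrative only, not pilot names.

import importlib

def load_plugin(flavour):
    # equivalent to __import__('json.%s' % flavour, globals(), locals(), [flavour], 0),
    # which returns the submodule itself rather than the top-level package
    return importlib.import_module('json.%s' % flavour)

decoder = load_plugin('decoder')  # stands in for pilot.user.<experiment>.metadata
print(decoder.JSONDecoder)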
Example 3
def copytool_in(queues, traces, args):
    """
    Call the stage-in function and put the job object in the proper queue.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    while not args.graceful_stop.is_set():
        time.sleep(0.5)
        try:
            # abort if the kill signal arrived too long ago, i.e. the loop is stuck
            current_time = int(time.time())
            if args.kill_time and current_time - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning(
                    'loop has run for too long after the first kill signal - will abort'
                )
                break

            # extract a job to stage-in its input
            job = queues.data_in.get(block=True, timeout=1)
            # place it in the current stage-in queue (used by the jobs' queue monitoring)
            if job:
                put_in_queue(job, queues.current_data_in)

            # ready to set the job in running state
            send_state(job, args, 'running')

            # note: when sending a state change to the server, the server might respond with 'tobekilled'
            if job.state == 'failed':
                logger.warning(
                    'job state is \'failed\' - order log transfer and abort copytool_in()'
                )
                job.stageout = 'log'  # only stage-out log file
                put_in_queue(job, queues.data_out)
                break

            os.environ['SERVER_UPDATE'] = SERVER_UPDATE_RUNNING

            if args.abort_job.is_set():
                traces.pilot['command'] = 'abort'
                logger.warning(
                    'copytool_in detected a set abort_job pre stage-in (due to a kill signal)'
                )
                declare_failed_by_kill(job, queues.failed_data_in, args.signal)
                break

            if _stage_in(args, job):
                if args.abort_job.is_set():
                    traces.pilot['command'] = 'abort'
                    logger.warning(
                        'copytool_in detected a set abort_job post stage-in (due to a kill signal)'
                    )
                    declare_failed_by_kill(job, queues.failed_data_in,
                                           args.signal)
                    break

                put_in_queue(job, queues.finished_data_in)
                # remove the job from the current stage-in queue
                _job = queues.current_data_in.get(block=True, timeout=1)
                if _job:
                    logger.debug(
                        'job %s has been removed from the current_data_in queue'
                        % _job.jobid)

                # now create input file metadata if required by the payload
                if config.Payload.executor_type.lower() != 'raythena':
                    pilot_user = os.environ.get('PILOT_USER',
                                                'generic').lower()
                    user = __import__('pilot.user.%s.metadata' % pilot_user,
                                      globals(), locals(), [pilot_user],
                                      0)  # Python 2/3
                    file_dictionary = get_input_file_dictionary(job.indata)
                    xml = user.create_input_file_metadata(
                        file_dictionary, job.workdir)
                    logger.info('created input file metadata:\n%s' % xml)
            else:
                logger.warning(
                    'stage-in failed, adding job object to failed_data_in queue'
                )
                job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                    errors.STAGEINFAILED)
                set_pilot_state(job=job, state="failed")
                traces.pilot['error_code'] = job.piloterrorcodes[0]
                put_in_queue(job, queues.failed_data_in)
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is DONE_FINAL
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()

        except queue.Empty:
            continue

    # proceed to set the job_aborted flag?
    if threads_aborted():
        logger.debug('will proceed to set job_aborted')
        args.job_aborted.set()
    else:
        logger.debug('will not set job_aborted yet')

    logger.debug('[data] copytool_in thread has finished')
Example 4
def control(queues, traces, args):
    """
    Main control function, run from the relevant workflow module.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    t0 = time.time()
    traces.pilot['lifetime_start'] = t0  # ie referring to when pilot monitoring began
    traces.pilot['lifetime_max'] = t0

    threadchecktime = int(config.Pilot.thread_check)

    # for CPU usage debugging
    cpuchecktime = int(config.Pilot.cpu_check)
    tcpu = t0

    queuedata = get_queuedata_from_job(queues)
    max_running_time = get_max_running_time(args.lifetime, queuedata)

    try:
        # overall loop counter (ignoring the fact that more than one job may be running)
        n = 0

        while not args.graceful_stop.is_set():
            # once per second, run the monitoring checks
            if args.graceful_stop.wait(1) or args.graceful_stop.is_set():  # 'or' added for 2.6 compatibility
                logger.warning('aborting monitor loop since graceful_stop has been set')
                break

            # abort if the kill signal arrived too long ago, i.e. the loop is stuck
            if args.kill_time and int(time.time()) - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning('loop has run for too long - will abort')
                args.graceful_stop.set()
                break

            # check if the pilot has run out of time (stop ten minutes before PQ limit)
            time_since_start = get_time_since_start(args)
            grace_time = 10 * 60
            if time_since_start - grace_time > max_running_time:
                logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot' %
                             (max_running_time, grace_time))
                logger.info('setting REACHED_MAXTIME and graceful stop')
                environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME'  # TODO: use singleton instead
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is FINAL_DONE
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                break
            else:
                if n % 60 == 0:
                    logger.info('%d s have passed since pilot start' % time_since_start)
            time.sleep(1)

            # time to check the CPU?
            if int(time.time() - tcpu) > cpuchecktime and False:  # block disabled via 'and False'; remove it to enable the CPU check
                processes = get_process_info('python pilot2/pilot.py', pid=getpid())
                if processes:
                    logger.info('-' * 100)
                    logger.info('PID=%d has CPU usage=%s%% MEM usage=%s%% CMD=%s' % (getpid(), processes[0], processes[1], processes[2]))
                    n = processes[3]
                    if n > 1:
                        logger.info('there are %d such processes running' % n)
                    else:
                        logger.info('there is %d such process running' % n)
                    logger.info('-' * 100)
                tcpu = time.time()

            # proceed with running the other checks
            run_checks(queues, args)

            # thread monitoring
            if int(time.time() - traces.pilot['lifetime_start']) % threadchecktime == 0:
                # get all threads
                for thread in threading.enumerate():
                    # logger.info('thread name: %s' % thread.name)
                    if not thread.is_alive():
                        logger.fatal('thread \'%s\' is not alive' % thread.name)
                        # args.graceful_stop.set()

            n += 1

    except Exception as e:
        print(("monitor: exception caught: %s" % e))
        raise PilotException(e)

    logger.info('[monitor] control thread has ended')
Example 5
def control(queues, traces, args):
    """
    Main control function, run from the relevant workflow module.

    :param queues: internal queues for job handling.
    :param traces: tuple containing internal pilot states.
    :param args: Pilot arguments (e.g. containing queue name, queuedata dictionary, etc).
    :return:
    """

    traces.pilot['lifetime_start'] = time.time()  # i.e. referring to when pilot monitoring began
    traces.pilot['lifetime_max'] = time.time()

    threadchecktime = int(config.Pilot.thread_check)

    queuedata = get_queuedata_from_job(queues)
    if queuedata:
        logger.debug('extracted queuedata from job object')
    else:
        logger.debug('failed to extract queuedata from job object')
    max_running_time = get_max_running_time(args.lifetime, queuedata)

    try:
        # overall loop counter (ignoring the fact that more than one job may be running)
        n = 0

        while not args.graceful_stop.is_set():
            # once per second, run the monitoring checks
            if args.graceful_stop.wait(1) or args.graceful_stop.is_set():  # 'or' added for 2.6 compatibility
                break

            # check if the pilot has run out of time (stop ten minutes before PQ limit)
            time_since_start = get_time_since_start(args)
            grace_time = 10 * 60
            if time_since_start - grace_time > max_running_time:
                logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot' %
                             (max_running_time, grace_time))
                logger.info('setting REACHED_MAXTIME and graceful stop')
                environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME'  # TODO: use singleton instead
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is FINAL_DONE
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                break
            else:
                if n % 60 == 0:
                    logger.info('%d s have passed since pilot start' % time_since_start)
            time.sleep(1)

            # proceed with running the checks
            run_checks(queues, args)

            # thread monitoring
            if int(time.time() - traces.pilot['lifetime_start']) % threadchecktime == 0:
                # get all threads
                for thread in threading.enumerate():
                    # logger.info('thread name: %s' % thread.name)
                    if not thread.is_alive():
                        logger.fatal('thread \'%s\' is not alive' % thread.name)
                        # args.graceful_stop.set()

            n += 1

    except Exception as e:
        print("monitor: exception caught: %s" % e)
        raise PilotException(e)

    logger.info('[monitor] control thread has ended')
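
Both copytool_in() and control() call check_for_final_server_update(args.update_server) before setting graceful_stop, so that shutdown waits until the final job update has reached the server. The pilot's own implementation is not shown in these examples; the following is only a sketch of what such a guard could look like, and the marker value, the use of the SERVER_UPDATE environment variable and the timeout are assumptions.

import os
import time

FINAL_MARKER = 'DONE_FINAL'  # assumed value of SERVER_UPDATE once the last update has been sent
MAX_WAIT = 600               # assumed cap in seconds so shutdown cannot hang forever

def wait_for_final_server_update(update_server, max_wait=MAX_WAIT):
    # hypothetical helper, named differently from the pilot's check_for_final_server_update()
    if not update_server:
        return  # server updates are disabled, nothing to wait for
    start = time.time()
    while os.environ.get('SERVER_UPDATE') != FINAL_MARKER:
        if time.time() - start > max_wait:
            break  # give up rather than block shutdown indefinitely
        time.sleep(1)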