Code example #1
File: poller2.py Project: pombredanne/rucio
def poller(worker_number=1, total_workers=1, chunk_size=100):

    print('%(worker_number)s / %(total_workers)s: Poller started' % locals())
    while not graceful_stop.is_set():
        try:
            s = time()
            transfer_requests = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                      state=RequestState.SUBMITTED,
                                                      thread=worker_number,
                                                      total_threads=total_workers,
                                                      limit=10000)
            n = len(transfer_requests)
            print('%(worker_number)s / %(total_workers)s: get_next %(n)s requests' % locals())

            if not transfer_requests:
                sleep(0.01)
                continue

            for chunk in chunks(transfer_requests, chunk_size):
                try:
                    s = time()
                    fts3.query_all(transfer_requests=chunk)
                    print('fts3.query_all', time() - s)
                    s = time()
                    common.update_requests_states(chunk)
                    print('update_requests_states', time() - s)
                except Exception:
                    print(traceback.format_exc())

        except Exception:
            print(traceback.format_exc())
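Note: every example on this page iterates with a chunks() helper that the excerpts do not show. A minimal sketch, assuming a plain slicing generator (the actual Rucio helper may differ):

def chunks(items, chunk_size):
    # Yield successive chunk_size-sized slices of a list.
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]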
Code example #2
File: submitter.py Project: pombredanne/rucio
def get_requests(rse_id=None, process=0, total_processes=1, thread=0, total_threads=1, mock=False, bulk=100, activity=None):
    ts = time.time()
    reqs = request.get_next(request_type=[RequestType.TRANSFER,
                                          RequestType.STAGEIN,
                                          RequestType.STAGEOUT],
                            state=RequestState.QUEUED,
                            limit=bulk,
                            rse=rse_id,
                            activity=activity,
                            process=process,
                            total_processes=total_processes,
                            thread=thread,
                            total_threads=total_threads)
    record_timer('daemons.conveyor.submitter.get_next', (time.time() - ts) * 1000)
    return reqs
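The thread/total_threads (and process/total_processes) arguments let several concurrent workers share one request queue without picking the same rows. A hedged illustration of the partitioning idea, not the actual SQL filter that get_next applies:

def belongs_to_worker(request_id, thread, total_threads):
    # Illustration only: each worker handles requests whose hashed id
    # falls into its slice of the hash space.
    return hash(request_id) % total_threads == thread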
Code example #3
File: request.py Project: pombredanne/rucio
def get_next(request_type, state, issuer, account):
    """
    Retrieve the next request matching the request type and state.

    :param request_type: Type of the request as a string.
    :param state: State of the request as a string.
    :param issuer: Issuing account as a string.
    :param account: Account identifier as a string.
    :returns: Request as a dictionary.
    """

    kwargs = {'account': account, 'issuer': issuer, 'request_type': request_type, 'state': state}
    if not permission.has_permission(issuer=issuer, action='get_next', kwargs=kwargs):
        raise exception.AccessDenied('%(account)s cannot get the next request of type %(request_type)s in state %(state)s' % locals())

    return request.get_next(request_type, state)
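This wrapper sits at the API layer: it only enforces permissions, then delegates to the core get_next. An illustrative caller, reusing the names from the snippet above; the argument values and error handling are assumptions, not taken from Rucio:

try:
    # Hypothetical invocation of the permission-gated wrapper above.
    req = get_next(request_type='transfer', state='queued',
                   issuer='root', account='root')
except exception.AccessDenied as error:
    print('cannot fetch next request: %s' % error)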
Code example #4
File: poller.py Project: pombredanne/rucio
def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000, older_than=60):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                 thread, total_threads,
                                                                                 bulk))

    while not graceful_stop.is_set():

        try:
            ts = time.time()

            logging.debug('%i:%i - start to poll requests older than %i seconds' % (process, thread, older_than))
            reqs = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                    state=RequestState.SUBMITTED,
                                    limit=bulk,
                                    older_than=datetime.datetime.utcnow()-datetime.timedelta(seconds=older_than),
                                    process=process, total_processes=total_processes,
                                    thread=thread, total_threads=total_threads)
            record_timer('daemons.conveyor.poller.000-get_next', (time.time()-ts)*1000)

            if reqs:
                logging.debug('%i:%i - polling %i requests' % (process, thread, len(reqs)))

            if not reqs:
                if once:
                    break
                logging.debug("no requests found. will sleep 60 seconds")
                time.sleep(60)  # Only sleep if there is nothing to do
                continue

            for xfers in chunks(reqs, bulk):
                try:
                    req_ids = {}
                    for req in xfers:
                        record_counter('daemons.conveyor.poller.query_request')
                        if req['external_host'] not in req_ids:
                            req_ids[req['external_host']] = []
                        req_ids[req['external_host']].append((req['request_id'], req['external_id']))

                    responses = {}
                    for external_host in req_ids:
                        ts = time.time()
                        resps = request.bulk_query_requests(external_host, req_ids[external_host], 'fts3')
                        record_timer('daemons.conveyor.poller.001-bulk_query_requests', (time.time()-ts)*1000/len(req_ids[external_host]))
                        responses.update(resps)

                    for external_host in req_ids:
                        for request_id, external_id in req_ids[external_host]:
                            response = responses[request_id]
                            if isinstance(response, Exception):
                                logging.critical("Failed to poll request(%s) with FTS(%s) job (%s): %s" % (request_id, external_host, external_id, responses[request_id]))
                                record_counter('daemons.conveyor.poller.query_request_exception')
                                response = {'new_state': None, 'request_id': request_id, 'transfer_id': external_id, 'job_state': None}
                            common.update_request_state(response)
                            record_counter('daemons.conveyor.poller.update_request_state')
                            if response['new_state'] == RequestState.LOST:
                                record_counter('daemons.conveyor.poller.request_lost')
                except Exception:
                    logging.critical(traceback.format_exc())

        except Exception:
            logging.critical(traceback.format_exc())

        if once:
            return

    logging.debug('%i:%i - graceful stop requested' % (process, thread))

    logging.debug('%i:%i - graceful stop done' % (process, thread))
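The inner loop above builds req_ids by grouping (request_id, external_id) pairs per external_host before bulk-querying each FTS server. An equivalent, slightly more compact sketch of that grouping step using collections.defaultdict (behaviour unchanged, shown for illustration):

from collections import defaultdict

# xfers is a chunk of requests, as in the example above.
req_ids = defaultdict(list)
for req in xfers:
    req_ids[req['external_host']].append((req['request_id'], req['external_id']))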
Code example #5
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.
    """
    try:
        suspicious_patterns = []
        pattern = get(section='conveyor', option='suspicious_pattern', session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            suspicious_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        suspicious_patterns = []
    logging.debug("Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns])

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Finisher starting - db_bulk(%i) bulk (%i)' % (db_bulk, bulk))

    graceful_stop.wait(10)
    while not graceful_stop.is_set():

        start_time = time.time()
        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
            logging.debug(prepend_str + 'Starting new cycle')
            if activities is None:
                activities = [None]

            for activity in activities:
                logging.debug(prepend_str + 'Working on activity %s' % activity)
                time1 = time.time()
                reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                             state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING,
                                                    RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES],
                                             limit=db_bulk,
                                             older_than=datetime.datetime.utcnow(),
                                             total_workers=heart_beat['nr_threads'] - 1,
                                             worker_number=heart_beat['assign_thread'],
                                             mode_all=True,
                                             hash_variable='rule_id')
                record_timer('daemons.conveyor.finisher.000-get_next', (time.time() - time1) * 1000)
                time2 = time.time()
                if reqs:
                    logging.debug(prepend_str + 'Updating %i requests for activity %s' % (len(reqs), activity))

                for chunk in chunks(reqs, bulk):
                    try:
                        time3 = time.time()
                        __handle_requests(chunk, suspicious_patterns, prepend_str)
                        record_timer('daemons.conveyor.finisher.handle_requests', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1))
                        record_counter('daemons.conveyor.finisher.handle_requests', len(chunk))
                    except Exception as error:
                        logging.warning(prepend_str + str(error))
                if reqs:
                    logging.debug(prepend_str + 'Finished updating %s requests for activity %s in %s seconds' % (len(reqs), activity, time.time() - time2))

        except (DatabaseException, DatabaseError) as error:
            if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logging.warning(prepend_str + 'Lock detected when handling request - skipping: %s' % (str(error)))
            else:
                logging.error(prepend_str + '%s' % (traceback.format_exc()))
        except Exception as error:
            logging.critical(prepend_str + '%s' % (str(error)))
        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logging.info(prepend_str + 'Sleeping for %s seconds' % (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)

        if once:
            return

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
Code example #6
File: finisher.py Project: openmsi/rucio
def finisher(once=False,
             sleep_time=60,
             activities=None,
             bulk=100,
             db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.
    """
    try:
        conveyor_config = {item[0]: item[1] for item in items('conveyor')}
    except ConfigNotFound:
        logging.log(logging.INFO, 'No configuration found for conveyor')
        conveyor_config = {}

    # Get suspicious patterns
    suspicious_patterns = conveyor_config.get('suspicious_pattern', [])
    if suspicious_patterns:
        pattern = str(suspicious_patterns)
        patterns = pattern.split(",")
        suspicious_patterns = [re.compile(pat.strip()) for pat in patterns]
    logging.log(
        logging.DEBUG, "Suspicious patterns: %s" %
        [pat.pattern for pat in suspicious_patterns])

    retry_protocol_mismatches = conveyor_config.get(
        'retry_protocol_mismatches', False)

    executable = 'conveyor-finisher'
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-finisher[%i/%i] : ' % (heart_beat['assign_thread'],
                                              heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Finisher starting - db_bulk(%i) bulk (%i)', db_bulk,
           bulk)

    graceful_stop.wait(10)
    while not graceful_stop.is_set():

        start_time = time.time()
        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prefix = 'conveyor-finisher[%i/%i] : ' % (
                heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')
            logger(logging.DEBUG, 'Starting new cycle')
            if activities is None:
                activities = [None]

            for activity in activities:
                logger(logging.DEBUG, 'Working on activity %s', activity)
                time1 = time.time()
                reqs = request_core.get_next(
                    request_type=[
                        RequestType.TRANSFER, RequestType.STAGEIN,
                        RequestType.STAGEOUT
                    ],
                    state=[
                        RequestState.DONE, RequestState.FAILED,
                        RequestState.LOST, RequestState.SUBMITTING,
                        RequestState.SUBMISSION_FAILED,
                        RequestState.NO_SOURCES,
                        RequestState.ONLY_TAPE_SOURCES,
                        RequestState.MISMATCH_SCHEME
                    ],
                    limit=db_bulk,
                    older_than=datetime.datetime.utcnow(),
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    mode_all=True,
                    hash_variable='rule_id')
                record_timer('daemons.conveyor.finisher.000-get_next',
                             (time.time() - time1) * 1000)
                time2 = time.time()
                if reqs:
                    logger(logging.DEBUG,
                           'Updating %i requests for activity %s', len(reqs),
                           activity)

                for chunk in chunks(reqs, bulk):
                    try:
                        time3 = time.time()
                        __handle_requests(chunk,
                                          suspicious_patterns,
                                          retry_protocol_mismatches,
                                          logger=logger)
                        record_timer(
                            'daemons.conveyor.finisher.handle_requests',
                            (time.time() - time3) * 1000 /
                            (len(chunk) if chunk else 1))
                        record_counter(
                            'daemons.conveyor.finisher.handle_requests',
                            len(chunk))
                    except Exception as error:
                        logger(logging.WARNING, '%s', str(error))
                if reqs:
                    logger(
                        logging.DEBUG,
                        'Finished updating %s requests for activity %s in %s seconds',
                        len(reqs), activity,
                        time.time() - time2)

        except (DatabaseException, DatabaseError) as error:
            if re.match('.*ORA-00054.*', error.args[0]) or re.match(
                    '.*ORA-00060.*',
                    error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logger(logging.WARNING,
                       'Lock detected when handling request - skipping: %s',
                       str(error))
            else:
                logger(logging.ERROR, 'Exception', exc_info=True)
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)

        if once:
            return

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
                logger(logging.INFO, 'Sleeping for %s seconds',
                   (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)

    logger(logging.INFO, 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
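The formatted_logger helper is not shown in this excerpt. A minimal sketch of what such a wrapper could look like, assuming it simply binds the thread prefix onto logging.log (the real helper may differ):

import logging

def formatted_logger(log_func, fmt_template):
    # fmt_template is e.g. 'conveyor-finisher[1/4] : %s'; msg is substituted
    # into it, then log_func applies the usual %-style arguments lazily.
    def log(level, msg, *args, **kwargs):
        return log_func(level, fmt_template % msg, *args, **kwargs)
    return log

logger = formatted_logger(logging.log, 'conveyor-finisher[1/4] : %s')
logger(logging.INFO, 'Finisher starting - db_bulk(%i) bulk (%i)', 1000, 100)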
Code example #7
def poller(once=False,
           activities=None,
           sleep_time=60,
           fts_bulk=100,
           db_bulk=1000,
           older_than=60,
           activity_shares=None,
           partition_wait_time=10):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    multi_vo = config_get_bool('common', 'multi_vo', False, None)
    logger_prefix = executable = 'conveyor-poller'
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)
    if activity_shares:
        activities.sort()
        executable += ' --activity_shares ' + str(activity_shares)
    if FILTER_TRANSFERTOOL:
        executable += ' --filter-transfertool ' + FILTER_TRANSFERTOOL

    with HeartbeatHandler(executable=executable,
                          logger_prefix=logger_prefix) as heartbeat_handler:
        logger = heartbeat_handler.logger
        logger(
            logging.INFO,
            'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' %
            (db_bulk, fts_bulk, timeout))
        activity_next_exe_time = defaultdict(time.time)

        if partition_wait_time:
            # To prevent running on the same partition if all the pollers restart at the same time
            graceful_stop.wait(partition_wait_time)
        while not graceful_stop.is_set():

            try:
                heart_beat, logger = heartbeat_handler.live(older_than=3600)
                if activities is None:
                    activities = [None]
                for activity in activities:
                    if activity_next_exe_time[activity] > time.time():
                        graceful_stop.wait(1)
                        continue

                    start_time = time.time()
                    logger(
                        logging.DEBUG,
                        'Start to poll transfers older than %i seconds for activity %s using transfer tool: %s'
                        % (older_than, activity, FILTER_TRANSFERTOOL))
                    transfs = request_core.get_next(
                        request_type=[
                            RequestType.TRANSFER, RequestType.STAGEIN,
                            RequestType.STAGEOUT
                        ],
                        state=[RequestState.SUBMITTED],
                        limit=db_bulk,
                        older_than=datetime.datetime.utcnow() -
                        datetime.timedelta(seconds=older_than)
                        if older_than else None,
                        total_workers=heart_beat['nr_threads'],
                        worker_number=heart_beat['assign_thread'],
                        mode_all=True,
                        hash_variable='id',
                        activity=activity,
                        activity_shares=activity_shares,
                        transfertool=FILTER_TRANSFERTOOL)

                    record_timer('daemons.conveyor.poller.get_next',
                                 (time.time() - start_time) * 1000)

                    if TRANSFER_TOOL and not FILTER_TRANSFERTOOL:
                        # only keep transfers which don't have any transfertool set, or have one equal to TRANSFER_TOOL
                        transfs_tmp = [
                            t for t in transfs if not t['transfertool']
                            or t['transfertool'] == TRANSFER_TOOL
                        ]
                        if len(transfs_tmp) != len(transfs):
                            logger(
                                logging.INFO,
                                'Skipping %i transfers because of mismatched transfertool',
                                len(transfs) - len(transfs_tmp))
                        transfs = transfs_tmp

                    if transfs:
                        logger(
                            logging.DEBUG,
                            'Polling %i transfers for activity %s' %
                            (len(transfs), activity))

                    transfs.sort(key=lambda t: (t['external_host'] or '',
                                                t['scope'].vo if multi_vo else '',
                                                t['external_id'] or '',
                                                t['request_id'] or ''))
                    for (external_host, vo), transfers_for_host in groupby(
                            transfs,
                            key=lambda t: (t['external_host'], t['scope'].vo if multi_vo else None)):
                        transfers_by_eid = {}
                        for external_id, xfers in groupby(
                                transfers_for_host,
                                key=lambda t: t['external_id']):
                            transfers_by_eid[external_id] = list(xfers)

                        for chunk in dict_chunks(transfers_by_eid, fts_bulk):
                            try:
                                poll_transfers(external_host=external_host,
                                               transfers_by_eid=chunk,
                                               vo=vo,
                                               timeout=timeout,
                                               logger=logger)
                            except Exception:
                                logger(logging.ERROR,
                                       'Exception',
                                       exc_info=True)

                    if len(transfs) < fts_bulk / 2:
                        logger(
                            logging.INFO,
                            "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds"
                            % (len(transfs), activity, fts_bulk, sleep_time))
                        if activity_next_exe_time[activity] < time.time():
                            activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                logger(logging.CRITICAL, "Exception", exc_info=True)
                if once:
                    raise

            if once:
                break
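dict_chunks, used above to split transfers_by_eid into FTS-sized batches, is also not shown in the excerpt. A hypothetical equivalent (the real helper may differ):

from itertools import islice

def dict_chunks(dictionary, chunk_size):
    # Yield sub-dictionaries of at most chunk_size entries each.
    it = iter(dictionary)
    for _ in range(0, len(dictionary), chunk_size):
        yield {key: dictionary[key] for key in islice(it, chunk_size)}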
Code example #8
def finisher(once=False,
             process=0,
             total_processes=1,
             thread=0,
             total_threads=1,
             sleep_time=60,
             activities=None,
             bulk=100,
             db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.
    """

    logging.info(
        'finisher starting - process (%i/%i) thread (%i/%i) db_bulk(%i) bulk (%i)'
        % (process, total_processes, thread, total_threads, db_bulk, bulk))
    try:
        suspicious_patterns = []
        pattern = get(section='conveyor',
                      option='suspicious_pattern',
                      session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            suspicious_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        suspicious_patterns = []
    logging.debug("Suspicious patterns: %s" %
                  [pat.pattern for pat in suspicious_patterns])

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    graceful_stop.wait(1)
    sleeping = False
    while not graceful_stop.is_set():

        try:
            hb = heartbeat.live(executable,
                                hostname,
                                pid,
                                hb_thread,
                                older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1,
                                                 hb['nr_threads'])

            if activities is None:
                activities = [None]

            if sleeping:
                logging.info(prepend_str +
                             'Nothing to do. will sleep %s seconds' %
                             (sleep_time))
                time.sleep(sleep_time)

            sleeping = True
            for activity in activities:
                ts = time.time()
                reqs = request.get_next(
                    request_type=[
                        RequestType.TRANSFER, RequestType.STAGEIN,
                        RequestType.STAGEOUT
                    ],
                    state=[
                        RequestState.DONE, RequestState.FAILED,
                        RequestState.LOST, RequestState.SUBMITTING,
                        RequestState.SUBMISSION_FAILED,
                        RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES
                    ],
                    limit=db_bulk,
                    older_than=datetime.datetime.utcnow(),
                    activity=activity,
                    process=process,
                    total_processes=total_processes,
                    thread=hb['assign_thread'],
                    total_threads=hb['nr_threads'])
                record_timer('daemons.conveyor.finisher.000-get_next',
                             (time.time() - ts) * 1000)
                if reqs:
                    logging.debug(prepend_str +
                                  'Updating %i requests for activity %s' %
                                  (len(reqs), activity))
                    sleeping = False

                for chunk in chunks(reqs, bulk):
                    try:
                        ts = time.time()
                        common.handle_requests(chunk, suspicious_patterns)
                        record_timer(
                            'daemons.conveyor.finisher.handle_requests',
                            (time.time() - ts) * 1000 /
                            (len(chunk) if len(chunk) else 1))
                        record_counter(
                            'daemons.conveyor.finisher.handle_requests',
                            len(chunk))
                    except Exception:
                        logging.warning(traceback.format_exc())
                if reqs:
                    logging.debug(
                        prepend_str +
                        'Finished updating %s requests for activity %s'
                        % (len(reqs), activity))

        except (DatabaseException, DatabaseError) as error:
            if isinstance(
                    error.args[0],
                    tuple) and (re.match('.*ORA-00054.*', error.args[0][0]) or
                                ('ERROR 1205 (HY000)' in error.args[0][0])):
                logging.warning(
                    prepend_str +
                    'Lock detected when handling request - skipping: %s' %
                    (str(error)))
            else:
                logging.error(prepend_str + '%s' % (traceback.format_exc()))
            sleeping = False
        except Exception:
            sleeping = False
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))

        if once:
            return

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
Code example #9
def run_once(bulk, db_bulk, suspicious_patterns, retry_protocol_mismatches,
             heartbeat_handler, activity):
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        logger(logging.DEBUG, 'Working on activity %s', activity)
        time1 = time.time()
        reqs = request_core.get_next(
            request_type=[
                RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT
            ],
            state=[
                RequestState.DONE, RequestState.FAILED, RequestState.LOST,
                RequestState.SUBMITTING, RequestState.SUBMISSION_FAILED,
                RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES,
                RequestState.MISMATCH_SCHEME
            ],
            limit=db_bulk,
            older_than=datetime.datetime.utcnow(),
            total_workers=total_workers,
            worker_number=worker_number,
            mode_all=True,
            include_dependent=False,
            hash_variable='rule_id')
        record_timer('daemons.conveyor.finisher.get_next',
                     (time.time() - time1) * 1000)
        time2 = time.time()
        if reqs:
            logger(logging.DEBUG, 'Updating %i requests for activity %s',
                   len(reqs), activity)

        for chunk in chunks(reqs, bulk):
            try:
                worker_number, total_workers, logger = heartbeat_handler.live()
                time3 = time.time()
                __handle_requests(chunk,
                                  suspicious_patterns,
                                  retry_protocol_mismatches,
                                  logger=logger)
                record_timer('daemons.conveyor.finisher.handle_requests_time',
                             (time.time() - time3) * 1000 /
                             (len(chunk) if chunk else 1))
                record_counter('daemons.conveyor.finisher.handle_requests',
                               delta=len(chunk))
            except Exception as error:
                logger(logging.WARNING, '%s', str(error))
        if reqs:
            logger(
                logging.DEBUG,
                'Finished updating %s requests for activity %s in %s seconds',
                len(reqs), activity,
                time.time() - time2)

    except (DatabaseException, DatabaseError) as error:
        if re.match('.*ORA-00054.*', error.args[0]) or re.match(
                '.*ORA-00060.*',
                error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
            logger(logging.WARNING,
                   'Lock detected when handling request - skipping: %s',
                   str(error))
        else:
            raise
    return True
Code example #10
File: poller.py Project: rob-c/rucio
def poller(once=False,
           activities=None,
           sleep_time=60,
           fts_bulk=100,
           db_bulk=1000,
           older_than=60,
           activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    executable = 'conveyor-poller'
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)
    if activity_shares:
        activities.sort()
        executable += ' --activity_shares ' + str(activity_shares)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str +
                 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' %
                 (db_bulk, fts_bulk, timeout))

    # To prevent running on the same partition if all the pollers restart at the same time
    time.sleep(10)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Poller started')

    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                start_time = time.time()
                logging.debug(
                    prepend_str +
                    'Start to poll transfers older than %i seconds for activity %s'
                    % (older_than, activity))
                transfs = request_core.get_next(
                    request_type=[
                        RequestType.TRANSFER, RequestType.STAGEIN,
                        RequestType.STAGEOUT
                    ],
                    state=[RequestState.SUBMITTED],
                    limit=db_bulk,
                    older_than=datetime.datetime.utcnow() -
                    datetime.timedelta(seconds=older_than),
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    mode_all=False,
                    hash_variable='id',
                    activity=activity,
                    activity_shares=activity_shares)

                record_timer('daemons.conveyor.poller.000-get_next',
                             (time.time() - start_time) * 1000)

                if transfs:
                    logging.debug(prepend_str +
                                  'Polling %i transfers for activity %s' %
                                  (len(transfs), activity))

                xfers_ids = {}
                for transf in transfs:
                    if transf['external_host'] not in xfers_ids:
                        xfers_ids[transf['external_host']] = []
                    xfers_ids[transf['external_host']].append(
                        (transf['external_id'], transf['request_id']))

                for external_host in xfers_ids:
                    external_ids = list({trf[0] for trf in xfers_ids[external_host]})
                    request_ids = [trf[1] for trf in xfers_ids[external_host]]
                    for xfers in chunks(external_ids, fts_bulk):
                        # poll transfers
                        poll_transfers(external_host=external_host,
                                       xfers=xfers,
                                       prepend_str=prepend_str,
                                       request_ids=request_ids,
                                       timeout=timeout)

                if len(transfs) < fts_bulk / 2:
                    logging.info(
                        prepend_str +
                        "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds"
                        % (len(transfs), activity, fts_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + "%s" % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
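The activity_next_exe_time = defaultdict(time.time) trick used by both poller variants deserves a note: the default factory stamps each activity with the current time on first access, so a newly seen activity is polled immediately, while a near-empty poll pushes its next run sleep_time seconds into the future. A self-contained illustration (the activity name is made up):

import time
from collections import defaultdict

activity_next_exe_time = defaultdict(time.time)
# First access creates the entry with "now", so the activity is
# immediately eligible for polling.
assert activity_next_exe_time['Data Brokering'] <= time.time()
# After an idle cycle, defer the activity by sleep_time seconds.
activity_next_exe_time['Data Brokering'] = time.time() + 60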
Code example #11
File: finisher.py Project: panta-123/rucio
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000, partition_wait_time=10):
    """
    Main loop to update the replicas and rules based on finished requests.
    """
    try:
        conveyor_config = {item[0]: item[1] for item in items('conveyor')}
    except ConfigNotFound:
        logging.log(logging.INFO, 'No configuration found for conveyor')
        conveyor_config = {}

    # Get suspicious patterns
    suspicious_patterns = conveyor_config.get('suspicious_pattern', [])
    if suspicious_patterns:
        pattern = str(suspicious_patterns)
        patterns = pattern.split(",")
        suspicious_patterns = [re.compile(pat.strip()) for pat in patterns]
    logging.log(logging.DEBUG, "Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns])

    retry_protocol_mismatches = conveyor_config.get('retry_protocol_mismatches', False)

    logger_prefix = executable = 'conveyor-finisher'
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)

    with HeartbeatHandler(executable=executable, logger_prefix=logger_prefix) as heartbeat_handler:
        logger = heartbeat_handler.logger
        logger(logging.INFO, 'Finisher starting - db_bulk(%i) bulk (%i)', db_bulk, bulk)

        if partition_wait_time:
            graceful_stop.wait(partition_wait_time)
        while not graceful_stop.is_set():

            start_time = time.time()
            try:
                heart_beat, logger = heartbeat_handler.live(older_than=3600)
                if activities is None:
                    activities = [None]

                for activity in activities:
                    logger(logging.DEBUG, 'Working on activity %s', activity)
                    time1 = time.time()
                    reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                 state=[RequestState.DONE, RequestState.FAILED,
                                                        RequestState.LOST, RequestState.SUBMITTING,
                                                        RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES,
                                                        RequestState.ONLY_TAPE_SOURCES, RequestState.MISMATCH_SCHEME],
                                                 limit=db_bulk,
                                                 older_than=datetime.datetime.utcnow(),
                                                 total_workers=heart_beat['nr_threads'],
                                                 worker_number=heart_beat['assign_thread'],
                                                 mode_all=True,
                                                 hash_variable='rule_id')
                    record_timer('daemons.conveyor.finisher.get_next', (time.time() - time1) * 1000)
                    time2 = time.time()
                    if reqs:
                        logger(logging.DEBUG, 'Updating %i requests for activity %s', len(reqs), activity)

                    for chunk in chunks(reqs, bulk):
                        try:
                            time3 = time.time()
                            __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, logger=logger)
                            record_timer('daemons.conveyor.finisher.handle_requests_time', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1))
                            record_counter('daemons.conveyor.finisher.handle_requests', delta=len(chunk))
                        except Exception as error:
                            logger(logging.WARNING, '%s', str(error))
                    if reqs:
                        logger(logging.DEBUG, 'Finished updating %s requests for activity %s in %s seconds', len(reqs), activity, time.time() - time2)

            except (DatabaseException, DatabaseError) as error:
                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                else:
                    logger(logging.ERROR, 'Exception', exc_info=True)
            except Exception:
                logger(logging.CRITICAL, 'Exception', exc_info=True)
                if once:
                    raise

            if once:
                break

            daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)
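daemon_sleep, called at the bottom of the loop, replaces the inline end_time/time_diff arithmetic of examples #5 and #6. A minimal sketch of the assumed behaviour: sleep away whatever is left of sleep_time, waking early if graceful_stop is set (the real helper may differ):

import logging
import time

def daemon_sleep(start_time, sleep_time, graceful_stop, logger=None):
    # Sleep for the remainder of the cycle, if any time is left over.
    time_diff = time.time() - start_time
    if time_diff < sleep_time:
        if logger:
            logger(logging.INFO, 'Sleeping for %s seconds', sleep_time - time_diff)
        graceful_stop.wait(sleep_time - time_diff)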
Code example #12
File: poller.py Project: pradeepjasal/rucio
def run_once(fts_bulk, db_bulk, older_than, activity_shares, multi_vo, timeout,
             activity, heartbeat_handler):
    worker_number, total_workers, logger = heartbeat_handler.live()

    start_time = time.time()
    logger(
        logging.DEBUG,
        'Start to poll transfers older than %i seconds for activity %s using transfer tool: %s'
        % (older_than, activity, FILTER_TRANSFERTOOL))
    transfs = request_core.get_next(
        request_type=[
            RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT
        ],
        state=[RequestState.SUBMITTED],
        limit=db_bulk,
        older_than=datetime.datetime.utcnow() -
        datetime.timedelta(seconds=older_than) if older_than else None,
        total_workers=total_workers,
        worker_number=worker_number,
        mode_all=True,
        hash_variable='id',
        activity=activity,
        activity_shares=activity_shares,
        transfertool=FILTER_TRANSFERTOOL)

    record_timer('daemons.conveyor.poller.get_next',
                 (time.time() - start_time) * 1000)

    if TRANSFER_TOOL and not FILTER_TRANSFERTOOL:
        # only keep transfers which don't have any transfertool set, or have one equal to TRANSFER_TOOL
        transfs_tmp = [
            t for t in transfs
            if not t['transfertool'] or t['transfertool'] == TRANSFER_TOOL
        ]
        if len(transfs_tmp) != len(transfs):
            logger(
                logging.INFO,
                'Skipping %i transfers because of mismatched transfertool',
                len(transfs) - len(transfs_tmp))
        transfs = transfs_tmp

    if transfs:
        logger(
            logging.DEBUG,
            'Polling %i transfers for activity %s' % (len(transfs), activity))

    transfs.sort(key=lambda t: (t['external_host'] or '',
                                t['scope'].vo if multi_vo else '',
                                t['external_id'] or '',
                                t['request_id'] or ''))
    for (external_host, vo), transfers_for_host in groupby(
            transfs,
            key=lambda t: (t['external_host'], t['scope'].vo if multi_vo else None)):
        transfers_by_eid = {}
        for external_id, xfers in groupby(transfers_for_host,
                                          key=lambda t: t['external_id']):
            transfers_by_eid[external_id] = {t['request_id']: t for t in xfers}

        for chunk in dict_chunks(transfers_by_eid, fts_bulk):
            try:
                if TRANSFER_TOOL == 'globus':
                    transfertool_obj = GlobusTransferTool(external_host=None)
                else:
                    transfertool_obj = FTS3Transfertool(
                        external_host=external_host, vo=vo)
                worker_number, total_workers, logger = heartbeat_handler.live()
                poll_transfers(transfertool_obj=transfertool_obj,
                               transfers_by_eid=chunk,
                               timeout=timeout,
                               logger=logger)
            except Exception:
                logger(logging.ERROR, 'Exception', exc_info=True)

    queue_empty = False
    if len(transfs) < fts_bulk / 2:
        logger(
            logging.INFO,
            "Only %s transfers for activity %s, which is less than half of the bulk %s"
            % (len(transfs), activity, fts_bulk))
        queue_empty = True

    return queue_empty
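run_once returns queue_empty so that the surrounding daemon loop can decide whether to back off. An illustrative driver loop, reusing graceful_stop and heartbeat_handler from the examples above; this is an assumption, not Rucio's actual run_daemon machinery:

while not graceful_stop.is_set():
    queue_empty = run_once(fts_bulk=100, db_bulk=1000, older_than=60,
                           activity_shares=None, multi_vo=False, timeout=None,
                           activity=None, heartbeat_handler=heartbeat_handler)
    if queue_empty:
        # Nothing close to a full bulk was found; wait before polling again.
        graceful_stop.wait(60)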