Example #1
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'] - 1,
                worker_number=heart_beat['assign_thread'],
                limit=bulk)
            logging.info(prepend_str + '%s expired replicas returned' %
                         len(expired_replicas))
            logging.debug(prepend_str +
                          'List of expired replicas returned %s' %
                          str(expired_replicas))
            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                })
                bad_replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': BadFilesStatus.TEMPORARY_UNAVAILABLE
                })
            session = get_session()

            for chunk in chunks(replicas, chunk_size):
                # Process and update the replicas in chunks
                update_replicas_states(chunk, nowait=False, session=session)
            for chunk in chunks(bad_replicas, chunk_size):
                # Remove the replicas from bad_replicas table in chunks
                bulk_delete_bad_replicas(chunk, session=session)

            session.commit()  # pylint: disable=no-member

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
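Example #2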
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=bulk)
            logging.info('%s %s expired replicas returned', prepend_str, len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s', prepend_str, str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s', prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning('%s One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str, scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s DID %s:%s does not exist anymore.', prepend_str, scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s Replica %s:%s on RSEID %s does not exist anymore.', prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info('%s Will sleep for %s seconds', prepend_str, sleep_time - tottime)
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
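Both versions above shard the list of expired replicas across daemon threads via the heartbeat's (assign_thread, nr_threads) pair, so each worker only touches its own slice. A minimal stdlib-only sketch of that partitioning idea (my_share and its hashing are illustrative, not part of the Rucio API):

import zlib

def my_share(rows, worker_number, total_workers):
    """Return the subset of rows this worker is responsible for."""
    if total_workers <= 1:
        return list(rows)
    return [row for row in rows
            if zlib.crc32(repr(row).encode()) % total_workers == worker_number]

rows = [('scope', 'file_%d' % i, 'rse_id') for i in range(10)]
print(my_share(rows, worker_number=0, total_workers=3))  # this worker's slice only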
Example #3
def poller(once=False,
           activities=None,
           sleep_time=60,
           fts_bulk=100,
           db_bulk=1000,
           older_than=60,
           activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    executable = sys.argv[0]
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)
    if activity_shares:
        executable += ' --activity_shares ' + str(activity_shares)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' % (db_bulk, fts_bulk, timeout))

    time.sleep(10)  # To prevent running on the same partition if all the pollers restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Poller started')

    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                start_time = time.time()
                logging.debug(prepend_str + 'Start to poll transfers older than %i seconds for activity %s' % (older_than, activity))
                transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                state=[RequestState.SUBMITTED],
                                                limit=db_bulk,
                                                older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                                total_workers=heart_beat['nr_threads'] - 1,
                                                worker_number=heart_beat['assign_thread'],
                                                mode_all=False,
                                                hash_variable='id',
                                                activity=activity,
                                                activity_shares=activity_shares)

                record_timer('daemons.conveyor.poller.000-get_next',
                             (time.time() - start_time) * 1000)

                if transfs:
                    logging.debug(prepend_str +
                                  'Polling %i transfers for activity %s' %
                                  (len(transfs), activity))

                xfers_ids = {}
                for transf in transfs:
                    if not transf['external_host'] in xfers_ids:
                        xfers_ids[transf['external_host']] = []
                    xfers_ids[transf['external_host']].append(
                        (transf['external_id'], transf['request_id']))

                for external_host in xfers_ids:
                    external_ids = list({trf[0] for trf in xfers_ids[external_host]})
                    request_ids = [trf[1] for trf in xfers_ids[external_host]]
                    for xfers in chunks(external_ids, fts_bulk):
                        # poll transfers
                        poll_transfers(external_host=external_host,
                                       xfers=xfers,
                                       prepend_str=prepend_str,
                                       request_ids=request_ids,
                                       timeout=timeout)

                if len(transfs) < fts_bulk / 2:
                    logging.info(prepend_str + 'Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds' % (len(transfs), activity, fts_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + "%s" % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
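The poller throttles work per activity with activity_next_exe_time = defaultdict(time.time): a previously unseen activity is eligible immediately, and a light cycle pushes the activity's next run sleep_time seconds into the future. A self-contained sketch of that pattern (should_run and defer are hypothetical helper names):

import time
from collections import defaultdict

SLEEP_TIME = 60
activity_next_exe_time = defaultdict(time.time)

def should_run(activity):
    # First access stores the current time, so a new activity runs right away.
    return activity_next_exe_time[activity] <= time.time()

def defer(activity):
    # After a light cycle, push this activity's next run into the future.
    if activity_next_exe_time[activity] < time.time():
        activity_next_exe_time[activity] = time.time() + SLEEP_TIME

print(should_run('User Subscriptions'))  # True on first sight
defer('User Subscriptions')
print(should_run('User Subscriptions'))  # False until SLEEP_TIME has elapsed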
Example #4
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable='rucio-undertaker', hostname=hostname)

    paused_dids = {}  # {(scope, name): datetime}

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable='rucio-undertaker',
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             older_than=6000)
            logging.info(
                'Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'
                .format(locals()))

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'] + 1,
                                     total_workers=heartbeat['nr_threads'],
                                     limit=10000)

            dids = [
                did for did in dids
                if (did['scope'], did['name']) not in paused_dids
            ]

            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.',
                             worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete',
                                 worker_number, len(chunk))
                    delete_dids(dids=chunk,
                                account=InternalAccount('root'),
                                expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids',
                                 worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids',
                                   delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except (DatabaseException, DatabaseError,
                        UnsupportedOperation) as e:
                    if match('.*ORA-00054.*', str(e.args[0])) or match('.*55P03.*', str(e.args[0])) or match('.*3572.*', str(e.args[0])):
                        for did in chunk:
                            paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                        record_counter(
                            'undertaker.delete_dids.exceptions.LocksDetected')
                        logging.warning(
                            'undertaker[%s/%s]: Locks detected for chunk',
                            heartbeat['assign_thread'],
                            heartbeat['nr_threads'])
                    else:
                        logging.error('Undertaker(%s): Got database error %s.',
                                      worker_number, str(e))
        except Exception:
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable='rucio-undertaker',
        hostname=hostname,
        pid=pid,
        thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
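On row-lock errors (ORA-00054, 55P03, 3572) the undertaker parks the affected DIDs for a randomized 600-2400 seconds instead of retrying immediately, then filters them out of the next selection. A stdlib-only sketch of that paused_dids backoff (pause and refresh_and_filter are hypothetical helpers):

from datetime import datetime, timedelta
from random import randint

paused_dids = {}  # {(scope, name): datetime when the pause expires}

def pause(dids):
    """Park DIDs that hit lock contention for a randomized backoff window."""
    for scope, name in dids:
        paused_dids[(scope, name)] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))

def refresh_and_filter(candidates):
    """Drop expired pauses, then skip DIDs that are still paused."""
    now = datetime.utcnow()
    for key in [k for k, v in paused_dids.items() if v <= now]:
        del paused_dids[key]
    return [did for did in candidates if did not in paused_dids]

pause([('user.jdoe', 'dataset_a')])
print(refresh_and_filter([('user.jdoe', 'dataset_a'), ('user.jdoe', 'dataset_b')]))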
Example #5
def re_evaluator(once=False):
    """
    Main loop to check the re-evaluation of dids.
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_dids = {}  # {(scope, name): datetime}

    # Make an initial heartbeat so that all judge-evaluators have the correct worker number on the next try
    executable = 'judge-evaluator'
    live(executable=executable,
         hostname=hostname,
         pid=pid,
         thread=current_thread,
         older_than=60 * 30)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=current_thread,
                             older_than=60 * 30)

            start = time.time()  # NOQA

            # Refresh paused dids
            paused_dids = {k: v for k, v in paused_dids.items() if datetime.utcnow() < v}

            # Select a bunch of dids for re evaluation for this worker
            dids = get_updated_dids(total_workers=heartbeat['nr_threads'],
                                    worker_number=heartbeat['assign_thread'],
                                    limit=100,
                                    blacklisted_dids=[
                                        (InternalScope(key[0],
                                                       fromExternal=False),
                                         key[1]) for key in paused_dids
                                    ])
            logging.debug('re_evaluator[%s/%s] index query time %f fetch size is %d (%d blacklisted)'
                          % (heartbeat['assign_thread'], heartbeat['nr_threads'],
                             time.time() - start, len(dids), len(paused_dids)))

            # If the list is empty, send the worker to sleep
            if not dids and not once:
                logging.debug('re_evaluator[%s/%s] did not get any work (paused_dids=%s)'
                              % (heartbeat['assign_thread'], heartbeat['nr_threads'], str(len(paused_dids))))
                graceful_stop.wait(30)
            else:
                done_dids = {}
                for did in dids:
                    if graceful_stop.is_set():
                        break

                    # Check if this did has already been operated on
                    did_tag = '%s:%s' % (did.scope.internal, did.name)
                    if did_tag in done_dids:
                        if did.rule_evaluation_action in done_dids[did_tag]:
                            logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s already done'
                                          % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                            delete_updated_did(id=did.id)
                            continue
                    else:
                        done_dids[did_tag] = []

                    # Jump paused dids
                    if (did.scope.internal, did.name) in paused_dids:
                        continue

                    try:
                        start_time = time.time()
                        re_evaluate_did(
                            scope=did.scope,
                            name=did.name,
                            rule_evaluation_action=did.rule_evaluation_action)
                        logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s took %f'
                                      % (heartbeat['assign_thread'], heartbeat['nr_threads'],
                                         did.scope, did.name, time.time() - start_time))
                        delete_updated_did(id=did.id)
                        done_dids[did_tag].append(did.rule_evaluation_action)
                    except DataIdentifierNotFound:
                        delete_updated_did(id=did.id)
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_dids[(did.scope.internal, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s'
                                            % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        elif match('.*QueuePool.*', str(e.args[0])) or match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s'
                                        % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
                    except FlushError as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Flush error for %s:%s'
                                        % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name))
        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])) or match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
            else:
                logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)

        if once:
            break

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=current_thread)
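Within a single fetched batch the same DID can show up several times with the same rule_evaluation_action; the done_dids bookkeeping above evaluates only the first occurrence and simply deletes the duplicates. A simplified, self-contained sketch of that deduplication:

done_dids = {}  # {did_tag: [actions already evaluated in this batch]}

def process(did_tag, action):
    """Evaluate each (did, action) pair at most once per batch."""
    if action in done_dids.get(did_tag, []):
        return 'skip (already done)'
    # ... re_evaluate_did(...) would run here ...
    done_dids.setdefault(did_tag, []).append(action)
    return 'evaluated'

for tag, action in [('scope:name1', 'ATTACH'), ('scope:name1', 'ATTACH'), ('scope:name1', 'DETACH')]:
    print(tag, action, '->', process(tag, action))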
Example #6
def kronos_file(once=False,
                thread=0,
                brokers_resolved=None,
                dataset_queue=None,
                sleep_time=60):
    """
    Main loop to consume tracer reports.
    """

    logging.info('tracer consumer starting')

    hostname = gethostname()
    pid = getpid()
    thread = current_thread()

    chunksize = config_get_int('tracer-kronos', 'chunksize')
    prefetch_size = config_get_int('tracer-kronos', 'prefetch_size')
    subscription_id = config_get('tracer-kronos', 'subscription_id')
    try:
        bad_files_patterns = []
        pattern = get(section='kronos',
                      option='bad_files_patterns',
                      session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            bad_files_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        bad_files_patterns = []
    except Exception as error:
        logging.error('(kronos_file) Failed to get bad_files_patterns: %s', str(error))
        bad_files_patterns = []

    use_ssl = True
    try:
        use_ssl = config_get_bool('tracer-kronos', 'use_ssl')
    except Exception:
        pass

    if not use_ssl:
        username = config_get('tracer-kronos', 'username')
        password = config_get('tracer-kronos', 'password')

    excluded_usrdns = set(
        config_get('tracer-kronos', 'excluded_usrdns').split(','))

    conns = []
    for broker in brokers_resolved:
        if not use_ssl:
            conns.append(Connection(host_and_ports=[(broker, config_get_int('tracer-kronos', 'port'))],
                                    use_ssl=False,
                                    reconnect_attempts_max=config_get_int('tracer-kronos', 'reconnect_attempts')))
        else:
            conns.append(Connection(host_and_ports=[(broker, config_get_int('tracer-kronos', 'port'))],
                                    use_ssl=True,
                                    ssl_key_file=config_get('tracer-kronos', 'ssl_key_file'),
                                    ssl_cert_file=config_get('tracer-kronos', 'ssl_cert_file'),
                                    ssl_version=PROTOCOL_TLSv1,
                                    reconnect_attempts_max=config_get_int('tracer-kronos', 'reconnect_attempts')))

    logging.info('(kronos_file) tracer consumer started')

    sanity_check(executable='kronos-file', hostname=hostname)
    while not graceful_stop.is_set():
        start_time = time()
        live(executable='kronos-file',
             hostname=hostname,
             pid=pid,
             thread=thread)
        for conn in conns:
            if not conn.is_connected():
                logging.info('(kronos_file) connecting to %s' % conn.transport._Transport__host_and_ports[0][0])
                record_counter('daemons.tracer.kronos.reconnect.%s' % conn.transport._Transport__host_and_ports[0][0].split('.')[0])
                conn.set_listener(
                    'rucio-tracer-kronos',
                    AMQConsumer(
                        broker=conn.transport._Transport__host_and_ports[0],
                        conn=conn,
                        queue=config_get('tracer-kronos', 'queue'),
                        chunksize=chunksize,
                        subscription_id=subscription_id,
                        excluded_usrdns=excluded_usrdns,
                        dataset_queue=dataset_queue,
                        bad_files_patterns=bad_files_patterns))
                conn.start()
                if not use_ssl:
                    conn.connect(username, password)
                else:
                    conn.connect()
                conn.subscribe(
                    destination=config_get('tracer-kronos', 'queue'),
                    ack='client-individual',
                    id=subscription_id,
                    headers={'activemq.prefetchSize': prefetch_size})
        tottime = time() - start_time
        if tottime < sleep_time:
            logging.info('(kronos_file) Will sleep for %s seconds' %
                         (sleep_time - tottime))
            sleep(sleep_time - tottime)

    logging.info('(kronos_file) graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    die(executable='kronos-file', hostname=hostname, pid=pid, thread=thread)
    logging.info('(kronos_file) graceful stop done')
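Like most of these daemons, kronos_file subtracts each cycle's work time from sleep_time so iterations start on a roughly fixed period. A minimal sketch of that sleep accounting (run_cycles is an illustrative name):

import time

def run_cycles(work, sleep_time=2, cycles=3):
    """Sleep only for the remainder of sleep_time after each unit of work."""
    for _ in range(cycles):
        start_time = time.time()
        work()
        tottime = time.time() - start_time
        if tottime < sleep_time:
            time.sleep(sleep_time - tottime)

run_cycles(lambda: time.sleep(0.5))  # each cycle still takes ~2 seconds in total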
Example #7
def reaper(rses, chunk_size=100, once=False, scheme=None, sleep_time=300):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param sleep_time: Thread sleep time after each chunk of work.
    """

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256(
        (sys.argv[0] + ''.join(rses)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    # heartbeat
    heartbeat = live(executable=executable,
                     hostname=hostname,
                     pid=pid,
                     thread=thread,
                     hash_executable=hash_executable)
    prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'],
                                              heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Starting Dark Reaper on RSEs: %s', ', '.join(rses))

    if not once:
        logger(logging.INFO, 'Waiting for heartbeat synchronization')
        GRACEFUL_STOP.wait(10)  # To prevent running on the same partition if all the reapers restart at the same time

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            prepend_str = 'dark-reaper [%i/%i] : ' % (
                heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True
            start_time = time.time()

            rses_to_process = list(
                set(rses) & set(list_rses_with_quarantined_replicas()))
            random.shuffle(rses_to_process)
            for rse_id in rses_to_process:
                # The following query returns the list of real replicas (deleted_replicas) and list of dark replicas (dark_replicas)
                # Real replicas can be directly removed from the quarantine table
                deleted_replicas, dark_replicas = list_quarantined_replicas(
                    rse_id=rse_id,
                    limit=chunk_size,
                    worker_number=heartbeat['assign_thread'],
                    total_workers=heartbeat['nr_threads'])

                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                rse = rse_info['rse']
                prot = rsemgr.create_protocol(rse_info,
                                              'delete',
                                              scheme=scheme)

                heartbeat = live(executable=executable,
                                 hostname=hostname,
                                 pid=pid,
                                 thread=thread,
                                 hash_executable=hash_executable)
                prepend_str = 'dark-reaper [%i/%i] : ' % (
                    heartbeat['assign_thread'], heartbeat['nr_threads'])
                logger = formatted_logger(logging.log, prepend_str + '%s')
                try:
                    prot.connect()
                    for replica in dark_replicas:
                        nothing_to_do = False
                        scope = ''
                        if replica['scope']:
                            scope = replica['scope'].external
                        try:
                            pfn = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                            lfns=[{'scope': scope,
                                                                   'name': replica['name'],
                                                                   'path': replica['path']}],
                                                            operation='delete',
                                                            scheme=scheme).values())[0])
                            logger(logging.INFO,
                                   'Deletion ATTEMPT of %s:%s as %s on %s',
                                   scope, replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logger(
                                logging.INFO,
                                'Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                scope, replica['name'], pfn, rse, duration)
                            payload = {
                                'scope': scope,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica.get('bytes') or 0,
                                'bytes': replica.get('bytes') or 0,
                                'url': pfn,
                                'duration': duration,
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-done', payload)
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = (
                                'Deletion NOTFOUND of %s:%s as %s on %s' %
                                (scope, replica['name'], pfn, rse))
                            logger(logging.WARNING, err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            err_msg = (
                                'Deletion NOACCESS of %s:%s as %s on %s: %s' %
                                (scope, replica['name'], pfn, rse, str(error)))
                            logger(logging.WARNING, err_msg)
                            payload = {
                                'scope': scope,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica['bytes'] or 0,
                                'bytes': replica['bytes'] or 0,
                                'url': pfn,
                                'reason': str(error),
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-failed', payload)

                        except Exception:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id,
                                            replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logger(logging.INFO, 'Nothing to do')
                daemon_sleep(start_time=start_time,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

        except DatabaseException as error:
            logging.warning('Reaper: %s', str(error))
        except Exception:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
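The dark reaper logs through a function built by formatted_logger, which bakes the thread prefix into every record so call sites stay short. A minimal stdlib equivalent, assuming the real Rucio helper is essentially this thin wrapper around logging.log:

import logging

def formatted_logger(log_func, fmt):
    """Return a log function that injects a fixed prefix via fmt ('... %s')."""
    def wrapper(level, msg, *args, **kwargs):
        return log_func(level, fmt % msg, *args, **kwargs)
    return wrapper

logging.basicConfig(level=logging.INFO)
logger = formatted_logger(logging.log, 'dark-reaper [1/4] : %s')
logger(logging.INFO, 'Starting Dark Reaper on RSEs: %s', 'MOCK_RSE')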
Example #8
def submitter(once=False, rses=None, partition_wait_time=10,
              bulk=100, group_bulk=1, group_policy='rule', source_strategy=None,
              activities=None, sleep_time=600, max_sources=4, retry_other_fts=False,
              filter_transfertool=FILTER_TRANSFERTOOL, transfertool=TRANSFER_TOOL, transfertype=TRANSFER_TYPE):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = "conveyor-submitter"
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if filter_transfertool:
        executable += ' --filter-transfertool ' + filter_transfertool

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Submitter starting with timeout %s', timeout)

    if partition_wait_time:
        time.sleep(partition_wait_time)  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Transfer submitter started')

    while not graceful_stop.is_set():
        if activities is None:
            activities = [None]
        if rses:
            rse_ids = [rse['id'] for rse in rses]
        else:
            rse_ids = None
        for activity in activities:
            try:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
                prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
                logger = formatted_logger(logging.log, prefix + '%s')

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logger(logging.INFO, 'CMS user transfer activity')
                    user_transfer = True

                logger(logging.INFO, 'Starting to get transfers for %s', activity)
                start_time = time.time()
                transfers = __get_transfers(total_workers=heart_beat['nr_threads'],
                                            worker_number=heart_beat['assign_thread'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts,
                                            transfertool=filter_transfertool,
                                            logger=logger)

                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logger(logging.INFO, 'Got %s transfers for %s in %s seconds', len(transfers), activity, time.time() - start_time)

                # group transfers
                logger(logging.INFO, 'Starting to group transfers for %s', activity)
                start_time = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue, group_by_scope=user_transfer)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logger(logging.INFO, 'Starting to submit transfers for %s', activity)

                if transfertool in ['fts3', 'mock']:
                    for external_host in grouped_jobs:
                        if not user_transfer:
                            for job in grouped_jobs[external_host]:
                                # submit transfers
                                submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                                timeout=timeout, logger=logger, transfertool=transfertool)
                        else:
                            for _, jobs in iteritems(grouped_jobs[external_host]):
                                # submit transfers
                                for job in jobs:
                                    submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                                    timeout=timeout, user_transfer_job=user_transfer, logger=logger, transfertool=transfertool)
                elif transfertool == 'globus':
                    if transfertype == 'bulk':
                        # build bulk job file list per external host to send to submit_transfer
                        for external_host in grouped_jobs:
                            # pad the job with job_params; irrelevant for globus but needed for further rucio parsing
                            submitjob = {'files': [], 'job_params': grouped_jobs[''][0].get('job_params')}
                            for job in grouped_jobs[external_host]:
                                submitjob.get('files').append(job.get('files')[0])
                            logger(logging.DEBUG, 'submitjob: %s' % submitjob)
                            submit_transfer(external_host=external_host, job=submitjob, submitter='transfer_submitter',
                                            timeout=timeout, logger=logger, transfertool=transfertool)
                    else:
                        # build single job files and individually send to submit_transfer
                        job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs else None
                        for external_host in grouped_jobs:
                            for job in grouped_jobs[external_host]:
                                for file in job['files']:
                                    singlejob = {'files': [file], 'job_params': job_params}
                                    logger(logging.DEBUG, 'singlejob: %s' % singlejob)
                                    submit_transfer(external_host=external_host, job=singlejob, submitter='transfer_submitter',
                                                    timeout=timeout, logger=logger, transfertool=transfertool)
                else:
                    logger(logging.ERROR, 'Unknown transfer tool')

                if len(transfers) < group_bulk:
                    logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', len(transfers), activity, group_bulk, sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                logger(logging.CRITICAL, 'Exception', exc_info=True)

        if once:
            break

    logger(logging.INFO, 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
    return
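Several of these examples iterate with a chunks() helper. A stdlib-only definition that behaves the way the call sites assume (fixed-size slices of a list, with a shorter final chunk):

def chunks(seq, n):
    """Yield successive n-sized chunks from seq."""
    for i in range(0, len(seq), n):
        yield seq[i:i + n]

print(list(chunks(list(range(7)), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]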
Example #9
def reaper(rses,
           worker_number=1,
           child_number=1,
           total_children=1,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() +
                 ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            max_deleting_rate = 0
            for rse in sort_rses(rses):
                try:
                    if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable,
                                         hostname=hostname,
                                         pid=pid,
                                         thread=thread,
                                         hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s',
                                 worker_number, child_number, rse['rse'],
                                 nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse['rse'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])

                    if not rse_protocol['availability_delete']:
                        logging.info(
                            'Reaper %s-%s: RSE %s is not available for deletion',
                            worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol['impl'] in ('rucio.rse.protocols.srm.Default', 'rucio.rse.protocols.gsiftp.Default'):
                            protocol['impl'] = 'rucio.rse.protocols.gfal.Default'
                        if protocol['impl'] == 'rucio.rse.protocols.signeds3.Default':
                            protocol['impl'] = 'rucio.rse.protocols.s3es.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(
                            rse_id=rse['id'])
                        logging.info(
                            'Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s'
                            % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s',
                                         worker_number, child_number, rse['rse'])
                        elif total_children and total_children > 0:
                            needed_free_space_per_child = needed_free_space / float(total_children)

                    start = time.time()
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(
                            rse_id=rse['id'],
                            bytes=needed_free_space_per_child,
                            limit=max_being_deleted_files,
                            worker_number=child_number,
                            total_workers=total_children,
                            delay_seconds=delay_seconds)
                    logging.debug(
                        'Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        worker_number, child_number, rse['rse'],
                        needed_free_space_per_child,
                        time.time() - start, len(replicas))

                    if not replicas:
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        logging.info('Reaper %s-%s: No replicas to delete on %s. The next check will occur at %s',
                                     worker_number, child_number, rse['rse'],
                                     nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s',
                                      worker_number, child_number, str(files))
                        try:
                            # Mark the whole chunk as BEING_DELETED before attempting physical deletion
                            update_replicas_states(replicas=[dict(list(replica.items())
                                                                  + [('state', ReplicaState.BEING_DELETED),
                                                                     ('rse_id', rse['id'])])
                                                             for replica in files],
                                                   nowait=True)
                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                               lfns=[{'scope': replica['scope'],
                                                                                      'name': replica['name'],
                                                                                      'path': replica['path']}],
                                                                               operation='delete',
                                                                               scheme=scheme).values())[0])
                                except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (
                                        replica['scope'], replica['name'], rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(
                                counters='reaper.deletion.being_deleted',
                                delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        logging.info(
                                            'Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s',
                                            worker_number, child_number,
                                            replica['scope'], replica['name'],
                                            replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                            logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                                                            worker_number, child_number,
                                                            replica['scope'], replica['name'],
                                                            replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                                                    pfn = get_signed_url(rse_info['sign_url'], 'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s',
                                                                worker_number, child_number,
                                                                replica['scope'], replica['name'],
                                                                replica['pfn'], rse['rse'])
                                        duration = time.time() - start
                                        monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']),
                                                             duration * 1000)

                                        deleted_files.append({'scope': replica['scope'],
                                                              'name': replica['name']})

                                        add_message('deletion-done',
                                                    {'scope': replica['scope'],
                                                     'name': replica['name'],
                                                     'rse': rse_info['rse'],
                                                     'rse_id': rse_info['id'],
                                                     'file-size': replica['bytes'],
                                                     'bytes': replica['bytes'],
                                                     'url': replica['pfn'],
                                                     'duration': duration,
                                                     'protocol': prot.attributes['scheme']})
                                        logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                                     worker_number, child_number,
                                                     replica['scope'], replica['name'],
                                                     replica['pfn'], rse['rse'], duration)
                                    except SourceNotFound:
                                        err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (
                                            worker_number, child_number,
                                            replica['scope'], replica['name'],
                                            replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({'scope': replica['scope'],
                                                              'name': replica['name']})
                                        if replica['state'] == ReplicaState.AVAILABLE:
                                            add_message('deletion-failed',
                                                        {'scope': replica['scope'],
                                                         'name': replica['name'],
                                                         'rse': rse_info['rse'],
                                                         'rse_id': rse_info['id'],
                                                         'file-size': replica['bytes'],
                                                         'bytes': replica['bytes'],
                                                         'url': replica['pfn'],
                                                         'reason': str(err_msg),
                                                         'protocol': prot.attributes['scheme']})
                                    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s',
                                                        worker_number, child_number,
                                                        replica['scope'], replica['name'],
                                                        replica['pfn'], rse['rse'], str(error))
                                        add_message('deletion-failed',
                                                    {'scope': replica['scope'],
                                                     'name': replica['name'],
                                                     'rse': rse_info['rse'],
                                                     'rse_id': rse_info['id'],
                                                     'file-size': replica['bytes'],
                                                     'bytes': replica['bytes'],
                                                     'url': replica['pfn'],
                                                     'reason': str(error),
                                                     'protocol': prot.attributes['scheme']})
                                    except Exception as error:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s',
                                                         worker_number, child_number,
                                                         replica['scope'], replica['name'],
                                                         replica['pfn'], rse['rse'],
                                                         str(traceback.format_exc()))
                                        add_message('deletion-failed',
                                                    {'scope': replica['scope'],
                                                     'name': replica['name'],
                                                     'rse': rse_info['rse'],
                                                     'rse_id': rse_info['id'],
                                                     'file-size': replica['bytes'],
                                                     'bytes': replica['bytes'],
                                                     'url': replica['pfn'],
                                                     'reason': str(error),
                                                     'protocol': prot.attributes['scheme']})
                                    except:
                                        # bare except also catches BaseException subclasses not covered above
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s',
                                                         worker_number, child_number,
                                                         replica['scope'], replica['name'],
                                                         replica['pfn'], rse['rse'],
                                                         str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                for replica in files:
                                    logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s',
                                                    worker_number, child_number,
                                                    replica['scope'], replica['name'],
                                                    replica['pfn'], rse['rse'], str(error))
                                    add_message('deletion-failed',
                                                {'scope': replica['scope'],
                                                 'name': replica['name'],
                                                 'rse': rse_info['rse'],
                                                 'rse_id': rse_info['id'],
                                                 'file-size': replica['bytes'],
                                                 'bytes': replica['bytes'],
                                                 'url': replica['pfn'],
                                                 'reason': str(error),
                                                 'protocol': prot.attributes['scheme']})
                                    # the whole chunk failed with the same error, so report only the first replica
                                    break
                            finally:
                                prot.close()
                            start = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s',
                                          worker_number, child_number, rse['rse'],
                                          len(deleted_files), time.time() - start)
                            monitor.record_counter(counters='reaper.deletion.done',
                                                   delta=len(deleted_files))

                        except DatabaseException as error:
                            logging.warning(
                                'Reaper %s-%s: DatabaseException %s',
                                worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning(
                                'Reaper %s-%s: UnsupportedOperation %s',
                                worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())

                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s',
                                    worker_number, child_number, str(error))

                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            time.sleep(1)

        except DatabaseException as error:
            logging.warning('Reaper: %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
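
A note on the chunks() helper the deletion loop relies on: it is not shown in this excerpt. A minimal sketch of such a batching generator (the actual Rucio utility may differ) looks like this:

def chunks(items, chunk_size):
    """Yield successive chunk_size-sized slices from items."""
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]

# Example: 10 replicas in batches of 4 -> [0..3], [4..7], [8, 9]
for batch in chunks(list(range(10)), 4):
    print(batch)
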
Example No. 10
def deliver_emails(once=False, send_email=True, thread=0, bulk=1000, delay=10):
    '''
    Main loop to deliver emails via SMTP.
    '''
    logging.info('[email] starting - threads (%i) bulk (%i)', thread, bulk)

    executable = 'hermes [email]'
    hostname = socket.getfqdn()
    pid = os.getpid()
    heartbeat_thread = threading.current_thread()

    sanity_check(executable=executable, hostname=hostname)

    # Make an initial heartbeat so that all daemons have the correct worker number on the next try
    live(executable=executable,
         hostname=hostname,
         pid=pid,
         thread=heartbeat_thread)
    GRACEFUL_STOP.wait(1)

    email_from = config_get('messaging-hermes', 'email_from')

    while not GRACEFUL_STOP.is_set():

        heartbeat = live(executable, hostname, pid, heartbeat_thread)
        logging.debug('[email] %i:%i - bulk %i', heartbeat['assign_thread'],
                      heartbeat['nr_threads'], bulk)

        t_start = time.time()

        messages = retrieve_messages(bulk=bulk,
                                     thread=heartbeat['assign_thread'],
                                     total_threads=heartbeat['nr_threads'],
                                     event_type='email')

        if messages:
            to_delete = []
            for message in messages:
                logging.debug('[email] %i:%i - submitting: %s',
                              heartbeat['assign_thread'],
                              heartbeat['nr_threads'], str(message))

                msg = MIMEText(message['payload']['body'], 'plain', 'utf-8')

                msg['From'] = email_from
                msg['To'] = ', '.join(message['payload']['to'])
                msg['Subject'] = message['payload']['subject']

                if send_email:
                    smtp = smtplib.SMTP()
                    smtp.connect()  # defaults to localhost:25
                    smtp.sendmail(msg['From'], message['payload']['to'],
                                  msg.as_string())
                    smtp.quit()

                to_delete.append({
                    'id': message['id'],
                    'created_at': message['created_at'],
                    'updated_at': message['created_at'],
                    'payload': str(message['payload']),
                    'event_type': 'email'
                })

                logging.debug('[email] %i:%i - submitting done: %s',
                              heartbeat['assign_thread'],
                              heartbeat['nr_threads'], str(message['id']))

            delete_messages(to_delete)
            logging.info('[email] %i:%i - submitted %i messages',
                         heartbeat['assign_thread'], heartbeat['nr_threads'],
                         len(to_delete))

        if once:
            break

        t_delay = max(0, delay - (time.time() - t_start))
        if t_delay:
            logging.debug('[email] %i:%i - sleeping %s seconds',
                          heartbeat['assign_thread'], heartbeat['nr_threads'],
                          t_delay)
        time.sleep(t_delay)

    logging.debug('[email] %i:%i - graceful stop requested',
                  heartbeat['assign_thread'], heartbeat['nr_threads'])

    die(executable, hostname, pid, heartbeat_thread)

    logging.debug('[email] %i:%i - graceful stop done',
                  heartbeat['assign_thread'], heartbeat['nr_threads'])
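
For reference, the per-message SMTP exchange above can be exercised in isolation. A minimal sketch, assuming a mail relay on localhost and using hypothetical addresses:

import smtplib
from email.mime.text import MIMEText

msg = MIMEText('Transfer finished.', 'plain', 'utf-8')  # plain-text body
msg['From'] = 'rucio@example.org'                       # hypothetical sender
msg['To'] = 'ops@example.org'                           # hypothetical recipient
msg['Subject'] = 'Rucio notification'

smtp = smtplib.SMTP('localhost')                        # assumes a local relay on port 25
smtp.sendmail(msg['From'], [msg['To']], msg.as_string())
smtp.quit()
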
Example No. 11
def actions_loop(once, scope, rses, sleep_time, dark_min_age,
                 dark_threshold_percent, miss_threshold_percent, force_proceed,
                 scanner_files_path):
    """
    Main loop to apply the CC actions
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    executable = 'storage-consistency-actions'
    heartbeat = live(executable=executable,
                     hostname=hostname,
                     pid=pid,
                     thread=current_thread)

    # Make an initial heartbeat
    # so that all storage-consistency-actions have the correct worker number on the next try
    prefix = 'storage-consistency-actions[%i/%i] ' %\
             (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'hostname: %s  pid: %d  current_thread: %s' % (hostname, pid, current_thread))

    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=current_thread)
            logger(logging.INFO, 'heartbeat: %s' % heartbeat)

            prefix = 'storage-consistency-actions[%i/%i] ' %\
                     (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger(logging.INFO, 'prefix: %s' % prefix)
            start = time.time()
            logger(logging.DEBUG, 'Start time: %f' % start)

            deckard_loop(scope, rses, dark_min_age, dark_threshold_percent,
                         miss_threshold_percent, force_proceed,
                         scanner_files_path)
            daemon_sleep(start_time=start,
                         sleep_time=sleep_time,
                         graceful_stop=graceful_stop,
                         logger=logger)

        except Exception as e:
            traceback.print_exc()
            logger(logging.WARNING, 'Something went wrong: %s (%s)' % (e, e.__class__.__name__))
        if once:
            break

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=current_thread)
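
The formatted_logger wrapper used above is not shown in this excerpt; it returns a callable with the (level, message) shape seen in the calls. A rough stand-in under that assumption:

import logging

def formatted_logger(log_func, fmt):
    """Return a callable that applies fmt to each message before logging."""
    def wrapped(level, msg):
        return log_func(level, fmt % msg)
    return wrapped

logging.basicConfig(level=logging.INFO)
logger = formatted_logger(logging.log, 'storage-consistency-actions[0/1] %s')
logger(logging.INFO, 'hello')  # logs: storage-consistency-actions[0/1] hello
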
Example No. 12
def deliver_messages(once=False,
                     brokers_resolved=None,
                     thread=0,
                     bulk=1000,
                     delay=10,
                     broker_timeout=3,
                     broker_retry=3):
    '''
    Main loop to deliver messages to a broker.
    '''
    logging.info('[broker] starting - threads (%i) bulk (%i)', thread, bulk)

    if not brokers_resolved:
        logging.fatal('No brokers resolved.')
        return

    logging.info('[broker] checking authentication method')
    use_ssl = True
    try:
        use_ssl = config_get_bool('messaging-hermes', 'use_ssl')
    except Exception:
        logging.info('[broker] could not find use_ssl in configuration -- please update your rucio.cfg')

    port = config_get_int('messaging-hermes', 'port')
    vhost = config_get('messaging-hermes',
                       'broker_virtual_host',
                       raise_exception=False)
    if not use_ssl:
        username = config_get('messaging-hermes', 'username')
        password = config_get('messaging-hermes', 'password')
        port = config_get_int('messaging-hermes', 'nonssl_port')

    conns = []
    for broker in brokers_resolved:
        if not use_ssl:
            logging.info('[broker] setting up username/password authentication: %s', broker)
            con = stomp.Connection12(host_and_ports=[(broker, port)],
                                     vhost=vhost,
                                     keepalive=True,
                                     timeout=broker_timeout)
        else:
            logging.info('[broker] setting up ssl cert/key authentication: %s', broker)
            con = stomp.Connection12(
                host_and_ports=[(broker, port)],
                use_ssl=True,
                ssl_key_file=config_get('messaging-hermes', 'ssl_key_file'),
                ssl_cert_file=config_get('messaging-hermes', 'ssl_cert_file'),
                vhost=vhost,
                keepalive=True,
                timeout=broker_timeout)

        con.set_listener(
            'rucio-hermes',
            HermesListener(con.transport._Transport__host_and_ports[0]))

        conns.append(con)
    destination = config_get('messaging-hermes', 'destination')

    executable = 'hermes [broker]'
    hostname = socket.getfqdn()
    pid = os.getpid()
    heartbeat_thread = threading.current_thread()

    # Make an initial heartbeat so that all daemons have the correct worker number on the next try
    sanity_check(executable=executable,
                 hostname=hostname,
                 pid=pid,
                 thread=heartbeat_thread)
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            t_start = time.time()

            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=heartbeat_thread)

            logging.debug('[broker] %i:%i - using: %s',
                          heartbeat['assign_thread'], heartbeat['nr_threads'],
                          [conn.transport._Transport__host_and_ports[0][0] for conn in conns])

            messages = retrieve_messages(bulk=bulk,
                                         thread=heartbeat['assign_thread'],
                                         total_threads=heartbeat['nr_threads'])

            if messages:

                logging.debug('[broker] %i:%i - retrieved %i messages',
                              heartbeat['assign_thread'],
                              heartbeat['nr_threads'], len(messages))
                to_delete = []
                for message in messages:
                    try:
                        conn = random.sample(conns, 1)[0]
                        if not conn.is_connected():
                            host_and_ports = conn.transport._Transport__host_and_ports[0][0]
                            record_counter('daemons.hermes.reconnect.%s' % host_and_ports.split('.')[0])
                            conn.start()
                            if not use_ssl:
                                logging.info(
                                    '[broker] %i:%i - connecting with USERPASS to %s',
                                    heartbeat['assign_thread'],
                                    heartbeat['nr_threads'], host_and_ports)
                                conn.connect(username, password, wait=True)
                            else:
                                logging.info(
                                    '[broker] %i:%i - connecting with SSL to %s',
                                    heartbeat['assign_thread'],
                                    heartbeat['nr_threads'], host_and_ports)
                                conn.connect(wait=True)

                        conn.send(body=json.dumps({'event_type': str(message['event_type']).lower(),
                                                   'payload': message['payload'],
                                                   'created_at': str(message['created_at'])}),
                                  destination=destination,
                                  headers={'persistent': 'true',
                                           'event_type': str(message['event_type']).lower()})

                        to_delete.append({'id': message['id'],
                                          'created_at': message['created_at'],
                                          'updated_at': message['created_at'],
                                          'payload': json.dumps(message['payload']),
                                          'event_type': message['event_type']})
                    except ValueError:
                        logging.warning('Cannot serialize payload to JSON: %s',
                                        str(message['payload']))
                        to_delete.append({'id': message['id'],
                                          'created_at': message['created_at'],
                                          'updated_at': message['created_at'],
                                          'payload': str(message['payload']),
                                          'event_type': message['event_type']})
                        continue
                    except stomp.exception.NotConnectedException as error:
                        logging.warning('Could not deliver message due to NotConnectedException: %s',
                                        str(error))
                        continue
                    except stomp.exception.ConnectFailedException as error:
                        logging.warning('Could not deliver message due to ConnectFailedException: %s',
                                        str(error))
                        continue
                    except Exception as error:
                        logging.warning('Could not deliver message: %s', str(error))
                        logging.critical(traceback.format_exc())
                        continue

                    event_type = str(message['event_type']).lower()
                    if event_type.startswith('transfer') or event_type.startswith('stagein'):
                        logging.debug('[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, request-id: %s, transfer-id: %s, created_at: %s',
                                      heartbeat['assign_thread'], heartbeat['nr_threads'],
                                      event_type,
                                      message['payload'].get('scope', None),
                                      message['payload'].get('name', None),
                                      message['payload'].get('dst-rse', None),
                                      message['payload'].get('request-id', None),
                                      message['payload'].get('transfer-id', None),
                                      str(message['created_at']))

                    elif event_type.startswith('dataset'):
                        logging.debug('[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, rule-id: %s, created_at: %s',
                                      heartbeat['assign_thread'], heartbeat['nr_threads'],
                                      event_type,
                                      message['payload']['scope'],
                                      message['payload']['name'],
                                      message['payload']['rse'],
                                      message['payload']['rule_id'],
                                      str(message['created_at']))

                    elif event_type.startswith('deletion'):
                        if 'url' not in message['payload']:
                            message['payload']['url'] = 'unknown'
                        logging.debug('[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, url: %s, created_at: %s',
                                      heartbeat['assign_thread'], heartbeat['nr_threads'],
                                      event_type,
                                      message['payload']['scope'],
                                      message['payload']['name'],
                                      message['payload']['rse'],
                                      message['payload']['url'],
                                      str(message['created_at']))
                    else:
                        logging.debug('[broker] %i:%i - other message: %s',
                                      heartbeat['assign_thread'], heartbeat['nr_threads'], message)

                delete_messages(to_delete)
                logging.info('[broker] %i:%i - submitted %i messages',
                             heartbeat['assign_thread'],
                             heartbeat['nr_threads'], len(to_delete))

        except NoResultFound:
            # silence this error: https://its.cern.ch/jira/browse/RUCIO-1699
            pass
        except:
            logging.critical(traceback.format_exc())

        # check `once` outside the message branch, so that a single pass also
        # terminates when no messages were retrieved
        if once:
            break

        t_delay = max(0, delay - (time.time() - t_start))
        if t_delay:
            logging.debug('[broker] %i:%i - sleeping %s seconds',
                          heartbeat['assign_thread'], heartbeat['nr_threads'],
                          t_delay)
        time.sleep(t_delay)

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    logging.debug('[broker] %i:%i - graceful stop requested',
                  heartbeat['assign_thread'], heartbeat['nr_threads'])

    die(executable, hostname, pid, heartbeat_thread)

    logging.debug('[broker] %i:%i - graceful stop done',
                  heartbeat['assign_thread'], heartbeat['nr_threads'])
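
Stripped of the daemon machinery, the send path above is a small amount of stomp.py code. A minimal sketch, with a placeholder broker host, credentials, and destination:

import json
import stomp

conn = stomp.Connection12(host_and_ports=[('broker.example.org', 61613)])
conn.connect('user', 'pass', wait=True)   # placeholder USERPASS credentials
conn.send(body=json.dumps({'event_type': 'transfer-done',
                           'payload': {'scope': 'tests', 'name': 'file1'}}),
          destination='/topic/rucio.events',   # hypothetical destination
          headers={'persistent': 'true', 'event_type': 'transfer-done'})
conn.disconnect()
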
Example No. 13
def declare_suspicious_replicas_bad(once=False,
                                    younger_than=3,
                                    nattempts=10,
                                    rse_expression='MOCK',
                                    max_replicas_per_rse=100):
    """
    Main loop to check for available replicas which are labeled as suspicious

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'.
    :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE.
                                 If more is found, processing is skipped and warning is printed.
    :returns: None
    """

    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES

    executable = argv[0]
    rses = []
    for rse in parse_expression(expression=rse_expression):
        rses.append(rse['id'])
    rses.sort()
    executable += ' --rse-expression ' + str(rses)

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    live(executable=executable,
         hostname=socket.gethostname(),
         pid=os.getpid(),
         thread=threading.current_thread())

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable,
                             hostname=socket.gethostname(),
                             pid=os.getpid(),
                             thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread']

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logging.error(
                    'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.',
                    socket.gethostname())
                die(executable=executable,
                    hostname=socket.gethostname(),
                    pid=os.getpid(),
                    thread=threading.current_thread())
                break

            start = time.time()

            logging.info('replica_recoverer[%i/%i]: ready to query replicas at RSE %s, reported suspicious in the last %i days at least %i times which are available on other RSEs.',
                         worker_number, total_workers, rse_expression, younger_than, nattempts)

            getfileskwargs = {
                'younger_than': younger_than,
                'nattempts': nattempts,
                'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                'available_elsewhere': True,
                'is_suspicious': True
            }

            recoverable_replicas = get_suspicious_files(
                rse_expression, **getfileskwargs)

            logging.info(
                'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.',
                worker_number, total_workers,
                time.time() - start, len(recoverable_replicas))

            if not recoverable_replicas and not once:
                logging.info(
                    'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas. Sleeping for 60 seconds.',
                    worker_number, total_workers, len(recoverable_replicas))
                GRACEFUL_STOP.wait(60)
            else:
                logging.info(
                    'replica_recoverer[%i/%i]: looking for replica surls.',
                    worker_number, total_workers)

                start = time.time()
                # dictionary of { vo1: {rse1: [surl1, surl2, ...], rse2: ...}, vo2: ... }
                surls_to_recover = {}
                cnt_surl_not_found = 0
                for replica in recoverable_replicas:
                    scope = replica['scope']
                    name = replica['name']
                    vo = scope.vo
                    rse = replica['rse']
                    rse_id = replica['rse_id']
                    if GRACEFUL_STOP.is_set():
                        break
                    if vo not in surls_to_recover:
                        surls_to_recover[vo] = {}
                    if rse_id not in surls_to_recover[vo]:
                        surls_to_recover[vo][rse_id] = []
                    # for each suspicious replica, we get its surl through the list_replicas function
                    surl_not_found = True
                    for rep in list_replicas([{'scope': scope, 'name': name}]):
                        for site in rep['rses']:
                            if site == rse_id:
                                surls_to_recover[vo][rse_id].append(rep['rses'][site][0])
                                surl_not_found = False
                    if surl_not_found:
                        cnt_surl_not_found += 1
                        logging.warning(
                            'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.',
                            worker_number, total_workers, name, rse)

                logging.info(
                    'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.',
                    worker_number, total_workers,
                    len(recoverable_replicas) - cnt_surl_not_found,
                    len(recoverable_replicas),
                    time.time() - start)

                for vo in surls_to_recover:
                    for rse_id in surls_to_recover[vo]:
                        logging.info('replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on RSE with id %s: %s.',
                                     worker_number, total_workers,
                                     len(surls_to_recover[vo][rse_id]), rse_id,
                                     str(surls_to_recover[vo][rse_id]))
                        if len(surls_to_recover[vo][rse_id]) > max_replicas_per_rse:
                            logging.warning('replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on RSE with id %s. Please investigate.',
                                            worker_number, total_workers,
                                            max_replicas_per_rse,
                                            str(len(surls_to_recover[vo][rse_id])), rse_id)
                        else:
                            declare_bad_file_replicas(pfns=surls_to_recover[vo][rse_id],
                                                      reason='Suspicious. Automatic recovery.',
                                                      issuer=InternalAccount('root', vo=vo),
                                                      status=BadFilesStatus.BAD,
                                                      session=None)
                            logging.info('replica_recoverer[%i/%i]: finished declaring bad replicas on RSE with id %s.',
                                         worker_number, total_workers, rse_id)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])) or match('.*ORA-03135.*', str(err.args[0])):
                logging.warning(traceback.format_exc())
            else:
                logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.%s' % err.__class__.__name__)
        except Exception as err:
            logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.%s' % err.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=socket.gethostname(),
        pid=os.getpid(),
        thread=threading.current_thread())
    logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number,
                 total_workers)
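
A hedged usage example: a small driver that runs a single recovery pass; the parameter values here are purely illustrative:

if __name__ == '__main__':
    # One pass over replicas reported suspicious at least 5 times in the
    # last 7 days on RSEs matching the (illustrative) expression 'MOCK'.
    declare_suspicious_replicas_bad(once=True,
                                    younger_than=7,
                                    nattempts=5,
                                    rse_expression='MOCK',
                                    max_replicas_per_rse=100)
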
Example No. 14
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, scope='tests', once=False, dataset_lifetime=None, set_metadata=False):
    sleep(sleep_time * (total_workers - worker_number) / total_workers)

    executable = 'automatix'
    hostname = socket.getfqdn()
    pid = getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers)
    while not GRACEFUL_STOP.is_set():
        heartbeat.live(executable, hostname, pid, hb_thread)
        starttime = time()
        prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers)
        logging.info('%s Getting data distribution', prepend_str)
        probabilities, data = get_data_distribution(inputfile)
        logging.debug('%s Probabilities %s', prepend_str, probabilities)
        totretries = 3
        status = False

        for site in sites:

            for retry in range(0, totretries):
                start_time = time()
                tmpdir = tempfile.mkdtemp()
                logging.info('%s Running on site %s', prepend_str, site)
                dic = choose_element(probabilities, data)
                metadata = dic['metadata']
                try:
                    nbfiles = dic['nbfiles']
                except KeyError:
                    nbfiles = 2
                    logging.warning('%s No nbfiles defined in the configuration, will use 2', prepend_str)
                try:
                    filesize = dic['filesize']
                except KeyError:
                    filesize = 1000000
                    logging.warning('%s No filesize defined in the configuration, will use 1M files', prepend_str)
                dsn = generate_didname(metadata, None, 'dataset')
                fnames = []
                lfns = []
                physical_fnames = []
                for _ in range(nbfiles):
                    fname = generate_didname(metadata=metadata, dsn=dsn, did_type='file')
                    lfns.append(fname)
                    logging.info('%s Generating file %s in dataset %s', prepend_str, fname, dsn)
                    physical_fname = '%s/%s' % (tmpdir, "".join(fname.split('/')))
                    physical_fnames.append(physical_fname)
                    generate_file(physical_fname, filesize)
                    fnames.append(fname)
                logging.info('%s Upload %s to %s', prepend_str, dsn, site)
                dsn = '%s:%s' % (scope, dsn)
                status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, worker_number=worker_number, total_workers=total_workers, dataset_lifetime=dataset_lifetime, did=dsn, set_metadata=set_metadata)
                for physical_fname in physical_fnames:
                    remove(physical_fname)
                rmdir(tmpdir)
                if status:
                    monitor.record_counter(counters='automatix.addnewdataset.done', delta=1)
                    monitor.record_counter(counters='automatix.addnewfile.done', delta=nbfiles)
                    monitor.record_timer('automatix.datasetinjection', (time() - start_time) * 1000)
                    break
                else:
                    logging.info('%s Failed to upload files. Will retry another time (attempt %s/%s)', prepend_str, str(retry + 1), str(totretries))
        if once:
            logging.info('%s Run with once mode. Exiting', prepend_str)
            break
        tottime = time() - starttime
        if status:
            logging.info('%s It took %s seconds to upload one dataset on %s', prepend_str, str(tottime), str(sites))
            if sleep_time > tottime:
                logging.info('%s Will sleep for %s seconds', prepend_str, str(sleep_time - tottime))
                sleep(sleep_time - tottime)
        else:
            logging.info('%s Retrying a new upload', prepend_str)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
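
The generate_file helper called above is not part of this excerpt. A plausible minimal stand-in that writes the requested number of random bytes (the real Rucio utility may differ):

import os

def generate_file(path, size):
    """Write `size` bytes of random data to `path`."""
    with open(path, 'wb') as out:
        out.write(os.urandom(size))

generate_file('/tmp/automatix_test.dat', 1000000)  # one 1 MB test file
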
Example No. 15
def submitter(once=False, rses=[], mock=False,
              process=0, total_processes=1, total_threads=1,
              bulk=100, group_bulk=1, group_policy='rule', fts_source_strategy='auto',
              activities=None, sleep_time=600, max_sources=4, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    logging.info('Transfer submitter starting - process (%i/%i) threads (%i)' % (process,
                                                                                 total_processes,
                                                                                 total_threads))

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)

    logging.info('Transfer submitter started - process (%i/%i) threads (%i/%i) timeout (%s)' % (process, total_processes,
                                                                                                hb['assign_thread'], hb['nr_threads'],
                                                                                                timeout))

    threadPool = ThreadPool(total_threads)
    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None

            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                logging.info("%s:%s Starting to get transfer transfers for %s" % (process, hb['assign_thread'], activity))
                ts = time.time()
                transfers = __get_transfers(process=process,
                                            total_processes=total_processes,
                                            thread=hb['assign_thread'],
                                            total_threads=hb['nr_threads'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logging.info("%s:%s Got %s transfers for %s" % (process, hb['assign_thread'], len(transfers), activity))

                # group transfers
                logging.info("%s:%s Starting to group transfers for %s" % (process, hb['assign_thread'], activity))
                ts = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, fts_source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1))

                logging.info("%s:%s Starting to submit transfers for %s" % (process, hb['assign_thread'], activity))
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        # job_requests = makeRequests(submit_transfer, args_list=[((external_host, job, 'transfer_submitter', process, thread), {})])
                        job_requests = makeRequests(submit_transfer,
                                                    args_list=[((), {'external_host': external_host,
                                                                     'job': job,
                                                                     'submitter': 'transfer_submitter',
                                                                     'process': process,
                                                                     'thread': hb['assign_thread'],
                                                                     'timeout': timeout})])
                        for job_req in job_requests:
                            threadPool.putRequest(job_req)
                threadPool.wait()

                if len(transfers) < group_bulk:
                    logging.info('%i:%i - only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (process, hb['assign_thread'], len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except:
            logging.critical('%s:%s %s' % (process, hb['assign_thread'], traceback.format_exc()))

        if once:
            break

    logging.info('%s:%s graceful stop requested' % (process, hb['assign_thread']))

    threadPool.dismissWorkers(total_threads, do_join=True)
    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s:%s graceful stop done' % (process, hb['assign_thread']))
    return
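
# The submitter above throttles polling per activity with a dict of "next
# execution" timestamps. A minimal, self-contained sketch of that pattern
# (the names and sample activities below are illustrative, not Rucio API):
import time
from collections import defaultdict

def throttled_activities(activities, sleep_time=60, cycles=2):
    """Yield only the activities whose cooldown has expired."""
    next_exe_time = defaultdict(time.time)  # first access returns "now", so each activity runs immediately
    for _ in range(cycles):
        for activity in activities:
            if next_exe_time[activity] > time.time():
                continue  # still cooling down after a near-empty poll
            yield activity
            # Pretend this activity returned little work: back off for sleep_time.
            next_exe_time[activity] = time.time() + sleep_time

for act in throttled_activities(['User Subscriptions', 'Data rebalancing'], sleep_time=1):
    print('polling', act)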
Ejemplo n.º 16
0
def rule_repairer(once=False):
    """
    Main loop to check for STUCK replication rules
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-repairers have the correct worker number on the next try
    executable = 'judge-repairer'
    live(executable=executable,
         hostname=hostname,
         pid=pid,
         thread=current_thread,
         older_than=60 * 30)
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=current_thread,
                             older_than=60 * 30)

            start = time.time()

            # Refresh paused rules: drop entries whose pause window has expired
            for key in list(paused_rules):
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            # Select a bunch of rules for this worker to repair
            rules = get_stuck_rules(total_workers=heartbeat['nr_threads'],
                                    worker_number=heartbeat['assign_thread'],
                                    delta=-1 if once else 1800,
                                    limit=100,
                                    blacklisted_rules=list(paused_rules))
            logging.debug('rule_repairer[%s/%s] index query time %f fetch size is %d' %
                          (heartbeat['assign_thread'], heartbeat['nr_threads'],
                           time.time() - start, len(rules)))

            if not rules and not once:
                logging.debug('rule_repairer[%s/%s] did not get any work (paused_rules=%s)' %
                              (heartbeat['assign_thread'], heartbeat['nr_threads'], len(paused_rules)))
                graceful_stop.wait(60)
            else:
                for rule_id in rules:
                    rule_id = rule_id[0]
                    logging.info('rule_repairer[%s/%s]: Repairing rule %s' %
                                 (heartbeat['assign_thread'], heartbeat['nr_threads'], rule_id))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        repair_rule(rule_id=rule_id)
                        logging.debug('rule_repairer[%s/%s]: repairing of %s took %f' %
                                      (heartbeat['assign_thread'], heartbeat['nr_threads'],
                                       rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                            logging.warning('rule_repairer[%s/%s]: Locks detected for %s' %
                                            (heartbeat['assign_thread'], heartbeat['nr_threads'], rule_id))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        elif match('.*QueuePool.*', str(e.args[0])) or match('.*ORA-03135.*', str(e.args[0])):
                            logging.warning(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)

        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])) or match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=current_thread)
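
# Both judge daemons skip rules that recently hit a database row lock
# (ORA-00054) by pausing them for a random interval. A stripped-down sketch
# of that bookkeeping (hypothetical names, not taken from Rucio):
from datetime import datetime, timedelta
from random import randint

paused_rules = {}  # {rule_id: datetime until which the rule is skipped}

def refresh_paused_rules():
    """Drop rules whose pause window has expired."""
    for rule_id in list(paused_rules):
        if datetime.utcnow() > paused_rules[rule_id]:
            del paused_rules[rule_id]

def pause_rule(rule_id, min_seconds=600, max_seconds=2400):
    """Pause a rule for a random interval so that workers retry out of phase."""
    paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(min_seconds, max_seconds))

pause_rule('rule-1')
refresh_paused_rules()
print(sorted(paused_rules))  # ['rule-1'] is still paused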
Ejemplo n.º 17
0
def rule_injector(once=False):
    """
    Main loop to check for asynchronous creation of replication rules
    """

    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    paused_rules = {}  # {rule_id: datetime}

    # Make an initial heartbeat so that all judge-injectors have the correct worker number on the next try
    executable = 'judge-injector'
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)
    prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60)
            prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')

            start = time.time()

            # Refresh paused rules: drop entries whose pause window has expired
            for key in list(paused_rules):
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            rules = get_injected_rules(total_workers=heartbeat['nr_threads'],
                                       worker_number=heartbeat['assign_thread'],
                                       limit=100,
                                       blocked_rules=list(paused_rules))
            logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            if not rules and not once:
                logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules)))
                graceful_stop.wait(60)
            else:
                for rule in rules:
                    rule_id = rule[0]
                    logger(logging.INFO, 'Injecting rule %s' % rule_id)
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        inject_rule(rule_id=rule_id, logger=logger)
                        logger(logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            record_counter('rule.judge.exceptions.LocksDetected')
                            logger(logging.WARNING, 'Locks detected for %s' % rule_id)
                        elif match('.*QueuePool.*', str(e.args[0])) or match('.*ORA-03135.*', str(e.args[0])):
                            logger(logging.WARNING, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logger(logging.ERROR, 'DatabaseException', exc_info=True)
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RSEWriteBlocked as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id)
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                        logger(logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id)
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except RuleNotFound:
                        pass
                    except InsufficientAccountLimit:
                        # A rule with InsufficientAccountLimit on injection hangs there potentially forever
                        # It should be marked as SUSPENDED
                        logger(logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id)
                        update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'})

        except (DatabaseException, DatabaseError) as e:
            if match('.*QueuePool.*', str(e.args[0])) or match('.*ORA-03135.*', str(e.args[0])):
                logger(logging.WARNING, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logger(logging.CRITICAL, 'DatabaseException', exc_info=True)
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
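
# rule_injector wraps logging.log so every message carries a
# "judge-injector[thread/total]" prefix. A minimal sketch of such a wrapper;
# this is an assumption about what formatted_logger does, not its actual source:
import logging

def formatted_logger(log_func, fmt):
    """Return a callable that applies `fmt` to the message before logging."""
    def wrapper(level, msg, *args, **kwargs):
        return log_func(level, fmt % msg, *args, **kwargs)
    return wrapper

logging.basicConfig(level=logging.DEBUG)
logger = formatted_logger(logging.log, 'judge-injector[0/1] %s')
logger(logging.INFO, 'Injecting rule 1234')                    # prefixed message
logger(logging.WARNING, 'DatabaseException', exc_info=False)   # extra kwargs pass through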
Ejemplo n.º 18
0
def transmogrifier(bulk=5, once=False, sleep_time=60):
    """
    Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash,
    identifies the subscriptions matching the DIDs and
    submit a replication rule for each DID matching a subscription.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)

        dids, subscriptions = [], []
        tottime = 0
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                             heart_beat['nr_threads'])

        try:
            #  Get the new DIDs based on the is_new flag
            for did in list_new_dids(thread=heart_beat['assign_thread'],
                                     total_threads=heart_beat['nr_threads'],
                                     chunk_size=bulk):
                dids.append({'scope': did['scope'],
                             'did_type': str(did['did_type']),
                             'name': did['name']})

            sub_dict = {3: []}
            #  Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest.
            #  The priority is defined as 'policyid'.
            for sub in list_subscriptions(None, None):
                if sub['state'] != SubscriptionState.INACTIVE and sub['lifetime'] and (datetime.now() > sub['lifetime']):
                    update_subscription(name=sub['name'],
                                        account=sub['account'],
                                        metadata={'state': SubscriptionState.INACTIVE},
                                        issuer='root')
                elif sub['state'] in [SubscriptionState.ACTIVE, SubscriptionState.UPDATED]:
                    priority = 3
                    if 'policyid' in sub:
                        if int(sub['policyid']) not in sub_dict:
                            sub_dict[int(sub['policyid'])] = []
                        priority = int(sub['policyid'])
                    sub_dict[priority].append(sub)
            #  Order the subscriptions according to their priority
            for priority in sorted(sub_dict):
                subscriptions.extend(sub_dict[priority])
        except SubscriptionNotFound as error:
            logging.warning(prepend_str + 'No subscriptions defined: %s' % (str(error)))
            time.sleep(10)
            continue
        except Exception as error:
            logging.error(prepend_str + 'Failed to get list of new DIDs or subscriptions: %s' % (str(error)))

        try:
            results = {}
            start_time = time.time()
            blacklisted_rse_id = [rse['id'] for rse in list_rses({'availability_write': False})]
            logging.debug(prepend_str + 'In transmogrifier worker')
            identifiers = []
            #  Loop over all the new dids
            for did in dids:
                did_success = True
                if did['did_type'] in (str(DIDType.DATASET), str(DIDType.CONTAINER)):
                    results['%s:%s' % (did['scope'], did['name'])] = []
                    try:
                        metadata = get_metadata(did['scope'], did['name'])
                        # Loop over all the subscriptions
                        for subscription in subscriptions:
                            #  Check if the DID matches the subscription
                            if is_matching_subscription(subscription, did, metadata) is True:
                                filter_string = loads(subscription['filter'])
                                split_rule = filter_string.get('split_rule', False)
                                if split_rule == 'true':
                                    split_rule = True
                                elif split_rule == 'false':
                                    split_rule = False
                                stime = time.time()
                                results['%s:%s' % (did['scope'], did['name'])].append(subscription['id'])
                                logging.info(prepend_str + '%s:%s matches subscription %s' %
                                             (did['scope'], did['name'], subscription['name']))
                                for rule_string in loads(subscription['replication_rules']):
                                    # Get all the rule and subscription parameters
                                    grouping = rule_string.get('grouping', 'DATASET')
                                    lifetime = rule_string.get('lifetime', None)
                                    ignore_availability = rule_string.get('ignore_availability', None)
                                    weight = rule_string.get('weight', None)
                                    source_replica_expression = rule_string.get('source_replica_expression', None)
                                    locked = rule_string.get('locked', None) == 'True'
                                    purge_replicas = rule_string.get('purge_replicas', False) == 'True'
                                    rse_expression = str(rule_string['rse_expression'])
                                    comment = str(subscription['comments'])
                                    subscription_id = str(subscription['id'])
                                    account = subscription['account']
                                    copies = int(rule_string['copies'])
                                    activity = rule_string.get('activity', 'User Subscriptions')
                                    try:
                                        validate_schema(name='activity', obj=activity)
                                    except InputValidationError as error:
                                        logging.error(prepend_str + 'Error validating the activity %s' % (str(error)))
                                        activity = 'User Subscriptions'
                                    if lifetime:
                                        lifetime = int(lifetime)

                                    str_activity = "".join(activity.split())
                                    success = False
                                    nattempt = 5
                                    attemptnr = 0
                                    skip_rule_creation = False

                                    if split_rule:
                                        rses = parse_expression(rse_expression)
                                        list_of_rses = [rse['rse'] for rse in rses]
                                        # Check that some rule doesn't already exist for this DID and subscription
                                        preferred_rse_ids = []
                                        for rule in list_rules(filters={'subscription_id': subscription_id,
                                                                        'scope': did['scope'],
                                                                        'name': did['name']}):
                                            already_existing_rses = [(rse['rse'], rse['id'])
                                                                     for rse in parse_expression(rule['rse_expression'])]
                                            for rse, rse_id in already_existing_rses:
                                                if (rse in list_of_rses) and (rse_id not in preferred_rse_ids):
                                                    preferred_rse_ids.append(rse_id)
                                        if len(preferred_rse_ids) >= copies:
                                            skip_rule_creation = True

                                        rse_id_dict = {}
                                        for rse in rses:
                                            rse_id_dict[rse['id']] = rse['rse']
                                        try:
                                            rseselector = RSESelector(account=account,
                                                                      rses=rses,
                                                                      weight=weight,
                                                                      copies=copies - len(preferred_rse_ids))
                                            selected_rses = [rse_id_dict[rse_id]
                                                             for rse_id, _, _ in rseselector.select_rse(0,
                                                                                                        preferred_rse_ids=preferred_rse_ids,
                                                                                                        copies=copies,
                                                                                                        blacklist=blacklisted_rse_id)]
                                        except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight) as error:
                                            logging.warning(prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites' %
                                                            (subscription['name'], account, str(error)))
                                            # Now including the blacklisted sites
                                            try:
                                                rseselector = RSESelector(account=account,
                                                                          rses=rses,
                                                                          weight=weight,
                                                                          copies=copies - len(preferred_rse_ids))
                                                selected_rses = [rse_id_dict[rse_id]
                                                                 for rse_id, _, _ in rseselector.select_rse(0,
                                                                                                            preferred_rse_ids=preferred_rse_ids,
                                                                                                            copies=copies,
                                                                                                            blacklist=[])]
                                                ignore_availability = True
                                            except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight) as error:
                                                logging.error(prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.' %
                                                              (subscription['name'], account, str(error)))
                                                monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                                # The DID won't be reevaluated at the next cycle
                                                continue

                                    for attempt in range(0, nattempt):
                                        attemptnr = attempt
                                        nb_rule = 0
                                        #  Try to create the rule
                                        try:
                                            if split_rule:
                                                if not skip_rule_creation:
                                                    for rse in selected_rses:
                                                        logging.info(prepend_str + 'Will insert one rule for %s:%s on %s' %
                                                                     (did['scope'], did['name'], rse))
                                                        add_rule(dids=[{'scope': did['scope'], 'name': did['name']}],
                                                                 account=account,
                                                                 copies=1,
                                                                 rse_expression=rse,
                                                                 grouping=grouping,
                                                                 weight=weight,
                                                                 lifetime=lifetime,
                                                                 locked=locked,
                                                                 subscription_id=subscription_id,
                                                                 source_replica_expression=source_replica_expression,
                                                                 activity=activity,
                                                                 purge_replicas=purge_replicas,
                                                                 ignore_availability=ignore_availability,
                                                                 comment=comment)
                                                        nb_rule += 1
                                                        if nb_rule == copies:
                                                            success = True
                                                            break
                                            else:
                                                add_rule(dids=[{'scope': did['scope'], 'name': did['name']}],
                                                         account=account,
                                                         copies=copies,
                                                         rse_expression=rse_expression,
                                                         grouping=grouping,
                                                         weight=weight,
                                                         lifetime=lifetime,
                                                         locked=locked,
                                                         subscription_id=subscription['id'],
                                                         source_replica_expression=source_replica_expression,
                                                         activity=activity,
                                                         purge_replicas=purge_replicas,
                                                         ignore_availability=ignore_availability,
                                                         comment=comment)
                                                nb_rule += 1
                                            monitor.record_counter(counters='transmogrifier.addnewrule.done', delta=nb_rule)
                                            monitor.record_counter(counters='transmogrifier.addnewrule.activity.%s' % str_activity, delta=nb_rule)
                                            success = True
                                            break
                                        except (InvalidReplicationRule, InvalidRuleWeight, InvalidRSEExpression,
                                                StagingAreaRuleRequiresLifetime, DuplicateRule) as error:
                                            # Errors that won't be retried
                                            success = True
                                            logging.error(prepend_str + '%s' % (str(error)))
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                            break
                                        except (ReplicationRuleCreationTemporaryFailed, InsufficientTargetRSEs,
                                                InsufficientAccountLimit, DatabaseException, RSEBlacklisted) as error:
                                            # Errors to be retried
                                            logging.error(prepend_str + '%s Will perform another attempt %i/%i' %
                                                          (str(error), attempt + 1, nattempt))
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                        except Exception:
                                            # Unexpected errors
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.unknown', delta=1)
                                            exc_type, exc_value, exc_traceback = exc_info()
                                            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

                                    did_success = (did_success and success)
                                    if (attemptnr + 1) == nattempt and not success:
                                        logging.error(prepend_str + 'Rule for %s:%s on %s cannot be inserted' %
                                                      (did['scope'], did['name'], rse_expression))
                                    else:
                                        logging.info(prepend_str + '%s rule(s) inserted in %f seconds' %
                                                     (str(nb_rule), time.time() - stime))
                    except DataIdentifierNotFound as error:
                        logging.warning(prepend_str + str(error))

                if did_success:
                    if did['did_type'] == str(DIDType.FILE):
                        monitor.record_counter(counters='transmogrifier.did.file.processed', delta=1)
                    elif did['did_type'] == str(DIDType.DATASET):
                        monitor.record_counter(counters='transmogrifier.did.dataset.processed', delta=1)
                    elif did['did_type'] == str(DIDType.CONTAINER):
                        monitor.record_counter(counters='transmogrifier.did.container.processed', delta=1)
                    monitor.record_counter(counters='transmogrifier.did.processed', delta=1)
                    identifiers.append({'scope': did['scope'],
                                        'name': did['name'],
                                        'did_type': DIDType.from_sym(did['did_type'])})

            time1 = time.time()

            #  Mark the DIDs as processed
            for identifier in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier, None)

            logging.info(prepend_str + 'Time to set the new flag : %f' % (time.time() - time1))
            tottime = time.time() - start_time
            for sub in subscriptions:
                update_subscription(name=sub['name'],
                                    account=sub['account'],
                                    metadata={'last_processed': datetime.now()},
                                    issuer='root')
            logging.info(prepend_str + 'It took %f seconds to process %i DIDs' % (tottime, len(dids)))
            logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids)))
            monitor.record_counter(counters='transmogrifier.job.done', delta=1)
            monitor.record_timer(stat='transmogrifier.job.duration', time=1000 * tottime)
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())
            monitor.record_counter(counters='transmogrifier.job.error', delta=1)
            monitor.record_counter(counters='transmogrifier.addnewrule.error', delta=1)
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
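
# The transmogrifier buckets subscriptions by 'policyid' (0 = highest
# priority, 5 = lowest, default 3) before matching DIDs against them.
# A self-contained sketch of that ordering with made-up sample data:
def order_by_policyid(subscriptions, default_priority=3):
    """Return subscriptions sorted by ascending policyid."""
    buckets = {default_priority: []}
    for sub in subscriptions:
        priority = int(sub.get('policyid', default_priority))
        buckets.setdefault(priority, []).append(sub)
    ordered = []
    for priority in sorted(buckets):
        ordered.extend(buckets[priority])
    return ordered

subs = [{'name': 'low', 'policyid': 5}, {'name': 'high', 'policyid': 0}, {'name': 'default'}]
print([s['name'] for s in order_by_policyid(subs)])  # ['high', 'default', 'low']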
Ejemplo n.º 19
0
def poller(once=False,
           process=0, total_processes=1, thread=0, total_threads=1, activities=None, sleep_time=60,
           fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i) timeout (%s)' % (process, total_processes,
                                                                                              thread, total_threads,
                                                                                              db_bulk, timeout))

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)

    logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                hb['assign_thread'], hb['nr_threads'],
                                                                                db_bulk))

    activity_next_exe_time = defaultdict(time.time)
    threadPool = ThreadPool(total_threads)
    sleeping = False

    while not graceful_stop.is_set():

        try:
            hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            logging.debug('poller - thread (%i/%i)' % (hb['assign_thread'], hb['nr_threads']))

            if not sleeping:
                sleeping = True

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    time.sleep(1)
                    continue
                sleeping = False

                ts = time.time()
                logging.debug('%i:%i - start to poll transfers older than %i seconds for activity %s' % (process, hb['assign_thread'], older_than, activity))
                transfs = request.get_next_transfers(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                     state=[RequestState.SUBMITTED],
                                                     limit=db_bulk,
                                                     older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                                     process=process, total_processes=total_processes,
                                                     thread=hb['assign_thread'], total_threads=hb['nr_threads'],
                                                     activity=activity,
                                                     activity_shares=activity_shares)
                record_timer('daemons.conveyor.poller.000-get_next_transfers', (time.time() - ts) * 1000)

                if transfs:
                    logging.debug('%i:%i - polling %i transfers for activity %s' % (process, hb['assign_thread'], len(transfs), activity))

                xfers_ids = {}
                for transf in transfs:
                    xfers_ids.setdefault(transf['external_host'], []).append(transf['external_id'])

                for external_host in xfers_ids:
                    for xfers in chunks(xfers_ids[external_host], fts_bulk):
                        # poll transfers
                        # xfer_requests = makeRequests(common.poll_transfers, args_list=[((external_host, xfers, process, thread), {})])
                        xfer_requests = makeRequests(common.poll_transfers,
                                                     args_list=[((), {'external_host': external_host,
                                                                      'xfers': xfers,
                                                                      'process': process,
                                                                      'thread': hb['assign_thread'],
                                                                      'timeout': timeout})])
                        for xfer_req in xfer_requests:
                            threadPool.putRequest(xfer_req)
                threadPool.wait()

                if len(transfs) < db_bulk / 2:
                    logging.info("%i:%i - only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (process, hb['assign_thread'], len(transfs), activity, db_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical("%i:%i - %s" % (process, hb['assign_thread'], traceback.format_exc()))

        if once:
            break

    logging.info('%i:%i - graceful stop requested' % (process, hb['assign_thread']))

    threadPool.dismissWorkers(total_threads, do_join=True)
    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%i:%i - graceful stop done' % (process, hb['assign_thread']))
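
# The poller fans its submitted transfers out by FTS host and then polls
# each host in fts_bulk-sized batches. A minimal sketch of that grouping;
# chunks() mirrors the helper used above and the data is illustrative:
def chunks(seq, n):
    """Yield successive n-sized slices of seq."""
    for i in range(0, len(seq), n):
        yield seq[i:i + n]

transfers = ([{'external_host': 'fts-a', 'external_id': i} for i in range(5)]
             + [{'external_host': 'fts-b', 'external_id': i} for i in range(3)])

xfers_ids = {}
for transfer in transfers:
    xfers_ids.setdefault(transfer['external_host'], []).append(transfer['external_id'])

for external_host, ids in xfers_ids.items():
    for batch in chunks(ids, 2):  # fts_bulk = 2 for the example
        print('poll %s: %s' % (external_host, batch))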
Ejemplo n.º 20
0
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info(prepend_str + '%s expired replicas returned' % len(expired_replicas))
            logging.debug(prepend_str + 'List of expired replicas returned %s' % str(expired_replicas))
            session = get_session()
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            for chunk in chunks(expired_replicas, chunk_size):
                # Process and update the replicas in chunks
                replicas = [{'scope': replica[0], 'name': replica[1], 'rse_id': replica[2], 'state': ReplicaState.AVAILABLE} for replica in chunk]
                # Remove the replicas from bad_replicas table in chunks
                bad_replicas = [{'scope': replica[0], 'name': replica[1], 'rse_id': replica[2], 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE} for replica in chunk]
                try:
                    nchunk += 1
                    logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning(prepend_str + 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s' % str(error))
                    for idx in range(len(chunk)):
                        logging.debug(prepend_str + 'Working on %s' % (str(replicas[idx])))
                        try:
                            get_metadata(replicas[idx]['scope'], replicas[idx]['name'])
                            update_replicas_states([replicas[idx], ], nowait=True, session=session)
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(prepend_str + 'DID %s:%s does not exist anymore. ' % (bad_replicas[idx]['scope'], bad_replicas[idx]['name']))
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(prepend_str + '%s:%s on RSEID %s does not exist anymore. ' % (replicas[idx]['scope'], replicas[idx]['name'], replicas[idx]['rse_id']))
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())
                    session = get_session()

        except Exception:
            logging.critical(traceback.format_exc())

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
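
# minos_tu_expiration commits in chunks and, when a chunk fails because a
# replica or DID vanished, rolls back and retries the chunk item by item.
# A database-free sketch of that fallback pattern (names and the exception
# are illustrative, not Rucio code):
class ItemGone(Exception):
    pass

def apply_chunk(items):
    """Stand-in for the bulk update; fails if any item has disappeared."""
    if any(item.get('gone') for item in items):
        raise ItemGone('stale item in chunk')

def process(items, chunk_size=3):
    for start in range(0, len(items), chunk_size):
        chunk = items[start:start + chunk_size]
        try:
            apply_chunk(chunk)       # fast path: whole chunk in one transaction
        except ItemGone:
            for item in chunk:       # slow path: isolate the stale item
                try:
                    apply_chunk([item])
                except ItemGone:
                    print('skipping', item['name'])

process([{'name': 'f1'}, {'name': 'f2', 'gone': True}, {'name': 'f3'}])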
Ejemplo n.º 21
0
def atropos(thread,
            bulk,
            date_check,
            dry_run=True,
            grace_period=86400,
            once=True,
            unlock=False,
            spread_period=0,
            purge_replicas=False):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param grace_period: The grace_period for the rules.
    :param once: Run only once.
    """

    sleep_time = 60

    executable = 'atropos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    time.sleep(10)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
    logging.debug(prepend_str + 'Starting worker')
    summary = {}
    lifetime_exceptions = {}
    rand = random.Random(hb['assign_thread'])
    for excep in rucio.core.lifetime_exception.list_exceptions(exception_id=None,
                                                               states=[LifetimeExceptionsState.APPROVED, ],
                                                               session=None):
        did = '%s:%s' % (excep['scope'], excep['name'])
        if did not in lifetime_exceptions or lifetime_exceptions[did] < excep['expires_at']:
            lifetime_exceptions[did] = excep['expires_at']
    logging.debug(prepend_str + '%s active exceptions' % len(lifetime_exceptions))
    if not dry_run and date_check > now:
        logging.error(prepend_str + 'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not GRACEFUL_STOP.is_set():

            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'],
                                                 hb['nr_threads'])

            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'], session=None)
                logging.info(prepend_str + '%s rules to process' % (len(rules)))
                for rule_idx, rule in enumerate(rules, start=1):
                    did = '%s:%s' % (rule.scope, rule.name)
                    logging.debug(prepend_str + 'Working on rule %s on DID %s on %s' % (rule.id, did, rule.rse_expression))

                    if (rule_idx % 1000) == 0:
                        logging.info(prepend_str + '%s/%s rules processed' % (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    try:
                        rses = parse_expression(rule.rse_expression)
                    except InvalidRSEExpression:
                        logging.warning(prepend_str + 'Rule %s has an RSE expression that results in an empty set: %s' %
                                        (rule.id, rule.rse_expression))
                        continue
                    eol_at = rucio.core.lifetime_exception.define_eol(rule.scope, rule.name, rses)
                    if eol_at != rule.eol_at:
                        logging.warning(prepend_str + 'The computed eol %s differs from the one recorded %s for rule %s on %s at %s' %
                                        (eol_at, rule.eol_at, rule.id, did, rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                            continue

                    # Check the exceptions
                    if did in lifetime_exceptions:
                        if eol_at > lifetime_exceptions[did]:
                            logging.info(prepend_str + 'Rule %s on DID %s on %s has longer expiration date than the one requested : %s' %
                                         (rule.id, did, rule.rse_expression, lifetime_exceptions[did]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logging.info(prepend_str + 'Updating rule %s on DID %s on %s according to the exception till %s' %
                                         (rule.id, did, rule.rse_expression, lifetime_exceptions[did]))
                            eol_at = lifetime_exceptions[did]
                            try:
                                update_rule(rule.id, options={'eol_at': lifetime_exceptions[did]})
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue

                    # Now check that the new eol_at is expired
                    if eol_at and eol_at < date_check:
                        no_locks = True
                        for lock in get_dataset_locks(rule.scope, rule.name):
                            if lock['rule_id'] == rule.id:
                                no_locks = False
                                if lock['rse_id'] not in summary:
                                    summary[lock['rse_id']] = {}
                                if did not in summary[lock['rse_id']]:
                                    summary[lock['rse_id']][did] = {'length': lock['length'] or 0,
                                                                    'bytes': lock['bytes'] or 0}
                        if no_locks:
                            logging.warning(prepend_str + 'Cannot find a lock for rule %s on DID %s' % (rule.id, did))
                        if not dry_run:
                            lifetime = grace_period + rand.randrange(spread_period + 1)
                            logging.info(prepend_str + 'Setting %s seconds lifetime for rule %s' % (lifetime, rule.id))
                            options = {'lifetime': lifetime}
                            if purge_replicas:
                                options['purge_replicas'] = True
                            if rule.locked and unlock:
                                logging.info(prepend_str + 'Unlocking rule %s', rule.id)
                                options['locked'] = False
                            try:
                                update_rule(rule.id, options=options)
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logging.critical(''.join(
                    format_exception(exc_type, exc_value,
                                     exc_traceback)).strip())

            for rse_id in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse_id]:
                    tot_datasets += 1
                    tot_files += summary[rse_id][did].get('length', 0)
                    tot_size += summary[rse_id][did].get('bytes', 0)
                logging.info(
                    prepend_str +
                    'For RSE %s, %s datasets will be deleted, representing %s files and %s bytes'
                    % (get_rse_name(rse_id=rse_id), tot_datasets, tot_files,
                       tot_size))

            if once:
                break
            else:
                tottime = time.time() - stime
                if tottime < sleep_time:
                    logging.info(prepend_str + 'Will sleep for %s seconds' %
                                 (str(sleep_time - tottime)))
                    time.sleep(sleep_time - tottime)
                    continue

        logging.info(prepend_str + 'Graceful stop requested')
        heartbeat.die(executable, hostname, pid, hb_thread)
        logging.info(prepend_str + 'Graceful stop done')
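Taken together, the exception branch above reduces to a small pure decision: keep the computed eol_at unless a lifetime exception asks for a later date. A minimal standalone sketch of that rule, with hypothetical names (reconcile_eol, exception_until) that do not appear in the daemon:

from datetime import datetime
from typing import Optional

def reconcile_eol(computed_eol: Optional[datetime],
                  exception_until: Optional[datetime]) -> Optional[datetime]:
    # No exception requested for this DID: keep the computed end-of-life
    if exception_until is None:
        return computed_eol
    # The rule already lives longer than the exception asks for: keep it
    if computed_eol is not None and computed_eol > exception_until:
        return computed_eol
    # Otherwise the exception extends the rule's lifetime
    return exception_until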
Ejemplo n.º 22
0
def reaper(rses,
           include_rses,
           exclude_rses,
           vos=None,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60,
           auto_exclude_threshold=100,
           auto_exclude_timeout=600):
    """
    Main loop to select and delete files.

    :param rses:                   List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param include_rses:           RSE expression to include RSEs.
    :param exclude_rses:           RSE expression to exclude RSEs from the Reaper.
    :param vos:                    VOs on which to look for RSEs. Only used in multi-VO mode.
                                   If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param chunk_size:             The size of chunk for deletion.
    :param once:                   If True, only runs one iteration of the main loop.
    :param greedy:                 If True, delete right away replicas with tombstone.
    :param scheme:                 Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:          The delay to query replicas in BEING_DELETED state.
    :param sleep_time:             Time between two cycles.
    :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded.
    :param auto_exclude_timeout:   Timeout for temporarily excluded RSEs.
    """
    hostname = socket.getfqdn()
    executable = 'reaper2'
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                       heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Reaper starting')

    if not once:
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                       heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Reaper started')

    while not GRACEFUL_STOP.is_set():
        # Try to get the auto-exclude parameters from the config table; otherwise use the CLI parameters.
        try:
            auto_exclude_threshold = get('reaper',
                                         'auto_exclude_threshold',
                                         default=auto_exclude_threshold)
            auto_exclude_timeout = get('reaper',
                                       'auto_exclude_timeout',
                                       default=auto_exclude_timeout)
        except ConfigNotFound:
            pass

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = get('reaper',
                                              'max_evaluator_backlog_count')
        except ConfigNotFound:
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = get(
                'reaper', 'max_evaluator_backlog_duration')
        except ConfigNotFound:
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog()
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[
                    0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[
                    1] and backlog[1] < datetime.utcnow() - timedelta(
                        minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses,
                                              vos)
        if not rses_to_process:
            logger(logging.ERROR,
                   'Reaper: No RSEs found. Will sleep for 30 seconds')
            GRACEFUL_STOP.wait(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                               heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blacklisted
                if rse['availability'] % 2 == 0:
                    logger(logging.DEBUG, 'RSE %s is blacklisted for delete',
                           rse['rse'])
                    continue
                needed_free_space, only_delete_obsolete = __check_rse_usage(
                    rse['rse'], rse['id'], greedy=greedy, logger=logger)
                if needed_free_space:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                    tot_needed_free_space += needed_free_space
                elif only_delete_obsolete:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                else:
                    logger(logging.DEBUG, 'Nothing to delete on %s',
                           rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True))
            logger(logging.DEBUG,
                   'List of RSEs to process ordered by needed space desc: %s',
                   str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has the RSE as key and (hostname, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversely with the number of workers
                # The ceil increases the weight of RSEs with a small number of files to delete
                if tot_needed_free_space:
                    max_workers = ceil(dict_rses[rse_key][0] /
                                       tot_needed_free_space * 1000 /
                                       heart_beat['nr_threads'])
                else:
                    max_workers = 1

                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id,
                                    expiration_time=120)
                if result is not NO_VALUE:
                    logger(
                        logging.INFO,
                        'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                        rse_name)
                    continue
                result = REGION.get('temporary_exclude_%s' % rse_id,
                                    expiration_time=auto_exclude_timeout)
                if result is not NO_VALUE:
                    logger(
                        logging.WARNING,
                        'Too many failed attempts for %s in the last cycle. RSE is temporarily excluded.',
                        rse_name)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    continue
                labels = {'rse': rse_name}
                EXCLUDED_RSE_GAUGE.labels(**labels).set(0)
                percent = 0
                if tot_needed_free_space:
                    percent = needed_free_space / tot_needed_free_space * 100
                logger(
                    logging.DEBUG,
                    'Working on %s. Percentage of the total space needed %.2f',
                    rse_name, percent)
                try:
                    rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                except KeyError:
                    logger(logging.DEBUG, "Hostname lookup for %s failed.",
                           rse_name)
                    REGION.set('pause_deletion_%s' % rse_id, True)
                    continue
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]
                max_deletion_thread = get_max_deletion_threads_by_hostname(
                    rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logger(
                        logging.DEBUG,
                        'Too many deletion threads for %s on RSE %s. Back off',
                        rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue
                logger(
                    logging.INFO,
                    'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logger(logging.DEBUG, 'Total deletion workers for %s : %i',
                       rse_hostname, tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                only_delete_obsolete = dict_rses[(rse_name, rse_id)][1]
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        if only_delete_obsolete:
                            logger(
                                logging.DEBUG,
                                'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                                rse_name)
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            only_delete_obsolete=only_delete_obsolete,
                            session=None)
                    logger(
                        logging.DEBUG,
                        'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logger(
                            logging.DEBUG,
                            'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logger(logging.ERROR, '%s', str(error))
                    continue
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)
                    continue  # replicas may be undefined past this point
                # Physical deletion takes place here
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme,
                                                  logger=logger)
                    for file_replicas in chunks(replicas, 100):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    list(
                                        rsemgr.lfns2pfns(
                                            rse_settings=rse_info,
                                            lfns=[{
                                                'scope':
                                                replica['scope'].external,
                                                'name': replica['name'],
                                                'path': replica['path']
                                            }],
                                            operation='delete',
                                            scheme=scheme).values())[0])
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logger(
                                    logging.WARNING,
                                    'Failed to get the PFN of UNAVAILABLE replica %s:%s on %s with error %s',
                                    replica['scope'], replica['name'],
                                    rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logger(logging.CRITICAL,
                                       'Exception',
                                       exc_info=True)

                        deleted_files = delete_from_storage(
                            file_replicas,
                            prot,
                            rse_info,
                            staging_areas,
                            auto_exclude_threshold,
                            logger=logger)
                        logger(logging.INFO,
                               '%i files processed in %s seconds',
                               len(file_replicas),
                               time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logger(
                            logging.DEBUG,
                            'delete_replicas succeeded on %s: %s replicas in %s seconds',
                            rse_name, len(deleted_files),
                            time.time() - del_start)
                        monitor.record_counter(counters='reaper.deletion.done',
                                               delta=len(deleted_files))
                        DELETION_COUNTER.inc(len(deleted_files))
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            if once:
                break

            tottime = time.time() - start_time
            if tottime < sleep_time:
                logger(logging.INFO, 'Will sleep for %s seconds',
                       sleep_time - tottime)
                GRACEFUL_STOP.wait(sleep_time - tottime)

        except DatabaseException as error:
            logger(logging.WARNING, 'Reaper: %s', str(error))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        finally:
            if once:
                break

    logger(logging.INFO, 'Graceful stop requested')
    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logger(logging.INFO, 'Graceful stop done')
    return
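The list_rses_mult construction above is the reaper's load-balancing trick: each RSE is repeated in the shuffled work list in proportion to its share of the total space to reclaim. A self-contained sketch under that reading, with illustrative names (build_work_list, needed_by_rse) not taken from the daemon:

import random
from math import ceil

def build_work_list(needed_by_rse, nr_threads):
    # Each RSE appears once per "weight unit"; the ceil guarantees at least
    # one entry even for RSEs with little to delete.
    total = sum(needed_by_rse.values())
    work = []
    for rse, needed in needed_by_rse.items():
        weight = ceil(needed / total * 1000 / nr_threads) if total else 1
        work.extend([rse] * int(weight))
    random.shuffle(work)
    return work

# build_work_list({'RSE_A': 9e12, 'RSE_B': 1e12}, nr_threads=10) yields
# roughly nine 'RSE_A' entries for every 'RSE_B' entry.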
Ejemplo n.º 23
0
def reaper(rses=[],
           worker_number=0,
           total_workers=1,
           chunk_size=100,
           once=False,
           scheme=None):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param total_workers:  The total number of workers.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    """
    logging.info('Starting Dark Reaper %s-%s: Will work on RSEs: %s',
                 worker_number, total_workers, str(rses))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256(
        (sys.argv[0] + ''.join(rses)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            logging.info(
                'Dark Reaper({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'
                .format(locals()))
            nothing_to_do = True

            random.shuffle(rses)
            for rse_id in rses:
                rse = rse_core.get_rse_name(rse_id=rse_id)
                rse_info = rsemgr.get_rse_info(rse)
                replicas = list_quarantined_replicas(
                    rse_id=rse_id,
                    limit=chunk_size,
                    worker_number=worker_number,
                    total_workers=total_workers)

                rse_protocol = rse_core.get_rse_protocols(rse_id=rse_id)
                prot = rsemgr.create_protocol(rse_info,
                                              'delete',
                                              scheme=scheme)
                deleted_replicas = []
                try:
                    prot.connect()
                    for replica in replicas:
                        nothing_to_do = False
                        scope = ''
                        if replica['scope']:
                            scope = replica['scope'].external
                        try:
                            pfn = str(
                                list(
                                    rsemgr.lfns2pfns(rse_settings=rse_info,
                                                     lfns=[{
                                                         'scope': scope,
                                                         'name': replica['name'],
                                                         'path': replica['path']
                                                     }],
                                                     operation='delete',
                                                     scheme=scheme).values())[0])
                            logging.info(
                                'Dark Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s',
                                worker_number, total_workers, scope,
                                replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logging.info(
                                'Dark Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                worker_number, total_workers, scope,
                                replica['name'], pfn, rse, duration)
                            add_message(
                                'deletion-done', {
                                    'scope': scope,
                                    'name': replica['name'],
                                    'rse': rse,
                                    'rse_id': rse_id,
                                    'file-size': replica.get('bytes') or 0,
                                    'bytes': replica.get('bytes') or 0,
                                    'url': pfn,
                                    'duration': duration,
                                    'protocol': prot.attributes['scheme']
                                })
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = 'Dark Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (
                                worker_number, total_workers, scope,
                                replica['name'], pfn, rse)
                            logging.warning(err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            err_msg = 'Dark Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (
                                worker_number, total_workers, scope,
                                replica['name'], pfn, rse, str(error))
                            logging.warning(err_msg)
                            add_message(
                                'deletion-failed', {
                                    'scope': scope,
                                    'name': replica['name'],
                                    'rse': rse,
                                    'rse_id': rse_id,
                                    'file-size': replica['bytes'] or 0,
                                    'bytes': replica['bytes'] or 0,
                                    'url': pfn,
                                    'reason': str(error),
                                    'protocol': prot.attributes['scheme']
                                })

                        except Exception:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id,
                                            replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logging.info(
                    'Dark Reaper %s-%s: Nothing to do. I will sleep for 60s',
                    worker_number, total_workers)
                time.sleep(60)

        except DatabaseException as error:
            logging.warning('Reaper: %s', str(error))
        except Exception:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
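One detail worth noting above: the dark reaper heartbeats under a hash of the executable plus its RSE list, so workers configured for different RSE sets are counted and partitioned separately. A Python 3 sketch of that key derivation, assuming rses is a list of strings:

import hashlib
import sys

def heartbeat_hash(rses):
    # The same executable run against a different RSE list yields a
    # different heartbeat partition.
    payload = (sys.argv[0] + ''.join(rses)).encode('utf-8')
    return hashlib.sha256(payload).hexdigest()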
Ejemplo n.º 24
0
            elif match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' %
                               e.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' %
                               e.__class__.__name__)
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        if once:
            break

    die(executable='rucio-judge-repairer',
        hostname=hostname,
        pid=pid,
        thread=current_thread)


def stop(signum=None, frame=None):
    """
    Graceful exit.
    """

    graceful_stop.set()


def run(once=False, threads=1):
    """
    Starts up the Judge-Repairer threads.
    """
Ejemplo n.º 25
0
            if time_left > 0:
                logging.debug(
                    "Waiting %s seconds until next FTS terminal state retrieval"
                    % time_left)
                time.sleep(time_left)
        except RequestException as e:
            logging.error("Failed to contact FTS server: %s" % (str(e)))
        except Exception:
            logging.critical(traceback.format_exc())

        if once:
            return

    logging.info('poller_latest - graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('poller_latest - graceful stop done')


def stop(signum=None, frame=None):
    """
    Graceful exit.
    """

    graceful_stop.set()


def run(once=False,
        last_nhours=1,
        external_hosts=None,
Ejemplo n.º 26
0
def fts_throttler(once=False, cycle_interval=3600):
    """
    Main loop to automatically configure FTS storages.
    """
    graceful_stop.clear()
    logging.info('FTS Throttler starting')

    executable = 'conveyor-fts-throttler'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    current_time = time.time()
    graceful_stop.wait(10)
    running_instance = False

    while not graceful_stop.is_set():
        heart_beat = heartbeat.live(executable,
                                    hostname,
                                    pid,
                                    hb_thread,
                                    older_than=3600)
        if heart_beat['nr_threads'] < 2:
            running_instance = True
            # this loop cannot be entered by more than one instance at a time.
            while not graceful_stop.is_set():

                try:
                    heart_beat = heartbeat.live(executable,
                                                hostname,
                                                pid,
                                                hb_thread,
                                                older_than=3600)
                    prepend_str = 'Thread [%i/%i] : ' % (
                        heart_beat['assign_thread'], heart_beat['nr_threads'])

                    logging.info(prepend_str + "fts_throttler start cycle")
                    if FTSThrottler().revert():
                        logging.info('revert was successful, now tuning')
                        FTSThrottler().tune()
                        logging.info('Tuning finished for this cycle')
                    else:
                        logging.warning(
                            'could not revert; tuning requires a successful revert, will try again next cycle.'
                        )

                    if once:
                        break
                    if time.time() < current_time + cycle_interval:
                        graceful_stop.wait(
                            int((current_time + cycle_interval) - time.time()))
                    current_time = time.time()
                except Exception:
                    logging.critical(prepend_str + 'fts_throttler crashed %s' %
                                     (traceback.format_exc()))

                if once:
                    break
        else:
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])
            logging.info(
                prepend_str +
                'another fts_throttler instance is already running, will wait')
            if time.time() < current_time + cycle_interval:
                graceful_stop.wait(
                    int((current_time + cycle_interval) - time.time()))
            current_time = time.time()

    logging.info(prepend_str + 'Throttler - graceful stop requested')

    # before we stop, try to revert, but only if this instance was running the cycles.
    # ! If the cycle info file information is shared between instances, then this implementation must be changed !
    if running_instance:
        try:
            FTSThrottler().revert()
        except Exception:
            logging.warning('could not revert changes before stopping')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Throttler - graceful stop done')
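Two mechanics in the throttler deserve a note: the heartbeat count doubles as a crude leader election (only the instance that sees nr_threads < 2 enters the tuning loop), and every branch waits out the remainder of cycle_interval before looping again. A minimal sketch of that pacing step, with illustrative names (wait_for_next_cycle, cycle_started_at) not taken from the daemon:

import time

def wait_for_next_cycle(stop_event, cycle_started_at, cycle_interval):
    # Sleep only for whatever is left of the current cycle; stop_event.wait
    # returns early if a graceful stop is requested in the meantime.
    remaining = (cycle_started_at + cycle_interval) - time.time()
    if remaining > 0:
        stop_event.wait(int(remaining))
    return time.time()  # timestamp to use as the start of the next cycle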
Ejemplo n.º 27
0
def receiver(id, total_threads=1, full_mode=False):
    """
    Main loop to consume messages from the FTS3 producer.
    """

    logging.info('receiver starting in full mode: %s' % full_mode)

    executable = 'conveyor-receiver'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()

    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heartbeat.live(executable, hostname, pid, hb_thread)

    brokers_alias = []
    brokers_resolved = []
    try:
        brokers_alias = [
            b.strip()
            for b in config_get('messaging-fts3', 'brokers').split(',')
        ]
    except Exception:
        raise Exception('Could not load brokers from configuration')

    logging.info('resolving broker dns alias: %s' % brokers_alias)

    for broker in brokers_alias:
        addrinfos = socket.getaddrinfo(broker, 0, socket.AF_INET, 0,
                                       socket.IPPROTO_TCP)
        brokers_resolved.extend(ai[4][0] for ai in addrinfos)

    logging.info('brokers resolved to %s', brokers_resolved)

    logging.info('checking authentication method')
    use_ssl = True
    try:
        use_ssl = config_get_bool('messaging-fts3', 'use_ssl')
    except Exception:
        logging.info(
            'could not find use_ssl in configuration -- please update your rucio.cfg'
        )

    port = config_get_int('messaging-fts3', 'port')
    vhost = config_get('messaging-fts3',
                       'broker_virtual_host',
                       raise_exception=False)
    if not use_ssl:
        username = config_get('messaging-fts3', 'username')
        password = config_get('messaging-fts3', 'password')
        port = config_get_int('messaging-fts3', 'nonssl_port')

    conns = []
    for broker in brokers_resolved:
        if not use_ssl:
            logging.info('setting up username/password authentication: %s' %
                         broker)
            con = stomp.Connection12(host_and_ports=[(broker, port)],
                                     use_ssl=False,
                                     vhost=vhost,
                                     reconnect_attempts_max=999)
        else:
            logging.info('setting up ssl cert/key authentication: %s' % broker)
            con = stomp.Connection12(
                host_and_ports=[(broker, port)],
                use_ssl=True,
                ssl_key_file=config_get('messaging-fts3', 'ssl_key_file'),
                ssl_cert_file=config_get('messaging-fts3', 'ssl_cert_file'),
                vhost=vhost,
                reconnect_attempts_max=999)
        conns.append(con)

    logging.info('receiver started')

    while not graceful_stop.is_set():

        heartbeat.live(executable, hostname, pid, hb_thread)

        for conn in conns:

            if not conn.is_connected():
                logging.info('connecting to %s' %
                             conn.transport._Transport__host_and_ports[0][0])
                record_counter('daemons.messaging.fts3.reconnect.%s' %
                               conn.transport._Transport__host_and_ports[0]
                               [0].split('.')[0])

                conn.set_listener(
                    'rucio-messaging-fts3',
                    Receiver(
                        broker=conn.transport._Transport__host_and_ports[0],
                        id=id,
                        total_threads=total_threads,
                        full_mode=full_mode))
                conn.start()
                if not use_ssl:
                    conn.connect(username, password, wait=True)
                else:
                    conn.connect(wait=True)
                conn.subscribe(destination=config_get('messaging-fts3',
                                                      'destination'),
                               id='rucio-messaging-fts3',
                               ack='auto')

        time.sleep(1)

    logging.info('receiver graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('receiver graceful stop done')
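The connection setup above fans a DNS alias out into one STOMP connection per resolved address, so adding a broker behind the alias automatically adds a consumer. A standalone sketch of the resolution step; socket.getaddrinfo is the real call, while the helper name and example alias are illustrative:

import socket

def resolve_brokers(aliases):
    resolved = []
    for alias in aliases:
        addrinfos = socket.getaddrinfo(alias, 0, socket.AF_INET, 0,
                                       socket.IPPROTO_TCP)
        resolved.extend(ai[4][0] for ai in addrinfos)
    return resolved

# resolve_brokers(['broker-fts.example.org']) might return
# ['192.0.2.10', '192.0.2.11'] -- one stomp.Connection12 is built per entry.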
Ejemplo n.º 28
0
def submitter(once=False, rses=None, mock=False,
              bulk=100, group_bulk=1, group_policy='rule', fts_source_strategy='auto',
              activities=None, sleep_time=600, max_sources=4, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = sys.argv[0]
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Submitter starting with timeout %s' % (timeout))

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Transfer submitter started')

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info(prepend_str + "CMS user transfer activity")
                    user_transfer = True

                logging.info(prepend_str + 'Starting to get transfers for %s' % (activity))
                start_time = time.time()
                transfers = __get_transfers(total_workers=heart_beat['nr_threads'] - 1,
                                            worker_number=heart_beat['assign_thread'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logging.info(prepend_str + 'Got %s transfers for %s in %s seconds' % (len(transfers), activity, time.time() - start_time))

                # group transfers
                logging.info(prepend_str + 'Starting to group transfers for %s' % (activity))
                start_time = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, fts_source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logging.info(prepend_str + 'Starting to submit transfers for %s' % (activity))

                for external_host in grouped_jobs:
                    if not user_transfer:
                        for job in grouped_jobs[external_host]:
                            # submit transfers
                            submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                            logging_prepend_str=prepend_str, timeout=timeout)
                    else:
                        for _, jobs in grouped_jobs[external_host].items():
                            # submit transfers
                            for job in jobs:
                                submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                                logging_prepend_str=prepend_str, timeout=timeout, user_transfer_job=user_transfer)

                if len(transfers) < group_bulk:
                    logging.info(prepend_str + 'Only %s transfers for %s, which is less than the group bulk %s; sleeping %s seconds' % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
    return
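The max_time_in_queue handling above packs a small config grammar: a comma-separated list of activity:hours pairs, with a 168-hour default injected when absent. A minimal sketch of that parsing; the function name and sample string are illustrative:

def parse_max_time_in_queue(raw):
    out = {}
    for conf in raw.split(','):
        activity, lifetime = conf.split(':')
        out[activity.strip()] = int(lifetime.strip())
    out.setdefault('default', 168)  # hours, as in the daemon above
    return out

# parse_max_time_in_queue('User Subscriptions:12') returns
# {'User Subscriptions': 12, 'default': 168}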
Ejemplo n.º 29
0
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extracts the scope, name and rse_id, and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos starting')

    time.sleep(
        10
    )  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')

    states_mapping = {
        BadPFNStatus.BAD: BadFilesStatus.BAD,
        BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
        BadPFNStatus.TEMPORARY_UNAVAILABLE:
        BadFilesStatus.TEMPORARY_UNAVAILABLE
    }
    logger(logging.INFO, 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not GRACEFUL_STOP.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                    heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')
        pfns = []
        try:
            bad_replicas = {}
            temporary_unavailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Classify the PFNs into bad_replicas and temporary_unavailables
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = states_mapping[pfn['state']]
                if state in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                elif state == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason,
                            expires_at) not in temporary_unavailables:
                        temporary_unavailables[(account, reason,
                                                expires_at)] = []
                    temporary_unavailables[(account, reason,
                                            expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logger(
                    logging.INFO,
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                            schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logger(
                            logging.INFO,
                            'The following replicas are unknown and will be removed: %s'
                            % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas,
                                             session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logger(
                            logging.DEBUG,
                            'Running on RSE %s%s with %s replicas' %
                            (get_rse_name(rse_id=rse_id), vo_str,
                             len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(
                            math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logger(
                                logging.DEBUG,
                                'Running on %s chunk out of %s' %
                                (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(
                                pfns=chunk,
                                reason=reason,
                                issuer=account,
                                status=state,
                                session=session)
                            if unknown_replicas:
                                logger(
                                    logging.DEBUG, 'Unknown replicas: %s' %
                                    (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unavailables:
                vo = account.vo
                pfns = temporary_unavailables[(account, reason, expires_at)]
                logger(
                    logging.INFO,
                    'Declaring %s replicas temporarily unavailable with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logger(logging.DEBUG, 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                        schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                    # Collect the unknown replicas once per scheme, not once per RSE
                    unknown_replicas.extend(
                        tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logger(
                        logging.INFO,
                        'The following replicas are unknown and will be removed: %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(
                        rse, vo)
                    logger(logging.DEBUG, 'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id],
                                                 rse_id=None,
                                                 vo=vo,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    logger(
                        logging.DEBUG, 'Running on %s replicas on RSE %s' %
                        (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(
                        math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logger(
                                logging.DEBUG,
                                'Running on %s chunk out of %s' %
                                (nchunk, tot_chunk))
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=reason,
                                expires_at=expires_at,
                                session=session)
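                            # Drop the processed PFNs from bad_pfns in the same transaction.
                            # A separate name is used so the outer pfns list (checked
                            # against the bulk size at the end of the cycle) is not clobbered.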
                            chunk_pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=chunk_pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation,
                                ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logger(
                                logging.ERROR,
                                'Failed to bulk update PFNs; they will be updated individually. Error: %s'
                                % str(error))
                            for rep in chunk:
                                logger(logging.DEBUG,
                                       'Working on %s' % (str(rep)))
                                try:
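                                    # Check that the DID still exists; raises DataIdentifierNotFound otherwise.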
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(
                                        rep['scope'], rep['name'])
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.TEMPORARY_UNAVAILABLE,
                                            []))
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(
                                        rep_state.get(ReplicaState.BAD, []))
                                    # If the replica is already not available, it is removed from the bad PFNs table
                                    if rep['rse_id'] in unavailable_states:
                                        logger(
                                            logging.INFO,
                                            '%s is in an unavailable state and will be removed from the list of bad PFNs'
                                            % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    # If the expiration date of the TEMPORARY_UNAVAILABLE is in the past, it is removed from the bad PFNs table
                                    elif expires_at < datetime.now():
                                        logger(
                                            logging.INFO,
                                            'PFN %s expiration time (%s) is in the past and the replica is not in an unavailable state. Removing the PFN from bad_pfns'
                                            % (str(rep['pfn']), expires_at))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    # Else update everything in the same transaction
                                    else:
                                        try:
                                            session = get_session()
                                            update_replicas_states(
                                                [rep],
                                                nowait=False,
                                                session=session)
                                            bulk_add_bad_replicas(
                                                [rep],
                                                account,
                                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                                reason=reason,
                                                expires_at=expires_at,
                                                session=session)
                                            bulk_delete_bad_pfns(
                                                pfns=[rep['pfn']],
                                                session=session)
                                            session.commit()  # pylint: disable=no-member
                                        except Exception:
                                            session.rollback()  # pylint: disable=no-member
                                            logger(
                                                logging.ERROR,
                                                'Cannot update state of %s' %
                                                str(rep['pfn']))
                                except (DataIdentifierNotFound,
                                        ReplicaNotFound):
                                    logger(
                                        logging.ERROR,
                                        'DID or replica not found. Will remove %s from the list of bad PFNs'
                                        % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                         session=None)
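                            # Re-acquire a session before the next chunk; the one that failed was rolled back.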
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.CRITICAL,
                                   'Exception',
                                   exc_info=True)
                            session = get_session()

        except Exception as error:
            logger(logging.ERROR, '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logger(
                logging.INFO,
                'Processed the maximum number of PFNs for this bulk size; starting the next cycle immediately'
            )
        elif tottime < sleep_time:
            logger(logging.INFO,
                   'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
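
A minimal, self-contained sketch of the chunk-then-fallback commit pattern used above. Everything here is hypothetical: apply_batch() stands in for the update_replicas_states / bulk_add_bad_replicas / bulk_delete_bad_pfns calls, and the item dictionaries stand in for replicas; only the control flow mirrors the daemon.

import math


def chunks(items, size):
    """Yield successive fixed-size slices of a list."""
    for start in range(0, len(items), size):
        yield items[start:start + size]


def apply_batch(batch):
    """Hypothetical bulk operation: fails if any item in the batch is bad."""
    if any(item.get('bad') for item in batch):
        raise ValueError('batch contains a bad item')


def process(items, chunk_size=10):
    tot_chunk = int(math.ceil(len(items) / float(chunk_size)))
    for nchunk, chunk in enumerate(chunks(items, chunk_size), start=1):
        try:
            # Fast path: one bulk call (one transaction) per chunk.
            apply_batch(chunk)
            print('chunk %i/%i committed' % (nchunk, tot_chunk))
        except ValueError:
            # Slow path: retry item by item so a single bad entry cannot
            # block the whole chunk, mirroring the daemon's fallback.
            for item in chunk:
                try:
                    apply_batch([item])
                except ValueError:
                    print('skipping bad item %s' % item)


process([{'bad': False}] * 7 + [{'bad': True}] + [{'bad': False}] * 4,
        chunk_size=4)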
Example no. 30
def kronos_file(thread=0, dataset_queue=None, sleep_time=60):
    """
    Main loop to consume tracer reports.
    """

    logging.info('kronos_file[%i/?] starting', thread)

    executable = 'kronos-file'
    hostname = socket.gethostname()
    pid = getpid()
    hb_thread = current_thread()

    chunksize = config_get_int('tracer-kronos', 'chunksize')
    prefetch_size = config_get_int('tracer-kronos', 'prefetch_size')
    subscription_id = config_get('tracer-kronos', 'subscription_id')
    try:
        bad_files_patterns = []
        pattern = config_get(section='kronos',
                             option='bad_files_patterns',
                             session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            bad_files_patterns.append(re.compile(pat.strip()))
    except (NoOptionError, NoSectionError, RuntimeError):
        bad_files_patterns = []
    except Exception as error:
        logging.log(logging.ERROR,
                    'kronos_file[%i/?] Failed to get bad_files_patterns %s',
                    thread, str(error))
        bad_files_patterns = []

    use_ssl = True
    try:
        use_ssl = config_get_bool('tracer-kronos', 'use_ssl')
    except Exception:
        pass

    if not use_ssl:
        username = config_get('tracer-kronos', 'username')
        password = config_get('tracer-kronos', 'password')

    excluded_usrdns = set(
        config_get('tracer-kronos', 'excluded_usrdns').split(','))
    vhost = config_get('tracer-kronos',
                       'broker_virtual_host',
                       raise_exception=False)

    brokers_alias = [
        b.strip() for b in config_get('tracer-kronos', 'brokers').split(',')
    ]
    port = config_get_int('tracer-kronos', 'port')
    reconnect_attempts = config_get_int('tracer-kronos', 'reconnect_attempts')
    ssl_key_file = config_get('tracer-kronos',
                              'ssl_key_file',
                              raise_exception=False)
    ssl_cert_file = config_get('tracer-kronos',
                               'ssl_cert_file',
                               raise_exception=False)

    sanity_check(executable=executable, hostname=hostname)
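    # Main consumption loop: each cycle refreshes the heartbeat and
    # (re)connects any broker connection that has dropped.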
    conns = []  # Safe default so the shutdown path below works even if the loop never runs
    logger = formatted_logger(logging.log, 'kronos-file[%i/?] ' % thread + '%s')
    while not graceful_stop.is_set():
        start_time = time()
        heart_beat = live(executable, hostname, pid, hb_thread)
        prepend_str = 'kronos-file[%i/%i] ' % (heart_beat['assign_thread'],
                                               heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prepend_str + '%s')
        conns = get_stomp_brokers(brokers=brokers_alias,
                                  port=port,
                                  use_ssl=use_ssl,
                                  vhost=vhost,
                                  reconnect_attempts=reconnect_attempts,
                                  ssl_key_file=ssl_key_file,
                                  ssl_cert_file=ssl_cert_file,
                                  timeout=sleep_time,
                                  logger=logger)
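        # Re-attach the listener and re-subscribe on any connection that dropped.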
        for conn in conns:
            if not conn.is_connected():
                logger(
                    logging.INFO, 'connecting to %s' %
                    str(conn.transport._Transport__host_and_ports[0]))
                record_counter(
                    'daemons.tracer.kronos.reconnect.{host}',
                    labels={
                        'host': conn.transport._Transport__host_and_ports[0][0]
                    })
                conn.set_listener(
                    'rucio-tracer-kronos',
                    AMQConsumer(
                        broker=conn.transport._Transport__host_and_ports[0],
                        conn=conn,
                        queue=config_get('tracer-kronos', 'queue'),
                        chunksize=chunksize,
                        subscription_id=subscription_id,
                        excluded_usrdns=excluded_usrdns,
                        dataset_queue=dataset_queue,
                        bad_files_patterns=bad_files_patterns,
                        logger=logger))
                if not use_ssl:
                    conn.connect(username, password)
                else:
                    conn.connect()
                conn.subscribe(
                    destination=config_get('tracer-kronos', 'queue'),
                    ack='client-individual',
                    id=subscription_id,
                    headers={'activemq.prefetchSize': prefetch_size})

        tottime = time() - start_time
        if tottime < sleep_time:
            logger(logging.INFO,
                   'Will sleep for %s seconds' % (sleep_time - tottime))
            sleep(sleep_time - tottime)

    logger(logging.INFO, 'graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logger(logging.INFO, 'graceful stop done')
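
Both daemons pace their main loop the same way: measure how long the cycle took and sleep only for the remainder of sleep_time, so cycles start on a steady cadence. A minimal, self-contained sketch of that pattern, with do_work() as a hypothetical stand-in for one cycle of real work:

import threading
import time

graceful_stop = threading.Event()


def do_work():
    """Hypothetical stand-in for one cycle of real work."""
    time.sleep(0.5)


def daemon_loop(sleep_time=2.0, max_cycles=3):
    cycles = 0
    while not graceful_stop.is_set():
        start_time = time.time()
        do_work()
        cycles += 1
        if cycles >= max_cycles:  # demo-only exit; a real daemon runs until stopped
            break
        tottime = time.time() - start_time
        if tottime < sleep_time:
            # Sleep only for the remainder so the cadence stays constant.
            time.sleep(sleep_time - tottime)


daemon_loop()

Using graceful_stop.wait(sleep_time - tottime) instead of time.sleep() would let the loop react to a stop request mid-sleep; the daemons above use plain sleeps.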