Example #1
0
def declare_bad_file_replicas(pfns, rse, issuer):
    """
    Declare a list of bad replicas.

    :param pfns: The list of PFNs.
    :param rse: The RSE name.
    :param issuer: The issuer account.
    """
    kwargs = {}
    if not permission.has_permission(issuer=issuer, action='declare_bad_file_replicas', kwargs=kwargs):
        raise exception.AccessDenied('Account %s can not declare bad replicas on %s' % (issuer, rse))
    return replica.declare_bad_file_replicas(pfns=pfns, rse=rse)
Example #2
0
        """ REPLICA (CORE): Add bad replicas and list them"""
        tmp_scope = 'mock'
        nbfiles = 5
        # Adding replicas to deterministic RSE
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb', 'meta': {'events': 10}} for i in xrange(nbfiles)]
        rse_info = rsemgr.get_rse_info('MOCK')
        rse_id1 = rse_info['id']
        add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True)

        # Listing replicas on deterministic RSE
        replicas = []
        list_rep = []
        for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
            replicas.extend(replica['rses']['MOCK'])
            list_rep.append(replica)
        declare_bad_file_replicas(replicas, 'MOCK')
        bad_replicas = list_bad_replicas()
        nbbadrep = 0
        for rep in list_rep:
            for badrep in bad_replicas:
                if badrep['rse_id'] == rse_id1:
                    if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                        nbbadrep += 1
        assert_equal(len(replicas), nbbadrep)

        # Adding replicas to non-deterministic RSE
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb',
                  'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()), 'meta': {'events': 10}} for i in xrange(nbfiles)]
        rse_info = rsemgr.get_rse_info('MOCK2')
        rse_id2 = rse_info['id']
        add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)
Example #3
0
                     files=files,
                     account='root',
                     ignore_availability=True)

        # Listing replicas on deterministic RSE
        replicas = []
        list_rep = []
        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['srm']):
            replicas.extend(replica['rses']['MOCK'])
            list_rep.append(replica)
        r = declare_bad_file_replicas(replicas, 'This is a good reason',
                                      'root')
        assert_equal(r, {})
        bad_replicas = list_bad_replicas()
        nbbadrep = 0
        for rep in list_rep:
            for badrep in bad_replicas:
                if badrep['rse_id'] == rse_id1:
                    if badrep['scope'] == rep['scope'] and badrep[
                            'name'] == rep['name']:
                        nbbadrep += 1
        assert_equal(len(replicas), nbbadrep)

        # Adding replicas to non-deterministic RSE
        files = [{
            'scope':
            tmp_scope,
def declare_suspicious_replicas_bad(once=False,
                                    younger_than=3,
                                    nattempts=10,
                                    rse_expression='MOCK',
                                    max_replicas_per_rse=100):
    """
    Main loop to check for available replicas which are labeled as suspicious

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'.
    :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE.
                                 If more is found, processing is skipped and warning is printed.
    :returns: None
    """

    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES

    executable = argv[0]
    rses = []
    for rse in parse_expression(expression=rse_expression):
        rses.append(rse['rse'])
    rses.sort()
    executable += ' --rse-expression ' + str(rses)

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    live(executable=executable,
         hostname=socket.gethostname(),
         pid=os.getpid(),
         thread=threading.current_thread())

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable,
                             hostname=socket.gethostname(),
                             pid=os.getpid(),
                             thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logging.error(
                    'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.',
                    socket.gethostname())
                die(executable=executable,
                    hostname=socket.gethostname(),
                    pid=os.getpid(),
                    thread=threading.current_thread())
                break

            start = time.time()

            logging.info(
                'replica_recoverer[%i/%i]: ready to query replicas at RSE %s,'
                +
                ' reported suspicious in the last %i days at least %i times which are available on other RSEs.',
                worker_number, total_workers, rse_expression, younger_than,
                nattempts)

            getfileskwargs = {
                'younger_than': younger_than,
                'nattempts': nattempts,
                'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                'available_elsewhere': True,
                'is_suspicious': True
            }

            recoverable_replicas = get_suspicious_files(
                rse_expression, **getfileskwargs)

            logging.info(
                'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.',
                worker_number, total_workers,
                time.time() - start, len(recoverable_replicas))

            if not recoverable_replicas and not once:
                logging.info(
                    'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas. Sleeping for 60 seconds.',
                    worker_number, total_workers, len(recoverable_replicas))
                GRACEFUL_STOP.wait(60)
            else:
                logging.info(
                    'replica_recoverer[%i/%i]: looking for replica surls.',
                    worker_number, total_workers)

                start = time.time()
                surls_to_recover = {
                }  # dictionary of { rse1: [surl1, surl2, ... ], rse2: ... }
                cnt_surl_not_found = 0
                for replica in recoverable_replicas:
                    scope = replica['scope']
                    name = replica['name']
                    rse = replica['rse']
                    rse_id = replica['rse_id']
                    if GRACEFUL_STOP.is_set():
                        break
                    if rse_id not in surls_to_recover:
                        surls_to_recover[rse_id] = []
                    # for each suspicious replica, we get its surl through the list_replicas function
                    surl_not_found = True
                    for rep in list_replicas([{'scope': scope, 'name': name}]):
                        for site in rep['rses']:
                            if site == rse_id:
                                surls_to_recover[rse_id].append(
                                    rep['rses'][site][0])
                                surl_not_found = False
                    if surl_not_found:
                        cnt_surl_not_found += 1
                        logging.warning(
                            'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.',
                            worker_number, total_workers, name, rse)

                logging.info(
                    'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.',
                    worker_number, total_workers,
                    len(recoverable_replicas) - cnt_surl_not_found,
                    len(recoverable_replicas),
                    time.time() - start)

                for rse_id in surls_to_recover:
                    logging.info(
                        'replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on %s: %s.',
                        worker_number, total_workers,
                        len(surls_to_recover[rse_id]), rse,
                        str(surls_to_recover[rse_id]))
                    if len(surls_to_recover[rse_id]) > max_replicas_per_rse:
                        logging.warning(
                            'replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on %s. Please investigate.',
                            worker_number, total_workers, max_replicas_per_rse,
                            str(len(surls_to_recover[rse_id])), rse)
                    else:
                        declare_bad_file_replicas(
                            pfns=surls_to_recover[rse_id],
                            reason='Suspicious. Automatic recovery.',
                            issuer=InternalAccount('root'),
                            status=BadFilesStatus.BAD,
                            session=None)
                        logging.info(
                            'replica_recoverer[%i/%i]: finished declaring bad replicas on %s.',
                            worker_number, total_workers, rse)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('replica.recoverer.exceptions.%s',
                               err.__class__.__name__)
            elif match('.*ORA-03135.*', str(err.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('replica.recoverer.exceptions.%s',
                               err.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('replica.recoverer.exceptions.%s',
                               err.__class__.__name__)
        except Exception as err:
            logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.%s',
                           err.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=socket.gethostname(),
        pid=os.getpid(),
        thread=threading.current_thread())
    logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number,
                 total_workers)
Example #5
0
    def __update_atime(self):
        """
        Bulk update atime.
        """
        replicas = []
        rses = []
        for report in self.__reports:
            try:
                # Identify suspicious files
                try:
                    if self.__bad_files_patterns and report['eventType'] in [
                            'get_sm', 'get_sm_a', 'get'
                    ] and 'clientState' in report and report[
                            'clientState'] not in [
                                'DONE', 'FOUND_ROOT', 'ALREADY_DONE'
                            ]:
                        for pattern in self.__bad_files_patterns:
                            if 'stateReason' in report and report[
                                    'stateReason'] and isinstance(
                                        report['stateReason'],
                                        str) and pattern.match(
                                            report['stateReason']):
                                reason = report['stateReason'][:255]
                                if 'url' not in report or not report['url']:
                                    logging.error(
                                        'Missing url in the following trace : '
                                        + str(report))
                                else:
                                    try:
                                        surl = report['url']
                                        declare_bad_file_replicas(
                                            [
                                                surl,
                                            ],
                                            reason=reason,
                                            issuer='root',
                                            status=BadFilesStatus.SUSPICIOUS)
                                        logging.info(
                                            'Declare suspicious file %s with reason %s'
                                            % (report['url'], reason))
                                    except Exception as error:
                                        logging.error(
                                            'Failed to declare suspicious file'
                                            + str(error))
                except Exception as error:
                    logging.error('Problem with bad trace : %s . Error %s' %
                                  (str(report), str(error)))

                # check if scope in report. if not skip this one.
                if 'scope' not in report:
                    record_counter('daemons.tracer.kronos.missing_scope')
                    if report['eventType'] != 'touch':
                        continue
                else:
                    record_counter('daemons.tracer.kronos.with_scope')

                # handle all events starting with get* and download and touch events.
                if not report['eventType'].startswith('get') and not report[
                        'eventType'].startswith('sm_get') and not report[
                            'eventType'] == 'download' and not report[
                                'eventType'] == 'touch':
                    continue
                if report['eventType'].endswith('_es'):
                    continue
                record_counter('daemons.tracer.kronos.total_get')
                if report['eventType'] == 'get':
                    record_counter('daemons.tracer.kronos.dq2clients')
                elif report['eventType'] == 'get_sm' or report[
                        'eventType'] == 'sm_get':
                    if report['eventVersion'] == 'aCT':
                        record_counter(
                            'daemons.tracer.kronos.panda_production_act')
                    else:
                        record_counter(
                            'daemons.tracer.kronos.panda_production')
                elif report['eventType'] == 'get_sm_a' or report[
                        'eventType'] == 'sm_get_a':
                    if report['eventVersion'] == 'aCT':
                        record_counter(
                            'daemons.tracer.kronos.panda_analysis_act')
                    else:
                        record_counter('daemons.tracer.kronos.panda_analysis')
                elif report['eventType'] == 'download':
                    record_counter('daemons.tracer.kronos.rucio_download')
                elif report['eventType'] == 'touch':
                    record_counter('daemons.tracer.kronos.rucio_touch')
                else:
                    record_counter('daemons.tracer.kronos.other_get')

                if report['eventType'] == 'download' or report[
                        'eventType'] == 'touch':
                    report['usrdn'] = report['account']

                if report['usrdn'] in self.__excluded_usrdns:
                    continue

                # handle touch and non-touch traces differently
                if report['eventType'] != 'touch':
                    # check if the report has the right state.
                    if 'eventVersion' in report:
                        if report['eventVersion'] != 'aCT':
                            if report['clientState'] in self.__excluded_states:
                                continue

                    if 'remoteSite' not in report:
                        continue
                    if not report['remoteSite']:
                        continue

                    if 'filename' not in report:
                        if 'name' in report:
                            report['filename'] = report['name']

                    rses = report['remoteSite'].strip().split(',')
                    for rse in rses:
                        replicas.append({
                            'name':
                            report['filename'],
                            'scope':
                            report['scope'],
                            'rse':
                            rse,
                            'accessed_at':
                            datetime.utcfromtimestamp(
                                report['traceTimeentryUnix']),
                            'traceTimeentryUnix':
                            report['traceTimeentryUnix'],
                            'eventVersion':
                            report['eventVersion']
                        })
                else:
                    # if touch event and if datasetScope is in the report then it means
                    # that there is no file scope/name and therefore only the dataset is
                    # put in the queue to be updated and the rest is skipped.
                    if 'datasetScope' in report:
                        rse = None
                        if 'remoteSite' in report:
                            rse = report['remoteSite']
                        self.__dataset_queue.put({
                            'scope':
                            report['datasetScope'],
                            'name':
                            report['dataset'],
                            'rse':
                            rse,
                            'accessed_at':
                            datetime.utcfromtimestamp(
                                report['traceTimeentryUnix'])
                        })
                        continue
                    else:
                        if 'remoteSite' not in report:
                            continue
                        replicas.append({
                            'name':
                            report['filename'],
                            'scope':
                            report['scope'],
                            'rse':
                            report['remoteSite'],
                            'accessed_at':
                            datetime.utcfromtimestamp(
                                report['traceTimeentryUnix'])
                        })

            except (KeyError, AttributeError):
                logging.error(format_exc())
                record_counter('daemons.tracer.kronos.report_error')
                continue

            for did in list_parent_dids(report['scope'], report['filename']):
                if did['type'] != DIDType.DATASET:
                    continue
                # do not update _dis datasets
                if did['scope'] == 'panda' and '_dis' in did['name']:
                    continue
                for rse in rses:
                    self.__dataset_queue.put({
                        'scope':
                        did['scope'],
                        'name':
                        did['name'],
                        'did_type':
                        did['type'],
                        'rse':
                        rse,
                        'accessed_at':
                        datetime.utcfromtimestamp(report['traceTimeentryUnix'])
                    })

        logging.debug(replicas)

        try:
            start_time = time()
            for replica in replicas:
                # if touch replica hits a locked row put the trace back into queue for later retry
                if not touch_replica(replica):
                    resubmit = {
                        'filename': replica['name'],
                        'scope': replica['scope'],
                        'remoteSite': replica['rse'],
                        'traceTimeentryUnix': replica['traceTimeentryUnix'],
                        'eventType': 'get',
                        'usrdn': 'someuser',
                        'clientState': 'DONE',
                        'eventVersion': replica['eventVersion']
                    }
                    self.__conn.send(body=jdumps(resubmit),
                                     destination=self.__queue,
                                     headers={
                                         'appversion': 'rucio',
                                         'resubmitted': '1'
                                     })
                    record_counter('daemons.tracer.kronos.sent_resubmitted')
                    logging.warning(
                        '(kronos_file) hit locked row, resubmitted to queue')
            record_timer('daemons.tracer.kronos.update_atime',
                         (time() - start_time) * 1000)
        except Exception:
            logging.error(format_exc())
            record_counter('daemons.tracer.kronos.update_error')

        logging.info('(kronos_file) updated %d replicas' % len(replicas))
Example #6
0
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extract the scope, name and rse_id and fill the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos starting')

    time.sleep(
        10
    )  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')

    states_mapping = {
        BadPFNStatus.BAD: BadFilesStatus.BAD,
        BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
        BadPFNStatus.TEMPORARY_UNAVAILABLE:
        BadFilesStatus.TEMPORARY_UNAVAILABLE
    }
    logger(logging.INFO, 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not GRACEFUL_STOP.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                    heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')
        pfns = []
        try:
            bad_replicas = {}
            temporary_unvailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Class the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = states_mapping[pfn['state']]
                if state in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                elif state == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason,
                            expires_at) not in temporary_unvailables:
                        temporary_unvailables[(account, reason,
                                               expires_at)] = []
                    temporary_unvailables[(account, reason,
                                           expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logger(
                    logging.INFO,
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                            schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logger(
                            logging.INFO,
                            'The following replicas are unknown and will be removed : %s'
                            % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas,
                                             session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logger(
                            logging.DEBUG,
                            'Running on RSE %s%s with %s replicas' %
                            (get_rse_name(rse_id=rse_id), vo_str,
                             len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(
                            math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logger(
                                logging.DEBUG,
                                'Running on %s chunk out of %s' %
                                (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(
                                pfns=chunk,
                                reason=reason,
                                issuer=account,
                                status=state,
                                session=session)
                            if unknown_replicas:
                                logger(
                                    logging.DEBUG, 'Unknown replicas : %s' %
                                    (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unvailables:
                vo = account.vo
                pfns = temporary_unvailables[(account, reason, expires_at)]
                logger(
                    logging.INFO,
                    'Declaring %s replicas temporary available with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logger(logging.DEBUG, 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                        schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logger(
                        logging.INFO,
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(
                        rse, vo)
                    logger(logging.DEBUG, 'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id],
                                                 rse_id=None,
                                                 vo=vo,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    logger(
                        logging.DEBUG, 'Running on %s replicas on RSE %s' %
                        (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(
                        math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logger(
                                logging.DEBUG,
                                'Running on %s chunk out of %s' %
                                (nchunk, tot_chunk))
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=reason,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation,
                                ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logger(
                                logging.ERROR,
                                'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s'
                                % str(error))
                            for rep in chunk:
                                logger(logging.DEBUG,
                                       'Working on %s' % (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(
                                        rep['scope'], rep['name'])
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.TEMPORARY_UNAVAILABLE,
                                            []))
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(
                                        rep_state.get(ReplicaState.BAD, []))
                                    # If the replica is already not available, it is removed from the bad PFNs table
                                    if rep['rse_id'] in unavailable_states:
                                        logger(
                                            logging.INFO,
                                            '%s is in unavailable state. Will be removed from the list of bad PFNs'
                                            % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    # If the expiration date of the TEMPORARY_UNAVAILABLE is in the past, it is removed from the bad PFNs table
                                    elif expires_at < datetime.now():
                                        logger(
                                            logging.INFO,
                                            'PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns'
                                            % (str(rep['pfn']), expires_at))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    # Else update everything in the same transaction
                                    else:
                                        try:
                                            session = get_session()
                                            update_replicas_states(
                                                [rep],
                                                nowait=False,
                                                session=session)
                                            bulk_add_bad_replicas(
                                                [rep],
                                                account,
                                                state=BadFilesStatus.
                                                TEMPORARY_UNAVAILABLE,
                                                reason=reason,
                                                expires_at=expires_at,
                                                session=session)
                                            bulk_delete_bad_pfns(
                                                pfns=[rep['pfn']],
                                                session=session)
                                            session.commit()  # pylint: disable=no-member
                                        except Exception:
                                            logger(
                                                logging.ERROR,
                                                'Cannot update state of %s' %
                                                str(rep['pfn']))
                                except (DataIdentifierNotFound,
                                        ReplicaNotFound):
                                    logger(
                                        logging.ERROR,
                                        'Will remove %s from the list of bad PFNs'
                                        % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                         session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.CRITICAL,
                                   'Exception',
                                   exc_info=True)
                            session = get_session()

        except Exception as error:
            logger(logging.ERROR, '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logger(
                logging.INFO,
                'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle'
            )
        elif tottime < sleep_time:
            logger(logging.INFO,
                   'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
def declare_suspicious_replicas_bad(once=False,
                                    younger_than=3,
                                    nattempts=10,
                                    vos=None,
                                    limit_suspicious_files_on_rse=5,
                                    sleep_time=3600,
                                    active_mode=False):
    """
    Main loop to check for available replicas which are labeled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.
    Replicas that are the last remaining copy of a file have additional checks (checksum
    comparison, etc.) before being declared bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None, we either use all VOs if run from "def",
    :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE
                                          is considered problematic and the suspicious replicas on that RSE
                                          are labeled as 'TEMPORARY_UNAVAILABLE'.
    :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than sleep_time, then
                       it should sleep until sleep_time is over.
    :returns: None
    """

    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES

    executable = argv[0]

    prepend_str = 'replica_recoverer: '
    logger = formatted_logger(logging.log, prepend_str + '%s')

    multi_vo = config_get_bool('common',
                               'multi_vo',
                               raise_exception=False,
                               default=False)
    if not multi_vo:
        if vos:
            logger(
                logging.WARNING,
                'Ignoring argument vos, this is only applicable in a multi-VO setup.'
            )
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - set([v['vo'] for v in list_vos()])
            if invalid:
                msg = 'VO{} {} cannot be found'.format(
                    's' if len(invalid) > 1 else '',
                    ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logger(
            logging.INFO,
            'replica_recoverer: This instance will work on VO%s: %s' %
            ('s' if len(vos) > 1 else '', ', '.join([v for v in vos])))

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(executable=executable,
                     hostname=socket.gethostname(),
                     pid=os.getpid(),
                     thread=threading.current_thread())
    prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'],
                                                    heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable,
                             hostname=socket.gethostname(),
                             pid=os.getpid(),
                             thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logger(
                    logging.ERROR,
                    'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.',
                    socket.gethostname())
                die(executable=executable,
                    hostname=socket.gethostname(),
                    pid=os.getpid(),
                    thread=threading.current_thread())
                break

            prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number,
                                                          total_workers)
            logger = formatted_logger(logging.log, prepend_str + '%s')

            start = time.time()

            try:
                json_file = open(
                    "/opt/rucio/etc/suspicious_replica_recoverer.json")
            except:
                logger(
                    logging.WARNING,
                    "An error occured whilst trying to open the JSON file.")
                break

            try:
                json_data = json.load(json_file)
            except ValueError:
                logger(logging.WARNING, "No JSON object could be decoded.")

            # Checking that the json file is formatedd properly.
            for i, entry in enumerate(json_data):
                if "datatype" not in entry or "action" not in entry:
                    logger(
                        logging.ERROR,
                        'Entry %s in the json file is incomplete (missing either "datatype" or "action").',
                        i)
                    break

            logger(
                logging.INFO,
                'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.',
                younger_than, nattempts)

            getfileskwargs = {
                'younger_than': younger_than,
                'nattempts': nattempts,
                'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                'is_suspicious': True
            }

            for vo in vos:
                logger(logging.INFO, 'Start replica recovery for VO: %s', vo)
                recoverable_replicas = {}
                if vo not in recoverable_replicas:
                    recoverable_replicas[vo] = {}

                # rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true', filter={'vo': vo})], key=lambda k: k['rse'])
                rse_list = sorted([
                    rse for rse in parse_expression(
                        'enable_suspicious_file_recovery=true')
                    if rse['vo'] == vo
                ],
                                  key=lambda k: k['rse'])

                logger(
                    logging.DEBUG,
                    "List of RSEs with enable_suspicious_file_recovery = True:"
                )
                for i in rse_list:
                    logger(logging.DEBUG, '%s', i)

                for rse in rse_list:
                    time_start_rse = time.time()
                    rse_expr = rse['rse']
                    cnt_surl_not_found = 0
                    if rse_expr not in recoverable_replicas[vo]:
                        recoverable_replicas[vo][rse_expr] = {}
                    # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs
                    suspicious_replicas_avail_elsewhere = get_suspicious_files(
                        rse_expr,
                        available_elsewhere=SuspiciousAvailability[
                            "EXIST_COPIES"].value,
                        filter_={'vo': vo},
                        **getfileskwargs)
                    # Get the suspicious replicas that are the last remaining copies
                    suspicious_replicas_last_copy = get_suspicious_files(
                        rse_expr,
                        available_elsewhere=SuspiciousAvailability["LAST_COPY"]
                        .value,
                        filter_={'vo': vo},
                        **getfileskwargs)

                    logger(logging.DEBUG, 'Suspicious replicas on %s:',
                           rse_expr)
                    logger(logging.DEBUG,
                           'Replicas with copies on other RSEs (%s):',
                           len(suspicious_replicas_avail_elsewhere))
                    for i in suspicious_replicas_avail_elsewhere:
                        logger(logging.DEBUG, '%s', i)
                    logger(logging.DEBUG,
                           'Replicas that are the last remaining copy (%s):',
                           len(suspicious_replicas_last_copy))
                    for i in suspicious_replicas_last_copy:
                        logger(logging.DEBUG, '%s', i)

                    # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE.
                    if (rse['availability'] not in {4, 5, 6, 7}) and (
                        (len(suspicious_replicas_avail_elsewhere) > 0) or
                        (len(suspicious_replicas_last_copy) > 0)):
                        logger(
                            logging.WARNING,
                            "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate. \n",
                            rse_expr, rse['availability'])
                        continue

                    if suspicious_replicas_avail_elsewhere:
                        for replica in suspicious_replicas_avail_elsewhere:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                for rep in list_replicas([{
                                        'scope': scope,
                                        'name': rep_name
                                }]):
                                    for rse_ in rep['rses']:
                                        if rse_ == rse_id:
                                            recoverable_replicas[vo][rse_expr][
                                                rep_name] = {
                                                    'name': rep_name,
                                                    'rse_id': rse_id,
                                                    'scope': scope,
                                                    'surl':
                                                    rep['rses'][rse_][0],
                                                    'available_elsewhere': True
                                                }
                                            surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(
                                        logging.WARNING,
                                        'Skipping suspicious replica %s on %s, no surls were found.',
                                        rep_name, rse_expr)

                    if suspicious_replicas_last_copy:
                        for replica in suspicious_replicas_last_copy:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                # Should only return one rse, as there is only one replica remaining
                                for rep in list_replicas([{
                                        'scope': scope,
                                        'name': rep_name
                                }]):
                                    recoverable_replicas[vo][rse_expr][
                                        rep_name] = {
                                            'name': rep_name,
                                            'rse_id': rse_id,
                                            'scope': scope,
                                            'surl': rep['rses'][rse_id][0],
                                            'available_elsewhere': False
                                        }
                                    surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(
                                        logging.WARNING,
                                        'Skipping suspicious replica %s on %s, no surls were found.',
                                        rep_name, rse_expr)

                    logger(
                        logging.INFO,
                        'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.',
                        time.time() - time_start_rse, rse_expr,
                        len(suspicious_replicas_avail_elsewhere) +
                        len(suspicious_replicas_last_copy),
                        len(suspicious_replicas_avail_elsewhere) +
                        len(suspicious_replicas_last_copy) -
                        cnt_surl_not_found,
                        len(suspicious_replicas_avail_elsewhere) +
                        len(suspicious_replicas_last_copy))

                    if len(suspicious_replicas_avail_elsewhere) + len(
                            suspicious_replicas_last_copy) != 0:
                        logger(
                            logging.DEBUG,
                            'List of replicas on %s for which the pfns have been found:',
                            rse_expr)
                        for i in recoverable_replicas[vo][rse_expr]:
                            logger(logging.DEBUG, '%s', i)

                # Log file is long and hard to read -> implement some spacing
                logger(
                    logging.INFO,
                    'All RSEs have been checked for suspicious replicas. Total time: %s seconds.',
                    time.time() - start)
                logger(logging.INFO, 'Begin check for problematic RSEs.')
                time_start_check_probl = time.time()

                # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE.
                # The suspicious files are marked as temporarily unavailable.
                list_problematic_rses = []
                for rse_key in list(recoverable_replicas[vo].keys()):
                    if len(recoverable_replicas[vo]
                           [rse_key].values()) > limit_suspicious_files_on_rse:
                        list_problematic_rses.append(rse_key)
                        surls_list = []
                        for replica_value in recoverable_replicas[vo][
                                rse_key].values():
                            surls_list.append(replica_value['surl'])

                        if active_mode:
                            add_bad_pfns(pfns=surls_list,
                                         account=InternalAccount('root',
                                                                 vo=vo),
                                         state='TEMPORARY_UNAVAILABLE',
                                         expires_at=datetime.utcnow() +
                                         timedelta(days=3))

                        logger(
                            logging.INFO,
                            "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).",
                            rse_key, limit_suspicious_files_on_rse)
                        logger(
                            logging.INFO,
                            "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:",
                            rse_key)
                        for replica_values in recoverable_replicas[vo][
                                rse_key].values():
                            logger(
                                logging.INFO,
                                'Temporarily unavailable: RSE: %s    Scope: %s    Name: %s    PFN: %s',
                                rse_key, replica_values['scope'],
                                replica_values['name'], replica_values['surl'])
                        # Remove the RSE from the dictionary as it has been dealt with.
                        del recoverable_replicas[vo][rse_key]

                logger(logging.INFO,
                       "Following RSEs were deemed problematic (total: %s)",
                       len(list_problematic_rses))
                for rse in list_problematic_rses:
                    logger(logging.INFO, "%s", rse)

                auditor = 0
                checksum = 0

                # Label suspicious replicas as bad if they have oher copies on other RSEs (that aren't also marked as suspicious).
                # If they are the last remaining copies, deal with them differently.
                for rse_key in list(recoverable_replicas[vo].keys()):
                    files_to_be_declared_bad = []
                    files_to_be_ignored = []
                    # Remove RSEs from dictionary that don't have any suspicious replicas
                    if len(recoverable_replicas[vo][rse_key]) == 0:
                        del recoverable_replicas[vo][rse_key]
                        continue
                    # Get the rse_id by going to one of the suspicious replicas from that RSE and reading it from there
                    rse_id = list(recoverable_replicas[vo]
                                  [rse_key].values())[0]['rse_id']
                    for replica_key in list(
                            recoverable_replicas[vo][rse_key].keys()):
                        file_metadata = get_metadata(
                            recoverable_replicas[vo][rse_key][replica_key]
                            ["scope"], recoverable_replicas[vo][rse_key]
                            [replica_key]["name"])
                        recoverable_replicas[vo][rse_key][replica_key][
                            "datatype"] = str(file_metadata["datatype"])
                        if recoverable_replicas[vo][rse_key][replica_key][
                                'available_elsewhere'] is True:
                            # Replicas with other copies on at least one other RSE can safely be labeled as bad
                            files_to_be_declared_bad.append(
                                recoverable_replicas[vo][rse_key][replica_key])
                            # Remove replica from dictionary
                            del recoverable_replicas[vo][rse_key][replica_key]
                        elif recoverable_replicas[vo][rse_key][replica_key][
                                'available_elsewhere'] is False:
                            if (recoverable_replicas[vo][rse_key][replica_key]
                                ['name'].startswith("log.")) or (
                                    recoverable_replicas[vo][rse_key]
                                    [replica_key]['name'].startswith("user")):
                                # Don't keep log files or user files
                                files_to_be_declared_bad.append(
                                    recoverable_replicas[vo][rse_key]
                                    [replica_key])
                                del recoverable_replicas[vo][rse_key][
                                    replica_key]
                            else:
                                # Deal with replicas based on their metadata.
                                if file_metadata[
                                        "datatype"] is None:  # "None" type has no function "split()"
                                    files_to_be_ignored.append(
                                        recoverable_replicas[vo][rse_key]
                                        [replica_key])
                                    continue
                                for i in json_data:
                                    if i["datatype"] == file_metadata[
                                            "datatype"].split("_")[-1]:
                                        action = i["action"]
                                        if action == "ignore":
                                            files_to_be_ignored.append(
                                                recoverable_replicas[vo]
                                                [rse_key][replica_key])
                                        elif action == "declare bad":
                                            suspicious_reason = get_suspicious_reason(
                                                recoverable_replicas[vo]
                                                [rse_key][replica_key]
                                                ["rse_id"],
                                                recoverable_replicas[vo]
                                                [rse_key][replica_key]
                                                ["scope"],
                                                recoverable_replicas[vo]
                                                [rse_key][replica_key]["name"],
                                                nattempts)
                                            for reason in suspicious_reason:
                                                if "auditor" in reason[
                                                        "reason"].lower():
                                                    auditor += 1
                                                    files_to_be_declared_bad.append(
                                                        recoverable_replicas[
                                                            vo][rse_key]
                                                        [replica_key])
                                                    break
                                                elif "checksum" in reason[
                                                        "reason"].lower():
                                                    checksum += 1
                                                    files_to_be_declared_bad.append(
                                                        recoverable_replicas[
                                                            vo][rse_key]
                                                        [replica_key])
                                                    break
                                        else:
                                            logger(
                                                logging.WARNING,
                                                "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)",
                                                rse_key, replica_key,
                                                recoverable_replicas[vo]
                                                [rse_key][replica_key]['surl'],
                                                i["datatype"], i["action"])
                                        break
                                    else:
                                        # If no policy has been set, default to ignoring the file (no action taken).
                                        files_to_be_ignored.append(
                                            recoverable_replicas[vo][rse_key]
                                            [replica_key])

                    logger(
                        logging.INFO,
                        '(%s) Remaining replicas (pfns) that will be ignored:',
                        rse_key)
                    for i in files_to_be_ignored:
                        logger(
                            logging.INFO,
                            'Ignore: RSE: %s    Scope: %s    Name: %s    Datatype: %s    PFN: %s',
                            rse_key, i["scope"], i["name"], i["datatype"],
                            i["surl"])
                    logger(
                        logging.INFO,
                        '(%s) Remaining replica (pfns) that will be declared BAD:',
                        rse_key)
                    for i in files_to_be_declared_bad:
                        logger(
                            logging.INFO,
                            'Declare bad: RSE: %s    Scope: %s    Name: %s    Datatype: %s    PFN: %s',
                            rse_key, i["scope"], i["name"], i["datatype"],
                            i["surl"])

                    if files_to_be_declared_bad:
                        logger(
                            logging.INFO,
                            'Ready to declare %s bad replica(s) on %s (RSE id: %s).',
                            len(files_to_be_declared_bad), rse_key,
                            str(rse_id))
                        logger(
                            logging.INFO,
                            'Number of replicas with checksum problems: %i',
                            checksum)
                        logger(
                            logging.INFO,
                            'Number of replicas that were declared suspicious by the auditor: %i',
                            auditor)

                        if active_mode:
                            declare_bad_file_replicas(
                                pfns=files_to_be_declared_bad,
                                reason='Suspicious. Automatic recovery.',
                                issuer=InternalAccount('root', vo=vo),
                                session=None)

                        logger(logging.INFO,
                               'Finished declaring bad replicas on %s.\n',
                               rse_key)
                    else:
                        logger(logging.INFO,
                               'No files were declared bad on %s.\n', rse_key)

                logger(
                    logging.INFO,
                    'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.',
                    time.time() - time_start_check_probl)

                time_passed = time.time() - start
                logger(logging.INFO, 'Total time: %s seconds', time_passed)
                daemon_sleep(start_time=start,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' +
                               err.__class__.__name__)
            elif match('.*ORA-03135.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' +
                               err.__class__.__name__)
            else:
                logger(logging.CRITICAL, traceback.format_exc())
                record_counter('replica.recoverer.exceptions.' +
                               err.__class__.__name__)
        except Exception as err:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' +
                           err.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=socket.gethostname(),
        pid=os.getpid(),
        thread=threading.current_thread())
    logger(logging.INFO, 'Graceful stop done.')
Example #8
0
def test_add_and_delete_bad_replicas(rse_factory, mock_scope, root_account,
                                     did_client, vo):
    """ REPLICA (CORE): Add bad replicas and delete them"""
    # Adding replicas to deterministic RSE
    nbfiles = 5
    rse1, rse1_id = rse_factory.make_srm_rse(deterministic=True, vo=vo)
    files = [{
        'scope': mock_scope,
        'name': 'file_%s' % generate_uuid(),
        'bytes': 1,
        'adler32': '0cc737eb',
        'meta': {
            'events': 10
        }
    } for _ in range(nbfiles)]
    client_files = [{
        'scope': file_['scope'].external,
        'name': file_['name']
    } for file_ in files]
    add_replicas(rse_id=rse1_id,
                 files=files,
                 account=root_account,
                 ignore_availability=True)
    tmp_dsn = 'dataset_%s' % generate_uuid()
    did_client.add_dataset(scope=mock_scope.external, name=tmp_dsn)
    did_client.add_files_to_dataset(mock_scope.external,
                                    name=tmp_dsn,
                                    files=client_files,
                                    rse=rse1)

    # Declare replica bad
    replicas = []
    for replica in list_replicas(dids=[{
            'scope': f['scope'],
            'name': f['name'],
            'type': DIDType.FILE
    } for f in files],
                                 schemes=['srm']):
        replicas.extend(replica['rses'][rse1_id])
    r = declare_bad_file_replicas(replicas, 'This is a good reason',
                                  root_account)
    assert r == {}

    # Check state of bad replicas
    list_bad_rep = [{
        'scope': rep['scope'].external,
        'name': rep['name']
    } for rep in list_bad_replicas_status(
        state=BadFilesStatus.BAD, rse_id=rse1_id, vo=vo)]
    for rep in client_files:
        assert rep in list_bad_rep
    assert [
        rep for rep in list_bad_replicas_status(
            state=BadFilesStatus.DELETED, rse_id=rse1_id, vo=vo)
    ] == []

    # Now delete the dataset
    delete_dids([{
        'scope': mock_scope,
        'name': tmp_dsn,
        'did_type': DIDType.DATASET,
        'purge_replicas': True
    }],
                account=root_account)
    assert [
        rep for rep in list_bad_replicas_status(
            state=BadFilesStatus.BAD, rse_id=rse1_id, vo=vo)
    ] == []
    list_deleted_rep = [{
        'scope': rep['scope'].external,
        'name': rep['name']
    } for rep in list_bad_replicas_status(
        state=BadFilesStatus.DELETED, rse_id=rse1_id, vo=vo)]
    for rep in client_files:
        assert rep in list_deleted_rep
Example #9
0
def test_add_list_bad_replicas(rse_factory, mock_scope, root_account):
    """ REPLICA (CORE): Add bad replicas and list them"""

    nbfiles = 5
    # Adding replicas to deterministic RSE
    _, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    files = [{
        'scope': mock_scope,
        'name': 'file_%s' % generate_uuid(),
        'bytes': 1,
        'adler32': '0cc737eb',
        'meta': {
            'events': 10
        }
    } for _ in range(nbfiles)]
    add_replicas(rse_id=rse1_id,
                 files=files,
                 account=root_account,
                 ignore_availability=True)

    # Listing replicas on deterministic RSE
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{
            'scope': f['scope'],
            'name': f['name'],
            'type': DIDType.FILE
    } for f in files],
                                 schemes=['srm']):
        replicas.extend(replica['rses'][rse1_id])
        list_rep.append(replica)
    r = declare_bad_file_replicas(replicas, 'This is a good reason',
                                  root_account)
    assert r == {}
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse1_id:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep[
                        'name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep

    # Adding replicas to non-deterministic RSE
    _, rse2_id = rse_factory.make_srm_rse(deterministic=False)
    files = [{
        'scope':
        mock_scope,
        'name':
        'file_%s' % generate_uuid(),
        'bytes':
        1,
        'adler32':
        '0cc737eb',
        'pfn':
        'srm://%s.cern.ch/srm/managerv2?SFN=/test/%s/%s' %
        (rse2_id, mock_scope, generate_uuid()),
        'meta': {
            'events': 10
        }
    } for _ in range(nbfiles)]
    add_replicas(rse_id=rse2_id,
                 files=files,
                 account=root_account,
                 ignore_availability=True)

    # Listing replicas on non-deterministic RSE
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{
            'scope': f['scope'],
            'name': f['name'],
            'type': DIDType.FILE
    } for f in files],
                                 schemes=['srm']):
        replicas.extend(replica['rses'][rse2_id])
        list_rep.append(replica)
    r = declare_bad_file_replicas(replicas, 'This is a good reason',
                                  root_account)
    assert r == {}
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse2_id:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep[
                        'name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep

    # Try adding replicas already declared bad
    r = declare_bad_file_replicas(replicas, 'This is a good reason',
                                  root_account)
    output = ['%s Unknown replica' % rep for rep in replicas]
    assert list(r.keys()) == [rse2_id]
    list1 = r[rse2_id]
    list1.sort()
    list2 = ['%s Already declared' % clean_surls([rep])[0] for rep in replicas]
    list2.sort()
    assert list1 == list2

    # Now adding non-existing bad replicas
    files = [
        'srm://%s.cern.ch/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()),
    ]
    r = declare_bad_file_replicas(files, 'This is a good reason', root_account)
    output = ['%s Unknown replica' % rep for rep in files]
    assert r == {rse2_id: output}
Example #10
0
def test_get_bad_replicas_backlog(rse_factory, mock_scope, root_account,
                                  file_config_mock):
    """ REPLICA (CORE): Check the behaviour of the necromancer in case of backlog on an RSE"""

    # Run necromancer once
    necromancer_run(threads=1, bulk=10000, once=True)

    nbfiles1 = 100
    nbfiles2 = 20
    # Adding replicas to deterministic RSE
    rse1, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    _, rse2_id = rse_factory.make_srm_rse(deterministic=True)

    # Create bad replicas on rse1
    files = [{
        'scope': mock_scope,
        'name': 'file_%s' % generate_uuid(),
        'bytes': 1,
        'adler32': '0cc737eb',
        'meta': {
            'events': 10
        }
    } for _ in range(nbfiles1)]
    add_replicas(rse_id=rse1_id,
                 files=files,
                 account=root_account,
                 ignore_availability=True)

    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{
            'scope': f['scope'],
            'name': f['name'],
            'type': DIDType.FILE
    } for f in files],
                                 schemes=['srm']):
        replicas.extend(replica['rses'][rse1_id])
        list_rep.append({
            'scope': replica['scope'],
            'name': replica['name'],
            'rse': rse1,
            'rse_id': rse1_id
        })
    res = declare_bad_file_replicas(replicas, 'This is a good reason',
                                    root_account)
    assert res == {}

    result = get_bad_replicas_backlog()
    assert rse1_id in result
    assert result[rse1_id] == nbfiles1

    # Create more bad replicas on rse2
    files = [{
        'scope': mock_scope,
        'name': 'file_%s' % generate_uuid(),
        'bytes': 1,
        'adler32': '0cc737eb',
        'meta': {
            'events': 10
        }
    } for _ in range(nbfiles2)]
    add_replicas(rse_id=rse2_id,
                 files=files,
                 account=root_account,
                 ignore_availability=True)

    repl = []
    for replica in list_replicas(dids=[{
            'scope': f['scope'],
            'name': f['name'],
            'type': DIDType.FILE
    } for f in files],
                                 schemes=['srm']):
        repl.extend(replica['rses'][rse2_id])
    res = declare_bad_file_replicas(repl, 'This is a good reason',
                                    root_account)
    assert res == {}

    # List bad replicas on rse1
    bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}])
    assert len(bad_replicas) == nbfiles1
    for rep in bad_replicas:
        assert rep in list_rep

    # Run necromancer once, all the files on RSE2 should be gone, 80 files should stay on RSE1
    REGION.invalidate()
    get_bad_replicas_backlog()
    necromancer_run(threads=1, bulk=20, once=True)

    bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}, {'id': rse2_id}])
    assert len(bad_replicas) == 80
    for rep in bad_replicas:
        assert rep['rse_id'] == rse1_id
Example #11
0
def process_output(output, sanity_check=True, compress=True):
    """Perform post-consistency-check actions.

    DARK files are put in the quarantined-replica table so that they
    may be deleted by the Dark Reaper.  LOST files are reported as
    suspicious so that they may be further checked by the cloud squads.

    ``output`` should be an ``str`` with the absolute path to the file
    produced by ``consistency()``.  It must maintain its naming
    convention.

    If ``sanity_check`` is ``True`` (default) and the number of entries
    in the output file is deemed excessive, the actions are aborted.

    If ``compress`` is ``True`` (default), the file is compressed with
    bzip2 after the actions are successfully performed.
    """
    logger = logging.getLogger('auditor-worker')
    dark_replicas = []
    lost_replicas = []
    try:
        with open(output) as f:
            for line in f:
                label, path = line.rstrip().split(',', 1)
                scope, name = guess_replica_info(path)
                if label == 'DARK':
                    dark_replicas.append({
                        'path': path,
                        'scope': scope,
                        'name': name
                    })
                elif label == 'LOST':
                    lost_replicas.append({'scope': scope, 'name': name})
                else:
                    raise ValueError('unexpected label')
    # Since the file is read immediately after its creation, any error
    # exposes a bug in the Auditor.
    except Exception as error:
        logger.critical('Error processing "%s"', output, exc_info=True)
        raise error

    rse = os.path.basename(output[:output.rfind('_')])
    usage = get_rse_usage(rse, source='rucio')[0]
    threshold = config.config_get('auditor', 'threshold', False, 0.2)

    # Perform a basic sanity check by comparing the number of entries
    # with the total number of files on the RSE.  If the percentage is
    # significant, there is most likely an issue with the site dump.
    found_error = False
    if len(dark_replicas) > threshold * usage['files']:
        logger.warning('Number of DARK files is exceeding threshold: "%s"',
                       output)
        found_error = True
    if len(lost_replicas) > threshold * usage['files']:
        logger.warning('Number of LOST files is exceeding threshold: "%s"',
                       output)
        found_error = True
    if found_error and sanity_check:
        raise AssertionError('sanity check failed')

    # While converting LOST replicas to PFNs, entries that do not
    # correspond to a replica registered in Rucio are silently dropped.
    lost_pfns = [
        r['rses'][rse][0] for r in list_replicas(lost_replicas)
        if rse in r['rses']
    ]

    add_quarantined_replicas(rse, dark_replicas)
    logger.debug('Processed %d DARK files from "%s"', len(dark_replicas),
                 output)
    declare_bad_file_replicas(lost_pfns,
                              reason='Reported by Auditor',
                              issuer='root',
                              status=BadFilesStatus.SUSPICIOUS)
    logger.debug('Processed %d LOST files from "%s"', len(lost_replicas),
                 output)

    if compress:
        destination = bz2_compress_file(output)
        logger.debug('Compressed "%s"', destination)