Example No. 1
def __update_bulk_replicas(replicas, session=None, logger=logging.log):
    """
    Used by the finisher to handle, in bulk, available and unavailable replicas that belong to the same rule.

    :param replicas:              List of replicas.
    :param session:               The database session to use.
    :returns commit_or_rollback:  Boolean.
    """
    try:
        replica_core.update_replicas_states(replicas,
                                            nowait=True,
                                            session=session)
    except ReplicaNotFound as error:
        logger(logging.WARNING,
               'Failed to bulk update replicas, will do it one by one: %s',
               str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'],
                                         session=session)
        logger(logging.INFO, "HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
               replica['request_id'], replica['scope'], replica['name'],
               replica['rse_id'], str(replica['state']))
    return True
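
The warning above ('will do it one by one') implies that the bulk update is wrapped by a caller which falls back to per-replica handling, roughly in the style of __update_replica in Example No. 3. The following is only a sketch of that caller pattern, assuming it lives in the same module as the private helpers; the function name handle_terminated_replicas and the exception import are illustrative, not actual finisher code.

import logging

from rucio.common.exception import ReplicaNotFound, UnsupportedOperation


def handle_terminated_replicas(replicas, session=None, logger=logging.log):
    """Sketch: try the bulk update first, then retry each replica individually."""
    try:
        __update_bulk_replicas(replicas, session=session, logger=logger)
    except (UnsupportedOperation, ReplicaNotFound):
        # Bulk handling failed, so fall back to one-by-one handling as the
        # warning message suggests.
        for replica in replicas:
            try:
                __update_replica(replica, session=session)
            except (UnsupportedOperation, ReplicaNotFound) as error:
                logger(logging.ERROR, 'Could not handle replica %s:%s: %s',
                       replica['scope'], replica['name'], str(error))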
Example No. 2
def __update_bulk_replicas(replicas, session=None):
    """
    Used by the finisher to handle, in bulk, available and unavailable replicas that belong to the same rule.

    :param replicas:              List of replicas.
    :param session:               The database session to use.
    :returns commit_or_rollback:  Boolean.
    """
    try:
        replica_core.update_replicas_states(
            replicas, nowait=True, add_tombstone=False, session=session
        )  # TODO: add_tombstone needs to be changed to True once set_tombstone is fixed
    except ReplicaNotFound as error:
        logging.warn(
            'Failed to bulk update replicas, will do it one by one: %s',
            str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'],
                                         session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                     replica['request_id'], replica['scope'], replica['name'],
                     replica['rse_id'], str(replica['state']))
    return True
Example No. 3
def __update_replica(replica, session=None):
    """
    Used by finisher to update a replica to a finished state.

    :param replica:               Replica as a dictionary.
    :param session:               The database session to use.
    :returns commit_or_rollback:  Boolean.
    """

    try:
        replica_core.update_replicas_states([replica],
                                            nowait=True,
                                            add_tombstone=True,
                                            session=session)
        if not replica['archived']:
            request_core.archive_request(replica['request_id'],
                                         session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                     replica['request_id'], replica['scope'], replica['name'],
                     replica['rse_id'], str(replica['state']))
    except (UnsupportedOperation, ReplicaNotFound) as error:
        logging.warn(
            "ERROR WHEN HANDLING REQUEST %s DID %s:%s AT RSE %s STATE %s: %s",
            replica['request_id'], replica['scope'], replica['name'],
            replica['rse_id'], str(replica['state']), str(error))
        # replica cannot be found. register it and schedule it for deletion
        try:
            if replica['state'] == ReplicaState.AVAILABLE and replica['request_type'] != RequestType.STAGEIN:
                logging.info(
                    "Replica cannot be found. Adding a replica %s:%s AT RSE %s with tombstone=utcnow",
                    replica['scope'], replica['name'], replica['rse_id'])
                replica_core.add_replica(
                    replica['rse_id'],
                    replica['scope'],
                    replica['name'],
                    replica['bytes'],
                    pfn=replica['pfn'] if 'pfn' in replica else None,
                    # It will be deleted immediately; do we need to get the accurate account from the rule?
                    account=InternalAccount('root', vo=replica['scope'].vo),
                    adler32=replica['adler32'],
                    tombstone=datetime.datetime.utcnow(),
                    session=session)
            if not replica['archived']:
                request_core.archive_request(replica['request_id'],
                                             session=session)
            logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s",
                         replica['request_id'], replica['scope'],
                         replica['name'], replica['rse_id'],
                         str(replica['state']))
        except Exception as error:
            logging.error(
                'Cannot register replica for DID %s:%s at RSE %s - potential dark data - %s',
                replica['scope'], replica['name'], replica['rse_id'],
                str(error))
            raise

    return True
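
For reference, the replica dictionaries consumed by __update_replica above need at least the keys read in the function body. A hypothetical payload, with made-up values and assuming the usual Rucio import locations for the types involved, might look like this:

from rucio.common.types import InternalScope
from rucio.db.sqla.constants import ReplicaState, RequestType

# Every value below is a placeholder for illustration only.
replica = {
    'request_id': '9f1a2b3c4d5e6f708192a3b4c5d6e7f8',
    'scope': InternalScope('user.jdoe', vo='def'),  # accessed as replica['scope'].vo above
    'name': 'file_0001.root',
    'rse_id': 'a1b2c3d4e5f60718293a4b5c6d7e8f90',
    'state': ReplicaState.AVAILABLE,
    'request_type': RequestType.TRANSFER,
    'archived': False,
    'bytes': 1048576,
    'adler32': '0a1b2c3d',
    'pfn': None,  # optional; only used when a lost replica has to be re-registered
}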
Example No. 4
    def remove_extra_replicas(self, to_remove):
        """
        :param to_remove: replicas to remove from Rucio
        :return:
        """
        scope = InternalScope(self.scope)
        with monitor.record_timer_block('cms_sync.time_remove_replica'):
            if to_remove and self.dry_run:
                logging.info('Dry run: Removing replicas %s from rse %s.',
                             str(to_remove), self.rse)
            elif to_remove:
                logging.debug('Removing %s replicas from rse %s.',
                              len(to_remove), self.rse)
                for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                    replicas = [{
                        'scope': scope,
                        'name': lfn,
                        "rse_id": self.rse_id,
                        "state": "U"
                    } for lfn in to_remove_chunk]
                    # transactional_session here?
                    # while lock is set stuck, judge-repairer might make transfer requests before rule is gone but does it matter?
                    update_replicas_states(
                        replicas=replicas,
                        add_tombstone=False,
                    )

                # delete_replicas(rse=self.rse, issuer=self.account,
                #                     files=[{'scope': self.scope, 'name': lfn} for lfn in to_remove_chunk])
                return len(to_remove)
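
Several of the examples on this page iterate with a chunks(iterable, size) helper whose implementation is not shown. A minimal, self-contained equivalent that yields fixed-size slices would be:

def chunks(sequence, chunk_size):
    """Yield successive chunk_size-sized slices of sequence (sketch of the helper used above)."""
    for index in range(0, len(sequence), chunk_size):
        yield sequence[index:index + chunk_size]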
Example No. 5
def __update_bulk_replicas(replicas, req_type, rule_id, session=None):
    """
    Used by the finisher to handle, in bulk, available and unavailable replicas that belong to the same rule.

    :param replicas:              List of replicas.
    :param req_type:              Request type: STAGEIN, STAGEOUT, TRANSFER.
    :param rule_id:               RULE id.
    :param session:               The database session to use.
    :returns commit_or_rollback:  Boolean.
    """
    try:
        replica_core.update_replicas_states(replicas,
                                            nowait=True,
                                            session=session)
    except ReplicaNotFound as error:
        logging.warn(
            'Failed to bulk update replicas, will do it one by one: %s' %
            str(error))
        raise ReplicaNotFound(error)

    for replica in replicas:
        if not replica['archived']:
            request_core.archive_request(replica['request_id'],
                                         session=session)
        logging.info("HANDLED REQUEST %s DID %s:%s AT RSE %s STATE %s" %
                     (replica['request_id'], replica['scope'], replica['name'],
                      replica['rse_id'], str(replica['state'])))
    return True
Example No. 6
def update_replicas_states(rse, files, issuer):
    """
    Update File replica information and state.

    :param rse: The RSE name.
    :param files: The list of files.
    :param issuer: The issuer account.
    """
    for v_file in files:
        v_file.update({"type": "FILE"})  # Make sure DIDs are identified as files for checking
    validate_schema(name='dids', obj=files)

    rse_id = get_rse_id(rse=rse)

    kwargs = {'rse': rse, 'rse_id': rse_id}
    if not permission.has_permission(
            issuer=issuer, action='update_replicas_states', kwargs=kwargs):
        raise exception.AccessDenied(
            'Account %s can not update file replicas state on %s' %
            (issuer, rse))
    replicas = []
    for file in files:
        rep = file
        rep['rse_id'] = rse_id
        rep['scope'] = InternalScope(rep['scope'])
        replicas.append(rep)
    replica.update_replicas_states(replicas=replicas)
Example No. 7
def add_sync_rule(rse, scope, block, account, session=None):
    rse_id = get_rse_id(rse=rse, session=session)
    scope = InternalScope(scope)
    account = InternalAccount(account)

    files = []
    for file in list_files(scope, block, long=False, session=session):
        try:
            update_replicas_states(
                replicas=[
                    {
                        "scope": scope,
                        "name": file["name"],
                        "rse_id": rse_id,
                        "state": "A",
                    }
                ],
                add_tombstone=False,
                session=session,
            )
        except ReplicaNotFound:
            add_replicas(
                rse_id=rse_id,
                files=[
                    {
                        "scope": scope,
                        "name": file["name"],
                        "bytes": file["bytes"],
                        "adler32": file["adler32"],
                    }
                ],
                account=account,
                ignore_availability=True,
                session=session,
            )

    rules = add_rule(
        dids=[{"scope": scope, "name": block}],
        copies=1,
        rse_expression=rse,
        weight=None,
        lifetime=None,
        grouping="DATASET",
        account=account,
        locked=False,
        subscription_id=None,
        source_replica_expression=None,
        activity="Data Consolidation",
        notify=None,
        purge_replicas=False,
        ignore_availability=True,
        comment=None,
        ask_approval=False,
        asynchronous=False,
        priority=3,
        split_container=False,
        meta=json.dumps({"phedex_group": "DataOps", "phedex_custodial": True}),
        session=session,
    )
    return rules[0]
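
A hypothetical call to add_sync_rule could look like the following; the RSE, scope, block and account values are placeholders, and get_session is assumed to come from the usual Rucio session module (it is used the same way in the Minos examples further down).

from rucio.db.sqla.session import get_session

# All literal values below are placeholders for illustration only.
session = get_session()
rule_id = add_sync_rule(rse='T2_EXAMPLE_SITE',
                        scope='cms',
                        block='/Primary/Processed-v1/AOD#12345678-aaaa-bbbb-cccc-1234567890ab',
                        account='sync_cms',
                        session=session)
session.commit()  # whether the caller commits depends on how the transaction is managed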
Example No. 8
def update_replicas_states(rse, files, issuer, vo='def'):
    """
    Update File replica information and state.

    :param rse: The RSE name.
    :param files: The list of files.
    :param issuer: The issuer account.
    :param vo: The VO to act on.
    """
    validate_schema(name='dids', obj=files)

    rse_id = get_rse_id(rse=rse, vo=vo)

    kwargs = {'rse': rse, 'rse_id': rse_id}
    if not permission.has_permission(
            issuer=issuer, vo=vo, action='update_replicas_states',
            kwargs=kwargs):
        raise exception.AccessDenied(
            'Account %s can not update file replicas state on %s' %
            (issuer, rse))
    replicas = []
    for file in files:
        rep = file
        rep['rse_id'] = rse_id
        rep['scope'] = InternalScope(rep['scope'], vo=vo)
        replicas.append(rep)
    replica.update_replicas_states(replicas=replicas)
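
The files payload that is validated against the 'dids' schema above is a list of plain dictionaries carrying at least scope, name and the new state. A hypothetical invocation, with placeholder RSE, DID and account values, might be:

# Placeholder values for illustration only; 'A' (AVAILABLE) and 'U' (UNAVAILABLE)
# are the short state codes used in the other examples on this page.
files = [{'scope': 'cms',
          'name': '/store/data/Run2018A/Example/AOD/file_0001.root',
          'state': 'A'}]
update_replicas_states(rse='T2_EXAMPLE_SITE', files=files, issuer='root', vo='def')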
Example No. 9
    def add_missing_replicas(self, missing):
        """
        :param missing: possible missing lfns
        :return:
        """

        with monitor.record_timer_block('cms_sync.time_add_replica'):
            if missing and self.dry_run:
                logging.info('Dry run: Adding replicas %s to rse %s.', str(missing), self.rse)
            elif missing:
                logging.info('Adding %s replicas to rse %s.', len(missing), self.rse)
                replicas_to_add = [self.replicas[lfn] for lfn in missing]
                files = replica_file_list(replicas=replicas_to_add, scope=self.scope)
                for rucio_file in files:
                    try:
                        update_file = copy.deepcopy(rucio_file)
                        update_file.update({'scope': InternalScope(self.scope), "rse_id": self.rse_id, "state": "A"})
                        update_replicas_states(replicas=[update_file], add_tombstone=False)
                    except ReplicaNotFound:
                        resurrect_file = copy.deepcopy(rucio_file)
                        resurrect_file.update({'scope': 'cms', 'type': 'FILE'})
                        try:
                            add_replicas(rse=self.rse, files=[resurrect_file], issuer=self.account,
                                         ignore_availability=True)
                        except RucioException:
                            logging.critical('Could not add %s to %s. Constraint violated?', resurrect_file, self.rse)
                            resurrect_file.update({'scope': 'cms', 'type': 'FILE'})  # Reset to Internal scope by call
                            resurrect([resurrect_file], issuer=self.account)
                            resurrect_file.update({'scope': 'cms', 'type': 'FILE'})  # Reset to Internal scope by call
                            add_replicas(rse=self.rse, files=[resurrect_file], issuer=self.account,
                                         ignore_availability=True)
                            logging.critical('Resurrected %s at %s', resurrect_file, self.rse)

                # add_replicas(rse=self.rse, files=files, issuer=self.account)
                lfns = [item['name'] for item in list_files(scope=self.scope, name=self.block_name, long=False)]

                missing_lfns = list(set(missing) - set(lfns))

                if missing_lfns:
                    logging.debug('Attaching %s lfns to %s at %s', len(missing_lfns), self.block_name, self.rse)
                    dids = [{'scope': self.scope, 'name': lfn} for lfn in missing_lfns]
                    try:
                        attach_dids(scope=self.scope, name=self.block_name, attachment={'dids': dids},
                                    issuer=self.account)
                    except FileAlreadyExists:
                        logging.warning('Trying to attach already existing files to %s', self.block_name)
                    except DataIdentifierNotFound:
                        logging.critical('Could not attach to %s at %s. Constraint violated?', self.block_name, self.rse)
                return len(missing_lfns)
Example No. 10
def update_replicas_states(rse, files, issuer):

    """
    Update File replica information and state.

    :param rse: The RSE name.
    :param files: The list of files.
    :param issuer: The issuer account.
    """
    validate_schema(name='dids', obj=files)

    kwargs = {'rse': rse}
    if not permission.has_permission(issuer=issuer, action='update_replicas_states', kwargs=kwargs):
        raise exception.AccessDenied('Account %s can not update file replicas state on %s' % (issuer, rse))
    replicas = []
    for file in files:
        rep = file
        rep['rse'] = rse
        replicas.append(rep)
    replica.update_replicas_states(replicas=replicas)
Example No. 11
def delete_sync_rule(rule_id, session=None):
    rule = get_rule(rule_id, session=session)
    if rule["did_type"] != DIDType.DATASET:
        raise RuntimeError("Rule applies to did with wrong type")
    block = rule["name"]
    try:
        rse_id = get_rse_id(rse=rule["rse_expression"], session=session)
    except RSENotFound:
        raise RuntimeError("Rule does not apply to a specific RSE")
    scope = rule["scope"]
    account = rule["account"]

    files = []
    for file in list_files(scope, block, long=False, session=session):
        files.append(
            {"scope": scope, "name": file["name"], "rse_id": rse_id, "state": "U"}
        )

    update_replicas_states(
        replicas=files, add_tombstone=False, session=session
    )
    delete_rule(rule_id=rule_id, purge_replicas=True, soft=False, session=session)
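
As with add_sync_rule, a hypothetical caller of delete_sync_rule might look like this; the rule id is a placeholder and get_session is again assumed to come from the usual Rucio session module.

from rucio.db.sqla.session import get_session

# Placeholder rule id for illustration only.
session = get_session()
delete_sync_rule('5b3c8e2f9a1d4c6e8f0a1b2c3d4e5f60', session=session)
session.commit()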
Example No. 12
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extracts the scope, name and rse_id, and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    # Sleep to prevent running on the same partition if all the daemons restart at the same time
    time.sleep(10)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    states_mapping = {
        BadPFNStatus.BAD: BadFilesStatus.BAD,
        BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
        BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE,
    }
    logging.info(prepend_str + 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        pfns = []
        try:
            bad_replicas = {}
            temporary_unvailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Class the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unvailables:
                        temporary_unvailables[(account, reason, expires_at)] = []
                    temporary_unvailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                            schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                            unknown_replicas.extend(
                                tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(
                            prepend_str +
                            'The following replicas are unknown and will be removed : %s'
                            % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas,
                                             session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str +
                                      'Running on RSE %s%s with %s replicas' %
                                      (get_rse_name(rse_id=rse_id), vo_str,
                                       len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(
                            math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(
                                pfns=chunk,
                                reason=reason,
                                issuer=account,
                                status=state,
                                session=session)
                            if unknown_replicas:
                                logging.debug(prepend_str +
                                              'Unknown replicas : %s' %
                                              (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unvailables:
                vo = account.vo
                pfns = temporary_unvailables[(account, reason, expires_at)]
                logging.info(
                    prepend_str +
                    'Declaring %s replicas temporary available with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                        schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(
                        prepend_str +
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(
                        rse, vo)
                    logging.debug(prepend_str +
                                  'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id],
                                                 rse_id=None,
                                                 vo=vo,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    logging.debug(prepend_str +
                                  'Running on %s replicas on RSE %s' %
                                  (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(
                        math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str +
                                          'Running on %s chunk out of %s' %
                                          (nchunk, tot_chunk))
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=None,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation,
                                ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.error(
                                prepend_str +
                                'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s'
                                % str(error))
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' %
                                              (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(
                                        rep['scope'], rep['name'])
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.TEMPORARY_UNAVAILABLE,
                                            []))
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(
                                        rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(
                                            prepend_str +
                                            '%s is in unavailable state. Will be removed from the list of bad PFNs'
                                            % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    elif expires_at < datetime.now():
                                        logging.info(
                                            '%s PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns',
                                            prepend_str, str(rep['pfn']),
                                            expires_at)
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                except (DataIdentifierNotFound,
                                        ReplicaNotFound) as error:
                                    logging.error(
                                        prepend_str +
                                        'Will remove %s from the list of bad PFNs'
                                        % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                         session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logging.info(
                prepend_str +
                'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle'
            )
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
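
The minos() daemon above loops until the module-level graceful_stop event is set. A rough, illustrative sketch of how such a worker could be driven from a launcher thread (the wiring below is not the actual Rucio daemon entry point) is:

import threading
import time

# Illustrative launcher only: graceful_stop is the module-level threading.Event
# checked inside minos(); setting it mirrors the loop condition above.
thread = threading.Thread(target=minos, kwargs={'bulk': 1000, 'once': False, 'sleep_time': 60})
thread.start()
try:
    while thread.is_alive():
        time.sleep(1)
except KeyboardInterrupt:
    graceful_stop.set()
    thread.join()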
Example No. 13
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    # Sleep to prevent running on the same partition if all the daemons restart at the same time
    time.sleep(10)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'],
                worker_number=heart_beat['assign_thread'],
                limit=1000)
            logging.info(prepend_str + '%s expired replicas returned' %
                         len(expired_replicas))
            logging.debug(prepend_str +
                          'List of expired replicas returned %s' %
                          str(expired_replicas))
            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                })
                bad_replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': BadFilesStatus.TEMPORARY_UNAVAILABLE
                })
            session = get_session()

            nchunk = 0
            tot_chunk = int(math.ceil(len(replicas) / float(chunk_size)))
            for chunk in chunks(expired_replicas, chunk_size):
                # Process and update the replicas in chunks
                replicas = [{
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                } for replica in chunk]
                # Remove the replicas from bad_replicas table in chunks
                bad_replicas = [{
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': BadFilesStatus.TEMPORARY_UNAVAILABLE
                } for replica in chunk]
                try:
                    nchunk += 1
                    logging.debug(prepend_str +
                                  'Running on %s chunk out of %s' %
                                  (nchunk, tot_chunk))
                    update_replicas_states(replicas,
                                           nowait=True,
                                           session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning(
                        prepend_str +
                        'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s'
                        % str(error))
                    for idx in range(len(chunk)):
                        logging.debug(prepend_str + 'Working on %s' %
                                      (str(replicas[idx])))
                        try:
                            get_metadata(replicas[idx]['scope'],
                                         replicas[idx]['name'])
                            update_replicas_states([replicas[idx]], nowait=True, session=session)
                            bulk_delete_bad_replicas([bad_replicas[idx]], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                prepend_str +
                                'DID %s:%s does not exist anymore. ' %
                                (bad_replicas[idx]['scope'],
                                 bad_replicas[idx]['name']))
                            bulk_delete_bad_replicas([bad_replicas[idx]], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                prepend_str +
                                '%s:%s on RSEID %s does not exist anymore. ' %
                                (replicas[idx]['scope'], replicas[idx]['name'],
                                 replicas[idx]['rse_id']))
                            bulk_delete_bad_replicas([bad_replicas[idx]], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())
                    session = get_session()

        except Exception as error:
            logging.critical(traceback.format_exc())

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
Example No. 14
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    if new_state == RequestState.FAILED:
        request.set_request_state(req['request_id'], new_state, session=session)

        activity = 'default'
        if req['attributes']:
            if type(req['attributes']) is dict:
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            activity = req_attributes['activity'] if req_attributes['activity'] else 'default'

        tss = time.time()
        add_message('transfer-failed', {'activity': activity,
                                        'request-id': req['request_id'],
                                        'checksum-adler': None,
                                        'checksum-md5': None,
                                        'dst-rse': dest_rse,
                                        'dst-url': None,
                                        'name': req['name'],
                                        'guid': None,
                                        'file-size': None,
                                        'previous-request-id': req['request_id'],
                                        'protocol': None,
                                        'reason': detail,
                                        'transfer-link': None,
                                        'scope': req['scope'],
                                        'src-rse': None,
                                        'src-url': None,
                                        'tool-id': 'rucio-conveyor',
                                        'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                        'transfer-id': None},
                    session=session)

        request.archive_request(req['request_id'], session=session)
        logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'],
                                                                req['name'],
                                                                req['request_id'],
                                                                detail))
        try:
            replica.update_replicas_states([{'rse': dest_rse,
                                             'scope': req['scope'],
                                             'name': req['name'],
                                             'state': ReplicaState.UNAVAILABLE}],
                                           session=session)
        except:
            logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'],
                                                                                                      req['name'],
                                                                                                      dest_rse,
                                                                                                      traceback.format_exc()))
            raise

        tss = time.time()
        try:
            lock.failed_transfer(req['scope'],
                                 req['name'],
                                 req['dest_rse_id'],
                                 session=session)
        except:
            logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'],
                                                                                         req['name'],
                                                                                         dest_rse,
                                                                                         traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000)
Example No. 15
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    # Sleep to prevent running on the same partition if all the daemons restart at the same time
    time.sleep(10)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 500  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'] - 1,
                worker_number=heart_beat['assign_thread'],
                limit=1000)

            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                })
                bad_replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': BadFilesStatus.TEMPORARY_UNAVAILABLE
                })
            session = get_session()

            for chunk in chunks(replicas, chunk_size):
                # Process and update the replicas in chunks
                update_replicas_states(chunk, nowait=False, session=session)
            for chunk in chunks(bad_replicas, chunk_size):
                # Remove the replicas from bad_replicas table in chunks
                bulk_delete_bad_replicas(chunk, session=session)

            session.commit()  # pylint: disable=no-member

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
Example No. 16
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    """
    logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals())
    while not graceful_stop.is_set():
        try:
            max_deleting_rate = 0

            for rse in rses:
                deleting_rate = 0
                rse_info = rsemgr.get_rse_info(rse['rse'])
                rse_protocol = rse_core.get_rse_protocols(rse['rse'])

                if not rse_protocol['availability_delete']:
                    logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse']))
                    continue

                # Temporary hack to force gfal for deletion
                for protocol in rse_info['protocols']:
                    if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                        protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse']))
                try:
                    needed_free_space, max_being_deleted_files = None, 10000
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space = 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse']))

                    s = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse=rse['rse'], bytes=needed_free_space, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas)))

                    if not replicas:
                        logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse']))
                        continue

                    p = rsemgr.create_protocol(rse_info, 'delete', scheme=None)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files)))
                        try:
                            s = time.time()
                            update_replicas_states(replicas=[dict(replica.items() + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files])

                            for replica in files:
                                try:
                                    replica['pfn'] = str(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'], 'name': replica['name']}, ], operation='delete').values()[0])
                                except ReplicaUnAvailable as e:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e))
                                    logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg))
                                    replica['pfn'] = None

                                add_message('deletion-planned', {'scope': replica['scope'],
                                                                 'name': replica['name'],
                                                                 'file-size': replica['bytes'],
                                                                 'url': replica['pfn'],
                                                                 'rse': rse_info['rse']})

                            # logging.debug('update_replicas_states %s' % (time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.being_deleted',  delta=len(files))

                            if not scheme:
                                try:
                                    deleted_files = []
                                    p.connect()
                                    for replica in files:
                                        try:
                                            logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            s = time.time()
                                            if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                                logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number,
                                                                                                                                                                                    child_number,
                                                                                                                                                                                    replica['scope'],
                                                                                                                                                                                    replica['name'],
                                                                                                                                                                                    replica['pfn'],
                                                                                                                                                                                    rse['rse']))
                                            else:
                                                if replica['pfn']:
                                                    p.delete(replica['pfn'])
                                                else:
                                                    logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time()-s)*1000)
                                            duration = time.time() - s

                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                            add_message('deletion-done', {'scope': replica['scope'],
                                                                          'name': replica['name'],
                                                                          'rse': rse_info['rse'],
                                                                          'file-size': replica['bytes'],
                                                                          'url': replica['pfn'],
                                                                          'duration': duration})
                                            logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                        except SourceNotFound:
                                            err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                            logging.warning(err_msg)
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': err_msg})
                                        except (ServiceUnavailable, RSEAccessDenied) as e:
                                            logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except Exception as e:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                except (ServiceUnavailable, RSEAccessDenied) as e:
                                    for replica in files:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                        add_message('deletion-failed', {'scope': replica['scope'],
                                                                        'name': replica['name'],
                                                                        'rse': rse_info['rse'],
                                                                        'file-size': replica['bytes'],
                                                                        'url': replica['pfn'],
                                                                        'reason': str(e)})
                                finally:
                                    p.close()
                            s = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse=rse['rse'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.done',  delta=len(deleted_files))
                            deleting_rate += len(deleted_files)
                        except:
                            logging.critical(traceback.format_exc())
                    deleting_rate = deleting_rate * 1.0 / max_being_deleted_files
                    if deleting_rate > max_deleting_rate:
                        max_deleting_rate = deleting_rate
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate))
            sleep_time = int((1 - max_deleting_rate) * 60 + 1)
            time.sleep(sleep_time)

        except:
            logging.critical(traceback.format_exc())

    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
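
# --- Illustrative sketch (not part of the example above) ---------------------
# The loop above sleeps longer the less work it found: sleep_time is derived
# from the best per-RSE deletion rate seen during the cycle. A minimal,
# self-contained illustration of that formula, assuming the same 60-second
# baseline used in the example (names here are made up for the sketch):
def adaptive_sleep_seconds(max_deleting_rate):
    """Mirror of `int((1 - max_deleting_rate) * 60 + 1)` from the loop above."""
    return int((1 - max_deleting_rate) * 60 + 1)

for rate in (0.0, 0.5, 1.0):
    # 0.0 -> 61 s (idle worker), 0.5 -> 31 s, 1.0 -> 1 s (fully busy worker)
    print(rate, adaptive_sleep_seconds(rate))
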
Ejemplo n.º 18
0
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extracts the scope, name and rse_id, and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    states_mapping = {BadPFNStatus.BAD: BadFilesStatus.BAD,
                      BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
                      BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE}
    logging.info(prepend_str + 'Minos started')

    chunk_size = 500  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                             heart_beat['nr_threads'])
        try:
            bad_replicas = {}
            temporary_unavailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Classify the PFNs into bad_replicas and temporary_unavailables
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unavailables:
                        temporary_unavailables[(account, reason, expires_at)] = []
                    temporary_unavailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                pfns = bad_replicas[(account, reason, state)]
                logging.debug(
                    prepend_str +
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                try:
                    for chunk in chunks(pfns, chunk_size):
                        unknown_replicas = declare_bad_file_replicas(
                            pfns=chunk,
                            reason=reason,
                            issuer=account,
                            status=state,
                            session=session)
                        bulk_delete_bad_pfns(pfns=chunk, session=session)
                        session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unavailables:
                pfns = temporary_unavailables[(account, reason, expires_at)]
                logging.debug(prepend_str + 'Declaring %s replicas temporarily unavailable with timeout %s and reason %s' % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                _, dict_rse, unknown_replicas = get_pfn_to_rse(pfns)
                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(
                        prepend_str +
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse in dict_rse:
                    replicas = []
                    rse_id = get_rse_id(rse=rse, session=None)
                    logging.debug(prepend_str + 'Running on RSE %s' % rse)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse],
                                                 rse=rse,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    session = get_session()
                    try:
                        for chunk in chunks(replicas, chunk_size):
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=None,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                    except Exception:
                        session.rollback()  # pylint: disable=no-member
                        logging.critical(traceback.format_exc())

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
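
# --- Illustrative sketch (not part of the example above) ---------------------
# minos() groups the bad PFNs it fetched before acting on them: BAD/SUSPICIOUS
# entries are keyed by (account, reason, state) and declared via
# declare_bad_file_replicas(), while TEMPORARY_UNAVAILABLE entries are keyed by
# (account, reason, expires_at) and turned into replica state updates.
# The grouping logic reduced to plain Python with made-up sample data
# (plain strings stand in for the BadPFNStatus/BadFilesStatus enums):
from collections import defaultdict
from datetime import datetime, timedelta

sample_pfns = [
    {'pfn': 'root://example.org//f1', 'account': 'ops', 'reason': 'lost', 'state': 'BAD', 'expires_at': None},
    {'pfn': 'root://example.org//f2', 'account': 'ops', 'reason': 'lost', 'state': 'BAD', 'expires_at': None},
    {'pfn': 'root://example.org//f3', 'account': 'ops', 'reason': 'site cooling down',
     'state': 'TEMPORARY_UNAVAILABLE', 'expires_at': datetime.utcnow() + timedelta(hours=1)},
]

bad = defaultdict(list)
temporary_unavailable = defaultdict(list)
for entry in sample_pfns:
    if entry['state'] in ('BAD', 'SUSPICIOUS'):
        bad[(entry['account'], entry['reason'], entry['state'])].append(entry['pfn'])
    elif entry['state'] == 'TEMPORARY_UNAVAILABLE':
        temporary_unavailable[(entry['account'], entry['reason'], entry['expires_at'])].append(entry['pfn'])

print(dict(bad))                    # two PFNs grouped under one (account, reason, state) key
print(dict(temporary_unavailable))  # one PFN keyed by its expiration timestamp
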
Ejemplo n.º 19
0
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos_temporary_expiration[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        try:
            # Get list of expired TU replicas
            logger(logging.INFO, 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logger(logging.INFO, '%s expired replicas returned', len(expired_replicas))
            logger(logging.DEBUG, 'List of expired replicas returned %s', str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logger(logging.DEBUG, 'Running on %s chunk out of %s', nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.WARNING, 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logger(logging.DEBUG, 'Working on %s:%s on %s', scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'DID %s:%s does not exist anymore.', scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'Replica %s:%s on RSEID %s does not exist anymore.', scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except (DatabaseException, DatabaseError) as error:
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                    else:
                        logger(logging.ERROR, 'Exception', exc_info=True)
                    session.rollback()
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logger(logging.CRITICAL, str(traceback.format_exc()))

        if once:
            break
        daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
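
# --- Illustrative sketch (not part of the example above) ---------------------
# Both Minos daemons commit their work in fixed-size chunks so that a failure
# only rolls back the chunk currently being processed. The helper below mirrors
# the behaviour of the `chunks` utility used above (redefined here only so the
# snippet is self-contained; it is not the Rucio implementation):
import math

def chunks_sketch(items, size):
    """Yield successive slices of at most `size` elements."""
    for i in range(0, len(items), size):
        yield items[i:i + size]

expired = [('mock_scope', 'file_%03d' % i, 'mock_rse_id') for i in range(25)]
tot_chunk = int(math.ceil(len(expired) / float(10)))
for nchunk, chunk in enumerate(chunks_sketch(expired, 10), start=1):
    # In the daemon, each chunk gets its own update + commit (or rollback).
    print('chunk %d/%d: %d replicas' % (nchunk, tot_chunk, len(chunk)))
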
Ejemplo n.º 20
0
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: The size of the chunks used for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds: The delay to query replicas in BEING_DELETED state.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            max_deleting_rate = 0
            for rse in sort_rses(rses):
                try:
                    if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number,
                                 rse['rse'], nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse_id=rse['id'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])

                    if not rse_protocol['availability_delete']:
                        logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                            protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse'])
                        else:
                            if total_children and total_children > 0:
                                needed_free_space_per_child = needed_free_space / float(total_children)

                    start = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse_id=rse['id'],
                                                          bytes=needed_free_space_per_child,
                                                          limit=max_being_deleted_files,
                                                          worker_number=child_number,
                                                          total_workers=total_children,
                                                          delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas))

                    if not replicas:
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        logging.info('Reaper %s-%s: No replicas to delete %s. The next check will occur at %s',
                                     worker_number, child_number, rse['rse'],
                                     nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files))
                        try:
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True)
                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                               lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                                                                               operation='delete', scheme=scheme).values())[0])
                                except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        deletion_dict = {'scope': replica['scope'].external,
                                                         'name': replica['name'],
                                                         'rse': rse_info['rse'],
                                                         'rse_id': rse_info['id'],
                                                         'file-size': replica['bytes'],
                                                         'bytes': replica['bytes'],
                                                         'url': replica['pfn'],
                                                         'protocol': prot.attributes['scheme']}
                                        if replica['scope'].vo != 'def':
                                            deletion_dict['vo'] = replica['scope'].vo
                                        logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                            logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                                                            worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                                                    pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000)
                                        duration = time.time() - start

                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                        deletion_dict['duration'] = duration
                                        add_message('deletion-done', deletion_dict)
                                        logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration)
                                    except SourceNotFound:
                                        err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        if replica['state'] == ReplicaState.AVAILABLE:
                                            deletion_dict['reason'] = str(err_msg)
                                            add_message('deletion-failed', deletion_dict)
                                    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except Exception as error:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                for replica in files:
                                    logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                    payload = {'scope': replica['scope'].external,
                                               'name': replica['name'],
                                               'rse': rse_info['rse'],
                                               'rse_id': rse_info['id'],
                                               'file-size': replica['bytes'],
                                               'bytes': replica['bytes'],
                                               'url': replica['pfn'],
                                               'reason': str(error),
                                               'protocol': prot.attributes['scheme']}
                                    if replica['scope'].vo != 'def':
                                        payload['vo'] = replica['scope'].vo
                                    add_message('deletion-failed', payload)
                                    break
                            finally:
                                prot.close()
                            start = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start)
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))

                        except DatabaseException as error:
                            logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())

                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error))

                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            time.sleep(1)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
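
# --- Illustrative sketch (not part of the example above) ---------------------
# The reaper keeps a per-RSE back-off map (`nothing_to_do`): when an RSE has no
# deletable replicas, or is not available for deletion, it is skipped until a
# timestamp roughly 30 minutes in the future. The same idea in isolation
# (local names, not Rucio calls):
import datetime

nothing_to_do_sketch = {}

def back_off(rse_id, minutes=30):
    nothing_to_do_sketch[rse_id] = datetime.datetime.now() + datetime.timedelta(minutes=minutes)

def should_skip(rse_id):
    return rse_id in nothing_to_do_sketch and nothing_to_do_sketch[rse_id] > datetime.datetime.now()

back_off('rse-with-nothing-to-delete')
print(should_skip('rse-with-nothing-to-delete'))  # True for the next ~30 minutes
print(should_skip('busy-rse'))                    # False: never backed off
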
Ejemplo n.º 21
0
            req = request.get_request(response['request_id'], session=session)
            if req['request_type'] == RequestType.STAGEIN:
                rse_update_name = rse_core.list_rse_attributes(response['dst_rse'], session=session)['staging_buffer']
                logging.debug('OVERRIDE REPLICA DID %s:%s RSE %s TO %s' % (response['scope'], response['name'], response['dst_rse'], rse_update_name))

            try:
                tss = time.time()
                logging.debug('UPDATE REPLICA STATE DID %s:%s RSE %s' % (response['scope'], response['name'], rse_update_name))

                # make sure we do not leave the transaction
                try:
                    # try quickly
                    replica.update_replicas_states([{'rse': rse_update_name,
                                                     'scope': response['scope'],
                                                     'name': response['name'],
                                                     'state': ReplicaState.AVAILABLE}],
                                                   nowait=False,
                                                   session=session)
                except UnsupportedOperation:
                    # replica not found, but it has been transferred because cancel came too late.
                    # so we need to register the replica and schedule it for deletion again
                    record_timer('daemons.conveyor.common.update_request_state.replica-update_replicas_states', (time.time()-tss)*1000)
                    logging.warn('DID %s:%s AT RSE %s NOT FOUND - Registering replica and scheduling for immediate deletion' % (response['scope'],
                                                                                                                                response['name'],
                                                                                                                                rse_name))
                    did_meta = None
                    try:
                        did_meta = did.get_metadata(response['scope'],
                                                    response['name'],
                                                    session=session)
                    except:
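
# --- Illustrative sketch (not part of the fragment above) --------------------
# The fragment demonstrates a "try the quick catalogue update first, fall back
# if the replica is gone" pattern: update_replicas_states() is attempted with
# nowait=False, and UnsupportedOperation triggers re-registering the replica so
# it can be scheduled for deletion. The control flow in isolation, with local
# stand-ins instead of Rucio calls:
class _ReplicaMissing(Exception):
    pass

def quick_update(catalogue, name):
    if name not in catalogue:
        raise _ReplicaMissing(name)
    catalogue[name] = 'AVAILABLE'

def register_for_deletion(catalogue, name):
    catalogue[name] = 'BEING_DELETED'   # re-registered, scheduled for deletion

catalogue = {'mock:file_a': 'COPYING'}
for name in ('mock:file_a', 'mock:file_b'):
    try:
        quick_update(catalogue, name)
    except _ReplicaMissing:
        register_for_deletion(catalogue, name)
print(catalogue)   # file_a -> AVAILABLE, file_b -> BEING_DELETED
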
Ejemplo n.º 22
0
def declare_bad_file_replicas(dids,
                              rse_id,
                              reason,
                              issuer,
                              status=BadFilesStatus.BAD,
                              scheme=None,
                              session=None):
    """
    Declare a list of bad replicas.

    :param dids: The list of DIDs.
    :param rse_id: The RSE id.
    :param reason: The reason of the loss.
    :param issuer: The issuer account.
    :param status: Either BAD or SUSPICIOUS.
    :param scheme: The scheme of the PFNs.
    :param session: The database session in use.
    """
    unknown_replicas = []
    replicas = []
    for did in dids:
        scope = InternalScope(did['scope'], vo=issuer.vo)
        name = did['name']
        path = None
        scope, name, path, exists, already_declared, size = __exist_replicas(
            rse_id, [(scope, name, path)], session=session)[0]
        if exists and (
            (str(status) == str(BadFilesStatus.BAD) and not already_declared)
                or str(status) == str(BadFilesStatus.SUSPICIOUS)):
            replicas.append({
                'scope': scope,
                'name': name,
                'rse_id': rse_id,
                'state': ReplicaState.BAD
            })
            new_bad_replica = models.BadReplicas(scope=scope,
                                                 name=name,
                                                 rse_id=rse_id,
                                                 reason=reason,
                                                 state=status,
                                                 account=issuer,
                                                 bytes=size)
            new_bad_replica.save(session=session, flush=False)
            session.query(models.Source).filter_by(
                scope=scope, name=name,
                rse_id=rse_id).delete(synchronize_session=False)
        else:
            if already_declared:
                unknown_replicas.append(
                    '%s:%s %s' %
                    (did['scope'], did['name'], 'Already declared'))
            else:
                unknown_replicas.append(
                    '%s:%s %s' %
                    (did['scope'], did['name'], 'Unknown replica'))
    if str(status) == str(BadFilesStatus.BAD):
        # For BAD file, we modify the replica state, not for suspicious
        try:
            # there shouldn't be any exceptions since all replicas exist
            update_replicas_states(replicas, session=session)
        except exception.UnsupportedOperation:
            raise exception.ReplicaNotFound(
                "One or several replicas don't exist.")
    try:
        session.flush()
    except IntegrityError as error:
        raise exception.RucioException(error.args)
    except DatabaseError as error:
        raise exception.RucioException(error.args)
    except FlushError as error:
        raise exception.RucioException(error.args)

    return unknown_replicas
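
# --- Illustrative sketch (not part of the example above) ---------------------
# The per-DID decision applied above, reduced to a pure function with string
# stand-ins for BadFilesStatus: a replica is declared when it exists and is
# either reported SUSPICIOUS, or reported BAD and not already declared;
# everything else is returned in `unknown_replicas` with a reason.
def classify_sketch(exists, already_declared, status):
    if exists and (status == 'SUSPICIOUS' or (status == 'BAD' and not already_declared)):
        return 'declare'
    if already_declared:
        return 'Already declared'
    return 'Unknown replica'

for case in [(True, False, 'BAD'), (True, True, 'BAD'), (True, True, 'SUSPICIOUS'), (False, False, 'BAD')]:
    print(case, '->', classify_sketch(*case))
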
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info('%s %s expired replicas returned', prepend_str, len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s', prepend_str, str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s', prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning('%s One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str, scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s DID %s:%s does not exist anymore.', prepend_str, scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s Replica %s:%s on RSEID %s does not exist anymore.', prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info('%s Will sleep for %s seconds', prepend_str, sleep_time - tottime)
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
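
# --- Illustrative sketch (not part of the examples above) --------------------
# All of these daemons loop on a module-level `graceful_stop` threading.Event
# and log "Graceful stop requested/done" once it is set. A generic,
# self-contained version of that wiring (local names, not the Rucio module):
import threading
import time

stop_event = threading.Event()

def worker_sketch(sleep_time=0.5):
    while not stop_event.is_set():
        # ... one cycle of work would go here ...
        time.sleep(sleep_time)
    print('Graceful stop done')

thread = threading.Thread(target=worker_sketch)
thread.start()
time.sleep(1)
stop_event.set()   # what a SIGTERM handler would typically do
thread.join()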