def load_info(self, session=None):
    """
    Return the RSE info of this object, fetching and caching it on first use.

    When ``self.info`` is already set it is returned untouched. Otherwise the RSE
    name (``self.load_name``) and the VO of ``self.id`` are resolved and handed to
    ``rsemgr.get_rse_info``; the result is stored in ``self.info``.

    :param session: DB session forwarded to every lookup.
    :returns: The cached or freshly loaded value of ``self.info``.
    """
    # Fast path: already cached, nothing to look up.
    if self.info is not None:
        return self.info

    rse_name = self.load_name(session=session)
    rse_vo = rse_core.get_rse_vo(rse_id=self.id, session=session)
    self.info = rsemgr.get_rse_info(rse=rse_name, vo=rse_vo, session=session)
    return self.info
def list_rebalance_rule_candidates(rse_id, mode=None, session=None):
    """
    List the rebalance rule candidates based on the agreed on specification

    In 'decommission' mode the candidates are taken from the RSE dump. In any other
    mode they are selected with a SQL query: dataset rules in state OK with a single
    copy, no child rule, DATASET or ALL grouping, owned by one of the allowed
    accounts, created more than 60 days ago and not expiring within the next 60
    days, on closed datasets of known size whose lock is the only one for that
    dataset on the RSE and whose average file size exceeds 1000000000 bytes.

    :param rse_id: RSE of the source.
    :param mode: Rebalancing mode.
    :param session: DB Session.
    :returns: For the SQL path, a list of rows (scope, name, rule id, rse_expression,
              subscription_id, did bytes, did length, average file size) ordered by
              average file size then last access time.
    """
    # Reuse the caller's session for the VO lookup instead of opening another one.
    vo = get_rse_vo(rse_id=rse_id, session=session)

    # dumps can be applied only for decommission since the dumps doesn't contain info from dids
    if mode == 'decommission':
        return _list_rebalance_rule_candidates_dump(rse_id, mode)

    # the rest is done with sql query
    # Both bounds are derived from the same UTC instant; the original mixed
    # utcnow() and now(), which shifts one bound by the local UTC offset.
    utc_now = datetime.utcnow()
    from_date = utc_now + timedelta(days=60)
    to_date = utc_now - timedelta(days=60)

    allowed_accounts = [InternalAccount(a, vo=vo) for a in ('panda', 'root', 'ddmadmin')]
    allowed_grouping = [RuleGrouping.DATASET, RuleGrouping.ALL]

    # Number of dataset locks sharing scope/name/RSE with the current lock row.
    external_dsl = aliased(models.DatasetLock)
    count_locks = select([func.count()]).where(
        and_(external_dsl.scope == models.DatasetLock.scope,
             external_dsl.name == models.DatasetLock.name,
             external_dsl.rse_id == models.DatasetLock.rse_id)).as_scalar()

    # Average file size of the locked dataset (0 when the length is missing or < 1).
    # Built once and reused as selected column, filter and sort key.
    # NOTE(review): values are cast to Integer; sizes beyond the 32-bit range may
    # need BigInteger on some backends - confirm against the target database.
    avg_file_size = case(
        [(or_(models.DatasetLock.length < 1, models.DatasetLock.length.is_(None)), 0)],
        else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, Integer))

    query = session.query(models.DatasetLock.scope,
                          models.DatasetLock.name,
                          models.ReplicationRule.id,
                          models.ReplicationRule.rse_expression,
                          models.ReplicationRule.subscription_id,
                          models.DataIdentifier.bytes,
                          models.DataIdentifier.length,
                          avg_file_size).\
        join(models.ReplicationRule, models.ReplicationRule.id == models.DatasetLock.rule_id).\
        join(models.DataIdentifier, and_(models.DatasetLock.scope == models.DataIdentifier.scope,
                                         models.DatasetLock.name == models.DataIdentifier.name)).\
        filter(models.DatasetLock.rse_id == rse_id).\
        filter(or_(models.ReplicationRule.expires_at > from_date, models.ReplicationRule.expires_at.is_(None))).\
        filter(models.ReplicationRule.created_at < to_date).\
        filter(models.ReplicationRule.account.in_(allowed_accounts)).\
        filter(models.ReplicationRule.state == RuleState.OK).\
        filter(models.ReplicationRule.did_type == DIDType.DATASET).\
        filter(models.ReplicationRule.copies == 1).\
        filter(models.ReplicationRule.child_rule_id.is_(None)).\
        filter(models.ReplicationRule.grouping.in_(allowed_grouping)).\
        filter(models.DataIdentifier.bytes.isnot(None)).\
        filter(models.DataIdentifier.is_open == 0).\
        filter(models.DataIdentifier.did_type == DIDType.DATASET).\
        filter(avg_file_size > 1000000000).\
        filter(count_locks == 1)

    summary = query.order_by(avg_file_size, models.DatasetLock.accessed_at).all()
    return summary
def __dump_url(rse_id, logger=logging.log):
    """
    Build the candidate URLs under which a recent dump of the RSE may be found.

    Without a configured ``bb8/dump_production_day`` one URL is produced for each of
    the seven days ending today (most recent first). Otherwise a single URL is
    produced, for the latest occurrence of the configured weekday (today included).

    :param rse_id: RSE where the dump is released.
    :param logger: Callable invoked as ``logger(level, message)``.
    :returns: List of URLs, or False when the configured production day is not a
              recognised weekday name.
    """
    rse = get_rse_name(rse_id=rse_id)
    vo = get_rse_vo(rse_id=rse_id)

    today = date.today()
    date_format = '%d-%m-%Y'

    production_day = config_get('bb8', 'dump_production_day', raise_exception=False, default=None)
    if production_day is not None:
        # Same numbering as date.weekday(): Monday is 0, Sunday is 6.
        weekday_index = {
            'Monday': 0,
            'Tuesday': 1,
            'Wednesday': 2,
            'Thursday': 3,
            'Friday': 4,
            'Saturday': 5,
            'Sunday': 6,
        }
        if production_day not in weekday_index:
            logger(logging.WARNING,
                   'ERROR: please set the day of a dump creation in bb8 config correctly, e.g. Monday')
            return False
        # Days elapsed since the last production day (0 when it is today).
        days_back = (today.weekday() - weekday_index[production_day]) % 7
        dump_dates = [(today - timedelta(days_back)).strftime(date_format)]
    else:
        # No production day configured: try every day of the past week.
        dump_dates = [(today - timedelta(offset)).strftime(date_format) for offset in range(0, 7)]

    # URL layout of a dump, with ${date}, ${rse} and ${vo} placeholders.
    template = Template(config_get(
        'bb8',
        'dump_url_template',
        raise_exception=False,
        default='http://rucio-analytix.cern.ch:8080/LOCKS/GetFileFromHDFS?date=${date}&rse=${rse}'))

    return [template.substitute({'date': dump_date, 'rse': rse, 'vo': vo}) for dump_date in dump_dates]
def __dump_url(rse_id, logger=logging.log):
    """
    getting potential urls of the dump over last week

    Without a configured ``bb8/dump_production_day`` one URL is produced for each of
    the seven days ending today; otherwise only the latest occurrence of the
    configured weekday (today included) is used. The URL layout is read from
    ``bb8/dump_url_template`` and may use the ``${date}``, ``${rse}`` and ``${vo}``
    placeholders.

    :param rse_id: RSE where the dump is released.
    :param logger: Logger, invoked as ``logger(level, message)``.
    :returns: List of URLs, or False when the bb8 configuration is unusable
              (unknown production day or no URL template).
    """
    rse = get_rse_name(rse_id=rse_id)
    vo = get_rse_vo(rse_id=rse_id)

    # get the date of the most recent dump
    today = date.today()
    dump_dates = []
    dump_production_day = config_get("bb8", "dump_production_day", raise_exception=False, default=None)
    if dump_production_day is None:
        for idx in range(0, 7):
            dump_date = today - timedelta(idx)
            dump_dates.append(dump_date.strftime("%d-%m-%Y"))
    else:
        # Same numbering as date.weekday(): Monday is 0, Sunday is 6.
        weekdays = {
            "Sunday": 6,
            "Monday": 0,
            "Tuesday": 1,
            "Wednesday": 2,
            "Thursday": 3,
            "Friday": 4,
            "Saturday": 5,
        }
        if dump_production_day not in weekdays:
            logger(
                logging.WARNING,
                "ERROR: please set the day of a dump creation in bb8 config correctly, e.g. Monday",
            )
            return False
        today_idx = (today.weekday() - weekdays[dump_production_day]) % 7
        dump_date = today - timedelta(today_idx)
        dump_dates = [dump_date.strftime("%d-%m-%Y")]

    # getting structure (template) of url location of a dump
    url_template_str = config_get(
        "bb8",
        "dump_url_template",
        raise_exception=False,
    )
    # The option is read without a fallback, so it may be missing. Report it the
    # same way as a bad production day instead of failing later inside
    # Template.substitute with an unrelated TypeError.
    if not url_template_str:
        logger(
            logging.WARNING,
            "ERROR: please set dump_url_template in the bb8 config section",
        )
        return False
    url_template = Template(url_template_str)

    # populating url template
    return [url_template.substitute({"date": d, "rse": rse, "vo": vo}) for d in dump_dates]
def _get_path_nondeterministic_server(self, scope, name):  # pylint: disable=invalid-name
    """
    Provides the path of a replica for non-deterministic sites.
    Will be assigned to get path by the __init__ method if necessary.

    The replica is looked up on the RSE described by ``self.rse``, with the scope
    qualified by the VO of that RSE.

    :param scope: Scope of the replica (external representation).
    :param name: Name of the replica.
    :returns: The stored path without one leading and one trailing '/'.
    :raises ReplicaUnAvailable: No path is recorded and the replica state is None or 'UNAVAILABLE'.
    :raises ReplicaNotFound: No path is recorded for any other reason.
    """
    rse_vo = get_rse_vo(self.rse['id'])
    internal_scope = InternalScope(scope, vo=rse_vo)
    rep = replica.get_replica(scope=internal_scope, name=name, rse_id=self.rse['id'])

    has_path = 'path' in rep and rep['path'] is not None
    if not has_path:
        # Tell an unavailable replica apart from one simply lacking a path.
        if 'state' in rep and (rep['state'] is None or rep['state'] == 'UNAVAILABLE'):
            raise exception.ReplicaUnAvailable('Missing path information and state is UNAVAILABLE for replica %s:%s on non-deterministic storage named %s' % (internal_scope, name, self.rse['rse']))
        raise exception.ReplicaNotFound('Missing path information for replica %s:%s on non-deterministic storage named %s' % (internal_scope, name, self.rse['rse']))

    # Strip at most one slash from each end.
    path = rep['path']
    path = path[1:] if path.startswith('/') else path
    path = path[:-1] if path.endswith('/') else path
    return path
def test_account_counters_at_different_vos(self):
    """ MULTI VO (CLIENT): Test that account counters from 2nd vo don't interfere """
    session = db_session.get_session()
    usage_columns = (models.AccountUsage.account, models.AccountUsage.rse_id)

    # add some RSEs to test create_counters_for_new_account:
    # one through the client, one directly on the new VO
    rse_client = RSEClient()
    rse_str = ''.join(choice(ascii_uppercase) for _ in range(10))
    rse_client.add_rse('TST1_%s' % rse_str)
    add_rse('NEW1_%s' % rse_str, 'root', **self.new_vo)

    # add an account - should have counters created for RSEs on the same VO
    account_name = 'shr-%s' % str(generate_uuid()).lower()[:16]
    new_acc = InternalAccount(account_name, **self.new_vo)
    add_account(account_name, 'USER', '*****@*****.**', 'root', **self.new_vo)

    acc_counters = list(
        session.query(*usage_columns).distinct(*usage_columns).filter_by(account=new_acc).all())
    assert_not_equal(0, len(acc_counters))
    for _, counter_rse_id in acc_counters:
        # every counter of the new account must sit on an RSE of the new VO
        assert_equal(get_rse_vo(counter_rse_id), self.new_vo['vo'])

    # add an RSE - should have counters created for accounts on the same VO
    new_rse2_id = add_rse('NEW2_' + rse_str, 'root', **self.new_vo)

    rse_counters = list(
        session.query(*usage_columns).distinct(*usage_columns).filter_by(rse_id=new_rse2_id).all())
    assert_not_equal(0, len(rse_counters))
    for counter_account, _ in rse_counters:
        # every counter of the new RSE must belong to an account of the new VO
        assert_equal(counter_account.vo, self.new_vo['vo'])

    session.commit()
def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400, once=True, unlock=False, spread_period=0, purge_replicas=False):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them.

    For every rule returned by ``get_rules_beyond_eol`` the expected eol_at is
    recomputed (and written back if it differs), approved lifetime exceptions are
    applied, and rules whose resulting eol_at is before ``date_check`` are added to
    a per-RSE summary. Outside dry-run mode those rules also get a short lifetime so
    that they expire.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process (not used by this function).
    :param date_check: Reference date; a rule is treated as expired when its eol_at is before it.
    :param dry_run: If True, no lifetime is set on expired rules. Corrections of eol_at
                    are written back regardless of this flag.
    :param grace_period: The grace_period for the rules, in seconds.
    :param once: Run only once.
    :param unlock: If True, locked rules are unlocked when their lifetime is set.
    :param spread_period: Upper bound, in seconds, of the random amount added to the grace period.
    :param purge_replicas: If True, the purge_replicas option is set on the expired rules.
    """
    sleep_time = 60

    executable = 'atropos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    time.sleep(10)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])
    logging.debug(prepend_str + 'Starting worker')

    # summary[rse_id][did_key] -> {'length': ..., 'bytes': ...}; filled across all cycles.
    summary = {}
    # did_key -> latest expiration date among the approved exceptions of that DID.
    lifetime_exceptions = {}
    # Seeded per worker so the spread of lifetimes is reproducible for a given thread.
    rand = random.Random(hb['assign_thread'])
    for excep in rucio.core.lifetime_exception.list_exceptions(exception_id=None, states=[LifetimeExceptionsState.APPROVED, ], session=None):
        key = '{}:{}'.format(excep['scope'].internal, excep['name'])
        if key not in lifetime_exceptions:
            lifetime_exceptions[key] = excep['expires_at']
        elif lifetime_exceptions[key] < excep['expires_at']:
            lifetime_exceptions[key] = excep['expires_at']
    logging.debug(prepend_str + '%s active exceptions' % len(lifetime_exceptions))

    if not dry_run and date_check > now:
        logging.error(prepend_str + 'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not GRACEFUL_STOP.is_set():

            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads'])

            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'], session=None)
                logging.info(prepend_str + '%s rules to process' % (len(rules)))
                for rule_idx, rule in enumerate(rules, start=1):
                    did = '%s:%s' % (rule.scope, rule.name)
                    did_key = '{}:{}'.format(rule.scope.internal, rule.name)
                    logging.debug(prepend_str + 'Working on rule %s on DID %s on %s' % (rule.id, did, rule.rse_expression))

                    if (rule_idx % 1000) == 0:
                        logging.info(prepend_str + '%s/%s rules processed' % (rule_idx, len(rules)))

                    # We compute the expected eol_at
                    try:
                        rses = parse_expression(rule.rse_expression, filter={'vo': rule.account.vo})
                    except InvalidRSEExpression:
                        logging.warning(prepend_str + 'Rule %s has an RSE expression that results in an empty set: %s' % (rule.id, rule.rse_expression))
                        continue
                    eol_at = rucio.core.lifetime_exception.define_eol(rule.scope, rule.name, rses)
                    if eol_at != rule.eol_at:
                        logging.warning(prepend_str + 'The computed eol %s differs from the one recorded %s for rule %s on %s at %s' % (eol_at, rule.eol_at, rule.id, did, rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                            continue

                    # Check the exceptions
                    if did_key in lifetime_exceptions:
                        # eol_at may be unset (see the truthiness test further down); an unset
                        # eol can never outlast the exception, and comparing None with a date
                        # would raise TypeError and abort the whole cycle.
                        if eol_at and eol_at > lifetime_exceptions[did_key]:
                            logging.info(prepend_str + 'Rule %s on DID %s on %s has longer expiration date than the one requested : %s' % (rule.id, did, rule.rse_expression, lifetime_exceptions[did_key]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logging.info(prepend_str + 'Updating rule %s on DID %s on %s according to the exception till %s' % (rule.id, did, rule.rse_expression, lifetime_exceptions[did_key]))
                            eol_at = lifetime_exceptions[did_key]
                            try:
                                update_rule(rule.id, options={'eol_at': lifetime_exceptions[did_key]})
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue

                    # Now check that the new eol_at is expired
                    if eol_at and eol_at < date_check:
                        no_locks = True
                        for lock in get_dataset_locks(rule.scope, rule.name):
                            # NOTE(review): rule[4] is presumably the rule id column of the
                            # row returned by get_rules_beyond_eol - confirm and prefer rule.id.
                            if lock['rule_id'] == rule[4]:
                                no_locks = False
                                if lock['rse_id'] not in summary:
                                    summary[lock['rse_id']] = {}
                                if did_key not in summary[lock['rse_id']]:
                                    summary[lock['rse_id']][did_key] = {'length': lock['length'] or 0, 'bytes': lock['bytes'] or 0}
                        if no_locks:
                            logging.warning(prepend_str + 'Cannot find a lock for rule %s on DID %s' % (rule.id, did))
                        if not dry_run:
                            # Spread the expirations over [grace_period, grace_period + spread_period].
                            lifetime = grace_period + rand.randrange(spread_period + 1)
                            logging.info(prepend_str + 'Setting %s seconds lifetime for rule %s' % (lifetime, rule.id))
                            options = {'lifetime': lifetime}
                            if purge_replicas:
                                options['purge_replicas'] = True
                            if rule.locked and unlock:
                                logging.info(prepend_str + 'Unlocking rule %s', rule.id)
                                options['locked'] = False
                            try:
                                update_rule(rule.id, options=options)
                            except RuleNotFound:
                                logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did))
                                continue
            except Exception:
                # Top-level boundary of the cycle: log the traceback and keep the daemon alive.
                exc_type, exc_value, exc_traceback = exc_info()
                logging.critical(''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

            for rse_id in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse_id]:
                    tot_datasets += 1
                    tot_files += summary[rse_id][did].get('length', 0)
                    tot_size += summary[rse_id][did].get('bytes', 0)
                vo = get_rse_vo(rse_id=rse_id)
                logging.info(prepend_str + 'For RSE %s %s %s datasets will be deleted representing %s files and %s bytes' % (get_rse_name(rse_id=rse_id), '' if vo == 'def' else 'on VO ' + vo, tot_datasets, tot_files, tot_size))

            if once:
                break

            # Pace the cycles: sleep for whatever is left of sleep_time.
            tottime = time.time() - stime
            if tottime < sleep_time:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                time.sleep(sleep_time - tottime)

        logging.info(prepend_str + 'Graceful stop requested')

    # Release the heartbeat on every exit path, including the refused future-date run.
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def list_rebalance_rule_candidates(rse_id, mode=None, session=None):
    """
    List the rebalance rule candidates based on the agreed on specification

    In 'decommission' mode the candidates are taken from the RSE dump. Otherwise
    they are selected with a SQL query whose constraints can be tightened through
    the ``bb8`` configuration section (``min_expires_date_in_days``,
    ``min_created_days``, ``allowed_accounts``, ``allowed_grouping``,
    ``allowed_did_type``, ``only_move_closed_did``).

    :param rse_id: RSE of the source.
    :param mode: Rebalancing mode.
    :param session: DB Session.
    :returns: For the SQL path, a list of rows (scope, name, rule id, rse_expression,
              subscription_id, did bytes, did length, average file size) ordered by
              average file size then last access time.
    """
    # Reuse the caller's session for the VO lookup instead of opening another one.
    vo = get_rse_vo(rse_id=rse_id, session=session)

    # dumps can be applied only for decommission since the dumps doesn't contain info from dids
    if mode == 'decommission':
        return _list_rebalance_rule_candidates_dump(rse_id, mode)

    # If no decommissioning use SQLAlchemy

    # All date thresholds are derived from the same UTC instant; the original mixed
    # utcnow() and now(), which shifts one threshold by the local UTC offset.
    utc_now = datetime.utcnow()

    # Rules constraints. By default only moves rules in state OK that have no children and have only one copy
    # Additional constraints can be imposed by setting specific configuration
    rule_clause = [models.ReplicationRule.state == RuleState.OK,
                   models.ReplicationRule.child_rule_id.is_(None),
                   models.ReplicationRule.copies == 1]

    # Only move rules w/o expiration date, or rules with expiration_date > <min_expires_date_in_days> days
    expiration_clause = models.ReplicationRule.expires_at.is_(None)
    min_expires_date_in_days = config_get_int(section='bb8', option='min_expires_date_in_days', raise_exception=False, default=-1, expiration_time=3600)
    if min_expires_date_in_days > 0:
        min_expires_date = utc_now + timedelta(days=min_expires_date_in_days)
        expiration_clause = or_(models.ReplicationRule.expires_at > min_expires_date,
                                models.ReplicationRule.expires_at.is_(None))
    rule_clause.append(expiration_clause)

    # Only move rules which were created more than <min_created_days> days ago
    min_created_days = config_get_int(section='bb8', option='min_created_days', raise_exception=False, default=-1, expiration_time=3600)
    if min_created_days > 0:
        max_created_date = utc_now - timedelta(days=min_created_days)
        rule_clause.append(models.ReplicationRule.created_at < max_created_date)

    # Only move rules which are owned by <allowed_accounts> (comma separated accounts, e.g. panda,root,ddmadmin,jdoe)
    allowed_accounts = config_get(section='bb8', option='allowed_accounts', raise_exception=False, default=None, expiration_time=3600)
    if allowed_accounts:
        allowed_accounts = [InternalAccount(acc.strip(' '), vo=vo) for acc in allowed_accounts.split(',')]
        rule_clause.append(models.ReplicationRule.account.in_(allowed_accounts))

    # Only move rules that have a certain grouping <allowed_grouping> (accepted values : all, dataset, none)
    rule_grouping_mapping = {'all': RuleGrouping.ALL, 'dataset': RuleGrouping.DATASET, 'none': RuleGrouping.NONE}
    allowed_grouping = config_get(section='bb8', option='allowed_grouping', raise_exception=False, default=None, expiration_time=3600)
    if allowed_grouping:
        rule_clause.append(models.ReplicationRule.grouping == rule_grouping_mapping.get(allowed_grouping))

    # DIDs constraints. By default only moves rules of DID where we can compute the size
    # Additional constraints can be imposed by setting specific configuration
    did_clause = [models.DataIdentifier.bytes.isnot(None)]

    type_to_did_type_mapping = {'all': [DIDType.CONTAINER, DIDType.DATASET, DIDType.FILE],
                                'collection': [DIDType.CONTAINER, DIDType.DATASET],
                                'container': [DIDType.CONTAINER],
                                'dataset': [DIDType.DATASET],
                                'file': [DIDType.FILE]}

    # Only allows to migrate rules of a certain did_type <allowed_did_type> (accepted values : all, collection, container, dataset, file)
    allowed_did_type = config_get(section='bb8', option='allowed_did_type', raise_exception=False, default=None, expiration_time=3600)
    if allowed_did_type:
        did_type_clauses = [models.DataIdentifier.did_type == did_type
                            for did_type in type_to_did_type_mapping.get(allowed_did_type)]
        # or_() takes the clauses as separate positional arguments, not as one list.
        did_clause.append(or_(*did_type_clauses))

    # Only allows to migrate rules of closed DID if <only_move_closed_did> is set
    only_move_closed_did = config_get_bool(section='bb8', option='only_move_closed_did', raise_exception=False, default=None, expiration_time=3600)
    if only_move_closed_did:
        did_clause.append(models.DataIdentifier.is_open == False)  # NOQA

    # Now build the query
    # Number of dataset locks sharing scope/name/RSE with the current lock row.
    external_dsl = aliased(models.DatasetLock)
    count_locks = select([func.count()]).where(
        and_(external_dsl.scope == models.DatasetLock.scope,
             external_dsl.name == models.DatasetLock.name,
             external_dsl.rse_id == models.DatasetLock.rse_id)).as_scalar()

    # Average file size of the locked dataset (0 when the length is missing or < 1).
    # Built once and reused as selected column, filter and sort key.
    avg_file_size = case(
        [(or_(models.DatasetLock.length < 1, models.DatasetLock.length.is_(None)), 0)],
        else_=cast(models.DatasetLock.bytes / models.DatasetLock.length, BigInteger))

    query = session.query(models.DatasetLock.scope,
                          models.DatasetLock.name,
                          models.ReplicationRule.id,
                          models.ReplicationRule.rse_expression,
                          models.ReplicationRule.subscription_id,
                          models.DataIdentifier.bytes,
                          models.DataIdentifier.length,
                          avg_file_size).\
        join(models.ReplicationRule, models.ReplicationRule.id == models.DatasetLock.rule_id).\
        join(models.DataIdentifier, and_(models.DatasetLock.scope == models.DataIdentifier.scope,
                                         models.DatasetLock.name == models.DataIdentifier.name)).\
        filter(models.DatasetLock.rse_id == rse_id).\
        filter(and_(*rule_clause)).\
        filter(and_(*did_clause)).\
        filter(avg_file_size > 1000000000).\
        filter(count_locks == 1)

    summary = query.order_by(avg_file_size, models.DatasetLock.accessed_at).all()
    return summary
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extract the scope, name and rse_id and fill the bad_replicas table.

    Each cycle fetches up to ``bulk`` bad PFNs and sorts them into two groups:
    BAD/SUSPICIOUS ones are declared as bad file replicas, TEMPORARY_UNAVAILABLE
    ones get their replica state updated and a bad_replicas row added. Processed
    and unknown PFNs are removed from the list of bad PFNs.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])

    states_mapping = {BadPFNStatus.BAD: BadFilesStatus.BAD,
                      BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
                      BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE}
    logging.info(prepend_str + 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
        # Number of PFNs fetched this cycle. Kept separately because the lists
        # handled below are sub-groups and must not drive the "restart
        # immediately" decision at the end of the cycle.
        nb_fetched = 0
        try:
            bad_replicas = {}
            temporary_unvailables = {}
            fetched_pfns = get_bad_pfns(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], limit=bulk)
            nb_fetched = len(fetched_pfns)

            # Class the PFNs into bad_replicas and temporary_unavailable
            for pfn in fetched_pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unvailables:
                        temporary_unvailables[(account, reason, expires_at)] = []
                    temporary_unvailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                group_pfns = bad_replicas[(account, reason, state)]
                logging.info(prepend_str + 'Declaring %s replicas with state %s and reason %s' % (len(group_pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in group_pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(schemes[scheme])
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        # Collected once per scheme, independently of how many RSEs matched.
                        unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                    for rse_id in dict_rse:
                        vo = get_rse_vo(rse_id=rse_id)
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str + 'Running on RSE %s%s with %s replicas' % (get_rse_name(rse_id=rse_id), vo_str, len(dict_rse[rse_id])))
                        nchunk = 0
                        # Float division so the chunk count is rounded up on every interpreter.
                        tot_chunk = int(math.ceil(len(dict_rse[rse_id]) / float(chunk_size)))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                            undeclared = declare_bad_file_replicas(pfns=chunk, reason=reason, issuer=account, status=state, session=session)
                            if undeclared:
                                logging.debug(prepend_str + 'Unknown replicas : %s' % (str(undeclared)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unvailables:
                group_pfns = temporary_unvailables[(account, reason, expires_at)]
                logging.info(prepend_str + 'Declaring %s replicas temporary unavailable with timeout %s and reason %s' % (len(group_pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in group_pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(schemes[scheme])
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                    # Collected once per scheme, independently of how many RSEs matched.
                    unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    vo = get_rse_vo(rse_id=rse_id)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(rse, vo)
                    logging.debug(prepend_str + 'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id], rse_id=None, session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.TEMPORARY_UNAVAILABLE, 'pfn': pfn})
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    logging.debug(prepend_str + 'Running on %s replicas on RSE %s' % (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                            update_replicas_states(chunk, nowait=False, session=session)
                            bulk_add_bad_replicas(chunk, account, state=BadFilesStatus.TEMPORARY_UNAVAILABLE, reason=None, expires_at=expires_at, session=session)
                            chunk_pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=chunk_pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except UnsupportedOperation as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.error(prepend_str + 'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s' % str(error))
                            # Fall back to a per-replica pass: drop the PFNs whose replica is
                            # already unavailable or whose DID no longer exists.
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' % (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(rep['scope'], rep['name'])
                                    unavailable_states.extend(rep_state.get(ReplicaState.TEMPORARY_UNAVAILABLE, []))
                                    unavailable_states.extend(rep_state.get(ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(prepend_str + '%s is in unavailable state. Will be removed from the list of bad PFNs' % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                                except DataIdentifierNotFound:
                                    logging.error(prepend_str + 'Will remove %s from the list of bad PFNs' % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                            # Continue the remaining chunks on a fresh session after the rollback.
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()

        except Exception as error:
            # Top-level boundary of the cycle: log and keep the daemon alive.
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if nb_fetched == bulk:
            logging.info(prepend_str + 'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle')
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')