def test_archive_on_dataset_level(rse_factory, did_factory, root_account):
    """The is_archive flag of a dataset must track attachment and deletion of archives."""
    _rse_name, rse_id = rse_factory.make_xroot_rse()

    dataset1 = did_factory.make_dataset()
    dataset2 = did_factory.make_dataset()
    container = did_factory.make_container()
    attach_dids(dids=[dataset1, dataset2], account=root_account, **container)

    # A plain file keeps the datasets alive once the archive replica is removed.
    plain_file = did_factory.random_did()
    add_replicas(rse_id=rse_id,
                 files=[{**plain_file, 'bytes': 500, 'type': 'FILE', 'adler32': 'beefbeef'}],
                 account=root_account)
    attach_dids(dids=[plain_file], account=root_account, **dataset1)
    attach_dids(dids=[plain_file], account=root_account, **dataset2)

    # Attaching a non-archive file must not flip is_archive.
    assert not get_metadata(**dataset1)['is_archive']

    # Build an archive and its constituents.
    archive = did_factory.random_did(name_prefix='archive', name_suffix='.zip')
    add_replicas(rse_id=rse_id,
                 files=[{**archive, 'bytes': 500, 'type': 'FILE', 'adler32': 'beefbeef'}],
                 account=root_account)
    constituents = [did_factory.random_did() for _ in range(2)]

    # Attach the archive to dataset1 _before_ attaching files to the archive
    # (i.e. before is_archive is set on the archive did) ...
    attach_dids(dids=[archive], account=root_account, **dataset1)
    attach_dids(dids=[{**c, 'bytes': 200, 'adler32': 'ababbaba'} for c in constituents],
                account=root_account, **archive)
    # ... and to dataset2 _after_ the constituents were attached.
    attach_dids(dids=[archive], account=root_account, **dataset2)

    # Either ordering must mark the dataset as containing an archive.
    assert get_metadata(**dataset1)['is_archive'] is True
    assert get_metadata(**dataset2)['is_archive'] is True

    # Deleting the archive replica must clear the flag on both datasets.
    delete_replicas(rse_id=rse_id, files=[archive])
    assert not get_metadata(**dataset1)['is_archive']
    assert not get_metadata(**dataset2)['is_archive']
def get_metadata(scope, name):
    """
    Fetch the metadata of a data identifier.

    :param scope: The scope name.
    :param name: The data identifier name.
    """
    meta = did.get_metadata(scope=scope, name=name)
    return meta
def test_atropos(root_account, rse_factory, mock_scope, did_factory, rucio_client):
    """ Test the behaviour of atropos """
    check_date = (datetime.now() + timedelta(days=365)).isoformat().split('T')[0]

    # Define a lifetime-model policy: datasets with datatype RAW in 'data%' projects
    # expire after an age of 6 months plus a 1 month extension.
    lifetime_dir = '/opt/rucio/etc/policies'
    os.makedirs(lifetime_dir, exist_ok=True)
    lifetime_policy = [{'name': 'Test', 'include': {'datatype': ['RAW'], 'project': ['data%']}, 'age': '6', 'extension': '1'}]
    policy_file = '%s/config_other.json' % lifetime_dir
    with open(policy_file, 'w') as outfile:
        json.dump(lifetime_policy, outfile)
    REGION.invalidate()

    # Fix: run the whole test inside try/finally so the policy file is removed
    # even when an assertion fails and cannot leak into other tests.
    try:
        nb_datasets = 2
        today = datetime.now()
        rse, rse_id = rse_factory.make_posix_rse()
        datasets = [did_factory.make_dataset() for _ in range(nb_datasets)]
        expiration_date = None

        # Check that eol_at is properly set:
        # the rule on dataset 0 matches the policy and must get an eol_at,
        # the rule on dataset 1 does not match and must not get one.
        for cnt, dataset in enumerate(datasets):
            if cnt == 0:
                set_metadata(dataset['scope'], dataset['name'], 'datatype', 'RAW')
                set_metadata(dataset['scope'], dataset['name'], 'project', 'data')
            rule_ids = add_rule(dids=[{'scope': dataset['scope'], 'name': dataset['name']}],
                                account=root_account, copies=1, rse_expression=rse,
                                grouping='DATASET', weight=None, lifetime=None,
                                locked=None, subscription_id=None)
            rule = get_rule(rule_ids[0])
            if cnt == 0:
                expiration_date = rule['eol_at']
                # An age of 6 months corresponds to roughly 180 days from now.
                assert expiration_date is not None
                assert expiration_date - today < timedelta(181)
                assert expiration_date - today > timedelta(179)
            else:
                assert rule['eol_at'] is None

        # Run atropos in dry-run mode to set eol_at on the datasets:
        # dataset 0 must get eol_at, dataset 1 must not.
        atropos(thread=1, bulk=100, date_check=datetime.strptime(check_date, '%Y-%m-%d'),
                dry_run=True, grace_period=86400, once=True, unlock=False,
                spread_period=0, purge_replicas=False, sleep_time=60)
        for cnt, dataset in enumerate(datasets):
            meta = get_metadata(dataset['scope'], dataset['name'])
            if cnt == 0:
                assert meta['eol_at'] is not None
                assert meta['eol_at'] == expiration_date
            else:
                assert meta['eol_at'] is None
    finally:
        # Clean-up: always remove the policy file written above.
        os.remove(policy_file)
def test_update_dids(self):
    """ DATA IDENTIFIERS (CORE): Update file size and checksum"""
    scope_name = 'mock'
    dataset_name = 'dsn_%s' % generate_uuid()
    file_name = 'lfn.%s' % str(generate_uuid())
    add_did(scope=scope_name, name=dataset_name, type=DIDType.DATASET, account='root')

    attached = [{'scope': scope_name,
                 'name': file_name,
                 'bytes': 724963570,
                 'adler32': '0cc737eb',
                 'meta': {'guid': str(generate_uuid()), 'events': 100}}]
    attach_dids(scope=scope_name, name=dataset_name, rse='MOCK', dids=attached, account='root')

    # A checksum update must be visible through get_metadata.
    set_metadata(scope=scope_name, name=file_name, key='adler32', value='0cc737ee')
    assert_equal(get_metadata(scope=scope_name, name=file_name)['adler32'], '0cc737ee')

    # Updating a non-existing DID must be refused.
    with assert_raises(UnsupportedOperation):
        set_metadata(scope=scope_name, name='Nimportnawak', key='adler32', value='0cc737ee')

    # A size update must be visible through get_metadata.
    set_metadata(scope=scope_name, name=file_name, key='bytes', value=724963577)
    assert_equal(get_metadata(scope=scope_name, name=file_name)['bytes'], 724963577)
def test_update_dids(self):
    """ DATA IDENTIFIERS (CORE): Update file size and checksum"""
    scope = InternalScope('mock', **self.vo)
    owner = InternalAccount('root', **self.vo)
    dataset_name = 'dsn_%s' % generate_uuid()
    file_name = 'lfn.%s' % str(generate_uuid())
    add_did(scope=scope, name=dataset_name, type=DIDType.DATASET, account=owner)

    attached = [{'scope': scope,
                 'name': file_name,
                 'bytes': 724963570,
                 'adler32': '0cc737eb',
                 'meta': {'guid': str(generate_uuid()), 'events': 100}}]
    attach_dids(scope=scope, name=dataset_name, rse_id=get_rse_id(rse='MOCK', **self.vo),
                dids=attached, account=owner)

    # A checksum update must be visible through get_metadata.
    set_metadata(scope=scope, name=file_name, key='adler32', value='0cc737ee')
    assert get_metadata(scope=scope, name=file_name)['adler32'] == '0cc737ee'

    # Updating a non-existing DID must raise.
    with pytest.raises(DataIdentifierNotFound):
        set_metadata(scope=scope, name='Nimportnawak', key='adler32', value='0cc737ee')

    # A size update must be visible through get_metadata.
    set_metadata(scope=scope, name=file_name, key='bytes', value=724963577)
    assert get_metadata(scope=scope, name=file_name)['bytes'] == 724963577
def get_metadata(scope, name):
    """
    Get data identifier metadata

    :param scope: The scope name.
    :param name: The data identifier name.
    """
    internal_scope = InternalScope(scope)
    meta = did.get_metadata(scope=internal_scope, name=name)
    return api_update_return_dict(meta)
def get_metadata(scope, name, plugin='DID_COLUMN', vo='def'):
    """
    Get data identifier metadata

    :param scope: The scope name.
    :param name: The data identifier name.
    :param plugin: The metadata plugin to query, 'DID_COLUMN' by default.
    :param vo: The VO to act on.
    """
    # Map the externally-visible scope name to its VO-internal representation.
    scope = InternalScope(scope, vo=vo)
    d = did.get_metadata(scope=scope, name=name, plugin=plugin)
    return api_update_return_dict(d)
def perm_set_status(issuer, kwargs, session=None):
    """
    Checks if an account can set status on an data identifier.

    :param issuer: Account identifier which issues the command.
    :param kwargs: List of arguments for the action.
    :param session: The DB session to use
    :returns: True if account is allowed, otherwise False
    """
    # The metadata lookup happens unconditionally, before any permission check
    # (it also implicitly verifies that the DID exists).
    meta = get_metadata(kwargs['scope'], kwargs['name'], session=session)
    if perm_default(issuer, kwargs, session=session):
        return True
    if has_account_attribute(account=issuer, key='did_admin', session=session):
        return True
    if meta.get('account', '') == issuer:
        return True
    return rucio.core.scope.is_scope_owner(scope=kwargs['scope'], account=issuer, session=session)
def get_metadata(scope, name, plugin='DID_COLUMN', vo='def', session=None):
    """
    Get data identifier metadata

    :param scope: The scope name.
    :param name: The data identifier name.
    :param plugin: The metadata plugin to query, 'DID_COLUMN' by default.
    :param vo: The VO to act on.
    :param session: The database session in use.
    """
    # Map the externally-visible scope name to its VO-internal representation.
    scope = InternalScope(scope, vo=vo)
    d = did.get_metadata(scope=scope, name=name, plugin=plugin, session=session)
    return api_update_return_dict(d, session=session)
def transmogrifier(bulk=5, once=False, sleep_time=60):
    """
    Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash,
    identifies the subscriptions matching the DIDs and
    submit a replication rule for each DID matching a subscription.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'transmogrifier'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    while not graceful_stop.is_set():
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        dids, subscriptions = [], []
        tottime = 0
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get the new DIDs based on the is_new flag
            for did in list_new_dids(thread=heart_beat['assign_thread'],
                                     total_threads=heart_beat['nr_threads'],
                                     chunk_size=bulk, did_type=None):
                dids.append({'scope': did['scope'], 'did_type': str(did['did_type']), 'name': did['name']})

            sub_dict = {3: []}
            # Get the list of subscriptions. The default priority of the subscription is 3.
            # 0 is the highest priority, 5 the lowest
            # The priority is defined as 'policyid'
            for sub in list_subscriptions(None, None):
                if sub['state'] != SubscriptionState.INACTIVE and sub['lifetime'] and (datetime.now() > sub['lifetime']):
                    # Expired subscription: flag it INACTIVE instead of matching against it
                    update_subscription(name=sub['name'], account=sub['account'],
                                        metadata={'state': SubscriptionState.INACTIVE})
                elif sub['state'] in [SubscriptionState.ACTIVE, SubscriptionState.UPDATED]:
                    priority = 3
                    if 'policyid' in sub:
                        if int(sub['policyid']) not in sub_dict:
                            sub_dict[int(sub['policyid'])] = []
                        priority = int(sub['policyid'])
                    sub_dict[priority].append(sub)
            priorities = list(sub_dict.keys())
            priorities.sort()
            # Order the subscriptions according to their priority
            for priority in priorities:
                subscriptions.extend(sub_dict[priority])
        except SubscriptionNotFound as error:
            logging.warning(prepend_str + 'No subscriptions defined: %s' % (str(error)))
            time.sleep(10)
            continue
        except Exception as error:
            logging.error(prepend_str + 'Failed to get list of new DIDs or subscriptions: %s' % (str(error)))

        try:
            results = {}
            start_time = time.time()
            # RSEs not available for write are excluded from rule creation below
            blacklisted_rse_id = [rse['id'] for rse in list_rses({'availability_write': False})]
            logging.debug(prepend_str + 'In transmogrifier worker')
            identifiers = []
            # Loop over all the new dids
            for did in dids:
                did_success = True
                # Only collection DIDs (datasets/containers) are matched against subscriptions
                if did['did_type'] == str(DIDType.DATASET) or did['did_type'] == str(DIDType.CONTAINER):
                    did_tag = '%s:%s' % (did['scope'].internal, did['name'])
                    results[did_tag] = []
                    try:
                        metadata = get_metadata(did['scope'], did['name'])
                        # Loop over all the subscriptions
                        for subscription in subscriptions:
                            # Check if the DID match the subscription
                            if is_matching_subscription(subscription, did, metadata) is True:
                                filter_string = loads(subscription['filter'])
                                split_rule = filter_string.get('split_rule', False)
                                stime = time.time()
                                results[did_tag].append(subscription['id'])
                                logging.info(prepend_str + '%s:%s matches subscription %s' % (did['scope'], did['name'], subscription['name']))
                                rules = loads(subscription['replication_rules'])
                                created_rules = {}
                                cnt = 0
                                for rule_dict in rules:
                                    cnt += 1
                                    created_rules[cnt] = []
                                    # Get all the rule and subscription parameters
                                    grouping = rule_dict.get('grouping', 'DATASET')
                                    lifetime = rule_dict.get('lifetime', None)
                                    ignore_availability = rule_dict.get('ignore_availability', None)
                                    weight = rule_dict.get('weight', None)
                                    source_replica_expression = rule_dict.get('source_replica_expression', None)
                                    # 'locked'/'purge_replicas' arrive as the strings 'True'/'False'
                                    locked = rule_dict.get('locked', None)
                                    if locked == 'True':
                                        locked = True
                                    else:
                                        locked = False
                                    purge_replicas = rule_dict.get('purge_replicas', False)
                                    if purge_replicas == 'True':
                                        purge_replicas = True
                                    else:
                                        purge_replicas = False
                                    rse_expression = str(rule_dict['rse_expression'])
                                    comment = str(subscription['comments'])
                                    subscription_id = str(subscription['id'])
                                    account = subscription['account']
                                    copies = int(rule_dict['copies'])
                                    activity = rule_dict.get('activity', 'User Subscriptions')
                                    try:
                                        validate_schema(name='activity', obj=activity)
                                    except InputValidationError as error:
                                        # Fall back to the default activity on invalid input
                                        logging.error(prepend_str + 'Error validating the activity %s' % (str(error)))
                                        activity = 'User Subscriptions'
                                    if lifetime:
                                        lifetime = int(lifetime)

                                    str_activity = "".join(activity.split())
                                    success = False
                                    nattempt = 5
                                    attemptnr = 0
                                    skip_rule_creation = False

                                    selected_rses = []
                                    chained_idx = rule_dict.get('chained_idx', None)
                                    if chained_idx:
                                        params = {}
                                        if rule_dict.get('associated_site_idx', None):
                                            params['associated_site_idx'] = rule_dict.get('associated_site_idx', None)
                                        logging.debug('%s Chained subscription identified. Will use %s', prepend_str, str(created_rules[chained_idx]))
                                        algorithm = rule_dict.get('algorithm', None)
                                        selected_rses = select_algorithm(algorithm, created_rules[chained_idx], params)
                                    else:
                                        # In the case of chained subscription, don't use rseselector but use the rses returned by the algorithm
                                        if split_rule:
                                            vo = account.vo
                                            rses = parse_expression(rse_expression, filter={'vo': vo})
                                            list_of_rses = [rse['id'] for rse in rses]
                                            # Check that some rule doesn't already exist for this DID and subscription
                                            preferred_rse_ids = []
                                            for rule in list_rules(filters={'subscription_id': subscription_id,
                                                                            'scope': did['scope'],
                                                                            'name': did['name']}):
                                                already_existing_rses = [(rse['rse'], rse['id']) for rse in parse_expression(rule['rse_expression'], filter={'vo': vo})]
                                                for rse, rse_id in already_existing_rses:
                                                    if (rse_id in list_of_rses) and (rse_id not in preferred_rse_ids):
                                                        preferred_rse_ids.append(rse_id)
                                            if len(preferred_rse_ids) >= copies:
                                                skip_rule_creation = True
                                            rse_id_dict = {}
                                            for rse in rses:
                                                rse_id_dict[rse['id']] = rse['rse']
                                            try:
                                                rseselector = RSESelector(account=account, rses=rses, weight=weight,
                                                                          copies=copies - len(preferred_rse_ids))
                                                selected_rses = [rse_id_dict[rse_id] for rse_id, _, _ in rseselector.select_rse(0, preferred_rse_ids=preferred_rse_ids, copies=copies, blacklist=blacklisted_rse_id)]
                                            except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error:
                                                logging.warning(prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites' % (subscription['name'], account, str(error)))
                                                # Now including the blacklisted sites
                                                try:
                                                    rseselector = RSESelector(account=account, rses=rses, weight=weight,
                                                                              copies=copies - len(preferred_rse_ids))
                                                    selected_rses = [rse_id_dict[rse_id] for rse_id, _, _ in rseselector.select_rse(0, preferred_rse_ids=preferred_rse_ids, copies=copies, blacklist=[])]
                                                    ignore_availability = True
                                                except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error:
                                                    logging.error(prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.' % (subscription['name'], account, str(error)))
                                                    monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                                    # The DID won't be reevaluated at the next cycle
                                                    did_success = did_success and True
                                                    continue

                                    for attempt in range(0, nattempt):
                                        attemptnr = attempt
                                        nb_rule = 0
                                        # Try to create the rule
                                        try:
                                            if split_rule:
                                                if not skip_rule_creation:
                                                    # One single-copy rule per selected RSE
                                                    for rse in selected_rses:
                                                        if isinstance(selected_rses, dict):
                                                            source_replica_expression = selected_rses[rse].get('source_replica_expression', None)
                                                            weight = selected_rses[rse].get('weight', None)
                                                        logging.info(prepend_str + 'Will insert one rule for %s:%s on %s' % (did['scope'], did['name'], rse))
                                                        rule_ids = add_rule(dids=[{'scope': did['scope'], 'name': did['name']}],
                                                                            account=account, copies=1, rse_expression=rse,
                                                                            grouping=grouping, weight=weight, lifetime=lifetime,
                                                                            locked=locked, subscription_id=subscription_id,
                                                                            source_replica_expression=source_replica_expression,
                                                                            activity=activity, purge_replicas=purge_replicas,
                                                                            ignore_availability=ignore_availability, comment=comment)
                                                        created_rules[cnt].append(rule_ids[0])
                                                        nb_rule += 1
                                                        if nb_rule == copies:
                                                            success = True
                                                            break
                                            else:
                                                # One rule covering all copies
                                                rule_ids = add_rule(dids=[{'scope': did['scope'], 'name': did['name']}],
                                                                    account=account, copies=copies, rse_expression=rse_expression,
                                                                    grouping=grouping, weight=weight, lifetime=lifetime,
                                                                    locked=locked, subscription_id=subscription['id'],
                                                                    source_replica_expression=source_replica_expression,
                                                                    activity=activity, purge_replicas=purge_replicas,
                                                                    ignore_availability=ignore_availability, comment=comment)
                                                created_rules[cnt].append(rule_ids[0])
                                                nb_rule += 1
                                            monitor.record_counter(counters='transmogrifier.addnewrule.done', delta=nb_rule)
                                            monitor.record_counter(counters='transmogrifier.addnewrule.activity.%s' % str_activity, delta=nb_rule)
                                            success = True
                                            break
                                        except (InvalidReplicationRule, InvalidRuleWeight, InvalidRSEExpression, StagingAreaRuleRequiresLifetime, DuplicateRule) as error:
                                            # Errors that won't be retried
                                            success = True
                                            logging.error(prepend_str + '%s' % (str(error)))
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                            break
                                        except (ReplicationRuleCreationTemporaryFailed, InsufficientTargetRSEs, InsufficientAccountLimit, DatabaseException, RSEBlacklisted, RSEWriteBlocked) as error:
                                            # Errors to be retried
                                            logging.error(prepend_str + '%s Will perform an other attempt %i/%i' % (str(error), attempt + 1, nattempt))
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.%s' % (str(error.__class__.__name__)), delta=1)
                                        except Exception:
                                            # Unexpected errors
                                            monitor.record_counter(counters='transmogrifier.addnewrule.errortype.unknown', delta=1)
                                            exc_type, exc_value, exc_traceback = exc_info()
                                            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

                                    did_success = (did_success and success)
                                    if (attemptnr + 1) == nattempt and not success:
                                        logging.error(prepend_str + 'Rule for %s:%s on %s cannot be inserted' % (did['scope'], did['name'], rse_expression))
                                    else:
                                        logging.info(prepend_str + '%s rule(s) inserted in %f seconds' % (str(nb_rule), time.time() - stime))
                    except DataIdentifierNotFound as error:
                        logging.warning(prepend_str + error)

                if did_success:
                    if did['did_type'] == str(DIDType.FILE):
                        monitor.record_counter(counters='transmogrifier.did.file.processed', delta=1)
                    elif did['did_type'] == str(DIDType.DATASET):
                        monitor.record_counter(counters='transmogrifier.did.dataset.processed', delta=1)
                    elif did['did_type'] == str(DIDType.CONTAINER):
                        monitor.record_counter(counters='transmogrifier.did.container.processed', delta=1)
                    monitor.record_counter(counters='transmogrifier.did.processed', delta=1)
                    identifiers.append({'scope': did['scope'],
                                        'name': did['name'],
                                        'did_type': DIDType.from_sym(did['did_type'])})

            time1 = time.time()
            # Mark the DIDs as processed
            for identifier in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier, None)
            logging.info(prepend_str + 'Time to set the new flag : %f' % (time.time() - time1))
            tottime = time.time() - start_time
            for sub in subscriptions:
                update_subscription(name=sub['name'], account=sub['account'],
                                    metadata={'last_processed': datetime.now()})
            logging.info(prepend_str + 'It took %f seconds to process %i DIDs' % (tottime, len(dids)))
            logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids)))
            monitor.record_counter(counters='transmogrifier.job.done', delta=1)
            monitor.record_timer(stat='transmogrifier.job.duration', time=1000 * tottime)
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())
            monitor.record_counter(counters='transmogrifier.job.error', delta=1)
            monitor.record_counter(counters='transmogrifier.addnewrule.error', delta=1)
        if once is True:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info('%s %s expired replicas returned', prepend_str, len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s', prepend_str, str(expired_replicas))
            # NOTE(review): replicas/bad_replicas are initialised once per cycle and
            # never reset between chunks, so each chunk's commit re-applies all
            # previously accumulated entries — confirm this accumulation is intended.
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s', prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    # Bulk update failed: roll back and retry each replica of the chunk individually
                    session.rollback()  # pylint: disable=no-member
                    logging.warning('%s One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str, scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            # DID is gone: only clean up the bad_replicas entry
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s DID %s:%s does not exist anymore.', prepend_str, scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            # Replica is gone: only clean up the bad_replicas entry
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s Replica %s:%s on RSEID %s does not exist anymore.', prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
                    session = get_session()
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos_temporary_expiration[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        try:
            # Get list of expired TU replicas
            logger(logging.INFO, 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logger(logging.INFO, '%s expired replicas returned', len(expired_replicas))
            logger(logging.DEBUG, 'List of expired replicas returned %s', str(expired_replicas))
            # NOTE(review): replicas/bad_replicas are initialised once per cycle and
            # never reset between chunks, so each chunk's commit re-applies all
            # previously accumulated entries — confirm this accumulation is intended.
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logger(logging.DEBUG, 'Running on %s chunk out of %s', nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    # Bulk update failed: roll back and retry each replica of the chunk individually
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.WARNING, 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logger(logging.DEBUG, 'Working on %s:%s on %s', scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            # DID is gone: only clean up the bad_replicas entry
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'DID %s:%s does not exist anymore.', scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            # Replica is gone: only clean up the bad_replicas entry
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'Replica %s:%s on RSEID %s does not exist anymore.', scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except (DatabaseException, DatabaseError) as error:
                    # DB lock errors (Oracle ORA-00054/ORA-00060, MySQL 1205) are transient: skip the chunk
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                    else:
                        logger(logging.ERROR, 'Exception', exc_info=True)
                    session.rollback()
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, str(traceback.format_exc()))
                    session = get_session()
        except Exception:
            logger(logging.CRITICAL, str(traceback.format_exc()))

        if once:
            break
        daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
def minos(bulk: int = 1000, once: bool = False, sleep_time: int = 60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extract the scope, name and rse_id and fill the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # NOTE(review): the two lines below repeat the two lines above verbatim — looks redundant, confirm before removing
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')
    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    # Translate the state of a bad PFN into the corresponding bad-replica state
    states_mapping = {BadPFNStatus.BAD: BadFilesStatus.BAD,
                      BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
                      BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE}
    logging.info(prepend_str + 'Minos started')
    chunk_size = 10  # The chunk size used for the commits
    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        pfns = []
        try:
            # Keys are (account, reason, state) resp. (account, reason, expires_at) tuples,
            # values are the lists of PFN paths grouped under that key.
            bad_replicas = {}
            temporary_unvailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], limit=bulk)
            # Class the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unvailables:
                        temporary_unvailables[(account, reason, expires_at)] = []
                    temporary_unvailables[(account, reason, expires_at)].append(path)
            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logging.info(prepend_str + 'Declaring %s replicas with state %s and reason %s' % (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        # NOTE(review): collected once per scheme (outside the rse loop) — confirm against upstream layout
                        unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)
                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str + 'Running on RSE %s%s with %s replicas' % (get_rse_name(rse_id=rse_id), vo_str, len(dict_rse[rse_id])))
                        nchunk = 0
                        # NOTE(review): no float() here unlike the TU branch below — fine on Python 3 true division
                        tot_chunk = int(math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                            # Declare the chunk bad and drop the now-handled PFNs from bad_pfns, one commit per chunk
                            unknown_replicas = declare_bad_file_replicas(pfns=chunk, reason=reason, issuer=account, status=state, session=session)
                            if unknown_replicas:
                                logging.debug(prepend_str + 'Unknown replicas : %s' % (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())
            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unvailables:
                vo = account.vo
                pfns = temporary_unvailables[(account, reason, expires_at)]
                # NOTE(review): message says 'temporary available' but these replicas are declared temporary UNAVAILABLE
                logging.info(prepend_str + 'Declaring %s replicas temporary available with timeout %s and reason %s' % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                    unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))
                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)
                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(rse, vo)
                    logging.debug(prepend_str + 'Running on RSE %s' % rse_vo_str)
                    # Resolve each PFN into (scope, name) and build the replica-state update payload
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id], rse_id=None, vo=vo, session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.TEMPORARY_UNAVAILABLE, 'pfn': pfn})
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    logging.debug(prepend_str + 'Running on %s replicas on RSE %s' % (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                            update_replicas_states(chunk, nowait=False, session=session)
                            bulk_add_bad_replicas(chunk, account, state=BadFilesStatus.TEMPORARY_UNAVAILABLE, reason=None, expires_at=expires_at, session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation, ReplicaNotFound) as error:
                            # Bulk update failed: fall back to handling the chunk replica by replica
                            session.rollback()  # pylint: disable=no-member
                            logging.error(prepend_str + 'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s' % str(error))
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' % (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    # Collect all the RSEs where the replica is in a state that makes it unavailable
                                    unavailable_states = []
                                    rep_state = get_replicas_state(rep['scope'], rep['name'])
                                    unavailable_states.extend(rep_state.get(ReplicaState.TEMPORARY_UNAVAILABLE, []))
                                    unavailable_states.extend(rep_state.get(ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(prepend_str + '%s is in unavailable state. Will be removed from the list of bad PFNs' % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                                    elif expires_at < datetime.now():
                                        logging.info('%s PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns', prepend_str, str(rep['pfn']), expires_at)
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                                except (DataIdentifierNotFound, ReplicaNotFound) as error:
                                    logging.error(prepend_str + 'Will remove %s from the list of bad PFNs' % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                            # Get a fresh session after the rollback/fallback handling
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()
        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))
        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            # A full page of PFNs was fetched: more work is probably pending, loop again without sleeping
            logging.info(prepend_str + 'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle')
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
def minos_tu_expiration(bulk: int = 1000, once: bool = False, sleep_time: int = 60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')
    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            # NOTE: the limit is hard-coded to 1000 here; the `bulk` parameter is kept for interface compatibility
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info(prepend_str + '%s expired replicas returned' % len(expired_replicas))
            logging.debug(prepend_str + 'List of expired replicas returned %s' % str(expired_replicas))

            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                # Each expired replica is a (scope, name, rse_id) tuple.
                # Process and update the replicas in chunks
                replicas = [{'scope': replica[0], 'name': replica[1], 'rse_id': replica[2], 'state': ReplicaState.AVAILABLE} for replica in chunk]
                # Remove the replicas from bad_replicas table in chunks
                bad_replicas = [{'scope': replica[0], 'name': replica[1], 'rse_id': replica[2], 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE} for replica in chunk]
                try:
                    nchunk += 1
                    logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    # Bulk update failed: retry the chunk one replica at a time so the valid ones still go through
                    session.rollback()  # pylint: disable=no-member
                    logging.warning(prepend_str + 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s' % str(error))
                    for idx in range(len(chunk)):
                        logging.debug(prepend_str + 'Working on %s' % (str(replicas[idx])))
                        try:
                            get_metadata(replicas[idx]['scope'], replicas[idx]['name'])
                            update_replicas_states([replicas[idx], ], nowait=True, session=session)
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            # The DID is gone: only clean up the bad_replicas entry
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(prepend_str + 'DID %s:%s does not exist anymore. ' % (bad_replicas[idx]['scope'], bad_replicas[idx]['name']))
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            # The replica is gone from this RSE: only clean up the bad_replicas entry
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(prepend_str + '%s:%s on RSEID %s does not exist anymore. ' % (replicas[idx]['scope'], replicas[idx]['name'], replicas[idx]['rse_id']))
                            bulk_delete_bad_replicas([bad_replicas[idx], ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())
                    session = get_session()
        except Exception:
            logging.critical(traceback.format_exc())

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
assert_equal(None, get_did_atime(scope=tmp_scope, name=tmp_dsn2)) def test_update_dids(self): """ DATA IDENTIFIERS (CORE): Update file size and checksum""" tmp_scope = 'mock' dsn = 'dsn_%s' % generate_uuid() lfn = 'lfn.%s' % str(generate_uuid()) add_did(scope=tmp_scope, name=dsn, type=DIDType.DATASET, account='root') files = [{'scope': tmp_scope, 'name': lfn, 'bytes': 724963570L, 'adler32': '0cc737eb', 'meta': {'guid': str(generate_uuid()), 'events': 100}}] attach_dids(scope=tmp_scope, name=dsn, rse='MOCK', dids=files, account='root') set_metadata(scope=tmp_scope, name=lfn, key='adler32', value='0cc737ee') assert_equal(get_metadata(scope=tmp_scope, name=lfn)['adler32'], '0cc737ee') with assert_raises(UnsupportedOperation): set_metadata(scope=tmp_scope, name='Nimportnawak', key='adler32', value='0cc737ee') set_metadata(scope=tmp_scope, name=lfn, key='bytes', value=724963577L) assert_equal(get_metadata(scope=tmp_scope, name=lfn)['bytes'], 724963577L) def test_get_did_with_dynamic(self): """ DATA IDENTIFIERS (CORE): Get did with dynamic resolve of size""" tmp_scope = 'mock' tmp_dsn1 = 'dsn_%s' % generate_uuid() tmp_dsn2 = 'dsn_%s' % generate_uuid() tmp_dsn3 = 'dsn_%s' % generate_uuid() tmp_dsn4 = 'dsn_%s' % generate_uuid()
'scope': response['scope'], 'name': response['name'], 'state': ReplicaState.AVAILABLE}], nowait=False, session=session) except UnsupportedOperation: # replica not found, but it has been transferred because cancel came too late. # so we need to register the replica and schedule it for deletion again record_timer('daemons.conveyor.common.update_request_state.replica-update_replicas_states', (time.time()-tss)*1000) logging.warn('DID %s:%s AT RSE %s NOT FOUND - Registering replica and scheduling for immediate deletion' % (response['scope'], response['name'], rse_name)) did_meta = None try: did_meta = did.get_metadata(response['scope'], response['name'], session=session) except: logging.critical('DID %s:%s NOT FOUND - Cannot re-register replica - potential dark data' % (response['scope'], response['name'])) raise if did_meta: try: replica.add_replica(rse_name, response['scope'], response['name'], did_meta['bytes'], did_meta['account'], adler32=did_meta['adler32'], tombstone=datetime.datetime.utcnow(),
def run_once(heartbeat_handler: "HeartbeatHandler", bulk: int, **_kwargs) -> bool:
    """
    Perform one transmogrifier iteration: fetch the new DIDs and the active
    subscriptions, match the DIDs against the subscriptions and create the
    corresponding replication rules, then flag the DIDs as processed.

    :param heartbeat_handler: project HeartbeatHandler; provides live() -> (worker_number, total_workers, logger).
    :param bulk: Maximum number of new DIDs fetched per iteration.
    :returns: must_sleep — True when the caller may sleep before the next iteration,
              False when the listing failed and a retry should happen soon.
    """
    worker_number, total_workers, logger = heartbeat_handler.live()
    dids, subscriptions = [], []
    tottime = 0
    try:
        # Get the new DIDs based on the is_new flag
        logger(logging.DEBUG, "Listing new dids")
        for did in list_new_dids(thread=worker_number, total_threads=total_workers, chunk_size=bulk, did_type=None):
            dids.append({"scope": did["scope"], "did_type": str(did["did_type"]), "name": did["name"]})
        logger(logging.INFO, "%i new dids to process", len(dids))
        sub_dict = {3: []}
        # Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest
        # The priority is defined as 'policyid'
        logger(logging.DEBUG, "Listing active subscriptions")
        for sub in list_subscriptions(None, None):
            if (sub["state"] != SubscriptionState.INACTIVE and sub["lifetime"] and (datetime.now() > sub["lifetime"])):
                # Lifetime elapsed: retire the subscription
                update_subscription(name=sub["name"], account=sub["account"], metadata={"state": SubscriptionState.INACTIVE})
            elif sub["state"] in [SubscriptionState.ACTIVE, SubscriptionState.UPDATED]:
                priority = 3
                if "policyid" in sub:
                    if int(sub["policyid"]) not in sub_dict:
                        sub_dict[int(sub["policyid"])] = []
                    priority = int(sub["policyid"])
                sub_dict[priority].append(sub)
        priorities = list(sub_dict.keys())
        priorities.sort()
        # Order the subscriptions according to their priority
        for priority in priorities:
            subscriptions.extend(sub_dict[priority])
        logger(logging.INFO, "%i active subscriptions", len(subscriptions))
    except SubscriptionNotFound as error:
        logger(logging.WARNING, "No subscriptions defined: %s" % (str(error)))
        must_sleep = True
        return must_sleep
    except Exception as error:
        logger(logging.ERROR, "Failed to get list of new DIDs or subscriptions: %s" % (str(error)))
        must_sleep = False
        return must_sleep
    results = {}
    start_time = time.time()
    # RSEs closed for writing are excluded from rule placement
    blocklisted_rse_id = [rse["id"] for rse in list_rses({"availability_write": False})]
    identifiers = []
    # Loop over all the new dids
    for did in dids:
        _, _, logger = heartbeat_handler.live()
        did_success = True
        # Only datasets and containers are matched against subscriptions
        if did["did_type"] == str(DIDType.DATASET) or did["did_type"] == str(DIDType.CONTAINER):
            did_tag = "%s:%s" % (did["scope"].internal, did["name"])
            results[did_tag] = []
            try:
                metadata = get_metadata(did["scope"], did["name"])
                # Loop over all the subscriptions
                for subscription in subscriptions:
                    # Check if the DID match the subscription
                    if is_matching_subscription(subscription, did, metadata) is True:
                        filter_string = loads(subscription["filter"])
                        split_rule = filter_string.get("split_rule", False)
                        stime = time.time()
                        results[did_tag].append(subscription["id"])
                        logger(logging.INFO, "%s:%s matches subscription %s" % (did["scope"], did["name"], subscription["name"]))
                        rules = loads(subscription["replication_rules"])
                        created_rules = {}
                        cnt = 0
                        for rule_dict in rules:
                            cnt += 1
                            created_rules[cnt] = []
                            # Get all the rule and subscription parameters
                            grouping = rule_dict.get("grouping", "DATASET")
                            lifetime = rule_dict.get("lifetime", None)
                            ignore_availability = rule_dict.get("ignore_availability", None)
                            weight = rule_dict.get("weight", None)
                            source_replica_expression = rule_dict.get("source_replica_expression", None)
                            # 'locked' / 'purge_replicas' arrive as strings; coerce to booleans
                            locked = rule_dict.get("locked", None)
                            if locked == "True":
                                locked = True
                            else:
                                locked = False
                            purge_replicas = rule_dict.get("purge_replicas", False)
                            if purge_replicas == "True":
                                purge_replicas = True
                            else:
                                purge_replicas = False
                            rse_expression = str(rule_dict["rse_expression"])
                            # Rule-level comment overrides the subscription-level one
                            comment = str(subscription["comments"])[:RULES_COMMENT_LENGTH]
                            if "comments" in rule_dict:
                                comment = str(rule_dict["comments"])
                            subscription_id = str(subscription["id"])
                            account = subscription["account"]
                            copies = int(rule_dict["copies"])
                            activity = rule_dict.get("activity", "User Subscriptions")
                            try:
                                validate_schema(name="activity", obj=activity, vo=account.vo)
                            except InputValidationError as error:
                                logger(logging.ERROR, "Error validating the activity %s" % (str(error)))
                                activity = "User Subscriptions"
                            if lifetime:
                                lifetime = int(lifetime)
                            str_activity = "".join(activity.split())
                            success = False
                            nattempt = 5
                            attemptnr = 0
                            skip_rule_creation = False
                            selected_rses = []
                            chained_idx = rule_dict.get("chained_idx", None)
                            if chained_idx:
                                # Chained subscription: don't use the rse selector, use the RSEs returned by the algorithm
                                params = {}
                                if rule_dict.get("associated_site_idx", None):
                                    params["associated_site_idx"] = rule_dict.get("associated_site_idx", None)
                                logger(logging.DEBUG, "Chained subscription identified. Will use %s", str(created_rules[chained_idx]))
                                algorithm = rule_dict.get("algorithm", None)
                                selected_rses = select_algorithm(algorithm, created_rules[chained_idx], params)
                            else:
                                # Non-chained subscription: resolve the RSE expression ourselves
                                if split_rule:
                                    # Prefer RSEs that already host a rule from this subscription for this DID
                                    preferred_rses = set()
                                    for rule in list_rules(filters={"subscription_id": subscription_id, "scope": did["scope"], "name": did["name"]}):
                                        for rse_dict in parse_expression(rule["rse_expression"], filter_={"vo": account.vo}):
                                            preferred_rses.add(rse_dict["rse"])
                                    preferred_rses = list(preferred_rses)
                                    try:
                                        (selected_rses, preferred_unmatched) = resolve_rse_expression(rse_expression, account, weight=weight, copies=copies, size=0, preferred_rses=preferred_rses, blocklist=blocklisted_rse_id)
                                    except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error:
                                        logger(logging.WARNING, 'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blocklisted sites' % (subscription["name"], account, str(error)))
                                        # Now including the blocklisted sites
                                        try:
                                            (selected_rses, preferred_unmatched) = resolve_rse_expression(rse_expression, account, weight=weight, copies=copies, size=0, preferred_rses=preferred_rses)
                                            ignore_availability = True
                                        except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight, RSEOverQuota) as error:
                                            logger(logging.ERROR, 'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.' % (subscription["name"], account, str(error)))
                                            monitor.record_counter(name="transmogrifier.addnewrule.errortype.{exception}", labels={"exception": str(error.__class__.__name__)})
                                            # The DID won't be reevaluated at the next cycle
                                            did_success = did_success and True
                                            continue
                                    # Enough preferred RSEs already matched: no new rule needed
                                    if (len(preferred_rses) - len(preferred_unmatched) >= copies):
                                        skip_rule_creation = True
                            for attempt in range(0, nattempt):
                                attemptnr = attempt
                                nb_rule = 0
                                # Try to create the rule
                                try:
                                    if split_rule:
                                        if not skip_rule_creation:
                                            # One single-copy rule per selected RSE until `copies` rules exist
                                            for rse in selected_rses:
                                                if isinstance(selected_rses, dict):
                                                    source_replica_expression = selected_rses[rse].get("source_replica_expression", None)
                                                    weight = selected_rses[rse].get("weight", None)
                                                logger(logging.INFO, "Will insert one rule for %s:%s on %s" % (did["scope"], did["name"], rse))
                                                rule_ids = add_rule(dids=[{"scope": did["scope"], "name": did["name"]}], account=account, copies=1, rse_expression=rse, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id=subscription_id, source_replica_expression=source_replica_expression, activity=activity, purge_replicas=purge_replicas, ignore_availability=ignore_availability, comment=comment)
                                                created_rules[cnt].append(rule_ids[0])
                                                nb_rule += 1
                                                if nb_rule == copies:
                                                    success = True
                                                    break
                                    else:
                                        # Single multi-copy rule on the full RSE expression
                                        rule_ids = add_rule(dids=[{"scope": did["scope"], "name": did["name"]}], account=account, copies=copies, rse_expression=rse_expression, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id=subscription["id"], source_replica_expression=source_replica_expression, activity=activity, purge_replicas=purge_replicas, ignore_availability=ignore_availability, comment=comment)
                                        created_rules[cnt].append(rule_ids[0])
                                        nb_rule += 1
                                    monitor.record_counter(name="transmogrifier.addnewrule.done", delta=nb_rule)
                                    monitor.record_counter(name="transmogrifier.addnewrule.activity.{activity}", delta=nb_rule, labels={"activity": str_activity})
                                    success = True
                                    break
                                except (InvalidReplicationRule, InvalidRuleWeight, InvalidRSEExpression, StagingAreaRuleRequiresLifetime, DuplicateRule) as error:
                                    # Errors that won't be retried
                                    success = True
                                    logger(logging.ERROR, str(error))
                                    monitor.record_counter(name="transmogrifier.addnewrule.errortype.{exception}", labels={"exception": str(error.__class__.__name__)})
                                    break
                                except (ReplicationRuleCreationTemporaryFailed, InsufficientTargetRSEs, InsufficientAccountLimit, DatabaseException, RSEWriteBlocked) as error:
                                    # Errors to be retried
                                    logger(logging.ERROR, "%s Will perform an other attempt %i/%i" % (str(error), attempt + 1, nattempt))
                                    monitor.record_counter(name="transmogrifier.addnewrule.errortype.{exception}", labels={"exception": str(error.__class__.__name__)})
                                except Exception:
                                    # Unexpected errors
                                    monitor.record_counter(name="transmogrifier.addnewrule.errortype.{exception}", labels={"exception": "unknown"})
                                    logger(logging.ERROR, "Unexpected error", exc_info=True)
                            did_success = did_success and success
                            if (attemptnr + 1) == nattempt and not success:
                                logger(logging.ERROR, "Rule for %s:%s on %s cannot be inserted" % (did["scope"], did["name"], rse_expression))
                            else:
                                logger(logging.INFO, "%s rule(s) inserted in %f seconds" % (str(nb_rule), time.time() - stime))
            except DataIdentifierNotFound as error:
                logger(logging.WARNING, str(error))
        if did_success:
            # Record per-type counters and queue the DID for the is_new flag reset
            if did["did_type"] == str(DIDType.FILE):
                monitor.record_counter(name="transmogrifier.did.file.processed")
            elif did["did_type"] == str(DIDType.DATASET):
                monitor.record_counter(name="transmogrifier.did.dataset.processed")
            elif did["did_type"] == str(DIDType.CONTAINER):
                monitor.record_counter(name="transmogrifier.did.container.processed", delta=1)
            monitor.record_counter(name="transmogrifier.did.processed", delta=1)
            identifiers.append({"scope": did["scope"], "name": did["name"], "did_type": did["did_type"]})
    time1 = time.time()
    # Mark the DIDs as processed
    for identifier in chunks(identifiers, 100):
        _retrial(set_new_dids, identifier, None)
    logger(logging.DEBUG, "Time to set the new flag : %f" % (time.time() - time1))
    tottime = time.time() - start_time
    for sub in subscriptions:
        update_subscription(name=sub["name"], account=sub["account"], metadata={"last_processed": datetime.now()})
    logger(logging.INFO, "It took %f seconds to process %i DIDs" % (tottime, len(dids)))
    logger(logging.DEBUG, "DIDs processed : %s" % (str(dids)))
    monitor.record_counter(name="transmogrifier.job.done", delta=1)
    monitor.record_timer(name="transmogrifier.job.duration", time=1000 * tottime)
    must_sleep = True
    return must_sleep
assert_equal(None, get_did_atime(scope=tmp_scope, name=tmp_dsn2)) def test_update_dids(self): """ DATA IDENTIFIERS (CORE): Update file size and checksum""" tmp_scope = 'mock' dsn = 'dsn_%s' % generate_uuid() lfn = 'lfn.%s' % str(generate_uuid()) add_did(scope=tmp_scope, name=dsn, type=DIDType.DATASET, account='root') files = [{'scope': tmp_scope, 'name': lfn, 'bytes': 724963570L, 'adler32': '0cc737eb', 'meta': {'guid': str(generate_uuid()), 'events': 100}}] attach_dids(scope=tmp_scope, name=dsn, rse='MOCK', dids=files, account='root') set_metadata(scope=tmp_scope, name=lfn, key='adler32', value='0cc737ee') assert_equal(get_metadata(scope=tmp_scope, name=lfn)['adler32'], '0cc737ee') with assert_raises(UnsupportedOperation): set_metadata(scope=tmp_scope, name='Nimportnawak', key='adler32', value='0cc737ee') set_metadata(scope=tmp_scope, name=lfn, key='bytes', value=724963577L) assert_equal(get_metadata(scope=tmp_scope, name=lfn)['bytes'], 724963577L) class TestDIDApi: def test_list_new_dids(self): """ DATA IDENTIFIERS (API): List new identifiers """ tmp_scope = scope_name_generator() tmp_dsn = 'dsn_%s' % generate_uuid() scope.add_scope(tmp_scope, 'jdoe', 'jdoe')
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5, sleep_time=300):
    """
    Main loop to check for available replicas which are labeled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None, we either use all VOs if run from "def",
    :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE
                                          is considered problematic and the suspicious replicas on that RSE
                                          are labeled as 'TEMPORARY_UNAVAILABLE'.
    :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than
                       sleep_time, then it should sleep until sleep_time is over.
    :returns: None
    """
    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES
    executable = argv[0]

    prepend_str = 'replica_recoverer: '
    logger = formatted_logger(logging.log, prepend_str + '%s')

    # Resolve which VOs to work on; in single-VO mode the 'vos' argument is ignored.
    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    if not multi_vo:
        if vos:
            logger(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.')
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - set([v['vo'] for v in list_vos()])
            if invalid:
                msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logger(logging.INFO, 'replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos])))

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other
            # (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logger(logging.ERROR, 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname())
                die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
                break

            prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number, total_workers)
            logger = formatted_logger(logging.log, prepend_str + '%s')

            start = time.time()

            # Load the datatype->action policy file. FIX(review): the original used a
            # bare "except:" and never closed the file handle; after a json.load
            # ValueError it also continued with 'json_data' unbound (NameError later).
            # Use a context manager, catch only OSError for open failures, and stop
            # gracefully on a decode failure as well.
            try:
                with open("/opt/rucio/etc/suspicious_replica_recoverer.json") as json_file:
                    json_data = json.load(json_file)
            except OSError:
                logger(logging.WARNING, "An error occured whilst trying to open the JSON file.")
                break
            except ValueError:
                logger(logging.WARNING, "No JSON object could be decoded.")
                break

            # Checking that the json file is formatted properly.
            # NOTE(review): 'break' only stops validating further entries; the daemon
            # still proceeds with the (partially invalid) policy list -- confirm intent.
            for i, entry in enumerate(json_data):
                if "datatype" not in entry or "action" not in entry:
                    logger(logging.ERROR, 'Entry %s in the json file is incomplete (missing either "datatype" or "action").', i)
                    break

            logger(logging.INFO, 'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.', younger_than, nattempts)

            getfileskwargs = {'younger_than': younger_than,
                              'nattempts': nattempts,
                              'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                              'is_suspicious': True}

            for vo in vos:
                logger(logging.INFO, 'Start replica recovery for VO: %s', vo)
                recoverable_replicas = {}
                if vo not in recoverable_replicas:
                    recoverable_replicas[vo] = {}
                # rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true', filter={'vo': vo})], key=lambda k: k['rse'])
                rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true') if rse['vo'] == vo], key=lambda k: k['rse'])

                logger(logging.DEBUG, "List of RSEs with enable_suspicious_file_recovery = True:")
                for i in rse_list:
                    logger(logging.DEBUG, '%s', i)

                for rse in rse_list:
                    time_start_rse = time.time()
                    rse_expr = rse['rse']
                    cnt_surl_not_found = 0
                    if rse_expr not in recoverable_replicas[vo]:
                        recoverable_replicas[vo][rse_expr] = {}
                    # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs
                    suspicious_replicas_avail_elsewhere = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["EXIST_COPIES"].value, filter_={'vo': vo}, **getfileskwargs)
                    # Get the suspicious replicas that are the last remaining copies
                    suspicious_replicas_last_copy = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["LAST_COPY"].value, filter_={'vo': vo}, **getfileskwargs)

                    logger(logging.DEBUG, 'Suspicious replicas on %s:', rse_expr)
                    logger(logging.DEBUG, 'Replicas with copies on other RSEs (%s):', len(suspicious_replicas_avail_elsewhere))
                    for i in suspicious_replicas_avail_elsewhere:
                        logger(logging.DEBUG, '%s', i)
                    logger(logging.DEBUG, 'Replicas that are the last remaining copy (%s):', len(suspicious_replicas_last_copy))
                    for i in suspicious_replicas_last_copy:
                        logger(logging.DEBUG, '%s', i)

                    # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE.
                    # NOTE(review): {4, 5, 6, 7} presumably encodes availability bitmasks with 'read' set -- confirm.
                    if (rse['availability'] not in {4, 5, 6, 7}) and ((len(suspicious_replicas_avail_elsewhere) > 0) or (len(suspicious_replicas_last_copy) > 0)):
                        logger(logging.WARNING, "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate. \n", rse_expr, rse['availability'])
                        continue

                    # Resolve the surl (pfn) of each suspicious replica that has copies elsewhere.
                    if suspicious_replicas_avail_elsewhere:
                        for replica in suspicious_replicas_avail_elsewhere:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    for rse_ in rep['rses']:
                                        if rse_ == rse_id:
                                            recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_][0], 'available_elsewhere': True}
                                            surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    # Resolve the surl (pfn) of each suspicious replica that is the last remaining copy.
                    if suspicious_replicas_last_copy:
                        for replica in suspicious_replicas_last_copy:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                # Should only return one rse, as there is only one replica remaining
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': False}
                                    surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    logger(logging.INFO, 'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.',
                           time.time() - time_start_rse, rse_expr,
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy),
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) - cnt_surl_not_found,
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy))

                    if len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) != 0:
                        logger(logging.DEBUG, 'List of replicas on %s for which the pfns have been found:', rse_expr)
                        for i in recoverable_replicas[vo][rse_expr]:
                            logger(logging.DEBUG, '%s', i)

                # Log file is long and hard to read -> implement some spacing
                logger(logging.INFO, 'All RSEs have been checked for suspicious replicas. Total time: %s seconds.', time.time() - start)
                logger(logging.INFO, 'Begin check for problematic RSEs.')
                time_start_check_probl = time.time()

                # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE.
                # The suspicious files are marked as temporarily unavailable.
                list_problematic_rses = []
                for rse_key in list(recoverable_replicas[vo].keys()):
                    if len(recoverable_replicas[vo][rse_key].values()) > limit_suspicious_files_on_rse:
                        list_problematic_rses.append(rse_key)
                        surls_list = []
                        for replica_value in recoverable_replicas[vo][rse_key].values():
                            surls_list.append(replica_value['surl'])

                        add_bad_pfns(pfns=surls_list, account=InternalAccount('root', vo=vo), state='TEMPORARY_UNAVAILABLE', expires_at=datetime.utcnow() + timedelta(days=3))

                        logger(logging.INFO, "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).", rse_key, limit_suspicious_files_on_rse)
                        logger(logging.INFO, "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:", rse_key)
                        for rse_values in recoverable_replicas[vo][rse_key].values():
                            logger(logging.INFO, 'Scope: %s Name: %s', rse_values['scope'], rse_values['name'])
                        # Remove the RSE from the dictionary as it has been dealt with.
                        del recoverable_replicas[vo][rse_key]

                logger(logging.INFO, "Following RSEs were deemed problematic (total: %s)", len(list_problematic_rses))
                for rse in list_problematic_rses:
                    logger(logging.INFO, "%s", rse)

                # Label suspicious replicas as bad if they have other copies on other RSEs (that aren't also marked as suspicious).
                # If they are the last remaining copies, deal with them differently.
                for rse_key in list(recoverable_replicas[vo].keys()):
                    files_to_be_declared_bad = []
                    files_to_be_ignored = []
                    # Remove RSEs from dictionary that don't have any suspicious replicas
                    if len(recoverable_replicas[vo][rse_key]) == 0:
                        del recoverable_replicas[vo][rse_key]
                        continue
                    # Get the rse_id by going to one of the suspicious replicas from that RSE and reading it from there
                    rse_id = list(recoverable_replicas[vo][rse_key].values())[0]['rse_id']
                    for replica_key in list(recoverable_replicas[vo][rse_key].keys()):
                        if recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is True:
                            # Replicas with other copies on at least one other RSE can safely be labeled as bad
                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                            # Remove replica from dictionary
                            del recoverable_replicas[vo][rse_key][replica_key]
                        elif recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is False:
                            if (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("log.")) or (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("user")):
                                # Don't keep log files or user files
                                files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                del recoverable_replicas[vo][rse_key][replica_key]
                            else:
                                # Deal with replicas based on their metadata.
                                file_metadata = get_metadata(recoverable_replicas[vo][rse_key][replica_key]["scope"], recoverable_replicas[vo][rse_key][replica_key]["name"])
                                if file_metadata["datatype"] is None:  # "None" type has no function "split()"
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                    continue
                                for i in json_data:
                                    if i["datatype"] == file_metadata["datatype"].split("_")[-1]:
                                        action = i["action"]
                                        if action == "ignore":
                                            files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        elif action == "declare bad":
                                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        else:
                                            logger(logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)", rse_key, replica_key, recoverable_replicas[vo][rse_key][replica_key]['surl'], i["datatype"], i["action"])
                                        break
                                else:
                                    # If no policy has been set, default to ignoring the file (no action taken).
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])

                    logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be ignored:', rse_key)
                    for i in files_to_be_ignored:
                        logger(logging.INFO, '%s', i)
                    logger(logging.INFO, '(%s) Remaining replica (pfns) that will be declared BAD:', rse_key)
                    for i in files_to_be_declared_bad:
                        logger(logging.INFO, '%s', i)

                    if files_to_be_declared_bad:
                        logger(logging.INFO, 'Ready to declare %s bad replica(s) on %s (RSE id: %s).', len(files_to_be_declared_bad), rse_key, str(rse_id))

                        declare_bad_file_replicas(pfns=files_to_be_declared_bad, reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), session=None)

                        logger(logging.INFO, 'Finished declaring bad replicas on %s.\n', rse_key)

                logger(logging.INFO, 'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.', time.time() - time_start_check_probl)

            time_passed = time.time() - start
            logger(logging.INFO, 'Total time: %s seconds', time_passed)
            daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        except (DatabaseException, DatabaseError) as err:
            # Transient DB conditions (connection-pool exhaustion, Oracle connection
            # loss) are only warnings; anything else is critical. All are counted.
            if match('.*QueuePool.*', str(err.args[0])) or match('.*ORA-03135.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
            else:
                logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        except Exception as err:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    logger(logging.INFO, 'Graceful stop done.')
def lfn2pfn_DUNE(scope, name, rse, rse_attrs, protocol_attrs):
    """
    Map a DUNE LFN (scope, name) to a deterministic PFN.

    Queries MetaCat for the file's metadata, derives a year and run-number hash
    path, and caches the resulting PFN in Rucio's DID metadata under key
    'PFN_<rse>' so subsequent calls return it without a MetaCat round-trip.

    :param scope: Scope of the DID (string form).
    :param name: Name of the DID.
    :param rse: RSE name; used only to build the metadata cache key.
    :param rse_attrs: RSE attributes (unused here, part of the lfn2pfn contract).
    :param protocol_attrs: Protocol attributes (unused here, part of the lfn2pfn contract).
    :returns: The PFN string.
    :raises ValueError: if no MetaCat URL is configured.
    """
    # FIX(review): removed the original "global metacat_base" -- the function
    # never reads or assigns that name (it uses the local 'metacat_url').
    from rucio.common import config
    from rucio.common.types import InternalScope
    from rucio.rse import rsemanager
    from metacat.webapi import MetaCatClient

    # current URL: https://metacat.fnal.gov:9443/dune_meta_demo/app
    metacat_url = config.config_get('policy', 'metacat_base_url') or os.environ.get("METACAT_SERVER_URL")
    if metacat_url is None:
        raise ValueError("MetaCat client URL is not configured")
    metacat_client = MetaCatClient(metacat_url)

    def get_metadata_field(metadata, field):
        # Return the metadata value, or the literal string 'None' (which becomes
        # a path component) when the field is absent.
        return metadata.get(field, 'None')

    # check to see if PFN is already cached in Rucio's metadata system
    didclient = None
    didmd = {}
    internal_scope = InternalScope(scope)
    if getattr(rsemanager, 'CLIENT_MODE', None):
        from rucio.client.didclient import DIDClient
        didclient = DIDClient()
        didmd = didclient.get_metadata(internal_scope, name)
    if getattr(rsemanager, 'SERVER_MODE', None):
        from rucio.core.did import get_metadata
        didmd = get_metadata(internal_scope, name)

    # if it is, just return it
    md_key = 'PFN_' + rse
    if md_key in didmd:
        return didmd[md_key]

    # Not cached: fetch the file's metadata from MetaCat.
    lfn = scope + ':' + name
    jsondata = metacat_client.get_file(name=lfn)
    metadata = jsondata["metadata"]

    # determine year from timestamps (prefer start time, then end time, then creation)
    timestamp = None
    if 'core.start_time' in metadata:
        timestamp = metadata['core.start_time']
    elif 'core.end_time' in metadata:
        timestamp = metadata['core.end_time']
    elif 'created_timestamp' in jsondata:
        timestamp = jsondata['created_timestamp']
    if timestamp is None:
        year = 'None'
    else:
        dt = datetime.utcfromtimestamp(timestamp)
        year = str(dt.year)

    # determine hashes from run number: four two-digit groups taken from
    # successive decimal digit pairs of the run number
    run_number = 0
    if 'core.runs' in metadata:
        run_number = int(metadata['core.runs'][0])
    hash1 = "%02d" % ((run_number // 1000000) % 100)
    hash2 = "%02d" % ((run_number // 10000) % 100)
    hash3 = "%02d" % ((run_number // 100) % 100)
    hash4 = "%02d" % (run_number % 100)

    run_type = get_metadata_field(metadata, 'core.run_type')
    data_tier = get_metadata_field(metadata, 'core.data_tier')
    file_type = get_metadata_field(metadata, 'core.file_type')
    data_stream = get_metadata_field(metadata, 'core.data_stream')
    data_campaign = get_metadata_field(metadata, 'DUNE.campaign')
    filename = name

    # NOTE(review): the path deliberately has no leading '/' -- confirm callers
    # prepend the protocol prefix before use.
    pfn = 'pnfs/dune/tape_backed/dunepro/' + run_type + '/' + data_tier + '/' + year + '/' + file_type + '/' + data_stream + '/' + data_campaign + '/' + hash1 + '/' + hash2 + '/' + hash3 + '/' + hash4 + '/' + filename

    # store the PFN in Rucio metadata for next time
    if getattr(rsemanager, 'CLIENT_MODE', None):
        didclient.set_metadata(internal_scope, name, md_key, pfn)
    if getattr(rsemanager, 'SERVER_MODE', None):
        from rucio.core.did import set_metadata
        set_metadata(internal_scope, name, md_key, pfn)
    return pfn