def test_reaper(vo, caches_mock, file_config_mock):
    """ REAPER (DAEMON): Test the reaper daemon."""
    [cache_region] = caches_mock
    scope = InternalScope('data13_hip', vo=vo)

    nb_files = 250
    file_size = 200  # plain 200, not 2G
    rse_name, rse_id, dids = __add_test_rse_and_replicas(
        vo=vo,
        scope=scope,
        rse_name=rse_name_generator(),
        names=['lfn' + generate_uuid() for _ in range(nb_files)],
        file_size=file_size,
    )
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=50 * file_size)

    def count_replicas():
        # Number of entries list_replicas yields for the test DIDs on the test RSE.
        return len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name)))

    def run_reaper():
        reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)

    assert count_replicas() == nb_files

    # With a huge amount of free space reported, a reaper pass must leave every replica alone.
    cache_region.invalidate()
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=323000000000)
    run_reaper()
    assert count_replicas() == nb_files

    # Report almost no free space; after two passes 200 replicas are expected to remain.
    cache_region.invalidate()
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=1)
    for _ in range(2):
        run_reaper()
    assert count_replicas() == 200
def test_reaper_multi_vo(vo, second_vo, scope_factory, caches_mock, file_config_mock):
    """ REAPER (DAEMON): Test the reaper daemon with multiple vo."""
    [cache_region] = caches_mock
    new_vo = second_vo
    _, [scope_tst, scope_new] = scope_factory(vos=[vo, new_vo])

    nb_files = 250
    file_size = 200  # plain 200, not 2G

    def make_rse_with_replicas(rse_vo, rse_scope):
        # One fresh RSE holding nb_files replicas, created under the given VO and scope.
        return __add_test_rse_and_replicas(
            vo=rse_vo,
            scope=rse_scope,
            rse_name=rse_name_generator(),
            names=['lfn' + generate_uuid() for _ in range(nb_files)],
            file_size=file_size,
        )

    rse1_name, rse1_id, dids1 = make_rse_with_replicas(vo, scope_tst)
    rse2_name, rse2_id, dids2 = make_rse_with_replicas(new_vo, scope_new)
    for rse_id in (rse1_id, rse2_id):
        rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=50 * file_size)

    # Check we reap all VOs by default
    cache_region.invalidate()
    for rse_id in (rse1_id, rse2_id):
        rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=1)
    both_rses = '%s|%s' % (rse1_name, rse2_name)
    for _ in range(2):
        reaper(once=True, rses=[], include_rses=both_rses, exclude_rses=None)

    remaining_tst = list(replica_core.list_replicas(dids=dids1, rse_expression=both_rses))
    assert len(remaining_tst) == 200
    remaining_new = list(replica_core.list_replicas(dids=dids2, rse_expression=both_rses))
    assert len(remaining_new) == 200
def test_add_list_bad_replicas(self):
    """ REPLICA (CORE): Add bad replicas and list them"""
    tmp_scope = 'mock'
    nbfiles = 5

    def register_and_declare_bad(rse, files):
        # Add the files to `rse`, declare every listed PFN bad and check that
        # each of them shows up in the bad-replica listing for that RSE.
        rse_id = rsemgr.get_rse_info(rse)['id']
        add_replicas(rse=rse, files=files, account='root', ignore_availability=True)

        file_dids = [{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files]
        replicas = []
        list_rep = []
        for replica in list_replicas(dids=file_dids, schemes=['srm']):
            replicas.extend(replica['rses'][rse])
            list_rep.append(replica)

        r = declare_bad_file_replicas(replicas, 'This is a good reason', 'root')
        assert_equal(r, {})

        bad_replicas = list_bad_replicas()
        nbbadrep = sum(
            1
            for rep in list_rep
            for badrep in bad_replicas
            if badrep['rse_id'] == rse_id and badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']
        )
        assert_equal(len(replicas), nbbadrep)

    # Deterministic RSE: no explicit PFN is supplied.
    files = [{'scope': tmp_scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'meta': {'events': 10}} for i in range(nbfiles)]
    register_and_declare_bad('MOCK', files)

    # Non-deterministic RSE: every file carries its own PFN.
    files = [{'scope': tmp_scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()),
              'meta': {'events': 10}} for i in range(nbfiles)]
    register_and_declare_bad('MOCK2', files)

    # A PFN that was never registered must be reported back as unknown.
    files = ['srm://mock2.com/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()), ]
    r = declare_bad_file_replicas(files, 'This is a good reason', 'root')
    output = ['%s Unknown replica' % rep for rep in files]
    assert_equal(r, {'MOCK2': output})
def test_add_list_bad_replicas(rse_factory, mock_scope, root_account):
    """ REPLICA (CORE): Add bad replicas and list them"""
    nbfiles = 5

    def register_and_declare_bad(rse_id, files):
        # Add the files to the RSE, declare every listed PFN bad and check that
        # each of them shows up in the bad-replica listing for that RSE.
        add_replicas(rse_id=rse_id, files=files, account=root_account, ignore_availability=True)

        file_dids = [{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files]
        replicas = []
        list_rep = []
        for replica in list_replicas(dids=file_dids, schemes=['srm']):
            replicas.extend(replica['rses'][rse_id])
            list_rep.append(replica)

        r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account)
        assert r == {}

        bad_replicas = list_bad_replicas()
        nbbadrep = sum(
            1
            for rep in list_rep
            for badrep in bad_replicas
            if badrep['rse_id'] == rse_id and badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']
        )
        assert len(replicas) == nbbadrep

    # Deterministic RSE: no explicit PFN is supplied.
    _, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    files = [{'scope': mock_scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'meta': {'events': 10}} for _ in range(nbfiles)]
    register_and_declare_bad(rse1_id, files)

    # Non-deterministic RSE: every file carries its own PFN.
    _, rse2_id = rse_factory.make_srm_rse(deterministic=False)
    files = [{'scope': mock_scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'pfn': 'srm://%s.cern.ch/srm/managerv2?SFN=/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()),
              'meta': {'events': 10}} for _ in range(nbfiles)]
    register_and_declare_bad(rse2_id, files)

    # A PFN that was never registered must be reported back as unknown.
    files = ['srm://%s.cern.ch/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()), ]
    r = declare_bad_file_replicas(files, 'This is a good reason', root_account)
    output = ['%s Unknown replica' % rep for rep in files]
    assert r == {rse2_id: output}
def test_get_bad_replicas_backlog(rse_factory, mock_scope, root_account, file_config_mock):
    """ REPLICA (CORE): Check the behaviour of the necromancer in case of backlog on an RSE"""

    def new_files(count):
        return [{'scope': mock_scope,
                 'name': 'file_%s' % generate_uuid(),
                 'bytes': 1,
                 'adler32': '0cc737eb',
                 'meta': {'events': 10}} for _ in range(count)]

    def as_file_dids(files):
        return [{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files]

    # Start with one necromancer pass using a large bulk.
    necromancer_run(threads=1, bulk=10000, once=True)

    nbfiles1 = 100
    nbfiles2 = 20

    # Two deterministic RSEs
    rse1, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    _, rse2_id = rse_factory.make_srm_rse(deterministic=True)

    # Bad replicas on the first RSE
    files = new_files(nbfiles1)
    add_replicas(rse_id=rse1_id, files=files, account=root_account, ignore_availability=True)
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=as_file_dids(files), schemes=['srm']):
        replicas.extend(replica['rses'][rse1_id])
        list_rep.append({'scope': replica['scope'], 'name': replica['name'], 'rse': rse1, 'rse_id': rse1_id})
    res = declare_bad_file_replicas(replicas, 'This is a good reason', root_account)
    assert res == {}

    result = get_bad_replicas_backlog(force_refresh=True)
    assert rse1_id in result
    assert result[rse1_id] == nbfiles1

    # More bad replicas, this time on the second RSE
    files = new_files(nbfiles2)
    add_replicas(rse_id=rse2_id, files=files, account=root_account, ignore_availability=True)
    repl = []
    for replica in list_replicas(dids=as_file_dids(files), schemes=['srm']):
        repl.extend(replica['rses'][rse2_id])
    res = declare_bad_file_replicas(repl, 'This is a good reason', root_account)
    assert res == {}

    # The listing restricted to the first RSE holds exactly what was declared there
    bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}])
    assert len(bad_replicas) == nbfiles1
    assert all(rep in list_rep for rep in bad_replicas)

    # Run necromancer once, all the files on RSE2 should be gone, 80 files should stay on RSE1
    get_bad_replicas_backlog(force_refresh=True)
    necromancer_run(threads=1, bulk=20, once=True)

    bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}, {'id': rse2_id}])
    assert len(bad_replicas) == 80
    assert all(rep['rse_id'] == rse1_id for rep in bad_replicas)
def test_archive_of_deleted_dids(vo, did_factory, root_account, core_config_mock, caches_mock, file_config_mock):
    """ REAPER (DAEMON): Test that the options to keep the did and content history work."""
    [reaper_cache_region, _config_cache_region, _replica_cache_region] = caches_mock
    scope = InternalScope('data13_hip', vo=vo)
    account = root_account

    nb_files = 10
    file_size = 200  # plain 200, not 2G
    rse_name, rse_id, dids = __add_test_rse_and_replicas(
        vo=vo,
        scope=scope,
        rse_name=rse_name_generator(),
        names=['lfn' + generate_uuid() for _ in range(nb_files)],
        file_size=file_size,
        epoch_tombstone=True,
    )
    dataset = did_factory.make_dataset()
    did_core.attach_dids(dids=dids, account=account, **dataset)
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=50 * file_size)

    def count_replicas():
        return len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name)))

    assert count_replicas() == nb_files

    # Plenty of free space is reported, yet the greedy run is expected to remove every replica.
    reaper_cache_region.invalidate()
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=323000000000)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, greedy=True)
    assert count_replicas() == 0

    # Every file must have a row in the deleted-DID table.
    deleted = models.DeletedDataIdentifier
    file_clause = [and_(deleted.scope == did['scope'], deleted.name == did['name']) for did in dids]
    session = get_session()
    query = session.query(deleted.scope, deleted.name, deleted.did_type).filter(or_(*file_clause))
    deleted_dids = query.all()
    for did in deleted_dids:
        print(did)
    assert len(deleted_dids) == len(dids)

    # Every file must also be recorded in the dataset's association history.
    history = models.DataIdentifierAssociationHistory
    query = session.query(history.child_scope, history.child_name, history.child_type).filter(
        and_(history.scope == dataset['scope'], history.name == dataset['name'])
    )
    deleted_dids = query.all()
    for did in deleted_dids:
        print(did)
    assert len(deleted_dids) == len(dids)
def necromancer(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param worker_number: The number of the worker (thread).
    :param total_workers: The total number of workers (threads).
    :param chunk_size: The chunk of the size to process; passed as ``limit`` to list_bad_replicas.
    :param once: To run only once (not referenced in the visible part of the loop).
    """
    sleep_time = 60
    while not graceful_stop.is_set():
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=chunk_size, worker_number=worker_number, total_workers=total_workers)
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info('Thread [%i/%i] : Working on %s:%s on %s' % (worker_number, total_workers, scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                # Lost when no RSE is listed for the file, or the only one listed is the RSE holding the bad copy.
                # NOTE(review): comparing keys() with a list only works where keys() returns a list (Python 2).
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info('Thread [%i/%i] : File %s:%s has no other replicas, it will be marked as lost' % (worker_number, total_workers, scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException, e:
                        # A database failure is only logged; the loop carries on with the next replica.
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
                else:
                    logging.info('Thread [%i/%i] : File %s:%s can be recovered. Available sources : %s' % (worker_number, total_workers, scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException, e:
                        # Same policy as above: log and move on.
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
            logging.info('Thread [%i/%i] : It took %s seconds to process %s replicas' % (worker_number, total_workers, str(time.time() - stime), str(len(replicas))))
            # NOTE(review): the excerpt ends here; the handler closing the outer try and the rest of the loop are not shown.
def list_replicas(dids, schemes=None, unavailable=False, request_id=None,
                  ignore_availability=True, all_states=False, rse_expression=None):
    """
    List file replicas for a list of data identifiers.

    The DIDs are validated against the 'r_dids' schema, then every argument
    is forwarded unchanged to replica.list_replicas.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param ignore_availability: Forwarded as-is to the core call.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param rse_expression: The RSE expression to restrict replicas on a set of RSEs.
    :returns: The result of replica.list_replicas.
    """
    validate_schema(name='r_dids', obj=dids)

    forwarded = {
        'schemes': schemes,
        'unavailable': unavailable,
        'request_id': request_id,
        'ignore_availability': ignore_availability,
        'all_states': all_states,
        'rse_expression': rse_expression,
    }
    return replica.list_replicas(dids=dids, **forwarded)
def test_add_list_replicas(self):
    """ REPLICA (CORE): Add and list file replicas """
    tmp_scope = 'mock'
    nbfiles = 13

    files = []
    for _ in range(nbfiles):
        files.append({'scope': tmp_scope,
                      'name': 'file_%s' % generate_uuid(),
                      'bytes': 1,
                      'adler32': '0cc737eb',
                      'meta': {'events': 10}})

    # Register the same files on both RSEs.
    for rse in ['MOCK', 'MOCK3']:
        add_replicas(rse=rse, files=files, account='root', ignore_availability=True)

    # One listing entry is expected per file, not per replica.
    file_dids = [{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files]
    replica_cpt = sum(1 for _ in list_replicas(dids=file_dids, schemes=['srm']))

    assert_equal(nbfiles, replica_cpt)
def list_replicas(dids, schemes=None, unavailable=False, request_id=None,
                  ignore_availability=True, all_states=False, rse_expression=None,
                  client_location=None, domain=None):
    """
    List file replicas for a list of data identifiers.

    The DIDs are validated against the 'r_dids' schema, then every argument
    is forwarded unchanged to replica.list_replicas.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param ignore_availability: Forwarded as-is to the core call.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param rse_expression: The RSE expression to restrict replicas on a set of RSEs.
    :param client_location: Client location dictionary for PFN modification {'ip', 'fqdn', 'site'}
    :param domain: The network domain for the call, either None, 'wan' or 'lan'. Compatibility fallback: None falls back to 'wan'.
    :returns: The result of replica.list_replicas.
    """
    validate_schema(name='r_dids', obj=dids)

    found = replica.list_replicas(
        dids=dids,
        schemes=schemes,
        unavailable=unavailable,
        request_id=request_id,
        ignore_availability=ignore_availability,
        all_states=all_states,
        rse_expression=rse_expression,
        client_location=client_location,
        domain=domain,
    )
    return found
def cleanup(self, session=None):
    """
    Remove what is attached to the DIDs recorded in self.created_dids:
    transfer sources and requests, replication rules, then the replicas.

    :param session: The database session used for every query and core call.
    """
    if not self.created_dids:
        return

    def created_dids_filter(model):
        # OR over all created DIDs of (scope, name) equality on the given model.
        return or_(and_(model.scope == did['scope'],
                        model.name == did['name'])
                   for did in self.created_dids)

    # Cleanup Transfers
    for transfer_model in (models.Source, models.Request):
        session.query(transfer_model).filter(created_dids_filter(transfer_model)).delete(synchronize_session=False)

    # Cleanup Locks Rules
    rule_query = session.query(models.ReplicationRule.id).filter(created_dids_filter(models.ReplicationRule))
    for row in rule_query:
        rule_core.delete_rule(row[0], session=session)

    # Cleanup Replicas and Parent Datasets
    dids_by_rse = {}
    for replica in list(replica_core.list_replicas(self.created_dids, all_states=True, session=session)):
        did = {'scope': replica['scope'], 'name': replica['name']}
        for rse_id in replica['rses']:
            if rse_id not in dids_by_rse:
                dids_by_rse[rse_id] = []
            # A separate dict per RSE, as each entry is handed to delete_replicas independently.
            dids_by_rse[rse_id].append(dict(did))
    for rse_id in dids_by_rse:
        replica_core.delete_replicas(rse_id=rse_id, files=dids_by_rse[rse_id], session=session)
def test_update_replicas_paths(self):
    """ REPLICA (CORE): Force update the replica path """
    tmp_scope = 'mock'
    nbfiles = 5
    rse_info = rsemgr.get_rse_info('MOCK')

    files = []
    for _ in range(nbfiles):
        files.append({
            'scope': tmp_scope,
            'name': 'file_%s' % generate_uuid(),
            'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/does/not/really/matter/where',
            'bytes': 1,
            'adler32': '0cc737eb',
            'meta': {'events': 10},
            'rse_id': rse_info['id'],
            'path': '/does/not/really/matter/where',
        })

    add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)
    update_replicas_paths(files)

    file_dids = [{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files]
    for replica in list_replicas(dids=file_dids, schemes=['srm']):
        # The expected PFN is spelled out literally: reading it back from the DB would test nothing.
        first_pfn = replica['rses']['MOCK2'][0]
        assert_equal(first_pfn, 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/does/not/really/matter/where')
def list_replicas(dids, schemes=None, unavailable=False, request_id=None,
                  ignore_availability=True, all_states=False, rse_expression=None,
                  client_location=None, domain=None, lifetime=None, issuer=None):
    """
    List file replicas for a list of data identifiers.

    The DIDs are validated against the 'r_dids' schema. URL signing is
    requested from the core call only when the issuer holds the
    'get_signed_urls' permission.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param ignore_availability: Forwarded as-is to the core call.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param rse_expression: The RSE expression to restrict replicas on a set of RSEs.
    :param client_location: Client location dictionary for PFN modification {'ip', 'fqdn', 'site'}
    :param domain: The network domain for the call, either None, 'wan' or 'lan'. Compatibility fallback: None falls back to 'wan'.
    :param lifetime: If supported, in seconds, restrict the lifetime of the replica PFN.
    :param issuer: The issuer account.
    :returns: The result of replica.list_replicas.
    """
    validate_schema(name='r_dids', obj=dids)

    # Allow selected authenticated users to retrieve signed URLs.
    # Unauthenticated users, or permission-less users will get the raw URL without the signature.
    sign_urls = bool(permission.has_permission(issuer=issuer, action='get_signed_urls', kwargs={}))

    found = replica.list_replicas(
        dids=dids,
        schemes=schemes,
        unavailable=unavailable,
        request_id=request_id,
        ignore_availability=ignore_availability,
        all_states=all_states,
        rse_expression=rse_expression,
        client_location=client_location,
        domain=domain,
        sign_urls=sign_urls,
        lifetime=lifetime,
    )
    return found
def list_replicas(dids, schemes=None, unavailable=False, request_id=None,
                  ignore_availability=True, all_states=False):
    """
    List file replicas for a list of data identifiers.

    The DIDs are validated against the 'r_dids' schema, then every argument
    is forwarded unchanged to replica.list_replicas.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param ignore_availability: Forwarded as-is to the core call.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :returns: The result of replica.list_replicas.
    """
    validate_schema(name='r_dids', obj=dids)

    forwarded = {
        'schemes': schemes,
        'unavailable': unavailable,
        'request_id': request_id,
        'ignore_availability': ignore_availability,
        'all_states': all_states,
    }
    return replica.list_replicas(dids=dids, **forwarded)
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup (the visible loop uses the heartbeat's assigned thread instead).
    :param bulk: The number of requests to process; passed as ``limit`` to list_bad_replicas.
    :param once: Run only once (not referenced in the visible part of the loop).
    """
    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    # Identity under which this worker registers with the heartbeat service.
    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        # Refresh the heartbeat on every pass; it yields this worker's slot and the worker count.
        hb = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])

        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=bulk, thread=hb['assign_thread'], total_threads=hb['nr_threads'])

            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))

                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                # Lost when no RSE is listed for the file, or the only one listed is the RSE holding the bad copy.
                # NOTE(review): comparing keys() with a list only works where keys() returns a list (Python 2).
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info(prepend_str + 'File %s:%s has no other replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException, error:
                        # A database failure is only logged; the loop carries on with the next replica.
                        logging.info(prepend_str + '%s' % (str(error)))

                else:
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s' % (scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException, error:
                        # Same policy as above: log and move on.
                        logging.info(prepend_str + '%s' % (str(error)))

            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
            # NOTE(review): the excerpt ends here; the handler closing the outer try and the rest of the loop are not shown.
def get_source_rse(scope, name, src_url):
    """
    Find the RSE whose replica of a file has exactly the given source URL.

    Replicas of the file are listed for the URL's scheme only (unavailable
    ones included) and their PFNs are compared with ``src_url``.

    :param scope: The scope of the file.
    :param name: The name of the file.
    :param src_url: The source URL (PFN) to look up.
    :returns: The matching key of the replica's 'rses' mapping, or None when
              nothing matches or the lookup fails.
    """
    try:
        scheme = src_url.split(":")[0]
        replications = replica.list_replicas([{'scope': scope, 'name': name, 'type': DIDType.FILE}],
                                             schemes=[scheme], unavailable=True)
        for source in replications:
            for source_rse, pfns in source['rses'].items():
                if any(pfn == src_url for pfn in pfns):
                    return source_rse
        # cannot find matched surl
        logging.warning('Cannot get correct RSE for source url: %s', src_url)
        return None
    except Exception:
        # Deliberately best-effort: any ordinary failure is logged and reported as "not found".
        # KeyboardInterrupt and SystemExit are no longer swallowed here.
        logging.error('Cannot get correct RSE for source url: %s(%s)', src_url, sys.exc_info()[1])
        return None
def test_reaper():
    """ REAPER2 (DAEMON): Test the reaper2 daemon."""
    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    vo = {'vo': config_get('client', 'vo', raise_exception=False, default='tst')} if multi_vo else {}

    rse_name = rse_name_generator()
    rse_id = rse_core.add_rse(rse_name, **vo)

    all_operations = {'read': 1, 'write': 1, 'delete': 1}
    mock_protocol = {'scheme': 'MOCK',
                     'hostname': 'localhost',
                     'port': 123,
                     'prefix': '/test/reaper',
                     'impl': 'rucio.rse.protocols.mock.Default',
                     'domains': {'lan': dict(all_operations),
                                 'wan': dict(all_operations)}}
    rse_core.add_protocol(rse_id=rse_id, parameter=mock_protocol)

    nb_files = 30
    file_size = 2147483648  # 2G
    file_names = []
    for _ in range(nb_files):
        file_name = 'lfn' + generate_uuid()
        file_names.append(file_name)
        # Every replica gets a tombstone one day in the past.
        replica_core.add_replica(rse_id=rse_id, scope=InternalScope('data13_hip', **vo),
                                 name=file_name, bytes=file_size,
                                 tombstone=datetime.utcnow() - timedelta(days=1),
                                 account=InternalAccount('root', **vo), adler32=None, md5=None)

    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=800)
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=10737418240)
    rse_core.set_rse_limits(rse_id=rse_id, name='MaxBeingDeletedFiles', value=10)

    # In multi-VO mode the RSE expression is additionally restricted to the VO.
    if vo:
        rse_expression = 'vo=%s&(%s)' % (vo['vo'], rse_name)
    else:
        rse_expression = rse_name
    for _ in range(2):
        reaper(once=True, rses=[], include_rses=rse_expression, exclude_rses=[])

    file_dids = [{'scope': InternalScope('data13_hip', **vo), 'name': n} for n in file_names]
    remaining = list(replica_core.list_replicas(dids=file_dids, rse_expression=rse_name))
    assert len(remaining) == nb_files - 5
def test_get_did_from_pfns_nondeterministic(self):
    """ REPLICA (CLIENT): Get list of DIDs associated to PFNs for non-deterministic sites"""
    rse = 'MOCK2'
    tmp_scope = 'mock'
    nbfiles = 3
    pfns = []
    rse_info = rsemgr.get_rse_info(rse)
    assert_equal(rse_info['deterministic'], False)
    files = [{'scope': tmp_scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()),
              'meta': {'events': 10}} for _ in range(nbfiles)]

    # PFN -> DID expected back from the client (named so it does not shadow the `input` builtin).
    expected_dids = {}
    for f in files:
        expected_dids[f['pfn']] = {'scope': f['scope'], 'name': f['name']}

    add_replicas(rse=rse, files=files, account='root', ignore_availability=True)

    file_dids = [{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files]
    for replica in list_replicas(dids=file_dids, schemes=['srm'], ignore_availability=True):
        # Iterate the PFN lists directly so `rse` keeps naming the RSE queried below.
        for replica_pfns in replica['rses'].values():
            pfns.extend(replica_pfns)

    for result in self.replica_client.get_did_from_pfns(pfns, rse):
        # Take the first (pfn, did) pair; indexing keys()/values() fails on Python 3 dict views.
        pfn, did = next(iter(result.items()))
        assert_equal(expected_dids[pfn], did)
def test_list_replicas_all_states(self):
    """ REPLICA (CORE): list file replicas with all_states"""
    tmp_scope = 'mock'
    nbfiles = 13
    files = []
    for _ in range(nbfiles):
        files.append({'scope': tmp_scope,
                      'name': 'file_%s' % generate_uuid(),
                      'bytes': 1,
                      'adler32': '0cc737eb',
                      'meta': {'events': 10}})

    for rse in ['MOCK', 'MOCK3']:
        add_replicas(rse=rse, files=files, account='root', ignore_availability=True)

    # Only the MOCK copies are switched to COPYING; MOCK3 is left untouched.
    for entry in files:
        update_replica_state('MOCK', tmp_scope, entry['name'], ReplicaState.COPYING)

    file_dids = [{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files]
    replica_cpt = 0
    for replica in list_replicas(dids=file_dids, schemes=['srm'], all_states=True):
        assert_in('states', replica)
        states = replica['states']
        assert_equal(states['MOCK'], str(ReplicaState.COPYING))
        assert_equal(states['MOCK3'], str(ReplicaState.AVAILABLE))
        replica_cpt = replica_cpt + 1

    assert_equal(nbfiles, replica_cpt)
def setUp(self):
    """
    Upload five files to MOCK_SUSPICIOUS and MOCK_RECOVERY, then put their
    replicas into the states the tests start from and verify that setup.
    """
    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    self.vo = {'vo': get_vo()} if multi_vo else {}

    self.replica_client = ReplicaClient()

    # Using two test RSEs
    self.rse4suspicious = 'MOCK_SUSPICIOUS'
    self.rse4suspicious_id = get_rse_id(self.rse4suspicious, **self.vo)
    self.rse4recovery = 'MOCK_RECOVERY'
    self.rse4recovery_id = get_rse_id(self.rse4recovery, **self.vo)
    self.scope = 'mock'
    self.internal_scope = InternalScope(self.scope, **self.vo)

    # For testing, we create 5 files and upload them to Rucio to two test RSEs.
    self.tmp_file1 = file_generator()
    self.tmp_file2 = file_generator()
    self.tmp_file3 = file_generator()
    self.tmp_file4 = file_generator()
    self.tmp_file5 = file_generator()
    tmp_files = [self.tmp_file1, self.tmp_file2, self.tmp_file3, self.tmp_file4, self.tmp_file5]
    name1, name2, name3, name4, name5 = [path.basename(f) for f in tmp_files]
    all_names = [name1, name2, name3, name4, name5]

    self.listdids = [{'scope': self.internal_scope, 'name': n, 'type': DIDType.FILE} for n in all_names]

    for rse in [self.rse4suspicious, self.rse4recovery]:
        cmd = 'rucio -v upload --rse {0} --scope {1} {2} {3} {4} {5} {6}'.format(rse, self.scope, *tmp_files)
        exitcode, out, err = execute(cmd)

        # checking if Rucio upload went OK
        assert exitcode == 0

    # Set fictional datatypes
    set_metadata(self.internal_scope, name4, 'datatype', 'testtypedeclarebad')
    set_metadata(self.internal_scope, name5, 'datatype', 'testtypenopolicy')

    # Allow for the RSEs to be affected by the suspicious file recovery daemon
    add_rse_attribute(self.rse4suspicious_id, "enable_suspicious_file_recovery", True)
    add_rse_attribute(self.rse4recovery_id, "enable_suspicious_file_recovery", True)

    # removing physical files from /tmp location - keeping only their DB info
    for tmp_file in tmp_files:
        remove(tmp_file)

    # Gather replica info
    replicalist = list_replicas(dids=self.listdids)

    # Changing the replica statuses as follows:
    # ----------------------------------------------------------------------------------------------------------------------------------
    # Name         State(s) declared on MOCK_RECOVERY       State(s) declared on MOCK_SUSPICIOUS        Metadata "datatype"
    # ----------------------------------------------------------------------------------------------------------------------------------
    # tmp_file1    available                                suspicious (available)
    # tmp_file2    available                                suspicious + bad (unavailable)
    # tmp_file3    unavailable                              suspicious (available)                      RAW
    # tmp_file4    unavailable                              suspicious (available)                      testtypedeclarebad
    # tmp_file5    unavailable                              suspicious (available)                      testtypenopolicy
    # ----------------------------------------------------------------------------------------------------------------------------------
    for replica in replicalist:
        suspicious_pfns = replica['rses'][self.rse4suspicious_id]
        # Three suspicious declarations per replica, one second apart.
        for _ in range(3):
            print("Declaring suspicious file replica: " + suspicious_pfns[0])
            self.replica_client.declare_suspicious_file_replicas([suspicious_pfns[0], ], 'This is a good reason.')
            sleep(1)
        if replica['name'] == name2:
            print("Declaring bad file replica: " + suspicious_pfns[0])
            self.replica_client.declare_bad_file_replicas([suspicious_pfns[0], ], 'This is a good reason')
        for unavailable_name in (name3, name4, name5):
            if replica['name'] == unavailable_name:
                print("Updating replica state as unavailable: " + replica['rses'][self.rse4recovery_id][0])
                update_replica_state(self.rse4recovery_id, self.internal_scope, unavailable_name, ReplicaState.UNAVAILABLE)

    # Gather replica info after setting initial replica statuses
    replicalist = list_replicas(dids=self.listdids)

    # Checking if the status changes were effective.
    # Per file: expected state on MOCK_SUSPICIOUS, then on MOCK_RECOVERY; None means "RSE absent from states".
    expected_states = [
        (name1, 'AVAILABLE', 'AVAILABLE'),
        (name2, None, 'AVAILABLE'),
        (name3, 'AVAILABLE', None),
        (name4, 'AVAILABLE', None),
        (name5, 'AVAILABLE', None),
    ]
    for replica in replicalist:
        for file_name, suspicious_state, recovery_state in expected_states:
            if replica['name'] != file_name:
                continue
            for rse_id, state in ((self.rse4suspicious_id, suspicious_state), (self.rse4recovery_id, recovery_state)):
                if state is None:
                    assert (rse_id in replica['states']) is False
                else:
                    assert replica['states'][rse_id] == state

    # Checking if only self.tmp_file2 is declared as 'BAD' (and only on MOCK_SUSPICIOUS)
    self.from_date = datetime.now() - timedelta(days=1)
    for rse_id, names_expected_bad in ((self.rse4suspicious_id, [name2]), (self.rse4recovery_id, [])):
        bad_replicas_list = list_bad_replicas_status(rse_id=rse_id, younger_than=self.from_date, **self.vo)
        bad_checklist = [(badf['name'], badf['rse_id'], badf['state']) for badf in bad_replicas_list]
        for file_name in all_names:
            entry = (file_name, rse_id, BadFilesStatus.BAD)
            if file_name in names_expected_bad:
                assert entry in bad_checklist
            else:
                assert entry not in bad_checklist
def process_output(output, sanity_check=True, compress=True):
    """Perform post-consistency-check actions.

    DARK files are put in the quarantined-replica table so that they
    may be deleted by the Dark Reaper.  LOST files are reported as
    suspicious so that they may be further checked by the cloud squads.

    ``output`` should be an ``str`` with the absolute path to the file
    produced by ``consistency()``.  It must maintain its naming
    convention: the RSE name is taken from the file's base name, up to
    (but excluding) the last underscore of the path.  Every line of the
    file must look like ``<label>,<path>`` where the label is either
    ``DARK`` or ``LOST``.

    If ``sanity_check`` is ``True`` (default) and the number of entries
    in the output file is deemed excessive, the actions are aborted
    with an ``AssertionError``.

    If ``compress`` is ``True`` (default), the file is compressed with
    bzip2 after the actions are successfully performed.

    Any error raised while reading or parsing the file is logged as
    critical and re-raised unchanged.
    """
    logger = logging.getLogger('auditor-worker')
    dark_replicas = []
    lost_replicas = []
    try:
        with open(output) as f:
            for line in f:
                label, path = line.rstrip().split(',', 1)
                scope, name = guess_replica_info(path)
                if label == 'DARK':
                    dark_replicas.append({'path': path,
                                          'scope': InternalScope(scope),
                                          'name': name})
                elif label == 'LOST':
                    lost_replicas.append({'scope': InternalScope(scope),
                                          'name': name})
                else:
                    raise ValueError('unexpected label')
    except Exception:
        # Since the file is read immediately after its creation, any error
        # exposes a bug in the Auditor.
        logger.critical('Error processing "%s"', output, exc_info=True)
        raise

    rse = os.path.basename(output[:output.rfind('_')])
    rse_id = get_rse_id(rse=rse)
    usage = get_rse_usage(rse_id=rse_id, source='rucio')[0]
    # A value read from the configuration file is typically a string, while the
    # fallback is a float; convert explicitly so the product below is numeric.
    threshold = float(config.config_get('auditor', 'threshold', False, 0.2))

    # Perform a basic sanity check by comparing the number of entries
    # with the total number of files on the RSE.  If the percentage is
    # significant, there is most likely an issue with the site dump.
    max_entries = threshold * usage['files']
    found_error = False
    if len(dark_replicas) > max_entries:
        logger.warning('Number of DARK files is exceeding threshold: "%s"', output)
        found_error = True
    if len(lost_replicas) > max_entries:
        logger.warning('Number of LOST files is exceeding threshold: "%s"', output)
        found_error = True
    if found_error and sanity_check:
        raise AssertionError('sanity check failed')

    # While converting LOST replicas to PFNs, entries that do not
    # correspond to a replica registered in Rucio are silently dropped.
    lost_pfns = [r['rses'][rse_id][0]
                 for r in list_replicas(lost_replicas)
                 if rse_id in r['rses']]

    add_quarantined_replicas(rse_id=rse_id, replicas=dark_replicas)
    logger.debug('Processed %d DARK files from "%s"', len(dark_replicas), output)
    declare_bad_file_replicas(lost_pfns,
                              reason='Reported by Auditor',
                              issuer=InternalAccount('root'),
                              status=BadFilesStatus.SUSPICIOUS)
    logger.debug('Processed %d LOST files from "%s"', len(lost_replicas), output)

    if compress:
        destination = bz2_compress_file(output)
        logger.debug('Compressed "%s"', destination)
def test_replica_recoverer(self):
    """ REPLICA RECOVERER: Testing declaration of suspicious replicas as bad if they are found available on other RSEs.

        The setUp function is expected to have run first. It uploads the test files to
        two test RSEs ('MOCK_RECOVERY', 'MOCK_SUSPICIOUS') and prepares their states:

        # ------------------------------------------------------------------------------------------------
        # Name         State on MOCK_RECOVERY   State on MOCK_SUSPICIOUS          Metadata "datatype"
        # ------------------------------------------------------------------------------------------------
        # tmp_file1    available                suspicious (available)
        # tmp_file2    available                suspicious + bad (unavailable)
        # tmp_file3    unavailable              suspicious (available)            RAW
        # tmp_file4    unavailable              suspicious (available)            fictional type, policy "declare bad"
        # tmp_file5    unavailable              suspicious (available)            fictional type, no policy
        # ------------------------------------------------------------------------------------------------

        Explanation:
        Suspicious replicas that are the last remaining copy (unavailable on MOCK_RECOVERY) are
        handled differently depending on their metadata "datatype". RAW files have the policy to
        be ignored, tmp_file4 has a datatype whose policy is to declare the replica bad, and
        tmp_file5 has a datatype without any policy, meaning it should be ignored by default.

        Runs the Test:
        - running suspicious_replica_recoverer

        Concluding:
        - checks that tmp_file1 and tmp_file4 were declared as 'BAD' on 'MOCK_SUSPICIOUS'
    """
    # Run replica recoverer once
    try:
        run(once=True, younger_than=1, nattempts=2, limit_suspicious_files_on_rse=5)
    except KeyboardInterrupt:
        stop()

    # Expected outcome: two changes, tmp_file1 and tmp_file4 are now bad on MOCK_SUSPICIOUS.
    # ------------------------------------------------------------------------------------------------
    # Name         State on MOCK_RECOVERY   State on MOCK_SUSPICIOUS
    # ------------------------------------------------------------------------------------------------
    # tmp_file1    available                suspicious + bad (unavailable)
    # tmp_file2    available                suspicious + bad (unavailable)
    # tmp_file3    unavailable              suspicious (available)
    # tmp_file4    unavailable              suspicious + bad (unavailable)
    # tmp_file5    unavailable              suspicious (available)
    # ------------------------------------------------------------------------------------------------
    suspicious_rse = self.rse4suspicious_id
    recovery_rse = self.rse4recovery_id
    file1 = path.basename(self.tmp_file1)
    file2 = path.basename(self.tmp_file2)
    file3 = path.basename(self.tmp_file3)
    file4 = path.basename(self.tmp_file4)
    file5 = path.basename(self.tmp_file5)

    # Gather replica info after replica_recoverer has run.
    for replica in list_replicas(dids=self.listdids):
        name = replica['name']
        if name in (file1, file2):
            assert suspicious_rse not in replica['states']
            assert replica['states'][recovery_rse] == 'AVAILABLE'
        if name == file3:
            assert replica['states'][suspicious_rse] == 'AVAILABLE'
            assert recovery_rse not in replica['states']
        if name == file4:
            # The key 'states' only exists if the replica is available on at least one RSE.
            # It shouldn't exist for tmp_file4.
            assert 'states' not in replica
        if name == file5:
            assert replica['states'][suspicious_rse] == 'AVAILABLE'
            assert recovery_rse not in replica['states']

    # Which files must be flagged 'BAD' on MOCK_SUSPICIOUS ...
    reported = [(bad['name'], bad['rse_id'], bad['state'])
                for bad in list_bad_replicas_status(rse_id=suspicious_rse, younger_than=self.from_date, **self.vo)]
    expectations = ((file1, True), (file2, True), (file3, False), (file4, True), (file5, False))
    for name, expected_bad in expectations:
        assert ((name, suspicious_rse, BadFilesStatus.BAD) in reported) == expected_bad

    # ... whereas nothing may be flagged 'BAD' on MOCK_RECOVERY.
    reported = [(bad['name'], bad['rse_id'], bad['state'])
                for bad in list_bad_replicas_status(rse_id=recovery_rse, younger_than=self.from_date, **self.vo)]
    for name in (file1, file2, file3, file4, file5):
        assert (name, recovery_rse, BadFilesStatus.BAD) not in reported
def _load_recovery_policies(logger, policy_path="/opt/rucio/etc/suspicious_replica_recoverer.json"):
    """
    Read and validate the JSON file holding the per-datatype policies for last-copy replicas.

    :param logger: Callable taking a log level, a message and optional format arguments.
    :param policy_path: Location of the JSON policy file.
    :returns: The decoded JSON content, or None if the file cannot be opened, cannot be decoded,
              or contains an entry lacking "datatype" or "action". Every failure is logged.
    """
    try:
        json_file = open(policy_path)
    except (IOError, OSError):
        logger(logging.WARNING, "An error occured whilst trying to open the JSON file.")
        return None

    # The context manager releases the file handle on every path.
    with json_file:
        try:
            json_data = json.load(json_file)
        except ValueError:
            logger(logging.WARNING, "No JSON object could be decoded.")
            return None

    # Checking that the json file is formatted properly.
    for i, entry in enumerate(json_data):
        if "datatype" not in entry or "action" not in entry:
            logger(logging.ERROR, 'Entry %s in the json file is incomplete (missing either "datatype" or "action").', i)
            return None
    return json_data


def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5, sleep_time=300):
    """
    Main loop to check for available replicas which are labeled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.
    Suspicious replicas that are the last remaining copy are declared bad if their name starts
    with "log." or "user"; otherwise the action is looked up by metadata "datatype" in the JSON
    policy file, and the replica is ignored when the datatype is None or no policy matches.

    The loop stops (gracefully) if more than one worker is detected or if the JSON policy file
    cannot be opened, decoded or validated.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE,
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None in multi-VO mode, all VOs returned by list_vos() are used.
                In single-VO mode the argument is ignored and 'def' is used.
    :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE
                                          is considered problematic and the suspicious replicas on that RSE
                                          are labeled as 'TEMPORARY_UNAVAILABLE'.
    :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than sleep_time, then
                       it should sleep until sleep_time is over.
    :raises VONotFound: In multi-VO mode, if one of the requested VOs does not exist.
    :returns: None
    """

    # assembling the worker name identifier ('executable')
    # in order to have the possibility to detect a start of a second instance
    executable = argv[0]

    prepend_str = 'replica_recoverer: '
    logger = formatted_logger(logging.log, prepend_str + '%s')

    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    if not multi_vo:
        if vos:
            logger(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.')
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - {v['vo'] for v in list_vos()}
            if invalid:
                msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logger(logging.INFO, 'replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join(vos)))

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logger(logging.ERROR, 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname())
                die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
                break

            prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number, total_workers)
            logger = formatted_logger(logging.log, prepend_str + '%s')

            start = time.time()

            # Without a usable policy file the last-copy replicas cannot be handled: stop the daemon
            # rather than continuing with missing, stale or malformed policies.
            json_data = _load_recovery_policies(logger)
            if json_data is None:
                break

            logger(logging.INFO, 'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.', younger_than, nattempts)

            getfileskwargs = {'younger_than': younger_than,
                              'nattempts': nattempts,
                              'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                              'is_suspicious': True}

            for vo in vos:
                logger(logging.INFO, 'Start replica recovery for VO: %s', vo)
                # Maps RSE name -> {replica name -> replica description}; rebuilt for every VO.
                recoverable_replicas = {}

                rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true') if rse['vo'] == vo], key=lambda k: k['rse'])

                logger(logging.DEBUG, "List of RSEs with enable_suspicious_file_recovery = True:")
                for i in rse_list:
                    logger(logging.DEBUG, '%s', i)

                for rse in rse_list:
                    time_start_rse = time.time()
                    rse_expr = rse['rse']
                    cnt_surl_not_found = 0
                    rse_replicas = recoverable_replicas.setdefault(rse_expr, {})
                    # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs
                    suspicious_replicas_avail_elsewhere = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["EXIST_COPIES"].value, filter_={'vo': vo}, **getfileskwargs)
                    # Get the suspicious replicas that are the last remaining copies
                    suspicious_replicas_last_copy = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["LAST_COPY"].value, filter_={'vo': vo}, **getfileskwargs)
                    nb_suspicious = len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy)

                    logger(logging.DEBUG, 'Suspicious replicas on %s:', rse_expr)
                    logger(logging.DEBUG, 'Replicas with copies on other RSEs (%s):', len(suspicious_replicas_avail_elsewhere))
                    for i in suspicious_replicas_avail_elsewhere:
                        logger(logging.DEBUG, '%s', i)
                    logger(logging.DEBUG, 'Replicas that are the last remaining copy (%s):', len(suspicious_replicas_last_copy))
                    for i in suspicious_replicas_last_copy:
                        logger(logging.DEBUG, '%s', i)

                    # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE.
                    if (rse['availability'] not in {4, 5, 6, 7}) and nb_suspicious > 0:
                        logger(logging.WARNING, "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate. \n", rse_expr, rse['availability'])
                        continue

                    for replica in suspicious_replicas_avail_elsewhere:
                        if vo != replica['scope'].vo:
                            continue
                        scope = replica['scope']
                        rep_name = replica['name']
                        rse_id = replica['rse_id']
                        surl_not_found = True
                        for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                            if rse_id in rep['rses']:
                                rse_replicas[rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': True}
                                surl_not_found = False
                        if surl_not_found:
                            cnt_surl_not_found += 1
                            logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    for replica in suspicious_replicas_last_copy:
                        if vo != replica['scope'].vo:
                            continue
                        scope = replica['scope']
                        rep_name = replica['name']
                        rse_id = replica['rse_id']
                        surl_not_found = True
                        # Should only return one rse, as there is only one replica remaining
                        for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                            rse_replicas[rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': False}
                            surl_not_found = False
                        if surl_not_found:
                            cnt_surl_not_found += 1
                            logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    logger(logging.INFO, 'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.',
                           time.time() - time_start_rse, rse_expr, nb_suspicious, nb_suspicious - cnt_surl_not_found, nb_suspicious)

                    if nb_suspicious != 0:
                        logger(logging.DEBUG, 'List of replicas on %s for which the pfns have been found:', rse_expr)
                        for i in rse_replicas:
                            logger(logging.DEBUG, '%s', i)

                logger(logging.INFO, 'All RSEs have been checked for suspicious replicas. Total time: %s seconds.', time.time() - start)
                logger(logging.INFO, 'Begin check for problematic RSEs.')
                time_start_check_probl = time.time()

                # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE.
                # The suspicious files are marked as temporarily unavailable.
                list_problematic_rses = []
                for rse_key in list(recoverable_replicas):
                    rse_replicas = recoverable_replicas[rse_key]
                    if len(rse_replicas) <= limit_suspicious_files_on_rse:
                        continue
                    list_problematic_rses.append(rse_key)
                    surls_list = [replica_value['surl'] for replica_value in rse_replicas.values()]

                    add_bad_pfns(pfns=surls_list, account=InternalAccount('root', vo=vo), state='TEMPORARY_UNAVAILABLE', expires_at=datetime.utcnow() + timedelta(days=3))

                    logger(logging.INFO, "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).", rse_key, limit_suspicious_files_on_rse)
                    logger(logging.INFO, "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:", rse_key)
                    for rse_values in rse_replicas.values():
                        logger(logging.INFO, 'Scope: %s Name: %s', rse_values['scope'], rse_values['name'])
                    # Remove the RSE from the dictionary as it has been dealt with.
                    del recoverable_replicas[rse_key]

                logger(logging.INFO, "Following RSEs were deemed problematic (total: %s)", len(list_problematic_rses))
                for rse in list_problematic_rses:
                    logger(logging.INFO, "%s", rse)

                # Label suspicious replicas as bad if they have other copies on other RSEs (that aren't also marked as suspicious).
                # If they are the last remaining copies, deal with them differently.
                for rse_key, rse_replicas in recoverable_replicas.items():
                    # Skip RSEs that don't have any suspicious replicas
                    if not rse_replicas:
                        continue
                    files_to_be_declared_bad = []
                    files_to_be_ignored = []
                    # Get the rse_id by going to one of the suspicious replicas from that RSE and reading it from there
                    rse_id = next(iter(rse_replicas.values()))['rse_id']
                    for replica_key, replica in rse_replicas.items():
                        surl = replica['surl']
                        if replica['available_elsewhere']:
                            # Replicas with other copies on at least one other RSE can safely be labeled as bad
                            files_to_be_declared_bad.append(surl)
                            continue
                        if replica['name'].startswith(("log.", "user")):
                            # Don't keep log files or user files
                            files_to_be_declared_bad.append(surl)
                            continue
                        # Deal with replicas based on their metadata.
                        file_metadata = get_metadata(replica["scope"], replica["name"])
                        if file_metadata["datatype"] is None:  # "None" type has no function "split()"
                            files_to_be_ignored.append(surl)
                            continue
                        for policy in json_data:
                            if policy["datatype"] == file_metadata["datatype"].split("_")[-1]:
                                action = policy["action"]
                                if action == "ignore":
                                    files_to_be_ignored.append(surl)
                                elif action == "declare bad":
                                    files_to_be_declared_bad.append(surl)
                                else:
                                    logger(logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)",
                                           rse_key, replica_key, surl, policy["datatype"], policy["action"])
                                break
                        else:
                            # If no policy has been set, default to ignoring the file (no action taken).
                            files_to_be_ignored.append(surl)

                    logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be ignored:', rse_key)
                    for i in files_to_be_ignored:
                        logger(logging.INFO, '%s', i)
                    logger(logging.INFO, '(%s) Remaining replica (pfns) that will be declared BAD:', rse_key)
                    for i in files_to_be_declared_bad:
                        logger(logging.INFO, '%s', i)

                    if files_to_be_declared_bad:
                        logger(logging.INFO, 'Ready to declare %s bad replica(s) on %s (RSE id: %s).', len(files_to_be_declared_bad), rse_key, str(rse_id))
                        declare_bad_file_replicas(pfns=files_to_be_declared_bad, reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), session=None)
                        logger(logging.INFO, 'Finished declaring bad replicas on %s.\n', rse_key)

                logger(logging.INFO, 'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.', time.time() - time_start_check_probl)

            time_passed = time.time() - start
            logger(logging.INFO, 'Total time: %s seconds', time_passed)
            daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        except (DatabaseException, DatabaseError) as err:
            # Connection-pool exhaustion and dropped Oracle connections are logged as warnings only;
            # any other database error is critical.
            message = str(err.args[0])
            if match('.*QueuePool.*', message) or match('.*ORA-03135.*', message):
                logger(logging.WARNING, traceback.format_exc())
            else:
                logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        except Exception as err:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    logger(logging.INFO, 'Graceful stop done.')
def test_reaper():
    """ REAPER2 (DAEMON): Test the reaper2 daemon."""
    if config_get_bool('common', 'multi_vo', raise_exception=False, default=False):
        vo = {'vo': config_get('client', 'vo', raise_exception=False, default='tst')}
        new_vo = {'vo': 'new'}
        if not vo_core.vo_exists(**new_vo):
            vo_core.add_vo(description='Test', email='*****@*****.**', **new_vo)
        if not scope_core.check_scope(InternalScope('data13_hip', **new_vo)):
            scope_core.add_scope(InternalScope('data13_hip', **new_vo), InternalAccount('root', **new_vo))
        nb_rses = 2
    else:
        vo = {}
        new_vo = {}
        nb_rses = 1

    mock_protocol = {'scheme': 'MOCK',
                     'hostname': 'localhost',
                     'port': 123,
                     'prefix': '/test/reaper',
                     'impl': 'rucio.rse.protocols.mock.Default',
                     'domains': {
                         'lan': {'read': 1, 'write': 1, 'delete': 1},
                         'wan': {'read': 1, 'write': 1, 'delete': 1}}}

    nb_files = 30
    file_size = 2147483648  # 2G

    # In multi-VO mode every RSE is created once per VO (same name, different VO).
    vo_kwargs_list = [vo, new_vo] if new_vo else [vo]

    rse_names = []
    all_file_names = []
    for _ in range(nb_rses):
        rse_name = rse_name_generator()
        rse_names.append(rse_name)

        # (rse_id, vo kwargs) pairs sharing this RSE name
        targets = []
        for vo_kwargs in vo_kwargs_list:
            rse_id = rse_core.add_rse(rse_name, **vo_kwargs)
            rse_core.add_protocol(rse_id=rse_id, parameter=mock_protocol)
            targets.append((rse_id, vo_kwargs))

        file_names = []
        for _ in range(nb_files):
            file_name = 'lfn' + generate_uuid()
            file_names.append(file_name)
            for rse_id, vo_kwargs in targets:
                # The tombstone lies in the past, so the replica is eligible for deletion.
                replica_core.add_replica(rse_id=rse_id, scope=InternalScope('data13_hip', **vo_kwargs),
                                         name=file_name, bytes=file_size,
                                         tombstone=datetime.utcnow() - timedelta(days=1),
                                         account=InternalAccount('root', **vo_kwargs), adler32=None, md5=None)
        all_file_names.append(file_names)

        for rse_id, _ in targets:
            rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=800)
            rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=10737418240)
            rse_core.set_rse_limits(rse_id=rse_id, name='MaxBeingDeletedFiles', value=10)

    def count_replicas(vo_kwargs, names, rse_expression):
        """Number of entries list_replicas returns for the given files on the given RSE expression."""
        dids = [{'scope': InternalScope('data13_hip', **vo_kwargs), 'name': n} for n in names]
        return len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_expression)))

    if not vo:
        reaper(once=True, rses=[], include_rses=rse_names[0], exclude_rses=[])
        reaper(once=True, rses=[], include_rses=rse_names[0], exclude_rses=[])
        assert count_replicas(vo, all_file_names[0], rse_names[0]) == nb_files - 5
    else:
        # Check we reap all VOs by default
        reaper(once=True, rses=[], include_rses=rse_names[0], exclude_rses=[])
        reaper(once=True, rses=[], include_rses=rse_names[0], exclude_rses=[])
        assert count_replicas(vo, all_file_names[0], rse_names[0]) == nb_files - 5
        assert count_replicas(new_vo, all_file_names[0], rse_names[0]) == nb_files - 5

        # Check we don't affect a second VO that isn't specified
        reaper(once=True, rses=[], include_rses=rse_names[1], exclude_rses=[], vos=['new'])
        reaper(once=True, rses=[], include_rses=rse_names[1], exclude_rses=[], vos=['new'])
        # These were written as "assert <len>, <expected>", which only uses the expected value as
        # the failure message; compare explicitly so the counts are really verified.
        assert count_replicas(vo, all_file_names[1], rse_names[1]) == nb_files
        assert count_replicas(new_vo, all_file_names[1], rse_names[1]) == nb_files - 5
try: parsed_rses = parse_expression(source_replica_expression, session=None) except InvalidRSEExpression, e: logging.error("Invalid RSE exception %s for request %s: %s" % (source_replica_expression, req['request_id'], e)) allowed_source_rses = [] else: allowed_source_rses = [x['rse'] for x in parsed_rses] tmpsrc = [] metadata = {} try: ts = time.time() replications = replica.list_replicas(dids=[{'scope': req['scope'], 'name': req['name'], 'type': DIDType.FILE}], schemes=[scheme, 'gsiftp']) record_timer('daemons.conveyor.submitter.list_replicas', (time.time() - ts) * 1000) # return gracefully if there are no replicas for a DID if not replications: return None, None for source in replications: metadata['filesize'] = long(source['bytes']) metadata['md5'] = source['md5'] metadata['adler32'] = source['adler32'] # TODO: Source protection # we need to know upfront if we are mixed DISK/TAPE source
def test_replica_recoverer(self):
    """ REPLICA RECOVERER: Testing declaration of suspicious replicas as bad if they are found available on other RSEs.

        The setUp function is expected to have run first. It uploads the test files to
        two test RSEs ('MOCK_RECOVERY', 'MOCK_SUSPICIOUS') and prepares their states:

        # --------------------------------------------------------------------------------------------
        # Name         State(s) declared on MOCK_RECOVERY       State(s) declared on MOCK_SUSPICIOUS
        # --------------------------------------------------------------------------------------------
        # tmp_file1    available                                suspicious (available)
        # tmp_file2    available                                suspicious + bad (unavailable)
        # tmp_file3    unavailable                              suspicious (available)
        # --------------------------------------------------------------------------------------------

        Runs the Test:
        - running suspicious_replica_recoverer

        Concluding:
        - checks that the only change made is that tmp_file1 was declared as 'BAD' on 'MOCK_SUSPICIOUS'
    """
    # Run replica recoverer once
    try:
        run(once=True, younger_than=1, nattempts=2, rse_expression='MOCK_SUSPICIOUS')
    except KeyboardInterrupt:
        stop()

    # Expected outcome: a single change, tmp_file1 is now bad on MOCK_SUSPICIOUS.
    # --------------------------------------------------------------------------------------------
    # Name         State(s) declared on MOCK_RECOVERY       State(s) declared on MOCK_SUSPICIOUS
    # --------------------------------------------------------------------------------------------
    # tmp_file1    available                                suspicious + bad (unavailable)
    # tmp_file2    available                                suspicious + bad (unavailable)
    # tmp_file3    unavailable                              suspicious (available)
    # --------------------------------------------------------------------------------------------
    suspicious_rse = self.rse4suspicious_id
    recovery_rse = self.rse4recovery_id
    file1 = path.basename(self.tmp_file1)
    file2 = path.basename(self.tmp_file2)
    file3 = path.basename(self.tmp_file3)

    # Gather replica info after replica_recoverer has run.
    for replica in list_replicas(dids=self.listdids):
        name = replica['name']
        if name in (file1, file2):
            assert suspicious_rse not in replica['states']
            assert replica['states'][recovery_rse] == 'AVAILABLE'
        if name == file3:
            assert replica['states'][suspicious_rse] == 'AVAILABLE'
            assert recovery_rse not in replica['states']

    # Which files must be flagged 'BAD' on MOCK_SUSPICIOUS ...
    reported = [(bad['name'], bad['rse_id'], bad['state'])
                for bad in list_bad_replicas_status(rse_id=suspicious_rse, younger_than=self.from_date, **self.vo)]
    for name, expected_bad in ((file1, True), (file2, True), (file3, False)):
        assert ((name, suspicious_rse, BadFilesStatus.BAD) in reported) == expected_bad

    # ... whereas nothing may be flagged 'BAD' on MOCK_RECOVERY.
    reported = [(bad['name'], bad['rse_id'], bad['state'])
                for bad in list_bad_replicas_status(rse_id=recovery_rse, younger_than=self.from_date, **self.vo)]
    for name in (file1, file2, file3):
        assert (name, recovery_rse, BadFilesStatus.BAD) not in reported
def test_archive_removal_impact_on_constituents(rse_factory, did_factory, mock_scope, root_account, caches_mock, file_config_mock):
    """
    REAPER (DAEMON): Check what happens to archive constituents when the archives are reaped.

    Two archives and four constituent files are created on one mock RSE:
      - one file only exists in the first archive
      - one is in both archives and has another replica outside of any archive
      - one is in both archives and has another, expired, replica outside of any archive
      - one is in both archives and has another, expired, replica outside of any archive,
        but a replication rule exists on that replica
    The files are also attached to two datasets, one of which is expected to be
    removed at the end. The reaper is then run three times.
    """
    [cache_region] = caches_mock
    rse_name, rse_id = rse_factory.make_mock_rse()
    scope = mock_scope
    account = root_account

    nb_constituents = 4
    nb_c_outside_archive = nb_constituents - 1
    constituent_size = 2000
    archive_size = 1000

    def one_day_ago():
        """Tombstone value which lies in the past, i.e. the replica is expired."""
        return datetime.utcnow() - timedelta(days=1)

    def with_size(files):
        """Describe the given files the way they are attached to an archive."""
        return [{'scope': c['scope'], 'name': c['name'], 'bytes': constituent_size} for c in files]

    def fill_rse_and_reap(used):
        """Declare the RSE as full (limit == used, 1 byte free) and run the reaper once."""
        cache_region.invalidate()
        rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=used)
        rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=used, free=1)
        reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)

    @read_session
    def __get_archive_contents_history_count(archive, session=None):
        return session.query(ConstituentAssociationHistory).filter_by(**archive).count()

    # Constituent files and their replicas outside of the archives
    uuid = str(generate_uuid())
    constituents = [{'scope': scope, 'name': 'lfn.%s.%d' % (uuid, i)} for i in range(nb_constituents)]
    did_factory.register_dids(constituents)
    c_first_archive_only, c_with_replica, c_with_expired_replica, c_with_replica_and_rule = constituents

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size, **c_with_replica)
    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size,
                             tombstone=one_day_ago(), **c_with_expired_replica)
    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size,
                             tombstone=one_day_ago(), **c_with_replica_and_rule)
    rule_core.add_rule(dids=[c_with_replica_and_rule], account=account, copies=1, rse_expression=rse_name,
                       grouping='NONE', weight=None, lifetime=None, locked=False, subscription_id=None)

    # The two archives and their content
    archive1, archive2 = [{'scope': scope, 'name': 'archive_%s.%d.zip' % (uuid, i)} for i in range(2)]
    replica_core.add_replica(rse_id=rse_id, bytes_=archive_size, account=account, **archive1)
    replica_core.add_replica(rse_id=rse_id, bytes_=archive_size, account=account, **archive2)
    did_core.attach_dids(dids=with_size(constituents), account=account, **archive1)
    did_core.attach_dids(dids=with_size([c_with_replica, c_with_expired_replica, c_with_replica_and_rule]),
                         account=account, **archive2)

    # The two datasets
    dataset1, dataset2 = [{'scope': scope, 'name': 'dataset_%s.%i' % (uuid, i)} for i in range(2)]
    did_core.add_did(did_type='DATASET', account=account, **dataset1)
    did_core.attach_dids(dids=constituents, account=account, **dataset1)
    did_core.add_did(did_type='DATASET', account=account, **dataset2)
    did_core.attach_dids(dids=[c_first_archive_only, c_with_expired_replica], account=account, **dataset2)

    # First reaper run.
    # The expired non-archive replica of c_with_expired_replica must be removed,
    # but the did must not be removed, and it must still remain in the dataset
    # because it still has the replica from inside the archive.
    assert replica_core.get_replica(rse_id=rse_id, **c_with_expired_replica)
    fill_rse_and_reap(2 * archive_size + nb_c_outside_archive * constituent_size)

    for did in constituents + [archive1, archive2]:
        assert did_core.get_did(**did)
    for did in [archive1, archive2, c_with_replica, c_with_replica_and_rule]:
        assert replica_core.get_replica(rse_id=rse_id, **did)
    with pytest.raises(ReplicaNotFound):
        # The replica is only on the archive, not on the constituent
        replica_core.get_replica(rse_id=rse_id, **c_first_archive_only)
    with pytest.raises(ReplicaNotFound):
        # The replica outside the archive was removed by reaper
        nb_c_outside_archive -= 1
        replica_core.get_replica(rse_id=rse_id, **c_with_expired_replica)
    # Compared to get_replica, list_replicas resolves archives, must return replicas for all files
    assert len(list(replica_core.list_replicas(dids=constituents))) == 4
    assert len(list(did_core.list_content(**dataset1))) == 4
    assert len(list(did_core.list_archive_content(**archive1))) == 4
    assert len(list(did_core.list_archive_content(**archive2))) == 3
    assert __get_archive_contents_history_count(archive1) == 0
    assert __get_archive_contents_history_count(archive2) == 0

    # Second reaper run, after expiring the first archive.
    # The archive will be removed; and c_first_archive_only must be removed
    # from datasets and from the did table.
    replica_core.set_tombstone(rse_id=rse_id, tombstone=one_day_ago(), **archive1)
    fill_rse_and_reap(2 * archive_size + nb_c_outside_archive * constituent_size)

    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**archive1)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_first_archive_only)
    assert len(list(replica_core.list_replicas(dids=constituents))) == 3
    assert len(list(did_core.list_content(**dataset1))) == 3
    assert len(list(did_core.list_archive_content(**archive1))) == 0
    assert len(list(did_core.list_archive_content(**archive2))) == 3
    assert __get_archive_contents_history_count(archive1) == 4
    assert __get_archive_contents_history_count(archive2) == 0

    # Third reaper run, after expiring the second archive replica.
    # c_with_expired_replica is removed because its external replica got removed
    # at previous step and it exists only inside the archive now.
    # If not open, Dataset2 will be removed because it will be empty.
    did_core.set_status(open=False, **dataset2)
    replica_core.set_tombstone(rse_id=rse_id, tombstone=one_day_ago(), **archive2)
    fill_rse_and_reap(archive_size + nb_c_outside_archive * constituent_size)

    # The archive must be removed
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**archive2)
    # The DIDs which only existed in the archive are also removed
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_first_archive_only)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_with_expired_replica)
    # If the DID has a non-expired replica outside the archive without rules on it, the DID is not removed
    assert did_core.get_did(**c_with_replica)
    # If the DID has an expired replica outside the archive, but has rules on that replica, the DID is not removed
    assert did_core.get_did(**c_with_replica_and_rule)
    assert len(list(replica_core.list_replicas(dids=constituents))) == 2
    assert len(list(did_core.list_content(**dataset1))) == 2
    with pytest.raises(DataIdentifierNotFound):
        did_core.get_did(**dataset2)
    assert len(list(did_core.list_content(**dataset2))) == 0
    assert len(list(did_core.list_archive_content(**archive2))) == 0
    assert __get_archive_contents_history_count(archive1) == 4
    assert __get_archive_contents_history_count(archive2) == 3
""" REPLICA (CORE): Force update the replica path """ tmp_scope = 'mock' nbfiles = 5 rse_info = rsemgr.get_rse_info('MOCK') files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests//does/not/really/matter/where', 'bytes': 1L, 'adler32': '0cc737eb', 'meta': {'events': 10}, 'rse_id': rse_info['id'], 'path': '/does/not/really/matter/where'} for i in xrange(nbfiles)] add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True) update_replicas_paths(files) for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']): # force the changed string - if we look it up from the DB, then we're not testing anything :-D assert_equal(replica['rses']['MOCK2'][0], 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests//does/not/really/matter/where') def test_add_list_bad_replicas(self): """ REPLICA (CORE): Add bad replicas and list them""" tmp_scope = 'mock' nbfiles = 5 # Adding replicas to deterministic RSE files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb', 'meta': {'events': 10}} for i in xrange(nbfiles)] rse_info = rsemgr.get_rse_info('MOCK') rse_id1 = rse_info['id'] add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True) # Listing replicas on deterministic RSE
def test_reaper():
    """ REAPER (DAEMON): Test the reaper daemon."""
    nb_files = 30
    file_size = 2 * 1024 ** 3  # 2G
    min_free_space = 10 * 1024 ** 3
    max_being_deleted = 10

    # A fresh RSE with a single mock protocol usable for every operation
    rse_name = rse_name_generator()
    rse_id = rse_core.add_rse(rse_name)
    full_access = ('read', 'write', 'delete')
    mock_protocol = {
        'scheme': 'MOCK',
        'hostname': 'localhost',
        'port': 123,
        'prefix': '/test/reaper',
        'impl': 'rucio.rse.protocols.mock.Default',
        'domains': {
            'lan': {operation: 1 for operation in full_access},
            'wan': {operation: 1 for operation in full_access},
        },
    }
    rse_core.add_protocol(rse_id=rse_id, parameter=mock_protocol)

    # Expired (tombstoned one day ago) replicas on that RSE
    file_names = ['lfn' + generate_uuid() for _ in range(nb_files)]
    for file_name in file_names:
        replica_core.add_replica(rse_id=rse_id,
                                 scope=InternalScope('data13_hip'),
                                 name=file_name,
                                 bytes=file_size,
                                 tombstone=datetime.utcnow() - timedelta(days=1),
                                 account=InternalAccount('root'),
                                 adler32=None,
                                 md5=None)

    # Almost no free space left, and a cap on the number of files being deleted
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=800)
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=min_free_space)
    rse_core.set_rse_limits(rse_id=rse_id, name='MaxBeingDeletedFiles', value=max_being_deleted)

    rses = [rse_core.get_rse(rse_id)]
    for _ in range(2):
        reaper(once=True, rses=rses)

    dids = [{'scope': InternalScope('data13_hip'), 'name': n} for n in file_names]
    remaining = list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))
    assert_equal(len(remaining), nb_files - 10)
'bytes': 1L, 'adler32': '0cc737eb', 'meta': { 'events': 10 }, 'rse_id': rse_info['id'], 'path': '/does/not/really/matter/where' } for i in xrange(nbfiles)] add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True) update_replicas_paths(files) for replica in list_replicas(dids=[{ 'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE } for f in files], schemes=['srm']): # force the changed string - if we look it up from the DB, then we're not testing anything :-D assert_equal( replica['rses']['MOCK2'][0], 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/does/not/really/matter/where' ) def test_add_list_bad_replicas(self): """ REPLICA (CORE): Add bad replicas and list them""" tmp_scope = 'mock' nbfiles = 5 # Adding replicas to deterministic RSE files = [{ 'scope': tmp_scope,
def test_source_avoid_deletion(vo, caches_mock, core_config_mock, rse_factory, did_factory, root_account, file_factory):
    """ Test that sources on a file block it from deletion """
    _, reaper_region = caches_mock
    src_rse1, src_rse1_id = rse_factory.make_mock_rse()
    src_rse2, src_rse2_id = rse_factory.make_mock_rse()
    dst_rse, dst_rse_id = rse_factory.make_mock_rse()
    source_ids = [src_rse1_id, src_rse2_id]
    all_rses = source_ids + [dst_rse_id]
    any_source = f'{src_rse1}|{src_rse2}'
    long_expired = datetime(year=1970, month=1, day=1)

    def reap_sources():
        """Run the reaper once on both source RSEs, with a cleared reaper cache."""
        reaper_region.invalidate()
        reaper(once=True, rses=[], include_rses=any_source, exclude_rses=None)

    def nb_source_pfns():
        """Number of pfns the first listed replica of the did has on the source RSEs."""
        listed = next(iter(replica_core.list_replicas(dids=[did], rse_expression=any_source)))
        return len(listed['pfns'])

    @transactional_session
    def __delete_sources(rse_id, scope, name, session=None):
        session.execute(
            delete(Source).where(Source.rse_id == rse_id,
                                 Source.scope == scope,
                                 Source.name == name)
        )

    # Both sources are full, so the reaper wants to free space on them
    for source_id in source_ids:
        rse_core.set_rse_limits(rse_id=source_id, name='MinFreeSpace', value=1)
        rse_core.set_rse_usage(rse_id=source_id, source='storage', used=1, free=0)
    distance_core.add_distance(src_rse1_id, dst_rse_id, ranking=20)
    distance_core.add_distance(src_rse2_id, dst_rse_id, ranking=10)

    # Upload a test file to both rses without registering
    did = did_factory.random_did()

    # Register replica on one source RSE
    replica_core.add_replica(rse_id=src_rse1_id, account=root_account, bytes_=1, tombstone=long_expired, **did)
    rule_core.add_rule(dids=[did], account=root_account, copies=1, rse_expression=dst_rse, grouping='ALL',
                       weight=None, lifetime=None, locked=False, subscription_id=None)

    # Reaper will not delete a file which only has one replica if there is any pending transfer for it
    reap_sources()
    assert nb_source_pfns() == 1

    # Register replica on second source rse
    replica_core.add_replica(rse_id=src_rse2_id, account=root_account, bytes_=1, tombstone=long_expired, **did)
    assert nb_source_pfns() == 2

    # Submit the transfer. This will create the sources.
    submitter(once=True, rses=[{'id': rse_id} for rse_id in all_rses], partition_wait_time=None,
              transfertool='mock', transfertype='single', filter_transfertool=None)

    # None of the replicas will be removed. They are protected by an entry in the sources table
    reap_sources()
    assert nb_source_pfns() == 2

    # Deletion succeeds for one replica (second still protected by existing request)
    for source_id in source_ids:
        __delete_sources(source_id, **did)
    reap_sources()
    assert nb_source_pfns() == 1
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, rse_expression='MOCK', max_replicas_per_rse=100):
    """
    Main loop to check for available replicas which are labeled as suspicious

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'.
    :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE.
                                 If more is found, processing is skipped and warning is printed.
    :returns: None
    """

    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES
    executable = argv[0]
    rses = sorted(rse['rse'] for rse in parse_expression(expression=rse_expression))
    executable += ' --rse-expression ' + str(rses)

    hostname = socket.gethostname()
    worker = {'executable': executable,
              'hostname': hostname,
              'pid': os.getpid(),
              'thread': threading.current_thread()}

    sanity_check(executable=executable, hostname=hostname)

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(**worker)
    # Keep the numbers of the initial heartbeat so that the final log message is
    # always able to report them, even if the loop below never completes a heartbeat.
    total_workers = heartbeat['nr_threads']
    worker_number = heartbeat['assign_thread']

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other
            # (there is only 1 worker allowed for this daemon)
            heartbeat = live(**worker)
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread']

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logging.error('replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.',
                              hostname)
                die(**worker)
                break

            start = time.time()

            logging.info('replica_recoverer[%i/%i]: ready to query replicas at RSE %s,'
                         ' reported suspicious in the last %i days at least %i times which are available on other RSEs.',
                         worker_number, total_workers, rse_expression, younger_than, nattempts)

            getfileskwargs = {'younger_than': younger_than,
                              'nattempts': nattempts,
                              'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                              'available_elsewhere': True,
                              'is_suspicious': True}

            recoverable_replicas = get_suspicious_files(rse_expression, **getfileskwargs)

            logging.info('replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.',
                         worker_number, total_workers, time.time() - start, len(recoverable_replicas))

            if not recoverable_replicas and not once:
                logging.info('replica_recoverer[%i/%i]: found %i recoverable suspicious replicas. Sleeping for 60 seconds.',
                             worker_number, total_workers, len(recoverable_replicas))
                GRACEFUL_STOP.wait(60)
            else:
                logging.info('replica_recoverer[%i/%i]: looking for replica surls.', worker_number, total_workers)

                start = time.time()
                surls_to_recover = {}  # dictionary of { rse_id1: [surl1, surl2, ... ], rse_id2: ... }
                rse_names = {}  # dictionary of { rse_id: rse name }, used for the log messages only
                cnt_surl_not_found = 0
                for replica in recoverable_replicas:
                    scope = replica['scope']
                    name = replica['name']
                    rse = replica['rse']
                    rse_id = replica['rse_id']
                    if GRACEFUL_STOP.is_set():
                        break
                    rse_names[rse_id] = rse
                    surls = surls_to_recover.setdefault(rse_id, [])
                    # for each suspicious replica, we get its surl through the list_replicas function
                    surl_not_found = True
                    for rep in list_replicas([{'scope': scope, 'name': name}]):
                        if rse_id in rep['rses']:
                            surls.append(rep['rses'][rse_id][0])
                            surl_not_found = False
                    if surl_not_found:
                        cnt_surl_not_found += 1
                        logging.warning('replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.',
                                        worker_number, total_workers, name, rse)

                logging.info('replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.',
                             worker_number, total_workers, len(recoverable_replicas) - cnt_surl_not_found,
                             len(recoverable_replicas), time.time() - start)

                for rse_id, surls in surls_to_recover.items():
                    # Report the RSE these surls belong to, not whichever RSE
                    # happened to be handled last by the loop above.
                    rse = rse_names[rse_id]
                    logging.info('replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on %s: %s.',
                                 worker_number, total_workers, len(surls), rse, str(surls))
                    if len(surls) > max_replicas_per_rse:
                        logging.warning('replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on %s. Please investigate.',
                                        worker_number, total_workers, max_replicas_per_rse, str(len(surls)), rse)
                    else:
                        declare_bad_file_replicas(pfns=surls,
                                                  reason='Suspicious. Automatic recovery.',
                                                  issuer=InternalAccount('root'),
                                                  status=BadFilesStatus.BAD,
                                                  session=None)
                        logging.info('replica_recoverer[%i/%i]: finished declaring bad replicas on %s.',
                                     worker_number, total_workers, rse)

        except (DatabaseException, DatabaseError) as err:
            # Exhausted connection pools and dropped Oracle connections (ORA-03135)
            # are only logged as warnings, any other database problem is critical.
            message = str(err.args[0]) if err.args else str(err)
            if match('.*QueuePool.*', message) or match('.*ORA-03135.*', message):
                logging.warning(traceback.format_exc())
            else:
                logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.%s' % err.__class__.__name__)
        except Exception as err:
            logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.%s' % err.__class__.__name__)
        if once:
            break

    die(**worker)
    logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number, total_workers)
def test_list_replica_with_domain(self):
    """ REPLICA (CORE): Add and list file replicas forcing domain"""

    tmp_rse = rse_name_generator()
    add_rse(tmp_rse)

    # Two protocols on the same RSE: the first one has the better priority
    # on the LAN, the second one has the better priority on the WAN.
    protocols = [{'scheme': 'MOCK',
                  'hostname': 'localhost',
                  'port': 17,
                  'prefix': '/i/prefer/the/lan',
                  'impl': 'rucio.rse.protocols.mock.Default',
                  'domains': {
                      'lan': {'read': 1,
                              'write': 1,
                              'delete': 1},
                      'wan': {'read': 2,
                              'write': 2,
                              'delete': 2}}},
                 {'scheme': 'MOCK',
                  'hostname': 'localhost',
                  'port': 18,
                  'prefix': '/i/prefer/the/wan',
                  'impl': 'rucio.rse.protocols.mock.Default',
                  'domains': {
                      'lan': {'read': 2,
                              'write': 2,
                              'delete': 2},
                      'wan': {'read': 1,
                              'write': 1,
                              'delete': 1}}}, ]
    for p in protocols:
        add_protocol(tmp_rse, p)

    nbfiles = 3
    files = [{'scope': 'mock',
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1234,
              'adler32': '01234567',
              'meta': {'events': 1234}} for _ in range(nbfiles)]

    add_replicas(rse=tmp_rse, files=files, account='root', ignore_availability=True)

    def fresh_dids():
        # A new list of new dicts for every query, exactly as when the
        # literal is written out at each call site.
        return [{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files]

    # list(...)[0] picks the first pfn; indexing .keys() directly only works on Python 2
    for replica in list_replicas(dids=fresh_dids(), schemes=['MOCK'], domain='wan'):
        assert_in('/i/prefer/the/wan', list(replica['pfns'])[0])

    for replica in list_replicas(dids=fresh_dids(), schemes=['MOCK'], domain='lan'):
        assert_in('/i/prefer/the/lan', list(replica['pfns'])[0])

    # test old client behaviour - get all WAN answers
    for replica in list_replicas(dids=fresh_dids(), schemes=['MOCK']):
        cmd = 'rucio list-file-replicas --pfns %s:%s' % (replica['scope'], replica['name'])
        _, stdout, _ = execute(cmd)
        assert_in('/i/prefer/the/wan', stdout)

    # force all LAN, then all WAN, then both WAN and LAN
    forced_domains = [('lan', ['/i/prefer/the/lan']),
                      ('wan', ['/i/prefer/the/wan']),
                      ('all', ['/i/prefer/the/wan', '/i/prefer/the/lan'])]
    for domain, expected_prefixes in forced_domains:
        for replica in list_replicas(dids=fresh_dids(), schemes=['MOCK'], domain=domain):
            cmd = 'rucio list-file-replicas --pfns --domain=%s %s:%s' % (domain, replica['scope'], replica['name'])
            _, stdout, _ = execute(cmd)
            for prefix in expected_prefixes:
                assert_in(prefix, stdout)
def list_replicas(dids, schemes=None, unavailable=False, request_id=None,
                  ignore_availability=True, all_states=False, rse_expression=None,
                  client_location=None, domain=None, signature_lifetime=None,
                  resolve_archives=True, resolve_parents=False,
                  updated_after=None, issuer=None):
    """
    List file replicas for a list of data identifiers.

    This is a generator: nothing (not even the schema validation) happens
    before the first item is requested.

    :param dids: The list of data identifiers (DIDs). The dictionaries passed in are not modified.
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param ignore_availability: Forwarded unchanged to the core list_replicas call.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param rse_expression: The RSE expression to restrict replicas on a set of RSEs.
    :param client_location: Client location dictionary for PFN modification {'ip', 'fqdn', 'site'}
    :param domain: The network domain for the call, either None, 'wan' or 'lan'. Compatibility fallback: None falls back to 'wan'.
    :param signature_lifetime: If supported, in seconds, restrict the lifetime of the signed PFN.
    :param resolve_archives: When set to True, find archives which contain the replicas.
    :param resolve_parents: When set to True, find all parent datasets which contain the replicas.
    :param updated_after: datetime object (UTC time), only return replicas updated after this time
    :param issuer: The issuer account.

    :returns: Yields one replica dictionary at a time, with external scopes and
              with the 'rses'/'states' entries keyed by RSE name.
    """
    validate_schema(name='r_dids', obj=dids)

    # Allow selected authenticated users to retrieve signed URLs.
    # Unauthenticated users, or permission-less users will get the raw URL without the signature.
    sign_urls = bool(permission.has_permission(issuer=issuer, action='get_signed_url', kwargs={}))

    # Convert the scopes on copies, so that the dictionaries of the caller keep their external scope.
    internal_dids = [dict(d, scope=InternalScope(d['scope'])) for d in dids]

    replicas = replica.list_replicas(dids=internal_dids, schemes=schemes, unavailable=unavailable,
                                     request_id=request_id,
                                     ignore_availability=ignore_availability,
                                     all_states=all_states, rse_expression=rse_expression,
                                     client_location=client_location, domain=domain,
                                     sign_urls=sign_urls, signature_lifetime=signature_lifetime,
                                     resolve_archives=resolve_archives, resolve_parents=resolve_parents,
                                     updated_after=updated_after)

    for rep in replicas:
        # 'rses' and 'states' use rse_id as the key. This needs updating to be rse.
        for k in ('rses', 'states'):
            old_dict = rep.get(k, None)
            if old_dict is not None:
                new_dict = {}
                for rse_id, value in old_dict.items():
                    rse = get_rse_name(rse_id=rse_id) if rse_id is not None else None
                    new_dict[rse] = value
                rep[k] = new_dict

        rep['scope'] = rep['scope'].external
        if 'parents' in rep:
            new_parents = []
            for p in rep['parents']:
                # Split on the first colon only: everything after it is the name.
                scope, name = p.split(':', 1)
                scope = InternalScope(scope, fromExternal=False).external
                new_parents.append('{}:{}'.format(scope, name))
            rep['parents'] = new_parents

        yield rep
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, rse_expression='MOCK', vos=None, max_replicas_per_rse=100, sleep_time=60):
    """
    Main loop to check for available replicas which are labeled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None (or empty), all VOs returned by list_vos() are used.
                Outside multi-VO mode the argument is ignored and 'def' is used.
    :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE.
                                 If more is found, processing is skipped and warning is printed.
    :param sleep_time: Thread sleep time after each chunk of work.
    :raises VONotFound: In multi-VO mode, if one of the requested VOs is unknown.
    :raises InvalidRSEExpression: If the RSE expression cannot be resolved for any of the VOs.
    :returns: None
    """

    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES
    executable = argv[0]

    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    if not multi_vo:
        if vos:
            logging.warning('Ignoring argument vos, this is only applicable in a multi-VO setup.')
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - set(v['vo'] for v in list_vos())
            if invalid:
                msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '',
                                                       ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logging.info('replica_recoverer: This instance will work on VO%s: %s',
                     's' if len(vos) > 1 else '', ', '.join(vos))

    # Don't require a result from the expression at each VO, only raise if we can't get a result from any of them
    rses = []
    exceptions_raised = 0
    for vo in vos:
        try:
            parsed_rses = parse_expression(expression=rse_expression, filter_={'vo': vo})
        except InvalidRSEExpression:
            exceptions_raised += 1
            parsed_rses = []
        rses.extend(parsed_rse['id'] for parsed_rse in parsed_rses)
    if exceptions_raised == len(vos):
        raise InvalidRSEExpression('RSE Expression resulted in an empty set.')

    rses.sort()
    executable += ' --rse-expression ' + str(rses)

    # These identify the worker and do not change during the lifetime of the call.
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()

    sanity_check(executable=executable, hostname=hostname)

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread)
    # Seed the worker identification right away: the final log statement needs it
    # even if the main loop never runs (stop requested during the initial wait)
    # or its first heartbeat fails.
    total_workers = heartbeat['nr_threads']
    worker_number = heartbeat['assign_thread']

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread)
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread']

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logging.error('replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', hostname)
                die(executable=executable, hostname=hostname, pid=pid, thread=thread)
                break

            start = time.time()

            logging.info('replica_recoverer[%i/%i]: ready to query replicas at RSE %s,'
                         ' reported suspicious in the last %i days at least %i times which are available on other RSEs.',
                         worker_number, total_workers, rse_expression, younger_than, nattempts)

            getfileskwargs = {'younger_than': younger_than,
                              'nattempts': nattempts,
                              'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                              'available_elsewhere': True,
                              'is_suspicious': True}

            # Don't require a result from the expression at each VO, only raise if we can't get a result from any of them
            recoverable_replicas = []
            exceptions_raised = 0
            for vo in vos:
                try:
                    recoverable_replicas.extend(get_suspicious_files(rse_expression, filter_={'vo': vo}, **getfileskwargs))
                except InvalidRSEExpression:
                    exceptions_raised += 1
            if exceptions_raised == len(vos):
                raise InvalidRSEExpression('RSE Expression resulted in an empty set.')

            logging.info('replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.',
                         worker_number, total_workers, time.time() - start, len(recoverable_replicas))

            if not recoverable_replicas and not once:
                logging.info('replica_recoverer[%i/%i]: found %i recoverable suspicious replicas.',
                             worker_number, total_workers, len(recoverable_replicas))
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)
            else:
                logging.info('replica_recoverer[%i/%i]: looking for replica surls.', worker_number, total_workers)

                start = time.time()
                surls_to_recover = {}  # dictionary of { vo1: {rse1: [surl1, surl2, ... ], rse2: ...}, vo2:... }
                rse_names = {}  # rse_id -> RSE name, so that later log messages name the right RSE
                cnt_surl_not_found = 0
                for replica in recoverable_replicas:
                    scope = replica['scope']
                    name = replica['name']
                    vo = scope.vo
                    rse = replica['rse']
                    rse_id = replica['rse_id']
                    if GRACEFUL_STOP.is_set():
                        break
                    if vo not in surls_to_recover:
                        surls_to_recover[vo] = {}
                    if rse_id not in surls_to_recover[vo]:
                        surls_to_recover[vo][rse_id] = []
                    rse_names[rse_id] = rse
                    # for each suspicious replica, we get its surl through the list_replicas function
                    surl_not_found = True
                    for rep in list_replicas([{'scope': scope, 'name': name}]):
                        for site in rep['rses']:
                            if site == rse_id:
                                surls_to_recover[vo][rse_id].append(rep['rses'][site][0])
                                surl_not_found = False
                    if surl_not_found:
                        cnt_surl_not_found += 1
                        logging.warning('replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.',
                                        worker_number, total_workers, name, rse)

                logging.info('replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.',
                             worker_number, total_workers, len(recoverable_replicas) - cnt_surl_not_found,
                             len(recoverable_replicas), time.time() - start)

                for vo in surls_to_recover:
                    for rse_id in surls_to_recover[vo]:
                        surls = surls_to_recover[vo][rse_id]
                        # Look the name up per rse_id instead of reusing whatever
                        # the collection loop above happened to process last.
                        rse = rse_names[rse_id]
                        logging.info('replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on %s: %s.',
                                     worker_number, total_workers, len(surls), rse, str(surls))
                        if len(surls) > max_replicas_per_rse:
                            logging.warning('replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on %s. Please investigate.',
                                            worker_number, total_workers, max_replicas_per_rse, str(len(surls)), rse)
                        else:
                            declare_bad_file_replicas(pfns=surls,
                                                      reason='Suspicious. Automatic recovery.',
                                                      issuer=InternalAccount('root', vo=vo),
                                                      status=BadFilesStatus.BAD,
                                                      session=None)
                            logging.info('replica_recoverer[%i/%i]: finished declaring bad replicas on %s.',
                                         worker_number, total_workers, rse)

        except (DatabaseException, DatabaseError) as err:
            # An exception without arguments must not crash the handler itself.
            message = str(err.args[0]) if err.args else str(err)
            # Connection pool exhaustion and ORA-03135 are only logged as
            # warnings; any other database error is logged as critical.
            if match('.*QueuePool.*', message) or match('.*ORA-03135.*', message):
                logging.warning(traceback.format_exc())
            else:
                logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        except Exception as err:
            logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number, total_workers)