def test_client_add_temporary_unavailable_pfns(rse_factory, mock_scope, replica_client):
    """ REPLICA (CLIENT): Add temporary unavailable PFNs"""
    rse, rse_id = rse_factory.make_posix_rse()
    nbfiles = 5
    # Adding replicas to deterministic RSE
    files = [{'scope': mock_scope.external, 'name': 'file_%s' % generate_uuid(), 'bytes': 1,
              'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    replica_client.add_replicas(rse=rse, files=files)

    # Listing replicas on deterministic RSE
    list_rep = []
    for replica in replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files], schemes=['file'], all_states=True):
        pfn = list(replica['pfns'].keys())[0]
        list_rep.append(pfn)

    # Submit bad PFNs
    now = datetime.utcnow()
    reason_str = generate_uuid()
    replica_client.add_bad_pfns(pfns=list_rep, reason=str(reason_str), state='TEMPORARY_UNAVAILABLE', expires_at=now.isoformat())

    result = get_bad_pfns(limit=10000, thread=None, total_threads=None, session=None)
    bad_pfns = {}
    for res in result:
        bad_pfns[res['pfn']] = (res['state'], res['reason'], res['expires_at'])

    for pfn in list_rep:
        pfn = str(clean_surls([pfn])[0])
        assert pfn in bad_pfns
        assert bad_pfns[pfn][0] == BadPFNStatus.TEMPORARY_UNAVAILABLE
        assert bad_pfns[pfn][1] == reason_str

    # Submit with wrong state
    with pytest.raises(RucioException):
        replica_client.add_bad_pfns(pfns=list_rep, reason=str(reason_str), state='BADSTATE', expires_at=now.isoformat())

    # Run minos once
    minos_run(threads=1, bulk=10000, once=True)

    result = get_bad_pfns(limit=10000, thread=None, total_threads=None, session=None)
    pfns = [res['pfn'] for res in result if res['pfn'] in bad_pfns]
    res_pfns = []
    for replica in list_rep:
        if replica in pfns:
            res_pfns.append(replica)
    assert res_pfns == []

    # Check the state in the replica table
    for did in files:
        rep = get_replicas_state(scope=mock_scope, name=did['name'])
        assert list(rep.keys())[0] == ReplicaState.TEMPORARY_UNAVAILABLE

    rep = []
    for did in files:
        did['state'] = ReplicaState.TEMPORARY_UNAVAILABLE
        rep.append(did)

    # Run the minos expiration
    minos_temp_run(threads=1, once=True)

    # Check the state in the replica table
    for did in files:
        rep = get_replicas_state(scope=mock_scope, name=did['name'])
        assert list(rep.keys())[0] == ReplicaState.AVAILABLE
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """
    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

        stime = time.time()
        replicas = []
        try:
            replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))
                list_replicas = get_replicas_state(scope=scope, name=name)
                if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                    logging.info(prepend_str + 'File %s:%s has no other available or temporarily available replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))
                else:
                    rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                    unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))
            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logging.info(prepend_str + 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logging.debug(prepend_str + 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logging.debug(prepend_str + ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logging.info(prepend_str + 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()

            tottime = time.time() - stime
            if len(replicas) == bulk:
                logging.info(prepend_str + 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            elif tottime < sleep_time:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                time.sleep(sleep_time - tottime)
                continue

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extracts the scope, name and rse_id and fills the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    states_mapping = {BadPFNStatus.BAD: BadFilesStatus.BAD,
                      BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
                      BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE}
    logging.info(prepend_str + 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        pfns = []
        try:
            bad_replicas = {}
            temporary_unvailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], limit=bulk)

            # Classify the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = pfn['state']
                if states_mapping[state] in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                if states_mapping[state] == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason, expires_at) not in temporary_unvailables:
                        temporary_unvailables[(account, reason, expires_at)] = []
                    temporary_unvailables[(account, reason, expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logging.info(prepend_str + 'Declaring %s replicas with state %s and reason %s' % (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                            unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logging.debug(prepend_str + 'Running on RSE %s%s with %s replicas' % (get_rse_name(rse_id=rse_id), vo_str, len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(pfns=chunk, reason=reason, issuer=account, status=state, session=session)
                            if unknown_replicas:
                                logging.debug(prepend_str + 'Unknown replicas : %s' % (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unvailables:
                vo = account.vo
                pfns = temporary_unvailables[(account, reason, expires_at)]
                logging.info(prepend_str + 'Declaring %s replicas temporarily unavailable with timeout %s and reason %s' % (len(pfns), str(expires_at), reason))
                logging.debug(prepend_str + 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logging.info(prepend_str + 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(rse, vo)
                    logging.debug(prepend_str + 'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id], rse_id=None, vo=vo, session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.TEMPORARY_UNAVAILABLE, 'pfn': pfn})

                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    logging.debug(prepend_str + 'Running on %s replicas on RSE %s' % (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk))
                            update_replicas_states(chunk, nowait=False, session=session)
                            bulk_add_bad_replicas(chunk, account, state=BadFilesStatus.TEMPORARY_UNAVAILABLE, reason=None, expires_at=expires_at, session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation, ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logging.error(prepend_str + 'Problem with bulk update of PFNs. PFNs will be updated individually. Error : %s' % str(error))
                            for rep in chunk:
                                logging.debug(prepend_str + 'Working on %s' % (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(rep['scope'], rep['name'])
                                    unavailable_states.extend(rep_state.get(ReplicaState.TEMPORARY_UNAVAILABLE, []))
                                    unavailable_states.extend(rep_state.get(ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(rep_state.get(ReplicaState.BAD, []))
                                    if rep['rse_id'] in unavailable_states:
                                        logging.info(prepend_str + '%s is in unavailable state. Will be removed from the list of bad PFNs' % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                                    elif expires_at < datetime.now():
                                        logging.info('%s PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns', prepend_str, str(rep['pfn']), expires_at)
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                                except (DataIdentifierNotFound, ReplicaNotFound) as error:
                                    logging.error(prepend_str + 'Will remove %s from the list of bad PFNs' % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logging.critical(traceback.format_exc())
                            session = get_session()

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logging.info(prepend_str + 'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle')
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
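
# Illustrative sketch only: the tests above invoke a `minos_run(threads=..., bulk=..., once=...)`
# wrapper, which is not part of this excerpt. The launcher below shows one way such a wrapper
# could spawn the minos() worker loop defined above; the thread-handling details are an
# assumption, not the actual Rucio implementation.
import threading


def run_minos_sketch(threads=1, bulk=1000, once=False, sleep_time=60):
    """Spawn `threads` workers executing the minos() loop defined above."""
    if once:
        # Single synchronous pass, as the tests do with once=True.
        minos(bulk=bulk, once=True, sleep_time=sleep_time)
        return
    workers = [threading.Thread(target=minos, kwargs={'bulk': bulk, 'once': False, 'sleep_time': sleep_time})
               for _ in range(threads)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()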
def test_add_temporary_unavailable_pfns(self):
    """ REPLICA (CLIENT): Add temporary unavailable PFNs"""
    tmp_scope = 'mock'
    nbfiles = 5
    # Adding replicas to deterministic RSE
    files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1,
              'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    self.replica_client.add_replicas(rse='MOCK', files=files)

    # Listing replicas on deterministic RSE
    list_rep = []
    for replica in self.replica_client.list_replicas(dids=[{'scope': f['scope'], 'name': f['name']} for f in files], schemes=['srm'], unavailable=True):
        pfn = replica['pfns'].keys()[0]
        list_rep.append(pfn)

    # Submit bad PFNs
    now = datetime.utcnow()
    reason_str = generate_uuid()
    self.replica_client.add_bad_pfns(pfns=list_rep, reason=str(reason_str), state='TEMPORARY_UNAVAILABLE', expires_at=now.isoformat())

    result = get_bad_pfns(limit=10000, thread=None, total_threads=None, session=None)
    bad_pfns = {}
    for res in result:
        bad_pfns[res['pfn']] = (res['state'], res['reason'], res['expires_at'])

    for pfn in list_rep:
        pfn = str(clean_surls([pfn])[0])
        assert_in(pfn, bad_pfns)
        assert_equal(str(bad_pfns[pfn][0]), 'TEMPORARY_UNAVAILABLE')
        assert_equal(bad_pfns[pfn][1], reason_str)

    # Submit with wrong state
    with assert_raises(RucioException):
        self.replica_client.add_bad_pfns(pfns=list_rep, reason=str(reason_str), state='BADSTATE', expires_at=now.isoformat())

    # Run minos once
    minos_run(threads=1, bulk=10000, once=True)

    result = get_bad_pfns(limit=10000, thread=None, total_threads=None, session=None)
    pfns = [res['pfn'] for res in result]
    res_pfns = []
    for replica in list_rep:
        if replica in pfns:
            res_pfns.append(replica)
    assert_equal(res_pfns, [])

    # Check the state in the replica table
    for did in files:
        rep = get_replicas_state(scope=did['scope'], name=did['name'])
        assert_equal(str(rep.keys()[0]), 'TEMPORARY_UNAVAILABLE')

    rep = []
    for did in files:
        did['state'] = ReplicaState.from_sym('TEMPORARY_UNAVAILABLE')
        rep.append(did)

    # Run the minos expiration
    minos_temp_run(threads=1, once=True)

    # Check the state in the replica table
    for did in files:
        rep = get_replicas_state(scope=did['scope'], name=did['name'])
        assert_equal(str(rep.keys()[0]), 'AVAILABLE')
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos_temporary_expiration[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        try:
            # Get list of expired TU replicas
            logger(logging.INFO, 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], limit=1000)
            logger(logging.INFO, '%s expired replicas returned', len(expired_replicas))
            logger(logging.DEBUG, 'List of expired replicas returned %s', str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logger(logging.DEBUG, 'Running on %s chunk out of %s', nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.WARNING, 'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logger(logging.DEBUG, 'Working on %s:%s on %s', scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'DID %s:%s does not exist anymore.', scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.WARNING, 'Replica %s:%s on RSEID %s does not exist anymore.', scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except (DatabaseException, DatabaseError) as error:
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                    else:
                        logger(logging.ERROR, 'Exception', exc_info=True)
                    session.rollback()
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, str(traceback.format_exc()))
                    session = get_session()
        except Exception:
            logger(logging.CRITICAL, str(traceback.format_exc()))

        if once:
            break
        daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
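
# Note on the chunking pattern used throughout these workers: `chunks(iterable, n)` is assumed to
# yield successive slices of at most n items, so each slice can be committed (or rolled back)
# independently. A minimal stand-in with that assumed behaviour, for illustration only:
def chunks_sketch(items, chunk_size):
    """Yield successive lists of at most chunk_size elements from items."""
    for index in range(0, len(items), chunk_size):
        yield items[index:index + chunk_size]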
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], limit=1000)
            logging.info('%s %s expired replicas returned', prepend_str, len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s', prepend_str, str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s', prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning('%s One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str, scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s DID %s:%s does not exist anymore.', prepend_str, scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s Replica %s:%s on RSEID %s does not exist anymore.', prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
                    session = get_session()
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        stime = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')
        logger(logging.DEBUG, 'Starting new cycle')

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get('necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get('necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None

        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            evaluator_backlog_count, evaluator_backlog_duration = get_evaluation_backlog(expiration_time=sleep_time)
            if max_evaluator_backlog_count and \
               evaluator_backlog_count and \
               max_evaluator_backlog_duration and \
               evaluator_backlog_duration and \
               evaluator_backlog_count > max_evaluator_backlog_count and \
               evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and evaluator_backlog_count and evaluator_backlog_count > max_evaluator_backlog_count:
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and evaluator_backlog_duration and evaluator_backlog_duration < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get('necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        bad_replicas_backlog = REGION.get('bad_replicas_backlog')
        if bad_replicas_backlog is NO_VALUE:
            bad_replicas_backlog = get_bad_replicas_backlog()
            REGION.set('bad_replicas_backlog', bad_replicas_backlog)
        tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog])

        list_of_rses = list()
        # If there are too many replicas, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bad replicas too big. Apply some sharing between different RSEs')
            rses = list()
            cnt = 0
            for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=True):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            list_of_rses.append(None)

        tot_processed = 0
        if tot_bad_files == 0:
            logger(logging.INFO, 'No bad replicas to process.')
        else:
            ttime = time.time()
            replicas = []
            try:
                for rses in list_of_rses:
                    replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses)
                    for replica in replicas:
                        scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                        logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse))
                        list_replicas = get_replicas_state(scope=scope, name=name)
                        if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                            logger(logging.INFO, 'File %s:%s has no other available or temporarily available replicas, it will be marked as lost' % (scope, name))
                            try:
                                update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                                monitor.record_counter(name='necromancer.badfiles.lostfile')
                            except (DatabaseException, DatabaseError) as error:
                                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                                else:
                                    logger(logging.ERROR, str(error))
                        else:
                            rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                            unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                            logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                            try:
                                update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                                monitor.record_counter(name='necromancer.badfiles.recovering')
                            except (DatabaseException, DatabaseError) as error:
                                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                                else:
                                    logger(logging.ERROR, str(error))
                    tot_processed += len(replicas)
                    logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - ttime), str(len(replicas))))
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        elif tot_processed == 0 or tot_bad_files == 0:
            daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get('necromancer', 'max_evaluator_backlog_count')
            max_evaluator_backlog_count = int(max_evaluator_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get('necromancer', 'max_evaluator_backlog_duration')
            max_evaluator_backlog_duration = int(max_evaluator_backlog_duration)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_evaluator_backlog_duration = None

        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog(expiration_time=sleep_time)
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[0] and backlog[0] > max_evaluator_backlog_count:
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog count hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[1] and backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(logging.ERROR, 'Necromancer: Judge evaluator backlog duration hit, stopping operation')
                GRACEFUL_STOP.wait(30)
                continue

        # Check how many bad replicas are queued
        try:
            max_bad_replicas_backlog_count = config_get('necromancer', 'max_bad_replicas_backlog_count')
            max_bad_replicas_backlog_count = int(max_bad_replicas_backlog_count)
        except (NoOptionError, NoSectionError, RuntimeError, ValueError):
            max_bad_replicas_backlog_count = None
        bad_replicas_backlog = get_bad_replicas_backlog()
        tot_bad_files = sum([bad_replicas_backlog[key] for key in bad_replicas_backlog])

        list_of_rses = list()
        # If there are too many replicas, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bad replicas too big. Apply some sharing between different RSEs')
            rses = list()
            cnt = 0
            for key in sorted(bad_replicas_backlog, key=bad_replicas_backlog.get, reverse=False):
                rses.append({'id': key})
                cnt += bad_replicas_backlog[key]
                if cnt >= bulk:
                    list_of_rses.append(rses)
                    rses = list()
                    cnt = 0
        else:
            list_of_rses.append(None)

        stime = time.time()
        replicas = []
        try:
            for rses in list_of_rses:
                replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses)
                for replica in replicas:
                    scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                    logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse))
                    list_replicas = get_replicas_state(scope=scope, name=name)
                    if ReplicaState.AVAILABLE not in list_replicas and ReplicaState.TEMPORARY_UNAVAILABLE not in list_replicas:
                        logger(logging.INFO, 'File %s:%s has no other available or temporarily available replicas, it will be marked as lost' % (scope, name))
                        try:
                            update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                            monitor.record_counter(name='necromancer.badfiles.lostfile')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))
                    else:
                        rep = list_replicas.get(ReplicaState.AVAILABLE, [])
                        unavailable_rep = list_replicas.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                        logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                        try:
                            update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                            monitor.record_counter(name='necromancer.badfiles.recovering')
                        except DatabaseException as error:
                            logger(logging.WARNING, str(error))
                logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logger(logging.CRITICAL, ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break
        else:
            now = time.time()
            if (now - update_history_time) > update_history_threshold:
                logger(logging.INFO, 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
                bad_replicas = list_bad_replicas_history(limit=1000000, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])
                for rse_id in bad_replicas:
                    chunk_size = 1000
                    nchunk = int(ceil(len(bad_replicas[rse_id]) / chunk_size))
                    logger(logging.DEBUG, 'Update history for rse_id %s' % (rse_id))
                    cnt = 0
                    for chunk in chunks(bad_replicas[rse_id], chunk_size):
                        logger(logging.DEBUG, ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                        cnt += 1
                        update_bad_replicas_history(chunk, rse_id)
                logger(logging.INFO, 'History table updated in %s seconds' % (time.time() - now))
                update_history_time = time.time()

            if len(replicas) == bulk:
                logger(logging.INFO, 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
            else:
                daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
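
# Illustrative sketch only: the workers above poll a module-level threading.Event
# (graceful_stop / GRACEFUL_STOP) to leave their main loop cleanly. The event and the stop hook
# are assumed to be defined at module level roughly as below; the actual Rucio stop() helper is
# not part of this excerpt.
import threading

graceful_stop_event = threading.Event()  # stands in for graceful_stop / GRACEFUL_STOP


def stop_sketch(signum=None, frame=None):
    """Shutdown hook: ask every worker loop to finish its current cycle and exit."""
    graceful_stop_event.set()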