def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'] - 1,
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info(prepend_str + '%s expired replicas returned' % len(expired_replicas))
            logging.debug(prepend_str + 'List of expired replicas returned %s' % str(expired_replicas))
            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({'scope': replica[0], 'name': replica[1], 'rse_id': replica[2], 'state': ReplicaState.AVAILABLE})
                bad_replicas.append({'scope': replica[0], 'name': replica[1], 'rse_id': replica[2], 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
            session = get_session()

            for chunk in chunks(replicas, chunk_size):
                # Process and update the replicas in chunks
                update_replicas_states(chunk, nowait=False, session=session)
            for chunk in chunks(bad_replicas, chunk_size):
                # Remove the replicas from bad_replicas table in chunks
                bulk_delete_bad_replicas(chunk, session=session)
            session.commit()  # pylint: disable=no-member
        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """
    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'],
                                                                           worker_number=heart_beat['assign_thread'],
                                                                           limit=1000)
            logging.info('%s %s expired replicas returned', prepend_str, len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s', prepend_str, str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope, name=name, session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE})
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE})
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s', prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas, nowait=True, session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning('%s One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s', prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str, scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.AVAILABLE}, ], nowait=True, session=session)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s DID %s:%s does not exist anymore.', prepend_str, scope, name)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning('%s Replica %s:%s on RSEID %s does not exist anymore.', prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([{'scope': scope, 'name': name, 'rse_id': rse_id, 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE}, ], session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
                    session = get_session()
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
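# The expiration loops above slice their work with a chunks() helper before
# committing, so a single failing replica does not poison a huge transaction.
# A minimal sketch of such a slicing generator is shown here for reference;
# this is an assumption about the helper's behaviour, not a copy of Rucio's
# own utility, which may differ in signature and detail.
def chunks(items, chunk_size):
    """Yield successive slices of at most chunk_size elements from items."""
    for index in range(0, len(items), chunk_size):
        yield items[index:index + chunk_size]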
def poller(once=False, activities=None, sleep_time=60, fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    executable = sys.argv[0]
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if activity_shares:
        activities.sort()
        executable += '--activity_shares' + str(activity_shares)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' % (db_bulk, fts_bulk, timeout))

    time.sleep(10)  # To prevent running on the same partition if all the poller restart at the same time

    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Poller started')

    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():
        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                start_time = time.time()
                logging.debug(prepend_str + 'Start to poll transfers older than %i seconds for activity %s' % (older_than, activity))
                transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                state=[RequestState.SUBMITTED],
                                                limit=db_bulk,
                                                older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                                total_workers=heart_beat['nr_threads'] - 1,
                                                worker_number=heart_beat['assign_thread'],
                                                mode_all=False,
                                                hash_variable='id',
                                                activity=activity,
                                                activity_shares=activity_shares)
                record_timer('daemons.conveyor.poller.000-get_next', (time.time() - start_time) * 1000)

                if transfs:
                    logging.debug(prepend_str + 'Polling %i transfers for activity %s' % (len(transfs), activity))
                    xfers_ids = {}
                    for transf in transfs:
                        if not transf['external_host'] in xfers_ids:
                            xfers_ids[transf['external_host']] = []
                        xfers_ids[transf['external_host']].append((transf['external_id'], transf['request_id']))

                    for external_host in xfers_ids:
                        external_ids = list({trf[0] for trf in xfers_ids[external_host]})
                        request_ids = [trf[1] for trf in xfers_ids[external_host]]
                        for xfers in chunks(external_ids, fts_bulk):
                            # poll transfers
                            poll_transfers(external_host=external_host, xfers=xfers, prepend_str=prepend_str, request_ids=request_ids, timeout=timeout)

                if len(transfs) < fts_bulk / 2:
                    logging.info(prepend_str + "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (len(transfs), activity, fts_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + "%s" % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
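# Note on the activity_next_exe_time idiom used by the poller above:
# defaultdict(time.time) invokes time.time() for any missing key, so the
# first lookup of an activity returns "now" and that activity is polled
# immediately; afterwards the daemon pushes the stored value sleep_time
# seconds into the future to back off. Standalone illustration (the
# activity name below is made up for the example):
from collections import defaultdict
import time

next_poll = defaultdict(time.time)                     # missing key -> current timestamp
assert next_poll['User Subscriptions'] <= time.time()  # first access is immediately due
next_poll['User Subscriptions'] = time.time() + 60     # defer the next poll by 60 seconds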
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()

    sanity_check(executable='rucio-undertaker', hostname=hostname)

    paused_dids = {}  # {(scope, name): datetime}

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable='rucio-undertaker', hostname=hostname, pid=pid, thread=thread, older_than=6000)
            logging.info('Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'.format(locals()))

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'] + 1, total_workers=heartbeat['nr_threads'], limit=10000)

            dids = [did for did in dids if (did['scope'], did['name']) not in paused_dids]

            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.', worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete', worker_number, len(chunk))
                    delete_dids(dids=chunk, account=InternalAccount('root'), expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids', worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                    if match('.*ORA-00054.*', str(e.args[0])) or match('.*55P03.*', str(e.args[0])) or match('.*3572.*', str(e.args[0])):
                        for did in chunk:
                            paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                        record_counter('undertaker.delete_dids.exceptions.LocksDetected')
                        logging.warning('undertaker[%s/%s]: Locks detected for chunk', heartbeat['assign_thread'], heartbeat['nr_threads'])
                    else:
                        logging.error('Undertaker(%s): Got database error %s.', worker_number, str(e))
        except:
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable='rucio-undertaker', hostname=hostname, pid=pid, thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
def re_evaluator(once=False): """ Main loop to check the re-evaluation of dids. """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() paused_dids = {} # {(scope, name): datetime} # Make an initial heartbeat so that all judge-evaluators have the correct worker number on the next try executable = 'judge-evaluator' live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30) graceful_stop.wait(1) while not graceful_stop.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30) start = time.time() # NOQA # Refresh paused dids paused_dids = dict((k, v) for k, v in iteritems(paused_dids) if datetime.utcnow() < v) # Select a bunch of dids for re evaluation for this worker dids = get_updated_dids(total_workers=heartbeat['nr_threads'], worker_number=heartbeat['assign_thread'], limit=100, blacklisted_dids=[ (InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids ]) logging.debug( 're_evaluator[%s/%s] index query time %f fetch size is %d (%d blacklisted)' % (heartbeat['assign_thread'], heartbeat['nr_threads'], time.time() - start, len(dids), len([(InternalScope(key[0], fromExternal=False), key[1]) for key in paused_dids]))) # If the list is empty, sent the worker to sleep if not dids and not once: logging.debug( 're_evaluator[%s/%s] did not get any work (paused_dids=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'], str(len(paused_dids)))) graceful_stop.wait(30) else: done_dids = {} for did in dids: if graceful_stop.is_set(): break # Check if this did has already been operated on did_tag = '%s:%s' % (did.scope.internal, did.name) if did_tag in done_dids: if did.rule_evaluation_action in done_dids[did_tag]: logging.debug( 're_evaluator[%s/%s]: evaluation of %s:%s already done' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name)) delete_updated_did(id=did.id) continue else: done_dids[did_tag] = [] # Jump paused dids if (did.scope.internal, did.name) in paused_dids: continue try: start_time = time.time() re_evaluate_did( scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action) logging.debug( 're_evaluator[%s/%s]: evaluation of %s:%s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name, time.time() - start_time)) delete_updated_did(id=did.id) done_dids[did_tag].append(did.rule_evaluation_action) except DataIdentifierNotFound as e: delete_updated_did(id=did.id) except (DatabaseException, DatabaseError) as e: if match('.*ORA-00054.*', str(e.args[0])): paused_dids[( did.scope.internal, did.name)] = datetime.utcnow() + timedelta( seconds=randint(60, 600)) logging.warning( 're_evaluator[%s/%s]: Locks detected for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name)) record_counter( 'rule.judge.exceptions.LocksDetected') elif match('.*QueuePool.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logging.error(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except ReplicationRuleCreationTemporaryFailed as e: record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) logging.warning( 're_evaluator[%s/%s]: Replica Creation temporary 
failed, retrying later for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name)) except FlushError as e: record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) logging.warning( 're_evaluator[%s/%s]: Flush error for %s:%s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], did.scope, did.name)) except (DatabaseException, DatabaseError) as e: if match('.*QueuePool.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except Exception as e: logging.critical(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) if once: break die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def kronos_file(once=False, thread=0, brokers_resolved=None, dataset_queue=None, sleep_time=60):
    """
    Main loop to consume tracer reports.
    """
    logging.info('tracer consumer starting')

    hostname = gethostname()
    pid = getpid()
    thread = current_thread()

    chunksize = config_get_int('tracer-kronos', 'chunksize')
    prefetch_size = config_get_int('tracer-kronos', 'prefetch_size')
    subscription_id = config_get('tracer-kronos', 'subscription_id')
    try:
        bad_files_patterns = []
        pattern = get(section='kronos', option='bad_files_patterns', session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            bad_files_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        bad_files_patterns = []
    except Exception as error:
        logging.error('(kronos_file) Failed to get bad_file_patterns' + str(error))
        bad_files_patterns = []

    use_ssl = True
    try:
        use_ssl = config_get_bool('tracer-kronos', 'use_ssl')
    except Exception:
        pass

    if not use_ssl:
        username = config_get('tracer-kronos', 'username')
        password = config_get('tracer-kronos', 'password')

    excluded_usrdns = set(config_get('tracer-kronos', 'excluded_usrdns').split(','))

    conns = []
    for broker in brokers_resolved:
        if not use_ssl:
            conns.append(Connection(host_and_ports=[(broker, config_get_int('tracer-kronos', 'port'))],
                                    use_ssl=False,
                                    reconnect_attempts_max=config_get_int('tracer-kronos', 'reconnect_attempts')))
        else:
            conns.append(Connection(host_and_ports=[(broker, config_get_int('tracer-kronos', 'port'))],
                                    use_ssl=True,
                                    ssl_key_file=config_get('tracer-kronos', 'ssl_key_file'),
                                    ssl_cert_file=config_get('tracer-kronos', 'ssl_cert_file'),
                                    ssl_version=PROTOCOL_TLSv1,
                                    reconnect_attempts_max=config_get_int('tracer-kronos', 'reconnect_attempts')))
    logging.info('(kronos_file) tracer consumer started')

    sanity_check(executable='kronos-file', hostname=hostname)
    while not graceful_stop.is_set():
        start_time = time()
        live(executable='kronos-file', hostname=hostname, pid=pid, thread=thread)
        for conn in conns:
            if not conn.is_connected():
                logging.info('(kronos_file) connecting to %s' % conn.transport._Transport__host_and_ports[0][0])
                record_counter('daemons.tracer.kronos.reconnect.%s' % conn.transport._Transport__host_and_ports[0][0].split('.')[0])

                conn.set_listener('rucio-tracer-kronos', AMQConsumer(broker=conn.transport._Transport__host_and_ports[0],
                                                                     conn=conn,
                                                                     queue=config_get('tracer-kronos', 'queue'),
                                                                     chunksize=chunksize,
                                                                     subscription_id=subscription_id,
                                                                     excluded_usrdns=excluded_usrdns,
                                                                     dataset_queue=dataset_queue,
                                                                     bad_files_patterns=bad_files_patterns))
                conn.start()
                if not use_ssl:
                    conn.connect(username, password)
                else:
                    conn.connect()
                conn.subscribe(destination=config_get('tracer-kronos', 'queue'),
                               ack='client-individual',
                               id=subscription_id,
                               headers={'activemq.prefetchSize': prefetch_size})
        tottime = time() - start_time
        if tottime < sleep_time:
            logging.info('(kronos_file) Will sleep for %s seconds' % (sleep_time - tottime))
            sleep(sleep_time - tottime)

    logging.info('(kronos_file) graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    die(executable='kronos-file', hostname=hostname, pid=pid, thread=thread)
    logging.info('(kronos_file) graceful stop done')
def reaper(rses, chunk_size=100, once=False, scheme=None, sleep_time=300): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param sleep_time: Thread sleep time after each chunk of work. """ pid = os.getpid() thread = threading.current_thread() hostname = socket.gethostname() executable = ' '.join(sys.argv) hash_executable = hashlib.sha256( (sys.argv[0] + ''.join(rses)).encode()).hexdigest() sanity_check(executable=None, hostname=hostname) # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Starting Dark Reaper on RSEs: %s', ', '.join(rses)) if not once: logger(logging.INFO, 'Waiting for heartbeat synchonization') GRACEFUL_STOP.wait( 10 ) # To prevent running on the same partition if all the reapers restart at the same time while not GRACEFUL_STOP.is_set(): try: heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) prepend_str = 'dark-reaper [%i/%i] : ' % ( heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals())) nothing_to_do = True start_time = time.time() rses_to_process = list( set(rses) & set(list_rses_with_quarantined_replicas())) random.shuffle(rses_to_process) for rse_id in rses_to_process: # The following query returns the list of real replicas (deleted_replicas) and list of dark replicas (dark_replicas) # Real replicas can be directly removed from the quarantine table deleted_replicas, dark_replicas = list_quarantined_replicas( rse_id=rse_id, limit=chunk_size, worker_number=heartbeat['assign_thread'], total_workers=heartbeat['nr_threads']) rse_info = rsemgr.get_rse_info(rse_id=rse_id) rse = rse_info['rse'] prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme) heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) prepend_str = 'dark-reaper [%i/%i] : ' % ( heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') try: prot.connect() for replica in dark_replicas: nothing_to_do = False scope = '' if replica['scope']: scope = replica['scope'].external try: pfn = str( list( rsemgr.lfns2pfns( rse_settings=rse_info, lfns=[{ 'scope': scope, 'name': replica['name'], 'path': replica['path'] }], operation='delete', scheme=scheme).values())[0]) logger(logging.INFO, 'Deletion ATTEMPT of %s:%s as %s on %s', scope, replica['name'], pfn, rse) start = time.time() prot.delete(pfn) duration = time.time() - start logger( logging.INFO, 'Deletion SUCCESS of %s:%s as %s on %s in %s seconds', scope, replica['name'], pfn, rse, duration) payload = { 'scope': scope, 'name': replica['name'], 'rse': rse, 'rse_id': rse_id, 'file-size': replica.get('bytes') or 0, 'bytes': replica.get('bytes') or 0, 'url': pfn, 'duration': duration, 'protocol': prot.attributes['scheme'] } if replica['scope'].vo != 'def': payload['vo'] = replica['scope'].vo add_message('deletion-done', payload) deleted_replicas.append(replica) except 
SourceNotFound: err_msg = ( 'Deletion NOTFOUND of %s:%s as %s on %s' % (scope, replica['name'], pfn, rse)) logger(logging.WARNING, err_msg) deleted_replicas.append(replica) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: err_msg = ( 'Deletion NOACCESS of %s:%s as %s on %s: %s' % (scope, replica['name'], pfn, rse, str(error))) logger(logging.WARNING, err_msg) payload = { 'scope': scope, 'name': replica['name'], 'rse': rse, 'rse_id': rse_id, 'file-size': replica['bytes'] or 0, 'bytes': replica['bytes'] or 0, 'url': pfn, 'reason': str(error), 'protocol': prot.attributes['scheme'] } if replica['scope'].vo != 'def': payload['vo'] = replica['scope'].vo add_message('deletion-failed', payload) except Exception: logging.critical(traceback.format_exc()) finally: prot.close() delete_quarantined_replicas(rse_id=rse_id, replicas=deleted_replicas) if once: break if once: break if nothing_to_do: logger(logging.INFO, 'Nothing to do') daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP) except DatabaseException as error: logging.warning('Reaper: %s', str(error)) except Exception: logging.critical(traceback.format_exc()) die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) logging.info('Graceful stop requested') logging.info('Graceful stop done') return
def submitter(once=False, rses=None, partition_wait_time=10, bulk=100, group_bulk=1, group_policy='rule', source_strategy=None, activities=None, sleep_time=600, max_sources=4, retry_other_fts=False, filter_transfertool=FILTER_TRANSFERTOOL, transfertool=TRANSFER_TOOL, transfertype=TRANSFER_TYPE): """ Main loop to submit a new transfer primitive to a transfertool. """ try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = None try: failover_scheme = config_get('conveyor', 'failover_scheme') except NoOptionError: failover_scheme = None try: timeout = config_get('conveyor', 'submit_timeout') timeout = float(timeout) except NoOptionError: timeout = None try: bring_online = config_get('conveyor', 'bring_online') except NoOptionError: bring_online = 43200 try: max_time_in_queue = {} timelife_conf = config_get('conveyor', 'max_time_in_queue') timelife_confs = timelife_conf.split(",") for conf in timelife_confs: act, timelife = conf.split(":") max_time_in_queue[act.strip()] = int(timelife.strip()) except NoOptionError: max_time_in_queue = {} if 'default' not in max_time_in_queue: max_time_in_queue['default'] = 168 logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue) activity_next_exe_time = defaultdict(time.time) executable = "conveyor-submitter" if activities: activities.sort() executable += '--activities ' + str(activities) if filter_transfertool: executable += ' --filter-transfertool ' + filter_transfertool hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Submitter starting with timeout %s', timeout) if partition_wait_time: time.sleep(partition_wait_time) # To prevent running on the same partition if all the poller restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Transfer submitter started') while not graceful_stop.is_set(): if activities is None: activities = [None] if rses: rse_ids = [rse['id'] for rse in rses] else: rse_ids = None for activity in activities: try: if activity_next_exe_time[activity] > time.time(): graceful_stop.wait(1) continue heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') user_transfer = False if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']: logger(logging.INFO, 'CMS user transfer activity') user_transfer = True logger(logging.INFO, 'Starting to get transfer transfers for %s', activity) start_time = time.time() transfers = __get_transfers(total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], failover_schemes=failover_scheme, limit=bulk, activity=activity, rses=rse_ids, schemes=scheme, max_sources=max_sources, bring_online=bring_online, retry_other_fts=retry_other_fts, transfertool=filter_transfertool, logger=logger) record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if 
transfers else 1)) record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers)) GET_TRANSFERS_COUNTER.inc(len(transfers)) record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers)) logger(logging.INFO, 'Got %s transfers for %s in %s seconds', len(transfers), activity, time.time() - start_time) # group transfers logger(logging.INFO, 'Starting to group transfers for %s', activity) start_time = time.time() grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue, group_by_scope=user_transfer) record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1)) logger(logging.INFO, 'Starting to submit transfers for %s', activity) if transfertool in ['fts3', 'mock']: for external_host in grouped_jobs: if not user_transfer: for job in grouped_jobs[external_host]: # submit transfers submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', timeout=timeout, logger=logger, transfertool=transfertool) else: for _, jobs in iteritems(grouped_jobs[external_host]): # submit transfers for job in jobs: submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter', timeout=timeout, user_transfer_job=user_transfer, logger=logger, transfertool=transfertool) elif transfertool == 'globus': if transfertype == 'bulk': # build bulk job file list per external host to send to submit_transfer for external_host in grouped_jobs: # pad the job with job_params; irrelevant for globus but needed for further rucio parsing submitjob = {'files': [], 'job_params': grouped_jobs[''][0].get('job_params')} for job in grouped_jobs[external_host]: submitjob.get('files').append(job.get('files')[0]) logger(logging.DEBUG, 'submitjob: %s' % submitjob) submit_transfer(external_host=external_host, job=submitjob, submitter='transfer_submitter', timeout=timeout, logger=logger, transfertool=transfertool) else: # build single job files and individually send to submit_transfer job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs else None for external_host in grouped_jobs: for job in grouped_jobs[external_host]: for file in job['files']: singlejob = {'files': [file], 'job_params': job_params} logger(logging.DEBUG, 'singlejob: %s' % singlejob) submit_transfer(external_host=external_host, job=singlejob, submitter='transfer_submitter', timeout=timeout, logger=logger, transfertool=transfertool) else: logger(logging.ERROR, 'Unknown transfer tool') if len(transfers) < group_bulk: logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', len(transfers), activity, group_bulk, sleep_time) if activity_next_exe_time[activity] < time.time(): activity_next_exe_time[activity] = time.time() + sleep_time except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) if once: break logger(logging.INFO, 'Graceful stop requested') heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop done') return
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, delay_seconds=0): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param worker_number: The worker number. :param child_number: The child number. :param total_children: The total number of children created per worker. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. """ logging.info('Starting Reaper: Worker %(worker_number)s, ' 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses])) pid = os.getpid() thread = threading.current_thread() hostname = socket.gethostname() executable = ' '.join(sys.argv) # Generate a hash just for the subset of RSEs rse_names = [rse['rse'] for rse in rses] hash_executable = hashlib.sha256(sys.argv[0] + ''.join(rse_names)).hexdigest() sanity_check(executable=None, hostname=hostname) nothing_to_do = {} while not GRACEFUL_STOP.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) checkpoint_time = datetime.datetime.now() # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals())) max_deleting_rate = 0 for rse in sort_rses(rses): try: if checkpoint_time + datetime.timedelta( minutes=1) < datetime.datetime.now(): heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals())) checkpoint_time = datetime.datetime.now() if rse['id'] in nothing_to_do and nothing_to_do[ rse['id']] > datetime.datetime.now(): continue logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number, rse['rse'], nothing_to_do.get(rse['id'])) rse_info = rsemgr.get_rse_info(rse['rse']) rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id']) if not rse_protocol['availability_delete']: logging.info( 'Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse']) nothing_to_do[rse['id']] = datetime.datetime.now( ) + datetime.timedelta(minutes=30) continue # Temporary hack to force gfal for deletion for protocol in rse_info['protocols']: if protocol[ 'impl'] == 'rucio.rse.protocols.srm.Default' or protocol[ 'impl'] == 'rucio.rse.protocols.gsiftp.Default': protocol[ 'impl'] = 'rucio.rse.protocols.gfal.Default' if protocol[ 'impl'] == 'rucio.rse.protocols.signeds3.Default': protocol[ 'impl'] = 'rucio.rse.protocols.s3es.Default' needed_free_space, max_being_deleted_files = None, 100 needed_free_space_per_child = None if not greedy: max_being_deleted_files, needed_free_space, used, free = __check_rse_usage( rse_id=rse['id']) logging.info( 'Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals()) if needed_free_space <= 0: needed_free_space, needed_free_space_per_child = 0, 0 logging.info( 'Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse']) 
else: if total_children and total_children > 0: needed_free_space_per_child = needed_free_space / float( total_children) start = time.time() with monitor.record_timer_block( 'reaper.list_unlocked_replicas'): replicas = list_unlocked_replicas( rse_id=rse['id'], bytes=needed_free_space_per_child, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds) logging.debug( 'Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas)) if not replicas: nothing_to_do[rse['id']] = datetime.datetime.now( ) + datetime.timedelta(minutes=30) logging.info( 'Reaper %s-%s: No replicas to delete %s. The next check will occur at %s', worker_number, child_number, rse['rse'], nothing_to_do[rse['id']]) continue prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme) for files in chunks(replicas, chunk_size): logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files)) try: update_replicas_states(replicas=[ dict(replica.items() + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files ], nowait=True) for replica in files: try: replica['pfn'] = str( rsemgr.lfns2pfns( rse_settings=rse_info, lfns=[{ 'scope': replica['scope'], 'name': replica['name'], 'path': replica['path'] }], operation='delete', scheme=scheme).values()[0]) except (ReplicaUnAvailable, ReplicaNotFound) as error: err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % ( replica['scope'], replica['name'], rse['rse'], str(error)) logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg) replica['pfn'] = None monitor.record_counter( counters='reaper.deletion.being_deleted', delta=len(files)) try: deleted_files = [] prot.connect() for replica in files: try: logging.info( 'Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) start = time.time() if rse['staging_area'] or rse[ 'rse'].endswith("STAGING"): logging.warning( 'Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) else: if replica['pfn']: pfn = replica['pfn'] # sign the URL if necessary if prot.attributes[ 'scheme'] == 'https' and rse_info[ 'sign_url'] is not None: pfn = get_signed_url( rse_info['sign_url'], 'delete', pfn) prot.delete(pfn) else: logging.warning( 'Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) monitor.record_timer( 'daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000) duration = time.time() - start deleted_files.append({ 'scope': replica['scope'], 'name': replica['name'] }) add_message( 'deletion-done', { 'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'duration': duration, 'protocol': prot.attributes['scheme'] }) logging.info( 'Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration) except SourceNotFound: err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as 
%s on %s' % ( worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) logging.warning(err_msg) deleted_files.append({ 'scope': replica['scope'], 'name': replica['name'] }) if replica[ 'state'] == ReplicaState.AVAILABLE: add_message( 'deletion-failed', { 'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'reason': str(err_msg), 'protocol': prot.attributes['scheme'] }) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable ) as error: logging.warning( 'Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error)) add_message( 'deletion-failed', { 'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'reason': str(error), 'protocol': prot.attributes['scheme'] }) except Exception as error: logging.critical( 'Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())) add_message( 'deletion-failed', { 'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'reason': str(error), 'protocol': prot.attributes['scheme'] }) except: logging.critical( 'Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: for replica in files: logging.warning( 'Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error)) add_message( 'deletion-failed', { 'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'reason': str(error), 'protocol': prot.attributes['scheme'] }) break finally: prot.close() start = time.time() with monitor.record_timer_block( 'reaper.delete_replicas'): delete_replicas(rse_id=rse['id'], files=deleted_files) logging.debug( 'Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start) monitor.record_counter( counters='reaper.deletion.done', delta=len(deleted_files)) except DatabaseException as error: logging.warning( 'Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error)) except UnsupportedOperation as error: logging.warning( 'Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error)) except: logging.critical(traceback.format_exc()) except RSENotFound as error: logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error)) except: logging.critical(traceback.format_exc()) if once: break time.sleep(1) except DatabaseException as error: logging.warning('Reaper: %s', str(error)) except: logging.critical(traceback.format_exc()) die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) logging.info('Graceful stop requested') 
logging.info('Graceful stop done') return
def deliver_emails(once=False, send_email=True, thread=0, bulk=1000, delay=10):
    '''
    Main loop to deliver emails via SMTP.
    '''
    logging.info('[email] starting - threads (%i) bulk (%i)', thread, bulk)

    executable = 'hermes [email]'
    hostname = socket.getfqdn()
    pid = os.getpid()
    heartbeat_thread = threading.current_thread()

    sanity_check(executable=executable, hostname=hostname)

    # Make an initial heartbeat so that all daemons have the correct worker number on the next try
    live(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread)
    GRACEFUL_STOP.wait(1)

    email_from = config_get('messaging-hermes', 'email_from')

    while not GRACEFUL_STOP.is_set():
        heartbeat = live(executable, hostname, pid, heartbeat_thread)
        logging.debug('[email] %i:%i - bulk %i', heartbeat['assign_thread'], heartbeat['nr_threads'], bulk)

        t_start = time.time()

        messages = retrieve_messages(bulk=bulk,
                                     thread=heartbeat['assign_thread'],
                                     total_threads=heartbeat['nr_threads'],
                                     event_type='email')

        if messages != []:
            to_delete = []
            for message in messages:
                logging.debug('[email] %i:%i - submitting: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message))

                msg = MIMEText(message['payload']['body'].encode('utf-8'))
                msg['From'] = email_from
                msg['To'] = ', '.join(message['payload']['to'])
                msg['Subject'] = message['payload']['subject'].encode('utf-8')

                if send_email:
                    smtp = smtplib.SMTP()
                    smtp.connect()
                    smtp.sendmail(msg['From'], message['payload']['to'], msg.as_string())
                    smtp.quit()

                to_delete.append({'id': message['id'],
                                  'created_at': message['created_at'],
                                  'updated_at': message['created_at'],
                                  'payload': str(message['payload']),
                                  'event_type': 'email'})

                logging.debug('[email] %i:%i - submitting done: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['id']))

            delete_messages(to_delete)
            logging.info('[email] %i:%i - submitted %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(to_delete))

        if once:
            break

        t_delay = delay - (time.time() - t_start)
        t_delay = t_delay if t_delay > 0 else 0
        if t_delay:
            logging.debug('[email] %i:%i - sleeping %s seconds', heartbeat['assign_thread'], heartbeat['nr_threads'], t_delay)
            time.sleep(t_delay)

    logging.debug('[email] %i:%i - graceful stop requested', heartbeat['assign_thread'], heartbeat['nr_threads'])
    die(executable, hostname, pid, heartbeat_thread)
    logging.debug('[email] %i:%i - graceful stop done', heartbeat['assign_thread'], heartbeat['nr_threads'])
def actions_loop(once, scope, rses, sleep_time, dark_min_age, dark_threshold_percent, miss_threshold_percent, force_proceed, scanner_files_path):
    """
    Main loop to apply the CC actions
    """
    hostname = socket.gethostname()
    pid = os.getpid()
    current_thread = threading.current_thread()

    executable = 'storage-consistency-actions'
    # Make an initial heartbeat so that all storage-consistency-actions
    # have the correct worker number on the next try
    heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
    prefix = 'storage-consistency-actions[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'hostname: %s pid: %d current_thread: %s' % (hostname, pid, current_thread))

    graceful_stop.wait(1)

    while not graceful_stop.is_set():
        try:
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
            logger(logging.INFO, 'heartbeat? %s' % heartbeat)

            prefix = 'storage-consistency-actions[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger(logging.INFO, 'prefix: %s' % prefix)

            start = time.time()
            logger(logging.DEBUG, 'Start time: %f' % start)

            deckard_loop(scope, rses, dark_min_age, dark_threshold_percent, miss_threshold_percent, force_proceed, scanner_files_path)
            daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)

        except Exception as e:
            traceback.print_exc()
            logger(logging.WARNING, 'Something went wrong here... %s (%s)' % (e, e.__class__.__name__))

        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def deliver_messages(once=False, brokers_resolved=None, thread=0, bulk=1000, delay=10, broker_timeout=3, broker_retry=3): ''' Main loop to deliver messages to a broker. ''' logging.info('[broker] starting - threads (%i) bulk (%i)', thread, bulk) if not brokers_resolved: logging.fatal('No brokers resolved.') return logging.info('[broker] checking authentication method') use_ssl = True try: use_ssl = config_get_bool('messaging-hermes', 'use_ssl') except: logging.info( '[broker] could not find use_ssl in configuration -- please update your rucio.cfg' ) port = config_get_int('messaging-hermes', 'port') vhost = config_get('messaging-hermes', 'broker_virtual_host', raise_exception=False) if not use_ssl: username = config_get('messaging-hermes', 'username') password = config_get('messaging-hermes', 'password') port = config_get_int('messaging-hermes', 'nonssl_port') conns = [] for broker in brokers_resolved: if not use_ssl: logging.info( '[broker] setting up username/password authentication: %s' % broker) con = stomp.Connection12(host_and_ports=[(broker, port)], vhost=vhost, keepalive=True, timeout=broker_timeout) else: logging.info( '[broker] setting up ssl cert/key authentication: %s' % broker) con = stomp.Connection12( host_and_ports=[(broker, port)], use_ssl=True, ssl_key_file=config_get('messaging-hermes', 'ssl_key_file'), ssl_cert_file=config_get('messaging-hermes', 'ssl_cert_file'), vhost=vhost, keepalive=True, timeout=broker_timeout) con.set_listener( 'rucio-hermes', HermesListener(con.transport._Transport__host_and_ports[0])) conns.append(con) destination = config_get('messaging-hermes', 'destination') executable = 'hermes [broker]' hostname = socket.getfqdn() pid = os.getpid() heartbeat_thread = threading.current_thread() # Make an initial heartbeat so that all daemons have the correct worker number on the next try sanity_check(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread) GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: t_start = time.time() heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=heartbeat_thread) logging.debug('[broker] %i:%i - using: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], [ conn.transport._Transport__host_and_ports[0][0] for conn in conns ]) messages = retrieve_messages(bulk=bulk, thread=heartbeat['assign_thread'], total_threads=heartbeat['nr_threads']) if messages: logging.debug('[broker] %i:%i - retrieved %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(messages)) to_delete = [] for message in messages: try: conn = random.sample(conns, 1)[0] if not conn.is_connected(): host_and_ports = conn.transport._Transport__host_and_ports[ 0][0] record_counter('daemons.hermes.reconnect.%s' % host_and_ports.split('.')[0]) conn.start() if not use_ssl: logging.info( '[broker] %i:%i - connecting with USERPASS to %s', heartbeat['assign_thread'], heartbeat['nr_threads'], host_and_ports) conn.connect(username, password, wait=True) else: logging.info( '[broker] %i:%i - connecting with SSL to %s', heartbeat['assign_thread'], heartbeat['nr_threads'], host_and_ports) conn.connect(wait=True) conn.send(body=json.dumps({ 'event_type': str(message['event_type']).lower(), 'payload': message['payload'], 'created_at': str(message['created_at']) }), destination=destination, headers={ 'persistent': 'true', 'event_type': str(message['event_type']).lower() }) to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': 
json.dumps(message['payload']), 'event_type': message['event_type'] }) except ValueError: logging.warn('Cannot serialize payload to JSON: %s', str(message['payload'])) to_delete.append({ 'id': message['id'], 'created_at': message['created_at'], 'updated_at': message['created_at'], 'payload': str(message['payload']), 'event_type': message['event_type'] }) continue except stomp.exception.NotConnectedException as error: logging.warn( 'Could not deliver message due to NotConnectedException: %s', str(error)) continue except stomp.exception.ConnectFailedException as error: logging.warn( 'Could not deliver message due to ConnectFailedException: %s', str(error)) continue except Exception as error: logging.warn('Could not deliver message: %s', str(error)) logging.critical(traceback.format_exc()) continue if str(message['event_type']).lower().startswith( 'transfer') or str(message['event_type']).lower( ).startswith('stagein'): logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, request-id: %s, transfer-id: %s, created_at: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload'].get('scope', None), message['payload'].get('name', None), message['payload'].get('dst-rse', None), message['payload'].get('request-id', None), message['payload'].get('transfer-id', None), str(message['created_at'])) elif str(message['event_type']).lower().startswith( 'dataset'): logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, rule-id: %s, created_at: %s)', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload']['scope'], message['payload']['name'], message['payload']['rse'], message['payload']['rule_id'], str(message['created_at'])) elif str(message['event_type']).lower().startswith( 'deletion'): if 'url' not in message['payload']: message['payload']['url'] = 'unknown' logging.debug( '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, url: %s, created_at: %s)', heartbeat['assign_thread'], heartbeat['nr_threads'], str(message['event_type']).lower(), message['payload']['scope'], message['payload']['name'], message['payload']['rse'], message['payload']['url'], str(message['created_at'])) else: logging.debug('[broker] %i:%i - other message: %s', heartbeat['assign_thread'], heartbeat['nr_threads'], message) delete_messages(to_delete) logging.info('[broker] %i:%i - submitted %i messages', heartbeat['assign_thread'], heartbeat['nr_threads'], len(to_delete)) if once: break except NoResultFound: # silence this error: https://its.cern.ch/jira/browse/RUCIO-1699 pass except: logging.critical(traceback.format_exc()) t_delay = delay - (time.time() - t_start) t_delay = t_delay if t_delay > 0 else 0 if t_delay: logging.debug('[broker] %i:%i - sleeping %s seconds', heartbeat['assign_thread'], heartbeat['nr_threads'], t_delay) time.sleep(t_delay) for conn in conns: try: conn.disconnect() except Exception: pass logging.debug('[broker] %i:%i - graceful stop requested', heartbeat['assign_thread'], heartbeat['nr_threads']) die(executable, hostname, pid, heartbeat_thread) logging.debug('[broker] %i:%i - graceful stop done', heartbeat['assign_thread'], heartbeat['nr_threads'])
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, rse_expression='MOCK', max_replicas_per_rse=100): """ Main loop to check for available replicas which are labeled as suspicious Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table and available on other RSE. Finds surls of these replicas and declares them as bad. :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped. :param younger_than: The number of days since which bad_replicas table will be searched for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'), but 'AVAILABLE' on other RSE(s). :param nattempts: The minimum number of appearances in the bad_replica DB table in order to appear in the resulting list of replicas for recovery. :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'. :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE. If more is found, processing is skipped and warning is printed. :returns: None """ # assembling the worker name identifier ('executable') including the rses from <rse_expression> # in order to have the possibility to detect a start of a second instance with the same set of RSES executable = argv[0] rses = [] for rse in parse_expression(expression=rse_expression): rses.append(rse['id']) rses.sort() executable += ' --rse-expression ' + str(rses) sanity_check(executable=executable, hostname=socket.gethostname()) # make an initial heartbeat - expected only one replica-recoverer thread on one node # heartbeat mechanism is used in this daemon only for information purposes # (due to expected low load, the actual DB query does not filter the result based on worker number) live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) # wait a moment in case all workers started at the same time GRACEFUL_STOP.wait(1) while not GRACEFUL_STOP.is_set(): try: # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon) heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) total_workers = heartbeat['nr_threads'] worker_number = heartbeat['assign_thread'] # there is only 1 worker allowed for this daemon if total_workers != 1: logging.error( 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname()) die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) break start = time.time() logging.info( 'replica_recoverer[%i/%i]: ready to query replicas at RSE %s,' + ' reported suspicious in the last %i days at least %i times which are available on other RSEs.', # NOQA: W503 worker_number, total_workers, rse_expression, younger_than, nattempts) getfileskwargs = { 'younger_than': younger_than, 'nattempts': nattempts, 'exclude_states': ['B', 'R', 'D', 'L', 'T'], 'available_elsewhere': True, 'is_suspicious': True } recoverable_replicas = get_suspicious_files( rse_expression, **getfileskwargs) logging.info( 'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.', worker_number, total_workers, time.time() - start, len(recoverable_replicas)) if not recoverable_replicas and not once: logging.info( 'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas. 
Sleeping for 60 seconds.', worker_number, total_workers, len(recoverable_replicas)) GRACEFUL_STOP.wait(60) else: logging.info( 'replica_recoverer[%i/%i]: looking for replica surls.', worker_number, total_workers) start = time.time() surls_to_recover = { } # dictionary of { vo1: {rse1: [surl1, surl2, ... ], rse2: ...}, vo2:... } cnt_surl_not_found = 0 for replica in recoverable_replicas: scope = replica['scope'] name = replica['name'] vo = scope.vo rse = replica['rse'] rse_id = replica['rse_id'] if GRACEFUL_STOP.is_set(): break if vo not in surls_to_recover: surls_to_recover[vo] = {} if rse_id not in surls_to_recover[vo]: surls_to_recover[vo][rse_id] = [] # for each suspicious replica, we get its surl through the list_replicas function surl_not_found = True for rep in list_replicas([{'scope': scope, 'name': name}]): for site in rep['rses']: if site == rse_id: surls_to_recover[vo][rse_id].append( rep['rses'][site][0]) surl_not_found = False if surl_not_found: cnt_surl_not_found += 1 logging.warning( 'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.', worker_number, total_workers, name, rse) logging.info( 'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.', worker_number, total_workers, len(recoverable_replicas) - cnt_surl_not_found, len(recoverable_replicas), time.time() - start) for vo in surls_to_recover: for rse_id in surls_to_recover[vo]: logging.info( 'replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on %s: %s.', worker_number, total_workers, len(surls_to_recover[vo][rse_id]), rse, str(surls_to_recover[vo][rse_id])) if len(surls_to_recover[vo] [rse_id]) > max_replicas_per_rse: logging.warning( 'replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on %s. Please investigate.', worker_number, total_workers, max_replicas_per_rse, str(len(surls_to_recover[vo][rse_id])), rse) else: declare_bad_file_replicas( pfns=surls_to_recover[vo][rse_id], reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), status=BadFilesStatus.BAD, session=None) logging.info( 'replica_recoverer[%i/%i]: finished declaring bad replicas on %s.', worker_number, total_workers, rse) except (DatabaseException, DatabaseError) as err: if match('.*QueuePool.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('replica.recoverer.exceptions.%s', err.__class__.__name__) elif match('.*ORA-03135.*', str(err.args[0])): logging.warning(traceback.format_exc()) record_counter('replica.recoverer.exceptions.%s', err.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('replica.recoverer.exceptions.%s', err.__class__.__name__) except Exception as err: logging.critical(traceback.format_exc()) record_counter('replica.recoverer.exceptions.%s', err.__class__.__name__) if once: break die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread()) logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number, total_workers)
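# Illustrative sketch (hypothetical helper, not part of the daemon above): how the
# surls_to_recover mapping {vo: {rse_id: [surl, ...]}} can be assembled from flat
# (vo, rse_id, surl) tuples, and how the max_replicas_per_rse cap can be checked.
# The daemon above only warns and skips an RSE that exceeds the cap.
from collections import defaultdict


def group_surls(entries, max_replicas_per_rse=100):
    surls_to_recover = defaultdict(lambda: defaultdict(list))
    for vo, rse_id, surl in entries:
        surls_to_recover[vo][rse_id].append(surl)
    over_cap = [(vo, rse_id) for vo in surls_to_recover for rse_id in surls_to_recover[vo]
                if len(surls_to_recover[vo][rse_id]) > max_replicas_per_rse]
    return surls_to_recover, over_cap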
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, scope='tests', once=False, dataset_lifetime=None, set_metadata=False): sleep(sleep_time * (total_workers - worker_number) / total_workers) executable = 'automatix' hostname = socket.getfqdn() pid = getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers) while not GRACEFUL_STOP.is_set(): heartbeat.live(executable, hostname, pid, hb_thread) starttime = time() prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers) logging.info('%s Getting data distribution', prepend_str) probabilities, data = get_data_distribution(inputfile) logging.debug('%s Probabilities %s', prepend_str, probabilities) totretries = 3 status = False for site in sites: for retry in range(0, totretries): start_time = time() tmpdir = tempfile.mkdtemp() logging.info('%s Running on site %s', prepend_str, site) dic = choose_element(probabilities, data) metadata = dic['metadata'] try: nbfiles = dic['nbfiles'] except KeyError: nbfiles = 2 logging.warning('%s No nbfiles defined in the configuration, will use 2', prepend_str) try: filesize = dic['filesize'] except KeyError: filesize = 1000000 logging.warning('%s No filesize defined in the configuration, will use 1M files', prepend_str) dsn = generate_didname(metadata, None, 'dataset') fnames = [] lfns = [] physical_fnames = [] for _ in range(nbfiles): fname = generate_didname(metadata=metadata, dsn=dsn, did_type='file') lfns.append(fname) logging.info('%s Generating file %s in dataset %s', prepend_str, fname, dsn) physical_fname = '%s/%s' % (tmpdir, "".join(fname.split('/'))) physical_fnames.append(physical_fname) generate_file(physical_fname, filesize) fnames.append(fname) logging.info('%s Upload %s to %s', prepend_str, dsn, site) dsn = '%s:%s' % (scope, dsn) status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, worker_number=worker_number, total_workers=total_workers, dataset_lifetime=dataset_lifetime, did=dsn, set_metadata=set_metadata) for physical_fname in physical_fnames: remove(physical_fname) rmdir(tmpdir) if status: monitor.record_counter(counters='automatix.addnewdataset.done', delta=1) monitor.record_counter(counters='automatix.addnewfile.done', delta=nbfiles) monitor.record_timer('automatix.datasetinjection', (time() - start_time) * 1000) break else: logging.info('%s Failed to upload files. Will retry another time (attempt %s/%s)', prepend_str, str(retry + 1), str(totretries)) if once is True: logging.info('%s Run with once mode. Exiting', prepend_str) break tottime = time() - starttime if status: logging.info('%s It took %s seconds to upload one dataset on %s', prepend_str, str(tottime), str(sites)) if sleep_time > tottime: logging.info('%s Will sleep for %s seconds', prepend_str, str(sleep_time - tottime)) sleep(sleep_time - tottime) else: logging.info('%s Retrying a new upload', prepend_str) heartbeat.die(executable, hostname, pid, hb_thread) logging.info('%s Graceful stop requested', prepend_str) logging.info('%s Graceful stop done', prepend_str)
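# Illustrative sketch of a probability-weighted chooser in the spirit of
# choose_element(probabilities, data) used above. Here `probabilities` is assumed
# to map an entry name to a cumulative upper bound in [0, 1]; the real helper may
# organise its input differently, this only conveys the idea.
import random


def choose_element_sketch(probabilities, data):
    draw = random.uniform(0, 1)
    for key in sorted(probabilities, key=probabilities.get):
        if draw <= probabilities[key]:
            return data[key]
    return data[max(probabilities, key=probabilities.get)]

# Example: 70% 'small' datasets, 30% 'large' ones.
# choose_element_sketch({'small': 0.7, 'large': 1.0},
#                       {'small': {'nbfiles': 2, 'filesize': 1000000},
#                        'large': {'nbfiles': 10, 'filesize': 10000000}})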
def submitter(once=False, rses=[], mock=False, process=0, total_processes=1, total_threads=1, bulk=100, group_bulk=1, group_policy='rule', fts_source_strategy='auto', activities=None, sleep_time=600, max_sources=4, retry_other_fts=False): """ Main loop to submit a new transfer primitive to a transfertool. """ logging.info('Transfer submitter starting - process (%i/%i) threads (%i)' % (process, total_processes, total_threads)) try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = None try: failover_scheme = config_get('conveyor', 'failover_scheme') except NoOptionError: failover_scheme = None try: timeout = config_get('conveyor', 'submit_timeout') timeout = float(timeout) except NoOptionError: timeout = None try: bring_online = config_get('conveyor', 'bring_online') except NoOptionError: bring_online = 43200 try: max_time_in_queue = {} timelife_conf = config_get('conveyor', 'max_time_in_queue') timelife_confs = timelife_conf.split(",") for conf in timelife_confs: act, timelife = conf.split(":") max_time_in_queue[act.strip()] = int(timelife.strip()) except NoOptionError: max_time_in_queue = {} if 'default' not in max_time_in_queue: max_time_in_queue['default'] = 168 logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue) executable = ' '.join(sys.argv) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) hb = heartbeat.live(executable, hostname, pid, hb_thread) logging.info('Transfer submitter started - process (%i/%i) threads (%i/%i) timeout (%s)' % (process, total_processes, hb['assign_thread'], hb['nr_threads'], timeout)) threadPool = ThreadPool(total_threads) activity_next_exe_time = defaultdict(time.time) while not graceful_stop.is_set(): try: hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) if activities is None: activities = [None] if rses: rse_ids = [rse['id'] for rse in rses] else: rse_ids = None for activity in activities: if activity_next_exe_time[activity] > time.time(): graceful_stop.wait(1) continue logging.info("%s:%s Starting to get transfer transfers for %s" % (process, hb['assign_thread'], activity)) ts = time.time() transfers = __get_transfers(process=process, total_processes=total_processes, thread=hb['assign_thread'], total_threads=hb['nr_threads'], failover_schemes=failover_scheme, limit=bulk, activity=activity, rses=rse_ids, schemes=scheme, mock=mock, max_sources=max_sources, bring_online=bring_online, retry_other_fts=retry_other_fts) record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1)) record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers)) record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers)) logging.info("%s:%s Got %s transfers for %s" % (process, hb['assign_thread'], len(transfers), activity)) # group transfers logging.info("%s:%s Starting to group transfers for %s" % (process, hb['assign_thread'], activity)) ts = time.time() grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, fts_source_strategy, max_time_in_queue) record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1)) logging.info("%s:%s Starting to submit transfers for %s" % (process, hb['assign_thread'], activity)) for external_host in grouped_jobs: for job in 
grouped_jobs[external_host]: # submit transfers # job_requests = makeRequests(submit_transfer, args_list=[((external_host, job, 'transfer_submitter', process, thread), {})]) job_requests = makeRequests(submit_transfer, args_list=[((), {'external_host': external_host, 'job': job, 'submitter': 'transfer_submitter', 'process': process, 'thread': hb['assign_thread'], 'timeout': timeout})]) [threadPool.putRequest(job_req) for job_req in job_requests] threadPool.wait() if len(transfers) < group_bulk: logging.info('%i:%i - only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (process, hb['assign_thread'], len(transfers), activity, group_bulk, sleep_time)) if activity_next_exe_time[activity] < time.time(): activity_next_exe_time[activity] = time.time() + sleep_time except: logging.critical('%s:%s %s' % (process, hb['assign_thread'], traceback.format_exc())) if once: break logging.info('%s:%s graceful stop requested' % (process, hb['assign_thread'])) threadPool.dismissWorkers(total_threads, do_join=True) heartbeat.die(executable, hostname, pid, hb_thread) logging.info('%s:%s graceful stop done' % (process, hb['assign_thread'])) return
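# Illustrative sketch: parsing the 'conveyor'/'max_time_in_queue' option the way the
# submitter above does. The value is a comma-separated list of "activity:hours"
# pairs, and a 'default' of 168 hours is added when the option does not provide one.
def parse_max_time_in_queue(timelife_conf):
    max_time_in_queue = {}
    for conf in timelife_conf.split(','):
        activity, timelife = conf.split(':')
        max_time_in_queue[activity.strip()] = int(timelife.strip())
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    return max_time_in_queue

# parse_max_time_in_queue('User Subscriptions:96, Data rebalancing:24')
# -> {'User Subscriptions': 96, 'Data rebalancing': 24, 'default': 168}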
def rule_repairer(once=False): """ Main loop to check for STUCK replication rules """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() paused_rules = {} # {rule_id: datetime} # Make an initial heartbeat so that all judge-repairers have the correct worker number on the next try executable = 'judge-repairer' live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30) graceful_stop.wait(1) while not graceful_stop.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=60 * 30) start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] # Select a bunch of rules for this worker to repair rules = get_stuck_rules( total_workers=heartbeat['nr_threads'], worker_number=heartbeat['assign_thread'], delta=-1 if once else 1800, limit=100, blacklisted_rules=[key for key in paused_rules]) logging.debug( 'rule_repairer[%s/%s] index query time %f fetch size is %d' % (heartbeat['assign_thread'], heartbeat['nr_threads'], time.time() - start, len(rules))) if not rules and not once: logging.debug( 'rule_repairer[%s/%s] did not get any work (paused_rules=%s)' % (heartbeat['assign_thread'], heartbeat['nr_threads'], str(len(paused_rules)))) graceful_stop.wait(60) else: for rule_id in rules: rule_id = rule_id[0] logging.info('rule_repairer[%s/%s]: Repairing rule %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], rule_id)) if graceful_stop.is_set(): break try: start = time.time() repair_rule(rule_id=rule_id) logging.debug( 'rule_repairer[%s/%s]: repairing of %s took %f' % (heartbeat['assign_thread'], heartbeat['nr_threads'], rule_id, time.time() - start)) except (DatabaseException, DatabaseError) as e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow( ) + timedelta(seconds=randint(600, 2400)) logging.warning( 'rule_repairer[%s/%s]: Locks detected for %s' % (heartbeat['assign_thread'], heartbeat['nr_threads'], rule_id)) record_counter( 'rule.judge.exceptions.LocksDetected') elif match('.*QueuePool.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logging.error(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except (DatabaseException, DatabaseError) as e: if match('.*QueuePool.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logging.warning(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logging.critical(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except Exception as e: logging.critical(traceback.format_exc()) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) if once: break die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
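# Illustrative sketch of the pause/backoff bookkeeping used by the repairer above:
# a rule that hits a lock error (ORA-00054) is parked for a random interval and
# passed to get_stuck_rules() as blacklisted until the interval expires.
from datetime import datetime, timedelta
from random import randint


def pause_rule(paused_rules, rule_id, min_seconds=600, max_seconds=2400):
    paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(min_seconds, max_seconds))


def refresh_paused_rules(paused_rules):
    """Drop entries whose pause interval has expired; return the rule ids still paused."""
    now = datetime.utcnow()
    for rule_id in [key for key, until in paused_rules.items() if now > until]:
        del paused_rules[rule_id]
    return list(paused_rules)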
def rule_injector(once=False): """ Main loop to check for asynchronous creation of replication rules """ hostname = socket.gethostname() pid = os.getpid() current_thread = threading.current_thread() paused_rules = {} # {rule_id: datetime} # Make an initial heartbeat so that all judge-inectors have the correct worker number on the next try executable = 'judge-injector' heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60) prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') graceful_stop.wait(1) while not graceful_stop.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=current_thread, older_than=2 * 60 * 60) prefix = 'judge-injector[%i/%i] ' % (heartbeat['assign_thread'], heartbeat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') start = time.time() # Refresh paused rules iter_paused_rules = deepcopy(paused_rules) for key in iter_paused_rules: if datetime.utcnow() > paused_rules[key]: del paused_rules[key] rules = get_injected_rules(total_workers=heartbeat['nr_threads'], worker_number=heartbeat['assign_thread'], limit=100, blocked_rules=[key for key in paused_rules]) logger(logging.DEBUG, 'index query time %f fetch size is %d' % (time.time() - start, len(rules))) if not rules and not once: logger(logging.DEBUG, 'did not get any work (paused_rules=%s)' % str(len(paused_rules))) graceful_stop.wait(60) else: for rule in rules: rule_id = rule[0] logger(logging.INFO, 'Injecting rule %s' % rule_id) if graceful_stop.is_set(): break try: start = time.time() inject_rule(rule_id=rule_id, logger=logger) logger(logging.DEBUG, 'injection of %s took %f' % (rule_id, time.time() - start)) except (DatabaseException, DatabaseError) as e: if match('.*ORA-00054.*', str(e.args[0])): paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) record_counter('rule.judge.exceptions.LocksDetected') logger(logging.WARNING, 'Locks detected for %s' % rule_id) elif match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logger(logging.ERROR, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except (RSEWriteBlocked) as e: paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) logger(logging.WARNING, 'RSEWriteBlocked for rule %s' % rule_id) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except ReplicationRuleCreationTemporaryFailed as e: paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600)) logger(logging.WARNING, 'ReplicationRuleCreationTemporaryFailed for rule %s' % rule_id) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except RuleNotFound: pass except InsufficientAccountLimit: # A rule with InsufficientAccountLimit on injection hangs there potentially forever # It should be marked as SUSPENDED logger(logging.INFO, 'Marking rule %s as SUSPENDED due to InsufficientAccountLimit' % rule_id) update_rule(rule_id=rule_id, options={'state': 'SUSPENDED'}) except (DatabaseException, DatabaseError) as e: if match('.*QueuePool.*', str(e.args[0])): logger(logging.WARNING, 
'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) elif match('.*ORA-03135.*', str(e.args[0])): logger(logging.WARNING, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) else: logger(logging.CRITICAL, 'DatabaseException', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) except Exception as e: logger(logging.CRITICAL, 'Exception', exc_info=True) record_counter('rule.judge.exceptions.%s' % e.__class__.__name__) if once: break die(executable=executable, hostname=hostname, pid=pid, thread=current_thread)
def transmogrifier(bulk=5, once=False, sleep_time=60): """ Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash, identifies the subscriptions matching the DIDs and submit a replication rule for each DID matching a subscription. :param thread: Thread number at startup. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Time between two cycles. """ executable = ' '.join(argv) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) while not graceful_stop.is_set(): heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) dids, subscriptions = [], [] tottime = 0 prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) try: # Get the new DIDs based on the is_new flag for did in list_new_dids(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], chunk_size=bulk): dids.append({ 'scope': did['scope'], 'did_type': str(did['did_type']), 'name': did['name'] }) sub_dict = {3: []} # Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest # The priority is defined as 'policyid' for sub in list_subscriptions(None, None): if sub['state'] != SubscriptionState.INACTIVE and sub[ 'lifetime'] and (datetime.now() > sub['lifetime']): update_subscription( name=sub['name'], account=sub['account'], metadata={'state': SubscriptionState.INACTIVE}, issuer='root') elif sub['state'] in [ SubscriptionState.ACTIVE, SubscriptionState.UPDATED ]: priority = 3 if 'policyid' in sub: if int(sub['policyid']) not in sub_dict: sub_dict[int(sub['policyid'])] = [] priority = int(sub['policyid']) sub_dict[priority].append(sub) priorities = sub_dict.keys() priorities.sort() # Order the subscriptions according to their priority for priority in priorities: subscriptions.extend(sub_dict[priority]) except SubscriptionNotFound as error: logging.warning(prepend_str + 'No subscriptions defined: %s' % (str(error))) time.sleep(10) continue except Exception as error: logging.error( prepend_str + 'Failed to get list of new DIDs or subscriptions: %s' % (str(error))) try: results = {} start_time = time.time() blacklisted_rse_id = [ rse['id'] for rse in list_rses({'availability_write': False}) ] logging.debug(prepend_str + 'In transmogrifier worker') identifiers = [] # Loop over all the new dids for did in dids: did_success = True if did['did_type'] == str( DIDType.DATASET) or did['did_type'] == str( DIDType.CONTAINER): results['%s:%s' % (did['scope'], did['name'])] = [] try: metadata = get_metadata(did['scope'], did['name']) # Loop over all the subscriptions for subscription in subscriptions: # Check if the DID match the subscription if is_matching_subscription( subscription, did, metadata) is True: filter_string = loads(subscription['filter']) split_rule = filter_string.get( 'split_rule', False) if split_rule == 'true': split_rule = True elif split_rule == 'false': split_rule = False stime = time.time() results['%s:%s' % (did['scope'], did['name'])].append( subscription['id']) logging.info(prepend_str + '%s:%s matches subscription %s' % (did['scope'], did['name'], subscription['name'])) for rule_string in loads( subscription['replication_rules']): # Get all the rule and subscription parameters grouping = rule_string.get( 'grouping', 'DATASET') lifetime = rule_string.get( 'lifetime', None) ignore_availability = rule_string.get( 'ignore_availability', None) 
weight = rule_string.get('weight', None) source_replica_expression = rule_string.get( 'source_replica_expression', None) locked = rule_string.get('locked', None) if locked == 'True': locked = True else: locked = False purge_replicas = rule_string.get( 'purge_replicas', False) if purge_replicas == 'True': purge_replicas = True else: purge_replicas = False rse_expression = str( rule_string['rse_expression']) comment = str(subscription['comments']) subscription_id = str(subscription['id']) account = subscription['account'] copies = int(rule_string['copies']) activity = rule_string.get( 'activity', 'User Subscriptions') try: validate_schema(name='activity', obj=activity) except InputValidationError as error: logging.error( prepend_str + 'Error validating the activity %s' % (str(error))) activity = 'User Subscriptions' if lifetime: lifetime = int(lifetime) str_activity = "".join(activity.split()) success = False nattempt = 5 attemptnr = 0 skip_rule_creation = False if split_rule: rses = parse_expression(rse_expression) list_of_rses = [ rse['rse'] for rse in rses ] # Check that some rule doesn't already exist for this DID and subscription preferred_rse_ids = [] for rule in list_rules( filters={ 'subscription_id': subscription_id, 'scope': did['scope'], 'name': did['name'] }): already_existing_rses = [ (rse['rse'], rse['id']) for rse in parse_expression( rule['rse_expression']) ] for rse, rse_id in already_existing_rses: if (rse in list_of_rses) and ( rse_id not in preferred_rse_ids): preferred_rse_ids.append( rse_id) if len(preferred_rse_ids) >= copies: skip_rule_creation = True rse_id_dict = {} for rse in rses: rse_id_dict[rse['id']] = rse['rse'] try: rseselector = RSESelector( account=account, rses=rses, weight=weight, copies=copies - len(preferred_rse_ids)) selected_rses = [ rse_id_dict[rse_id] for rse_id, _, _ in rseselector.select_rse( 0, preferred_rse_ids= preferred_rse_ids, copies=copies, blacklist=blacklisted_rse_id ) ] except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight) as error: logging.warning( prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites' % (subscription['name'], account, str(error))) # Now including the blacklisted sites try: rseselector = RSESelector( account=account, rses=rses, weight=weight, copies=copies - len(preferred_rse_ids)) selected_rses = [ rse_id_dict[rse_id] for rse_id, _, _ in rseselector.select_rse( 0, preferred_rse_ids= preferred_rse_ids, copies=copies, blacklist=[]) ] ignore_availability = True except (InsufficientTargetRSEs, InsufficientAccountLimit, InvalidRuleWeight ) as error: logging.error( prepend_str + 'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.' % (subscription['name'], account, str(error))) monitor.record_counter( counters= 'transmogrifier.addnewrule.errortype.%s' % (str(error.__class__. 
__name__)), delta=1) # The DID won't be reevaluated at the next cycle did_success = did_success and True continue for attempt in range(0, nattempt): attemptnr = attempt nb_rule = 0 # Try to create the rule try: if split_rule: if not skip_rule_creation: for rse in selected_rses: logging.info( prepend_str + 'Will insert one rule for %s:%s on %s' % (did['scope'], did['name'], rse)) add_rule( dids=[{ 'scope': did['scope'], 'name': did['name'] }], account=account, copies=1, rse_expression=rse, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id= subscription_id, source_replica_expression =source_replica_expression, activity=activity, purge_replicas= purge_replicas, ignore_availability= ignore_availability, comment=comment) nb_rule += 1 if nb_rule == copies: success = True break else: add_rule( dids=[{ 'scope': did['scope'], 'name': did['name'] }], account=account, copies=copies, rse_expression= rse_expression, grouping=grouping, weight=weight, lifetime=lifetime, locked=locked, subscription_id= subscription['id'], source_replica_expression= source_replica_expression, activity=activity, purge_replicas= purge_replicas, ignore_availability= ignore_availability, comment=comment) nb_rule += 1 monitor.record_counter( counters= 'transmogrifier.addnewrule.done', delta=nb_rule) monitor.record_counter( counters= 'transmogrifier.addnewrule.activity.%s' % str_activity, delta=nb_rule) success = True break except (InvalidReplicationRule, InvalidRuleWeight, InvalidRSEExpression, StagingAreaRuleRequiresLifetime, DuplicateRule) as error: # Errors that won't be retried success = True logging.error(prepend_str + '%s' % (str(error))) monitor.record_counter( counters= 'transmogrifier.addnewrule.errortype.%s' % (str( error.__class__.__name__)), delta=1) break except (ReplicationRuleCreationTemporaryFailed, InsufficientTargetRSEs, InsufficientAccountLimit, DatabaseException, RSEBlacklisted) as error: # Errors to be retried logging.error( prepend_str + '%s Will perform an other attempt %i/%i' % (str(error), attempt + 1, nattempt)) monitor.record_counter( counters= 'transmogrifier.addnewrule.errortype.%s' % (str( error.__class__.__name__)), delta=1) except Exception as error: # Unexpected errors monitor.record_counter( counters= 'transmogrifier.addnewrule.errortype.unknown', delta=1) exc_type, exc_value, exc_traceback = exc_info( ) logging.critical( prepend_str + ''.join( format_exception( exc_type, exc_value, exc_traceback)).strip( )) did_success = (did_success and success) if (attemptnr + 1) == nattempt and not success: logging.error( prepend_str + 'Rule for %s:%s on %s cannot be inserted' % (did['scope'], did['name'], rse_expression)) else: logging.info( prepend_str + '%s rule(s) inserted in %f seconds' % (str(nb_rule), time.time() - stime)) except DataIdentifierNotFound as error: logging.warning(prepend_str + error) if did_success: if did['did_type'] == str(DIDType.FILE): monitor.record_counter( counters='transmogrifier.did.file.processed', delta=1) elif did['did_type'] == str(DIDType.DATASET): monitor.record_counter( counters='transmogrifier.did.dataset.processed', delta=1) elif did['did_type'] == str(DIDType.CONTAINER): monitor.record_counter( counters='transmogrifier.did.container.processed', delta=1) monitor.record_counter( counters='transmogrifier.did.processed', delta=1) identifiers.append({ 'scope': did['scope'], 'name': did['name'], 'did_type': DIDType.from_sym(did['did_type']) }) time1 = time.time() # Mark the DIDs as processed for identifier in chunks(identifiers, 100): 
_retrial(set_new_dids, identifier, None) logging.info(prepend_str + 'Time to set the new flag : %f' % (time.time() - time1)) tottime = time.time() - start_time for sub in subscriptions: update_subscription( name=sub['name'], account=sub['account'], metadata={'last_processed': datetime.now()}, issuer='root') logging.info(prepend_str + 'It took %f seconds to process %i DIDs' % (tottime, len(dids))) logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids))) monitor.record_counter(counters='transmogrifier.job.done', delta=1) monitor.record_timer(stat='transmogrifier.job.duration', time=1000 * tottime) except Exception: exc_type, exc_value, exc_traceback = exc_info() logging.critical(prepend_str + ''.join( format_exception(exc_type, exc_value, exc_traceback)).strip()) monitor.record_counter(counters='transmogrifier.job.error', delta=1) monitor.record_counter(counters='transmogrifier.addnewrule.error', delta=1) if once is True: break if tottime < sleep_time: logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime)) time.sleep(sleep_time - tottime) heartbeat.die(executable, hostname, pid, hb_thread) logging.info(prepend_str + 'Graceful stop requested') logging.info(prepend_str + 'Graceful stop done')
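# Illustrative sketch: ordering subscriptions by priority the way the transmogrifier
# above does. The priority is taken from the 'policyid' field, 0 being the highest
# and 5 the lowest, with 3 as the default when no policyid is set.
def order_subscriptions(subscriptions):
    sub_dict = {3: []}
    for sub in subscriptions:
        priority = int(sub['policyid']) if 'policyid' in sub else 3
        sub_dict.setdefault(priority, []).append(sub)
    ordered = []
    for priority in sorted(sub_dict):
        ordered.extend(sub_dict[priority])
    return ordered

# order_subscriptions([{'name': 'b', 'policyid': 4}, {'name': 'a', 'policyid': 0}])
# -> [{'name': 'a', 'policyid': 0}, {'name': 'b', 'policyid': 4}]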
def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, activities=None, sleep_time=60, fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None): """ Main loop to check the status of a transfer primitive with a transfertool. """ try: timeout = config_get('conveyor', 'poll_timeout') timeout = float(timeout) except NoOptionError: timeout = None logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i) timeout (%s)' % (process, total_processes, thread, total_threads, db_bulk, timeout)) executable = ' '.join(sys.argv) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) hb = heartbeat.live(executable, hostname, pid, hb_thread) logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, hb['assign_thread'], hb['nr_threads'], db_bulk)) activity_next_exe_time = defaultdict(time.time) threadPool = ThreadPool(total_threads) sleeping = False while not graceful_stop.is_set(): try: hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) logging.debug('poller - thread (%i/%i)' % (hb['assign_thread'], hb['nr_threads'])) if not sleeping: sleeping = True if activities is None: activities = [None] for activity in activities: if activity_next_exe_time[activity] > time.time(): time.sleep(1) continue sleeping = False ts = time.time() logging.debug('%i:%i - start to poll transfers older than %i seconds for activity %s' % (process, hb['assign_thread'], older_than, activity)) transfs = request.get_next_transfers(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT], state=[RequestState.SUBMITTED], limit=db_bulk, older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than), process=process, total_processes=total_processes, thread=hb['assign_thread'], total_threads=hb['nr_threads'], activity=activity, activity_shares=activity_shares) record_timer('daemons.conveyor.poller.000-get_next_transfers', (time.time() - ts) * 1000) if transfs: logging.debug('%i:%i - polling %i transfers for activity %s' % (process, hb['assign_thread'], len(transfs), activity)) xfers_ids = {} for transf in transfs: if not transf['external_host'] in xfers_ids: xfers_ids[transf['external_host']] = [] xfers_ids[transf['external_host']].append(transf['external_id']) for external_host in xfers_ids: for xfers in chunks(xfers_ids[external_host], fts_bulk): # poll transfers # xfer_requests = makeRequests(common.poll_transfers, args_list=[((external_host, xfers, process, thread), {})]) xfer_requests = makeRequests(common.poll_transfers, args_list=[((), {'external_host': external_host, 'xfers': xfers, 'process': process, 'thread': hb['assign_thread'], 'timeout': timeout})]) [threadPool.putRequest(xfer_req) for xfer_req in xfer_requests] threadPool.wait() if len(transfs) < db_bulk / 2: logging.info("%i:%i - only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (process, hb['assign_thread'], len(transfs), activity, db_bulk, sleep_time)) if activity_next_exe_time[activity] < time.time(): activity_next_exe_time[activity] = time.time() + sleep_time except: logging.critical("%i:%i - %s" % (process, hb['assign_thread'], traceback.format_exc())) if once: break logging.info('%i:%i - graceful stop requests' % (process, hb['assign_thread'])) threadPool.dismissWorkers(total_threads, do_join=True) heartbeat.die(executable, hostname, pid, hb_thread) logging.info('%i:%i - 
graceful stop done' % (process, hb['assign_thread']))
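# Illustrative sketch of the grouping done by the poller above before contacting the
# transfertool: external transfer ids are bucketed per external host and then polled
# in chunks of fts_bulk. chunks_sketch() is a small local stand-in for chunks().
def chunks_sketch(items, size):
    for i in range(0, len(items), size):
        yield items[i:i + size]


def group_by_external_host(transfers):
    xfers_ids = {}
    for transf in transfers:
        xfers_ids.setdefault(transf['external_host'], []).append(transf['external_id'])
    return xfers_ids

# for host, ids in group_by_external_host(transfs).items():
#     for batch in chunks_sketch(ids, fts_bulk):
#         poll_transfers(host, batch)   # hypothetical call standing in for common.poll_transfers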
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60): """ Creates a Minos Temporary Unavailable Replicas Expiration Worker that gets the list of expired TU replicas and sets them back to AVAILABLE. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Time between two cycles. """ executable = 'minos-temporary-expiration' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info(prepend_str + 'Minos Temporary Expiration starting') time.sleep(10) # To prevent running on the same partition if all the daemons restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info(prepend_str + 'Minos Temporary Expiration started') chunk_size = 10 # The chunk size used for the commits while not graceful_stop.is_set(): start_time = time.time() heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) try: # Get list of expired TU replicas logging.info(prepend_str + 'Getting list of expired replicas') expired_replicas = list_expired_temporary_unavailable_replicas(total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'], limit=1000) logging.info(prepend_str + '%s expired replicas returned' % len(expired_replicas)) logging.debug(prepend_str + 'List of expired replicas returned %s' % str(expired_replicas)) nchunk = 0 tot_chunk = int(math.ceil(len(expired_replicas) / float(chunk_size))) session = get_session() for chunk in chunks(expired_replicas, chunk_size): # Process and update the replicas in chunks replicas = [{'scope': replica[0], 'name': replica[1], 'rse_id': replica[2], 'state': ReplicaState.AVAILABLE} for replica in chunk] # Remove the replicas from bad_replicas table in chunks bad_replicas = [{'scope': replica[0], 'name': replica[1], 'rse_id': replica[2], 'state': BadFilesStatus.TEMPORARY_UNAVAILABLE} for replica in chunk] try: nchunk += 1 logging.debug(prepend_str + 'Running on %s chunk out of %s' % (nchunk, tot_chunk)) update_replicas_states(replicas, nowait=True, session=session) bulk_delete_bad_replicas(bad_replicas, session=session) session.commit() # pylint: disable=no-member except (ReplicaNotFound, DataIdentifierNotFound) as error: session.rollback() # pylint: disable=no-member logging.warning(prepend_str + 'One of the replicas does not exist anymore. Updating and deleting one by one. 
Error : %s' % str(error)) for idx in range(len(chunk)): logging.debug(prepend_str + 'Working on %s' % (str(replicas[idx]))) try: get_metadata(replicas[idx]['scope'], replicas[idx]['name']) update_replicas_states([replicas[idx], ], nowait=True, session=session) bulk_delete_bad_replicas([bad_replicas[idx], ], session=session) session.commit() # pylint: disable=no-member except DataIdentifierNotFound: session.rollback() # pylint: disable=no-member logging.warning(prepend_str + 'DID %s:%s does not exist anymore. ' % (bad_replicas[idx]['scope'], bad_replicas[idx]['name'])) bulk_delete_bad_replicas([bad_replicas[idx], ], session=session) session.commit() # pylint: disable=no-member except ReplicaNotFound: session.rollback() # pylint: disable=no-member logging.warning(prepend_str + '%s:%s on RSEID %s does not exist anymore. ' % (replicas[idx]['scope'], replicas[idx]['name'], replicas[idx]['rse_id'])) bulk_delete_bad_replicas([bad_replicas[idx], ], session=session) session.commit() # pylint: disable=no-member session = get_session() except Exception: session.rollback() # pylint: disable=no-member logging.critical(traceback.format_exc()) session = get_session() except Exception: logging.critical(traceback.format_exc()) tottime = time.time() - start_time if once: break if tottime < sleep_time: logging.info(prepend_str + 'Will sleep for %s seconds' % (sleep_time - tottime)) time.sleep(sleep_time - tottime) heartbeat.die(executable, hostname, pid, hb_thread) logging.info(prepend_str + 'Graceful stop requested') logging.info(prepend_str + 'Graceful stop done')
def atropos(thread, bulk, date_check, dry_run=True, grace_period=86400, once=True, unlock=False, spread_period=0, purge_replicas=False): """ Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them. :param thread: Thread number at startup. :param bulk: The number of requests to process. :param grace_period: The grace_period for the rules. :param once: Run only once. """ sleep_time = 60 executable = 'atropos' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) now = datetime.datetime.now() hb = heartbeat.live(executable, hostname, pid, hb_thread) time.sleep(10) hb = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads']) logging.debug(prepend_str + 'Starting worker') summary = {} lifetime_exceptions = {} rand = random.Random(hb['assign_thread']) for excep in rucio.core.lifetime_exception.list_exceptions( exception_id=None, states=[ LifetimeExceptionsState.APPROVED, ], session=None): if '%s:%s' % (excep['scope'], excep['name']) not in lifetime_exceptions: lifetime_exceptions['%s:%s' % (excep['scope'], excep['name'])] = excep['expires_at'] elif lifetime_exceptions['%s:%s' % (excep['scope'], excep['name'])] < excep['expires_at']: lifetime_exceptions['%s:%s' % (excep['scope'], excep['name'])] = excep['expires_at'] logging.debug(prepend_str + '%s active exceptions' % len(lifetime_exceptions)) if not dry_run and date_check > now: logging.error( prepend_str + 'Atropos cannot run in non-dry-run mode for date in the future') else: while not GRACEFUL_STOP.is_set(): hb = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'], hb['nr_threads']) stime = time.time() try: rules = get_rules_beyond_eol(date_check, thread, hb['nr_threads'], session=None) logging.info(prepend_str + '%s rules to process' % (len(rules))) for rule_idx, rule in enumerate(rules, start=1): did = '%s:%s' % (rule.scope, rule.name) logging.debug(prepend_str + 'Working on rule %s on DID %s on %s' % (rule.id, did, rule.rse_expression)) if (rule_idx % 1000) == 0: logging.info(prepend_str + '%s/%s rules processed' % (rule_idx, len(rules))) # We compute the expected eol_at try: rses = parse_expression(rule.rse_expression) except InvalidRSEExpression: logging.warning( prepend_str + 'Rule %s has an RSE expression that results in an empty set: %s' % (rule.id, rule.rse_expression)) continue eol_at = rucio.core.lifetime_exception.define_eol( rule.scope, rule.name, rses) if eol_at != rule.eol_at: logging.warning( prepend_str + 'The computed eol %s differs from the one recorded %s for rule %s on %s at %s' % (eol_at, rule.eol_at, rule.id, did, rule.rse_expression)) try: update_rule(rule.id, options={'eol_at': eol_at}) except RuleNotFound: logging.warning(prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did)) continue # Check the exceptions if did in lifetime_exceptions: if eol_at > lifetime_exceptions[did]: logging.info( prepend_str + 'Rule %s on DID %s on %s has longer expiration date than the one requested : %s' % (rule.id, did, rule.rse_expression, lifetime_exceptions[did])) else: # If eol_at < requested extension, update eol_at logging.info( prepend_str + 'Updating rule %s on DID %s on %s according to the exception till %s' % (rule.id, did, rule.rse_expression, lifetime_exceptions[did])) eol_at = lifetime_exceptions[did] try: update_rule(rule.id, options={ 'eol_at': 
lifetime_exceptions[did] }) except RuleNotFound: logging.warning( prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did)) continue # Now check that the new eol_at is expired if eol_at and eol_at < date_check: no_locks = True for lock in get_dataset_locks(rule.scope, rule.name): if lock['rule_id'] == rule[4]: no_locks = False if lock['rse_id'] not in summary: summary[lock['rse_id']] = {} if did not in summary[lock['rse_id']]: summary[lock['rse_id']][did] = { 'length': lock['length'] or 0, 'bytes': lock['bytes'] or 0 } if no_locks: logging.warning( prepend_str + 'Cannot find a lock for rule %s on DID %s' % (rule.id, did)) if not dry_run: lifetime = grace_period + rand.randrange( spread_period + 1) logging.info( prepend_str + 'Setting %s seconds lifetime for rule %s' % (lifetime, rule.id)) options = {'lifetime': lifetime} if purge_replicas: options['purge_replicas'] = True if rule.locked and unlock: logging.info(prepend_str + 'Unlocking rule %s', rule.id) options['locked'] = False try: update_rule(rule.id, options=options) except RuleNotFound: logging.warning( prepend_str + 'Cannot find rule %s on DID %s' % (rule.id, did)) continue except Exception: exc_type, exc_value, exc_traceback = exc_info() logging.critical(''.join( format_exception(exc_type, exc_value, exc_traceback)).strip()) for rse_id in summary: tot_size, tot_files, tot_datasets = 0, 0, 0 for did in summary[rse_id]: tot_datasets += 1 tot_files += summary[rse_id][did].get('length', 0) tot_size += summary[rse_id][did].get('bytes', 0) logging.info( prepend_str + 'For RSE %s %s datasets will be deleted representing %s files and %s bytes' % (get_rse_name(rse_id=rse_id), tot_datasets, tot_files, tot_size)) if once: break else: tottime = time.time() - stime if tottime < sleep_time: logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime))) time.sleep(sleep_time - tottime) continue logging.info(prepend_str + 'Graceful stop requested') heartbeat.die(executable, hostname, pid, hb_thread) logging.info(prepend_str + 'Graceful stop done')
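# Illustrative sketch of the randomized lifetime spread applied by atropos above when
# expiring a rule, so that the resulting deletions do not all start at the same instant.
import random


def spread_lifetime(grace_period, spread_period, thread_number=0):
    rand = random.Random(thread_number)   # atropos seeds with the thread number
    return grace_period + rand.randrange(spread_period + 1)

# spread_lifetime(86400, 3600) -> a value between 86400 and 90000 seconds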
def reaper(rses, include_rses, exclude_rses, vos=None, chunk_size=100, once=False, greedy=False, scheme=None, delay_seconds=0, sleep_time=60, auto_exclude_threshold=100, auto_exclude_timeout=600): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param include_rses: RSE expression to include RSEs. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", or the current VO otherwise. :param chunk_size: The size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param delay_seconds: The delay to query replicas in BEING_DELETED state. :param sleep_time: Time between two cycles. :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded. :param auto_exclude_timeout: Timeout for temporarily excluded RSEs. """ hostname = socket.getfqdn() executable = 'reaper2' pid = os.getpid() hb_thread = threading.current_thread() sanity_check(executable=executable, hostname=hostname) heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Reaper starting') if not once: GRACEFUL_STOP.wait( 10 ) # To prevent running on the same partition if all the reapers restart at the same time heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Reaper started') while not GRACEFUL_STOP.is_set(): # try to get auto exclude parameters from the config table. Otherwise use CLI parameters. 
try: auto_exclude_threshold = get('reaper', 'auto_exclude_threshold', default=auto_exclude_threshold) auto_exclude_timeout = get('reaper', 'auto_exclude_timeout', default=auto_exclude_timeout) except ConfigNotFound: pass # Check if there is a Judge Evaluator backlog try: max_evaluator_backlog_count = get('reaper', 'max_evaluator_backlog_count') except ConfigNotFound: max_evaluator_backlog_count = None try: max_evaluator_backlog_duration = get( 'reaper', 'max_evaluator_backlog_duration') except ConfigNotFound: max_evaluator_backlog_duration = None if max_evaluator_backlog_count or max_evaluator_backlog_duration: backlog = get_evaluation_backlog() if max_evaluator_backlog_count and \ backlog[0] and \ max_evaluator_backlog_duration and \ backlog[1] and \ backlog[0] > max_evaluator_backlog_count and \ backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration): logger( logging.ERROR, 'Reaper: Judge evaluator backlog count and duration hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_count and backlog[ 0] and backlog[0] > max_evaluator_backlog_count: logger( logging.ERROR, 'Reaper: Judge evaluator backlog count hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_duration and backlog[ 1] and backlog[1] < datetime.utcnow() - timedelta( minutes=max_evaluator_backlog_duration): logger( logging.ERROR, 'Reaper: Judge evaluator backlog duration hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses, vos) if not rses_to_process: logger(logging.ERROR, 'Reaper: No RSEs found. Will sleep for 30 seconds') GRACEFUL_STOP.wait(30) continue start_time = time.time() try: staging_areas = [] dict_rses = {} heart_beat = live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') tot_needed_free_space = 0 for rse in rses_to_process: # Check if the RSE is a staging area if rse['staging_area']: staging_areas.append(rse['rse']) # Check if RSE is blacklisted if rse['availability'] % 2 == 0: logger(logging.DEBUG, 'RSE %s is blacklisted for delete', rse['rse']) continue needed_free_space, only_delete_obsolete = __check_rse_usage( rse['rse'], rse['id'], greedy=greedy, logger=logger) if needed_free_space: dict_rses[(rse['rse'], rse['id'])] = [ needed_free_space, only_delete_obsolete ] tot_needed_free_space += needed_free_space elif only_delete_obsolete: dict_rses[(rse['rse'], rse['id'])] = [ needed_free_space, only_delete_obsolete ] else: logger(logging.DEBUG, 'Nothing to delete on %s', rse['rse']) # Ordering the RSEs based on the needed free space sorted_dict_rses = OrderedDict( sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True)) logger(logging.DEBUG, 'List of RSEs to process ordered by needed space desc: %s', str(sorted_dict_rses)) # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostanme, rse_info) as value rses_hostname_mapping = get_rses_to_hostname_mapping() # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping)) list_rses_mult = [] # Loop over the RSEs. 
rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity for rse_key in dict_rses: rse_name, rse_id = rse_key # The length of the deletion queue scales inversily with the number of workers # The ceil increase the weight of the RSE with small amount of files to delete if tot_needed_free_space: max_workers = ceil(dict_rses[rse_key][0] / tot_needed_free_space * 1000 / heart_beat['nr_threads']) else: max_workers = 1 list_rses_mult.extend([ (rse_name, rse_id, dict_rses[rse_key][0], dict_rses[rse_key][1]) for _ in range(int(max_workers)) ]) random.shuffle(list_rses_mult) for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult: result = REGION.get('pause_deletion_%s' % rse_id, expiration_time=120) if result is not NO_VALUE: logger( logging.INFO, 'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while', rse_name) continue result = REGION.get('temporary_exclude_%s' % rse_id, expiration_time=auto_exclude_timeout) if result is not NO_VALUE: logger( logging.WARNING, 'Too many failed attempts for %s in last cycle. RSE is temporarly excluded.', rse_name) labels = {'rse': rse_name} EXCLUDED_RSE_GAUGE.labels(**labels).set(1) continue labels = {'rse': rse_name} EXCLUDED_RSE_GAUGE.labels(**labels).set(0) percent = 0 if tot_needed_free_space: percent = needed_free_space / tot_needed_free_space * 100 logger( logging.DEBUG, 'Working on %s. Percentage of the total space needed %.2f', rse_name, percent) try: rse_hostname, rse_info = rses_hostname_mapping[rse_id] except KeyError: logger(logging.DEBUG, "Hostname lookup for %s failed.", rse_name) REGION.set('pause_deletion_%s' % rse_id, True) continue rse_hostname_key = '%s,%s' % (rse_id, rse_hostname) payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None) # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt)) tot_threads_for_hostname = 0 tot_threads_for_rse = 0 for key in payload_cnt: if key and key.find(',') > -1: if key.split(',')[1] == rse_hostname: tot_threads_for_hostname += payload_cnt[key] if key.split(',')[0] == str(rse_id): tot_threads_for_rse += payload_cnt[key] max_deletion_thread = get_max_deletion_threads_by_hostname( rse_hostname) if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread: logger( logging.DEBUG, 'Too many deletion threads for %s on RSE %s. Back off', rse_hostname, rse_name) # Might need to reschedule a try on this RSE later in the same cycle continue logger( logging.INFO, 'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s', rse_hostname, tot_threads_for_hostname, max_deletion_thread, rse_name) live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None) logger(logging.DEBUG, 'Total deletion workers for %s : %i', rse_hostname, tot_threads_for_hostname + 1) # List and mark BEING_DELETED the files to delete del_start_time = time.time() only_delete_obsolete = dict_rses[(rse_name, rse_id)][1] try: with monitor.record_timer_block( 'reaper.list_unlocked_replicas'): if only_delete_obsolete: logger( logging.DEBUG, 'Will run list_and_mark_unlocked_replicas on %s. 
No space needed, will only delete EPOCH tombstoned replicas', rse_name) replicas = list_and_mark_unlocked_replicas( limit=chunk_size, bytes=needed_free_space, rse_id=rse_id, delay_seconds=delay_seconds, only_delete_obsolete=only_delete_obsolete, session=None) logger( logging.DEBUG, 'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', rse_name, needed_free_space, time.time() - del_start_time, len(replicas)) if len(replicas) < chunk_size: logger( logging.DEBUG, 'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle', rse_name, chunk_size, len(replicas)) REGION.set('pause_deletion_%s' % rse_id, True) except (DatabaseException, IntegrityError, DatabaseError) as error: logger(logging.ERROR, '%s', str(error)) continue except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) # Physical deletion will take place there try: prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme, logger=logger) for file_replicas in chunks(replicas, 100): # Refresh heartbeat live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None) del_start_time = time.time() for replica in file_replicas: try: replica['pfn'] = str( list( rsemgr.lfns2pfns( rse_settings=rse_info, lfns=[{ 'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path'] }], operation='delete', scheme=scheme).values())[0]) except (ReplicaUnAvailable, ReplicaNotFound) as error: logger( logging.WARNING, 'Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s', replica['scope'], replica['name'], rse_name, str(error)) replica['pfn'] = None except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) deleted_files = delete_from_storage( file_replicas, prot, rse_info, staging_areas, auto_exclude_threshold, logger=logger) logger(logging.INFO, '%i files processed in %s seconds', len(file_replicas), time.time() - del_start_time) # Then finally delete the replicas del_start = time.time() with monitor.record_timer_block( 'reaper.delete_replicas'): delete_replicas(rse_id=rse_id, files=deleted_files) logger( logging.DEBUG, 'delete_replicas successed on %s : %s replicas in %s seconds', rse_name, len(deleted_files), time.time() - del_start) monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files)) DELETION_COUNTER.inc(len(deleted_files)) except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) if once: break tottime = time.time() - start_time if tottime < sleep_time: logger(logging.INFO, 'Will sleep for %s seconds', sleep_time - tottime) GRACEFUL_STOP.wait(sleep_time - tottime) except DatabaseException as error: logger(logging.WARNING, 'Reaper: %s', str(error)) except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) finally: if once: break die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done') return
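# Illustrative sketch: how the reaper above weights RSEs by the space that needs to be
# freed. Each RSE is repeated in the work list proportionally to its share of the total
# needed space, so RSEs with a larger backlog are picked up by more deletion workers.
from math import ceil


def build_work_list(dict_rses, tot_needed_free_space, nr_threads):
    """dict_rses: {(rse_name, rse_id): [needed_free_space, only_delete_obsolete]}"""
    list_rses_mult = []
    for (rse_name, rse_id), (needed, only_obsolete) in dict_rses.items():
        if tot_needed_free_space:
            max_workers = ceil(needed / tot_needed_free_space * 1000 / nr_threads)
        else:
            max_workers = 1
        list_rses_mult.extend([(rse_name, rse_id, needed, only_obsolete)
                               for _ in range(int(max_workers))])
    return list_rses_mult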
def reaper(rses=None, worker_number=0, total_workers=1, chunk_size=100, once=False, scheme=None):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param total_workers: The total number of workers.
    :param chunk_size: The size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    """
    rses = list(rses or [])  # work on a copy so random.shuffle() does not mutate the caller's list
    logging.info('Starting Dark Reaper %s-%s: Will work on RSEs: %s', worker_number, total_workers, str(rses))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rses)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread,
                             hash_executable=hash_executable)
            logging.info('Dark Reaper({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True

            random.shuffle(rses)
            for rse_id in rses:
                rse = rse_core.get_rse_name(rse_id=rse_id)
                rse_info = rsemgr.get_rse_info(rse)
                replicas = list_quarantined_replicas(rse_id=rse_id, limit=chunk_size,
                                                     worker_number=worker_number, total_workers=total_workers)
                rse_protocol = rse_core.get_rse_protocols(rse_id=rse_id)
                prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                deleted_replicas = []
                try:
                    prot.connect()
                    for replica in replicas:
                        nothing_to_do = False
                        scope = ''
                        if replica['scope']:
                            scope = replica['scope'].external
                        try:
                            pfn = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                            lfns=[{'scope': scope, 'name': replica['name'],
                                                                   'path': replica['path']}],
                                                            operation='delete', scheme=scheme).values())[0])
                            logging.info('Dark Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s',
                                         worker_number, total_workers, scope, replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logging.info('Dark Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                         worker_number, total_workers, scope, replica['name'], pfn, rse, duration)
                            add_message('deletion-done', {'scope': scope,
                                                          'name': replica['name'],
                                                          'rse': rse,
                                                          'rse_id': rse_id,
                                                          'file-size': replica.get('bytes') or 0,
                                                          'bytes': replica.get('bytes') or 0,
                                                          'url': pfn,
                                                          'duration': duration,
                                                          'protocol': prot.attributes['scheme']})
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = 'Dark Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (
                                worker_number, total_workers, scope, replica['name'], pfn, rse)
                            logging.warning(err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                            err_msg = 'Dark Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (
                                worker_number, total_workers, scope, replica['name'], pfn, rse, str(error))
                            logging.warning(err_msg)
                            add_message('deletion-failed', {'scope': scope,
                                                            'name': replica['name'],
                                                            'rse': rse,
                                                            'rse_id': rse_id,
                                                            'file-size': replica['bytes'] or 0,
                                                            'bytes': replica['bytes'] or 0,
                                                            'url': pfn,
                                                            'reason': str(error),
                                                            'protocol': prot.attributes['scheme']})
                        except Exception:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id, replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logging.info('Dark Reaper %s-%s: Nothing to do. I will sleep for 60s', worker_number, total_workers)
                time.sleep(60)

        except DatabaseException as error:
            logging.warning('Reaper: %s', str(error))
        except Exception:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
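# The dark reaper registers its heartbeat under a hash of the executable name plus the RSE
# list, so workers configured for different RSE sets keep separate heartbeat namespaces.
# A minimal sketch of that step in isolation, assuming Python 3 (hashlib wants bytes); the
# argument values in the example are invented.
import hashlib

def executable_hash(argv0, rses):
    """ Stable identifier for one dark reaper configuration. """
    return hashlib.sha256((argv0 + ''.join(rses)).encode()).hexdigest()

# e.g. executable_hash('rucio-dark-reaper', ['rse_id_1', 'rse_id_2'])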
            elif match('.*ORA-03135.*', str(e.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
        except Exception as e:
            logging.critical(traceback.format_exc())
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)

        if once:
            break

    die(executable='rucio-judge-repairer', hostname=hostname, pid=pid, thread=current_thread)


def stop(signum=None, frame=None):
    """
    Graceful exit.
    """

    graceful_stop.set()


def run(once=False, threads=1):
    """
    Starts up the Judge-Repairer threads.
    """
            if time_left > 0:
                logging.debug("Waiting %s seconds until next FTS terminal state retrieval" % time_left)
                time.sleep(time_left)
        except RequestException as e:
            logging.error("Failed to contact FTS server: %s" % (str(e)))
        except Exception:
            logging.critical(traceback.format_exc())

        if once:
            return

    logging.info('poller_latest - graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('poller_latest - graceful stop done')


def stop(signum=None, frame=None):
    """
    Graceful exit.
    """

    graceful_stop.set()


def run(once=False, last_nhours=1, external_hosts=None,
def fts_throttler(once=False, cycle_interval=3600): """ Main loop to automatically configure FTS storages. """ graceful_stop.clear() logging.info('FTS Throttler starting') executable = 'conveyor-fts-throttler' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) current_time = time.time() graceful_stop.wait(10) running_instance = False while not graceful_stop.is_set(): heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) if heart_beat['nr_threads'] < 2: running_instance = True # this loop cannot be entered by more than one instance at a time. while not graceful_stop.is_set(): try: heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'Thread [%i/%i] : ' % ( heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info(prepend_str + "fts_throttler start cycle") if FTSThrottler().revert(): logging.info('revert was successful, now tuning') FTSThrottler().tune() logging.info('Tuning finished for this cycle') else: logging.warning( 'could not revert, cannot tune unless revert has been done, will try again next cycle.' ) if once: break if time.time() < current_time + cycle_interval: graceful_stop.wait( int((current_time + cycle_interval) - time.time())) current_time = time.time() except Exception: logging.critical(prepend_str + 'fts_throttler crashed %s' % (traceback.format_exc())) if once: break else: prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logging.info( prepend_str + 'another fts_throttler instance already exists. will wait') if time.time() < current_time + cycle_interval: graceful_stop.wait( int((current_time + cycle_interval) - time.time())) current_time = time.time() logging.info(prepend_str + 'Throttler - graceful stop requested') # before we stop, try to revert, but only if this instance was running the cycles. # ! If the cycle info file information is shared between instances, then this implementation must be changed ! if running_instance: try: FTSThrottler().revert() except Exception: logging.warning('could not revert changes before stopping') heartbeat.die(executable, hostname, pid, hb_thread) logging.info(prepend_str + 'Throttler - graceful stop done')
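# fts_throttler only tunes FTS when it is the sole live instance: it checks the heartbeat
# table and enters the work loop only while fewer than two threads are registered for this
# executable. A minimal sketch of that guard in isolation, assuming the module-level
# graceful_stop event and heartbeat helper used above; the polling interval and helper name
# are illustrative.
def wait_until_single_instance(executable, hostname, pid, hb_thread, poll=60):
    """ Illustrative: block until this process holds the only live fts_throttler heartbeat. """
    while not graceful_stop.is_set():
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
        if heart_beat['nr_threads'] < 2:
            return True               # no other live instance, safe to revert/tune
        graceful_stop.wait(poll)      # another instance is active, check again later
    return False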
def receiver(id, total_threads=1, full_mode=False): """ Main loop to consume messages from the FTS3 producer. """ logging.info('receiver starting in full mode: %s' % full_mode) executable = 'conveyor-receiver' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) # Make an initial heartbeat so that all finishers have the correct worker number on the next try heartbeat.live(executable, hostname, pid, hb_thread) brokers_alias = [] brokers_resolved = [] try: brokers_alias = [ b.strip() for b in config_get('messaging-fts3', 'brokers').split(',') ] except Exception: raise Exception('Could not load brokers from configuration') logging.info('resolving broker dns alias: %s' % brokers_alias) brokers_resolved = [] for broker in brokers_alias: addrinfos = socket.getaddrinfo(broker, 0, socket.AF_INET, 0, socket.IPPROTO_TCP) brokers_resolved.extend(ai[4][0] for ai in addrinfos) logging.info('brokers resolved to %s', brokers_resolved) logging.info('checking authentication method') use_ssl = True try: use_ssl = config_get_bool('messaging-fts3', 'use_ssl') except: logging.info( 'could not find use_ssl in configuration -- please update your rucio.cfg' ) port = config_get_int('messaging-fts3', 'port') vhost = config_get('messaging-fts3', 'broker_virtual_host', raise_exception=False) if not use_ssl: username = config_get('messaging-fts3', 'username') password = config_get('messaging-fts3', 'password') port = config_get_int('messaging-fts3', 'nonssl_port') conns = [] for broker in brokers_resolved: if not use_ssl: logging.info('setting up username/password authentication: %s' % broker) con = stomp.Connection12(host_and_ports=[(broker, port)], use_ssl=False, vhost=vhost, reconnect_attempts_max=999) else: logging.info('setting up ssl cert/key authentication: %s' % broker) con = stomp.Connection12( host_and_ports=[(broker, port)], use_ssl=True, ssl_key_file=config_get('messaging-fts3', 'ssl_key_file'), ssl_cert_file=config_get('messaging-fts3', 'ssl_cert_file'), vhost=vhost, reconnect_attempts_max=999) conns.append(con) logging.info('receiver started') while not graceful_stop.is_set(): heartbeat.live(executable, hostname, pid, hb_thread) for conn in conns: if not conn.is_connected(): logging.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0]) record_counter('daemons.messaging.fts3.reconnect.%s' % conn.transport._Transport__host_and_ports[0] [0].split('.')[0]) conn.set_listener( 'rucio-messaging-fts3', Receiver( broker=conn.transport._Transport__host_and_ports[0], id=id, total_threads=total_threads, full_mode=full_mode)) conn.start() if not use_ssl: conn.connect(username, password, wait=True) else: conn.connect(wait=True) conn.subscribe(destination=config_get('messaging-fts3', 'destination'), id='rucio-messaging-fts3', ack='auto') time.sleep(1) logging.info('receiver graceful stop requested') for conn in conns: try: conn.disconnect() except Exception: pass heartbeat.die(executable, hostname, pid, hb_thread) logging.info('receiver graceful stop done')
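# The receiver expands each configured broker alias into the full set of IPv4 addresses and
# opens one STOMP connection per address. A standalone sketch of that resolution step,
# assuming the same comma-separated 'brokers' option format used above; the host name in the
# usage example is a placeholder.
import socket

def resolve_broker_aliases(brokers_alias):
    """ Return every IPv4 address behind a list of broker DNS aliases. """
    resolved = []
    for broker in brokers_alias:
        addrinfos = socket.getaddrinfo(broker, 0, socket.AF_INET, 0, socket.IPPROTO_TCP)
        resolved.extend(addrinfo[4][0] for addrinfo in addrinfos)
    return resolved

# e.g. resolve_broker_aliases(['fts3-broker.example.org'])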
def submitter(once=False, rses=None, mock=False, bulk=100, group_bulk=1, group_policy='rule', fts_source_strategy='auto', activities=None, sleep_time=600, max_sources=4, retry_other_fts=False): """ Main loop to submit a new transfer primitive to a transfertool. """ try: scheme = config_get('conveyor', 'scheme') except NoOptionError: scheme = None try: failover_scheme = config_get('conveyor', 'failover_scheme') except NoOptionError: failover_scheme = None try: timeout = config_get('conveyor', 'submit_timeout') timeout = float(timeout) except NoOptionError: timeout = None try: bring_online = config_get('conveyor', 'bring_online') except NoOptionError: bring_online = 43200 try: max_time_in_queue = {} timelife_conf = config_get('conveyor', 'max_time_in_queue') timelife_confs = timelife_conf.split(",") for conf in timelife_confs: act, timelife = conf.split(":") max_time_in_queue[act.strip()] = int(timelife.strip()) except NoOptionError: max_time_in_queue = {} if 'default' not in max_time_in_queue: max_time_in_queue['default'] = 168 logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue) activity_next_exe_time = defaultdict(time.time) executable = sys.argv[0] if activities: activities.sort() executable += '--activities ' + str(activities) hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) logging.info(prepend_str + 'Submitter starting with timeout %s' % (timeout)) time.sleep(10) # To prevent running on the same partition if all the poller restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) logging.info(prepend_str + 'Transfer submitter started') while not graceful_stop.is_set(): try: heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) if activities is None: activities = [None] if rses: rse_ids = [rse['id'] for rse in rses] else: rse_ids = None for activity in activities: if activity_next_exe_time[activity] > time.time(): graceful_stop.wait(1) continue user_transfer = False if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']: logging.info(prepend_str + "CMS user transfer activity") user_transfer = True logging.info(prepend_str + 'Starting to get transfer transfers for %s' % (activity)) start_time = time.time() transfers = __get_transfers(total_workers=heart_beat['nr_threads'] - 1, worker_number=heart_beat['assign_thread'], failover_schemes=failover_scheme, limit=bulk, activity=activity, rses=rse_ids, schemes=scheme, mock=mock, max_sources=max_sources, bring_online=bring_online, retry_other_fts=retry_other_fts) record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1)) record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers)) record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers)) logging.info(prepend_str + 'Got %s transfers for %s in %s seconds' % (len(transfers), activity, time.time() - start_time)) # group transfers logging.info(prepend_str + 'Starting to group transfers for %s' % 
(activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, fts_source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer',
                             (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logging.info(prepend_str + 'Starting to submit transfers for %s' % (activity))
                for external_host in grouped_jobs:
                    if not user_transfer:
                        for job in grouped_jobs[external_host]:
                            # submit transfers
                            submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                            logging_prepend_str=prepend_str, timeout=timeout)
                    else:
                        for _, jobs in grouped_jobs[external_host].items():
                            # submit transfers
                            for job in jobs:
                                submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                                logging_prepend_str=prepend_str, timeout=timeout,
                                                user_transfer_job=user_transfer)

                if len(transfers) < group_bulk:
                    logging.info(prepend_str + 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time

        except Exception:
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
    return
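# The submitter paces each activity independently: activity_next_exe_time maps an activity to
# the earliest time it may be polled again, and defaultdict(time.time) makes activities seen
# for the first time eligible immediately. A minimal sketch of that pacing logic under those
# assumptions; the helper names below are illustrative, not part of the code above.
import time
from collections import defaultdict

activity_next_exe_time = defaultdict(time.time)

def activity_is_due(activity):
    """ Illustrative: True if the activity should be polled in this cycle. """
    return activity_next_exe_time[activity] <= time.time()

def back_off(activity, sleep_time=600):
    """ Illustrative: called when too few transfers were found, as in the loop above. """
    if activity_next_exe_time[activity] < time.time():
        activity_next_exe_time[activity] = time.time() + sleep_time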
def minos(bulk=1000, once=False, sleep_time=60): """ Creates a Minos Worker that gets a list of bad PFNs, extract the scope, name and rse_id and fill the bad_replicas table. :param bulk: The number of requests to process. :param once: Run only once. :param sleep_time: Time between two cycles. """ executable = 'minos' hostname = socket.getfqdn() pid = os.getpid() hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) hb_thread = threading.current_thread() heartbeat.sanity_check(executable=executable, hostname=hostname) heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') logger(logging.INFO, 'Minos starting') time.sleep( 10 ) # To prevent running on the same partition if all the daemons restart at the same time heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') states_mapping = { BadPFNStatus.BAD: BadFilesStatus.BAD, BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS, BadPFNStatus.TEMPORARY_UNAVAILABLE: BadFilesStatus.TEMPORARY_UNAVAILABLE } logger(logging.INFO, 'Minos started') chunk_size = 10 # The chunk size used for the commits while not GRACEFUL_STOP.is_set(): start_time = time.time() heart_beat = heartbeat.live(executable, hostname, pid, hb_thread) prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prefix + '%s') pfns = [] try: bad_replicas = {} temporary_unvailables = {} pfns = get_bad_pfns(thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], limit=bulk) # Class the PFNs into bad_replicas and temporary_unavailable for pfn in pfns: path = pfn['pfn'] account = pfn['account'] reason = pfn['reason'] expires_at = pfn['expires_at'] state = states_mapping[pfn['state']] if state in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]: if (account, reason, state) not in bad_replicas: bad_replicas[(account, reason, state)] = [] bad_replicas[(account, reason, state)].append(path) elif state == BadFilesStatus.TEMPORARY_UNAVAILABLE: if (account, reason, expires_at) not in temporary_unvailables: temporary_unvailables[(account, reason, expires_at)] = [] temporary_unvailables[(account, reason, expires_at)].append(path) # Process the bad and suspicious files # The scope, name, rse_id are extracted and filled into the bad_replicas table for account, reason, state in bad_replicas: vo = account.vo pfns = bad_replicas[(account, reason, state)] logger( logging.INFO, 'Declaring %s replicas with state %s and reason %s' % (len(pfns), str(state), reason)) session = get_session() schemes = {} dict_rse = {} unknown_replicas = [] try: # Splitting the PFNs by schemes for pfn in pfns: scheme = pfn.split(':')[0] if scheme not in schemes: schemes[scheme] = [] schemes[scheme].append(pfn) for scheme in schemes: _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse( schemes[scheme], vo=vo) for rse_id in tmp_dict_rse: if rse_id not in dict_rse: dict_rse[rse_id] = [] dict_rse[rse_id].extend(tmp_dict_rse[rse_id]) unknown_replicas.extend( tmp_unknown_replicas.get('unknown', [])) # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns if unknown_replicas: logger( logging.INFO, 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas)) 
bulk_delete_bad_pfns(pfns=unknown_replicas, session=None) for rse_id in dict_rse: vo_str = '' if vo == 'def' else ' on VO ' + vo logger( logging.DEBUG, 'Running on RSE %s%s with %s replicas' % (get_rse_name(rse_id=rse_id), vo_str, len(dict_rse[rse_id]))) nchunk = 0 tot_chunk = int( math.ceil(len(dict_rse[rse_id]) / chunk_size)) for chunk in chunks(dict_rse[rse_id], chunk_size): nchunk += 1 logger( logging.DEBUG, 'Running on %s chunk out of %s' % (nchunk, tot_chunk)) unknown_replicas = declare_bad_file_replicas( pfns=chunk, reason=reason, issuer=account, status=state, session=session) if unknown_replicas: logger( logging.DEBUG, 'Unknown replicas : %s' % (str(unknown_replicas))) bulk_delete_bad_pfns(pfns=chunk, session=session) session.commit() # pylint: disable=no-member except Exception: session.rollback() # pylint: disable=no-member logger(logging.CRITICAL, 'Exception', exc_info=True) # Now get the temporary unavailable and update the replicas states for account, reason, expires_at in temporary_unvailables: vo = account.vo pfns = temporary_unvailables[(account, reason, expires_at)] logger( logging.INFO, 'Declaring %s replicas temporary available with timeout %s and reason %s' % (len(pfns), str(expires_at), reason)) logger(logging.DEBUG, 'Extracting RSEs') schemes = {} dict_rse = {} unknown_replicas = [] # Splitting the PFNs by schemes for pfn in pfns: scheme = pfn.split(':')[0] if scheme not in schemes: schemes[scheme] = [] schemes[scheme].append(pfn) for scheme in schemes: _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse( schemes[scheme], vo=vo) for rse_id in tmp_dict_rse: if rse_id not in dict_rse: dict_rse[rse_id] = [] dict_rse[rse_id].extend(tmp_dict_rse[rse_id]) unknown_replicas.extend( tmp_unknown_replicas.get('unknown', [])) # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns if unknown_replicas: logger( logging.INFO, 'The following replicas are unknown and will be removed : %s' % str(unknown_replicas)) bulk_delete_bad_pfns(pfns=unknown_replicas, session=None) for rse_id in dict_rse: replicas = [] rse = get_rse_name(rse_id=rse_id, session=None) rse_vo_str = rse if vo == 'def' else '{} on {}'.format( rse, vo) logger(logging.DEBUG, 'Running on RSE %s' % rse_vo_str) for rep in get_did_from_pfns(pfns=dict_rse[rse_id], rse_id=None, vo=vo, session=None): for pfn in rep: scope = rep[pfn]['scope'] name = rep[pfn]['name'] replicas.append({ 'scope': scope, 'name': name, 'rse_id': rse_id, 'state': ReplicaState.TEMPORARY_UNAVAILABLE, 'pfn': pfn }) # The following part needs to be atomic # We update the replicas states to TEMPORARY_UNAVAILABLE # then insert a row in the bad_replicas table. 
TODO Update the row if it already exists # then delete the corresponding rows into the bad_pfns table logger( logging.DEBUG, 'Running on %s replicas on RSE %s' % (len(replicas), rse_vo_str)) nchunk = 0 tot_chunk = int( math.ceil(len(replicas) / float(chunk_size))) session = get_session() for chunk in chunks(replicas, chunk_size): try: nchunk += 1 logger( logging.DEBUG, 'Running on %s chunk out of %s' % (nchunk, tot_chunk)) update_replicas_states(chunk, nowait=False, session=session) bulk_add_bad_replicas( chunk, account, state=BadFilesStatus.TEMPORARY_UNAVAILABLE, reason=reason, expires_at=expires_at, session=session) pfns = [entry['pfn'] for entry in chunk] bulk_delete_bad_pfns(pfns=pfns, session=session) session.commit() # pylint: disable=no-member except (UnsupportedOperation, ReplicaNotFound) as error: session.rollback() # pylint: disable=no-member logger( logging.ERROR, 'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s' % str(error)) for rep in chunk: logger(logging.DEBUG, 'Working on %s' % (str(rep))) try: get_metadata(rep['scope'], rep['name']) unavailable_states = [] rep_state = get_replicas_state( rep['scope'], rep['name']) unavailable_states.extend( rep_state.get( ReplicaState.TEMPORARY_UNAVAILABLE, [])) unavailable_states.extend( rep_state.get( ReplicaState.BEING_DELETED, [])) unavailable_states.extend( rep_state.get(ReplicaState.BAD, [])) # If the replica is already not available, it is removed from the bad PFNs table if rep['rse_id'] in unavailable_states: logger( logging.INFO, '%s is in unavailable state. Will be removed from the list of bad PFNs' % str(rep['pfn'])) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) # If the expiration date of the TEMPORARY_UNAVAILABLE is in the past, it is removed from the bad PFNs table elif expires_at < datetime.now(): logger( logging.INFO, 'PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns' % (str(rep['pfn']), expires_at)) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) # Else update everything in the same transaction else: try: session = get_session() update_replicas_states( [rep], nowait=False, session=session) bulk_add_bad_replicas( [rep], account, state=BadFilesStatus. TEMPORARY_UNAVAILABLE, reason=reason, expires_at=expires_at, session=session) bulk_delete_bad_pfns( pfns=[rep['pfn']], session=session) session.commit() # pylint: disable=no-member except Exception: logger( logging.ERROR, 'Cannot update state of %s' % str(rep['pfn'])) except (DataIdentifierNotFound, ReplicaNotFound): logger( logging.ERROR, 'Will remove %s from the list of bad PFNs' % str(rep['pfn'])) bulk_delete_bad_pfns(pfns=[rep['pfn']], session=None) session = get_session() except Exception: session.rollback() # pylint: disable=no-member logger(logging.CRITICAL, 'Exception', exc_info=True) session = get_session() except Exception as error: logger(logging.ERROR, '%s' % (str(error))) tottime = time.time() - start_time if once: break if len(pfns) == bulk: logger( logging.INFO, 'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle' ) elif tottime < sleep_time: logger(logging.INFO, 'Will sleep for %s seconds' % (sleep_time - tottime)) time.sleep(sleep_time - tottime) heartbeat.die(executable, hostname, pid, hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done')
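# Both Minos code paths above batch their database work with chunks() so that each commit
# covers at most chunk_size replicas. A minimal sketch of that helper and the commit pattern,
# assuming chunks() has the usual fixed-size slicing semantics; the commented loop simply
# restates the pattern already used above.
def chunks(items, size):
    """ Yield successive slices of at most `size` elements. """
    for index in range(0, len(items), size):
        yield items[index:index + size]

# Pattern mirrored from the code above:
#   for chunk in chunks(replicas, chunk_size):
#       update_replicas_states(chunk, nowait=False, session=session)
#       ...
#       session.commit()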
def kronos_file(thread=0, dataset_queue=None, sleep_time=60): """ Main loop to consume tracer reports. """ logging.info('kronos_file[%i/?] starting', thread) executable = 'kronos-file' hostname = socket.gethostname() pid = getpid() hb_thread = current_thread() chunksize = config_get_int('tracer-kronos', 'chunksize') prefetch_size = config_get_int('tracer-kronos', 'prefetch_size') subscription_id = config_get('tracer-kronos', 'subscription_id') try: bad_files_patterns = [] pattern = config_get(section='kronos', option='bad_files_patterns', session=None) pattern = str(pattern) patterns = pattern.split(",") for pat in patterns: bad_files_patterns.append(re.compile(pat.strip())) except (NoOptionError, NoSectionError, RuntimeError): bad_files_patterns = [] except Exception as error: logging.log(logging.ERROR, 'kronos_file[%i/?] Failed to get bad_file_patterns %s', thread, str(error)) bad_files_patterns = [] use_ssl = True try: use_ssl = config_get_bool('tracer-kronos', 'use_ssl') except Exception: pass if not use_ssl: username = config_get('tracer-kronos', 'username') password = config_get('tracer-kronos', 'password') excluded_usrdns = set( config_get('tracer-kronos', 'excluded_usrdns').split(',')) vhost = config_get('tracer-kronos', 'broker_virtual_host', raise_exception=False) brokers_alias = [ b.strip() for b in config_get('tracer-kronos', 'brokers').split(',') ] port = config_get_int('tracer-kronos', 'port') reconnect_attempts = config_get_int('tracer-kronos', 'reconnect_attempts') ssl_key_file = config_get('tracer-kronos', 'ssl_key_file', raise_exception=False) ssl_cert_file = config_get('tracer-kronos', 'ssl_cert_file', raise_exception=False) sanity_check(executable=executable, hostname=hostname) while not graceful_stop.is_set(): start_time = time() heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'kronos-file[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') conns = get_stomp_brokers(brokers=brokers_alias, port=port, use_ssl=use_ssl, vhost=vhost, reconnect_attempts=reconnect_attempts, ssl_key_file=ssl_key_file, ssl_cert_file=ssl_cert_file, timeout=sleep_time, logger=logger) for conn in conns: if not conn.is_connected(): logger( logging.INFO, 'connecting to %s' % str(conn.transport._Transport__host_and_ports[0])) record_counter( 'daemons.tracer.kronos.reconnect.{host}', labels={ 'host': conn.transport._Transport__host_and_ports[0][0] }) conn.set_listener( 'rucio-tracer-kronos', AMQConsumer( broker=conn.transport._Transport__host_and_ports[0], conn=conn, queue=config_get('tracer-kronos', 'queue'), chunksize=chunksize, subscription_id=subscription_id, excluded_usrdns=excluded_usrdns, dataset_queue=dataset_queue, bad_files_patterns=bad_files_patterns, logger=logger)) if not use_ssl: conn.connect(username, password) else: conn.connect() conn.subscribe( destination=config_get('tracer-kronos', 'queue'), ack='client-individual', id=subscription_id, headers={'activemq.prefetchSize': prefetch_size}) tottime = time() - start_time if tottime < sleep_time: logger(logging.INFO, 'Will sleep for %s seconds' % (sleep_time - tottime)) sleep(sleep_time - tottime) logger(logging.INFO, 'graceful stop requested') for conn in conns: try: conn.disconnect() except Exception: pass die(executable=executable, hostname=hostname, pid=pid, thread=thread) logger(logging.INFO, 'graceful stop done')
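# kronos_file optionally screens traces against regular expressions taken from the
# comma-separated [kronos] bad_files_patterns option. A standalone sketch of that parsing
# step, assuming the same option format as above; the sample value is invented.
import re

def compile_bad_files_patterns(raw_option):
    """ Turn 'pattern1, pattern2, ...' into a list of compiled regular expressions. """
    if not raw_option:
        return []
    return [re.compile(pattern.strip()) for pattern in str(raw_option).split(',')]

# e.g. compile_bad_files_patterns('.*unavailable.*, .*lost.*')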