def delete_dids(dids, account, session=None): """ Delete data identifiers :param dids: The list of dids to delete. :param account: The account. :param session: The database session in use. """ rule_id_clause = [] content_clause = [] parent_content_clause = [] did_clause = [] for did in dids: logging.info('Removing did %s:%s' % (did['scope'], did['name'])) did_clause.append(and_(models.DataIdentifier.scope == did['scope'], models.DataIdentifier.name == did['name'])) parent_content_clause.append(and_(models.DataIdentifierAssociation.child_scope == did['scope'], models.DataIdentifierAssociation.child_name == did['name'])) rule_id_clause.append(and_(models.ReplicationRule.scope == did['scope'], models.ReplicationRule.name == did['name'])) content_clause.append(and_(models.DataIdentifierAssociation.scope == did['scope'], models.DataIdentifierAssociation.name == did['name'])) # Send message for AMI add_message('ERASE', {'account': account, 'scope': did['scope'], 'name': did['name']}, session=session) # Delete rules on did if rule_id_clause: with record_timer_block('undertaker.rules'): for (rule_id, scope, name, rse_expression, ) in session.query(models.ReplicationRule.id, models.ReplicationRule.scope, models.ReplicationRule.name, models.ReplicationRule.rse_expression).filter(or_(*rule_id_clause)): logging.debug('Removing rule %s for did %s:%s on RSE-Expression %s' % (str(rule_id), scope, name, rse_expression)) rucio.core.rule.delete_rule(rule_id=rule_id, nowait=True, session=session) # Detach from parent dids: existing_parent_dids = False if parent_content_clause: with record_timer_block('undertaker.parent_content'): for parent_did in session.query(models.DataIdentifierAssociation).filter(or_(*parent_content_clause)): existing_parent_dids = True detach_dids(scope=parent_did.scope, name=parent_did.name, dids=[{'scope': parent_did.child_scope, 'name': parent_did.child_name}], session=session) # Remove content if content_clause: with record_timer_block('undertaker.content'): rowcount = session.query(models.DataIdentifierAssociation).filter(or_(*content_clause)).\ delete(synchronize_session=False) record_counter(counters='undertaker.content.rowcount', delta=rowcount) # remove data identifier if existing_parent_dids: # Exit method early to give Judge time to remove locks (Otherwise, due to foreign keys, did removal does not work logging.debug('Leaving delete_dids early for Judge-Evaluator checks') return with record_timer_block('undertaker.dids'): rowcount = session.query(models.DataIdentifier).filter(or_(*did_clause)).\ filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER, models.DataIdentifier.did_type == DIDType.DATASET)).\ delete(synchronize_session=False)
def test_context_record_timer(self): """MONITOR (CORE): Send a timer message to graphite using context """ with monitor.record_timer_block('test.context_timer'): var_a = 2 * 100 var_a = var_a * 1 with monitor.record_timer_block(['test.context_timer']): var_a = 2 * 100 var_a = var_a * 1 with monitor.record_timer_block(['test.context_timer', ('test.context_timer_normal10', 10)]): var_a = 2 * 100 var_a = var_a * 1
def test_monitor_context_record_timer(self): """MONITOR (CORE): Send a timer message to graphite using context """ with monitor.record_timer_block('test.context_timer'): a = 2 * 100 a = a * 1 with monitor.record_timer_block(['test.context_timer']): a = 2 * 100 a = a * 1 with monitor.record_timer_block(['test.context_timer', ('test.context_timer_normal10', 10)]): a = 2 * 100 a = a * 1
def get_datasets_at_rse(rse, prefix=None): """ :param rse: The RSE name :param prefix: Character(s) to restrict the dataset search :return: a dictionary with <dataset name>: <number of files> """ filters = {'scope': 'cms', 'did_type': DIDType.DATASET} if prefix: filters['name'] = '/' + prefix + '*' with monitor.record_timer_block('cms_sync.time_rse_datasets'): all_datasets = [ dataset['name'] for dataset in list_datasets_per_rse(rse=rse, filters=filters) ] for dataset in all_datasets: datasets = { dataset: ds['available_length'] for ds in list_dataset_replicas( scope='cms', name=dataset, deep=True) if ds['rse'] == rse } # datasets = {dataset['name']: dataset['available_length'] # for dataset in list_datasets_per_rse(rse=rse, filters=filters, deep=True)} return datasets
def compare_site_blocks(phedex=None, rucio=None, rse=''): """ :param phedex: Dictionary with file counts for PhEDEx :param rucio: Dictionary with file counts for Rucio :return: """ with monitor.record_timer_block('cms_sync.time_node_diff'): phedex_blocks = set(phedex.keys()) rucio_blocks = set(rucio.keys()) missing_phedex = rucio_blocks - phedex_blocks missing_rucio = phedex_blocks - rucio_blocks both = phedex_blocks & rucio_blocks incomplete = set() for block in both: if phedex[block] != rucio[block]: print("For %s at %s the files differ: %s vs %s." % (block, rse, phedex[block], rucio[block])) incomplete.add(block) both = both - incomplete return { 'not_rucio': missing_rucio, 'not_phedex': missing_phedex, 'complete': both, 'incomplete': incomplete }
def get_datasets_at_rse(rse, prefix=None): """ :param rse: The RSE name :param prefix: Character(s) to restrict the dataset search :return: a dictionary with <dataset name>: <number of files> """ filters = {'scope': 'cms', 'did_type': DIDType.DATASET} if prefix: filters['name'] = '/' + prefix + '*' account = SYNC_ACCOUNT_FMT % rse.lower() rule_filters = {'account': account, 'scope': 'cms', 'did_type': DIDType.DATASET} with monitor.record_timer_block('cms_sync.time_rse_datasets'): synced_ds = {item['name'] for item in list_replication_rules(filters=rule_filters) if item['expires_at'] is None and (prefix is None or item['name'].startswith('/' + prefix))} all_datasets = [dataset['name'] for dataset in list_datasets_per_rse(rse=rse, filters=filters)] logging.info('Getting all datasets at %s with prefix %s' % (rse, prefix)) datasets = {} for dataset in all_datasets: if dataset in synced_ds: for ds in list_dataset_replicas(scope='cms', name=dataset, deep=True): if ds['rse'] == rse: datasets.update({dataset: ds['available_length']}) return datasets
def remove_extra_replicas(self, to_remove): """ :param to_remove: replicas to remove from Rucio :return: """ scope = InternalScope(self.scope) with monitor.record_timer_block('cms_sync.time_remove_replica'): if to_remove and self.dry_run: logging.info('Dry run: Removing replicas %s from rse %s.', str(to_remove), self.rse) elif to_remove: logging.debug('Removing %s replicas from rse %s.', len(to_remove), self.rse) for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE): replicas = [{ 'scope': scope, 'name': lfn, "rse_id": self.rse_id, "state": "U" } for lfn in to_remove_chunk] # transactional_session here? # while lock is set stuck, judge-repairer might make transfer requests before rule is gone but does it matter? update_replicas_states( replicas=replicas, add_tombstone=False, ) # delete_replicas(rse=self.rse, issuer=self.account, # files=[{'scope': self.scope, 'name': lfn} for lfn in to_remove_chunk]) return len(to_remove)
def update_replicas(self): """ Add or removes replicas for the dataset at rse. """ with monitor.record_timer_block('cms_sync.time_update_replica'): logging.info('Updating replicas for %s:%s at %s', self.scope, self.block_name, self.rse) replicas = list_replicas(dids=[{'scope': self.scope, 'name': self.block_name}], rse_expression='rse=%s' % self.rse) try: rucio_replicas = {repl['name'] for repl in replicas} except TypeError: rucio_replicas = set() phedex_replicas = set(self.replicas.keys()) missing = list(phedex_replicas - rucio_replicas) to_remove = list(rucio_replicas - phedex_replicas) if missing and (len(phedex_replicas) != len(missing)): logging.warn('Recovery: Inconsistency found for %s at %s: %s in PhEDEx and %s missing', self.rse, self.block_name, len(phedex_replicas), len(missing)) if missing: lfns_added = self.add_missing_replicas(missing) monitor.record_counter('cms_sync.files_added', delta=lfns_added) if to_remove: lfns_removed = self.remove_extra_replicas(to_remove) monitor.record_counter('cms_sync.files_removed', delta=lfns_removed) return
def remove_from_rucio(self): """""" if not self.block_in_phedex: logging.info('Declining to remove since it is not in PhEDEx') return with monitor.record_timer_block('cms_sync.time_remove_block'): self.update_replicas() self.update_rule()
def add_to_rucio(self, recover=False): """""" with monitor.record_timer_block('cms_sync.time_add_block'): self.register_container() block_exists = self.register_block() if block_exists: self.update_replicas() if recover: self.make_replicas_available() self.update_rule()
def compare_site_blocks(phedex=None, rucio=None, rse='', patterns=None): """ :param phedex: Dictionary with file counts for PhEDEx :param rucio: Dictionary with file counts for Rucio :return: """ with monitor.record_timer_block('cms_sync.time_node_diff'): # from pprint import pformat phedex_blocks = set(phedex.keys()) # logging.info('Blocks in PhEDEx %s', pformat(phedex_blocks)) rucio_blocks = set(rucio.keys()) # logging.info('Blocks in Rucio %s', pformat(rucio_blocks)) if patterns: phedex_match = set() rucio_match = set() for pattern in patterns: phedex_match = phedex_match | { p for p in phedex_blocks if pattern in p } rucio_match = rucio_match | { r for r in rucio_blocks if pattern in r } logging.info( 'Pattern matching reduces Rucio %s->%s and PhEDEx %s->%s', len(rucio_blocks), len(rucio_match), len(phedex_blocks), len(phedex_match)) phedex_blocks = phedex_match rucio_blocks = rucio_match missing_phedex = rucio_blocks - phedex_blocks missing_rucio = phedex_blocks - rucio_blocks both = phedex_blocks & rucio_blocks incomplete = set() for block in both: if phedex[block] != rucio[block]: print("For %s at %s the files differ: %s vs %s." % (block, rse, phedex[block], rucio[block])) incomplete.add(block) both = both - incomplete return { 'not_rucio': missing_rucio, 'not_phedex': missing_phedex, 'complete': both, 'incomplete': incomplete }
def add_to_rucio(self, recover=False): """""" with monitor.record_timer_block('cms_sync.time_add_block'): self.register_container() block_exists = self.register_block() if block_exists: self.update_replicas() if recover: self.make_replicas_available() self.update_rule() else: logging.critical('Unable to make the block %s', self.block_name)
def make_replicas_available(self): """ Marks available replicas for the dataset at rse if they are in PhEDEx """ with monitor.record_timer_block('cms_sync.time_recover_replica'): logging.info('Recovering unavailable replicas for %s:%s at %s', self.scope, self.block_name, self.rse) replicas = list_replicas(dids=[{ 'scope': self.scope, 'name': self.block_name }], rse_expression='rse=%s' % self.rse, all_states=True) try: unavailable_replicas = { repl['name'] for repl in replicas if repl['states'][self.rse] != 'AVAILABLE' } except TypeError: unavailable_replicas = set() phedex_replicas = set(self.replicas.keys()) missing = list(phedex_replicas & unavailable_replicas) logging.info( 'Recovery for %s:%s at %s: PhEDEx has %s, Rucio unavailable %s. Missing: %s ', self.scope, self.block_name, self.rse, len(phedex_replicas), len(unavailable_replicas), len(missing)) # Fix up things which are unavailable rse_details = get_rse(self.rse) rse_id = rse_details['id'] scope = InternalScope(self.scope) state = 'A' for name in missing: logging.info('Setting available %s:%s at %s', self.scope, name, self.rse) core_update_state(rse_id=rse_id, scope=scope, name=name, state=state) monitor.record_counter('cms_sync.files_made_available', delta=len(missing)) return
def add_missing_replicas(self, missing): """ :param missing: possible missing lfns :return: """ with monitor.record_timer_block('cms_sync.time_add_replica'): if missing and self.dry_run: logging.info('Dry run: Adding replicas %s to rse %s.', str(missing), self.rse) elif missing: logging.info('Adding %s replicas to rse %s.', len(missing), self.rse) replicas_to_add = [self.replicas[lfn] for lfn in missing] files = replica_file_list(replicas=replicas_to_add, scope=self.scope) for rucio_file in files: try: update_file = copy.deepcopy(rucio_file) update_file.update({'scope': InternalScope(self.scope), "rse_id": self.rse_id, "state": "A"}) update_replicas_states(replicas=[update_file], add_tombstone=False) except ReplicaNotFound: resurrect_file = copy.deepcopy(rucio_file) resurrect_file.update({'scope': 'cms', 'type': 'FILE'}) try: add_replicas(rse=self.rse, files=[resurrect_file], issuer=self.account, ignore_availability=True) except RucioException: logging.critical('Could not add %s to %s. Constraint violated?', resurrect_file, self.rse) resurrect_file.update({'scope': 'cms', 'type': 'FILE'}) # Reset to Internal scope by call resurrect([resurrect_file], issuer=self.account) resurrect_file.update({'scope': 'cms', 'type': 'FILE'}) # Reset to Internal scope by call add_replicas(rse=self.rse, files=[resurrect_file], issuer=self.account, ignore_availability=True) logging.critical('Resurrected %s at %s', resurrect_file, self.rse) # add_replicas(rse=self.rse, files=files, issuer=self.account) lfns = [item['name'] for item in list_files(scope=self.scope, name=self.block_name, long=False)] missing_lfns = list(set(missing) - set(lfns)) if missing_lfns: logging.debug('Attaching %s lfns to %s at %s', len(missing_lfns), self.block_name, self.rse) dids = [{'scope': self.scope, 'name': lfn} for lfn in missing_lfns] try: attach_dids(scope=self.scope, name=self.block_name, attachment={'dids': dids}, issuer=self.account) except FileAlreadyExists: logging.warning('Trying to attach already existing files to %s', self.block_name) except DataIdentifierNotFound: logging.critical('Could not attach to %s at %s. Constraint violated?', self.block_name, self.rse) return len(missing_lfns)
def add_to_rucio(self, recover=False): """""" if not self.block_in_phedex: logging.info('Declining to add %s since it is not in PhEDEx', self.block_in_phedex) return with monitor.record_timer_block('cms_sync.time_add_block'): self.register_container() block_exists = self.register_block() if block_exists: self.update_replicas() if recover: self.make_replicas_available() self.update_rule() else: logging.critical('Unable to make the block %s', self.block_name)
def LIST_DIDS_WILDCARD(self, scope, wildcard): jdoe_account = 'jdoe' client = DIDClient(account=jdoe_account) print 'run with: ' + str(wildcard) start = time() with monitor.record_timer_block('jdoe.list_dids_wildcard'): dids = [did for did in client.list_dids(scope=scope, filters=wildcard, type='dataset')] duration = time() - start cnt = len(dids) print 'got %d dids' % cnt monitor.record_counter('jdoe.list_dids_wildcard.num_results', cnt) if cnt != 0: monitor.record_counter('jdoe.list_dids_wildcard.time_per_did', duration / cnt) return {'no_datasets': cnt}
def add_missing_replicas(self, missing): """ :param missing: possible missing lfns :return: """ with monitor.record_timer_block('cms_sync.time_add_replica'): if missing and self.dry_run: logging.info('Dry run: Adding replicas %s to rse %s.', str(missing), self.rse) elif missing: logging.debug('Adding %s replicas to rse %s.', len(missing), self.rse) replicas_to_add = [self.replicas[lfn] for lfn in missing] files = replica_file_list(replicas=replicas_to_add, scope=self.scope) add_replicas(rse=self.rse, files=files, issuer=self.account) lfns = [ item['name'] for item in list_files( scope=self.scope, name=self.block_name, long=False) ] missing_lfns = list(set(missing) - set(lfns)) if missing_lfns: logging.debug('Attaching %s lfns to %s at %s', len(missing_lfns), self.block_name, self.rse) dids = [{ 'scope': self.scope, 'name': lfn } for lfn in missing_lfns] try: attach_dids(scope=self.scope, name=self.block_name, attachment={'dids': dids}, issuer=self.account) except FileAlreadyExists: logging.warning( 'Trying to attach already existing files to %s', self.block_name) return len(missing_lfns)
def remove_extra_replicas(self, to_remove): """ :param to_remove: replicas to remove from Rucio :return: """ with monitor.record_timer_block('cms_sync.time_remove_replica'): if to_remove and self.dry_run: logging.info('Dry run: Removing replicas %s from rse %s.', str(to_remove), self.rse) elif to_remove: logging.debug('Removing %s replicas from rse %s.', len(to_remove), self.rse) for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE): delete_replicas(rse=self.rse, issuer=self.account, files=[{ 'scope': self.scope, 'name': lfn } for lfn in to_remove_chunk]) # FIXME: Do we need a retry here on DatabaseException? If so, use the retry module return len(to_remove)
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param worker_number: The worker number. :param child_number: The child number. :param total_children: The total number of children created per worker. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. """ logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals()) while not graceful_stop.is_set(): try: max_deleting_rate = 0 for rse in rses: deleting_rate = 0 rse_info = rsemgr.get_rse_info(rse['rse']) rse_protocol = rse_core.get_rse_protocols(rse['rse']) if not rse_protocol['availability_delete']: logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse'])) continue # Temporary hack to force gfal for deletion for protocol in rse_info['protocols']: if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default': protocol['impl'] = 'rucio.rse.protocols.gfal.Default' logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse'])) try: needed_free_space, max_being_deleted_files = None, 10000 if not greedy: max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id']) logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals()) if needed_free_space <= 0: needed_free_space = 0 logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse'])) s = time.time() with monitor.record_timer_block('reaper.list_unlocked_replicas'): replicas = list_unlocked_replicas(rse=rse['rse'], bytes=needed_free_space, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds) logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas))) if not replicas: logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse'])) continue p = rsemgr.create_protocol(rse_info, 'delete', scheme=None) for files in chunks(replicas, chunk_size): logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files))) try: s = time.time() update_replicas_states(replicas=[dict(replica.items() + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files]) for replica in files: try: replica['pfn'] = str(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'], 'name': replica['name']}, ], operation='delete').values()[0]) except ReplicaUnAvailable as e: err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e)) logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg)) replica['pfn'] = None add_message('deletion-planned', {'scope': replica['scope'], 'name': replica['name'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'rse': rse_info['rse']}) # logging.debug('update_replicas_states %s' % (time.time() - s)) monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files)) if not scheme: try: deleted_files = [] p.connect() for replica in files: try: logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])) s = time.time() if rse['staging_area'] or rse['rse'].endswith("STAGING"): logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])) else: if replica['pfn']: p.delete(replica['pfn']) else: logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])) monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time()-s)*1000) duration = time.time() - s deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) add_message('deletion-done', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'duration': duration}) logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])) except SourceNotFound: err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) logging.warning(err_msg) deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': err_msg}) except (ServiceUnavailable, RSEAccessDenied) as e: logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e))) add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)}) except Exception as e: logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))) add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)}) except: logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))) except (ServiceUnavailable, RSEAccessDenied) as e: for replica in files: logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e))) add_message('deletion-failed', {'scope': replica['scope'], 'name': replica['name'], 'rse': rse_info['rse'], 'file-size': replica['bytes'], 'url': replica['pfn'], 'reason': str(e)}) finally: p.close() s = time.time() with monitor.record_timer_block('reaper.delete_replicas'): delete_replicas(rse=rse['rse'], files=deleted_files) logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s)) monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files)) deleting_rate += len(deleted_files) except: logging.critical(traceback.format_exc()) deleting_rate = deleting_rate * 1.0 / max_being_deleted_files if deleting_rate > max_deleting_rate: max_deleting_rate = deleting_rate except: logging.critical(traceback.format_exc()) if once: break logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate)) sleep_time = int((1 - max_deleting_rate) * 60 + 1) time.sleep(sleep_time) except: logging.critical(traceback.format_exc()) logging.info('Graceful stop requested') logging.info('Graceful stop done')
def remove_from_rucio(self): """""" with monitor.record_timer_block('cms_sync.time_remove_block'): self.update_replicas() self.update_rule()
def get_signed_url(rse_id, service, operation, url, lifetime=600): """ Get a signed URL for a particular service and operation. The signed URL will be valid for 1 hour but can be overriden. :param rse_id: The ID of the RSE that the URL points to. :param service: The service to authorise, either 'gcs', 's3' or 'swift'. :param operation: The operation to sign, either 'read', 'write', or 'delete'. :param url: The URL to sign. :param lifetime: Lifetime of the signed URL in seconds. :returns: Signed URL as a variable-length string. """ global CREDS_GCS if service not in ['gcs', 's3', 'swift']: raise UnsupportedOperation('Service must be "gcs", "s3" or "swift"') if operation not in ['read', 'write', 'delete']: raise UnsupportedOperation( 'Operation must be "read", "write", or "delete"') if url is None or url == '': raise UnsupportedOperation('URL must not be empty') if lifetime: if not isinstance(lifetime, integer_types): try: lifetime = int(lifetime) except: raise UnsupportedOperation( 'Lifetime must be convertible to numeric.') signed_url = None if service == 'gcs': if not CREDS_GCS: CREDS_GCS = ServiceAccountCredentials.from_json_keyfile_name( config_get( 'credentials', 'gcs', raise_exception=False, default='/opt/rucio/etc/google-cloud-storage-test.json')) components = urlparse(url) host = components.netloc # select the correct operation operations = {'read': 'GET', 'write': 'PUT', 'delete': 'DELETE'} operation = operations[operation] # special case to test signature, force epoch time if lifetime is None: lifetime = 0 else: # GCS is timezone-sensitive, don't use UTC # has to be converted to Unixtime lifetime = datetime.datetime.now() + datetime.timedelta( seconds=lifetime) lifetime = int(time.mktime(lifetime.timetuple())) # sign the path only path = components.path # assemble message to sign to_sign = "%s\n\n\n%s\n%s" % (operation, lifetime, path) # create URL-capable signature # first character is always a '=', remove it signature = urlencode( {'': base64.b64encode(CREDS_GCS.sign_blob(to_sign)[1])})[1:] # assemble final signed URL signed_url = 'https://%s%s?GoogleAccessId=%s&Expires=%s&Signature=%s' % ( host, path, CREDS_GCS.service_account_email, lifetime, signature) elif service == 's3': # split URL to get hostname, bucket and key components = urlparse(url) host = components.netloc pathcomponents = components.path.split('/') if len(pathcomponents) < 3: raise UnsupportedOperation('Not a valid S3 URL') bucket = pathcomponents[1] key = '/'.join(pathcomponents[2:]) # remove port number from host if present colon = host.find(':') port = '443' if colon >= 0: port = host[colon + 1:] host = host[:colon] # look up in RSE account configuration by RSE ID cred_name = rse_id cred = REGION.get('s3-%s' % cred_name) if cred is NO_VALUE: rse_cred = get_rse_credentials() cred = rse_cred.get(cred_name) REGION.set('s3-%s' % cred_name, cred) access_key = cred['access_key'] secret_key = cred['secret_key'] signature_version = cred['signature_version'] region_name = cred['region'] if operation == 'read': s3op = 'get_object' elif operation == 'write': s3op = 'put_object' else: s3op = 'delete_object' with record_timer_block('credential.signs3'): s3 = boto3.client('s3', endpoint_url='https://' + host + ':' + port, aws_access_key_id=access_key, aws_secret_access_key=secret_key, config=Config( signature_version=signature_version, region_name=region_name)) signed_url = s3.generate_presigned_url(s3op, Params={ 'Bucket': bucket, 'Key': key }, ExpiresIn=lifetime) elif service == 'swift': # split URL to get hostname and path components = urlparse(url) host = components.netloc # remove port number from host if present colon = host.find(':') if colon >= 0: host = host[:colon] # use RSE ID to look up key cred_name = rse_id # look up tempurl signing key cred = REGION.get('swift-%s' % cred_name) if cred is NO_VALUE: rse_cred = get_rse_credentials() cred = rse_cred.get(cred_name) REGION.set('swift-%s' % cred_name, cred) tempurl_key = cred['tempurl_key'] if operation == 'read': swiftop = 'GET' elif operation == 'write': swiftop = 'PUT' else: swiftop = 'DELETE' expires = int(time.time() + lifetime) # create signed URL with record_timer_block('credential.signswift'): hmac_body = u'%s\n%s\n%s' % (swiftop, expires, components.path) # Python 3 hmac only accepts bytes or bytearray sig = hmac.new(bytearray(tempurl_key, 'utf-8'), bytearray(hmac_body, 'utf-8'), sha1).hexdigest() signed_url = 'https://' + host + components.path + '?temp_url_sig=' + sig + '&temp_url_expires=' + str( expires) return signed_url
# TODO: fix when meta-data is working properly filename = datasetname + '._' + uuid() onefile = {'name': filename, 'scope': scope, 'bytes': tz_filesize, 'pfn': 'mock://srm-atlas.cern.ch:8443/castor/cern.ch/grid/atlas/tzero/this/is/from/the/emulation/%s' % filename} no_files += 1 newfiles.append(onefile) d[3] = target # with monitor.record_timer_block(['panda.attach_dids_to_dids', ('panda.attach_dids_to_dids.normalized_datasets', len(inserts)), ('panda.attach_dids_to_dids.normalized_files', no_files)]): # client.attach_dids_to_dids(attachments=inserts) monitor.record_counter('emulator.tzero.client.files', len(newfiles)) while len(newfiles): with monitor.record_timer_block(['emulator.tzero.client.add_files_to_dataset', ('emulator.tzero.client.add_files_to_dataset.normalized', len(newfiles))]): try: client.add_files_to_dataset(scope=scope, name=datasetname, files=newfiles[:500], rse=tz_rse) del newfiles[:500] except Exception, e: print traceback.format_exc() delta = time.time() - now print '== TZero: Appending %s files to %s datasets took %s seconds' % (no_files, len(ds), delta) monitor.record_timer('emulator.tzero.registering_all_replicas', delta * 1000) monitor.record_timer('emulator.tzero.registering_all_replicas.normalized', (delta * 1000) / no_files) # close all datasets for scope, datasetname, s, c in ds: print '== TZero: Close dataset %s:%s' % (scope, datasetname) with monitor.record_timer_block('emulator.tzero.client.close'): try: client.close(scope=scope, name=datasetname) except Exception, e: monitor.record_counter('emulator.exceptions.tzero.EMULATION_RUN.%s' % e.__class__.__name__, 1) print traceback.format_exc()
def reaper(rses, chunk_size=100, once=False, greedy=False, scheme=None, delay_seconds=0, sleep_time=60): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param chunk_size: The size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param delay_seconds: The delay to query replicas in BEING_DELETED state. :param sleep_time: Time between two cycles. """ try: max_deletion_thread = get('reaper', 'nb_workers_by_hostname') except ConfigNotFound as error: max_deletion_thread = 5 hostname = socket.getfqdn() executable = sys.argv[0] pid = os.getpid() hb_thread = threading.current_thread() sanity_check(executable=executable, hostname=hostname) heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) logging.info('%s Reaper starting', prepend_str) time.sleep( 10 ) # To prevent running on the same partition if all the reapers restart at the same time heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) logging.info('%s Reaper started', prepend_str) while not GRACEFUL_STOP.is_set(): start_time = time.time() try: staging_areas = [] dict_rses = {} heart_beat = live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads']) tot_needed_free_space = 0 for rse in rses: # Check if the RSE is a staging area if rse['staging_area']: staging_areas.append(rse['rse']) # Check if RSE is blacklisted if rse['availability'] % 2 == 0: logging.debug('%s RSE %s is blacklisted for delete', prepend_str, rse['rse']) continue max_being_deleted_files, needed_free_space, used, free = __check_rse_usage( rse['rse'], rse['id'], prepend_str) # Check if greedy mode if greedy: dict_rses[(rse['rse'], rse['id'])] = [ 1000000000000, max_being_deleted_files ] tot_needed_free_space += 1000000000000 else: if needed_free_space: dict_rses[(rse['rse'], rse['id'])] = [ needed_free_space, max_being_deleted_files ] tot_needed_free_space += needed_free_space else: logging.debug('%s Nothing to delete on %s', prepend_str, rse['rse']) # Ordering the RSEs based on the needed free space sorted_dict_rses = OrderedDict( sorted(dict_rses.items(), key=itemgetter(1), reverse=True)) logging.debug( '%s List of RSEs to process ordered by needed space desc : %s', prepend_str, str(sorted_dict_rses)) # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostanme, rse_info) as value rses_hostname_mapping = get_rses_to_hostname_mapping() # logging.debug('%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping)) list_rses_mult = [] # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity for rse_key in dict_rses: rse_name, rse_id = rse_key # The length of the deletion queue scales inversily with the number of workers # The ceil increase the weight of the RSE with small amount of files to delete max_workers = ceil(dict_rses[rse_key][0] / tot_needed_free_space * 1000 / heart_beat['nr_threads']) list_rses_mult.extend([ (rse_name, rse_id, dict_rses[rse_key][0], dict_rses[rse_key][1]) for _ in range(int(max_workers)) ]) random.shuffle(list_rses_mult) skip_until_next_run = [] for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult: if rse_id in skip_until_next_run: continue logging.debug( '%s Working on %s. Percentage of the total space needed %.2f', prepend_str, rse_name, needed_free_space / tot_needed_free_space * 100) rse_hostname, rse_info = rses_hostname_mapping[rse_id] rse_hostname_key = '%s,%s' % (rse_id, rse_hostname) payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None) # logging.debug('%s Payload count : %s', prepend_str, str(payload_cnt)) tot_threads_for_hostname = 0 tot_threads_for_rse = 0 for key in payload_cnt: if key and key.find(',') > -1: if key.split(',')[1] == rse_hostname: tot_threads_for_hostname += payload_cnt[key] if key.split(',')[0] == str(rse_id): tot_threads_for_rse += payload_cnt[key] if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread: logging.debug( '%s Too many deletion threads for %s on RSE %s. Back off', prepend_str, rse_hostname, rse_name) # Might need to reschedule a try on this RSE later in the same cycle continue logging.info( '%s Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s', prepend_str, rse_hostname, tot_threads_for_hostname, max_deletion_thread, rse_name) live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None) logging.debug('%s Total deletion workers for %s : %i', prepend_str, rse_hostname, tot_threads_for_hostname + 1) # List and mark BEING_DELETED the files to delete del_start_time = time.time() try: with monitor.record_timer_block( 'reaper.list_unlocked_replicas'): replicas = list_and_mark_unlocked_replicas( limit=chunk_size, bytes=needed_free_space, rse_id=rse_id, delay_seconds=delay_seconds, session=None) logging.debug( '%s list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', prepend_str, rse_name, needed_free_space, time.time() - del_start_time, len(replicas)) if len(replicas) < chunk_size: logging.info( '%s Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle', prepend_str, rse_name, chunk_size, len(replicas)) skip_until_next_run.append(rse_id) except (DatabaseException, IntegrityError, DatabaseError) as error: logging.error('%s %s', prepend_str, str(error)) continue except Exception: logging.critical('%s %s', prepend_str, str(traceback.format_exc())) # Physical deletion will take place there try: prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme) for file_replicas in chunks(replicas, 100): # Refresh heartbeat live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None) del_start_time = time.time() for replica in file_replicas: try: replica['pfn'] = str( rsemgr.lfns2pfns( rse_settings=rse_info, lfns=[{ 'scope': replica['scope'], 'name': replica['name'], 'path': replica['path'] }], operation='delete', scheme=scheme).values()[0]) time.sleep(random.uniform(0, 0.01)) except (ReplicaUnAvailable, ReplicaNotFound) as error: logging.warning( '%s Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s', prepend_str, replica['scope'], replica['name'], rse_name, str(error)) replica['pfn'] = None except Exception: logging.critical('%s %s', prepend_str, str(traceback.format_exc())) deleted_files = delete_from_storage( file_replicas, prot, rse_info, staging_areas, prepend_str) logging.info('%s %i files processed in %s seconds', prepend_str, len(file_replicas), time.time() - del_start_time) # Then finally delete the replicas del_start = time.time() with monitor.record_timer_block( 'reaper.delete_replicas'): delete_replicas(rse_id=rse_id, files=deleted_files) logging.debug( '%s delete_replicas successed on %s : %s replicas in %s seconds', prepend_str, rse_name, len(deleted_files), time.time() - del_start) monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files)) except Exception as error: logging.critical('%s %s', prepend_str, str(traceback.format_exc())) if once: break tottime = time.time() - start_time if tottime < sleep_time: logging.info('%s Will sleep for %s seconds', prepend_str, sleep_time - tottime) time.sleep(sleep_time - tottime) except DatabaseException as error: logging.warning('%s Reaper: %s', prepend_str, str(error)) except Exception: logging.critical('%s %s', prepend_str, str(traceback.format_exc())) finally: if once: break die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread) logging.info('%s Graceful stop requested', prepend_str) logging.info('%s Graceful stop done', prepend_str) return
def reaper(rses, include_rses, exclude_rses, vos=None, chunk_size=100, once=False, greedy=False, scheme=None, delay_seconds=0, sleep_time=60, auto_exclude_threshold=100, auto_exclude_timeout=600): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param include_rses: RSE expression to include RSEs. :param exclude_rses: RSE expression to exclude RSEs from the Reaper. :param vos: VOs on which to look for RSEs. Only used in multi-VO mode. If None, we either use all VOs if run from "def", or the current VO otherwise. :param chunk_size: The size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. :param delay_seconds: The delay to query replicas in BEING_DELETED state. :param sleep_time: Time between two cycles. :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded. :param auto_exclude_timeout: Timeout for temporarily excluded RSEs. """ hostname = socket.getfqdn() executable = 'reaper' pid = os.getpid() hb_thread = threading.current_thread() sanity_check(executable=executable, hostname=hostname) heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Reaper starting') if not once: GRACEFUL_STOP.wait( 10 ) # To prevent running on the same partition if all the reapers restart at the same time heart_beat = live(executable, hostname, pid, hb_thread) prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') logger(logging.INFO, 'Reaper started') while not GRACEFUL_STOP.is_set(): # try to get auto exclude parameters from the config table. Otherwise use CLI parameters. try: auto_exclude_threshold = config_get('reaper', 'auto_exclude_threshold', default=auto_exclude_threshold) auto_exclude_timeout = config_get('reaper', 'auto_exclude_timeout', default=auto_exclude_timeout) except (NoOptionError, NoSectionError, RuntimeError): pass # Check if there is a Judge Evaluator backlog try: max_evaluator_backlog_count = config_get( 'reaper', 'max_evaluator_backlog_count') except (NoOptionError, NoSectionError, RuntimeError): max_evaluator_backlog_count = None try: max_evaluator_backlog_duration = config_get( 'reaper', 'max_evaluator_backlog_duration') except (NoOptionError, NoSectionError, RuntimeError): max_evaluator_backlog_duration = None if max_evaluator_backlog_count or max_evaluator_backlog_duration: backlog = get_evaluation_backlog() if max_evaluator_backlog_count and \ backlog[0] and \ max_evaluator_backlog_duration and \ backlog[1] and \ backlog[0] > max_evaluator_backlog_count and \ backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration): logger( logging.ERROR, 'Reaper: Judge evaluator backlog count and duration hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_count and backlog[ 0] and backlog[0] > max_evaluator_backlog_count: logger( logging.ERROR, 'Reaper: Judge evaluator backlog count hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue elif max_evaluator_backlog_duration and backlog[ 1] and backlog[1] < datetime.utcnow() - timedelta( minutes=max_evaluator_backlog_duration): logger( logging.ERROR, 'Reaper: Judge evaluator backlog duration hit, stopping operation' ) GRACEFUL_STOP.wait(30) continue rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses, vos) if not rses_to_process: logger(logging.ERROR, 'Reaper: No RSEs found. Will sleep for 30 seconds') GRACEFUL_STOP.wait(30) continue start_time = time.time() try: staging_areas = [] dict_rses = {} heart_beat = live(executable, hostname, pid, hb_thread, older_than=3600) prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'], heart_beat['nr_threads']) logger = formatted_logger(logging.log, prepend_str + '%s') tot_needed_free_space = 0 for rse in rses_to_process: # Check if the RSE is a staging area if rse['staging_area']: staging_areas.append(rse['rse']) # Check if RSE is blocklisted if rse['availability'] % 2 == 0: logger(logging.DEBUG, 'RSE %s is blocklisted for delete', rse['rse']) continue needed_free_space, only_delete_obsolete = __check_rse_usage( rse['rse'], rse['id'], greedy=greedy, logger=logger) if needed_free_space: dict_rses[(rse['rse'], rse['id'])] = [ needed_free_space, only_delete_obsolete ] tot_needed_free_space += needed_free_space elif only_delete_obsolete: dict_rses[(rse['rse'], rse['id'])] = [ needed_free_space, only_delete_obsolete ] else: logger(logging.DEBUG, 'Nothing to delete on %s', rse['rse']) # Ordering the RSEs based on the needed free space sorted_dict_rses = OrderedDict( sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True)) logger(logging.DEBUG, 'List of RSEs to process ordered by needed space desc: %s', str(sorted_dict_rses)) # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostanme, rse_info) as value rses_hostname_mapping = get_rses_to_hostname_mapping() # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping)) list_rses_mult = [] # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity for rse_key in dict_rses: rse_name, rse_id = rse_key # The length of the deletion queue scales inversily with the number of workers # The ceil increase the weight of the RSE with small amount of files to delete if tot_needed_free_space: max_workers = ceil(dict_rses[rse_key][0] / tot_needed_free_space * 1000 / heart_beat['nr_threads']) else: max_workers = 1 list_rses_mult.extend([ (rse_name, rse_id, dict_rses[rse_key][0], dict_rses[rse_key][1]) for _ in range(int(max_workers)) ]) random.shuffle(list_rses_mult) paused_rses = [] for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult: result = REGION.get('pause_deletion_%s' % rse_id, expiration_time=120) if result is not NO_VALUE: paused_rses.append(rse_name) logger( logging.DEBUG, 'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while', rse_name) continue result = REGION.get('temporary_exclude_%s' % rse_id, expiration_time=auto_exclude_timeout) if result is not NO_VALUE: logger( logging.WARNING, 'Too many failed attempts for %s in last cycle. RSE is temporarly excluded.', rse_name) labels = {'rse': rse_name} EXCLUDED_RSE_GAUGE.labels(**labels).set(1) continue labels = {'rse': rse_name} EXCLUDED_RSE_GAUGE.labels(**labels).set(0) percent = 0 if tot_needed_free_space: percent = needed_free_space / tot_needed_free_space * 100 logger( logging.DEBUG, 'Working on %s. Percentage of the total space needed %.2f', rse_name, percent) try: rse_hostname, rse_info = rses_hostname_mapping[rse_id] except KeyError: logger(logging.DEBUG, "Hostname lookup for %s failed.", rse_name) REGION.set('pause_deletion_%s' % rse_id, True) continue rse_hostname_key = '%s,%s' % (rse_id, rse_hostname) payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None) # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt)) tot_threads_for_hostname = 0 tot_threads_for_rse = 0 for key in payload_cnt: if key and key.find(',') > -1: if key.split(',')[1] == rse_hostname: tot_threads_for_hostname += payload_cnt[key] if key.split(',')[0] == str(rse_id): tot_threads_for_rse += payload_cnt[key] max_deletion_thread = get_max_deletion_threads_by_hostname( rse_hostname) if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread: logger( logging.DEBUG, 'Too many deletion threads for %s on RSE %s. Back off', rse_hostname, rse_name) # Might need to reschedule a try on this RSE later in the same cycle continue logger( logging.INFO, 'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s', rse_hostname, tot_threads_for_hostname, max_deletion_thread, rse_name) live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None) logger(logging.DEBUG, 'Total deletion workers for %s : %i', rse_hostname, tot_threads_for_hostname + 1) # List and mark BEING_DELETED the files to delete del_start_time = time.time() only_delete_obsolete = dict_rses[(rse_name, rse_id)][1] try: with monitor.record_timer_block( 'reaper.list_unlocked_replicas'): if only_delete_obsolete: logger( logging.DEBUG, 'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas', rse_name) replicas = list_and_mark_unlocked_replicas( limit=chunk_size, bytes_=needed_free_space, rse_id=rse_id, delay_seconds=delay_seconds, only_delete_obsolete=only_delete_obsolete, session=None) logger( logging.DEBUG, 'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', rse_name, needed_free_space, time.time() - del_start_time, len(replicas)) if len(replicas) < chunk_size: logger( logging.DEBUG, 'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle', rse_name, chunk_size, len(replicas)) REGION.set('pause_deletion_%s' % rse_id, True) except (DatabaseException, IntegrityError, DatabaseError) as error: logger(logging.ERROR, '%s', str(error)) continue except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) # Physical deletion will take place there try: prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme, logger=logger) for file_replicas in chunks(replicas, chunk_size): # Refresh heartbeat live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None) del_start_time = time.time() for replica in file_replicas: try: replica['pfn'] = str( list( rsemgr.lfns2pfns( rse_settings=rse_info, lfns=[{ 'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path'] }], operation='delete', scheme=scheme).values())[0]) except (ReplicaUnAvailable, ReplicaNotFound) as error: logger( logging.WARNING, 'Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s', replica['scope'], replica['name'], rse_name, str(error)) replica['pfn'] = None except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) deleted_files = delete_from_storage( file_replicas, prot, rse_info, staging_areas, auto_exclude_threshold, logger=logger) logger(logging.INFO, '%i files processed in %s seconds', len(file_replicas), time.time() - del_start_time) # Then finally delete the replicas del_start = time.time() with monitor.record_timer_block( 'reaper.delete_replicas'): delete_replicas(rse_id=rse_id, files=deleted_files) logger( logging.DEBUG, 'delete_replicas successed on %s : %s replicas in %s seconds', rse_name, len(deleted_files), time.time() - del_start) DELETION_COUNTER.inc(len(deleted_files)) except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) if paused_rses: logger(logging.INFO, 'Deletion paused for a while for following RSEs: %s', ', '.join(paused_rses)) if once: break daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger) except DatabaseException as error: logger(logging.WARNING, 'Reaper: %s', str(error)) except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) finally: if once: break die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread) logger(logging.INFO, 'Graceful stop requested') logger(logging.INFO, 'Graceful stop done') return
def sync_site(self, site_pair): """ Sync a site defined by a site_pair of (site, prefix). Prefix can be None to sync all blocks in the site :return: """ site, prefix = site_pair # now = int(time.time()) # Set 1980 as the last sync date if no data exists # site_last_synced = self.last_synced.get(site_pair, 10 * 365 * 24 * 3600) # last_week = int(site_last_synced - 7 * 24 * 3600) if self.config.get('default', None): if self.config['default'].get('chunck', 0): BLOCKS_PER_ACTION = int(self.config['default']['chunck']) with monitor.record_timer_block('cms_sync.time_site_sync'): r_timer = 'cms_sync.time_rucio_block_list_all' p_timer = 'cms_sync.time_phedex_block_list_all' if prefix: r_timer = 'cms_sync.time_rucio_block_list_partial' p_timer = 'cms_sync.time_phedex_block_list_partial' with monitor.record_timer_block(p_timer): phedex_blocks = self.phedex_svc.blocks_at_site(pnn=site, prefix=prefix, since=None) with monitor.record_timer_block(r_timer): rucio_blocks = self.get_datasets_at_rse(rse=site, prefix=prefix) n_blocks_in_phedex = len(phedex_blocks) n_blocks_in_rucio = len(rucio_blocks) # FIXME: This is refusing to delete everything from Rucio. Not clear it's needed if not n_blocks_in_phedex and n_blocks_in_rucio: logging.warning( "At %s found %s blocks in PhEDEx and %s in Rucio with prefix %s", site, n_blocks_in_phedex, n_blocks_in_rucio, prefix) return if not n_blocks_in_phedex and not n_blocks_in_rucio: logging.info( "At %s:%s, nothing in PhEDEx or Rucio. Quitting." % (site, prefix)) return block_report = compare_site_blocks(phedex=phedex_blocks, rucio=rucio_blocks, rse=site) n_blocks_not_in_rucio = len(block_report['not_rucio']) n_blocks_not_in_phedex = len(block_report['not_phedex']) n_incomplete_blocks = len(block_report['incomplete']) logging.info("At %s: In both/PhEDEx only/Rucio only: %s/%s/%s" % (site, len(block_report['complete']), n_blocks_not_in_rucio, n_blocks_not_in_phedex)) # Truncate lists if we want to reduce cycle time if BLOCKS_PER_ACTION and n_blocks_not_in_rucio > BLOCKS_PER_ACTION: block_report['not_rucio'] = set( list(block_report['not_rucio'])[:BLOCKS_PER_ACTION]) n_blocks_not_in_rucio = len(block_report['not_rucio']) if BLOCKS_PER_ACTION and n_blocks_not_in_phedex > BLOCKS_PER_ACTION: block_report['not_phedex'] = set( list(block_report['not_phedex'])[:BLOCKS_PER_ACTION]) n_blocks_not_in_phedex = len(block_report['not_phedex']) logging.info('Adding %6d blocks to Rucio for %s:%s', n_blocks_not_in_rucio, site, prefix) for block in block_report['not_rucio']: bs = BlockSyncer(block_name=block, pnn=site, rse=site) bs.add_to_rucio() logging.info('Removing %6d blocks from Rucio for %s:%s', n_blocks_not_in_phedex, site, prefix) for block in block_report['not_phedex']: bs = BlockSyncer(block_name=block, pnn=site, rse=site) bs.remove_from_rucio() for block in block_report['incomplete']: logging.warn('Redoing sync for %s at %s', block, site) bs = BlockSyncer(block_name=block, pnn=site, rse=site) bs.add_to_rucio(recover=True) logging.info('Finished syncing %s:%s' % (site, prefix))
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, delay_seconds=0): """ Main loop to select and delete files. :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs. :param worker_number: The worker number. :param child_number: The child number. :param total_children: The total number of children created per worker. :param chunk_size: the size of chunk for deletion. :param once: If True, only runs one iteration of the main loop. :param greedy: If True, delete right away replicas with tombstone. :param scheme: Force the reaper to use a particular protocol, e.g., mock. """ logging.info('Starting Reaper: Worker %(worker_number)s, ' 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses])) pid = os.getpid() thread = threading.current_thread() hostname = socket.gethostname() executable = ' '.join(sys.argv) # Generate a hash just for the subset of RSEs rse_names = [rse['rse'] for rse in rses] hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest() sanity_check(executable=None, hostname=hostname) nothing_to_do = {} while not GRACEFUL_STOP.is_set(): try: # heartbeat heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) checkpoint_time = datetime.datetime.now() # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals())) max_deleting_rate = 0 for rse in sort_rses(rses): try: if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now(): heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals())) checkpoint_time = datetime.datetime.now() if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now(): continue logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number, rse['rse'], nothing_to_do.get(rse['id'])) rse_info = rsemgr.get_rse_info(rse_id=rse['id']) rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id']) if not rse_protocol['availability_delete']: logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse']) nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30) continue # Temporary hack to force gfal for deletion for protocol in rse_info['protocols']: if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default': protocol['impl'] = 'rucio.rse.protocols.gfal.Default' needed_free_space, max_being_deleted_files = None, 100 needed_free_space_per_child = None if not greedy: max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id']) logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals()) if needed_free_space <= 0: needed_free_space, needed_free_space_per_child = 0, 0 logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse']) else: if total_children and total_children > 0: needed_free_space_per_child = needed_free_space / float(total_children) start = time.time() with monitor.record_timer_block('reaper.list_unlocked_replicas'): replicas = list_unlocked_replicas(rse_id=rse['id'], bytes=needed_free_space_per_child, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds) logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas)) if not replicas: nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30) logging.info('Reaper %s-%s: No replicas to delete %s. The next check will occur at %s', worker_number, child_number, rse['rse'], nothing_to_do[rse['id']]) continue prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme) for files in chunks(replicas, chunk_size): logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files)) try: update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True) for replica in files: try: replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}], operation='delete', scheme=scheme).values())[0]) except (ReplicaUnAvailable, ReplicaNotFound) as error: err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error)) logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg) replica['pfn'] = None monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files)) try: deleted_files = [] prot.connect() for replica in files: try: deletion_dict = {'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'protocol': prot.attributes['scheme']} if replica['scope'].vo != 'def': deletion_dict['vo'] = replica['scope'].vo logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) start = time.time() if rse['staging_area'] or rse['rse'].endswith("STAGING"): logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) else: if replica['pfn']: pfn = replica['pfn'] # sign the URL if necessary if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None: pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn) prot.delete(pfn) else: logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']) monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000) duration = time.time() - start deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) deletion_dict['duration'] = duration add_message('deletion-done', deletion_dict) logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration) except SourceNotFound: err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], replica['pfn'], rse['rse']) logging.warning(err_msg) deleted_files.append({'scope': replica['scope'], 'name': replica['name']}) if replica['state'] == ReplicaState.AVAILABLE: deletion_dict['reason'] = str(err_msg) add_message('deletion-failed', deletion_dict) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error)) deletion_dict['reason'] = str(error) add_message('deletion-failed', deletion_dict) except Exception as error: logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())) deletion_dict['reason'] = str(error) add_message('deletion-failed', deletion_dict) except: logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())) except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error: for replica in files: logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error)) payload = {'scope': replica['scope'].external, 'name': replica['name'], 'rse': rse_info['rse'], 'rse_id': rse_info['id'], 'file-size': replica['bytes'], 'bytes': replica['bytes'], 'url': replica['pfn'], 'reason': str(error), 'protocol': prot.attributes['scheme']} if replica['scope'].vo != 'def': deletion_dict['vo'] = replica['scope'].vo add_message('deletion-failed', payload) break finally: prot.close() start = time.time() with monitor.record_timer_block('reaper.delete_replicas'): delete_replicas(rse_id=rse['id'], files=deleted_files) logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start) monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files)) except DatabaseException as error: logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error)) except UnsupportedOperation as error: logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error)) except: logging.critical(traceback.format_exc()) except RSENotFound as error: logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error)) except: logging.critical(traceback.format_exc()) if once: break time.sleep(1) except DatabaseException as error: logging.warning('Reaper: %s', str(error)) except: logging.critical(traceback.format_exc()) die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable) logging.info('Graceful stop requested') logging.info('Graceful stop done') return
def EMULATION_RUN(self, interval, filescale, dataXX, runnumber, runspertag, calibfraction, timespan): tz_account = 'tzero' tz_group = 'tzero' tz_provenance = 'T0' tz_rse = 'CERN-PROD_TZERO' tz_filesize = 2000000000 # 2GB runnumber = random.randrange(400000, 500000) client = Client(account=tz_account) # Initializing run data if random.random() > calibfraction: print '== TZero: Starting calibration run no. %s' % runnumber pattern = [('data12_8TeV.NNNNNNNN.calibration_LArCellsEmpty.daq.RAW', 36), ('data12_8TeV.NNNNNNNN.calibration_Tile.daq.RAW', 36), ('data12_8TeV.NNNNNNNN.calibration_lucid.daq.RAW', 6), ('data12_8TeV.NNNNNNNN.physics_Background.merge.AOD.f1_m4', 11), ('data12_8TeV.NNNNNNNN.physics_Background.merge.AOD.x1_m4', 11), ('data12_8TeV.NNNNNNNN.physics_Background.merge.HIST.f1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_Background.merge.HIST.x1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_Background.merge.NTUP_BKGD.x1_m2', 7), ('data12_8TeV.NNNNNNNN.physics_Background.merge.NTUP_SCT.f1_m2', 7), ('data12_8TeV.NNNNNNNN.physics_Background.merge.RAW', 208), ('data12_8TeV.NNNNNNNN.physics_Background.merge.TAG.f1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_Background.merge.TAG.x1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_Background.recon.ESD.f1', 208), ('data12_8TeV.NNNNNNNN.physics_Background.recon.ESD.x1', 208), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.AOD.f1_m4', 11), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.AOD.x1_m4', 11), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.HIST.f1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.HIST.x1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.RAW', 205), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.TAG.f1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.TAG.x1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.recon.ESD.f1', 205), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.recon.ESD.x1', 205), ('data12_8TeV.NNNNNNNN.physics_Standby.merge.AOD.x1_m4', 11), ('data12_8TeV.NNNNNNNN.physics_Standby.merge.HIST.x1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_Standby.merge.RAW', 207), ('data12_8TeV.NNNNNNNN.physics_Standby.merge.TAG.x1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_Standby.recon.ESD.x1', 207), ] else: print '== TZero: Starting data run no. %s' % runnumber pattern = [('data12_8TeV.NNNNNNNN.calibration_IDTracks.daq.RAW', 141), ('data12_8TeV.NNNNNNNN.calibration_LArCells.daq.RAW', 63), ('data12_8TeV.NNNNNNNN.calibration_LArCellsEmpty.daq.RAW', 63), ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.daq.RAW', 106), ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.merge.AOD.c819_m4', 66), ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.merge.NTUP_IDVTXLUMI.c819_m2', 10), ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.merge.TAG.c819_m4_m3', 2), ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.recon.ESD.c819', 271), ('data12_8TeV.NNNNNNNN.calibration_PixelNoise.daq.RAW', 63), ('data12_8TeV.NNNNNNNN.calibration_SCTNoise.daq.RAW', 63), ('data12_8TeV.NNNNNNNN.calibration_Tile.daq.RAW', 63), ('data12_8TeV.NNNNNNNN.calibration_beamspot.daq.RAW', 63), ('data12_8TeV.NNNNNNNN.calibration_lucid.daq.RAW', 10), ('data12_8TeV.NNNNNNNN.debug_all.daq.RAW', 4), ('data12_8TeV.NNNNNNNN.debugrec_hltacc.merge.AOD.g1_f2_m4', 1), ('data12_8TeV.NNNNNNNN.debugrec_hltacc.merge.NTUP_FASTMON.g1_f2_m4_m8', 1), ('data12_8TeV.NNNNNNNN.debugrec_hltacc.merge.RAW.g1', 1), ('data12_8TeV.NNNNNNNN.debugrec_hltacc.merge.TAG.g1_f2_m4_m3', 1), ('data12_8TeV.NNNNNNNN.debugrec_hltacc.recon.ESD.g1_f2', 1), ('data12_8TeV.NNNNNNNN.express_express.merge.AOD.f2_m4', 21), ('data12_8TeV.NNNNNNNN.express_express.merge.AOD.x1_m4', 21), ('data12_8TeV.NNNNNNNN.express_express.merge.HIST.f2_m1', 1), ('data12_8TeV.NNNNNNNN.express_express.merge.HIST.x1_m1', 1), ('data12_8TeV.NNNNNNNN.express_express.merge.NTUP_SCT.f2_m2', 31), ('data12_8TeV.NNNNNNNN.express_express.merge.NTUP_TRIG.x1_m2', 121), ('data12_8TeV.NNNNNNNN.express_express.merge.RAW', 402), ('data12_8TeV.NNNNNNNN.express_express.merge.TAG.f2_m4_m3', 1), ('data12_8TeV.NNNNNNNN.express_express.merge.TAG.x1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.express_express.recon.ESD.f2', 402), ('data12_8TeV.NNNNNNNN.express_express.recon.ESD.x1', 3614), ('data12_8TeV.NNNNNNNN.physics_Background.merge.AOD.f2_m4', 22), ('data12_8TeV.NNNNNNNN.physics_Background.merge.AOD.x1_m4', 22), ('data12_8TeV.NNNNNNNN.physics_Background.merge.HIST.f2_m1', 1), ('data12_8TeV.NNNNNNNN.physics_Background.merge.HIST.x1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_Background.merge.NTUP_BKGD.x1_m2', 15), ('data12_8TeV.NNNNNNNN.physics_Background.merge.NTUP_SCT.f2_m2', 15), ('data12_8TeV.NNNNNNNN.physics_Background.merge.RAW', 429), ('data12_8TeV.NNNNNNNN.physics_Background.merge.TAG.f2_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_Background.merge.TAG.x1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_Background.recon.ESD.f2', 429), ('data12_8TeV.NNNNNNNN.physics_Background.recon.ESD.x1', 429), ('data12_8TeV.NNNNNNNN.physics_Bphysics.merge.RAW', 3150), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.AOD.f2_m4', 22), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.AOD.x1_m4', 22), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.HIST.f2_m1', 1), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.HIST.x1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.RAW', 425), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.TAG.f2_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.TAG.x1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.recon.ESD.f2', 425), ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.recon.ESD.x1', 425), ('data12_8TeV.NNNNNNNN.physics_CosmicMuons.merge.RAW', 399), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.AOD.f2_m4', 312), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DESDM_EGAMMA.f2_m6', 31), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DESDM_RPVLL.f2_m6', 21), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DESD_PHOJET.f2_m6', 74), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DESD_SGLEL.f2_m6', 57), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DRAW_ZEE.f2_m9', 37), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.HIST.f2_m1', 1), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.NTUP_FASTMON.f2_m4_m8', 18), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.RAW', 3626), ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.TAG.f2_m4_m3', 7), ('data12_8TeV.NNNNNNNN.physics_Egamma.recon.DAOD_ZEE.f2_m9_f2', 37), ('data12_8TeV.NNNNNNNN.physics_Egamma.recon.DESD_ZEE.f2_m9_f2', 37), ('data12_8TeV.NNNNNNNN.physics_Egamma.recon.ESD.f2', 3626), ('data12_8TeV.NNNNNNNN.physics_HadDelayed.merge.RAW', 3215), ('data12_8TeV.NNNNNNNN.physics_IDCosmic.merge.RAW', 402), ('data12_8TeV.NNNNNNNN.physics_JetCalibDelayed.merge.RAW', 403), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.AOD.f2_m4', 364), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.DESDM_RPVLL.f2_m6', 150), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.DESDM_TRACK.f2_m6', 25), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.DESD_CALJET.f2_m6', 71), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.HIST.f2_m1', 1), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.NTUP_FASTMON.f2_m4_m8', 27), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.RAW', 3624), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.TAG.f2_m4_m3', 8), ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.recon.ESD.f2', 3624), ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.AOD.f2_m4', 21), ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.DESD_MBIAS.f2_m6', 24), ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.HIST.f2_m1', 1), ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.RAW', 403), ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.TAG.f2_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_MinBias.recon.ESD.f2', 403), ('data12_8TeV.NNNNNNNN.physics_Muons.merge.AOD.f2_m4', 334), ('data12_8TeV.NNNNNNNN.physics_Muons.merge.DESDM_RPVLL.f2_m6', 54), ('data12_8TeV.NNNNNNNN.physics_Muons.merge.DESD_SGLMU.f2_m6', 254), ('data12_8TeV.NNNNNNNN.physics_Muons.merge.DRAW_ZMUMU.f2_m9', 47), ('data12_8TeV.NNNNNNNN.physics_Muons.merge.HIST.f2_m1', 1), ('data12_8TeV.NNNNNNNN.physics_Muons.merge.NTUP_FASTMON.f2_m4_m8', 19), ('data12_8TeV.NNNNNNNN.physics_Muons.merge.RAW', 3629), ('data12_8TeV.NNNNNNNN.physics_Muons.merge.TAG.f2_m4_m3', 7), ('data12_8TeV.NNNNNNNN.physics_Muons.recon.DAOD_ZMUMU.f2_m9_f1', 47), ('data12_8TeV.NNNNNNNN.physics_Muons.recon.DESD_ZMUMU.f2_m9_f1', 47), ('data12_8TeV.NNNNNNNN.physics_Muons.recon.ESD.f2', 3629), ('data12_8TeV.NNNNNNNN.physics_Standby.merge.AOD.x1_m4', 2), ('data12_8TeV.NNNNNNNN.physics_Standby.merge.HIST.x1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_Standby.merge.RAW', 24), ('data12_8TeV.NNNNNNNN.physics_Standby.merge.TAG.x1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_Standby.recon.ESD.x1', 24), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.AOD.f2_m4', 21), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.AOD.x1_m4', 21), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.HIST.f2_m1', 1), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.HIST.x1_m1', 1), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.RAW', 402), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.TAG.f2_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.TAG.x1_m4_m3', 1), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.recon.ESD.f2', 402), ('data12_8TeV.NNNNNNNN.physics_ZeroBias.recon.ESD.x1', 402), ('data12_8TeV.NNNNNNNN.physics_ZeroBiasOverlay.merge.RAW', 404), ('data12_calib.0NNNNNNNN.calibration_MuonAll.daq.RAW', 70), ] factor = runnumber / runspertag tagdict = {'x1': 'x%s' % (factor * 1 + 1), 'f1': 'f%s' % (factor * 2 + 1), 'f2': 'f%s' % (factor * 2 + 2), 'm1': 'm%s' % (factor * 10 + 1), 'm2': 'm%s' % (factor * 10 + 2), 'm3': 'm%s' % (factor * 10 + 3), 'm4': 'm%s' % (factor * 10 + 4), 'm6': 'm%s' % (factor * 10 + 6), 'm8': 'm%s' % (factor * 10 + 8), 'm9': 'm%s' % (factor * 10 + 9) } # pre compute all datasets success = False while not success: try: ds = [] tmp_uuid = uuid() for n, s in pattern: # newname = n.replace('data12', dataXX).replace('NNNNNNNN', str(runnumber).zfill(8)) % tagdict # TODO: fix when meta-data is working properly newname = n.replace('data12', dataXX).replace('NNNNNNNN', tmp_uuid) % tagdict newscope = newname.split('.')[0] newsize = s * filescale ds.append([newscope, newname, newsize, 0]) # 0 is initial value of file counter # register run number # client.add_value('run_number', runnumber) # open all datasets open_ds = list() for scope, datasetname, s, c in ds: pcs = datasetname.split('.') meta = {'project': pcs[0], 'stream_name': pcs[2], 'prod_step': pcs[3], 'datatype': pcs[4], # 'version': pcs[5], 'guid': uuid(), } meta['group'] = tz_group meta['provenance'] = tz_provenance meta['run_number'] = runnumber open_ds.append({'scope': scope, 'name': datasetname, 'statuses': {'monotonic': True}, 'meta': meta, 'rules': [{'account': tz_account, 'copies': 1, 'rse_expression': tz_rse, 'grouping': 'DATASET'}]}) # The missing lifetime attribute indicated an infinite lifetime with monitor.record_timer_block(['emulator.tzero.client.add_datasets', ('emulator.tzero.client.add_datasets.normalized', len(open_ds))]): client.add_datasets(open_ds) monitor.record_counter('emulator.tzero.client.datasets', len(open_ds)) # dids = [{'scope': d['scope'], 'name': d['name']} for d in open_ds] # with monitor.record_timer_block(['tzero.add_replication_rule', ('tzero.add_replication_rule.normalized', len(open_ds))]): # client.add_replication_rule(dids, copies=1, rse_expression=tz_rse, # grouping='DATASET', account=tz_account) success = True except Exception, e: monitor.record_counter('emulator.exceptions.tzero.EMULATION_RUN.%s' % e.__class__.__name__, 1) print traceback.format_exc()
def sync_site(self, site_pair): """ Sync a site defined by a site_pair of (site, prefix). Prefix can be None to sync all blocks in the site :return: """ site, prefix = site_pair if site.endswith('_Tape'): pnn = site.replace('_Tape', '_MSS') else: pnn = site # now = int(time.time()) # Set 1980 as the last sync date if no data exists # site_last_synced = self.last_synced.get(site_pair, 10 * 365 * 24 * 3600) # last_week = int(site_last_synced - 7 * 24 * 3600) if self.config.get('default', None): if self.config['default'].get('chunck', 0): BLOCKS_PER_ACTION = int(self.config['default']['chunck']) if self.config['default'].get('select', None): self.patterns = [self.config['default']['select']] with monitor.record_timer_block('cms_sync.time_site_sync'): r_timer = 'cms_sync.time_rucio_block_list_all' p_timer = 'cms_sync.time_phedex_block_list_all' if prefix: r_timer = 'cms_sync.time_rucio_block_list_partial' p_timer = 'cms_sync.time_phedex_block_list_partial' # Add touches to keep from getting killed as long as progress is being made with monitor.record_timer_block(p_timer): touch(text='PQ ' + site) phedex_blocks = self.phedex_svc.blocks_at_site(pnn=pnn, prefix=prefix, since=None) with monitor.record_timer_block(r_timer): touch(text='RQ ' + site) rucio_blocks = self.get_datasets_at_rse(rse=site, prefix=prefix) touch(text='DQ ' + site) n_blocks_in_phedex = len(phedex_blocks) n_blocks_in_rucio = len(rucio_blocks) # FIXME: This is refusing to delete everything from Rucio. Not clear it's needed if not n_blocks_in_phedex and n_blocks_in_rucio: logging.warning( "At %s found %s blocks in PhEDEx and %s in Rucio with prefix %s", site, n_blocks_in_phedex, n_blocks_in_rucio, prefix) return if not n_blocks_in_phedex and not n_blocks_in_rucio: logging.info( "At %s:%s, nothing in PhEDEx or Rucio. Quitting." % (site, prefix)) return block_report = compare_site_blocks(phedex=phedex_blocks, rucio=rucio_blocks, rse=site, patterns=self.patterns) n_blocks_not_in_rucio = len(block_report['not_rucio']) n_blocks_not_in_phedex = len(block_report['not_phedex']) n_incomplete_blocks = len(block_report['incomplete']) logging.info("At %s:%s In both/PhEDEx only/Rucio only: %s/%s/%s" % (site, prefix, len(block_report['complete']), n_blocks_not_in_rucio, n_blocks_not_in_phedex)) if len(block_report['complete'] ) or n_blocks_not_in_rucio or n_blocks_not_in_phedex: logging.info( 'At %s:%s %3.0f%% complete', site, prefix, len(block_report['complete']) * 100 / (len(block_report['complete']) + n_blocks_not_in_rucio + n_blocks_not_in_phedex)) if len(block_report['complete']) or n_blocks_not_in_rucio: logging.info( 'At %s:%s %3.0f%% completely added', site, prefix, len(block_report['complete']) * 100 / (len(block_report['complete']) + n_blocks_not_in_rucio)) # Truncate lists if we want to reduce cycle time if BLOCKS_PER_ACTION and n_blocks_not_in_rucio > BLOCKS_PER_ACTION: block_report['not_rucio'] = set( list(block_report['not_rucio'])[:BLOCKS_PER_ACTION]) n_blocks_not_in_rucio = len(block_report['not_rucio']) if BLOCKS_PER_ACTION and n_blocks_not_in_phedex > BLOCKS_PER_ACTION: block_report['not_phedex'] = set( list(block_report['not_phedex'])[:BLOCKS_PER_ACTION]) n_blocks_not_in_phedex = len(block_report['not_phedex']) logging.info('Adding %6d blocks to Rucio for %s:%s', n_blocks_not_in_rucio, site, prefix) for block in block_report['not_rucio']: logging.info('Adding to rucio: %s at %s', block, site) bs = BlockSyncer(block_name=block, pnn=pnn, rse=site) bs.add_to_rucio() logging.info('Removing %6d blocks from Rucio for %s:%s', n_blocks_not_in_phedex, site, prefix) for block in block_report['not_phedex']: logging.info('Removing from rucio: %s at %s', block, site) bs = BlockSyncer(block_name=block, pnn=pnn, rse=site) bs.remove_from_rucio() for block in block_report['incomplete']: logging.warn('Redoing sync for %s at %s', block, site) bs = BlockSyncer(block_name=block, pnn=pnn, rse=site) bs.add_to_rucio(recover=True) logging.info('Finished syncing %s:%s' % (site, prefix))
def _run_once(rses_to_process, chunk_size, greedy, scheme, delay_seconds, auto_exclude_threshold, auto_exclude_timeout, heartbeat_handler, oidc_account, oidc_scope, oidc_audience, **_kwargs): dict_rses = {} _, total_workers, logger = heartbeat_handler.live() tot_needed_free_space = 0 for rse in rses_to_process: # Check if RSE is blocklisted if rse.columns['availability'] % 2 == 0: logger(logging.DEBUG, 'RSE %s is blocklisted for delete', rse.name) continue rse.ensure_loaded(load_attributes=True) enable_greedy = rse.attributes.get('greedyDeletion', False) or greedy needed_free_space, only_delete_obsolete = __check_rse_usage_cached( rse, greedy=enable_greedy, logger=logger) if needed_free_space: dict_rses[rse] = [ needed_free_space, only_delete_obsolete, enable_greedy ] tot_needed_free_space += needed_free_space elif only_delete_obsolete: dict_rses[rse] = [ needed_free_space, only_delete_obsolete, enable_greedy ] else: logger(logging.DEBUG, 'Nothing to delete on %s', rse.name) rses_with_params = [ (rse, needed_free_space, only_delete_obsolete, enable_greedy) for rse, (needed_free_space, only_delete_obsolete, enable_greedy) in dict_rses.items() ] # Ordering the RSEs based on the needed free space sorted_rses = sorted(rses_with_params, key=lambda x: x[1], reverse=True) log_msg_str = ', '.join( f'{rse}:{needed_free_space}:{only_delete_obsolete}:{enable_greedy}' for rse, needed_free_space, only_delete_obsolete, enable_greedy in sorted_rses) logger(logging.DEBUG, 'List of RSEs to process ordered by needed space desc: %s', log_msg_str) random.shuffle(rses_with_params) work_remaining_by_rse = {} paused_rses = [] for rse, needed_free_space, only_delete_obsolete, enable_greedy in rses_with_params: result = REGION.get('pause_deletion_%s' % rse.id, expiration_time=120) if result is not NO_VALUE: paused_rses.append(rse.name) logger( logging.DEBUG, 'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while', rse.name) continue result = REGION.get('temporary_exclude_%s' % rse.id, expiration_time=auto_exclude_timeout) if result is not NO_VALUE: logger( logging.WARNING, 'Too many failed attempts for %s in last cycle. RSE is temporarly excluded.', rse.name) EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(1) continue EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(0) percent = 0 if tot_needed_free_space: percent = needed_free_space / tot_needed_free_space * 100 logger(logging.DEBUG, 'Working on %s. Percentage of the total space needed %.2f', rse.name, percent) rse_hostname = _rse_deletion_hostname(rse) if not rse_hostname: logger(logging.WARNING, 'No default delete protocol for %s', rse.name) REGION.set('pause_deletion_%s' % rse.id, True) continue hb_payload = __try_reserve_worker_slot( heartbeat_handler=heartbeat_handler, rse=rse, hostname=rse_hostname, logger=logger) if not hb_payload: # Might need to reschedule a try on this RSE later in the same cycle continue # List and mark BEING_DELETED the files to delete del_start_time = time.time() try: use_temp_tables = config_get_bool('core', 'use_temp_tables', default=False) with monitor.record_timer_block('reaper.list_unlocked_replicas'): if only_delete_obsolete: logger( logging.DEBUG, 'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas', rse.name) if use_temp_tables: replicas = list_and_mark_unlocked_replicas( limit=chunk_size, bytes_=needed_free_space, rse_id=rse.id, delay_seconds=delay_seconds, only_delete_obsolete=only_delete_obsolete, session=None) else: replicas = list_and_mark_unlocked_replicas_no_temp_table( limit=chunk_size, bytes_=needed_free_space, rse_id=rse.id, delay_seconds=delay_seconds, only_delete_obsolete=only_delete_obsolete, session=None) logger( logging.DEBUG, 'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', rse.name, needed_free_space, time.time() - del_start_time, len(replicas)) if (len(replicas) == 0 and enable_greedy) or (len(replicas) < chunk_size and not enable_greedy): logger( logging.DEBUG, 'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle', rse.name, chunk_size, len(replicas)) REGION.set('pause_deletion_%s' % rse.id, True) work_remaining_by_rse[rse] = False else: work_remaining_by_rse[rse] = True except (DatabaseException, IntegrityError, DatabaseError) as error: logger(logging.ERROR, '%s', str(error)) continue except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) continue # Physical deletion will take place there try: rse.ensure_loaded(load_info=True, load_attributes=True) auth_token = None if oidc_account and rse.attributes.get('oidc_support', False): account = InternalAccount(oidc_account, vo=rse.columns['vo']) token_dict = get_token_for_account_operation( account, req_audience=oidc_audience, req_scope=oidc_scope, admin=True) if token_dict is not None and 'token' in token_dict: auth_token = token_dict['token'] logger(logging.DEBUG, 'OIDC authentication used for deletion.') prot = rsemgr.create_protocol(rse.info, 'delete', scheme=scheme, auth_token=auth_token, logger=logger) for file_replicas in chunks(replicas, chunk_size): # Refresh heartbeat _, total_workers, logger = heartbeat_handler.live( payload=hb_payload) del_start_time = time.time() for replica in file_replicas: try: replica['pfn'] = str( list( rsemgr.lfns2pfns(rse_settings=rse.info, lfns=[{ 'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path'] }], operation='delete', scheme=scheme).values())[0]) except (ReplicaUnAvailable, ReplicaNotFound) as error: logger( logging.WARNING, 'Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s', replica['scope'], replica['name'], rse.name, str(error)) replica['pfn'] = None except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) is_staging = rse.columns['staging_area'] deleted_files = delete_from_storage(heartbeat_handler, hb_payload, file_replicas, prot, rse.info, is_staging, auto_exclude_threshold, logger=logger) logger(logging.INFO, '%i files processed in %s seconds', len(file_replicas), time.time() - del_start_time) # Then finally delete the replicas del_start = time.time() with monitor.record_timer_block('reaper.delete_replicas'): delete_replicas(rse_id=rse.id, files=deleted_files) logger( logging.DEBUG, 'delete_replicas successed on %s : %s replicas in %s seconds', rse.name, len(deleted_files), time.time() - del_start) DELETION_COUNTER.inc(len(deleted_files)) except Exception: logger(logging.CRITICAL, 'Exception', exc_info=True) if paused_rses: logger(logging.INFO, 'Deletion paused for a while for following RSEs: %s', ', '.join(paused_rses)) rses_with_more_work = [ rse for rse, has_more_work in work_remaining_by_rse.items() if has_more_work ] return rses_with_more_work