Example #1
def delete_dids(dids, account, session=None):
    """
    Delete data identifiers

    :param dids: The list of dids to delete.
    :param account: The account.
    :param session: The database session in use.
    """

    rule_id_clause = []
    content_clause = []
    parent_content_clause = []
    did_clause = []
    for did in dids:
        logging.info('Removing did %s:%s' % (did['scope'], did['name']))
        did_clause.append(and_(models.DataIdentifier.scope == did['scope'], models.DataIdentifier.name == did['name']))
        parent_content_clause.append(and_(models.DataIdentifierAssociation.child_scope == did['scope'], models.DataIdentifierAssociation.child_name == did['name']))
        rule_id_clause.append(and_(models.ReplicationRule.scope == did['scope'], models.ReplicationRule.name == did['name']))
        content_clause.append(and_(models.DataIdentifierAssociation.scope == did['scope'], models.DataIdentifierAssociation.name == did['name']))

        # Send message for AMI
        add_message('ERASE', {'account': account,
                              'scope': did['scope'],
                              'name': did['name']},
                    session=session)

    # Delete rules on did
    if rule_id_clause:
        with record_timer_block('undertaker.rules'):
            for (rule_id, scope, name, rse_expression, ) in session.query(models.ReplicationRule.id,
                                                                          models.ReplicationRule.scope,
                                                                          models.ReplicationRule.name,
                                                                          models.ReplicationRule.rse_expression).filter(or_(*rule_id_clause)):
                logging.debug('Removing rule %s for did %s:%s on RSE-Expression %s' % (str(rule_id), scope, name, rse_expression))
                rucio.core.rule.delete_rule(rule_id=rule_id, nowait=True, session=session)

    # Detach from parent dids:
    existing_parent_dids = False
    if parent_content_clause:
        with record_timer_block('undertaker.parent_content'):
            for parent_did in session.query(models.DataIdentifierAssociation).filter(or_(*parent_content_clause)):
                existing_parent_dids = True
                detach_dids(scope=parent_did.scope, name=parent_did.name, dids=[{'scope': parent_did.child_scope, 'name': parent_did.child_name}], session=session)

    # Remove content
    if content_clause:
        with record_timer_block('undertaker.content'):
            rowcount = session.query(models.DataIdentifierAssociation).filter(or_(*content_clause)).\
                delete(synchronize_session=False)
        record_counter(counters='undertaker.content.rowcount', delta=rowcount)

    # remove data identifier
    if existing_parent_dids:
        # Exit method early to give the Judge-Evaluator time to remove locks (otherwise, due to foreign keys, did removal does not work)
        logging.debug('Leaving delete_dids early for Judge-Evaluator checks')
        return
    with record_timer_block('undertaker.dids'):
        rowcount = session.query(models.DataIdentifier).filter(or_(*did_clause)).\
            filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER, models.DataIdentifier.did_type == DIDType.DATASET)).\
            delete(synchronize_session=False)
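A minimal usage sketch for the function above, assuming an already-open database session; the scope/name values and the 'root' account are hypothetical:

    dids_to_delete = [{'scope': 'mock', 'name': 'dataset_to_erase'},
                      {'scope': 'mock', 'name': 'container_to_erase'}]
    # Each did triggers an 'ERASE' message carrying the issuing account.
    delete_dids(dids=dids_to_delete, account='root', session=session)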
Example #2
    def test_context_record_timer(self):
        """MONITOR (CORE): Send a timer message to graphite using context """
        with monitor.record_timer_block('test.context_timer'):
            var_a = 2 * 100
            var_a = var_a * 1

        with monitor.record_timer_block(['test.context_timer']):
            var_a = 2 * 100
            var_a = var_a * 1

        with monitor.record_timer_block(['test.context_timer', ('test.context_timer_normal10', 10)]):
            var_a = 2 * 100
            var_a = var_a * 1
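The test above exercises the three accepted forms: a single stat name, a list of names, and a (name, divisor) tuple that also records a normalized value. Below is a hedged sketch of timing a batch while recording a per-item figure, assuming the divisor semantics suggested by the '*_normal10' stat and the same `monitor` module import as in the examples; the 'myapp.*' names and process() are hypothetical:

    items = list(range(1000))
    with monitor.record_timer_block(['myapp.batch_insert',
                                     ('myapp.batch_insert.per_item', len(items))]):
        process(items)  # the timed work; the elapsed time is also reported divided by len(items)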
Example #3
    def test_monitor_context_record_timer(self):
        """MONITOR (CORE): Send a timer message to graphite using context """
        with monitor.record_timer_block('test.context_timer'):
            a = 2 * 100
            a = a * 1

        with monitor.record_timer_block(['test.context_timer']):
            a = 2 * 100
            a = a * 1

        with monitor.record_timer_block(['test.context_timer', ('test.context_timer_normal10', 10)]):
            a = 2 * 100
            a = a * 1
Example #4
    def get_datasets_at_rse(rse, prefix=None):
        """

        :param rse: The RSE name
        :param prefix: Character(s) to restrict the dataset search
        :return: a dictionary with <dataset name>: <number of files>
        """

        filters = {'scope': 'cms', 'did_type': DIDType.DATASET}
        if prefix:
            filters['name'] = '/' + prefix + '*'

        with monitor.record_timer_block('cms_sync.time_rse_datasets'):
            all_datasets = [
                dataset['name']
                for dataset in list_datasets_per_rse(rse=rse, filters=filters)
            ]

            datasets = {}
            for dataset in all_datasets:
                for ds in list_dataset_replicas(scope='cms', name=dataset, deep=True):
                    if ds['rse'] == rse:
                        datasets[dataset] = ds['available_length']

            # datasets = {dataset['name']: dataset['available_length']
            #             for dataset in list_datasets_per_rse(rse=rse, filters=filters, deep=True)}

        return datasets
Example #5
def compare_site_blocks(phedex=None, rucio=None, rse=''):
    """

    :param phedex: Dictionary with file counts for PhEDEx
    :param rucio: Dictionary with file counts for Rucio
    :return:
    """
    with monitor.record_timer_block('cms_sync.time_node_diff'):
        phedex_blocks = set(phedex.keys())
        rucio_blocks = set(rucio.keys())
        missing_phedex = rucio_blocks - phedex_blocks
        missing_rucio = phedex_blocks - rucio_blocks
        both = phedex_blocks & rucio_blocks
        incomplete = set()

        for block in both:
            if phedex[block] != rucio[block]:
                print("For %s at %s the files differ: %s vs %s." %
                      (block, rse, phedex[block], rucio[block]))
                incomplete.add(block)
        both = both - incomplete

        return {
            'not_rucio': missing_rucio,
            'not_phedex': missing_phedex,
            'complete': both,
            'incomplete': incomplete
        }
Example #6
    def get_datasets_at_rse(rse, prefix=None):
        """

        :param rse: The RSE name
        :param prefix: Character(s) to restrict the dataset search
        :return: a dictionary with <dataset name>: <number of files>
        """

        filters = {'scope': 'cms', 'did_type': DIDType.DATASET}
        if prefix:
            filters['name'] = '/' + prefix + '*'

        account = SYNC_ACCOUNT_FMT % rse.lower()
        rule_filters = {'account': account, 'scope': 'cms', 'did_type': DIDType.DATASET}

        with monitor.record_timer_block('cms_sync.time_rse_datasets'):
            synced_ds = {item['name'] for item in list_replication_rules(filters=rule_filters)
                         if item['expires_at'] is None and (prefix is None or item['name'].startswith('/' + prefix))}

            all_datasets = [dataset['name'] for dataset in list_datasets_per_rse(rse=rse, filters=filters)]

            logging.info('Getting all datasets at %s with prefix %s' % (rse, prefix))

            datasets = {}

            for dataset in all_datasets:
                if dataset in synced_ds:
                    for ds in list_dataset_replicas(scope='cms', name=dataset, deep=True):
                        if ds['rse'] == rse:
                            datasets.update({dataset: ds['available_length']})

        return datasets
Example #7
    def remove_extra_replicas(self, to_remove):
        """
        :param to_remove: replicas to remove from Rucio
        :return:
        """
        scope = InternalScope(self.scope)
        with monitor.record_timer_block('cms_sync.time_remove_replica'):
            if to_remove and self.dry_run:
                logging.info('Dry run: Removing replicas %s from rse %s.',
                             str(to_remove), self.rse)
            elif to_remove:
                logging.debug('Removing %s replicas from rse %s.',
                              len(to_remove), self.rse)
                for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                    replicas = [{
                        'scope': scope,
                        'name': lfn,
                        "rse_id": self.rse_id,
                        "state": "U"
                    } for lfn in to_remove_chunk]
                    # transactional_session here?
                    # while lock is set stuck, judge-repairer might make transfer requests before rule is gone but does it matter?
                    update_replicas_states(
                        replicas=replicas,
                        add_tombstone=False,
                    )

                # delete_replicas(rse=self.rse, issuer=self.account,
                #                     files=[{'scope': self.scope, 'name': lfn} for lfn in to_remove_chunk])
                return len(to_remove)
Example #8
    def update_replicas(self):
        """
        Adds or removes replicas for the dataset at the RSE.
        """

        with monitor.record_timer_block('cms_sync.time_update_replica'):
            logging.info('Updating replicas for %s:%s at %s', self.scope, self.block_name, self.rse)
            replicas = list_replicas(dids=[{'scope': self.scope, 'name': self.block_name}],
                                     rse_expression='rse=%s' % self.rse)
            try:
                rucio_replicas = {repl['name'] for repl in replicas}
            except TypeError:
                rucio_replicas = set()

            phedex_replicas = set(self.replicas.keys())
            missing = list(phedex_replicas - rucio_replicas)
            to_remove = list(rucio_replicas - phedex_replicas)

            if missing and (len(phedex_replicas) != len(missing)):
                logging.warning('Recovery: Inconsistency found for %s at %s: %s in PhEDEx and %s missing',
                                self.block_name, self.rse, len(phedex_replicas), len(missing))

            if missing:
                lfns_added = self.add_missing_replicas(missing)
                monitor.record_counter('cms_sync.files_added', delta=lfns_added)
            if to_remove:
                lfns_removed = self.remove_extra_replicas(to_remove)
                monitor.record_counter('cms_sync.files_removed', delta=lfns_removed)

        return
Example #9
    def remove_from_rucio(self):
        """"""

        if not self.block_in_phedex:
            logging.info('Declining to remove since it is not in PhEDEx')
            return

        with monitor.record_timer_block('cms_sync.time_remove_block'):
            self.update_replicas()
            self.update_rule()
Example #10
    def add_to_rucio(self, recover=False):
        """"""
        with monitor.record_timer_block('cms_sync.time_add_block'):
            self.register_container()
            block_exists = self.register_block()
            if block_exists:
                self.update_replicas()
                if recover:
                    self.make_replicas_available()
                self.update_rule()
Example #11
def compare_site_blocks(phedex=None, rucio=None, rse='', patterns=None):
    """

    :param phedex: Dictionary with file counts for PhEDEx
    :param rucio: Dictionary with file counts for Rucio
    :return:
    """
    with monitor.record_timer_block('cms_sync.time_node_diff'):
        #        from  pprint import pformat
        phedex_blocks = set(phedex.keys())
        #        logging.info('Blocks in PhEDEx %s', pformat(phedex_blocks))

        rucio_blocks = set(rucio.keys())
        #        logging.info('Blocks in Rucio %s', pformat(rucio_blocks))

        if patterns:
            phedex_match = set()
            rucio_match = set()

            for pattern in patterns:
                phedex_match = phedex_match | {
                    p
                    for p in phedex_blocks if pattern in p
                }
                rucio_match = rucio_match | {
                    r
                    for r in rucio_blocks if pattern in r
                }

            logging.info(
                'Pattern matching reduces Rucio %s->%s and PhEDEx %s->%s',
                len(rucio_blocks), len(rucio_match), len(phedex_blocks),
                len(phedex_match))
            phedex_blocks = phedex_match
            rucio_blocks = rucio_match

        missing_phedex = rucio_blocks - phedex_blocks
        missing_rucio = phedex_blocks - rucio_blocks
        both = phedex_blocks & rucio_blocks
        incomplete = set()

        for block in both:
            if phedex[block] != rucio[block]:
                print("For %s at %s the files differ: %s vs %s." %
                      (block, rse, phedex[block], rucio[block]))
                incomplete.add(block)
        both = both - incomplete

        return {
            'not_rucio': missing_rucio,
            'not_phedex': missing_phedex,
            'complete': both,
            'incomplete': incomplete
        }
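A hedged usage sketch with made-up block names and file counts, illustrating how the returned sets classify the blocks:

    diff = compare_site_blocks(phedex={'/A#1': 10, '/B#2': 5},
                               rucio={'/A#1': 10, '/B#2': 4, '/C#3': 7},
                               rse='T2_Example_Site')
    # diff['complete']   -> {'/A#1'}   (same file count on both sides)
    # diff['incomplete'] -> {'/B#2'}   (counts differ: 5 vs 4)
    # diff['not_phedex'] -> {'/C#3'}   (known to Rucio only)
    # diff['not_rucio']  -> set()      (nothing known to PhEDEx only)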
Example #12
    def add_to_rucio(self, recover=False):
        """"""
        with monitor.record_timer_block('cms_sync.time_add_block'):
            self.register_container()
            block_exists = self.register_block()
            if block_exists:
                self.update_replicas()
                if recover:
                    self.make_replicas_available()
                self.update_rule()
            else:
                logging.critical('Unable to make the block %s',
                                 self.block_name)
Example #13
    def make_replicas_available(self):
        """
        Marks available replicas for the dataset at rse if they are in PhEDEx
        """

        with monitor.record_timer_block('cms_sync.time_recover_replica'):
            logging.info('Recovering unavailable replicas for %s:%s at %s',
                         self.scope, self.block_name, self.rse)

            replicas = list_replicas(dids=[{
                'scope': self.scope,
                'name': self.block_name
            }],
                                     rse_expression='rse=%s' % self.rse,
                                     all_states=True)

            try:
                unavailable_replicas = {
                    repl['name']
                    for repl in replicas
                    if repl['states'][self.rse] != 'AVAILABLE'
                }
            except TypeError:
                unavailable_replicas = set()

            phedex_replicas = set(self.replicas.keys())
            missing = list(phedex_replicas & unavailable_replicas)

            logging.info(
                'Recovery for %s:%s at %s: PhEDEx has %s, Rucio unavailable %s. Missing: %s ',
                self.scope, self.block_name, self.rse, len(phedex_replicas),
                len(unavailable_replicas), len(missing))

            # Fix up things which are unavailable
            rse_details = get_rse(self.rse)
            rse_id = rse_details['id']
            scope = InternalScope(self.scope)
            state = 'A'

            for name in missing:
                logging.info('Setting available %s:%s at %s', self.scope, name,
                             self.rse)
                core_update_state(rse_id=rse_id,
                                  scope=scope,
                                  name=name,
                                  state=state)

            monitor.record_counter('cms_sync.files_made_available',
                                   delta=len(missing))

        return
Example #14
    def add_missing_replicas(self, missing):
        """
        :param missing: possible missing lfns
        :return:
        """

        with monitor.record_timer_block('cms_sync.time_add_replica'):
            if missing and self.dry_run:
                logging.info('Dry run: Adding replicas %s to rse %s.', str(missing), self.rse)
            elif missing:
                logging.info('Adding %s replicas to rse %s.', len(missing), self.rse)
                replicas_to_add = [self.replicas[lfn] for lfn in missing]
                files = replica_file_list(replicas=replicas_to_add, scope=self.scope)
                for rucio_file in files:
                    try:
                        update_file = copy.deepcopy(rucio_file)
                        update_file.update({'scope': InternalScope(self.scope), "rse_id": self.rse_id, "state": "A"})
                        update_replicas_states(replicas=[update_file], add_tombstone=False)
                    except ReplicaNotFound:
                        resurrect_file = copy.deepcopy(rucio_file)
                        resurrect_file.update({'scope': 'cms', 'type': 'FILE'})
                        try:
                            add_replicas(rse=self.rse, files=[resurrect_file], issuer=self.account,
                                         ignore_availability=True)
                        except RucioException:
                            logging.critical('Could not add %s to %s. Constraint violated?', resurrect_file, self.rse)
                            resurrect_file.update({'scope': 'cms', 'type': 'FILE'})  # Reset to Internal scope by call
                            resurrect([resurrect_file], issuer=self.account)
                            resurrect_file.update({'scope': 'cms', 'type': 'FILE'})  # Reset to Internal scope by call
                            add_replicas(rse=self.rse, files=[resurrect_file], issuer=self.account,
                                         ignore_availability=True)
                            logging.critical('Resurrected %s at %s', resurrect_file, self.rse)

                # add_replicas(rse=self.rse, files=files, issuer=self.account)
                lfns = [item['name'] for item in list_files(scope=self.scope, name=self.block_name, long=False)]

                missing_lfns = list(set(missing) - set(lfns))

                if missing_lfns:
                    logging.debug('Attaching %s lfns to %s at %s', len(missing_lfns), self.block_name, self.rse)
                    dids = [{'scope': self.scope, 'name': lfn} for lfn in missing_lfns]
                    try:
                        attach_dids(scope=self.scope, name=self.block_name, attachment={'dids': dids},
                                    issuer=self.account)
                    except FileAlreadyExists:
                        logging.warning('Trying to attach already existing files to %s', self.block_name)
                    except DataIdentifierNotFound:
                        logging.critical('Could not attach to %s at %s. Constraint violated?', self.block_name, self.rse)
                return len(missing_lfns)
Example #15
    def add_to_rucio(self, recover=False):
        """"""

        if not self.block_in_phedex:
            logging.info('Declining to add %s since it is not in PhEDEx', self.block_name)
            return

        with monitor.record_timer_block('cms_sync.time_add_block'):
            self.register_container()
            block_exists = self.register_block()
            if block_exists:
                self.update_replicas()
                if recover:
                    self.make_replicas_available()
                self.update_rule()
            else:
                logging.critical('Unable to make the block %s', self.block_name)
Example #16
    def LIST_DIDS_WILDCARD(self, scope, wildcard):
        jdoe_account = 'jdoe'
        client = DIDClient(account=jdoe_account)

        print('run with: ' + str(wildcard))
        start = time()
        with monitor.record_timer_block('jdoe.list_dids_wildcard'):
            dids = [did for did in client.list_dids(scope=scope, filters=wildcard, type='dataset')]

        duration = time() - start
        cnt = len(dids)
        print('got %d dids' % cnt)

        monitor.record_counter('jdoe.list_dids_wildcard.num_results', cnt)
        if cnt != 0:
            monitor.record_counter('jdoe.list_dids_wildcard.time_per_did', duration / cnt)

        return {'no_datasets': cnt}
Example #17
    def add_missing_replicas(self, missing):
        """
        :param missing: possible missing lfns
        :return:
        """

        with monitor.record_timer_block('cms_sync.time_add_replica'):
            if missing and self.dry_run:
                logging.info('Dry run: Adding replicas %s to rse %s.',
                             str(missing), self.rse)
            elif missing:
                logging.debug('Adding %s replicas to rse %s.', len(missing),
                              self.rse)
                replicas_to_add = [self.replicas[lfn] for lfn in missing]
                files = replica_file_list(replicas=replicas_to_add,
                                          scope=self.scope)
                add_replicas(rse=self.rse, files=files, issuer=self.account)
                lfns = [
                    item['name'] for item in list_files(
                        scope=self.scope, name=self.block_name, long=False)
                ]

                missing_lfns = list(set(missing) - set(lfns))

                if missing_lfns:
                    logging.debug('Attaching %s lfns to %s at %s',
                                  len(missing_lfns), self.block_name, self.rse)
                    dids = [{
                        'scope': self.scope,
                        'name': lfn
                    } for lfn in missing_lfns]
                    try:
                        attach_dids(scope=self.scope,
                                    name=self.block_name,
                                    attachment={'dids': dids},
                                    issuer=self.account)
                    except FileAlreadyExists:
                        logging.warning(
                            'Trying to attach already existing files to %s',
                            self.block_name)
                return len(missing_lfns)
Example #18
    def remove_extra_replicas(self, to_remove):
        """
        :param to_remove: replicas to remove from Rucio
        :return:
        """

        with monitor.record_timer_block('cms_sync.time_remove_replica'):
            if to_remove and self.dry_run:
                logging.info('Dry run: Removing replicas %s from rse %s.',
                             str(to_remove), self.rse)
            elif to_remove:
                logging.debug('Removing %s replicas from rse %s.',
                              len(to_remove), self.rse)
                for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                    delete_replicas(rse=self.rse,
                                    issuer=self.account,
                                    files=[{
                                        'scope': self.scope,
                                        'name': lfn
                                    } for lfn in to_remove_chunk])
                    # FIXME: Do we need a retry here on DatabaseException? If so, use the retry module
                return len(to_remove)
Example #19
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    :param delay_seconds: The delay to query replicas in BEING_DELETED state.
    """
    logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals())
    while not graceful_stop.is_set():
        try:
            max_deleting_rate = 0

            for rse in rses:
                deleting_rate = 0
                rse_info = rsemgr.get_rse_info(rse['rse'])
                rse_protocol = rse_core.get_rse_protocols(rse['rse'])

                if not rse_protocol['availability_delete']:
                    logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse']))
                    continue

                # Temporary hack to force gfal for deletion
                for protocol in rse_info['protocols']:
                    if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                        protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse']))
                try:
                    needed_free_space, max_being_deleted_files = None, 10000
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space = 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse']))

                    s = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse=rse['rse'], bytes=needed_free_space, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas)))

                    if not replicas:
                        logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse']))
                        continue

                    p = rsemgr.create_protocol(rse_info, 'delete', scheme=None)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files)))
                        try:
                            s = time.time()
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files])

                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'], 'name': replica['name']}, ], operation='delete').values())[0])
                                except ReplicaUnAvailable as e:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e))
                                    logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg))
                                    replica['pfn'] = None

                                add_message('deletion-planned', {'scope': replica['scope'],
                                                                 'name': replica['name'],
                                                                 'file-size': replica['bytes'],
                                                                 'url': replica['pfn'],
                                                                 'rse': rse_info['rse']})

                            # logging.debug('update_replicas_states %s' % (time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            if not scheme:
                                try:
                                    deleted_files = []
                                    p.connect()
                                    for replica in files:
                                        try:
                                            logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            s = time.time()
                                            if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                                logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number,
                                                                                                                                                                                    child_number,
                                                                                                                                                                                    replica['scope'],
                                                                                                                                                                                    replica['name'],
                                                                                                                                                                                    replica['pfn'],
                                                                                                                                                                                    rse['rse']))
                                            else:
                                                if replica['pfn']:
                                                    p.delete(replica['pfn'])
                                                else:
                                                    logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time()-s)*1000)
                                            duration = time.time() - s

                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                            add_message('deletion-done', {'scope': replica['scope'],
                                                                          'name': replica['name'],
                                                                          'rse': rse_info['rse'],
                                                                          'file-size': replica['bytes'],
                                                                          'url': replica['pfn'],
                                                                          'duration': duration})
                                            logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                        except SourceNotFound:
                                            err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                            logging.warning(err_msg)
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': err_msg})
                                        except (ServiceUnavailable, RSEAccessDenied) as e:
                                            logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except Exception as e:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                except (ServiceUnavailable, RSEAccessDenied) as e:
                                    for replica in files:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                        add_message('deletion-failed', {'scope': replica['scope'],
                                                                        'name': replica['name'],
                                                                        'rse': rse_info['rse'],
                                                                        'file-size': replica['bytes'],
                                                                        'url': replica['pfn'],
                                                                        'reason': str(e)})
                                finally:
                                    p.close()
                            s = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse=rse['rse'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))
                            deleting_rate += len(deleted_files)
                        except:
                            logging.critical(traceback.format_exc())
                    deleting_rate = deleting_rate * 1.0 / max_being_deleted_files
                    if deleting_rate > max_deleting_rate:
                        max_deleting_rate = deleting_rate
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate))
            sleep_time = int((1 - max_deleting_rate) * 60 + 1)
            time.sleep(sleep_time)

        except:
            logging.critical(traceback.format_exc())

    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
Example #20
    def remove_from_rucio(self):
        """"""
        with monitor.record_timer_block('cms_sync.time_remove_block'):
            self.update_replicas()
            self.update_rule()
Example #21
def get_signed_url(rse_id, service, operation, url, lifetime=600):
    """
    Get a signed URL for a particular service and operation.

    The signed URL is valid for 600 seconds by default; the lifetime can be overridden.

    :param rse_id: The ID of the RSE that the URL points to.
    :param service: The service to authorise, either 'gcs', 's3' or 'swift'.
    :param operation: The operation to sign, either 'read', 'write', or 'delete'.
    :param url: The URL to sign.
    :param lifetime: Lifetime of the signed URL in seconds.
    :returns: Signed URL as a variable-length string.
    """

    global CREDS_GCS

    if service not in ['gcs', 's3', 'swift']:
        raise UnsupportedOperation('Service must be "gcs", "s3" or "swift"')

    if operation not in ['read', 'write', 'delete']:
        raise UnsupportedOperation(
            'Operation must be "read", "write", or "delete"')

    if url is None or url == '':
        raise UnsupportedOperation('URL must not be empty')

    if lifetime:
        if not isinstance(lifetime, integer_types):
            try:
                lifetime = int(lifetime)
            except:
                raise UnsupportedOperation(
                    'Lifetime must be convertible to numeric.')

    signed_url = None
    if service == 'gcs':
        if not CREDS_GCS:
            CREDS_GCS = ServiceAccountCredentials.from_json_keyfile_name(
                config_get(
                    'credentials',
                    'gcs',
                    raise_exception=False,
                    default='/opt/rucio/etc/google-cloud-storage-test.json'))
        components = urlparse(url)
        host = components.netloc

        # select the correct operation
        operations = {'read': 'GET', 'write': 'PUT', 'delete': 'DELETE'}
        operation = operations[operation]

        # special case to test signature, force epoch time
        if lifetime is None:
            lifetime = 0
        else:
            # GCS is timezone-sensitive, don't use UTC
            # has to be converted to Unixtime
            lifetime = datetime.datetime.now() + datetime.timedelta(
                seconds=lifetime)
            lifetime = int(time.mktime(lifetime.timetuple()))

        # sign the path only
        path = components.path

        # assemble message to sign
        to_sign = "%s\n\n\n%s\n%s" % (operation, lifetime, path)

        # create URL-capable signature
        # first character is always a '=', remove it
        signature = urlencode(
            {'': base64.b64encode(CREDS_GCS.sign_blob(to_sign)[1])})[1:]

        # assemble final signed URL
        signed_url = 'https://%s%s?GoogleAccessId=%s&Expires=%s&Signature=%s' % (
            host, path, CREDS_GCS.service_account_email, lifetime, signature)

    elif service == 's3':
        # split URL to get hostname, bucket and key
        components = urlparse(url)
        host = components.netloc
        pathcomponents = components.path.split('/')
        if len(pathcomponents) < 3:
            raise UnsupportedOperation('Not a valid S3 URL')
        bucket = pathcomponents[1]
        key = '/'.join(pathcomponents[2:])

        # remove port number from host if present
        colon = host.find(':')
        port = '443'
        if colon >= 0:
            port = host[colon + 1:]
            host = host[:colon]

        # look up in RSE account configuration by RSE ID
        cred_name = rse_id
        cred = REGION.get('s3-%s' % cred_name)
        if cred is NO_VALUE:
            rse_cred = get_rse_credentials()
            cred = rse_cred.get(cred_name)
            REGION.set('s3-%s' % cred_name, cred)
        access_key = cred['access_key']
        secret_key = cred['secret_key']
        signature_version = cred['signature_version']
        region_name = cred['region']

        if operation == 'read':
            s3op = 'get_object'
        elif operation == 'write':
            s3op = 'put_object'
        else:
            s3op = 'delete_object'

        with record_timer_block('credential.signs3'):
            s3 = boto3.client('s3',
                              endpoint_url='https://' + host + ':' + port,
                              aws_access_key_id=access_key,
                              aws_secret_access_key=secret_key,
                              config=Config(
                                  signature_version=signature_version,
                                  region_name=region_name))

            signed_url = s3.generate_presigned_url(s3op,
                                                   Params={
                                                       'Bucket': bucket,
                                                       'Key': key
                                                   },
                                                   ExpiresIn=lifetime)

    elif service == 'swift':
        # split URL to get hostname and path
        components = urlparse(url)
        host = components.netloc

        # remove port number from host if present
        colon = host.find(':')
        if colon >= 0:
            host = host[:colon]

        # use RSE ID to look up key
        cred_name = rse_id

        # look up tempurl signing key
        cred = REGION.get('swift-%s' % cred_name)
        if cred is NO_VALUE:
            rse_cred = get_rse_credentials()
            cred = rse_cred.get(cred_name)
            REGION.set('swift-%s' % cred_name, cred)
        tempurl_key = cred['tempurl_key']

        if operation == 'read':
            swiftop = 'GET'
        elif operation == 'write':
            swiftop = 'PUT'
        else:
            swiftop = 'DELETE'

        expires = int(time.time() + lifetime)

        # create signed URL
        with record_timer_block('credential.signswift'):
            hmac_body = u'%s\n%s\n%s' % (swiftop, expires, components.path)
            # Python 3 hmac only accepts bytes or bytearray
            sig = hmac.new(bytearray(tempurl_key, 'utf-8'),
                           bytearray(hmac_body, 'utf-8'), sha1).hexdigest()
            signed_url = 'https://' + host + components.path + '?temp_url_sig=' + sig + '&temp_url_expires=' + str(
                expires)

    return signed_url
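A hedged usage sketch (hypothetical RSE id and object URL): requesting a short-lived presigned read URL for an object behind an S3 endpoint.

    signed = get_signed_url(rse_id='deadbeefdeadbeefdeadbeefdeadbeef',
                            service='s3',
                            operation='read',
                            url='https://s3.example.org:443/mybucket/store/data/file.root',
                            lifetime=600)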
Example #22
                        # TODO: fix when meta-data is working properly
                        filename = datasetname + '._' + uuid()
                        onefile = {'name': filename, 'scope': scope, 'bytes': tz_filesize, 'pfn': 'mock://srm-atlas.cern.ch:8443/castor/cern.ch/grid/atlas/tzero/this/is/from/the/emulation/%s' % filename}
                        no_files += 1
                        newfiles.append(onefile)
                    d[3] = target
                    # with monitor.record_timer_block(['panda.attach_dids_to_dids', ('panda.attach_dids_to_dids.normalized_datasets', len(inserts)), ('panda.attach_dids_to_dids.normalized_files', no_files)]):
                    #   client.attach_dids_to_dids(attachments=inserts)
                    monitor.record_counter('emulator.tzero.client.files', len(newfiles))
                    while len(newfiles):
                        with monitor.record_timer_block(['emulator.tzero.client.add_files_to_dataset', ('emulator.tzero.client.add_files_to_dataset.normalized', len(newfiles))]):
                            try:
                                client.add_files_to_dataset(scope=scope, name=datasetname, files=newfiles[:500], rse=tz_rse)
                                del newfiles[:500]
                            except Exception as e:
                                print(traceback.format_exc())
            delta = time.time() - now
            print('== TZero: Appending %s files to %s datasets took %s seconds' % (no_files, len(ds), delta))
            monitor.record_timer('emulator.tzero.registering_all_replicas', delta * 1000)
            monitor.record_timer('emulator.tzero.registering_all_replicas.normalized', (delta * 1000) / no_files)

        # close all datasets
        for scope, datasetname, s, c in ds:
            print('== TZero: Close dataset %s:%s' % (scope, datasetname))
            with monitor.record_timer_block('emulator.tzero.client.close'):
                try:
                    client.close(scope=scope, name=datasetname)
                except Exception as e:
                    monitor.record_counter('emulator.exceptions.tzero.EMULATION_RUN.%s' % e.__class__.__name__, 1)
                    print(traceback.format_exc())
Example #23
def reaper(rses,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60):
    """
    Main loop to select and delete files.

    :param rses:           List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param chunk_size:     The size of chunk for deletion.
    :param once:           If True, only runs one iteration of the main loop.
    :param greedy:         If True, delete right away replicas with tombstone.
    :param scheme:         Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:  The delay to query replicas in BEING_DELETED state.
    :param sleep_time:     Time between two cycles.
    """

    try:
        max_deletion_thread = get('reaper', 'nb_workers_by_hostname')
    except ConfigNotFound as error:
        max_deletion_thread = 5
    hostname = socket.getfqdn()
    executable = sys.argv[0]
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info('%s Reaper starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info('%s Reaper started', prepend_str)

    while not GRACEFUL_STOP.is_set():

        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] +
                                                 1, heart_beat['nr_threads'])
            tot_needed_free_space = 0
            for rse in rses:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blacklisted
                if rse['availability'] % 2 == 0:
                    logging.debug('%s RSE %s is blacklisted for delete',
                                  prepend_str, rse['rse'])
                    continue
                max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(
                    rse['rse'], rse['id'], prepend_str)
                # Check if greedy mode
                if greedy:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        1000000000000, max_being_deleted_files
                    ]
                    tot_needed_free_space += 1000000000000
                else:
                    if needed_free_space:
                        dict_rses[(rse['rse'], rse['id'])] = [
                            needed_free_space, max_being_deleted_files
                        ]
                        tot_needed_free_space += needed_free_space
                    else:
                        logging.debug('%s Nothing to delete on %s',
                                      prepend_str, rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=itemgetter(1), reverse=True))
            logging.debug(
                '%s List of RSEs to process ordered by needed space desc : %s',
                prepend_str, str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostname, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logging.debug('%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversely with the number of workers
                # The ceil increases the weight of RSEs with a small number of files to delete
                max_workers = ceil(dict_rses[rse_key][0] /
                                   tot_needed_free_space * 1000 /
                                   heart_beat['nr_threads'])
                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            skip_until_next_run = []
            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                if rse_id in skip_until_next_run:
                    continue
                logging.debug(
                    '%s Working on %s. Percentage of the total space needed %.2f',
                    prepend_str, rse_name,
                    needed_free_space / tot_needed_free_space * 100)
                rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logging.debug('%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]

                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logging.debug(
                        '%s Too many deletion threads for %s on RSE %s. Back off',
                        prepend_str, rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue

                logging.info(
                    '%s Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    prepend_str, rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logging.debug('%s Total deletion workers for %s : %i',
                              prepend_str, rse_hostname,
                              tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            session=None)
                    logging.debug(
                        '%s list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        prepend_str, rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logging.info(
                            '%s Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            prepend_str, rse_name, chunk_size, len(replicas))
                        skip_until_next_run.append(rse_id)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logging.error('%s %s', prepend_str, str(error))
                    continue
                except Exception:
                    logging.critical('%s %s', prepend_str,
                                     str(traceback.format_exc()))

                # Physical deletion takes place here
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme)
                    for file_replicas in chunks(replicas, 100):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    list(rsemgr.lfns2pfns(
                                        rse_settings=rse_info,
                                        lfns=[{
                                            'scope': replica['scope'],
                                            'name': replica['name'],
                                            'path': replica['path']
                                        }],
                                        operation='delete',
                                        scheme=scheme).values())[0])
                                time.sleep(random.uniform(0, 0.01))
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logging.warning(
                                    '%s Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s',
                                    prepend_str, replica['scope'],
                                    replica['name'], rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logging.critical('%s %s', prepend_str,
                                                 str(traceback.format_exc()))

                        deleted_files = delete_from_storage(
                            file_replicas, prot, rse_info, staging_areas,
                            prepend_str)
                        logging.info('%s %i files processed in %s seconds',
                                     prepend_str, len(file_replicas),
                                     time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logging.debug(
                            '%s delete_replicas succeeded on %s: %s replicas in %s seconds',
                            prepend_str, rse_name, len(deleted_files),
                            time.time() - del_start)
                        monitor.record_counter(counters='reaper.deletion.done',
                                               delta=len(deleted_files))

                except Exception as error:
                    logging.critical('%s %s', prepend_str,
                                     str(traceback.format_exc()))

            if once:
                break

            tottime = time.time() - start_time
            if tottime < sleep_time:
                logging.info('%s Will sleep for %s seconds', prepend_str,
                             sleep_time - tottime)
                time.sleep(sleep_time - tottime)

        except DatabaseException as error:
            logging.warning('%s Reaper:  %s', prepend_str, str(error))
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
    return
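
The reaper above wraps each expensive step in monitor.record_timer_block and reports throughput with monitor.record_counter. The following minimal sketch (not part of the daemon; timed_cleanup and its delete_replicas argument are hypothetical names, and it assumes the rucio.core.monitor module used throughout these examples) isolates that pattern:

from rucio.core import monitor

def timed_cleanup(rse_id, deleted_files, delete_replicas):
    # Time the catalogue deletion under the same statistic name the daemon uses.
    with monitor.record_timer_block('reaper.delete_replicas'):
        delete_replicas(rse_id=rse_id, files=deleted_files)
    # Report how many replicas were removed in this chunk.
    monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))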
Example #24
def reaper(rses,
           include_rses,
           exclude_rses,
           vos=None,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60,
           auto_exclude_threshold=100,
           auto_exclude_timeout=600):
    """
    Main loop to select and delete files.

    :param rses:                   List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param include_rses:           RSE expression to include RSEs.
    :param exclude_rses:           RSE expression to exclude RSEs from the Reaper.
    :param vos:                    VOs on which to look for RSEs. Only used in multi-VO mode.
                                   If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param chunk_size:             The size of chunk for deletion.
    :param once:                   If True, only runs one iteration of the main loop.
    :param greedy:                 If True, delete right away replicas with tombstone.
    :param scheme:                 Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:          The delay to query replicas in BEING_DELETED state.
    :param sleep_time:             Time between two cycles.
    :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded.
    :param auto_exclude_timeout:   Timeout for temporarily excluded RSEs.
    """
    hostname = socket.getfqdn()
    executable = 'reaper'
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                      heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Reaper starting')

    if not once:
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                      heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Reaper started')

    while not GRACEFUL_STOP.is_set():
        # try to get auto exclude parameters from the config table. Otherwise use CLI parameters.
        try:
            auto_exclude_threshold = config_get('reaper',
                                                'auto_exclude_threshold',
                                                default=auto_exclude_threshold)
            auto_exclude_timeout = config_get('reaper',
                                              'auto_exclude_timeout',
                                              default=auto_exclude_timeout)
        except (NoOptionError, NoSectionError, RuntimeError):
            pass

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get(
                'reaper', 'max_evaluator_backlog_count')
        except (NoOptionError, NoSectionError, RuntimeError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get(
                'reaper', 'max_evaluator_backlog_duration')
        except (NoOptionError, NoSectionError, RuntimeError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog()
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[
                    0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[
                    1] and backlog[1] < datetime.utcnow() - timedelta(
                        minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses,
                                              vos)
        if not rses_to_process:
            logger(logging.ERROR,
                   'Reaper: No RSEs found. Will sleep for 30 seconds')
            GRACEFUL_STOP.wait(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                              heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blocklisted
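                # (availability is a bit mask; an even value means the delete bit is unset, so skip)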
                if rse['availability'] % 2 == 0:
                    logger(logging.DEBUG, 'RSE %s is blocklisted for delete',
                           rse['rse'])
                    continue
                needed_free_space, only_delete_obsolete = __check_rse_usage(
                    rse['rse'], rse['id'], greedy=greedy, logger=logger)
                if needed_free_space:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                    tot_needed_free_space += needed_free_space
                elif only_delete_obsolete:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                else:
                    logger(logging.DEBUG, 'Nothing to delete on %s',
                           rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True))
            logger(logging.DEBUG,
                   'List of RSEs to process ordered by needed space desc: %s',
                   str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has the RSE as key and (hostname, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversely with the number of workers
                # The ceil increases the weight of RSEs with a small amount of files to delete
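                # e.g. an RSE needing 40% of tot_needed_free_space with 4 reaper threads
                # gets ceil(0.4 * 1000 / 4) = 100 entries in list_rses_mult below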
                if tot_needed_free_space:
                    max_workers = ceil(dict_rses[rse_key][0] /
                                       tot_needed_free_space * 1000 /
                                       heart_beat['nr_threads'])
                else:
                    max_workers = 1

                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            paused_rses = []
            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id,
                                    expiration_time=120)
                if result is not NO_VALUE:
                    paused_rses.append(rse_name)
                    logger(
                        logging.DEBUG,
                        'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                        rse_name)
                    continue
                result = REGION.get('temporary_exclude_%s' % rse_id,
                                    expiration_time=auto_exclude_timeout)
                if result is not NO_VALUE:
                    logger(
                        logging.WARNING,
                        'Too many failed attempts for %s in the last cycle. RSE is temporarily excluded.',
                        rse_name)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    continue
                labels = {'rse': rse_name}
                EXCLUDED_RSE_GAUGE.labels(**labels).set(0)
                percent = 0
                if tot_needed_free_space:
                    percent = needed_free_space / tot_needed_free_space * 100
                logger(
                    logging.DEBUG,
                    'Working on %s. Percentage of the total space needed %.2f',
                    rse_name, percent)
                try:
                    rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                except KeyError:
                    logger(logging.DEBUG, "Hostname lookup for %s failed.",
                           rse_name)
                    REGION.set('pause_deletion_%s' % rse_id, True)
                    continue
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]
                max_deletion_thread = get_max_deletion_threads_by_hostname(
                    rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logger(
                        logging.DEBUG,
                        'Too many deletion threads for %s on RSE %s. Back off',
                        rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue
                logger(
                    logging.INFO,
                    'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logger(logging.DEBUG, 'Total deletion workers for %s : %i',
                       rse_hostname, tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                only_delete_obsolete = dict_rses[(rse_name, rse_id)][1]
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        if only_delete_obsolete:
                            logger(
                                logging.DEBUG,
                                'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                                rse_name)
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes_=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            only_delete_obsolete=only_delete_obsolete,
                            session=None)
                    logger(
                        logging.DEBUG,
                        'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logger(
                            logging.DEBUG,
                            'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logger(logging.ERROR, '%s', str(error))
                    continue
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)
                # Physical deletion takes place here
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme,
                                                  logger=logger)
                    for file_replicas in chunks(replicas, chunk_size):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    list(
                                        rsemgr.lfns2pfns(
                                            rse_settings=rse_info,
                                            lfns=[{
                                                'scope':
                                                replica['scope'].external,
                                                'name': replica['name'],
                                                'path': replica['path']
                                            }],
                                            operation='delete',
                                            scheme=scheme).values())[0])
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logger(
                                    logging.WARNING,
                                    'Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s',
                                    replica['scope'], replica['name'],
                                    rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logger(logging.CRITICAL,
                                       'Exception',
                                       exc_info=True)

                        deleted_files = delete_from_storage(
                            file_replicas,
                            prot,
                            rse_info,
                            staging_areas,
                            auto_exclude_threshold,
                            logger=logger)
                        logger(logging.INFO,
                               '%i files processed in %s seconds',
                               len(file_replicas),
                               time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logger(
                            logging.DEBUG,
                            'delete_replicas succeeded on %s: %s replicas in %s seconds',
                            rse_name, len(deleted_files),
                            time.time() - del_start)
                        DELETION_COUNTER.inc(len(deleted_files))
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            if paused_rses:
                logger(logging.INFO,
                       'Deletion paused for a while for following RSEs: %s',
                       ', '.join(paused_rses))

            if once:
                break

            daemon_sleep(start_time=start_time,
                         sleep_time=sleep_time,
                         graceful_stop=GRACEFUL_STOP,
                         logger=logger)

        except DatabaseException as error:
            logger(logging.WARNING, 'Reaper:  %s', str(error))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
    return
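
Example #24 pauses or temporarily excludes an RSE by writing short-lived flags into a dogpile.cache region (REGION) and testing for NO_VALUE. A self-contained sketch of that bookkeeping, assuming a plain in-memory backend (the daemon configures its own region elsewhere) and hypothetical helper names:

from dogpile.cache import make_region
from dogpile.cache.api import NO_VALUE

REGION = make_region().configure('dogpile.cache.memory', expiration_time=600)

def should_skip_rse(rse_id, auto_exclude_timeout=600):
    # Deletion was paused because the previous cycle found too few replicas.
    if REGION.get('pause_deletion_%s' % rse_id, expiration_time=120) is not NO_VALUE:
        return True
    # Too many failed deletion attempts: the RSE is temporarily excluded.
    if REGION.get('temporary_exclude_%s' % rse_id,
                  expiration_time=auto_exclude_timeout) is not NO_VALUE:
        return True
    return False

def pause_rse(rse_id):
    # Set when an RSE returns fewer replicas than requested.
    REGION.set('pause_deletion_%s' % rse_id, True)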
Example #25
    def sync_site(self, site_pair):
        """
        Sync a site defined by a site_pair of (site, prefix). Prefix can be None to sync all blocks in the site
        :return:
        """
        site, prefix = site_pair
        # now = int(time.time())

        # Set 1980 as the last sync date if no data exists
        # site_last_synced = self.last_synced.get(site_pair, 10 * 365 * 24 * 3600)
        # last_week = int(site_last_synced - 7 * 24 * 3600)

        if self.config.get('default', None):
            if self.config['default'].get('chunck', 0):
                BLOCKS_PER_ACTION = int(self.config['default']['chunck'])

        with monitor.record_timer_block('cms_sync.time_site_sync'):

            r_timer = 'cms_sync.time_rucio_block_list_all'
            p_timer = 'cms_sync.time_phedex_block_list_all'
            if prefix:
                r_timer = 'cms_sync.time_rucio_block_list_partial'
                p_timer = 'cms_sync.time_phedex_block_list_partial'

            with monitor.record_timer_block(p_timer):
                phedex_blocks = self.phedex_svc.blocks_at_site(pnn=site,
                                                               prefix=prefix,
                                                               since=None)
            with monitor.record_timer_block(r_timer):
                rucio_blocks = self.get_datasets_at_rse(rse=site,
                                                        prefix=prefix)

            n_blocks_in_phedex = len(phedex_blocks)
            n_blocks_in_rucio = len(rucio_blocks)

            # FIXME: This is refusing to delete everything from Rucio. Not clear it's needed
            if not n_blocks_in_phedex and n_blocks_in_rucio:
                logging.warning(
                    "At %s found %s blocks in PhEDEx and %s in Rucio with prefix %s",
                    site, n_blocks_in_phedex, n_blocks_in_rucio, prefix)
                return
            if not n_blocks_in_phedex and not n_blocks_in_rucio:
                logging.info(
                    "At %s:%s, nothing in PhEDEx or Rucio. Quitting." %
                    (site, prefix))
                return

            block_report = compare_site_blocks(phedex=phedex_blocks,
                                               rucio=rucio_blocks,
                                               rse=site)

            n_blocks_not_in_rucio = len(block_report['not_rucio'])
            n_blocks_not_in_phedex = len(block_report['not_phedex'])
            n_incomplete_blocks = len(block_report['incomplete'])

            logging.info("At %s: In both/PhEDEx only/Rucio only: %s/%s/%s" %
                         (site, len(block_report['complete']),
                          n_blocks_not_in_rucio, n_blocks_not_in_phedex))

            # Truncate lists if we want to reduce cycle time
            if BLOCKS_PER_ACTION and n_blocks_not_in_rucio > BLOCKS_PER_ACTION:
                block_report['not_rucio'] = set(
                    list(block_report['not_rucio'])[:BLOCKS_PER_ACTION])
                n_blocks_not_in_rucio = len(block_report['not_rucio'])
            if BLOCKS_PER_ACTION and n_blocks_not_in_phedex > BLOCKS_PER_ACTION:
                block_report['not_phedex'] = set(
                    list(block_report['not_phedex'])[:BLOCKS_PER_ACTION])
                n_blocks_not_in_phedex = len(block_report['not_phedex'])

            logging.info('Adding   %6d blocks to   Rucio for %s:%s',
                         n_blocks_not_in_rucio, site, prefix)
            for block in block_report['not_rucio']:
                bs = BlockSyncer(block_name=block, pnn=site, rse=site)
                bs.add_to_rucio()

            logging.info('Removing %6d blocks from Rucio for %s:%s',
                         n_blocks_not_in_phedex, site, prefix)
            for block in block_report['not_phedex']:
                bs = BlockSyncer(block_name=block, pnn=site, rse=site)
                bs.remove_from_rucio()

            for block in block_report['incomplete']:
                logging.warning('Redoing sync for %s at %s', block, site)
                bs = BlockSyncer(block_name=block, pnn=site, rse=site)
                bs.add_to_rucio(recover=True)

            logging.info('Finished syncing                      %s:%s' %
                         (site, prefix))
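
sync_site above chooses different timer names depending on whether a prefix restricts the listing, so full and partial syncs show up as separate statistics. A minimal sketch of that selection, assuming the same monitor module as the other examples (list_blocks is a hypothetical callable standing in for the PhEDEx or Rucio listing):

from rucio.core import monitor

def timed_block_listing(list_blocks, site, prefix=None):
    # Report prefix-limited listings under their own statistic name.
    timer = 'cms_sync.time_rucio_block_list_all'
    if prefix:
        timer = 'cms_sync.time_rucio_block_list_partial'
    with monitor.record_timer_block(timer):
        return list_blocks(site, prefix)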
Example #26
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            max_deleting_rate = 0
            for rse in sort_rses(rses):
                try:
                    if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number,
                                 rse['rse'], nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse_id=rse['id'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])

                    if not rse_protocol['availability_delete']:
                        logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                            protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse'])
                        else:
                            if total_children and total_children > 0:
                                needed_free_space_per_child = needed_free_space / float(total_children)

                    start = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse_id=rse['id'],
                                                          bytes=needed_free_space_per_child,
                                                          limit=max_being_deleted_files,
                                                          worker_number=child_number,
                                                          total_workers=total_children,
                                                          delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas))

                    if not replicas:
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        logging.info('Reaper %s-%s: No replicas to delete %s. The next check will occur at %s',
                                     worker_number, child_number, rse['rse'],
                                     nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files))
                        try:
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True)
                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                               lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                                                                               operation='delete', scheme=scheme).values())[0])
                                except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        deletion_dict = {'scope': replica['scope'].external,
                                                         'name': replica['name'],
                                                         'rse': rse_info['rse'],
                                                         'rse_id': rse_info['id'],
                                                         'file-size': replica['bytes'],
                                                         'bytes': replica['bytes'],
                                                         'url': replica['pfn'],
                                                         'protocol': prot.attributes['scheme']}
                                        if replica['scope'].vo != 'def':
                                            deletion_dict['vo'] = replica['scope'].vo
                                        logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                            logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                                                            worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                                                    pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000)
                                        duration = time.time() - start

                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                        deletion_dict['duration'] = duration
                                        add_message('deletion-done', deletion_dict)
                                        logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration)
                                    except SourceNotFound:
                                        err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        if replica['state'] == ReplicaState.AVAILABLE:
                                            deletion_dict['reason'] = str(err_msg)
                                            add_message('deletion-failed', deletion_dict)
                                    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except Exception as error:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                for replica in files:
                                    logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                    payload = {'scope': replica['scope'].external,
                                               'name': replica['name'],
                                               'rse': rse_info['rse'],
                                               'rse_id': rse_info['id'],
                                               'file-size': replica['bytes'],
                                               'bytes': replica['bytes'],
                                               'url': replica['pfn'],
                                               'reason': str(error),
                                               'protocol': prot.attributes['scheme']}
                                    if replica['scope'].vo != 'def':
                                        payload['vo'] = replica['scope'].vo
                                    add_message('deletion-failed', payload)
                                    break
                            finally:
                                prot.close()
                            start = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start)
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))

                        except DatabaseException as error:
                            logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())

                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error))

                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            time.sleep(1)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
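
The deletion loops above iterate with chunks(replicas, chunk_size) to bound the size of each bulk operation. chunks is not shown in these excerpts; a typical implementation over a list looks like the following sketch (the names are assumptions, not the daemon's exact helper):

def chunks(items, chunk_size):
    # Yield successive chunk_size-sized slices of a list.
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]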
Example #27
    def EMULATION_RUN(self, interval, filescale, dataXX, runnumber, runspertag, calibfraction, timespan):
        tz_account = 'tzero'
        tz_group = 'tzero'
        tz_provenance = 'T0'
        tz_rse = 'CERN-PROD_TZERO'
        tz_filesize = 2000000000     # 2GB

        runnumber = random.randrange(400000, 500000)
        client = Client(account=tz_account)

        # Initializing run data
        if random.random() > calibfraction:
            print('== TZero: Starting calibration run no. %s' % runnumber)
            pattern = [('data12_8TeV.NNNNNNNN.calibration_LArCellsEmpty.daq.RAW', 36),
                       ('data12_8TeV.NNNNNNNN.calibration_Tile.daq.RAW', 36),
                       ('data12_8TeV.NNNNNNNN.calibration_lucid.daq.RAW', 6),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.AOD.f1_m4', 11),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.AOD.x1_m4', 11),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.HIST.f1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.HIST.x1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.NTUP_BKGD.x1_m2', 7),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.NTUP_SCT.f1_m2', 7),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.RAW', 208),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.TAG.f1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.TAG.x1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Background.recon.ESD.f1', 208),
                       ('data12_8TeV.NNNNNNNN.physics_Background.recon.ESD.x1', 208),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.AOD.f1_m4', 11),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.AOD.x1_m4', 11),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.HIST.f1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.HIST.x1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.RAW', 205),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.TAG.f1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.TAG.x1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.recon.ESD.f1', 205),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.recon.ESD.x1', 205),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.merge.AOD.x1_m4', 11),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.merge.HIST.x1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.merge.RAW', 207),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.merge.TAG.x1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.recon.ESD.x1', 207),
                       ]
        else:
            print('== TZero: Starting data run no. %s' % runnumber)
            pattern = [('data12_8TeV.NNNNNNNN.calibration_IDTracks.daq.RAW', 141),
                       ('data12_8TeV.NNNNNNNN.calibration_LArCells.daq.RAW', 63),
                       ('data12_8TeV.NNNNNNNN.calibration_LArCellsEmpty.daq.RAW', 63),
                       ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.daq.RAW', 106),
                       ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.merge.AOD.c819_m4', 66),
                       ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.merge.NTUP_IDVTXLUMI.c819_m2', 10),
                       ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.merge.TAG.c819_m4_m3', 2),
                       ('data12_8TeV.NNNNNNNN.calibration_PixelBeam.recon.ESD.c819', 271),
                       ('data12_8TeV.NNNNNNNN.calibration_PixelNoise.daq.RAW', 63),
                       ('data12_8TeV.NNNNNNNN.calibration_SCTNoise.daq.RAW', 63),
                       ('data12_8TeV.NNNNNNNN.calibration_Tile.daq.RAW', 63),
                       ('data12_8TeV.NNNNNNNN.calibration_beamspot.daq.RAW', 63),
                       ('data12_8TeV.NNNNNNNN.calibration_lucid.daq.RAW', 10),
                       ('data12_8TeV.NNNNNNNN.debug_all.daq.RAW', 4),
                       ('data12_8TeV.NNNNNNNN.debugrec_hltacc.merge.AOD.g1_f2_m4', 1),
                       ('data12_8TeV.NNNNNNNN.debugrec_hltacc.merge.NTUP_FASTMON.g1_f2_m4_m8', 1),
                       ('data12_8TeV.NNNNNNNN.debugrec_hltacc.merge.RAW.g1', 1),
                       ('data12_8TeV.NNNNNNNN.debugrec_hltacc.merge.TAG.g1_f2_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.debugrec_hltacc.recon.ESD.g1_f2', 1),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.AOD.f2_m4', 21),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.AOD.x1_m4', 21),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.HIST.f2_m1', 1),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.HIST.x1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.NTUP_SCT.f2_m2', 31),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.NTUP_TRIG.x1_m2', 121),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.RAW', 402),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.TAG.f2_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.express_express.merge.TAG.x1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.express_express.recon.ESD.f2', 402),
                       ('data12_8TeV.NNNNNNNN.express_express.recon.ESD.x1', 3614),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.AOD.f2_m4', 22),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.AOD.x1_m4', 22),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.HIST.f2_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.HIST.x1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.NTUP_BKGD.x1_m2', 15),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.NTUP_SCT.f2_m2', 15),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.RAW', 429),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.TAG.f2_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Background.merge.TAG.x1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Background.recon.ESD.f2', 429),
                       ('data12_8TeV.NNNNNNNN.physics_Background.recon.ESD.x1', 429),
                       ('data12_8TeV.NNNNNNNN.physics_Bphysics.merge.RAW', 3150),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.AOD.f2_m4', 22),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.AOD.x1_m4', 22),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.HIST.f2_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.HIST.x1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.RAW', 425),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.TAG.f2_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.merge.TAG.x1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.recon.ESD.f2', 425),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicCalo.recon.ESD.x1', 425),
                       ('data12_8TeV.NNNNNNNN.physics_CosmicMuons.merge.RAW', 399),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.AOD.f2_m4', 312),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DESDM_EGAMMA.f2_m6', 31),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DESDM_RPVLL.f2_m6', 21),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DESD_PHOJET.f2_m6', 74),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DESD_SGLEL.f2_m6', 57),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.DRAW_ZEE.f2_m9', 37),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.HIST.f2_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.NTUP_FASTMON.f2_m4_m8', 18),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.RAW', 3626),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.merge.TAG.f2_m4_m3', 7),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.recon.DAOD_ZEE.f2_m9_f2', 37),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.recon.DESD_ZEE.f2_m9_f2', 37),
                       ('data12_8TeV.NNNNNNNN.physics_Egamma.recon.ESD.f2', 3626),
                       ('data12_8TeV.NNNNNNNN.physics_HadDelayed.merge.RAW', 3215),
                       ('data12_8TeV.NNNNNNNN.physics_IDCosmic.merge.RAW', 402),
                       ('data12_8TeV.NNNNNNNN.physics_JetCalibDelayed.merge.RAW', 403),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.AOD.f2_m4', 364),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.DESDM_RPVLL.f2_m6', 150),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.DESDM_TRACK.f2_m6', 25),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.DESD_CALJET.f2_m6', 71),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.HIST.f2_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.NTUP_FASTMON.f2_m4_m8', 27),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.RAW', 3624),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.merge.TAG.f2_m4_m3', 8),
                       ('data12_8TeV.NNNNNNNN.physics_JetTauEtmiss.recon.ESD.f2', 3624),
                       ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.AOD.f2_m4', 21),
                       ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.DESD_MBIAS.f2_m6', 24),
                       ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.HIST.f2_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.RAW', 403),
                       ('data12_8TeV.NNNNNNNN.physics_MinBias.merge.TAG.f2_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_MinBias.recon.ESD.f2', 403),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.merge.AOD.f2_m4', 334),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.merge.DESDM_RPVLL.f2_m6', 54),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.merge.DESD_SGLMU.f2_m6', 254),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.merge.DRAW_ZMUMU.f2_m9', 47),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.merge.HIST.f2_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.merge.NTUP_FASTMON.f2_m4_m8', 19),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.merge.RAW', 3629),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.merge.TAG.f2_m4_m3', 7),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.recon.DAOD_ZMUMU.f2_m9_f1', 47),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.recon.DESD_ZMUMU.f2_m9_f1', 47),
                       ('data12_8TeV.NNNNNNNN.physics_Muons.recon.ESD.f2', 3629),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.merge.AOD.x1_m4', 2),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.merge.HIST.x1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.merge.RAW', 24),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.merge.TAG.x1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_Standby.recon.ESD.x1', 24),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.AOD.f2_m4', 21),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.AOD.x1_m4', 21),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.HIST.f2_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.HIST.x1_m1', 1),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.RAW', 402),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.TAG.f2_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.merge.TAG.x1_m4_m3', 1),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.recon.ESD.f2', 402),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBias.recon.ESD.x1', 402),
                       ('data12_8TeV.NNNNNNNN.physics_ZeroBiasOverlay.merge.RAW', 404),
                       ('data12_calib.0NNNNNNNN.calibration_MuonAll.daq.RAW', 70),
                       ]

        factor = runnumber // runspertag  # integer division keeps the derived tag numbers integral
        tagdict = {'x1': 'x%s' % (factor * 1 + 1),
                   'f1': 'f%s' % (factor * 2 + 1),
                   'f2': 'f%s' % (factor * 2 + 2),
                   'm1': 'm%s' % (factor * 10 + 1),
                   'm2': 'm%s' % (factor * 10 + 2),
                   'm3': 'm%s' % (factor * 10 + 3),
                   'm4': 'm%s' % (factor * 10 + 4),
                   'm6': 'm%s' % (factor * 10 + 6),
                   'm8': 'm%s' % (factor * 10 + 8),
                   'm9': 'm%s' % (factor * 10 + 9)
                   }

        # pre compute all datasets
        success = False
        while not success:
            try:
                ds = []
                tmp_uuid = uuid()
                for n, s in pattern:
                    # newname = n.replace('data12', dataXX).replace('NNNNNNNN', str(runnumber).zfill(8)) % tagdict
                    # TODO: fix when meta-data is working properly
                    newname = n.replace('data12', dataXX).replace('NNNNNNNN', tmp_uuid) % tagdict
                    newscope = newname.split('.')[0]
                    newsize = s * filescale
                    ds.append([newscope, newname, newsize, 0])  # 0 is initial value of file counter

                # register run number
                # client.add_value('run_number', runnumber)

                # open all datasets
                open_ds = list()
                for scope, datasetname, s, c in ds:
                    pcs = datasetname.split('.')
                    meta = {'project': pcs[0],
                            'stream_name': pcs[2],
                            'prod_step': pcs[3],
                            'datatype': pcs[4],
                            # 'version': pcs[5],
                            'guid': uuid(),
                            }
                    meta['group'] = tz_group
                    meta['provenance'] = tz_provenance
                    meta['run_number'] = runnumber
                    open_ds.append({'scope': scope, 'name': datasetname,
                                    'statuses': {'monotonic': True}, 'meta': meta,
                                    'rules': [{'account': tz_account, 'copies': 1,
                                               'rse_expression': tz_rse, 'grouping': 'DATASET'}]})  # Omitting the lifetime attribute indicates an infinite lifetime
                with monitor.record_timer_block(['emulator.tzero.client.add_datasets', ('emulator.tzero.client.add_datasets.normalized', len(open_ds))]):
                    client.add_datasets(open_ds)
                monitor.record_counter('emulator.tzero.client.datasets', len(open_ds))
                # dids = [{'scope': d['scope'], 'name': d['name']} for d in open_ds]
                # with monitor.record_timer_block(['tzero.add_replication_rule', ('tzero.add_replication_rule.normalized', len(open_ds))]):
                #    client.add_replication_rule(dids, copies=1, rse_expression=tz_rse,
                #                                grouping='DATASET', account=tz_account)
                success = True
            except Exception as e:
                monitor.record_counter('emulator.exceptions.tzero.EMULATION_RUN.%s' % e.__class__.__name__, 1)
                print(traceback.format_exc())
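
Note on the pattern used above: record_timer_block accepts either a single stat name or a list of names, and a (name, count) tuple in that list appears to record a second, normalized stat (the elapsed time divided by count). A minimal generic sketch of that pattern, assuming rucio.core.monitor is importable and that process and the 'myapp.batch' stat names are placeholders:

from rucio.core import monitor

def timed_batch(items, process):
    # Record the raw duration under 'myapp.batch' and the duration divided by
    # len(items) under 'myapp.batch.normalized' (the tuple supplies the divisor).
    with monitor.record_timer_block(['myapp.batch', ('myapp.batch.normalized', len(items))]):
        process(items)
    # Count how many items were handled in this batch.
    monitor.record_counter('myapp.batch.items', len(items))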
Example #28
0
    def sync_site(self, site_pair):
        """
        Sync a site defined by a site_pair of (site, prefix). Prefix can be None to sync all blocks in the site
        :return:
        """
        site, prefix = site_pair

        if site.endswith('_Tape'):
            pnn = site.replace('_Tape', '_MSS')
        else:
            pnn = site

        # now = int(time.time())

        # Set 1980 as the last sync date if no data exists
        # site_last_synced = self.last_synced.get(site_pair, 10 * 365 * 24 * 3600)
        # last_week = int(site_last_synced - 7 * 24 * 3600)

        if self.config.get('default', None):
            if self.config['default'].get('chunck', 0):
                BLOCKS_PER_ACTION = int(self.config['default']['chunck'])
            if self.config['default'].get('select', None):
                self.patterns = [self.config['default']['select']]

        with monitor.record_timer_block('cms_sync.time_site_sync'):

            r_timer = 'cms_sync.time_rucio_block_list_all'
            p_timer = 'cms_sync.time_phedex_block_list_all'
            if prefix:
                r_timer = 'cms_sync.time_rucio_block_list_partial'
                p_timer = 'cms_sync.time_phedex_block_list_partial'

            # Add touches to keep from getting killed as long as progress is being made
            with monitor.record_timer_block(p_timer):
                touch(text='PQ ' + site)
                phedex_blocks = self.phedex_svc.blocks_at_site(pnn=pnn,
                                                               prefix=prefix,
                                                               since=None)
            with monitor.record_timer_block(r_timer):
                touch(text='RQ ' + site)
                rucio_blocks = self.get_datasets_at_rse(rse=site,
                                                        prefix=prefix)
                touch(text='DQ ' + site)

            n_blocks_in_phedex = len(phedex_blocks)
            n_blocks_in_rucio = len(rucio_blocks)

            # FIXME: This refuses to delete everything from Rucio when PhEDEx reports no blocks. Not clear it's needed
            if not n_blocks_in_phedex and n_blocks_in_rucio:
                logging.warning(
                    "At %s found %s blocks in PhEDEx and %s in Rucio with prefix %s",
                    site, n_blocks_in_phedex, n_blocks_in_rucio, prefix)
                return
            if not n_blocks_in_phedex and not n_blocks_in_rucio:
                logging.info(
                    "At %s:%s, nothing in PhEDEx or Rucio. Quitting." %
                    (site, prefix))
                return

            block_report = compare_site_blocks(phedex=phedex_blocks,
                                               rucio=rucio_blocks,
                                               rse=site,
                                               patterns=self.patterns)

            n_blocks_not_in_rucio = len(block_report['not_rucio'])
            n_blocks_not_in_phedex = len(block_report['not_phedex'])
            n_incomplete_blocks = len(block_report['incomplete'])

            logging.info("At %s:%s In both/PhEDEx only/Rucio only: %s/%s/%s" %
                         (site, prefix, len(block_report['complete']),
                          n_blocks_not_in_rucio, n_blocks_not_in_phedex))
            if len(block_report['complete']) or n_blocks_not_in_rucio or n_blocks_not_in_phedex:
                logging.info(
                    'At %s:%s %3.0f%% complete', site, prefix,
                    len(block_report['complete']) * 100 /
                    (len(block_report['complete']) + n_blocks_not_in_rucio +
                     n_blocks_not_in_phedex))
            if len(block_report['complete']) or n_blocks_not_in_rucio:
                logging.info(
                    'At %s:%s %3.0f%% completely added', site, prefix,
                    len(block_report['complete']) * 100 /
                    (len(block_report['complete']) + n_blocks_not_in_rucio))
            # Truncate lists if we want to reduce cycle time
            if BLOCKS_PER_ACTION and n_blocks_not_in_rucio > BLOCKS_PER_ACTION:
                block_report['not_rucio'] = set(
                    list(block_report['not_rucio'])[:BLOCKS_PER_ACTION])
                n_blocks_not_in_rucio = len(block_report['not_rucio'])
            if BLOCKS_PER_ACTION and n_blocks_not_in_phedex > BLOCKS_PER_ACTION:
                block_report['not_phedex'] = set(
                    list(block_report['not_phedex'])[:BLOCKS_PER_ACTION])
                n_blocks_not_in_phedex = len(block_report['not_phedex'])

            logging.info('Adding   %6d blocks to   Rucio for %s:%s',
                         n_blocks_not_in_rucio, site, prefix)
            for block in block_report['not_rucio']:
                logging.info('Adding to rucio: %s at %s', block, site)
                bs = BlockSyncer(block_name=block, pnn=pnn, rse=site)
                bs.add_to_rucio()

            logging.info('Removing %6d blocks from Rucio for %s:%s',
                         n_blocks_not_in_phedex, site, prefix)
            for block in block_report['not_phedex']:
                logging.info('Removing from rucio: %s at %s', block, site)
                bs = BlockSyncer(block_name=block, pnn=pnn, rse=site)
                bs.remove_from_rucio()

            for block in block_report['incomplete']:
                logging.warning('Redoing sync for %s at %s', block, site)
                bs = BlockSyncer(block_name=block, pnn=pnn, rse=site)
                bs.add_to_rucio(recover=True)

            logging.info('Finished syncing                      %s:%s' %
                         (site, prefix))
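
The sync_site method above nests timer blocks: the outer 'cms_sync.time_site_sync' block covers the whole site synchronization, while the inner blocks time the PhEDEx and Rucio listing steps separately. A minimal sketch of that nesting, assuming the same monitor module as the surrounding examples and hypothetical fetch_phedex_blocks / fetch_rucio_blocks callables:

from rucio.core import monitor

def timed_site_listing(site, fetch_phedex_blocks, fetch_rucio_blocks):
    # The outer timer measures the full listing; each inner timer isolates one backend call.
    with monitor.record_timer_block('cms_sync.time_site_sync'):
        with monitor.record_timer_block('cms_sync.time_phedex_block_list_all'):
            phedex_blocks = fetch_phedex_blocks(site)
        with monitor.record_timer_block('cms_sync.time_rucio_block_list_all'):
            rucio_blocks = fetch_rucio_blocks(site)
    return phedex_blocks, rucio_blocks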
Example #29
0
def _run_once(rses_to_process, chunk_size, greedy, scheme, delay_seconds,
              auto_exclude_threshold, auto_exclude_timeout, heartbeat_handler,
              oidc_account, oidc_scope, oidc_audience, **_kwargs):

    dict_rses = {}
    _, total_workers, logger = heartbeat_handler.live()
    tot_needed_free_space = 0
    for rse in rses_to_process:
        # Check if RSE is blocklisted
        if rse.columns['availability'] % 2 == 0:
            logger(logging.DEBUG, 'RSE %s is blocklisted for delete', rse.name)
            continue
        rse.ensure_loaded(load_attributes=True)
        enable_greedy = rse.attributes.get('greedyDeletion', False) or greedy
        needed_free_space, only_delete_obsolete = __check_rse_usage_cached(
            rse, greedy=enable_greedy, logger=logger)
        if needed_free_space:
            dict_rses[rse] = [
                needed_free_space, only_delete_obsolete, enable_greedy
            ]
            tot_needed_free_space += needed_free_space
        elif only_delete_obsolete:
            dict_rses[rse] = [
                needed_free_space, only_delete_obsolete, enable_greedy
            ]
        else:
            logger(logging.DEBUG, 'Nothing to delete on %s', rse.name)

    rses_with_params = [
        (rse, needed_free_space, only_delete_obsolete, enable_greedy)
        for rse, (needed_free_space, only_delete_obsolete,
                  enable_greedy) in dict_rses.items()
    ]

    # Ordering the RSEs based on the needed free space
    sorted_rses = sorted(rses_with_params, key=lambda x: x[1], reverse=True)
    log_msg_str = ', '.join(
        f'{rse}:{needed_free_space}:{only_delete_obsolete}:{enable_greedy}'
        for rse, needed_free_space, only_delete_obsolete, enable_greedy in
        sorted_rses)
    logger(logging.DEBUG,
           'List of RSEs to process ordered by needed space desc: %s',
           log_msg_str)

    random.shuffle(rses_with_params)

    work_remaining_by_rse = {}
    paused_rses = []
    for rse, needed_free_space, only_delete_obsolete, enable_greedy in rses_with_params:
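        # REGION appears to act as a short-lived cache of per-RSE flags:
        # 'pause_deletion_<id>' skips RSEs that recently had nothing to delete,
        # and 'temporary_exclude_<id>' skips RSEs that accumulated too many
        # failed deletion attempts (presumably set elsewhere during deletion).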
        result = REGION.get('pause_deletion_%s' % rse.id, expiration_time=120)
        if result is not NO_VALUE:
            paused_rses.append(rse.name)
            logger(
                logging.DEBUG,
                'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                rse.name)
            continue

        result = REGION.get('temporary_exclude_%s' % rse.id,
                            expiration_time=auto_exclude_timeout)
        if result is not NO_VALUE:
            logger(
                logging.WARNING,
                'Too many failed attempts for %s in the last cycle. RSE is temporarily excluded.',
                rse.name)
            EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(1)
            continue
        EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(0)

        percent = 0
        if tot_needed_free_space:
            percent = needed_free_space / tot_needed_free_space * 100
        logger(logging.DEBUG,
               'Working on %s. Percentage of the total space needed %.2f',
               rse.name, percent)

        rse_hostname = _rse_deletion_hostname(rse)
        if not rse_hostname:
            logger(logging.WARNING, 'No default delete protocol for %s',
                   rse.name)
            REGION.set('pause_deletion_%s' % rse.id, True)
            continue

        hb_payload = __try_reserve_worker_slot(
            heartbeat_handler=heartbeat_handler,
            rse=rse,
            hostname=rse_hostname,
            logger=logger)
        if not hb_payload:
            # Might need to reschedule a try on this RSE later in the same cycle
            continue

        # List and mark BEING_DELETED the files to delete
        del_start_time = time.time()
        try:
            use_temp_tables = config_get_bool('core',
                                              'use_temp_tables',
                                              default=False)
            with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                if only_delete_obsolete:
                    logger(
                        logging.DEBUG,
                        'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                        rse.name)
                if use_temp_tables:
                    replicas = list_and_mark_unlocked_replicas(
                        limit=chunk_size,
                        bytes_=needed_free_space,
                        rse_id=rse.id,
                        delay_seconds=delay_seconds,
                        only_delete_obsolete=only_delete_obsolete,
                        session=None)
                else:
                    replicas = list_and_mark_unlocked_replicas_no_temp_table(
                        limit=chunk_size,
                        bytes_=needed_free_space,
                        rse_id=rse.id,
                        delay_seconds=delay_seconds,
                        only_delete_obsolete=only_delete_obsolete,
                        session=None)
            logger(
                logging.DEBUG,
                'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                rse.name, needed_free_space,
                time.time() - del_start_time, len(replicas))
            if ((len(replicas) == 0 and enable_greedy)
                    or (len(replicas) < chunk_size and not enable_greedy)):
                logger(
                    logging.DEBUG,
                    'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                    rse.name, chunk_size, len(replicas))
                REGION.set('pause_deletion_%s' % rse.id, True)
                work_remaining_by_rse[rse] = False
            else:
                work_remaining_by_rse[rse] = True

        except (DatabaseException, IntegrityError, DatabaseError) as error:
            logger(logging.ERROR, '%s', str(error))
            continue
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
            continue
        # Physical deletion takes place here
        try:
            rse.ensure_loaded(load_info=True, load_attributes=True)
            auth_token = None
            if oidc_account and rse.attributes.get('oidc_support', False):
                account = InternalAccount(oidc_account, vo=rse.columns['vo'])
                token_dict = get_token_for_account_operation(
                    account,
                    req_audience=oidc_audience,
                    req_scope=oidc_scope,
                    admin=True)
                if token_dict is not None and 'token' in token_dict:
                    auth_token = token_dict['token']
                    logger(logging.DEBUG,
                           'OIDC authentication used for deletion.')
            prot = rsemgr.create_protocol(rse.info,
                                          'delete',
                                          scheme=scheme,
                                          auth_token=auth_token,
                                          logger=logger)
            for file_replicas in chunks(replicas, chunk_size):
                # Refresh heartbeat
                _, total_workers, logger = heartbeat_handler.live(
                    payload=hb_payload)
                del_start_time = time.time()
                for replica in file_replicas:
                    try:
                        lfn = {'scope': replica['scope'].external,
                               'name': replica['name'],
                               'path': replica['path']}
                        replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse.info,
                                                                   lfns=[lfn],
                                                                   operation='delete',
                                                                   scheme=scheme).values())[0])
                    except (ReplicaUnAvailable, ReplicaNotFound) as error:
                        logger(
                            logging.WARNING,
                            'Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s',
                            replica['scope'], replica['name'], rse.name,
                            str(error))
                        replica['pfn'] = None

                    except Exception:
                        logger(logging.CRITICAL, 'Exception', exc_info=True)

                is_staging = rse.columns['staging_area']
                deleted_files = delete_from_storage(heartbeat_handler,
                                                    hb_payload,
                                                    file_replicas,
                                                    prot,
                                                    rse.info,
                                                    is_staging,
                                                    auto_exclude_threshold,
                                                    logger=logger)
                logger(logging.INFO, '%i files processed in %s seconds',
                       len(file_replicas),
                       time.time() - del_start_time)

                # Then finally delete the replicas
                del_start = time.time()
                with monitor.record_timer_block('reaper.delete_replicas'):
                    delete_replicas(rse_id=rse.id, files=deleted_files)
                logger(
                    logging.DEBUG,
                    'delete_replicas succeeded on %s: %s replicas in %s seconds',
                    rse.name, len(deleted_files),
                    time.time() - del_start)
                DELETION_COUNTER.inc(len(deleted_files))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)

    if paused_rses:
        logger(logging.INFO,
               'Deletion paused for a while for following RSEs: %s',
               ', '.join(paused_rses))

    rses_with_more_work = [
        rse for rse, has_more_work in work_remaining_by_rse.items()
        if has_more_work
    ]
    return rses_with_more_work
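
As in _run_once above, each expensive step gets its own timer block ('reaper.list_unlocked_replicas' for the selection, 'reaper.delete_replicas' for the database deletion). A minimal sketch of timing the deletion loop in chunks, assuming rucio.core.monitor is importable and delete_batch is a placeholder standing in for delete_replicas:

import logging
import time

from rucio.core import monitor

def timed_chunked_delete(replicas, chunk_size, delete_batch):
    # Time every deletion batch under the same stat name the reaper uses,
    # so slow batches are visible in the metrics.
    for i in range(0, len(replicas), chunk_size):
        batch = replicas[i:i + chunk_size]
        start = time.time()
        with monitor.record_timer_block('reaper.delete_replicas'):
            delete_batch(batch)
        logging.debug('Deleted %d replicas in %.2f seconds', len(batch), time.time() - start)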