Example #1
class BlockSyncer(object):
    """
    Class representing the replica at a site of a CMS Dataset (PhEDEx FileBlock)
    """
    def __init__(self,
                 block_name,
                 pnn,
                 rse=None,
                 lifetime=None,
                 dry_run=False):
        """
        Get the status of the replica of block_name at pnn,
        considering only closed blocks completely replicated at the site.

        :block_name: PhEDEx block name.
        :pnn:        PhEDEx node name.
        :rse:        Rucio RSE. If None (default), inferred from the pnn via list_rses.
        :lifetime:   Lifetime applied to the DIDs created. Default: None.
        :dry_run:    If True, only log the actions that would be taken. Default: False.
        """

        self.phedex_svc = PhEDEx()
        self.dry_run = dry_run

        self.pnn = pnn
        if rse is None:
            self.rse = list_rses('cms_type=real&pnn=%s' % self.pnn)[0]['rse']
        else:
            self.rse = rse
        rse_details = get_rse(self.rse)
        self.rse_id = rse_details['id']

        self.account = SYNC_ACCOUNT_FMT % self.rse.lower()
        self.container = self.phedex_svc.check_data_item(
            pditem=block_name)['pds']
        self.scope = DEFAULT_SCOPE
        self.block_name = block_name
        self.lifetime = lifetime

        self.group, self.custodial = self.phedex_svc.block_at_pnn_phedex(
            block=self.block_name, pnn=self.pnn)
        self.is_at_pnn = bool(self.group)

        if self.is_at_pnn:
            self.replicas = self.phedex_svc.fileblock_files_phedex(
                pnn=pnn, pfb=block_name)
        else:
            self.replicas = {}

        self.container_exists = None
        self.block_exists = None
        self.rule_exists = None

        touch(text=self.rse)

    def add_to_rucio(self, recover=False):
        """"""
        with monitor.record_timer_block('cms_sync.time_add_block'):
            self.register_container()
            block_exists = self.register_block()
            if block_exists:
                self.update_replicas()
                if recover:
                    self.make_replicas_available()
                self.update_rule()
            else:
                logging.critical('Unable to make the block %s',
                                 self.block_name)

    def remove_from_rucio(self):
        """"""
        with monitor.record_timer_block('cms_sync.time_remove_block'):
            self.update_replicas()
            self.update_rule()

    def register_container(self):
        """
        Register the container in Rucio (only if the block has a replica at the pnn).
        Returns whether the container exists afterwards.
        """
        self.container_exists = False
        if self.is_at_pnn and self.dry_run:
            logging.info('Dry Run: Create container %s in scope %s.',
                         self.container, self.scope)
            self.container_exists = True
            return self.container_exists

        try:
            get_did(scope=self.scope, name=self.container)
            monitor.record_counter('cms_sync.container_exists')
            self.container_exists = True
            logging.info('Found container %s', self.container)
        except DataIdentifierNotFound:
            if self.is_at_pnn:
                try:
                    logging.info('Create container %s in scope %s.',
                                 self.container, self.scope)
                    add_did(scope=self.scope,
                            name=self.container,
                            type='CONTAINER',
                            issuer=self.account,
                            lifetime=self.lifetime)
                    monitor.record_counter('cms_sync.container_created')
                    self.container_exists = True
                    logging.info('Created container %s in scope %s.',
                                 self.container, self.scope)
                except DataIdentifierAlreadyExists:
                    logging.warning('Container was created in the meanwhile')
                    monitor.record_counter('cms_sync.container_collision')
                    self.container_exists = True
            else:
                logging.warning('Container was not at PNN')

        return self.container_exists

    def register_block(self):
        """
        Register the dataset (if there is a replica at the pnn) and attach it to the container.
        Returns whether the dataset exists afterwards.
        """

        # FIXME: The logic here could use some improvement as we try to create a block even if it exists already

        try:
            get_did(scope=self.scope, name=self.block_name)
            self.block_exists = True
            monitor.record_counter('cms_sync.dataset_exists')
        except DataIdentifierNotFound:
            self.block_exists = False

        if self.is_at_pnn and self.dry_run:
            logging.info('Dry Run: Create dataset %s in scope %s.',
                         self.block_name, self.scope)
            self.block_exists = True
        elif self.is_at_pnn:
            logging.info('Create block %s in scope %s.', self.block_name,
                         self.scope)
            try:
                if not self.block_exists:
                    add_did(scope=self.scope,
                            name=self.block_name,
                            type='DATASET',
                            issuer=self.account,
                            lifetime=self.lifetime)
                    monitor.record_counter('cms_sync.dataset_created')
            except DataIdentifierAlreadyExists:
                logging.warning('Attempt to add %s:%s failed, already exists.',
                                self.scope, self.block_name)
                monitor.record_counter('cms_sync.dataset_collision')

            try:
                attach_dids(scope=self.scope,
                            name=self.container,
                            attachment={
                                'dids': [{
                                    'scope': self.scope,
                                    'name': self.block_name
                                }]
                            },
                            issuer=self.account)
            except DuplicateContent:
                logging.warning(
                    'Attempt to add %s:%s to %s failed, already exists.',
                    self.scope, self.block_name, self.container)
            except DataIdentifierNotFound:
                logging.error(
                    'Attempt to add %s:%s to %s failed. Container does not exist.',
                    self.scope, self.block_name, self.container)
                return False
            self.block_exists = True
        else:
            logging.warning('Block %s was not at PNN', self.block_name)

        return self.block_exists

    def update_rule(self):
        """
        Adds or removes the rule for the block.
        """

        rules = list_replication_rules(filters={
            'scope': self.scope,
            'name': self.block_name
        })
        # rules = self.rcli.list_did_rules(scope=self.scope, name=self.block_name)
        rse_expression = 'rse=' + self.rse

        remove_rules = [
            rule for rule in rules if rule['account'] == self.account
            and rule['rse_expression'] == rse_expression
        ]

        if not remove_rules and self.is_at_pnn:
            self.rule_exists = False
            if self.dry_run:
                logging.info("Dry run: Adding rule for dataset %s at rse %s.",
                             self.block_name, self.rse)
            else:
                self.add_replication_rule_with_defaults(
                    dids=[{
                        'scope': self.scope,
                        'name': self.block_name
                    }],
                    copies=1,
                    rse_expression=rse_expression,
                    account=self.account)
                monitor.record_counter('cms_sync.rules_added')
            self.rule_exists = True
        elif remove_rules and not self.is_at_pnn:
            self.rule_exists = True
            if self.dry_run:
                logging.info("Removing rules for dataset %s at rse %s.",
                             self.block_name, self.rse)
            else:
                for rule in remove_rules:
                    # delete_replication_rule(rule['id'], purge_replicas=False, issuer=self.account)
                    delete_rule(rule_id=rule['id'],
                                purge_replicas=True,
                                soft=False)
                    monitor.record_counter('cms_sync.rules_removed')
            self.rule_exists = False

    def update_replicas(self):
        """
        Adds or removes replicas for the dataset at the rse.
        """

        with monitor.record_timer_block('cms_sync.time_update_replica'):
            logging.info('Updating replicas for %s:%s at %s', self.scope,
                         self.block_name, self.rse)
            replicas = list_replicas(dids=[{
                'scope': self.scope,
                'name': self.block_name
            }],
                                     rse_expression='rse=%s' % self.rse)
            try:
                rucio_replicas = {repl['name'] for repl in replicas}
            except TypeError:
                rucio_replicas = set()

            phedex_replicas = set(self.replicas.keys())
            missing = list(phedex_replicas - rucio_replicas)
            to_remove = list(rucio_replicas - phedex_replicas)

            if missing and (len(phedex_replicas) != len(missing)):
                logging.warning(
                    'Recovery: Inconsistency found for %s at %s: %s in PhEDEx and %s missing',
                    self.block_name, self.rse, len(phedex_replicas),
                    len(missing))

            if missing:
                lfns_added = self.add_missing_replicas(missing)
                monitor.record_counter('cms_sync.files_added',
                                       delta=lfns_added)
            if to_remove:
                lfns_removed = self.remove_extra_replicas(to_remove)
                monitor.record_counter('cms_sync.files_removed',
                                       delta=lfns_removed)

        return

    def make_replicas_available(self):
        """
        Marks replicas of the dataset at the rse as available if they are present in PhEDEx
        """

        with monitor.record_timer_block('cms_sync.time_recover_replica'):
            logging.info('Recovering unavailable replicas for %s:%s at %s',
                         self.scope, self.block_name, self.rse)

            replicas = list_replicas(dids=[{
                'scope': self.scope,
                'name': self.block_name
            }],
                                     rse_expression='rse=%s' % self.rse,
                                     all_states=True)

            try:
                unavailable_replicas = {
                    repl['name']
                    for repl in replicas
                    if repl['states'][self.rse] != 'AVAILABLE'
                }
            except TypeError:
                unavailable_replicas = set()

            phedex_replicas = set(self.replicas.keys())
            missing = list(phedex_replicas & unavailable_replicas)

            logging.info(
                'Recovery for %s:%s at %s: PhEDEx has %s, Rucio unavailable %s. Missing: %s ',
                self.scope, self.block_name, self.rse, len(phedex_replicas),
                len(unavailable_replicas), len(missing))

            # Fix up things which are unavailable
            rse_details = get_rse(self.rse)
            rse_id = rse_details['id']
            scope = InternalScope(self.scope)
            state = 'A'

            for name in missing:
                logging.info('Setting available %s:%s at %s', self.scope, name,
                             self.rse)
                core_update_state(rse_id=rse_id,
                                  scope=scope,
                                  name=name,
                                  state=state)

            monitor.record_counter('cms_sync.files_made_available',
                                   delta=len(missing))

        return

    def remove_extra_replicas(self, to_remove):
        """
        :param to_remove: LFNs of the replicas to remove from Rucio
        :return: number of replicas removed (None if nothing was done)
        """
        scope = InternalScope(self.scope)
        with monitor.record_timer_block('cms_sync.time_remove_replica'):
            if to_remove and self.dry_run:
                logging.info('Dry run: Removing replicas %s from rse %s.',
                             str(to_remove), self.rse)
            elif to_remove:
                logging.debug('Removing %s replicas from rse %s.',
                              len(to_remove), self.rse)
                for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                    replicas = [{
                        'scope': scope,
                        'name': lfn,
                        "rse_id": self.rse_id,
                        "state": "U"
                    } for lfn in to_remove_chunk]
                    # transactional_session here?
                    # while lock is set stuck, judge-repairer might make transfer requests before rule is gone but does it matter?
                    update_replicas_states(
                        replicas=replicas,
                        add_tombstone=False,
                    )

                # delete_replicas(rse=self.rse, issuer=self.account,
                #                     files=[{'scope': self.scope, 'name': lfn} for lfn in to_remove_chunk])
                return len(to_remove)

    def add_missing_replicas(self, missing):
        """
        :param missing: LFNs possibly missing from Rucio
        :return: number of LFNs attached to the block (None if nothing was done)
        """

        with monitor.record_timer_block('cms_sync.time_add_replica'):
            if missing and self.dry_run:
                logging.info('Dry run: Adding replicas %s to rse %s.',
                             str(missing), self.rse)
            elif missing:
                logging.info('Adding %s replicas to rse %s.', len(missing),
                             self.rse)
                replicas_to_add = [self.replicas[lfn] for lfn in missing]
                files = replica_file_list(replicas=replicas_to_add,
                                          scope=self.scope)
                for rucio_file in files:
                    try:
                        update_file = copy.deepcopy(rucio_file)
                        update_file.update({
                            'scope': InternalScope(self.scope),
                            "rse_id": self.rse_id,
                            "state": "A"
                        })
                        update_replicas_states(replicas=[update_file],
                                               add_tombstone=False)
                    except ReplicaNotFound:
                        try:
                            add_replicas(rse=self.rse,
                                         files=[rucio_file],
                                         issuer=self.account,
                                         ignore_availability=True)
                        except RucioException:
                            logging.critical(
                                'Could not add %s to %s. Constraint violated?',
                                rucio_file, self.rse)
                            resurrect([{
                                'scope': rucio_file['scope'],
                                'name': rucio_file['name']
                            }],
                                      issuer=self.account)
                            add_replicas(rse=self.rse,
                                         files=[rucio_file],
                                         issuer=self.account,
                                         ignore_availability=True)
                            logging.critical('Resurrected %s at %s',
                                             rucio_file, self.rse)

                # add_replicas(rse=self.rse, files=files, issuer=self.account)
                lfns = [
                    item['name'] for item in list_files(
                        scope=self.scope, name=self.block_name, long=False)
                ]

                missing_lfns = list(set(missing) - set(lfns))

                if missing_lfns:
                    logging.debug('Attaching %s lfns to %s at %s',
                                  len(missing_lfns), self.block_name, self.rse)
                    dids = [{
                        'scope': self.scope,
                        'name': lfn
                    } for lfn in missing_lfns]
                    try:
                        attach_dids(scope=self.scope,
                                    name=self.block_name,
                                    attachment={'dids': dids},
                                    issuer=self.account)
                    except FileAlreadyExists:
                        logging.warning(
                            'Trying to attach already existing files to %s',
                            self.block_name)
                    except DataIdentifierNotFound:
                        logging.critical(
                            'Could not attach to %s at %s. Constraint violated?',
                            self.block_name, self.rse)
                return len(missing_lfns)

    def add_replication_rule_with_defaults(self, dids, copies, rse_expression,
                                           account):
        """
        add_replication_rule requires all of its arguments to be supplied, so provide a set of defaults here.
        If any of these options need to be configurable, promote them to this method's parameter list.

        :param dids: List of dids (scope/name dictionary)
        :param copies: Number of copies
        :param rse_expression: RSE expression
        :param account: Account for the rule
        :return: None
        """

        (grouping, weight, lifetime, locked, subscription_id,
         source_replica_expression, notify, purge_replicas,
         ignore_availability, comment, ask_approval, asynchronous, priority,
         split_container) = ('DATASET', None, None, False, None, None, None,
                             False, False, None, False, False, 3, False)

        activity = 'Data Consolidation'
        meta = json.dumps({
            "phedex_group": self.group,
            "phedex_custodial": self.custodial
        })

        add_replication_rule(
            dids=dids,
            copies=copies,
            rse_expression=rse_expression,
            account=account,
            grouping=grouping,
            weight=weight,
            lifetime=lifetime,
            locked=locked,
            subscription_id=subscription_id,
            source_replica_expression=source_replica_expression,
            activity=activity,
            notify=notify,
            purge_replicas=purge_replicas,
            ignore_availability=ignore_availability,
            comment=comment,
            ask_approval=ask_approval,
            asynchronous=asynchronous,
            priority=priority,
            split_container=split_container,
            meta=meta,
            issuer=account)
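Below is a minimal usage sketch of how a caller might drive BlockSyncer; the import path, block name, and node name are placeholders assumed for illustration, not taken from the source.

from cms_sync.block_syncer import BlockSyncer  # hypothetical module path

def sync_one_block(block_name, pnn, dry_run=True):
    """Synchronize a single PhEDEx block to Rucio, optionally as a dry run."""
    syncer = BlockSyncer(block_name=block_name, pnn=pnn, dry_run=dry_run)
    if syncer.is_at_pnn:
        # The block has a complete replica at the node: register DIDs, replicas and the rule.
        syncer.add_to_rucio()
    else:
        # The block is no longer at the node: update_replicas/update_rule clean up the Rucio side.
        syncer.remove_from_rucio()

if __name__ == '__main__':
    # Placeholder block and node names for illustration only.
    sync_one_block('/Primary/Era-v1/AOD#00000000-0000-0000-0000-000000000000', 'T2_XX_Example')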
Example #2
class CMSRucioDatasetReplica(object):
    """
    Class representing the replica at a site of a CMS Dataset (PhEDEx FileBlock)
    """
    #pylint: disable=too-many-arguments
    def __init__(self, rds, pnn, rse=None, scope=DEFAULT_SCOPE,
                 lifetime=None, pcli=None, rcli=None):
        """
        Get the status of replica of pditem at pnn
        considering only closed blocks completely replicated at site.

        :pnn:    PhEDEx node name.
        :rds:    Rucio Dataset (PhEDEx FileBlock) name.
        :rse:    Rucio RSE. If None (default) inferred by the pnn using DEFAULT_RSE_FMT.
        :scope:  Scope. Default: DEFAULT_SCOPE.
        :pcli:   Reference to a phedex.PhEDEx object or a dict
                 {'instance': <instance>, 'dasgoclient': <path>, 'datasvc': <url>}
                 none of the keys is mandatory. Default is {}.
        :rcli:   Reference to a rucio Client() instance or a dict
                 {'account': ..., ...}; none of the keys is mandatory.
                 Default is {'account': <sync account>}
        """

        self.pnn = pnn

        self._get_pcli(pcli)

        self._get_rcli(rcli)

        if rse is None:
            self.rse = self.rcli.list_rses('cms_type=real&pnn=%s' %
                                           self.pnn)[0]['rse']
        else:
            self.rse = rse

        self.container = self.pcli.check_data_item(pditem=rds)['pds']

        self.dataset = rds

        self.scope = scope

        self.lifetime = lifetime

        self.block_at_pnn()

        if self.is_at_pnn:
            self.replicas = self.pcli.fileblock_files_phedex(pnn=pnn, pfb=rds)
        else:
            self.replicas = {}

    def _get_pcli(self, pcli):
        if pcli is None:
            pcli = {}

        if isinstance(pcli, dict):
            self.pcli = PhEDEx(**pcli)
        elif isinstance(pcli, PhEDEx):
            #pylint: disable=redefined-variable-type
            self.pcli = pcli
        else:
            raise Exception("wrong type for pcli parameter %s" %\
                            type(pcli))


    def _get_rcli(self, rcli):
        if rcli is None:
            rcli = {}

        if isinstance(rcli, dict):
            if 'account' not in rcli:
                rcli['account'] = SYNC_ACCOUNT_FMT % self.pnn.lower()
            self.rcli = Client(**rcli)
        elif isinstance(rcli, Client):
            #pylint: disable=redefined-variable-type
            self.rcli = rcli
        else:
            raise Exception("wrong type for rcli parameter %s" %\
                            type(rcli))

    def block_at_pnn(self):
        """
        Verify if the block is at pnn (using phedex datasvc)
        """

        self.is_at_pnn = self.pcli.block_at_pnn_phedex(block=self.dataset, pnn=self.pnn)

        return

    def register_container(self, dry=False):
        """
        Register container of the dataset
        (only if there is a dataset replica on the pnn)
        :dry: Dry run. Default false.
        """

        try:
            self.rcli.get_did(scope=self.scope, name=self.container)
            return 'exists'
        except DataIdentifierNotFound:
            pass

        if self.is_at_pnn and dry:
            logging.dry('Create container %s in scope %s.',
                        self.container, self.scope)
            return 'created'
        elif self.is_at_pnn:
            logging.verbose('Create container %s in scope %s.',
                            self.container, self.scope)
            try:
                self.rcli.add_container(scope=self.scope, name=self.container,
                                        lifetime=self.lifetime)

            except DataIdentifierAlreadyExists:
                logging.warning('Container was created in the meanwhile')
                return 'exists'

            return 'created'

        return 'skipped'


    def register_dataset(self, dry=False):
        """
        Register the dataset (if there is a replica at the pnn)
        :dry: Dry run. Default false.
        """

        try:
            self.rcli.get_did(scope=self.scope, name=self.dataset)
            return 'exists'
        except DataIdentifierNotFound:
            pass

        if self.is_at_pnn and dry:
            logging.dry('Create dataset %s in scope %s.',
                        self.dataset, self.scope)
            return 'created'

        elif self.is_at_pnn:
            logging.verbose('Create dataset %s in scope %s.',
                            self.dataset, self.scope)
            self.rcli.add_dataset(scope=self.scope, name=self.dataset,
                                  lifetime=self.lifetime)
            self.rcli.attach_dids(scope=self.scope, name=self.container,
                                  dids=[{'scope': self.scope, 'name': self.dataset}])
            return 'created'

        return 'skipped'


    def update_replicas(self, dry=False):
        """
        Adds or removes replicas for the dataset at the rse.
        :dry:  Dry run. Default: False.
        """

        logging.notice('Updating replicas for %s:%s at %s' % (self.scope, self.dataset, self.rse))

        replicas = self.rcli.list_replicas([{'scope': self.scope, 'name': self.dataset}],
                                           rse_expression='rse=%s' % self.rse)

        try:
            rrepl = [repl['name'] for repl in replicas]
        except TypeError:
            rrepl = []

        prepl = list(self.replicas.keys())

        missing = list(set(prepl) - set(rrepl))

        to_remove = list(set(rrepl) - set(prepl))

        if missing and dry:
            logging.dry('Adding replicas %s to rse %s.',
                        str(missing), self.rse)

        elif missing:
            logging.verbose('Adding replicas %s to rse %s.',
                            str(missing), self.rse)

            add_replicas = [self.replicas[lfn] for lfn in missing]
            files = replica_file_list(replicas=add_replicas, scope=self.scope)
            self.rcli.add_replicas(rse=self.rse, files=files)

            # missing files that are not in the list of dataset files
            # are to be attached.
            lfns = [item['name'] for item in self.rcli.list_files(
                scope=self.scope,
                name=self.dataset
            )]

            missing_lfns = list(set(missing) - set(lfns))
            if missing_lfns:
                logging.verbose('Attaching lfns %s to dataset %s.',
                                str(missing_lfns), self.dataset)


                try:
                    self.rcli.attach_dids(
                        scope=self.scope,
                        name=self.dataset,
                        dids=[{
                            'scope': self.scope,
                            'name': lfn
                        } for lfn in missing_lfns]
                    )

                except FileAlreadyExists:
                    logging.warning('Trying to attach already existing files.')

        if to_remove and dry:
            logging.dry('Removing replicas %s from rse %s.',
                        str(to_remove), self.rse)

        elif to_remove:
            logging.verbose('Removing replicas %s from rse %s.',
                            str(to_remove), self.rse)
            for to_remove_chunk in chunks(to_remove, REMOVE_CHUNK_SIZE):
                attempt = 0
                while True:
                    attempt += 1
                    try:
                        self.rcli.delete_replicas(rse=self.rse, files=[{
                            'scope': self.scope,
                            'name': lfn,
                        } for lfn in to_remove_chunk])
                        break
                    except DatabaseException:
                        logging.warning('DatabaseException raised, retrying...')
                        if attempt > 3:
                            raise
                        time.sleep(randint(1, 5))

        return {'added': missing, 'removed': to_remove}


    def update_rule(self, dry=False):
        """
        Adds or removes the rule for the dataset.
        :dry:  Dry run. Default: False.

        Returns the action performed: None, 'added', or 'removed'.
        """
        rules = self.rcli.list_did_rules(scope=self.scope, name=self.dataset)
        rrule = None
        account = self.rcli.account
        action = None
        rse_exp = 'rse=' + self.rse

        rrule = next((
            rule for rule in rules
            if rule['account'] == account and\
                rule['rse_expression'] == rse_exp
        ), None)

        if rrule is None and self.is_at_pnn:

            if dry:
                logging.dry("Adding rule for dataset %s at rse %s.",
                            self.dataset, self.rse)
            else:
                self.rcli.add_replication_rule(
                    dids=[{'scope': self.scope, 'name': self.dataset}],
                    copies=1,
                    rse_expression=rse_exp,
                )
            action = 'added'

        elif rrule is not None and not self.is_at_pnn:
            # removing rule
            if dry:
                logging.dry("Removing rule for dataset %s at rse %s.",
                            self.dataset, self.rse)
            else:
                self.rcli.delete_replication_rule(rrule['id'], purge_replicas=False)
            action = 'removed'

        return action

    def update(self, dry=False):
        """
        Synchronize the dataset replica info.
        :dry:  Dry run. Default: False.
        """
        ret = {'at_node': self.is_at_pnn}

        #datasets and containers are only added
        ret['container'] = self.register_container(dry)
        ret['dataset'] = self.register_dataset(dry)

        ret['replicas'] = self.update_replicas(dry)
        ret['rule'] = self.update_rule(dry)

        return ret
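For comparison, a rough usage sketch for this client-based variant; the import path, block name, and node name are assumptions for illustration and not part of the source.

from cms_rucio_sync import CMSRucioDatasetReplica  # hypothetical module path

replica = CMSRucioDatasetReplica(
    rds='/Primary/Era-v1/AOD#00000000-0000-0000-0000-000000000000',  # placeholder block name
    pnn='T2_XX_Example',                                             # placeholder PhEDEx node
    pcli={},   # options forwarded to phedex.PhEDEx(); empty dict uses its defaults
    rcli={},   # account defaults to SYNC_ACCOUNT_FMT % pnn.lower()
)

# Preview the actions with a dry run, then apply them for real.
print(replica.update(dry=True))
result = replica.update()
print(result['replicas'], result['rule'])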