def __init__(self, config=None):
        config = Configuration(config)

        DeletionInterface.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))

        self._history = HistoryDatabase(config.get('history', None))

        self.auto_approval = config.get('auto_approval', True)
        self.allow_tape_deletion = config.get('allow_tape_deletion', True)
        self.tape_auto_approval = config.get('tape_auto_approval', False)

        self.deletion_chunk_size = config.get('chunk_size', 50.) * 1.e+12
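
A note on the unit conversion above: chunk_size is configured in terabytes and multiplied out to bytes. A minimal stand-alone check (the names here are illustrative, not from the source):

chunk_tb = 50.                   # the default passed to config.get above
chunk_bytes = chunk_tb * 1.e+12  # 50 TB expressed in bytes
assert chunk_bytes == 5.e+13
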
class TapeCopyRequested(object):
    """
    Check for pending tape transfer requests.
    Sets one attr:
      tape_copy_requested
    """

    produces = ['tape_copy_requested']

    def __init__(self, config):
        self._phedex = PhEDEx(config.get('phedex', None))

    def load(self, inventory):
        for site in inventory.sites.itervalues():
            if site.storage_type != Site.TYPE_MSS:
                continue

            requests = self._phedex.make_request(
                'transferrequests', ['node=' + site.name, 'approval=pending'])
            for request in requests:
                for dest in request['destinations']['node']:
                    if dest['name'] != site.name:
                        continue

                    if 'decided_by' in dest:
                        break

                    for dataset_entry in request['data']['dbs']['dataset']:
                        try:
                            dataset = inventory.datasets[dataset_entry['name']]
                        except KeyError:
                            continue

                        dataset.attr['tape_copy_requested'] = True

                    for block_entry in request['data']['dbs']['block']:
                        dataset_name, block_name = Block.from_full_name(
                            block_entry['name'])
                        try:
                            dataset = inventory.datasets[dataset_name]
                        except KeyError:
                            continue

                        # just label the entire dataset
                        dataset.attr['tape_copy_requested'] = True
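
A hedged usage sketch for the plugin above. The driver below is hypothetical (the inventory object and config layout are assumptions inferred from the calls the class makes), so it is left as comments:

# checker = TapeCopyRequested(Configuration({'phedex': None}))
# checker.load(inventory)
# flagged = [d.name for d in inventory.datasets.itervalues()
#            if d.attr.get('tape_copy_requested')]
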
class PhEDExReplicaInfoSource(ReplicaInfoSource):
    """ReplicaInfoSource using PhEDEx."""
    def __init__(self, config=None):
        if config is None:
            config = Configuration()

        ReplicaInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))
        self._parallelizer_config = config

    def replica_exists_at_site(self, site, item):  #override
        options = ['node=' + site.name]
        if type(item) == Dataset:
            options += ['dataset=' + item.name, 'show_dataset=y']
        elif type(item) == DatasetReplica:
            options += ['dataset=' + item.dataset.name, 'show_dataset=y']
        elif type(item) == Block:
            options += ['block=' + item.full_name()]
        elif type(item) == BlockReplica:
            options += ['block=' + item.block.full_name()]
        else:
            raise RuntimeError('Invalid input passed: ' + repr(item))

        source = self._phedex.make_request('blockreplicas',
                                           options,
                                           timeout=600)

        if len(source) != 0:
            return True

        options = ['node=' + site.name]
        if type(item) == Dataset:
            # check both dataset-level and block-level subscriptions
            options += ['dataset=' + item.name, 'block=%s#*' % item.name]
        elif type(item) == DatasetReplica:
            options += [
                'dataset=' + item.dataset.name,
                'block=%s#*' % item.dataset.name
            ]
        elif type(item) == Block:
            options += ['block=' + item.full_name()]
        elif type(item) == BlockReplica:
            options += ['block=' + item.block.full_name()]

        # blockreplicas can lag by up to ~20 minutes, so a fresh subscription may be visible only here
        source = self._phedex.make_request('subscriptions',
                                           options,
                                           timeout=600)

        return len(source) != 0

    def get_replicas(self, site=None, dataset=None, block=None):  #override
        if site is None:
            site_check = self.check_allowed_site
        else:
            site_check = None
            if not self.check_allowed_site(site):
                return []

        if dataset is None and block is None:
            dataset_check = self.check_allowed_dataset
        else:
            dataset_check = None
            if dataset is not None:
                if not self.check_allowed_dataset(dataset):
                    return []
            if block is not None:
                if not self.check_allowed_dataset(block[:block.find('#')]):
                    return []

        options = []
        if site is not None:
            options.append('node=' + site)
        if dataset is not None:
            options.append('dataset=' + dataset)
        if block is not None:
            options.append('block=' + block)

        LOG.info('get_replicas(' + ','.join(options) +
                 ')  Fetching the list of replicas from PhEDEx')

        if len(options) == 0:
            return []

        block_entries = self._phedex.make_request('blockreplicas',
                                                  options,
                                                  timeout=7200)

        parallelizer = Map()
        parallelizer.timeout = 7200

        # Worker threads start automatically as block entries are added as input
        combine_file = parallelizer.get_starter(self._combine_file_info)

        for block_entry in block_entries:
            for replica_entry in block_entry['replica']:
                if replica_entry['complete'] == 'n':
                    break
            else:
                continue

            # there is at least one incomplete replica
            try:
                dataset_name, block_name = Block.from_full_name(
                    block_entry['name'])
            except ObjectError:  # invalid name
                continue

            if dataset_check and not dataset_check(dataset_name):
                continue

            combine_file.add_input(block_entry)

        combine_file.close()

        # _combine_file_info alters block_entries directly - no need to deal with output
        combine_file.get_outputs()

        block_replicas = PhEDExReplicaInfoSource.make_block_replicas(
            block_entries,
            PhEDExReplicaInfoSource.maker_blockreplicas,
            site_check=site_check,
            dataset_check=dataset_check)

        # Also use subscriptions call which has a lower latency than blockreplicas
        # For example, group change on a block replica at time T may not show up in blockreplicas until up to T + 15 minutes
        # while in subscriptions it is visible within a few seconds
        # But subscriptions call without a dataset or block takes too long
        if dataset is None and block is None:
            return block_replicas

        indexed = collections.defaultdict(dict)
        for replica in block_replicas:
            indexed[(replica.site.name,
                     replica.block.dataset.name)][replica.block.name] = replica

        dataset_entries = self._phedex.make_request('subscriptions',
                                                    options,
                                                    timeout=3600)

        for dataset_entry in dataset_entries:
            dataset_name = dataset_entry['name']

            if not self.check_allowed_dataset(dataset_name):
                continue

            try:
                subscriptions = dataset_entry['subscription']
            except KeyError:
                pass
            else:
                for sub_entry in subscriptions:
                    site_name = sub_entry['node']

                    if not self.check_allowed_site(site_name):
                        continue

                    replicas = indexed[(site_name, dataset_name)]

                    for replica in replicas.itervalues():
                        replica.group = Group(sub_entry['group'])
                        replica.is_custodial = (sub_entry['custodial'] == 'y')

            try:
                block_entries = dataset_entry['block']
            except KeyError:
                pass
            else:
                for block_entry in block_entries:
                    try:
                        _, block_name = Block.from_full_name(
                            block_entry['name'])
                    except ObjectError:
                        continue

                    try:
                        subscriptions = block_entry['subscription']
                    except KeyError:
                        continue

                    for sub_entry in subscriptions:
                        site_name = sub_entry['node']

                        if not self.check_allowed_site(site_name):
                            continue

                        try:
                            replica = indexed[(site_name,
                                               dataset_name)][block_name]
                        except KeyError:
                            continue

                        replica.group = Group(sub_entry['group'])

                        if sub_entry['node_bytes'] == block_entry['bytes']:
                            # complete
                            replica.size = sub_entry['node_bytes']
                            if replica.size is None:
                                replica.size = 0
                            replica.files = None
                        else:
                            # incomplete - since we cannot know what files are there, we'll just have to pretend there are none
                            replica.size = 0
                            replica.files = tuple()

                        replica.is_custodial = (sub_entry['custodial'] == 'y')

                        if sub_entry['time_update'] is None:
                            replica.last_update = 0
                        else:
                            replica.last_update = int(sub_entry['time_update'])

        return block_replicas
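
get_replicas above overlays the lower-latency subscriptions data onto the blockreplicas result through a (site name, dataset name) -> {block name: replica} index. The indexing idiom in isolation, with placeholder values:

import collections

indexed = collections.defaultdict(dict)
indexed[('T1_US_FNAL_Disk', '/Prim/Proc/TIER')]['blockA'] = 'replica object'
assert 'blockA' in indexed[('T1_US_FNAL_Disk', '/Prim/Proc/TIER')]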

    def get_updated_replicas(self, updated_since, inventory):  #override
        LOG.info(
            'get_updated_replicas(%d)  Fetching the list of replicas from PhEDEx',
            updated_since)

        nodes = []
        for entry in self._phedex.make_request('nodes', timeout=600):
            if not self.check_allowed_site(entry['name']):
                continue

            if entry['name'] not in inventory.sites:
                continue

            nodes.append(entry['name'])

        try:
            tmpconfig = Configuration(
                self._parallelizer_config.get('parallel', None))
        except Exception as e:
            LOG.error(str(e))
            tmpconfig = Configuration()

        parallelizer = Map(tmpconfig)
        parallelizer.timeout = 5400

        def get_node_replicas(node):
            options = ['update_since=%d' % updated_since, 'node=%s' % node]
            results = self._phedex.make_request('blockreplicas', options)

            return node, results

        # Use async to fire threads on demand
        node_results = parallelizer.execute(get_node_replicas,
                                            nodes,
                                            async=True)

        # Worker threads start automatically as block entries are added as input
        combine_file = parallelizer.get_starter(self._combine_file_info)

        all_block_entries = []

        for node, block_entries in node_results:
            site = inventory.sites[node]

            for block_entry in block_entries:
                all_block_entries.append(block_entry)

                replica_entry = block_entry['replica'][0]

                if replica_entry['complete'] == 'y':
                    continue

                # incomplete block replica - should we fetch file info?
                try:
                    dataset_name, block_name = Block.from_full_name(
                        block_entry['name'])
                except ObjectError:
                    pass
                else:
                    try:
                        dataset = inventory.datasets[dataset_name]
                        block = dataset.find_block(block_name)
                        replica = block.find_replica(site)
                        if replica.file_ids is None:
                            num_files = block.num_files
                        else:
                            num_files = len(replica.file_ids)

                        if (replica.size == replica_entry['bytes']
                                and num_files == replica_entry['files']):
                            # sizes and file counts agree - no need to fetch file info
                            continue
                    except:
                        # At any point of the above lookups we may hit a None object or KeyError or what not
                        pass

                LOG.debug(
                    'Replica %s:%s is incomplete. Fetching file information.',
                    replica_entry['node'], block_entry['name'])
                combine_file.add_input(block_entry)

        combine_file.close()

        # _combine_file_info alters block_entries directly - no need to deal with output
        combine_file.get_outputs()

        LOG.info('get_updated_replicas(%d) Got outputs', updated_since)

        return PhEDExReplicaInfoSource.make_block_replicas(
            all_block_entries,
            PhEDExReplicaInfoSource.maker_blockreplicas,
            dataset_check=self.check_allowed_dataset)

    def get_deleted_replicas(self, deleted_since):  #override
        LOG.info(
            'get_deleted_replicas(%d)  Fetching the list of replicas from PhEDEx',
            deleted_since)

        result = self._phedex.make_request(
            'deletions', ['complete_since=%d' % deleted_since], timeout=7200)
        # result is by dataset
        block_entries = []
        for dataset_entry in result:
            block_entries.extend(dataset_entry['block'])

        return PhEDExReplicaInfoSource.make_block_replicas(
            block_entries, PhEDExReplicaInfoSource.maker_deletions)

    def _combine_file_info(self, block_entry):
        try:
            LOG.debug(
                '_combine_file_info(%s) Fetching file replicas from PhEDEx',
                block_entry['name'])
            file_info = self._phedex.make_request(
                'filereplicas', ['block=%s' % block_entry['name']])[0]['file']
        except (IndexError, KeyError):
            # Somehow PhEDEx didn't have a filereplicas entry for this block at this node
            block_entry['file'] = []
        else:
            block_entry['file'] = file_info
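
_combine_file_info mutates the block entry in place, which is why its callers discard get_outputs(). A self-contained illustration of the same in-place pattern, with a stand-in for the PhEDEx call:

def annotate(entry, fetch):
    # fetch stands in for the 'filereplicas' request
    try:
        entry['file'] = fetch(entry['name'])[0]['file']
    except (IndexError, KeyError):
        entry['file'] = []  # no file-level data available

entry = {'name': 'blockA'}
annotate(entry, lambda name: [])  # empty response exercises the except path
assert entry['file'] == []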

    @staticmethod
    def make_block_replicas(block_entries,
                            replica_maker,
                            site_check=None,
                            dataset_check=None):
        """Return a list of block replicas linked to Dataset, Block, Site, and Group"""

        dataset = None
        block_replicas = []

        for block_entry in block_entries:
            try:
                dataset_name, block_name = Block.from_full_name(
                    block_entry['name'])
            except ObjectError:  # invalid name
                continue

            if dataset is None or dataset.name != dataset_name:
                if dataset_check and not dataset_check(dataset_name):
                    continue

                try:
                    dataset = Dataset(dataset_name)
                except ObjectError:
                    # invalid name
                    dataset = None

            if dataset is None:
                continue

            block = Block(block_name, dataset, block_entry['bytes'])
            if block.size is None:
                block.size = 0

            block_replicas.extend(
                replica_maker(block, block_entry, site_check=site_check))

        return block_replicas

    @staticmethod
    def maker_blockreplicas(block, block_entry, site_check=None):
        """Return a list of block replicas using blockreplicas data or a combination of blockreplicas and filereplicas calls."""

        sites = {}
        invalid_sites = set()
        groups = {}

        block_replicas = {}

        for replica_entry in block_entry['replica']:
            site_name = replica_entry['node']
            try:
                site = sites[site_name]
            except KeyError:
                if site_check:
                    if site_name in invalid_sites:
                        continue
                    if not site_check(site_name):
                        invalid_sites.add(site_name)
                        continue

                site = sites[site_name] = Site(site_name)

            group_name = replica_entry['group']
            try:
                group = groups[group_name]
            except KeyError:
                group = groups[group_name] = Group(group_name)

            try:
                time_update = int(replica_entry['time_update'])
            except TypeError:
                # time_update was None
                time_update = 0

            block_replica = BlockReplica(
                block,
                site,
                group,
                is_custodial=(replica_entry['custodial'] == 'y'),
                last_update=time_update)

            block_replicas[site_name] = block_replica

            if replica_entry['complete'] == 'n':
                # temporarily make this a list
                block_replica.file_ids = []
                block_replica.size = 0
                LOG.info("Incomplete %s" % str(block_replica))

        if 'file' in block_entry:
            for file_entry in block_entry['file']:
                for replica_entry in file_entry['replica']:
                    site_name = replica_entry['node']
                    try:
                        block_replica = block_replicas[site_name]
                    except KeyError:
                        continue

                    if block_replica.file_ids is None:
                        continue

                    # add LFN instead of file id
                    block_replica.file_ids.append(file_entry['name'])
                    file_size = file_entry['bytes']
                    if file_size is not None:
                        block_replica.size += file_size

                    try:
                        time_create = int(replica_entry['time_create'])
                    except TypeError:
                        pass
                    else:
                        if time_create > block_replica.last_update:
                            block_replica.last_update = time_create

        for block_replica in block_replicas.itervalues():
            if block_replica.file_ids is not None:
                block_replica.file_ids = tuple(block_replica.file_ids)

        return block_replicas.values()

    @staticmethod
    def maker_deletions(block, block_entry, site_check=None):
        replicas = []

        for deletion_entry in block_entry['deletion']:
            if site_check and not site_check(deletion_entry['node']):
                continue

            block_replica = BlockReplica(block, Site(deletion_entry['node']),
                                         Group.null_group)

            replicas.append(block_replica)

        return replicas
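
Several snippets in this listing split PhEDEx full block names on '#' (Block.from_full_name and the find('#') slices presumably wrap logic like this). A stdlib-only sketch with an illustrative name:

full_name = '/Primary/Processed/TIER#abcd-1234'
dataset_name, _, block_name = full_name.partition('#')
assert dataset_name == '/Primary/Processed/TIER'
assert block_name == 'abcd-1234'
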
Example #4
    def __init__(self, config):
        CopyInterface.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)

        self.subscription_chunk_size = config.get('chunk_size', 50.) * 1.e+12
Example #5
class PhEDExCopyInterface(CopyInterface):
    """Copy using PhEDEx."""
    def __init__(self, config):
        CopyInterface.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)

        self.subscription_chunk_size = config.get('chunk_size', 50.) * 1.e+12

    def schedule_copy(self, replica, comments=''):  #override
        request_mapping = {}

        subscription_list = []

        if type(replica) is DatasetReplica:
            blocks_by_group = collections.defaultdict(set)
            for block_replica in replica.block_replicas:
                blocks_by_group[block_replica.group].add(block_replica.block)

            if len(blocks_by_group) > 1:
                # this was called as a dataset-level copy, but we actually have multiple
                # sets of blocks with different groups -> use block-level schedule_copies instead
                return self.schedule_copies(replica.block_replicas, comments)

            group, block_replicas = blocks_by_group.items()[0]

            if block_replicas == replica.dataset.blocks:
                subscription_list.append(replica.dataset)
                level = 'dataset'
            else:
                subscription_list.extend(block_replicas)
                level = 'block'

        else:  #BlockReplica
            group = replica.group
            subscription_list.append(replica.block)
            level = 'block'

        self._run_subscription_request(request_mapping, replica.site, group,
                                       level, subscription_list, comments)

        return request_mapping

    def schedule_copies(self, replicas, comments=''):  #override
        request_mapping = {}

        replicas_by_site = collections.defaultdict(list)
        for replica in replicas:
            replicas_by_site[replica.site].append(replica)

        for site, replica_list in replicas_by_site.iteritems():
            # sort the subscriptions into dataset level / block level and by groups
            subscription_lists = {}
            subscription_lists['dataset'] = collections.defaultdict(
                list)  # {group: [datasets]}
            subscription_lists['block'] = collections.defaultdict(
                list)  # {group: [blocks]}

            for replica in replica_list:
                if type(replica) is DatasetReplica:
                    blocks_by_group = collections.defaultdict(set)
                    for block_replica in replica.block_replicas:
                        blocks_by_group[block_replica.group].add(
                            block_replica.block)

                    for group, blocks in blocks_by_group.iteritems():
                        if blocks == replica.dataset.blocks:
                            subscription_lists['dataset'][group].append(
                                replica.dataset)
                        else:
                            subscription_lists['block'][group].extend(blocks)
                else:
                    subscription_lists['block'][replica.group].append(
                        replica.block)

            for level in ['dataset', 'block']:
                for group, items in subscription_lists[level].iteritems():
                    self._run_subscription_request(request_mapping, site,
                                                   group, level, items,
                                                   comments)

        return request_mapping

    def _run_subscription_request(self, request_mapping, site, group, level,
                                  subscription_list, comments):
        # Make a subscription request for potentially multiple datasets or blocks, but to a single site and group
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in subscription_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in subscription_list:
                full_catalog[block.dataset].append(block)

        LOG.info('Subscribing %d datasets for %s at %s', len(full_catalog),
                 group.name, site.name)

        # make requests in chunks
        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()
            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.subscription_chunk_size and len(
                    full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'priority': 'normal',
                'move': 'n',
                'static': 'n',
                'custodial': 'n',
                'group': group.name,
                'request_only': 'n',
                'no_mail': 'n',
                'comments': comments
            }

            # result = [{'id': <id>}] (item 'request_created' of PhEDEx response)
            if self.dry_run:
                result = [{'id': '0'}]
            else:
                try:
                    result = self._phedex.make_request('subscribe',
                                                       options,
                                                       method=POST)
                except:
                    result = []

            if len(result) != 0:
                request_id = int(result[0]['id'])  # return value is a string
                LOG.warning('PhEDEx subscription request id: %d', request_id)
                request_mapping[request_id] = (True, site, items)
            else:
                LOG.error('Copy %s failed.', str(options))
                # we should probably do something here

            request_catalog = {}
            chunk_size = 0
            items = []

    def copy_status(self, request_id):  #override
        request = self._phedex.make_request('transferrequests',
                                            'request=%d' % request_id)
        if len(request) == 0:
            return {}

        site_name = request[0]['destinations']['node'][0]['name']

        dataset_names = []
        for ds_entry in request[0]['data']['dbs']['dataset']:
            dataset_names.append(ds_entry['name'])

        block_names = []
        for ds_entry in request[0]['data']['dbs']['block']:
            block_names.append(ds_entry['name'])

        subscriptions = []

        if len(dataset_names) != 0:
            chunks = [
                dataset_names[i:i + 35]
                for i in xrange(0, len(dataset_names), 35)
            ]
            for chunk in chunks:
                subscriptions.extend(
                    self._phedex.make_request(
                        'subscriptions', ['node=%s' % site_name] +
                        ['dataset=%s' % n for n in chunk]))

        if len(block_names) != 0:
            chunks = [
                block_names[i:i + 35] for i in xrange(0, len(block_names), 35)
            ]
            for chunk in chunks:
                subscriptions.extend(
                    self._phedex.make_request('subscriptions',
                                              ['node=%s' % site_name] +
                                              ['block=%s' % n for n in chunk]))

        status = {}
        for dataset in subscriptions:
            try:
                cont = dataset['subscription'][0]
                bytes = dataset['bytes']
                node_bytes = cont['node_bytes']
                time_update = cont['time_update']
            except KeyError:
                # this was a block-level subscription (no 'subscription' field for the dataset)
                bytes = 0
                node_bytes = 0
                time_update = 0
                for block in dataset['block']:
                    cont = block['subscription'][0]
                    bytes += block['bytes']
                    node_bytes += cont['node_bytes']
                    time_update = max(time_update, cont['time_update'])

            status[(site_name, dataset['name'])] = (bytes, node_bytes,
                                                    time_update)

        return status
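
_run_subscription_request above accumulates items until a byte threshold before firing each subscribe call. The same batching pattern as a self-contained generator (names are illustrative):

def chunk_by_size(named_sizes, max_bytes):
    chunk, total = [], 0
    for name, size in named_sizes:
        chunk.append(name)
        total += size
        if total >= max_bytes:
            yield chunk
            chunk, total = [], 0
    if chunk:
        yield chunk  # flush the remainder, as the while loop above does

assert list(chunk_by_size([('a', 30), ('b', 30), ('c', 30)], 50)) == \
    [['a', 'b'], ['c']]
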
Example #6
    def __init__(self, config):
        DatasetInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)
        self._dbs = RESTService(config.dbs)
Example #7
class PhEDExDatasetInfoSource(DatasetInfoSource):
    """DatasetInfoSource using PhEDEx and DBS."""
    def __init__(self, config):
        DatasetInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)
        self._dbs = RESTService(config.dbs)

    def get_dataset_names(self, include=['*'], exclude=[]):
        dataset_names = []

        exclude_exps = []
        for pattern in exclude:
            exclude_exps.append(re.compile(fnmatch.translate(pattern)))

        def add_datasets(result):
            for entry in result:
                name = entry['dataset']
                for ex_exp in exclude_exps:
                    if ex_exp.match(name):
                        break
                else:
                    # not excluded
                    dataset_names.append(name)

        if len(include) == 1 and include[0] == '/*/*/*':
            # all datasets requested - will do this efficiently
            result = self._dbs.make_request('acquisitioneras')
            sds = [entry['acquisition_era_name'] for entry in result]

            # query DBS in parallel
            args = [('datasets', ['acquisition_era_name=' + sd]) for sd in sds]
            results = Map().execute(self._dbs.make_request, args)
            for result in results:
                add_datasets(result)

        for in_pattern in include:
            result = self._dbs.make_request('datasets',
                                            ['dataset=' + in_pattern])
            add_datasets(result)

        return dataset_names
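
get_dataset_names compiles the glob-style exclude patterns into regular expressions once and reuses them for every candidate name. The same filter, stand-alone:

import fnmatch
import re

exclude = ['/Test*/*/*']
exclude_exps = [re.compile(fnmatch.translate(p)) for p in exclude]

names = ['/TestPrimary/Proc/RAW', '/Physics/Proc/RAW']
kept = [n for n in names if not any(x.match(n) for x in exclude_exps)]
assert kept == ['/Physics/Proc/RAW']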

    def get_updated_datasets(self, updated_since):  #override
        LOG.warning(
            'PhEDExDatasetInfoSource can only return a list of datasets and blocks that are created since the given timestamp.'
        )

        # query block-level data across all datasets created since the timestamp
        result = self._phedex.make_request('data', [
            'dataset=/*/*/*', 'level=block',
            'create_since=%d' % updated_since
        ])

        if len(result) == 0 or 'dataset' not in result[0]:
            return []

        updated_datasets = []

        for dataset_entry in result[0]['dataset']:
            dataset = self._create_dataset(dataset_entry)
            updated_datasets.append(dataset)

        return updated_datasets

    def get_dataset(self, name, with_files=False):  #override
        ## Get the full dataset-block-file data from PhEDEx

        if with_files:
            level = 'file'
        else:
            level = 'block'

        result = self._phedex.make_request(
            'data', ['dataset=' + name, 'level=' + level])

        try:
            dataset_entry = result[0]['dataset'][0]
        except:
            return None

        ## Create the dataset object
        dataset = self._create_dataset(dataset_entry)

        ## Fill block and file data
        if 'block' in dataset_entry:
            for block_entry in dataset_entry['block']:
                block = self._create_block(block_entry, dataset)
                dataset.blocks.add(block)

                # size and num_files are left 0 in _create_dataset (PhEDEx does not tell)
                dataset.size += block.size
                dataset.num_files += block.num_files

                if with_files and 'file' in block_entry:
                    # See comments in get_block
                    block._files = set()
                    for file_entry in block_entry['file']:
                        block._files.add(self._create_file(file_entry, block))

        return dataset

    def get_block(self, name, dataset=None, with_files=False):  #override
        ## Get the full block-file data from PhEDEx

        if with_files:
            level = 'file'
        else:
            level = 'block'

        result = self._phedex.make_request('data',
                                           ['block=' + name, 'level=' + level])

        try:
            dataset_entry = result[0]['dataset'][0]
            block_entry = dataset_entry['block'][0]
        except:
            return None

        if dataset is None:
            link_dataset = False
            # Just need a named object
            dataset = Dataset(dataset_entry['name'])
        else:
            link_dataset = True
            if dataset.name != dataset_entry['name']:
                raise IntegrityError(
                    'Inconsistent dataset %s passed to get_block(%s)',
                    dataset.name, name)

        block = self._create_block(block_entry, dataset)

        if with_files and 'file' in block_entry:
            # _create_block sets size and num_files; just need to update the files list
            # Directly creating the _files set
            # This list will persist (unlike the weak proxy version loaded from inventory), but the returned block
            # from this function is only used temporarily anyway
            block._files = set()
            for file_entry in block_entry['file']:
                block._files.add(self._create_file(file_entry, block))

        if link_dataset:
            existing = dataset.find_block(block.name)
            if existing is None:
                dataset.blocks.add(block)
                dataset.size += block.size
                dataset.num_files += block.num_files
            else:
                dataset.blocks.remove(existing)
                dataset.size += block.size - existing.size
                dataset.num_files += block.num_files - existing.num_files

        return block

    def get_file(self, name, block=None):
        ## Get the file data from PhEDEx

        result = self._phedex.make_request('data',
                                           ['file=' + name, 'level=file'])

        try:
            block_entry = result[0]['dataset'][0]['block'][0]
            file_entry = block_entry['file'][0]
        except:
            return None

        bname = block_entry['name']
        block_name = Block.to_internal_name(bname[bname.find('#') + 1:])

        if block is None:
            link_block = False
            # Just need named objects; the dataset name is the portion of the
            # full block name before '#'
            dataset = Dataset(bname[:bname.find('#')])
            block = Block(block_name, dataset)
        else:
            link_block = True
            if block.name != block_name:
                raise IntegrityError(
                    'Inconsistent block %s passed to get_file(%s)',
                    block.full_name(), name)

        lfile = self._create_file(file_entry, block)

        if link_block:
            # Caution - by adding this file we edit the block properties too

            existing = block.find_file(lfile.fid())
            if existing is None:
                block.add_file(lfile)
            else:
                block.remove_file(existing)
                block.add_file(lfile)

        return lfile

    def get_files(self, dataset_or_block):  #override
        files = set()

        if type(dataset_or_block) is Dataset:
            result = self._phedex.make_request(
                'data', ['dataset=' + dataset_or_block.name, 'level=file'])
            blocks = dict((b.name, b) for b in dataset_or_block.blocks)
        else:
            result = self._phedex.make_request(
                'data',
                ['block=' + dataset_or_block.full_name(), 'level=file'])
            blocks = {dataset_or_block.name: dataset_or_block}

        try:
            block_entries = result[0]['dataset'][0]['block']
        except:
            return files

        for block_entry in block_entries:
            try:
                file_entries = block_entry['file']
            except:
                continue

            bname = block_entry['name']
            block_name = Block.to_internal_name(bname[bname.find('#') + 1:])
            try:
                block = blocks[block_name]
            except:
                # unknown block! maybe should raise?
                continue

            for file_entry in file_entries:
                files.add(self._create_file(file_entry, block))

        return files

    def _create_dataset(self, dataset_entry):
        """
        Create a dataset object with blocks and files from a PhEDEx dataset entry
        """

        dataset = Dataset(dataset_entry['name'],
                          is_open=(dataset_entry['is_open'] == 'y'))

        if dataset_entry.get('time_update') is not None:
            dataset.last_update = int(dataset_entry['time_update'])
        else:
            dataset.last_update = int(dataset_entry['time_create'])

        ## Get other details of the dataset from DBS
        self._fill_dataset_details(dataset)

        return dataset

    def _create_block(self, block_entry, dataset):
        """
        Create a block object with files from a PhEDEx block entry
        """

        bname = block_entry['name']
        block_name = Block.to_internal_name(bname[bname.find('#') + 1:])

        block = Block(block_name,
                      dataset,
                      size=block_entry['bytes'],
                      num_files=block_entry['files'],
                      is_open=(block_entry['is_open'] == 'y'))

        if block_entry.get('time_update') is not None:
            block.last_update = int(block_entry['time_update'])
        else:
            block.last_update = int(block_entry['time_create'])

        return block

    def _create_file(self, file_entry, block):
        lfile = File(file_entry['lfn'], block=block, size=file_entry['size'])

        return lfile

    def _fill_dataset_details(self, dataset):
        # 1. status and PD type

        result = self._dbs.make_request('datasets', [
            'dataset=' + dataset.name, 'dataset_access_type=*', 'detail=True'
        ])

        if len(result) != 0:
            dbs_entry = result[0]
            dataset.status = Dataset.status_val(
                dbs_entry['dataset_access_type'])
            dataset.data_type = Dataset.data_type_val(
                dbs_entry['primary_ds_type'])
        else:
            dataset.status = Dataset.STAT_UNKNOWN
            dataset.data_type = Dataset.TYPE_UNKNOWN

        # 2. software version

        result = self._dbs.make_request('releaseversions',
                                        ['dataset=' + dataset.name])
        if len(result) != 0:
            try:
                version = result[0]['release_version'][0]
            except KeyError:
                pass
            else:
                matches = re.match('CMSSW_([0-9]+)_([0-9]+)_([0-9]+)(|_.*)',
                                   version)
                if matches:
                    cycle, major, minor = map(
                        int, [matches.group(i) for i in range(1, 4)])

                    if matches.group(4):
                        suffix = matches.group(4)[1:]
                    else:
                        suffix = ''

                    dataset.software_version = (cycle, major, minor, suffix)
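
Stand-alone, the CMSSW version regex at the end of _fill_dataset_details decomposes a release string as below (the version shown is an example):

import re

matches = re.match('CMSSW_([0-9]+)_([0-9]+)_([0-9]+)(|_.*)',
                   'CMSSW_10_2_5_patch1')
cycle, major, minor = map(int, [matches.group(i) for i in range(1, 4)])
suffix = matches.group(4)[1:] if matches.group(4) else ''
assert (cycle, major, minor, suffix) == (10, 2, 5, 'patch1')
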
Example #8
class PhEDExReplicaInfoSource(ReplicaInfoSource):
    """ReplicaInfoSource using PhEDEx."""
    def __init__(self, config):
        ReplicaInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)

    def replica_exists_at_site(self, site, item):  #override
        options = ['node=' + site.name]

        if type(item) == Dataset:
            options += ['dataset=' + item.name, 'show_dataset=y']
        elif type(item) == DatasetReplica:
            options += ['dataset=' + item.dataset.name, 'show_dataset=y']
        elif type(item) == Block:
            options += ['block=' + item.full_name()]
        elif type(item) == BlockReplica:
            options += ['block=' + item.block.full_name()]
        else:
            raise RuntimeError('Invalid input passed: ' + repr(item))

        source = self._phedex.make_request('blockreplicas', options)

        return len(source) != 0

    def get_replicas(self, site=None, dataset=None, block=None):  #override
        options = []
        if site is not None:
            options.append('node=' + site)
        if dataset is not None:
            options.append('dataset=' + dataset)
        if block is not None:
            options.append('block=' + block)

        LOG.info('get_replicas(' + ','.join(options) +
                 ')  Fetching the list of replicas from PhEDEx')

        if len(options) == 0:
            return []

        result = self._phedex.make_request('blockreplicas',
                                           ['show_dataset=y'] + options)

        return PhEDExReplicaInfoSource.make_block_replicas(
            result, PhEDExReplicaInfoSource.maker_blockreplicas)

    def get_updated_replicas(self, updated_since):  #override
        LOG.info(
            'get_updated_replicas(%d)  Fetching the list of replicas from PhEDEx',
            updated_since)

        result = self._phedex.make_request(
            'blockreplicas',
            ['show_dataset=y',
             'update_since=%d' % updated_since])

        return PhEDExReplicaInfoSource.make_block_replicas(
            result, PhEDExReplicaInfoSource.maker_blockreplicas)

    def get_deleted_replicas(self, deleted_since):  #override
        LOG.info(
            'get_deleted_replicas(%d)  Fetching the list of replicas from PhEDEx',
            deleted_since)

        result = self._phedex.make_request(
            'deletions', ['complete_since=%d' % deleted_since])

        return PhEDExReplicaInfoSource.make_block_replicas(
            result, PhEDExReplicaInfoSource.maker_deletions)

    @staticmethod
    def make_block_replicas(dataset_entries, replica_maker):
        """Return a list of block replicas linked to Dataset, Block, Site, and Group"""

        block_replicas = []

        for dataset_entry in dataset_entries:
            dataset = Dataset(dataset_entry['name'])

            for block_entry in dataset_entry['block']:
                name = block_entry['name']
                try:
                    block_name = Block.to_internal_name(name[name.find('#') +
                                                             1:])
                except ValueError:  # invalid name
                    continue

                block = Block(block_name, dataset, block_entry['bytes'])

                block_replicas.extend(replica_maker(block, block_entry))

        return block_replicas

    @staticmethod
    def maker_blockreplicas(block, block_entry):
        replicas = []

        for replica_entry in block_entry['replica']:
            block_replica = BlockReplica(
                block,
                Site(replica_entry['node']),
                Group(replica_entry['group']),
                is_complete=(replica_entry['bytes'] == block.size),
                is_custodial=(replica_entry['custodial'] == 'y'),
                size=replica_entry['bytes'],
                last_update=int(replica_entry['time_update']))

            replicas.append(block_replica)

        return replicas

    @staticmethod
    def maker_deletions(block, block_entry):
        replicas = []

        for deletion_entry in block_entry['deletion']:
            block_replica = BlockReplica(block, Site(deletion_entry['node']),
                                         Group.null_group)

            replicas.append(block_replica)

        return replicas
Example #9
class PhEDExGroupInfoSource(GroupInfoSource):
    """GroupInfoSource using PhEDEx."""
    def __init__(self, config):
        GroupInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)

    def get_group(self, name):  #override
        if self.include is not None:
            matched = False
            for pattern in self.include:
                if fnmatch.fnmatch(name, pattern):
                    matched = True
                    break

            if not matched:
                LOG.info('get_group(%s)  %s is not included by configuration',
                         name, name)
                return None

        if self.exclude is not None:
            for pattern in self.exclude:
                if fnmatch.fnmatch(name, pattern):
                    LOG.info('get_group(%s)  %s is excluded by configuration',
                             name, name)
                    return None

        LOG.info('get_group(%s)  Fetching info on group %s', name, name)

        result = self._phedex.make_request('groups', ['group=' + name])
        if len(result) == 0:
            return None

        group = Group(name)

        if name in self.dataset_level_groups:
            group.olevel = Dataset
        else:
            group.olevel = Block

        return group

    def get_group_list(self):  #override
        LOG.info('get_group_list  Fetching the list of groups from PhEDEx')
        LOG.debug('Groups with dataset-level ownership: %s',
                  str(self.dataset_level_groups))

        group_list = []

        for entry in self._phedex.make_request('groups'):
            if self.include is not None:
                matched = False
                for pattern in self.include:
                    if fnmatch.fnmatch(entry['name'], pattern):
                        matched = True
                        break

                if not matched:
                    continue

            if self.exclude is not None:
                matched = False
                for pattern in self.exclude:
                    if fnmatch.fnmatch(entry['name'], pattern):
                        matched = True
                        break

                if matched:
                    continue

            if entry['name'] in self.dataset_level_groups:
                olevel = Dataset
            else:
                olevel = Block

            group_list.append(Group(entry['name'], olevel=olevel))

        return group_list
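
The include/exclude logic in get_group and get_group_list reduces to a small predicate. A stand-alone equivalent (the helper name is made up):

import fnmatch

def allowed(name, include=None, exclude=None):
    # include=None means everything passes; exclude=None means nothing is dropped
    if include is not None and not any(fnmatch.fnmatch(name, p) for p in include):
        return False
    if exclude is not None and any(fnmatch.fnmatch(name, p) for p in exclude):
        return False
    return True

assert allowed('AnalysisOps', include=['*Ops'], exclude=['Deprecated*'])
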
Example #10
    def __init__(self, config):
        self._phedex = PhEDEx(config.get('phedex', None))
Example #11
class PhEDExCopyInterface(CopyInterface):
    """Copy using PhEDEx."""
    def __init__(self, config=None):
        config = Configuration(config)

        CopyInterface.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))

        self._history = HistoryDatabase(config.get('history', None))

        self.subscription_chunk_size = config.get('chunk_size', 50.) * 1.e+12

    def schedule_copies(self,
                        replica_list,
                        operation_id,
                        comments=''):  #override
        sites = set(r.site for r in replica_list)
        if len(sites) != 1:
            raise OperationalError(
                'schedule_copies should be called with a list of replicas at a single site.'
            )

        site = list(sites)[0]

        LOG.info(
            'Scheduling copy of %d replicas to %s using PhEDEx (operation %d)',
            len(replica_list), site, operation_id)

        # sort the subscriptions into dataset level / block level and by groups
        subscription_lists = {}
        subscription_lists['dataset'] = collections.defaultdict(
            list)  # {group: [datasets]}
        subscription_lists['block'] = collections.defaultdict(
            list)  # {group: [blocks]}

        for replica in replica_list:
            if replica.growing:
                subscription_lists['dataset'][replica.group].append(
                    replica.dataset)
            else:
                blocks_by_group = collections.defaultdict(set)
                for block_replica in replica.block_replicas:
                    blocks_by_group[block_replica.group].add(
                        block_replica.block)

                for group, blocks in blocks_by_group.iteritems():
                    subscription_lists['block'][group].extend(blocks)

        # for convenience, mapping dataset -> replica
        result = {}

        for level in ['dataset', 'block']:
            for group, items in subscription_lists[level].iteritems():
                success = self._run_subscription_request(
                    operation_id, site, group, level, items, comments)

                for replica in success:
                    if replica.dataset in result:
                        booked = result[replica.dataset]
                        # need to merge
                        for block_replica in replica.block_replicas:
                            # there shouldn't be any block replica overlap but we will be careful
                            if booked.find_block_replica(
                                    block_replica.block) is None:
                                booked.block_replicas.add(block_replica)
                    else:
                        result[replica.dataset] = replica

        return result.values()

    def _run_subscription_request(self, operation_id, site, group, level,
                                  subscription_list, comments):
        # Make a subscription request for potentially multiple datasets or blocks, but to a single site and group
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in subscription_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in subscription_list:
                full_catalog[block.dataset].append(block)

        history_sql = 'INSERT INTO `phedex_requests` (`id`, `operation_type`, `operation_id`, `approved`) VALUES (%s, \'copy\', %s, %s)'

        success = []

        # make requests in chunks
        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()
            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.subscription_chunk_size and len(
                    full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'priority': 'low',
                'move': 'n',
                'static': 'n',
                'custodial': 'n',
                'group': group.name,
                'request_only': 'n',
                'no_mail': 'n',
                'comments': comments
            }

            try:
                if self._read_only:
                    result = [{'id': 0}]
                else:
                    result = self._phedex.make_request('subscribe',
                                                       options,
                                                       method=POST)
            except:
                LOG.error('Copy %s failed.', str(options))
                # we should probably do something here
            else:
                request_id = int(result[0]['id'])  # return value is a string
                LOG.warning('PhEDEx subscription request id: %d', request_id)
                if not self._read_only:
                    self._history.db.query(history_sql, request_id,
                                           operation_id, True)

                for dataset, blocks in request_catalog.iteritems():
                    if level == 'dataset':
                        replica = DatasetReplica(dataset,
                                                 site,
                                                 growing=True,
                                                 group=group)
                        for block in dataset.blocks:
                            replica.block_replicas.add(
                                BlockReplica(block,
                                             site,
                                             group,
                                             size=0,
                                             last_update=int(time.time())))

                    else:
                        replica = DatasetReplica(dataset, site, growing=False)
                        for block in blocks:
                            replica.block_replicas.add(
                                BlockReplica(block,
                                             site,
                                             group,
                                             size=0,
                                             last_update=int(time.time())))

                    success.append(replica)

            request_catalog = {}
            chunk_size = 0
            items = []

        return success

    def copy_status(self, history_record, inventory):  #override
        request_ids = self._history.db.query(
            'SELECT `id` FROM `phedex_requests` WHERE `operation_type` = \'copy\' AND `operation_id` = %s',
            history_record.operation_id)

        if len(request_ids) == 0:
            return {}

        return self.transfer_request_status(request_ids)

    def transfer_request_status(self, request_ids):
        status = {}

        LOG.debug('Querying PhEDEx transferrequests for requests %s',
                  request_ids)
        requests = self._phedex.make_request('transferrequests',
                                             [('request', i)
                                              for i in request_ids],
                                             method=POST)
        if len(requests) == 0:
            return status

        for request in requests:
            # A single request can have multiple destinations
            site_names = [d['name'] for d in request['destinations']['node']]

            dataset_names = []
            for ds_entry in request['data']['dbs']['dataset']:
                dataset_names.append(ds_entry['name'])

            block_names = []
            for ds_entry in request['data']['dbs']['block']:
                block_names.append(ds_entry['name'])

        if len(dataset_names) != 0:
            # Process dataset-level subscriptions

            subscriptions = []
            chunks = [
                dataset_names[i:i + 35]
                for i in xrange(0, len(dataset_names), 35)
            ]
            for site_name in site_names:
                for chunk in chunks:
                    subscriptions.extend(
                        self._phedex.make_request(
                            'subscriptions', ['node=%s' % site_name] +
                            ['dataset=%s' % n for n in chunk]))

            for dataset in subscriptions:
                dataset_name = dataset['name']
                try:
                    cont = dataset['subscription'][0]
                except KeyError:
                    LOG.error('Subscription of %s should exist but doesn\'t',
                              dataset_name)
                    continue

                site_name = cont['node']
                bytes = dataset['bytes']

                node_bytes = cont['node_bytes']
                if node_bytes is None:
                    node_bytes = 0
                elif node_bytes != bytes:
                    # it's possible that there were block-level deletions
                    blocks = self._phedex.make_request(
                        'blockreplicas',
                        ['node=%s' % site_name,
                         'dataset=%s' % dataset_name])
                    bytes = sum(b['bytes'] for b in blocks)

                status[(site_name, dataset_name)] = (bytes, node_bytes,
                                                     cont['time_update'])

        if len(block_names) != 0:
            # Process block-level subscriptions

            subscriptions = []
            chunks = [
                block_names[i:i + 35] for i in xrange(0, len(block_names), 35)
            ]
            for site_name in site_names:
                for chunk in chunks:
                    subscriptions.extend(
                        self._phedex.make_request(
                            'subscriptions', ['node=%s' % site_name] +
                            ['block=%s' % n for n in chunk]))

            overridden = set()

            for dataset in subscriptions:
                dataset_name = dataset['name']

                try:
                    blocks = dataset['block']
                except KeyError:
                    try:
                        cont = dataset['subscription'][0]
                    except KeyError:
                        LOG.error(
                            'Subscription of %s neither block-level nor dataset-level',
                            dataset_name)
                        continue

                    site_name = cont['node']

                    if (site_name, dataset_name) in overridden:
                        # this is a dataset-level subscription and we've processed this dataset already
                        continue

                    overridden.add((site_name, dataset_name))

                    LOG.debug(
                        'Block-level subscription of %s at %s is overridden',
                        dataset_name, site_name)

                    requested_blocks = [
                        name for name in block_names
                        if name.startswith(dataset_name + '#')
                    ]

                    blocks = self._phedex.make_request(
                        'blockreplicas',
                        ['node=%s' % site_name,
                         'dataset=%s' % dataset_name])
                    for block in blocks:
                        block_name = block['name']
                        if block_name not in requested_blocks:
                            continue

                        replica = block['replica'][0]
                        status[(site_name,
                                block_name)] = (block['bytes'],
                                                replica['bytes'],
                                                replica['time_update'])

                    continue

                for block in blocks:
                    block_name = block['name']
                    try:
                        cont = block['subscription'][0]
                    except KeyError:
                        LOG.error(
                            'Subscription of %s should exist but doesn\'t',
                            block_name)
                        continue

                    node_bytes = cont['node_bytes']
                    if node_bytes is None:
                        node_bytes = 0

                    status[(cont['node'],
                            block_name)] = (block['bytes'], node_bytes,
                                            cont['time_update'])

        # now we pick up whatever did not appear in the subscriptions call
        for site_name in site_names:
            for dataset_name in dataset_names:
                key = (site_name, dataset_name)
                if key not in status:
                    status[key] = None

            for block_name in block_names:
                key = (site_name, block_name)
                if key not in status:
                    status[key] = None

        return status
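A minimal usage sketch for the status dictionary returned above; the interface instance and request id are hypothetical. Each entry maps (site, item) to (total bytes, bytes at node, last update), or None when the subscription no longer exists in PhEDEx.

# Hypothetical usage; 'copy_interface' is a configured instance of the
# class above and 123456 is a made-up PhEDEx request id.
status = copy_interface.transfer_request_status([123456])
for (site_name, item_name), stat in status.iteritems():
    if stat is None:
        LOG.info('%s at %s: subscription lost', item_name, site_name)
    elif stat[1] == stat[0]:
        LOG.info('%s at %s: transfer complete', item_name, site_name)
    else:
        LOG.info('%s at %s: %d of %d bytes', item_name, site_name,
                 stat[1], stat[0])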
Example #12
class PhEDExSiteInfoSource(SiteInfoSource):
    """SiteInfoSource for PhEDEx. Also use CMS Site Status Board for additional information."""
    def __init__(self, config=None):
        config = Configuration(config)

        SiteInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))
        self._ssb = SiteStatusBoard(config.get('ssb', None))

        self.ssb_cache_lifetime = config.get('ssb_cache_lifetime', 3600)
        self._ssb_cache_timestamp = 0
        self._caching_lock = threading.Lock()

        self._waitroom_sites = set()
        self._morgue_sites = set()

    def get_site(self, name):  #override
        if not self.check_allowed_site(name):
            LOG.info('get_site(%s)  %s is excluded by configuration.', name,
                     name)
            return None

        LOG.info('get_site(%s)  Fetching information of %s from PhEDEx', name,
                 name)

        # General site info
        result = self._phedex.make_request('nodes', ['node=' + name])
        if len(result) == 0:
            return None

        entry = result[0]

        host = entry['se']
        storage_type = Site.storage_type_val(entry['kind'])

        return Site(name, host=host, storage_type=storage_type)

    def get_site_list(self, inventory):  #override
        LOG.info('get_site_list  Fetching the list of nodes from PhEDEx')

        site_list = []

        for entry in self._phedex.make_request('nodes'):
            site_name = entry['name']
            if not self.check_allowed_site(site_name):
                continue

            siteObj_new = Site(site_name,
                               host=entry['se'],
                               storage_type=Site.storage_type_val(
                                   entry['kind']))
            if site_name in inventory.sites:
                siteObj_old = inventory.sites[site_name]
                siteObj_new.backend = siteObj_old.backend
                siteObj_new.x509proxy = siteObj_old.x509proxy

            site_list.append(siteObj_new)

        return site_list

    def get_site_status(self, site_name):  #override
        with self._caching_lock:
            if time.time(
            ) > self._ssb_cache_timestamp + self.ssb_cache_lifetime:
                self._waitroom_sites = set()
                self._morgue_sites = set()

                latest_status = {}

                # get list of sites in waiting room (153) and morgue (199)
                for colid, stat, sitelist in [
                    (153, Site.STAT_WAITROOM, self._waitroom_sites),
                    (199, Site.STAT_MORGUE, self._morgue_sites)
                ]:
                    result = self._ssb.make_request(
                        'getplotdata',
                        'columnid=%d&time=2184&dateFrom=&dateTo=&sites=all&clouds=undefined&batch=1'
                        % colid)
                    for entry in result:
                        site = entry['VOName']

                        # entry['Time'] is UTC but we are only interested in relative times here
                        timestamp = time.mktime(
                            time.strptime(entry['Time'], '%Y-%m-%dT%H:%M:%S'))
                        if site in latest_status and latest_status[site][
                                0] > timestamp:
                            continue

                        if entry['Status'] == 'in':
                            latest_status[site] = (timestamp, stat)
                        else:
                            latest_status[site] = (timestamp, Site.STAT_READY)

                for site, (_, stat) in latest_status.items():
                    if stat == Site.STAT_WAITROOM:
                        self._waitroom_sites.add(site)
                    elif stat == Site.STAT_MORGUE:
                        self._morgue_sites.add(site)

                self._ssb_cache_timestamp = time.time()

        if site_name in self._waitroom_sites:
            return Site.STAT_WAITROOM
        elif site_name in self._morgue_sites:
            return Site.STAT_MORGUE
        else:
            return Site.STAT_READY

    def get_filename_mapping(self, site_name):  #override
        tfc = self._phedex.make_request('tfc', ['node=' + site_name])['array']

        conversions = {}
        for elem in tfc:
            if elem['element_name'] != 'lfn-to-pfn':
                continue

            if 'destination-match' in elem and re.match(
                    elem['destination-match'], site_name) is None:
                continue

            if 'chain' in elem:
                chain = elem['chain']
            else:
                chain = None

            result = elem['result']
            i = 1
            while '$' in result:
                result = result.replace('$%d' % i, '{%d}' % (i - 1))
                i += 1
                if i == 100:
                    # sanity cutoff; a TFC rule can't possibly be right
                    # with this many capture groups
                    break

            result = result.replace('\\', '')

            if elem['protocol'] in conversions:
                conversions[elem['protocol']].append(
                    (elem['path-match'], result, chain))
            else:
                conversions[elem['protocol']] = [(elem['path-match'], result,
                                                  chain)]

        def make_mapping_chains(rule):
            if rule[2] is None:
                return [[(rule[0], rule[1])]]
            else:
                if rule[2] not in conversions:
                    return None

                chains = []
                for chained_rule in conversions[rule[2]]:
                    mapped_chains = make_mapping_chains(chained_rule)
                    if mapped_chains is None:
                        continue

                    chains.extend(mapped_chains)

                for chain in chains:
                    chain.append((rule[0], rule[1]))

                return chains

        mappings = {}

        for protocol, rules in conversions.items():
            if protocol == 'direct':
                continue

            if protocol == 'srmv2':
                # for historic reasons PhEDEx calls gfal2 srmv2
                protocol = 'gfal2'

            mapping = []

            for rule in rules:
                chains = make_mapping_chains(rule)
                if chains is None:
                    continue

                mapping.extend(chains)

            mappings[protocol] = mapping

        return mappings
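To make the structure of the returned mappings concrete, here is a sketch of how one chain of (path-match, result) pairs could be applied to an LFN. The rule below is made up; real rules come from the site's TFC, with '$1' already rewritten to '{0}' by the code above.

import re

# Hypothetical single-link chain, in the format built by get_filename_mapping
chain = [('/+store/(.*)', 'root://xrootd.example.org//store/{0}')]

def lfn_to_pfn(chain, lfn):
    # apply each (pattern, format) link in order, feeding each result
    # into the next link
    for pattern, result in chain:
        match = re.match(pattern, lfn)
        if match is None:
            return None
        lfn = result.format(*match.groups())
    return lfn

# lfn_to_pfn(chain, '/store/data/Run2016/file.root')
# -> 'root://xrootd.example.org//store/data/Run2016/file.root'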
class PhEDExDeletionInterface(DeletionInterface):
    """Deletion using PhEDEx."""
    def __init__(self, config=None):
        config = Configuration(config)

        DeletionInterface.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))

        self._history = HistoryDatabase(config.get('history', None))

        self.auto_approval = config.get('auto_approval', True)
        self.allow_tape_deletion = config.get('allow_tape_deletion', True)
        self.tape_auto_approval = config.get('tape_auto_approval', False)

        self.deletion_chunk_size = config.get('chunk_size', 50.) * 1.e+12

    def schedule_deletions(self,
                           replica_list,
                           operation_id,
                           comments=''):  #override
        sites = set(r.site for r, b in replica_list)
        if len(sites) != 1:
            raise OperationalError(
                'schedule_deletions must be called with a list of replicas at a single site.'
            )

        site = list(sites)[0]

        if site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion:
            LOG.warning('Deletion from MSS not allowed by configuration.')
            return []

        if self.allow_tape_deletion and self.auto_approval:
            LOG.warning(
                'You cannot have auto-approved tape deletions. Set auto-approval to False.'
            )
            return []

        # execute the deletions in two steps: one for dataset-level and one for block-level
        datasets = []
        blocks = []

        # maps used later for cloning
        # getting ugly here.. should come up with a better way of making clones
        replica_map = {}
        block_replica_map = {}

        for dataset_replica, block_replicas in replica_list:
            if block_replicas is None:
                datasets.append(dataset_replica.dataset)
            else:
                blocks.extend(br.block for br in block_replicas)

                replica_map[dataset_replica.dataset] = dataset_replica
                block_replica_map.update(
                    (br.block, br) for br in block_replicas)

        success = []

        deleted_datasets = self._run_deletion_request(operation_id, site,
                                                      'dataset', datasets,
                                                      comments)

        for dataset in deleted_datasets:
            replica = DatasetReplica(dataset,
                                     site,
                                     growing=False,
                                     group=Group.null_group)
            success.append((replica, None))

        tmp_map = dict((dataset, []) for dataset in replica_map.iterkeys())

        deleted_blocks = self._run_deletion_request(operation_id, site,
                                                    'block', blocks, comments)

        for block in deleted_blocks:
            tmp_map[block.dataset].append(block)

        for dataset, blocks in tmp_map.iteritems():
            replica = DatasetReplica(dataset, site)
            replica.copy(replica_map[dataset])

            success.append((replica, []))
            for block in blocks:
                block_replica = BlockReplica(block, site, Group.null_group)
                block_replica.copy(block_replica_map[block])
                block_replica.last_update = int(time.time())
                success[-1][1].append(block_replica)

        return success

    def _run_deletion_request(self, operation_id, site, level, deletion_list,
                              comments):
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in deletion_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in deletion_list:
                full_catalog[block.dataset].append(block)

        history_sql = 'INSERT INTO `phedex_requests` (`id`, `operation_type`, `operation_id`, `approved`) VALUES (%s, \'deletion\', %s, %s)'

        deleted_items = []

        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()
            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.deletion_chunk_size and len(
                    full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'rm_subscriptions': 'y',
                'comments': comments
            }

            # result = [{'id': <id>}] (item 'request_created' of PhEDEx response) if successful
            try:
                if self._read_only:
                    result = [{'id': 0}]
                else:
                    result = self._phedex.make_request('delete',
                                                       options,
                                                       method=POST)
            except:
                LOG.error('Deletion %s failed.', str(options))

                if self._phedex.last_errorcode == 400:
                    # Sometimes we have invalid data in the list of objects to delete.
                    # PhEDEx throws a 400 error in such a case. We have to then try to identify the
                    # problematic item through trial and error.
                    if len(items) == 1:
                        LOG.error('Could not delete %s from %s', str(items[0]),
                                  site.name)
                    else:
                        LOG.info('Retrying with a reduced item list.')
                        deleted_items.extend(
                            self._run_deletion_request(operation_id, site,
                                                       level,
                                                       items[:len(items) / 2],
                                                       comments))
                        deleted_items.extend(
                            self._run_deletion_request(operation_id, site,
                                                       level,
                                                       items[len(items) / 2:],
                                                       comments))
                else:
                    raise
            else:
                request_id = int(result[0]['id'])  # return value is a string
                LOG.warning('PhEDEx deletion request id: %d', request_id)

                approved = False

                if self._read_only:
                    approved = True

                elif self.auto_approval:
                    try:
                        result = self._phedex.make_request('updaterequest', {
                            'decision': 'approve',
                            'request': request_id,
                            'node': site.name
                        },
                                                           method=POST)
                    except:
                        LOG.error('deletion approval of request %d failed.',
                                  request_id)
                    else:
                        approved = True

                if not self._read_only:
                    self._history.db.query(history_sql, request_id,
                                           operation_id, approved)

                if approved:
                    deleted_items.extend(items)

            request_catalog = {}
            chunk_size = 0
            items = []

        return deleted_items

    def deletion_status(self, request_id):  #override
        request = self._phedex.make_request('deleterequests',
                                            'request=%d' % request_id)
        if len(request) == 0:
            return {}

        node_info = request[0]['nodes']['node'][0]
        site_name = node_info['name']
        last_update = node_info['decided_by']['time_decided']

        status = {}
        for ds_entry in request[0]['data']['dbs']['dataset']:
            status[ds_entry['name']] = (ds_entry['bytes'], ds_entry['bytes'],
                                        last_update)

        return status
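A hedged sketch of the calling convention for schedule_deletions above: the replica list pairs each DatasetReplica with either None (dataset-level deletion) or a list of its BlockReplicas (block-level deletion), and all replicas must be at the same site. The objects below are assumed to come from a loaded inventory.

# Hypothetical usage; 'replica' is a DatasetReplica from the inventory
# and 'deletion_interface' is a configured instance of the class above.
success = deletion_interface.schedule_deletions(
    [(replica, None)],  # None -> delete the entire dataset replica
    operation_id=1,
    comments='disk cleanup')
# 'success' holds cloned (DatasetReplica, None or [BlockReplica]) pairs
# for the items PhEDEx accepted.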
    def __init__(self, config):
        GroupInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))
Example #15
class PhEDExDatasetInfoSource(DatasetInfoSource):
    """DatasetInfoSource using PhEDEx and DBS."""
    def __init__(self, config=None):
        if config is None:
            config = Configuration()

        DatasetInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.get('phedex', None))
        self._dbs = DBS(config.get('dbs', None))

    def get_dataset_names(self, include=['*'], exclude=[]):
        dataset_names = []

        exclude_exps = []
        for pattern in exclude:
            exclude_exps.append(re.compile(fnmatch.translate(pattern)))

        def add_datasets(result):
            for entry in result:
                name = entry['dataset']
                for ex_exp in exclude_exps:
                    if ex_exp.match(name):
                        break
                else:
                    # not excluded by args, now check my include/exclude list
                    if self.check_allowed_dataset(name):
                        dataset_names.append(name)

        if len(include) == 1 and include[0] == '/*/*/*':
            # all datasets requested - will do this efficiently
            result = self._dbs.make_request('acquisitioneras')
            sds = [entry['acquisition_era_name'] for entry in result]

            # query DBS in parallel
            args = [('datasets', ['acquisition_era_name=' + sd]) for sd in sds]
            results = Map().execute(self._dbs.make_request, args)
            for result in results:
                add_datasets(result)

            # everything is already collected; skip the per-pattern queries
            return dataset_names

        for in_pattern in include:
            if in_pattern.startswith('/') and in_pattern.count('/') == 3:
                result = self._dbs.make_request('datasets',
                                                ['dataset=' + in_pattern])
                add_datasets(result)

        return dataset_names

    def get_updated_datasets(self, updated_since):  #override
        LOG.warning(
            'PhEDExDatasetInfoSource can only return a list of datasets and blocks that are created since the given timestamp.'
        )

        # query all datasets at block level; PhEDEx filters by creation time
        result = self._phedex.make_request('data', [
            'dataset=/*/*/*', 'level=block',
            'create_since=%d' % updated_since
        ])

        try:
            dataset_entries = result[0]['dataset']
        except:
            return []

        if self.include is not None or self.exclude is not None:
            ientry = 0
            while ientry != len(dataset_entries):
                if self.check_allowed_dataset(dataset_entries[ientry]['name']):
                    ientry += 1
                else:
                    dataset_entries.pop(ientry)

        return Map().execute(self._create_dataset, dataset_entries)

    def get_dataset(self, name, with_files=False):  #override
        ## Get the full dataset-block-file data from PhEDEx
        if not name.startswith('/') or name.count('/') != 3:
            return None

        if not self.check_allowed_dataset(name):
            return None

        def get_dbs_datasets(name, dbs_data):
            dbs_data['datasets'] = self._dbs.make_request(
                'datasets',
                ['dataset=' + name, 'dataset_access_type=*', 'detail=True'])

        def get_dbs_releaseversions(name, dbs_data):
            dbs_data['releaseversions'] = self._dbs.make_request(
                'releaseversions', ['dataset=' + name])

        dbs_data = {}
        th1 = threading.Thread(target=get_dbs_datasets, args=(name, dbs_data))
        th1.start()
        th2 = threading.Thread(target=get_dbs_releaseversions,
                               args=(name, dbs_data))
        th2.start()

        if with_files:
            level = 'file'
        else:
            level = 'block'

        result = self._phedex.make_request(
            'data', ['dataset=' + name, 'level=' + level])

        th1.join()
        th2.join()

        try:
            dataset_entry = result[0]['dataset'][0]
        except:
            return None

        ## Create the dataset object
        dataset = self._create_dataset(dataset_entry, dbs_data)

        ## Fill block and file data
        if 'block' in dataset_entry:
            for block_entry in dataset_entry['block']:
                block = self._create_block(block_entry, dataset)
                dataset.blocks.add(block)

                if with_files and 'file' in block_entry:
                    # See comments in get_block
                    block._files = set()
                    for file_entry in block_entry['file']:
                        block._files.add(self._create_file(file_entry, block))

        return dataset

    def get_block(self, name, with_files=False):  #override
        ## Get the full block-file data from PhEDEx
        if not name.startswith('/') or name.count('/') != 3 or '#' not in name:
            return None

        if not self.check_allowed_dataset(name[:name.find('#')]):
            return None

        if with_files:
            level = 'file'
        else:
            level = 'block'

        result = self._phedex.make_request('data',
                                           ['block=' + name, 'level=' + level])

        try:
            dataset_entry = result[0]['dataset'][0]
            block_entry = dataset_entry['block'][0]
        except:
            return None

        # Just need a named object
        dataset = Dataset(dataset_entry['name'])

        block = self._create_block(block_entry, dataset)

        if with_files and 'file' in block_entry:
            # _create_block sets size and num_files; just need to update the files list
            # Directly creating the _files set
            # This list will persist (unlike the weak proxy version loaded from inventory), but the returned block
            # from this function is only used temporarily anyway
            block._files = set()
            for file_entry in block_entry['file']:
                block._files.add(self._create_file(file_entry, block))

        return block

    def get_file(self, name):
        ## Get the file data from PhEDEx

        result = self._phedex.make_request('data',
                                           ['file=' + name, 'level=file'])

        try:
            dataset_entry = result[0]['dataset'][0]
            block_entry = dataset_entry['block'][0]
            file_entry = block_entry['file'][0]
        except:
            return None

        if not self.check_allowed_dataset(dataset_entry['name']):
            return None

        bname = block_entry['name']
        block_name = Block.to_internal_name(bname[bname.find('#') + 1:])

        # Just need a named object
        dataset = Dataset(dataset_entry['name'])
        block = Block(block_name, dataset)

        lfile = self._create_file(file_entry, block)

        return lfile

    def get_files(self, dataset_or_block):  #override
        files = set()

        if type(dataset_or_block) is Dataset:
            result = self._phedex.make_request(
                'data', ['dataset=' + dataset_or_block.name, 'level=file'])
            blocks = dict((b.name, b) for b in dataset_or_block.blocks)
        else:
            result = self._phedex.make_request(
                'data',
                ['block=' + dataset_or_block.full_name(), 'level=file'])
            blocks = {dataset_or_block.name: dataset_or_block}

        try:
            block_entries = result[0]['dataset'][0]['block']
        except:
            return files

        for block_entry in block_entries:
            try:
                file_entries = block_entry['file']
            except:
                continue

            bname = block_entry['name']
            block_name = Block.to_internal_name(bname[bname.find('#') + 1:])
            try:
                block = blocks[block_name]
            except:
                # unknown block! maybe should raise?
                continue

            for file_entry in file_entries:
                files.add(self._create_file(file_entry, block))

        return files

    def _create_dataset(self, dataset_entry, dbs_data=None):
        """
        Create a dataset object with blocks and files from a PhEDEx dataset entry
        """

        dataset = Dataset(dataset_entry['name'],
                          is_open=(dataset_entry['is_open'] == 'y'))

        if 'time_update' in dataset_entry and dataset_entry[
                'time_update'] is not None:
            dataset.last_update = int(dataset_entry['time_update'])
        else:
            dataset.last_update = int(dataset_entry['time_create'])

        ## Get other details of the dataset from DBS
        self._fill_dataset_details(dataset, dbs_data)

        return dataset

    def _create_block(self, block_entry, dataset):
        """
        Create a block object with files from a PhEDEx block entry
        """

        bname = block_entry['name']
        block_name = Block.to_internal_name(bname[bname.find('#') + 1:])

        block = Block(block_name,
                      dataset,
                      size=block_entry['bytes'],
                      num_files=block_entry['files'],
                      is_open=(block_entry['is_open'] == 'y'))

        if 'time_update' in block_entry and block_entry[
                'time_update'] is not None:
            block.last_update = int(block_entry['time_update'])
        else:
            block.last_update = int(block_entry['time_create'])

        return block

    def _create_file(self, file_entry, block):
        adler32 = ''
        crc32 = 0
        for cksum in file_entry['checksum'].split(','):
            if cksum.startswith('adler32'):
                adler32 = cksum[8:]
            elif cksum.startswith('cksum'):
                crc32 = int(cksum[6:])

        lfile = File(file_entry['lfn'],
                     block=block,
                     size=file_entry['size'],
                     checksum=(crc32, adler32))

        return lfile

    def _fill_dataset_details(self, dataset, dbs_data=None):
        if dbs_data is None:
            dbs_data = {}

            if dataset.name.startswith('/') and dataset.name.count('/') == 3:
                dbs_data['datasets'] = self._dbs.make_request(
                    'datasets', [
                        'dataset=' + dataset.name, 'dataset_access_type=*',
                        'detail=True'
                    ])
            else:
                dbs_data['datasets'] = []

            dbs_data['releaseversions'] = self._dbs.make_request(
                'releaseversions', ['dataset=' + dataset.name])

        # 1. status and PD type

        if len(dbs_data['datasets']) != 0:
            dbs_entry = dbs_data['datasets'][0]
            dataset.status = Dataset.status_val(
                dbs_entry['dataset_access_type'])
            dataset.data_type = Dataset.data_type_val(
                dbs_entry['primary_ds_type'])
        else:
            dataset.status = Dataset.STAT_UNKNOWN
            dataset.data_type = Dataset.TYPE_UNKNOWN

        # 2. software version

        if len(dbs_data['releaseversions']) != 0:
            try:
                version = dbs_data['releaseversions'][0]['release_version'][0]
            except KeyError:
                pass
            else:
                matches = re.match('CMSSW_([0-9]+)_([0-9]+)_([0-9]+)(|_.*)',
                                   version)
                if matches:
                    cycle, major, minor = map(
                        int, [matches.group(i) for i in range(1, 4)])

                    if matches.group(4):
                        suffix = matches.group(4)[1:]
                    else:
                        suffix = ''

                    dataset.software_version = (cycle, major, minor, suffix)
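As a worked example of the release-version parsing at the end of _fill_dataset_details, the regex splits a CMSSW release name into a numeric (cycle, major, minor) triplet plus an optional suffix:

import re

matches = re.match('CMSSW_([0-9]+)_([0-9]+)_([0-9]+)(|_.*)',
                   'CMSSW_10_2_5_patch1')
cycle, major, minor = map(int, [matches.group(i) for i in range(1, 4)])
suffix = matches.group(4)[1:] if matches.group(4) else ''
# (cycle, major, minor, suffix) -> (10, 2, 5, 'patch1');
# for 'CMSSW_10_2_5' the suffix would be ''.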
Example #16
class PhEDExDeletionInterface(DeletionInterface):
    """Deletion using PhEDEx."""
    def __init__(self, config):
        DeletionInterface.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)

        self.auto_approval = config.auto_approval
        self.allow_tape_deletion = config.allow_tape_deletion
        self.tape_auto_approval = config.tape_auto_approval

        self.deletion_chunk_size = config.chunk_size * 1.e+12

    def schedule_deletion(self, replica, comments=''):  #override
        request_mapping = {}

        if replica.site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion:
            LOG.warning('Deletion from MSS is not allowed by configuration.')
            return request_mapping

        deletion_list = []
        if type(replica) is DatasetReplica:
            replica_blocks = set(r.block for r in replica.block_replicas)

            if replica_blocks == replica.dataset.blocks:
                deletion_list.append(replica.dataset)
                level = 'dataset'
            else:
                deletion_list.extend(replica_blocks)
                level = 'block'

        else:  #BlockReplica
            deletion_list.append(replica.block)
            level = 'block'

        self._run_deletion_request(request_mapping, replica.site, level,
                                   deletion_list, comments)

        return request_mapping

    def schedule_deletions(self, replica_list, comments=''):  #override
        request_mapping = {}

        replicas_by_site = collections.defaultdict(list)
        for replica in replica_list:
            replicas_by_site[replica.site].append(replica)

            if replica.site.storage_type == Site.TYPE_MSS and not self.allow_tape_deletion:
                LOG.warning('Deletion from MSS not allowed by configuration.')
                return {}

        for site, replica_list in replicas_by_site.iteritems():
            # execute the deletions in two steps: one for dataset-level and one for block-level
            deletion_lists = {'dataset': [], 'block': []}

            for replica in replica_list:
                if type(replica) is DatasetReplica:
                    blocks = set(r.block for r in replica.block_replicas)

                    if blocks == replica.dataset.blocks:
                        deletion_lists['dataset'].append(replica.dataset)
                    else:
                        deletion_lists['block'].extend(blocks)

                else:  #BlockReplica
                    deletion_lists['block'].append(replica.block)

            self._run_deletion_request(request_mapping, site, 'dataset',
                                       deletion_lists['dataset'], comments)
            self._run_deletion_request(request_mapping, site, 'block',
                                       deletion_lists['block'], comments)

        return request_mapping

    def _run_deletion_request(self, request_mapping, site, level,
                              deletion_list, comments):
        full_catalog = collections.defaultdict(list)

        if level == 'dataset':
            for dataset in deletion_list:
                full_catalog[dataset] = []
        elif level == 'block':
            for block in deletion_list:
                full_catalog[block.dataset].append(block)

        request_catalog = {}
        chunk_size = 0
        items = []
        while len(full_catalog) != 0:
            dataset, blocks = full_catalog.popitem()
            request_catalog[dataset] = blocks

            if level == 'dataset':
                chunk_size += dataset.size
                items.append(dataset)
            elif level == 'block':
                chunk_size += sum(b.size for b in blocks)
                items.extend(blocks)

            if chunk_size < self.deletion_chunk_size and len(
                    full_catalog) != 0:
                continue

            options = {
                'node': site.name,
                'data': self._phedex.form_catalog_xml(request_catalog),
                'level': level,
                'rm_subscriptions': 'y',
                'comments': comments
            }

            # result = [{'id': <id>}] (item 'request_created' of PhEDEx response) if successful
            if self.dry_run:
                result = [{'id': '0'}]
            else:
                try:
                    result = self._phedex.make_request('delete',
                                                       options,
                                                       method=POST)
                except:
                    if self._phedex.last_errorcode == 400:
                        # Sometimes we have invalid data in the list of objects to delete.
                        # PhEDEx throws a 400 error in such a case. We have to then try to identify the
                        # problematic item through trial and error.
                        if len(items) == 1:
                            LOG.error('Could not delete %s from %s',
                                      str(items[0]), site.name)
                            result = []
                        else:
                            self._run_deletion_request(request_mapping, site,
                                                       level,
                                                       items[:len(items) / 2],
                                                       comments)
                            self._run_deletion_request(request_mapping, site,
                                                       level,
                                                       items[len(items) / 2:],
                                                       comments)
                            result = []
                    else:
                        result = []

            if len(result) != 0:
                request_id = int(result[0]['id'])  # return value is a string
                LOG.warning('PhEDEx deletion request id: %d', request_id)

                approved = False

                if self.dry_run:
                    approved = True

                elif self.auto_approval:
                    try:
                        result = self._phedex.make_request('updaterequest', {
                            'decision': 'approve',
                            'request': request_id,
                            'node': site.name
                        },
                                                           method=POST)
                    except:
                        LOG.error('deletion approval of request %d failed.',
                                  request_id)
                    else:
                        approved = True

                request_mapping[request_id] = (approved, site, items)

            else:
                LOG.error('Deletion %s failed.', str(options))
                # we should probably do something here

            request_catalog = {}
            chunk_size = 0
            items = []

    def deletion_status(self, request_id):  #override
        request = self._phedex.make_request('deleterequests',
                                            'request=%d' % request_id)
        if len(request) == 0:
            return {}

        node_info = request[0]['nodes']['node'][0]
        site_name = node_info['name']
        last_update = node_info['decided_by']['time_decided']

        status = {}
        for ds_entry in request[0]['data']['dbs']['dataset']:
            status[ds_entry['name']] = (ds_entry['bytes'], ds_entry['bytes'],
                                        last_update)

        return status
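Both versions of _run_deletion_request above rely on the same size-based chunking; the standalone sketch below (with a made-up size accessor) shows the pattern in isolation: a chunk is flushed as soon as it reaches the threshold or the input runs out.

def chunk_by_size(items, size_of, limit):
    # 'size_of' maps an item to its size in bytes; 'limit' plays the role
    # of deletion_chunk_size (50 TB by default)
    chunk, total = [], 0
    for item in items:
        chunk.append(item)
        total += size_of(item)
        if total >= limit:
            yield chunk
            chunk, total = [], 0
    if chunk:
        yield chunk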
class InvalidationRequest(WebModule):
    def __init__(self, config):
        WebModule.__init__(self, config)

        self.dbs = DBS()
        self.phedex = PhEDEx()
        self.registry = RegistryDatabase()
        self.authorized_users = list(config.file_invalidation.authorized_users)

    def run(self, caller, request, inventory):
        if caller.name not in self.authorized_users:
            raise AuthorizationError()

        try:
            item = request['item']
        except KeyError:
            raise MissingParameter('item')

        if type(item) is list:
            items = item
        else:
            items = [item]

        invalidated_items = []

        sql = 'INSERT INTO `invalidations` (`item`, `db`, `user_id`, `timestamp`) VALUES (%s, %s, %s, NOW())'

        for item in items:
            invalidated = False

            if item in inventory.datasets:
                # item is a dataset

                result = self.dbs.make_request('datasets', [
                    'dataset=' + item, 'dataset_access_type=*', 'detail=true'
                ])
                if len(result) != 0:
                    status = result[0]['dataset_access_type']
                    if status in ('VALID', 'PRODUCTION'):
                        self.registry.db.query(sql, item, 'dbs', caller.id)

                    for entry in self.dbs.make_request(
                            'files', ['dataset=' + item, 'validFileOnly=1']):
                        self.registry.db.query(sql, entry['logical_file_name'],
                                               'dbs', caller.id)

                    invalidated = True

                result = self.phedex.make_request(
                    'data', ['dataset=' + item, 'level=block'])
                if len(result) != 0:
                    self.registry.db.query(sql, item, 'tmdb', caller.id)
                    invalidated = True

            else:
                try:
                    dataset_name, block_name = Block.from_full_name(item)
                except:
                    lfile = inventory.find_file(item)
                    if lfile is not None:
                        # item is a file

                        result = self.dbs.make_request(
                            'files',
                            ['logical_file_name=' + item, 'validFileOnly=1'])
                        if len(result) != 0:
                            self.registry.db.query(
                                sql, result[0]['logical_file_name'], 'dbs',
                                caller.id)
                            invalidated = True

                        result = self.phedex.make_request(
                            'data', ['file=' + item])
                        if len(result) != 0:
                            self.registry.db.query(sql, item, 'tmdb',
                                                   caller.id)
                            invalidated = True

                else:
                    # item is a block

                    for entry in self.dbs.make_request(
                            'files',
                        ['block_name=' + item, 'validFileOnly=1']):
                        self.registry.db.query(sql, entry['logical_file_name'],
                                               'dbs', caller.id)
                        invalidated = True

                    result = self.phedex.make_request(
                        'data', ['block=' + item, 'level=block'])
                    if len(result) != 0:
                        self.registry.db.query(sql, item, 'tmdb', caller.id)
                        invalidated = True

            if invalidated:
                invalidated_items.append({'item': item})

        return invalidated_items
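A hedged sketch of how this web module might be invoked; the request dict shape follows the code above, while the caller and inventory objects are assumed to be supplied by the web framework.

# Hypothetical invocation: 'item' may be a dataset name, a block full
# name ('/A/B/C#uuid'), an LFN, or a list of any of these.
request = {'item': ['/A/B/C', '/A/B/C#abcd-1234']}
invalidated = module.run(caller, request, inventory)
# -> [{'item': '/A/B/C'}, ...] for each item found in DBS and/or TMDB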
Example #18
    def __init__(self, config):
        GroupInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)
Example #19
    def __init__(self, config):
        ReplicaInfoSource.__init__(self, config)

        self._phedex = PhEDEx(config.phedex)