Example #1
    def _get_source_records(popdb, inventory, included_sites, excluded_sites,
                            start_date):
        """
        Get the replica access data from PopDB from start_date to today.
        @param popdb          PopDB interface
        @param inventory      DynamoInventory
        @param included_sites List of site name patterns to include
        @param excluded_sites List of site name patterns to exclude
        @param start_date     Query start date (datetime.datetime)
        @return  {replica: {date: (number of accesses, total CPU time)}}
        """

        days_to_query = []

        utctoday = datetime.date(*time.gmtime()[:3])
        date = start_date
        while date <= utctoday:  # get records up to today
            days_to_query.append(date)
            date += datetime.timedelta(1)  # one day

        LOG.info('Updating dataset access info from %s to %s',
                 start_date.strftime('%Y-%m-%d'),
                 utctoday.strftime('%Y-%m-%d'))

        all_accesses = {}

        arg_pool = []
        for site in inventory.sites.itervalues():
            matched = False
            for pattern in included_sites:
                if fnmatch.fnmatch(site.name, pattern):
                    matched = True
                    break
            for pattern in excluded_sites:
                if fnmatch.fnmatch(site.name, pattern):
                    matched = False
                    break

            if matched:
                for date in days_to_query:
                    arg_pool.append((popdb, site, inventory, date))

        mapper = Map()
        mapper.logger = LOG

        records = mapper.execute(CRABAccessHistory._get_site_record, arg_pool)

        for site_record in records:
            for replica, date, naccess, cputime in site_record:
                if replica not in all_accesses:
                    all_accesses[replica] = {}

                all_accesses[replica][date] = (naccess, cputime)

        return all_accesses
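
The include/exclude handling above keeps a site only if its name matches at least one include pattern and no exclude pattern, using shell-style wildcards via fnmatch. Below is a minimal standalone sketch of that filter; the function name, site names, and patterns are made up for illustration:

import fnmatch

def site_is_selected(site_name, included_sites, excluded_sites):
    # A site is kept if it matches any include pattern...
    matched = any(fnmatch.fnmatch(site_name, pat) for pat in included_sites)
    # ...unless it also matches an exclude pattern.
    if any(fnmatch.fnmatch(site_name, pat) for pat in excluded_sites):
        matched = False
    return matched

print(site_is_selected('T2_US_MIT', ['T2_*'], ['T2_US_*']))   # False (excluded)
print(site_is_selected('T2_DE_DESY', ['T2_*'], ['T2_US_*']))  # True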
Example #2
    def get_dataset_names(self, include=['*'], exclude=[]):
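        """
        Return dataset names known to DBS. The include patterns are passed to DBS as
        wildcard queries; names matching any fnmatch-style exclude pattern are dropped.
        """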
        dataset_names = []

        exclude_exps = []
        for pattern in exclude:
            exclude_exps.append(re.compile(fnmatch.translate(pattern)))

        def add_datasets(result):
            for entry in result:
                name = entry['dataset']
                for ex_exp in exclude_exps:
                    if ex_exp.match(name):
                        break
                else:
                    # not excluded
                    dataset_names.append(name)

        if len(include) == 1 and include[0] == '/*/*/*':
            # all datasets requested - will do this efficiently
            result = self._dbs.make_request('acquisitioneras')
            sds = [entry['acquisition_era_name'] for entry in result]

            # query DBS in parallel
            args = [('datasets', ['acquisition_era_name=' + sd]) for sd in sds]
            results = Map().execute(self._dbs.make_request, args)
            for result in results:
                add_datasets(result)

        else:
            for in_pattern in include:
                result = self._dbs.make_request('datasets',
                                                ['dataset=' + in_pattern])
                add_datasets(result)

        return dataset_names
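
add_datasets relies on Python's for...else: the else branch runs only when the inner loop completes without a break, i.e. when no exclusion regex matched the name. Here is a small self-contained sketch of the same idiom; the dataset names and the exclude pattern are placeholders:

import fnmatch
import re

exclude = ['/SingleMuon/*/RAW']  # placeholder exclusion pattern
exclude_exps = [re.compile(fnmatch.translate(pattern)) for pattern in exclude]

kept = []
for name in ['/SingleMuon/Run2017A-v1/RAW', '/SingleMuon/Run2017A-v1/MINIAOD']:
    for ex_exp in exclude_exps:
        if ex_exp.match(name):
            break  # excluded - skip the else clause
    else:
        # reached only if no exclusion pattern matched
        kept.append(name)

print(kept)  # ['/SingleMuon/Run2017A-v1/MINIAOD']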
Example #3
    def get_updated_datasets(self, updated_since):  #override
        LOG.warning(
            'PhEDExDatasetInfoSource can only return a list of datasets and blocks that are created since the given timestamp.'
        )

        result = self._phedex.make_request('data', [
            'level=block',
            'create_since=%d' % updated_since
        ])

        try:
            dataset_entries = result[0]['dataset']
        except (IndexError, KeyError):
            return []

        if self.include is not None or self.exclude is not None:
            ientry = 0
            while ientry != len(dataset_entries):
                if self.check_allowed_dataset(dataset_entries[ientry]['name']):
                    ientry += 1
                else:
                    dataset_entries.pop(ientry)

        return Map().execute(self._create_dataset, dataset_entries)
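
# Standalone illustration (not part of the class above): the while/pop loop in
# get_updated_datasets filters dataset_entries in place so that only allowed
# datasets reach Map().execute. A list comprehension gives the same result; the
# predicate and the entries below are placeholders.

def _check_allowed_dataset(name):
    # stand-in for self.check_allowed_dataset
    return not name.startswith('/Test')

dataset_entries = [{'name': '/Test/Run2017A-v1/RAW'},
                   {'name': '/SingleMuon/Run2017A-v1/RAW'}]

ientry = 0
while ientry != len(dataset_entries):
    if _check_allowed_dataset(dataset_entries[ientry]['name']):
        ientry += 1
    else:
        dataset_entries.pop(ientry)

# Equivalent rebuild:
# dataset_entries = [e for e in dataset_entries if _check_allowed_dataset(e['name'])]
print([e['name'] for e in dataset_entries])  # ['/SingleMuon/Run2017A-v1/RAW']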
    def get_replicas(self, site=None, dataset=None, block=None):  #override
        if site is None:
            site_check = self.check_allowed_site
        else:
            site_check = None
            if not self.check_allowed_site(site):
                return []

        if dataset is None and block is None:
            dataset_check = self.check_allowed_dataset
        else:
            dataset_check = None
            if dataset is not None:
                if not self.check_allowed_dataset(dataset):
                    return []
            if block is not None:
                if not self.check_allowed_dataset(block[:block.find('#')]):
                    return []

        options = []
        if site is not None:
            options.append('node=' + site)
        if dataset is not None:
            options.append('dataset=' + dataset)
        if block is not None:
            options.append('block=' + block)

        LOG.info('get_replicas(' + ','.join(options) +
                 ')  Fetching the list of replicas from PhEDEx')

        if len(options) == 0:
            return []

        block_entries = self._phedex.make_request('blockreplicas',
                                                  options,
                                                  timeout=7200)

        parallelizer = Map()
        parallelizer.timeout = 7200

        # Automatically starts a worker thread as block entries are added as input
        combine_file = parallelizer.get_starter(self._combine_file_info)

        for block_entry in block_entries:
            for replica_entry in block_entry['replica']:
                if replica_entry['complete'] == 'n':
                    break
            else:
                continue

            # there is at least one incomplete replica
            try:
                dataset_name, block_name = Block.from_full_name(
                    block_entry['name'])
            except ObjectError:  # invalid name
                continue

            if dataset_check and not dataset_check(dataset_name):
                continue

            combine_file.add_input(block_entry)

        combine_file.close()

        # _combine_file_info alters block_entries directly - no need to deal with output
        combine_file.get_outputs()

        block_replicas = PhEDExReplicaInfoSource.make_block_replicas(
            block_entries,
            PhEDExReplicaInfoSource.maker_blockreplicas,
            site_check=site_check,
            dataset_check=dataset_check)

        # Also use the subscriptions call, which has a lower latency than blockreplicas.
        # For example, a group change on a block replica at time T may not show up in blockreplicas until up to T + 15 minutes,
        # while in subscriptions it becomes visible within a few seconds.
        # A subscriptions call without a dataset or block, however, takes too long.
        if dataset is None and block is None:
            return block_replicas

        indexed = collections.defaultdict(dict)
        for replica in block_replicas:
            indexed[(replica.site.name,
                     replica.block.dataset.name)][replica.block.name] = replica

        dataset_entries = self._phedex.make_request('subscriptions',
                                                    options,
                                                    timeout=3600)

        for dataset_entry in dataset_entries:
            dataset_name = dataset_entry['name']

            if not self.check_allowed_dataset(dataset_name):
                continue

            try:
                subscriptions = dataset_entry['subscription']
            except KeyError:
                pass
            else:
                for sub_entry in subscriptions:
                    site_name = sub_entry['node']

                    if not self.check_allowed_site(site_name):
                        continue

                    replicas = indexed[(site_name, dataset_name)]

                    for replica in replicas.itervalues():
                        replica.group = Group(sub_entry['group'])
                        replica.is_custodial = (sub_entry['custodial'] == 'y')

            try:
                block_entries = dataset_entry['block']
            except KeyError:
                pass
            else:
                for block_entry in block_entries:
                    try:
                        _, block_name = Block.from_full_name(
                            block_entry['name'])
                    except ObjectError:
                        continue

                    try:
                        subscriptions = block_entry['subscription']
                    except KeyError:
                        continue

                    for sub_entry in subscriptions:
                        site_name = sub_entry['node']

                        if not self.check_allowed_site(site_name):
                            continue

                        try:
                            replica = indexed[(site_name,
                                               dataset_name)][block_name]
                        except KeyError:
                            continue

                        replica.group = Group(sub_entry['group'])

                        if sub_entry['node_bytes'] == block_entry['bytes']:
                            # complete
                            replica.size = sub_entry['node_bytes']
                            if replica.size is None:
                                replica.size = 0
                            replica.files = None
                        else:
                            # incomplete - since we cannot know what files are there, we pretend there are none
                            replica.size = 0
                            replica.files = tuple()

                        replica.is_custodial = (sub_entry['custodial'] == 'y')

                        if sub_entry['time_update'] is None:
                            replica.last_update = 0
                        else:
                            replica.last_update = int(sub_entry['time_update'])

        return block_replicas
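
# Standalone illustration (not part of the class above): get_replicas indexes the
# blockreplicas output by (site, dataset) so that the lower-latency subscriptions
# response, which is grouped the same way, can update the matching replicas directly.
# The replica class and all names below are simplified stand-ins for illustration.
import collections

class _StubReplica(object):
    def __init__(self, site_name, dataset_name, block_name):
        self.site_name = site_name
        self.dataset_name = dataset_name
        self.block_name = block_name
        self.group = None

block_replicas = [
    _StubReplica('T2_DE_DESY', '/SingleMuon/Run2017A-v1/RAW', 'block1'),
    _StubReplica('T2_DE_DESY', '/SingleMuon/Run2017A-v1/RAW', 'block2'),
]

indexed = collections.defaultdict(dict)
for replica in block_replicas:
    indexed[(replica.site_name, replica.dataset_name)][replica.block_name] = replica

# A subscription entry for the same site and dataset then finds its block replica
# in O(1) and overrides the (possibly stale) group information from blockreplicas.
replica = indexed[('T2_DE_DESY', '/SingleMuon/Run2017A-v1/RAW')].get('block1')
if replica is not None:
    replica.group = 'AnalysisOps'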
    def get_updated_replicas(self, updated_since, inventory):  #override
        LOG.info(
            'get_updated_replicas(%d)  Fetching the list of replicas from PhEDEx',
            updated_since)

        nodes = []
        for entry in self._phedex.make_request('nodes', timeout=600):
            if not self.check_allowed_site(entry['name']):
                continue

            if entry['name'] not in inventory.sites:
                continue

            nodes.append(entry['name'])

        try:
            tmpconfig = Configuration(
                self._parallelizer_config.get('parallel', None))
        except Exception as e:
            LOG.error(str(e))
            tmpconfig = Configuration()

        parallelizer = Map(tmpconfig)
        parallelizer.timeout = 5400

        def get_node_replicas(node):
            options = ['update_since=%d' % updated_since, 'node=%s' % node]
            results = self._phedex.make_request('blockreplicas', options)

            return node, results

        # Use async to fire threads on demand
        node_results = parallelizer.execute(get_node_replicas,
                                            nodes,
                                            async=True)

        # Automatically starts a worker thread as block entries are added as input
        combine_file = parallelizer.get_starter(self._combine_file_info)

        all_block_entries = []

        for node, block_entries in node_results:
            site = inventory.sites[node]

            for block_entry in block_entries:
                all_block_entries.append(block_entry)

                replica_entry = block_entry['replica'][0]

                if replica_entry['complete'] == 'y':
                    continue

                # incomplete block replica - should we fetch file info?
                try:
                    dataset_name, block_name = Block.from_full_name(
                        block_entry['name'])
                except ObjectError:
                    pass
                else:
                    try:
                        dataset = inventory.datasets[dataset_name]
                        block = dataset.find_block(block_name)
                        replica = block.find_replica(site)
                        if replica.file_ids is None:
                            num_files = block.num_files
                        else:
                            num_files = len(replica.file_ids)

                        if (replica.size == replica_entry['bytes'] and
                                num_files == replica_entry['files']):
                            # size and file count match - no need to fetch file info
                            continue
                    except:
                        # At any point of the above lookups we may hit a None object or KeyError or what not
                        pass

                LOG.debug(
                    'Replica %s:%s is incomplete. Fetching file information.',
                    replica_entry['node'], block_entry['name'])
                combine_file.add_input(block_entry)

        combine_file.close()

        # _combine_file_info alters block_entries directly - no need to deal with output
        combine_file.get_outputs()

        LOG.info('get_updated_replicas(%d) Got outputs', updated_since)

        return PhEDExReplicaInfoSource.make_block_replicas(
            all_block_entries,
            PhEDExReplicaInfoSource.maker_blockreplicas,
            dataset_check=self.check_allowed_dataset)
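
get_updated_replicas fans the blockreplicas query out over the PhEDEx nodes and consumes the results as they arrive. Below is a rough standalone sketch of that fan-out using the standard-library ThreadPool instead of the project's Map class; the node names and the fake per-node query function are placeholders:

from multiprocessing.pool import ThreadPool

def get_node_replicas(node):
    # stand-in for the per-node PhEDEx 'blockreplicas' request in the snippet above
    return node, ['dummy block entry from %s' % node]

nodes = ['T1_US_FNAL_Disk', 'T2_DE_DESY']  # placeholder node names

pool = ThreadPool(4)
try:
    # results are consumed as workers finish, similar to Map().execute(..., async=True)
    for node, block_entries in pool.imap_unordered(get_node_replicas, nodes):
        print('%s returned %d block entries' % (node, len(block_entries)))
finally:
    pool.close()
    pool.join()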