def _get_source_records(popdb, inventory, included_sites, excluded_sites, start_date):
    """
    Get the replica access data from PopDB from start_date to today.
    @param popdb            PopDB interface
    @param inventory        DynamoInventory
    @param included_sites   List of site name patterns to include
    @param excluded_sites   List of site name patterns to exclude
    @param start_date       Query start date (datetime.date)

    @return {replica: {date: (number of accesses, total CPU time)}}
    """
    days_to_query = []

    utctoday = datetime.date(*time.gmtime()[:3])
    date = start_date
    while date <= utctoday: # get records up to today
        days_to_query.append(date)
        date += datetime.timedelta(1) # one day

    LOG.info('Updating dataset access info from %s to %s', start_date.strftime('%Y-%m-%d'), utctoday.strftime('%Y-%m-%d'))

    all_accesses = {}

    arg_pool = []
    for site in inventory.sites.itervalues():
        matched = False
        for pattern in included_sites:
            if fnmatch.fnmatch(site.name, pattern):
                matched = True
                break

        for pattern in excluded_sites:
            if fnmatch.fnmatch(site.name, pattern):
                matched = False
                break

        if matched:
            for date in days_to_query:
                arg_pool.append((popdb, site, inventory, date))

    mapper = Map()
    mapper.logger = LOG
    records = mapper.execute(CRABAccessHistory._get_site_record, arg_pool)

    for site_record in records:
        for replica, date, naccess, cputime in site_record:
            if replica not in all_accesses:
                all_accesses[replica] = {}

            all_accesses[replica][date] = (naccess, cputime)

    return all_accesses
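# --- Illustrative sketch (not part of the original module) ---
# The include/exclude logic above keeps a site only if it matches some include
# pattern and no exclude pattern. A minimal standalone version of that filter;
# the function name and arguments (_sketch_filter_sites, site_names, included,
# excluded) are hypothetical:
def _sketch_filter_sites(site_names, included, excluded):
    import fnmatch

    selected = []
    for name in site_names:
        # kept if any include pattern matches...
        matched = any(fnmatch.fnmatch(name, p) for p in included)
        # ...unless an exclude pattern also matches
        if matched and not any(fnmatch.fnmatch(name, p) for p in excluded):
            selected.append(name)
    return selected

# e.g. _sketch_filter_sites(['T2_US_MIT', 'T1_US_FNAL'], ['T2_*'], ['*_MIT']) -> []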
def get_dataset_names(self, include=['*'], exclude=[]):
    dataset_names = []

    exclude_exps = []
    for pattern in exclude:
        exclude_exps.append(re.compile(fnmatch.translate(pattern)))

    def add_datasets(result):
        for entry in result:
            name = entry['dataset']
            for ex_exp in exclude_exps:
                if ex_exp.match(name):
                    break
            else:
                # no exclude pattern matched
                dataset_names.append(name)

    if len(include) == 1 and include[0] == '/*/*/*':
        # all datasets requested - do this efficiently by querying
        # per acquisition era instead of issuing one huge query
        result = self._dbs.make_request('acquisitioneras')
        sds = [entry['acquisition_era_name'] for entry in result]

        # query DBS in parallel
        args = [('datasets', ['acquisition_era_name=' + sd]) for sd in sds]
        results = Map().execute(self._dbs.make_request, args)
        for result in results:
            add_datasets(result)
    else:
        for in_pattern in include:
            result = self._dbs.make_request('datasets', ['dataset=' + in_pattern])
            add_datasets(result)

    return dataset_names
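# --- Illustrative sketch (not part of the original module) ---
# add_datasets above relies on Python's for/else: the else clause runs only
# when the loop finished without a break, i.e. when no exclude pattern
# matched. A standalone demonstration with hypothetical inputs:
def _sketch_for_else_exclusion():
    import re, fnmatch

    names = ['/A/B/RECO', '/A/B/RAW']
    exclude_exps = [re.compile(fnmatch.translate('*RAW'))]

    kept = []
    for name in names:
        for ex_exp in exclude_exps:
            if ex_exp.match(name):
                break # excluded; the else clause is skipped
        else:
            kept.append(name) # reached only if no pattern matched

    return kept # ['/A/B/RECO']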
def get_updated_datasets(self, updated_since): #override
    LOG.warning('PhEDExDatasetInfoSource can only return a list of datasets and blocks that were created since the given timestamp.')

    # 'name' was undefined in this scope; since this method takes no dataset
    # argument, query over all datasets (assumed intent)
    result = self._phedex.make_request('data', ['dataset=/*/*/*', 'level=block', 'create_since=%d' % updated_since])

    try:
        dataset_entries = result[0]['dataset']
    except (IndexError, KeyError, TypeError):
        return []

    if self.include is not None or self.exclude is not None:
        # filter dataset_entries in place
        ientry = 0
        while ientry != len(dataset_entries):
            if self.check_allowed_dataset(dataset_entries[ientry]['name']):
                ientry += 1
            else:
                dataset_entries.pop(ientry)

    return Map().execute(self._create_dataset, dataset_entries)
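# --- Illustrative sketch (not part of the original module) ---
# The while/pop loop above filters a list in place without building a copy:
# the index advances only past kept entries, because pop() shifts the next
# entry into the current slot. A standalone version with hypothetical data:
def _sketch_inplace_filter():
    entries = [{'name': 'keep'}, {'name': 'drop'}, {'name': 'keep'}]
    allowed = lambda name: name == 'keep'

    ientry = 0
    while ientry != len(entries):
        if allowed(entries[ientry]['name']):
            ientry += 1 # keep: move on to the next entry
        else:
            entries.pop(ientry) # drop: do not advance the index

    return entries # [{'name': 'keep'}, {'name': 'keep'}]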
def get_replicas(self, site=None, dataset=None, block=None): #override
    if site is None:
        site_check = self.check_allowed_site
    else:
        site_check = None
        if not self.check_allowed_site(site):
            return []

    if dataset is None and block is None:
        dataset_check = self.check_allowed_dataset
    else:
        dataset_check = None
        if dataset is not None:
            if not self.check_allowed_dataset(dataset):
                return []

        if block is not None:
            if not self.check_allowed_dataset(block[:block.find('#')]):
                return []

    options = []
    if site is not None:
        options.append('node=' + site)
    if dataset is not None:
        options.append('dataset=' + dataset)
    if block is not None:
        options.append('block=' + block)

    LOG.info('get_replicas(' + ','.join(options) + ') Fetching the list of replicas from PhEDEx')

    if len(options) == 0:
        return []

    block_entries = self._phedex.make_request('blockreplicas', options, timeout=7200)

    parallelizer = Map()
    parallelizer.timeout = 7200

    # Automatically starts threads as we add inputs (block entries)
    combine_file = parallelizer.get_starter(self._combine_file_info)

    for block_entry in block_entries:
        for replica_entry in block_entry['replica']:
            if replica_entry['complete'] == 'n':
                break
        else:
            continue

        # there is at least one incomplete replica
        try:
            dataset_name, block_name = Block.from_full_name(block_entry['name'])
        except ObjectError: # invalid name
            continue

        if dataset_check and not dataset_check(dataset_name):
            continue

        combine_file.add_input(block_entry)

    combine_file.close()

    # _combine_file_info alters block_entries directly - no need to deal with the output
    combine_file.get_outputs()

    block_replicas = PhEDExReplicaInfoSource.make_block_replicas(block_entries, PhEDExReplicaInfoSource.maker_blockreplicas, site_check=site_check, dataset_check=dataset_check)

    # Also use the subscriptions call, which has a lower latency than blockreplicas.
    # For example, a group change on a block replica at time T may not show up in
    # blockreplicas until up to T + 15 minutes, while in subscriptions it is visible
    # within a few seconds. A subscriptions call without a dataset or block, however,
    # takes too long.
    if dataset is None and block is None:
        return block_replicas

    indexed = collections.defaultdict(dict)
    for replica in block_replicas:
        indexed[(replica.site.name, replica.block.dataset.name)][replica.block.name] = replica

    dataset_entries = self._phedex.make_request('subscriptions', options, timeout=3600)

    for dataset_entry in dataset_entries:
        dataset_name = dataset_entry['name']

        if not self.check_allowed_dataset(dataset_name):
            continue

        try:
            subscriptions = dataset_entry['subscription']
        except KeyError:
            pass
        else:
            for sub_entry in subscriptions:
                site_name = sub_entry['node']

                if not self.check_allowed_site(site_name):
                    continue

                replicas = indexed[(site_name, dataset_name)]

                for replica in replicas.itervalues():
                    replica.group = Group(sub_entry['group'])
                    replica.is_custodial = (sub_entry['custodial'] == 'y')

        try:
            block_entries = dataset_entry['block']
        except KeyError:
            pass
        else:
            for block_entry in block_entries:
                try:
                    _, block_name = Block.from_full_name(block_entry['name'])
                except ObjectError:
                    continue

                try:
                    subscriptions = block_entry['subscription']
                except KeyError:
                    continue

                for sub_entry in subscriptions:
                    site_name = sub_entry['node']

                    if not self.check_allowed_site(site_name):
                        continue

                    try:
                        replica = indexed[(site_name, dataset_name)][block_name]
                    except KeyError:
                        continue

                    replica.group = Group(sub_entry['group'])

                    if sub_entry['node_bytes'] == block_entry['bytes']:
                        # complete
                        replica.size = sub_entry['node_bytes']
                        if replica.size is None:
                            replica.size = 0
                        replica.files = None
                    else:
                        # incomplete - since we cannot know what files are there, we'll just have to pretend there are none
                        replica.size = 0
                        replica.files = tuple()

                    replica.is_custodial = (sub_entry['custodial'] == 'y')

                    if sub_entry['time_update'] is None:
                        replica.last_update = 0
                    else:
                        replica.last_update = int(sub_entry['time_update'])

    return block_replicas
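# --- Illustrative sketch (not part of the original module) ---
# The subscriptions overlay above works off a (site, dataset) -> {block: replica}
# index, so a single dataset-level subscription entry can update every matching
# block replica with plain dict lookups. A minimal standalone version; the
# _FakeReplica class and the site/dataset/block values are hypothetical:
def _sketch_replica_index():
    import collections

    class _FakeReplica(object):
        def __init__(self, site, dataset, block):
            self.site, self.dataset, self.block = site, dataset, block
            self.group = None

    replicas = [_FakeReplica('T1_X', '/A/B/C', 'blk#1'),
                _FakeReplica('T1_X', '/A/B/C', 'blk#2')]

    indexed = collections.defaultdict(dict)
    for rep in replicas:
        indexed[(rep.site, rep.dataset)][rep.block] = rep

    # a dataset-level subscription entry updates all indexed blocks at once
    for rep in indexed[('T1_X', '/A/B/C')].values():
        rep.group = 'SomeGroup'

    return indexed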
def get_updated_replicas(self, updated_since, inventory): #override
    LOG.info('get_updated_replicas(%d) Fetching the list of replicas from PhEDEx', updated_since)

    nodes = []
    for entry in self._phedex.make_request('nodes', timeout=600):
        if not self.check_allowed_site(entry['name']):
            continue

        if entry['name'] not in inventory.sites:
            continue

        nodes.append(entry['name'])

    try:
        tmpconfig = Configuration(self._parallelizer_config.get('parallel', None))
    except Exception as e:
        LOG.error(str(e))
        tmpconfig = Configuration()

    parallelizer = Map(tmpconfig)
    parallelizer.timeout = 5400

    def get_node_replicas(node):
        options = ['update_since=%d' % updated_since, 'node=%s' % node]
        results = self._phedex.make_request('blockreplicas', options)
        return node, results

    # Use async to fire threads on demand
    node_results = parallelizer.execute(get_node_replicas, nodes, async=True)

    # Automatically starts threads as we add inputs (block entries)
    combine_file = parallelizer.get_starter(self._combine_file_info)

    all_block_entries = []

    for node, block_entries in node_results:
        site = inventory.sites[node]

        for block_entry in block_entries:
            all_block_entries.append(block_entry)

            replica_entry = block_entry['replica'][0]

            if replica_entry['complete'] == 'y':
                continue

            # incomplete block replica - should we fetch file info?
            try:
                dataset_name, block_name = Block.from_full_name(block_entry['name'])
            except ObjectError:
                pass
            else:
                try:
                    dataset = inventory.datasets[dataset_name]
                    block = dataset.find_block(block_name)
                    replica = block.find_replica(site)
                    if replica.file_ids is None:
                        num_files = block.num_files
                    else:
                        num_files = len(replica.file_ids)

                    if replica.size == replica_entry['bytes'] and num_files == replica_entry['files']:
                        # no, we don't have to
                        continue
                except:
                    # At any point of the above lookups we may hit a None object or KeyError or what not
                    pass

            LOG.debug('Replica %s:%s is incomplete. Fetching file information.', replica_entry['node'], block_entry['name'])

            combine_file.add_input(block_entry)

    combine_file.close()

    # _combine_file_info alters block_entries directly - no need to deal with the output
    combine_file.get_outputs()

    LOG.info('get_updated_replicas(%d) Got outputs', updated_since)

    return PhEDExReplicaInfoSource.make_block_replicas(all_block_entries, PhEDExReplicaInfoSource.maker_blockreplicas, dataset_check=self.check_allowed_dataset)
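# --- Illustrative sketch (not part of the original module) ---
# parallelizer.execute(..., async=True) above yields (node, result) pairs as
# worker threads finish, so the consumer loop can start combining file info
# before every node has responded. The same fan-out pattern with only the
# standard library; the _fetch stub stands in for the PhEDEx blockreplicas call:
def _sketch_async_fanout():
    from multiprocessing.pool import ThreadPool

    def _fetch(node):
        # stand-in for self._phedex.make_request('blockreplicas', ...)
        return node, ['entry_for_' + node]

    pool = ThreadPool(4)
    results = []
    # imap_unordered yields each pair as soon as its worker thread completes
    for node, entries in pool.imap_unordered(_fetch, ['T1_A', 'T2_B', 'T2_C']):
        results.append((node, entries))
    pool.close()
    pool.join()
    return results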