def list_content(scope, name):
    """
    List data identifier contents.

    :param scope: The scope name.
    :param name: The data identifier name.
    """
    return did.list_content(scope=scope, name=name)
def list_content(scope, name):
    """
    List data identifier contents.

    :param scope: The scope name.
    :param name: The data identifier name.
    """
    scope = InternalScope(scope)

    dids = did.list_content(scope=scope, name=name)

    for d in dids:
        yield api_update_return_dict(d)
def list_content(scope, name, vo='def', session=None):
    """
    List data identifier contents.

    :param scope: The scope name.
    :param name: The data identifier name.
    :param vo: The VO to act on.
    :param session: The database session in use.
    """
    scope = InternalScope(scope, vo=vo)

    dids = did.list_content(scope=scope, name=name, session=session)

    for d in dids:
        yield api_update_return_dict(d, session=session)
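# Illustrative usage sketch (an editor's addition, not part of the Rucio source): it shows how a
# caller might consume the generator returned by the api-layer list_content defined above. The
# scope and dataset name are made-up placeholder values.
def _print_dataset_content_example():
    for child in list_content(scope='user.jdoe', name='user.jdoe.example_dataset', vo='def'):
        # each yielded entry is a plain dict whose scope/name have been converted back to
        # external strings by api_update_return_dict()
        print('%s:%s' % (child['scope'], child['name']))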
def list_dids(self, scope, filters, type='collection', ignore_case=False, limit=None,
              offset=None, long=False, recursive=False, session=None):
    """
    Search data identifiers.

    :param scope: the scope name.
    :param filters: dictionary of attributes by which the results should be filtered.
    :param type: the type of the did: all(container, dataset, file), collection(dataset or container), dataset, container, file.
    :param ignore_case: ignore case distinctions.
    :param limit: limit number.
    :param offset: offset number.
    :param long: Long format option to display more information for each DID.
    :param session: The database session in use.
    :param recursive: Recursively list DIDs content.
    """
    types = ['all', 'collection', 'container', 'dataset', 'file']
    if type not in types:
        raise exception.UnsupportedOperation("Valid types are: %(types)s" % locals())

    query = session.query(models.DataIdentifier.scope,
                          models.DataIdentifier.name,
                          models.DataIdentifier.did_type,
                          models.DataIdentifier.bytes,
                          models.DataIdentifier.length).\
        filter(models.DataIdentifier.scope == scope)

    # Exclude suppressed dids
    query = query.filter(models.DataIdentifier.suppressed != true())

    if type == 'all':
        query = query.filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER,
                                 models.DataIdentifier.did_type == DIDType.DATASET,
                                 models.DataIdentifier.did_type == DIDType.FILE))
    elif type.lower() == 'collection':
        query = query.filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER,
                                 models.DataIdentifier.did_type == DIDType.DATASET))
    elif type.lower() == 'container':
        query = query.filter(models.DataIdentifier.did_type == DIDType.CONTAINER)
    elif type.lower() == 'dataset':
        query = query.filter(models.DataIdentifier.did_type == DIDType.DATASET)
    elif type.lower() == 'file':
        query = query.filter(models.DataIdentifier.did_type == DIDType.FILE)

    for (k, v) in filters.items():
        if k not in ['created_before', 'created_after', 'length.gt', 'length.lt', 'length.lte', 'length.gte', 'length'] \
           and not hasattr(models.DataIdentifier, k):
            raise exception.KeyNotFound(k)
        if isinstance(v, string_types) and ('*' in v or '%' in v):
            if v in ('*', '%', u'*', u'%'):
                continue
            if session.bind.dialect.name == 'postgresql':
                query = query.filter(getattr(models.DataIdentifier, k).
                                     like(v.replace('*', '%').replace('_', '\_'),  # NOQA: W605
                                          escape='\\'))
            else:
                query = query.filter(getattr(models.DataIdentifier, k).
                                     like(v.replace('*', '%').replace('_', '\_'), escape='\\'))  # NOQA: W605
        elif k == 'created_before':
            created_before = str_to_date(v)
            query = query.filter(models.DataIdentifier.created_at <= created_before)
        elif k == 'created_after':
            created_after = str_to_date(v)
            query = query.filter(models.DataIdentifier.created_at >= created_after)
        elif k == 'guid':
            query = query.filter_by(guid=v).\
                with_hint(models.ReplicaLock, "INDEX(DIDS_GUIDS_IDX)", 'oracle')
        elif k == 'length.gt':
            query = query.filter(models.DataIdentifier.length > v)
        elif k == 'length.lt':
            query = query.filter(models.DataIdentifier.length < v)
        elif k == 'length.gte':
            query = query.filter(models.DataIdentifier.length >= v)
        elif k == 'length.lte':
            query = query.filter(models.DataIdentifier.length <= v)
        elif k == 'length':
            query = query.filter(models.DataIdentifier.length == v)
        else:
            query = query.filter(getattr(models.DataIdentifier, k) == v)

    if 'name' in filters:
        if '*' in filters['name']:
            query = query.\
                with_hint(models.DataIdentifier, "NO_INDEX(dids(SCOPE,NAME))", 'oracle')
        else:
            query = query.\
                with_hint(models.DataIdentifier, "INDEX(DIDS DIDS_PK)", 'oracle')

    if limit:
        query = query.limit(limit)

    if recursive:
        # Get attached DIDs and save in list because query has to be finished before starting a new one in the recursion
        collections_content = []
        parent_scope = scope

        from rucio.core.did import list_content

        for scope, name, did_type, bytes, length in query.yield_per(100):
            if (did_type == DIDType.CONTAINER or did_type == DIDType.DATASET):
                collections_content += [did for did in list_content(scope=scope, name=name)]

        # List DIDs again to use filter
        for did in collections_content:
            filters['name'] = did['name']
            for result in self.list_dids(scope=did['scope'], filters=filters, recursive=True,
                                         type=type, limit=limit, offset=offset, long=long, session=session):
                yield result

    if long:
        for scope, name, did_type, bytes, length in query.yield_per(5):
            yield {'scope': scope,
                   'name': name,
                   'did_type': str(did_type),
                   'bytes': bytes,
                   'length': length}
    else:
        for scope, name, did_type, bytes, length in query.yield_per(5):
            yield name
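# Illustrative usage sketch (an editor's addition, assuming the pre-FilterEngine list_dids above):
# 'handler' stands for whatever object exposes this method, and the name pattern and date are
# placeholders. '*' wildcards in string filters are translated into SQL LIKE patterns.
def _wildcard_search_example(handler, scope, session):
    filters = {'name': 'user.jdoe.*.recon.*',           # '*' becomes '%' in the LIKE clause
               'created_after': '2021-01-01 00:00:00'}  # parsed by str_to_date()
    # long=True yields dicts with scope/name/did_type/bytes/length instead of bare names
    for entry in handler.list_dids(scope=scope, filters=filters, type='dataset',
                                   long=True, limit=10, session=session):
        print(entry['name'], entry['bytes'])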
def test_archive_removal_impact_on_constituents(rse_factory, did_factory, mock_scope, root_account, caches_mock, file_config_mock):
    [cache_region] = caches_mock
    rse_name, rse_id = rse_factory.make_mock_rse()
    scope = mock_scope
    account = root_account

    # Create 2 archives and 4 files:
    # - One only exists in the first archive
    # - One in both, plus another replica, which is not in an archive
    # - One in both, plus another replica, which is not in an archive; and this replica has expired
    # - One in both, plus another replica, which is not in an archive; and this replica has expired; but a replication rule exists on this second replica
    # Also add these files to datasets, one of which will be removed at the end
    nb_constituents = 4
    nb_c_outside_archive = nb_constituents - 1
    constituent_size = 2000
    archive_size = 1000
    uuid = str(generate_uuid())
    constituents = [{'scope': scope, 'name': 'lfn.%s.%d' % (uuid, i)} for i in range(nb_constituents)]
    did_factory.register_dids(constituents)
    c_first_archive_only, c_with_replica, c_with_expired_replica, c_with_replica_and_rule = constituents

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size, **c_with_replica)

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size,
                             tombstone=datetime.utcnow() - timedelta(days=1), **c_with_expired_replica)

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size,
                             tombstone=datetime.utcnow() - timedelta(days=1), **c_with_replica_and_rule)
    rule_core.add_rule(dids=[c_with_replica_and_rule], account=account, copies=1, rse_expression=rse_name,
                       grouping='NONE', weight=None, lifetime=None, locked=False, subscription_id=None)

    archive1, archive2 = [{'scope': scope, 'name': 'archive_%s.%d.zip' % (uuid, i)} for i in range(2)]
    replica_core.add_replica(rse_id=rse_id, bytes_=archive_size, account=account, **archive1)
    replica_core.add_replica(rse_id=rse_id, bytes_=archive_size, account=account, **archive2)
    did_core.attach_dids(dids=[{'scope': c['scope'], 'name': c['name'], 'bytes': constituent_size} for c in constituents],
                         account=account, **archive1)
    did_core.attach_dids(dids=[{'scope': c['scope'], 'name': c['name'], 'bytes': constituent_size}
                               for c in [c_with_replica, c_with_expired_replica, c_with_replica_and_rule]],
                         account=account, **archive2)

    dataset1, dataset2 = [{'scope': scope, 'name': 'dataset_%s.%i' % (uuid, i)} for i in range(2)]
    did_core.add_did(did_type='DATASET', account=account, **dataset1)
    did_core.attach_dids(dids=constituents, account=account, **dataset1)
    did_core.add_did(did_type='DATASET', account=account, **dataset2)
    did_core.attach_dids(dids=[c_first_archive_only, c_with_expired_replica], account=account, **dataset2)

    @read_session
    def __get_archive_contents_history_count(archive, session=None):
        return session.query(ConstituentAssociationHistory).filter_by(**archive).count()

    # Run the reaper the first time.
    # The expired non-archive replica of c_with_expired_replica must be removed,
    # but the did must not be removed, and it must still remain in the dataset because
    # it still has the replica from inside the archive.
    assert replica_core.get_replica(rse_id=rse_id, **c_with_expired_replica)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=2 * archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=2 * archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)

    for did in constituents + [archive1, archive2]:
        assert did_core.get_did(**did)
    for did in [archive1, archive2, c_with_replica, c_with_replica_and_rule]:
        assert replica_core.get_replica(rse_id=rse_id, **did)
    with pytest.raises(ReplicaNotFound):
        # The replica is only on the archive, not on the constituent
        replica_core.get_replica(rse_id=rse_id, **c_first_archive_only)
    with pytest.raises(ReplicaNotFound):
        # The replica outside the archive was removed by the reaper
        nb_c_outside_archive -= 1
        replica_core.get_replica(rse_id=rse_id, **c_with_expired_replica)
    # Compared to get_replica, list_replicas resolves archives, so it must return replicas for all files
    assert len(list(replica_core.list_replicas(dids=constituents))) == 4
    assert len(list(did_core.list_content(**dataset1))) == 4
    assert len(list(did_core.list_archive_content(**archive1))) == 4
    assert len(list(did_core.list_archive_content(**archive2))) == 3
    assert __get_archive_contents_history_count(archive1) == 0
    assert __get_archive_contents_history_count(archive2) == 0

    # Expire the first archive and run the reaper again.
    # The archive will be removed; and c_first_archive_only must be removed from datasets
    # and from the did table.
    replica_core.set_tombstone(rse_id=rse_id, tombstone=datetime.utcnow() - timedelta(days=1), **archive1)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=2 * archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=2 * archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)

    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**archive1)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_first_archive_only)
    assert len(list(replica_core.list_replicas(dids=constituents))) == 3
    assert len(list(did_core.list_content(**dataset1))) == 3
    assert len(list(did_core.list_archive_content(**archive1))) == 0
    assert len(list(did_core.list_archive_content(**archive2))) == 3
    assert __get_archive_contents_history_count(archive1) == 4
    assert __get_archive_contents_history_count(archive2) == 0

    # Expire the second archive replica and run the reaper another time.
    # c_with_expired_replica is removed because its external replica got removed at the previous step
    # and it now exists only inside the archive.
    # Dataset2 is closed below, so once it becomes empty it will be removed as well.
    did_core.set_status(open=False, **dataset2)
    replica_core.set_tombstone(rse_id=rse_id, tombstone=datetime.utcnow() - timedelta(days=1), **archive2)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)

    # The archive must be removed
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**archive2)
    # The DIDs which only existed in the archive are also removed
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_first_archive_only)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_with_expired_replica)
    # If the DID has a non-expired replica outside the archive without rules on it, the DID is not removed
    assert did_core.get_did(**c_with_replica)
    # If the DID has an expired replica outside the archive, but has rules on that replica, the DID is not removed
    assert did_core.get_did(**c_with_replica_and_rule)
    assert len(list(replica_core.list_replicas(dids=constituents))) == 2
    assert len(list(did_core.list_content(**dataset1))) == 2
    with pytest.raises(DataIdentifierNotFound):
        did_core.get_did(**dataset2)
    assert len(list(did_core.list_content(**dataset2))) == 0
    assert len(list(did_core.list_archive_content(**archive2))) == 0
    assert __get_archive_contents_history_count(archive1) == 4
    assert __get_archive_contents_history_count(archive2) == 3
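# Arithmetic sketch of the space accounting driving the test above (an editor's addition; it
# assumes the reaper tries to restore MinFreeSpace bytes of free space, which is how the test
# forces every deletable replica to be reaped). The numbers are taken from the test itself.
def _needed_free_space_example():
    archive_size = 1000
    constituent_size = 2000
    nb_c_outside_archive = 3
    # first reaper run: MinFreeSpace is set to exactly the used bytes, with only 1 byte free
    min_free_space = 2 * archive_size + nb_c_outside_archive * constituent_size  # 8000
    free = 1
    # bytes the reaper would aim to delete so that the free space reaches MinFreeSpace again
    needed = max(0, min_free_space - free)
    assert needed == 7999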
def list_dids(self, scope, filters, did_type='collection', ignore_case=False, limit=None,
              offset=None, long=False, recursive=False, ignore_dids=None, session=None):
    """
    Search data identifiers.

    :param scope: the scope name.
    :param filters: dictionary of attributes by which the results should be filtered.
    :param did_type: the type of the did: all(container, dataset, file), collection(dataset or container), dataset, container, file.
    :param ignore_case: ignore case distinctions.
    :param limit: limit number.
    :param offset: offset number.
    :param long: Long format option to display more information for each DID.
    :param session: The database session in use.
    :param recursive: Recursively list DIDs content.
    :param ignore_dids: List of DIDs to refrain from yielding.
    """
    if not ignore_dids:
        ignore_dids = set()

    # mapping for semantic <type> to a (set of) recognised DIDType(s).
    type_to_did_type_mapping = {
        'all': [DIDType.CONTAINER, DIDType.DATASET, DIDType.FILE],
        'collection': [DIDType.CONTAINER, DIDType.DATASET],
        'container': [DIDType.CONTAINER],
        'dataset': [DIDType.DATASET],
        'file': [DIDType.FILE]
    }

    # backwards compatibility for filters as single {}.
    if isinstance(filters, dict):
        filters = [filters]

    # for each or_group, make sure there is a mapped "did_type" filter.
    # if type maps to many DIDTypes, the corresponding or_group will be copied the required number of times
    # to satisfy all the logical possibilities.
    filters_tmp = []
    for or_group in filters:
        if 'type' not in or_group:
            or_group_type = did_type.lower()
        else:
            or_group_type = or_group.pop('type').lower()
        if or_group_type not in type_to_did_type_mapping.keys():
            raise exception.UnsupportedOperation('{} is not a valid type. Valid types are {}'.format(or_group_type, type_to_did_type_mapping.keys()))
        for mapped_did_type in type_to_did_type_mapping[or_group_type]:
            or_group['did_type'] = mapped_did_type
            filters_tmp.append(or_group.copy())
    filters = filters_tmp

    # instantiate fe and create sqla query
    fe = FilterEngine(filters, model_class=models.DataIdentifier)
    query = fe.create_sqla_query(
        additional_model_attributes=[
            models.DataIdentifier.scope,
            models.DataIdentifier.name,
            models.DataIdentifier.did_type,
            models.DataIdentifier.bytes,
            models.DataIdentifier.length
        ],
        additional_filters=[
            (models.DataIdentifier.scope, operator.eq, scope),
            (models.DataIdentifier.suppressed, operator.ne, true())
        ]
    )

    if limit:
        query = query.limit(limit)

    if recursive:
        from rucio.core.did import list_content

        # Get attached DIDs and save in list because query has to be finished before starting a new one in the recursion
        collections_content = []
        for did in query.yield_per(100):
            if (did.did_type == DIDType.CONTAINER or did.did_type == DIDType.DATASET):
                collections_content += [d for d in list_content(scope=did.scope, name=did.name)]

        # Replace any name filtering with recursed DID names.
        for did in collections_content:
            for or_group in filters:
                or_group['name'] = did['name']
            for result in self.list_dids(scope=did['scope'], filters=filters, recursive=True,
                                         did_type=did_type, limit=limit, offset=offset, long=long,
                                         ignore_dids=ignore_dids, session=session):
                yield result

    for did in query.yield_per(5):  # don't unpack this as it makes it dependent on query return order!
        if long:
            did_full = "{}:{}".format(did.scope, did.name)
            if did_full not in ignore_dids:  # concatenating results of OR clauses may contain duplicate DIDs if query result sets not mutually exclusive.
                ignore_dids.add(did_full)
                yield {
                    'scope': did.scope,
                    'name': did.name,
                    'did_type': str(did.did_type),
                    'bytes': did.bytes,
                    'length': did.length
                }
        else:
            did_full = "{}:{}".format(did.scope, did.name)
            if did_full not in ignore_dids:  # concatenating results of OR clauses may contain duplicate DIDs if query result sets not mutually exclusive.
                ignore_dids.add(did_full)
                yield did.name
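# Illustrative usage sketch (an editor's addition): the FilterEngine-based list_dids above accepts
# either a single dict or a list of dicts, where each dict is one OR-group of ANDed key/value
# pairs and may carry its own 'type'. 'handler', the scope and the name patterns are placeholders.
def _filter_engine_search_example(handler, scope, session):
    filters = [
        {'type': 'dataset', 'name': 'user.jdoe.*'},            # first OR-group
        {'type': 'container', 'name': 'user.jdoe.analysis_*'}  # second OR-group
    ]
    # duplicates across OR-groups are suppressed via the ignore_dids set
    for entry in handler.list_dids(scope=scope, filters=filters, long=True, session=session):
        print(entry['scope'], entry['name'], entry['did_type'])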
def list_dids(self, scope, filters, did_type='collection', ignore_case=False, limit=None,
              offset=None, long=False, recursive=False, ignore_dids=None, session=None):
    if not json_implemented(session=session):
        raise NotImplementedError

    if not ignore_dids:
        ignore_dids = set()

    # backwards compatibility for filters as single {}.
    if isinstance(filters, dict):
        filters = [filters]

    # instantiate fe and create sqla query, note that coercion to a model keyword
    # is not appropriate here as the filter words are stored in a single json column.
    fe = FilterEngine(filters, model_class=models.DidMeta, strict_coerce=False)
    query = fe.create_sqla_query(
        additional_model_attributes=[
            models.DidMeta.scope,
            models.DidMeta.name
        ],
        additional_filters=[(models.DidMeta.scope, operator.eq, scope)],
        json_column=models.DidMeta.meta)

    if limit:
        query = query.limit(limit)

    if recursive:
        from rucio.core.did import list_content

        # Get attached DIDs and save in list because query has to be finished before starting a new one in the recursion
        collections_content = []
        for did in query.yield_per(100):
            if (did.did_type == DIDType.CONTAINER or did.did_type == DIDType.DATASET):
                collections_content += [d for d in list_content(scope=did.scope, name=did.name)]

        # Replace any name filtering with recursed DID names.
        for did in collections_content:
            for or_group in filters:
                or_group['name'] = did['name']
            for result in self.list_dids(scope=did['scope'], filters=filters, recursive=True,
                                         did_type=did_type, limit=limit, offset=offset, long=long,
                                         ignore_dids=ignore_dids, session=session):
                yield result

    try:
        for did in query.yield_per(5):  # don't unpack this as it makes it dependent on query return order!
            if long:
                did_full = "{}:{}".format(did.scope, did.name)
                if did_full not in ignore_dids:  # concatenating results of OR clauses may contain duplicate DIDs if query result sets not mutually exclusive.
                    ignore_dids.add(did_full)
                    yield {
                        'scope': did.scope,
                        'name': did.name,
                        'did_type': None,  # not available with JSON plugin
                        'bytes': None,     # not available with JSON plugin
                        'length': None     # not available with JSON plugin
                    }
            else:
                did_full = "{}:{}".format(did.scope, did.name)
                if did_full not in ignore_dids:  # concatenating results of OR clauses may contain duplicate DIDs if query result sets not mutually exclusive.
                    ignore_dids.add(did_full)
                    yield did.name
    except DataError as e:
        raise exception.InvalidMetadata("Database query failed: {}. This can be raised when the datatype of a key is inconsistent between dids.".format(e))
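# Illustrative usage sketch (an editor's addition): the JSON-backed list_dids above matches filter
# keys against the DidMeta JSON column, so user-defined metadata keys can be queried directly.
# The plugin object, metadata key and value are placeholders for illustration only.
def _json_metadata_search_example(plugin, scope, session):
    # assumes metadata such as {'campaign': 'mc23'} was previously set on some DIDs in this scope
    filters = [{'campaign': 'mc23'}]
    # in long mode the JSON plugin yields did_type/bytes/length as None (not available)
    for entry in plugin.list_dids(scope=scope, filters=filters, long=True, session=session):
        print(entry['name'])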