Example #1
def list_content(scope, name):
    """
    List data identifier contents.

    :param scope: The scope name.
    :param name: The data identifier name.
    """
    return did.list_content(scope=scope, name=name)
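A minimal usage sketch for the wrapper above; the scope and dataset name are placeholder assumptions, and each yielded item is a dictionary describing one attached child DID:

# Hypothetical call; assumes a DID 'mock:dataset1' exists and the wrapper above is importable.
for child in list_content('mock', 'dataset1'):
    print(child['name'])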
Example #2
File: did.py Project: kbg/rucio
def list_content(scope, name):
    """
    List data identifier contents.

    :param scope: The scope name.
    :param name: The data identifier name.
    """
    return did.list_content(scope=scope, name=name)
Example #3
def list_content(scope, name):
    """
    List data identifier contents.

    :param scope: The scope name.
    :param name: The data identifier name.
    """

    scope = InternalScope(scope)

    dids = did.list_content(scope=scope, name=name)
    for d in dids:
        yield api_update_return_dict(d)
Example #4
def list_content(scope, name, vo='def', session=None):
    """
    List data identifier contents.

    :param scope: The scope name.
    :param name: The data identifier name.
    :param vo: The VO to act on.
    :param session: The database session in use.
    """

    scope = InternalScope(scope, vo=vo)

    dids = did.list_content(scope=scope, name=name, session=session)
    for d in dids:
        yield api_update_return_dict(d, session=session)
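A consumption sketch for the multi-VO variant above; the scope, name, and default VO 'def' are placeholder assumptions. InternalScope maps the external scope into the given VO before the core call, and api_update_return_dict converts internal types in each result back to their external representation:

# Hypothetical call against the default VO; 'mock:dataset1' is assumed to exist.
for child in list_content(scope='mock', name='dataset1', vo='def'):
    print(child['scope'], child['name'])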
Example #5
    def list_dids(self, scope, filters, type='collection', ignore_case=False, limit=None,
                  offset=None, long=False, recursive=False, session=None):
        """
        Search data identifiers

        :param scope: the scope name.
        :param filters: dictionary of attributes by which the results should be filtered.
        :param type: the type of the did: all(container, dataset, file), collection(dataset or container), dataset, container, file.
        :param ignore_case: ignore case distinctions.
        :param limit: limit number.
        :param offset: offset number.
        :param long: Long format option to display more information for each DID.
        :param session: The database session in use.
        :param recursive: Recursively list DIDs content.
        """
        types = ['all', 'collection', 'container', 'dataset', 'file']
        if type not in types:
            raise exception.UnsupportedOperation("Valid types are: %(types)s" % locals())

        query = session.query(models.DataIdentifier.scope,
                              models.DataIdentifier.name,
                              models.DataIdentifier.did_type,
                              models.DataIdentifier.bytes,
                              models.DataIdentifier.length).\
            filter(models.DataIdentifier.scope == scope)

        # Exclude suppressed dids
        query = query.filter(models.DataIdentifier.suppressed != true())

        if type == 'all':
            query = query.filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER,
                                     models.DataIdentifier.did_type == DIDType.DATASET,
                                     models.DataIdentifier.did_type == DIDType.FILE))
        elif type.lower() == 'collection':
            query = query.filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER,
                                     models.DataIdentifier.did_type == DIDType.DATASET))
        elif type.lower() == 'container':
            query = query.filter(models.DataIdentifier.did_type == DIDType.CONTAINER)
        elif type.lower() == 'dataset':
            query = query.filter(models.DataIdentifier.did_type == DIDType.DATASET)
        elif type.lower() == 'file':
            query = query.filter(models.DataIdentifier.did_type == DIDType.FILE)

        for (k, v) in filters.items():

            if k not in ['created_before', 'created_after', 'length.gt', 'length.lt', 'length.lte', 'length.gte', 'length'] \
                    and not hasattr(models.DataIdentifier, k):
                raise exception.KeyNotFound(k)

            if isinstance(v, string_types) and ('*' in v or '%' in v):
                if v in ('*', '%'):
                    continue
                # Translate shell-style '*' wildcards to SQL LIKE '%' and escape
                # literal underscores so they do not act as single-character wildcards;
                # the resulting pattern is the same on every dialect.
                query = query.filter(getattr(models.DataIdentifier, k).
                                     like(v.replace('*', '%').replace('_', '\\_'), escape='\\'))
            elif k == 'created_before':
                created_before = str_to_date(v)
                query = query.filter(models.DataIdentifier.created_at <= created_before)
            elif k == 'created_after':
                created_after = str_to_date(v)
                query = query.filter(models.DataIdentifier.created_at >= created_after)
            elif k == 'guid':
                query = query.filter_by(guid=v).\
                    with_hint(models.ReplicaLock, "INDEX(DIDS_GUIDS_IDX)", 'oracle')
            elif k == 'length.gt':
                query = query.filter(models.DataIdentifier.length > v)
            elif k == 'length.lt':
                query = query.filter(models.DataIdentifier.length < v)
            elif k == 'length.gte':
                query = query.filter(models.DataIdentifier.length >= v)
            elif k == 'length.lte':
                query = query.filter(models.DataIdentifier.length <= v)
            elif k == 'length':
                query = query.filter(models.DataIdentifier.length == v)
            else:
                query = query.filter(getattr(models.DataIdentifier, k) == v)

        if 'name' in filters:
            if '*' in filters['name']:
                query = query.\
                    with_hint(models.DataIdentifier, "NO_INDEX(dids(SCOPE,NAME))", 'oracle')
            else:
                query = query.\
                    with_hint(models.DataIdentifier, "INDEX(DIDS DIDS_PK)", 'oracle')

        if limit:
            query = query.limit(limit)

        if recursive:
            # Get attached DIDs and save them in a list, because the query has to be finished before starting a new one in the recursion
            collections_content = []
            parent_scope = scope

            from rucio.core.did import list_content

            for scope, name, did_type, bytes, length in query.yield_per(100):
                if (did_type == DIDType.CONTAINER or did_type == DIDType.DATASET):
                    collections_content += [did for did in list_content(scope=scope, name=name)]

            # List DIDs again to use filter
            for did in collections_content:
                filters['name'] = did['name']
                for result in self.list_dids(scope=did['scope'], filters=filters, recursive=True, type=type, limit=limit, offset=offset, long=long, session=session):
                    yield result

        if long:
            for scope, name, did_type, bytes, length in query.yield_per(5):
                yield {'scope': scope,
                       'name': name,
                       'did_type': str(did_type),
                       'bytes': bytes,
                       'length': length}
        else:
            for scope, name, did_type, bytes, length in query.yield_per(5):
                yield name
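A sketch of how a caller might build the filters dictionary for the method above; handler and session stand in for an instance of this class and an open database session, and the filter values are invented. Keys must be either one of the explicitly handled special keys ('created_before', 'created_after', 'guid', 'length', 'length.gt', 'length.lt', 'length.gte', 'length.lte') or a column of models.DataIdentifier; '*' in string values is translated into a SQL LIKE wildcard:

# Hypothetical filter set: dataset names starting with 'data17', created in
# 2020 or later, with at least 10 attached files.
filters = {
    'name': 'data17*',                       # '*' becomes '%' in the LIKE clause
    'created_after': '2020-01-01 00:00:00',  # parsed by str_to_date
    'length.gte': 10,
}
for name in handler.list_dids(scope='mock', filters=filters, type='dataset', session=session):
    print(name)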
Example #6
def test_archive_removal_impact_on_constituents(rse_factory, did_factory, mock_scope, root_account, caches_mock, file_config_mock):
    [cache_region] = caches_mock
    rse_name, rse_id = rse_factory.make_mock_rse()
    scope = mock_scope
    account = root_account

    # Create 2 archives and 4 files:
    # - One only exists in the first archive
    # - One in both, plus another replica, which is not in an archive
    # - One in both, plus another replica, which is not in an archive; and this replica has expired
    # - One in both, plus another replica, which is not in an archive; and this replica has expired; but a replication rule exists on this second replica
    # Also add these files to datasets, one of which will be removed at the end
    nb_constituents = 4
    nb_c_outside_archive = nb_constituents - 1
    constituent_size = 2000
    archive_size = 1000
    uuid = str(generate_uuid())
    constituents = [{'scope': scope, 'name': 'lfn.%s.%d' % (uuid, i)} for i in range(nb_constituents)]
    did_factory.register_dids(constituents)
    c_first_archive_only, c_with_replica, c_with_expired_replica, c_with_replica_and_rule = constituents

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size, **c_with_replica)

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size,
                             tombstone=datetime.utcnow() - timedelta(days=1), **c_with_expired_replica)

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size,
                             tombstone=datetime.utcnow() - timedelta(days=1), **c_with_replica_and_rule)
    rule_core.add_rule(dids=[c_with_replica_and_rule], account=account, copies=1, rse_expression=rse_name, grouping='NONE',
                       weight=None, lifetime=None, locked=False, subscription_id=None)

    archive1, archive2 = [{'scope': scope, 'name': 'archive_%s.%d.zip' % (uuid, i)} for i in range(2)]
    replica_core.add_replica(rse_id=rse_id, bytes_=archive_size, account=account, **archive1)
    replica_core.add_replica(rse_id=rse_id, bytes_=archive_size, account=account, **archive2)
    did_core.attach_dids(dids=[{'scope': c['scope'], 'name': c['name'], 'bytes': constituent_size} for c in constituents],
                         account=account, **archive1)
    did_core.attach_dids(dids=[{'scope': c['scope'], 'name': c['name'], 'bytes': constituent_size} for c in [c_with_replica, c_with_expired_replica, c_with_replica_and_rule]],
                         account=account, **archive2)

    dataset1, dataset2 = [{'scope': scope, 'name': 'dataset_%s.%i' % (uuid, i)} for i in range(2)]
    did_core.add_did(did_type='DATASET', account=account, **dataset1)
    did_core.attach_dids(dids=constituents, account=account, **dataset1)
    did_core.add_did(did_type='DATASET', account=account, **dataset2)
    did_core.attach_dids(dids=[c_first_archive_only, c_with_expired_replica], account=account, **dataset2)

    @read_session
    def __get_archive_contents_history_count(archive, session=None):
        return session.query(ConstituentAssociationHistory).filter_by(**archive).count()

    # Run reaper the first time.
    # the expired non-archive replica of c_with_expired_replica must be removed,
    # but the did must not be removed, and it must still remain in the dataset because
    # it still has the replica from inside the archive
    assert replica_core.get_replica(rse_id=rse_id, **c_with_expired_replica)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=2 * archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=2 * archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    for did in constituents + [archive1, archive2]:
        assert did_core.get_did(**did)
    for did in [archive1, archive2, c_with_replica, c_with_replica_and_rule]:
        assert replica_core.get_replica(rse_id=rse_id, **did)
    with pytest.raises(ReplicaNotFound):
        # The replica is only on the archive, not on the constituent
        replica_core.get_replica(rse_id=rse_id, **c_first_archive_only)
    # The replica outside the archive was removed by the reaper
    nb_c_outside_archive -= 1
    with pytest.raises(ReplicaNotFound):
        replica_core.get_replica(rse_id=rse_id, **c_with_expired_replica)
    # Compared to get_replica, list_replicas resolves archives and must return replicas for all files
    assert len(list(replica_core.list_replicas(dids=constituents))) == 4
    assert len(list(did_core.list_content(**dataset1))) == 4
    assert len(list(did_core.list_archive_content(**archive1))) == 4
    assert len(list(did_core.list_archive_content(**archive2))) == 3
    assert __get_archive_contents_history_count(archive1) == 0
    assert __get_archive_contents_history_count(archive2) == 0

    # Expire the first archive and run reaper again
    # the archive will be removed; and c_first_archive_only must be removed from datasets
    # and from the did table.
    replica_core.set_tombstone(rse_id=rse_id, tombstone=datetime.utcnow() - timedelta(days=1), **archive1)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=2 * archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=2 * archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**archive1)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_first_archive_only)
    assert len(list(replica_core.list_replicas(dids=constituents))) == 3
    assert len(list(did_core.list_content(**dataset1))) == 3
    assert len(list(did_core.list_archive_content(**archive1))) == 0
    assert len(list(did_core.list_archive_content(**archive2))) == 3
    assert __get_archive_contents_history_count(archive1) == 4
    assert __get_archive_contents_history_count(archive2) == 0

    # Expire the second archive replica and run reaper another time
    # c_with_expired_replica is removed because its external replica got removed at previous step
    # and it exists only inside the archive now.
    # If not open, Dataset2 will be removed because it will be empty.
    did_core.set_status(open=False, **dataset2)
    replica_core.set_tombstone(rse_id=rse_id, tombstone=datetime.utcnow() - timedelta(days=1), **archive2)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    # The archive must be removed
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**archive2)
    # The DIDs which only existed in the archive are also removed
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_first_archive_only)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_with_expired_replica)
    # If the DID has a non-expired replica outside the archive without rules on it, the DID is not removed
    assert did_core.get_did(**c_with_replica)
    # If the DID has an expired replica outside the archive, but has rules on that replica, the DID is not removed
    assert did_core.get_did(**c_with_replica_and_rule)
    assert len(list(replica_core.list_replicas(dids=constituents))) == 2
    assert len(list(did_core.list_content(**dataset1))) == 2
    with pytest.raises(DataIdentifierNotFound):
        did_core.get_did(**dataset2)
    assert len(list(did_core.list_content(**dataset2))) == 0
    assert len(list(did_core.list_archive_content(**archive2))) == 0
    assert __get_archive_contents_history_count(archive1) == 4
    assert __get_archive_contents_history_count(archive2) == 3
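The @read_session helper in this test is the usual rucio pattern for injecting a database session into a local query function. A minimal sketch of the same pattern against the live (non-history) table; ConstituentAssociation is assumed here to be the model backing current archive contents:

@read_session
def __get_archive_contents_count(archive, session=None):
    # Hypothetical counterpart of the history counter above, counting
    # rows still present in the archive-contents table for this archive.
    return session.query(ConstituentAssociation).filter_by(**archive).count()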
Example #7
    def list_dids(self, scope, filters, did_type='collection', ignore_case=False, limit=None,
                  offset=None, long=False, recursive=False, ignore_dids=None, session=None):
        """
        Search data identifiers.

        :param scope: the scope name.
        :param filters: dictionary of attributes by which the results should be filtered.
        :param did_type: the type of the did: all(container, dataset, file), collection(dataset or container), dataset, container, file.
        :param ignore_case: ignore case distinctions.
        :param limit: limit number.
        :param offset: offset number.
        :param long: Long format option to display more information for each DID.
        :param session: The database session in use.
        :param recursive: Recursively list DIDs content.
        :param ignore_dids: List of DIDs to refrain from yielding.
        """
        if not ignore_dids:
            ignore_dids = set()

        # mapping for semantic <type> to a (set of) recognised DIDType(s).
        type_to_did_type_mapping = {
            'all': [DIDType.CONTAINER, DIDType.DATASET, DIDType.FILE],
            'collection': [DIDType.CONTAINER, DIDType.DATASET],
            'container': [DIDType.CONTAINER],
            'dataset': [DIDType.DATASET],
            'file': [DIDType.FILE]
        }

        # backwards compatibility for filters as single {}.
        if isinstance(filters, dict):
            filters = [filters]

        # for each or_group, make sure there is a mapped "did_type" filter.
        # if type maps to many DIDTypes, the corresponding or_group will be copied the required number of times to satisfy all the logical possibilities.
        filters_tmp = []
        for or_group in filters:
            if 'type' not in or_group:
                or_group_type = did_type.lower()
            else:
                or_group_type = or_group.pop('type').lower()
            if or_group_type not in type_to_did_type_mapping.keys():
                raise exception.UnsupportedOperation('{} is not a valid type. Valid types are {}'.format(or_group_type, type_to_did_type_mapping.keys()))

            for mapped_did_type in type_to_did_type_mapping[or_group_type]:
                or_group['did_type'] = mapped_did_type
                filters_tmp.append(or_group.copy())
        filters = filters_tmp

        # instantiate fe and create sqla query
        fe = FilterEngine(filters, model_class=models.DataIdentifier)
        query = fe.create_sqla_query(
            additional_model_attributes=[
                models.DataIdentifier.scope,
                models.DataIdentifier.name,
                models.DataIdentifier.did_type,
                models.DataIdentifier.bytes,
                models.DataIdentifier.length
            ], additional_filters=[
                (models.DataIdentifier.scope, operator.eq, scope),
                (models.DataIdentifier.suppressed, operator.ne, true())
            ]
        )

        if limit:
            query = query.limit(limit)
        if recursive:
            from rucio.core.did import list_content

            # Get attached DIDs and save them in a list, because the query has to be finished before starting a new one in the recursion
            collections_content = []
            for did in query.yield_per(100):
                if (did.did_type == DIDType.CONTAINER or did.did_type == DIDType.DATASET):
                    collections_content += [d for d in list_content(scope=did.scope, name=did.name)]

            # Replace any name filtering with recursed DID names.
            for did in collections_content:
                for or_group in filters:
                    or_group['name'] = did['name']
                for result in self.list_dids(scope=did['scope'], filters=filters, recursive=True, did_type=did_type, limit=limit, offset=offset,
                                             long=long, ignore_dids=ignore_dids, session=session):
                    yield result

        for did in query.yield_per(5):                  # don't unpack this as it makes it dependent on query return order!
            if long:
                did_full = "{}:{}".format(did.scope, did.name)
                if did_full not in ignore_dids:         # concatenating results of OR clauses may contain duplicate DIDs if query result sets not mutually exclusive.
                    ignore_dids.add(did_full)
                    yield {
                        'scope': did.scope,
                        'name': did.name,
                        'did_type': str(did.did_type),
                        'bytes': did.bytes,
                        'length': did.length
                    }
            else:
                did_full = "{}:{}".format(did.scope, did.name)
                if did_full not in ignore_dids:         # concatenating results of OR clauses may contain duplicate DIDs if query result sets not mutually exclusive.
                    ignore_dids.add(did_full)
                    yield did.name
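Unlike Example #5, filters here may be a list of dictionaries that the FilterEngine combines as OR groups, with the keys inside each dictionary ANDed together; a per-group 'type' key is popped off and expanded into the mapped DIDType values. A hedged sketch, where handler, session, and the filter values are assumptions:

# Hypothetical OR-group filters: (name LIKE 'run2018%' AND dataset)
# OR (name LIKE 'run2017%' AND container).
filters = [
    {'name': 'run2018*', 'type': 'dataset'},
    {'name': 'run2017*', 'type': 'container'},
]
for name in handler.list_dids(scope='mock', filters=filters, session=session):
    print(name)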
Example #8
    def list_dids(self,
                  scope,
                  filters,
                  did_type='collection',
                  ignore_case=False,
                  limit=None,
                  offset=None,
                  long=False,
                  recursive=False,
                  ignore_dids=None,
                  session=None):
        if not json_implemented(session=session):
            raise NotImplementedError

        if not ignore_dids:
            ignore_dids = set()

        # backwards compatibility for filters as single {}.
        if isinstance(filters, dict):
            filters = [filters]

        # instantiate fe and create sqla query; note that coercion to a model keyword
        # is not appropriate here, as the filter words are stored in a single JSON column.
        fe = FilterEngine(filters, model_class=models.DidMeta, strict_coerce=False)
        query = fe.create_sqla_query(
            additional_model_attributes=[models.DidMeta.scope, models.DidMeta.name],
            additional_filters=[(models.DidMeta.scope, operator.eq, scope)],
            json_column=models.DidMeta.meta)

        if limit:
            query = query.limit(limit)
        if recursive:
            from rucio.core.did import list_content

            # Get attached DIDs and save them in a list, because the query has to be finished before starting a new one in the recursion
            collections_content = []
            for did in query.yield_per(100):
                if did.did_type == DIDType.CONTAINER or did.did_type == DIDType.DATASET:
                    collections_content += [d for d in list_content(scope=did.scope, name=did.name)]

            # Replace any name filtering with recursed DID names.
            for did in collections_content:
                for or_group in filters:
                    or_group['name'] = did['name']
                for result in self.list_dids(scope=did['scope'],
                                             filters=filters,
                                             recursive=True,
                                             did_type=did_type,
                                             limit=limit,
                                             offset=offset,
                                             long=long,
                                             ignore_dids=ignore_dids,
                                             session=session):
                    yield result

        try:
            for did in query.yield_per(5):  # don't unpack this as it makes it dependent on query return order!
                if long:
                    did_full = "{}:{}".format(did.scope, did.name)
                    if did_full not in ignore_dids:  # concatenating results of OR clauses may contain duplicate DIDs if query result sets not mutually exclusive.
                        ignore_dids.add(did_full)
                        yield {
                            'scope': did.scope,
                            'name': did.name,
                            'did_type': None,  # not available with JSON plugin
                            'bytes': None,  # not available with JSON plugin
                            'length': None  # not available with JSON plugin
                        }
                else:
                    did_full = "{}:{}".format(did.scope, did.name)
                    if did_full not in ignore_dids:  # concatenating results of OR clauses may contain duplicate DIDs if query result sets not mutually exclusive.
                        ignore_dids.add(did_full)
                        yield did.name
        except DataError as e:
            raise exception.InvalidMetadata(
                "Database query failed: {}. This can be raised when the datatype of a key is inconsistent between dids.".format(e))
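This JSON-backed variant matches filter keys against the models.DidMeta.meta column rather than against DataIdentifier columns, which is why did_type, bytes, and length come back as None in long format. A sketch under assumed names (json_plugin, the metadata keys, and session are all hypothetical):

# Free-form metadata filter; 'campaign' and 'energy' are user-defined keys
# stored in the JSON meta column, not model attributes.
filters = [{'campaign': 'mc23', 'energy': 13600}]
for name in json_plugin.list_dids(scope='mock', filters=filters, session=session):
    print(name)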