Example #1
def test_reaper(vo, caches_mock, file_config_mock):
    """ REAPER (DAEMON): Test the reaper daemon."""
    [cache_region] = caches_mock
    scope = InternalScope('data13_hip', vo=vo)

    nb_files = 250
    file_size = 200  # bytes
    rse_name, rse_id, dids = __add_test_rse_and_replicas(vo=vo, scope=scope, rse_name=rse_name_generator(),
                                                         names=['lfn' + generate_uuid() for _ in range(nb_files)], file_size=file_size)

    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=50 * file_size)
    assert len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))) == nb_files

    # Check first if the reaper does not delete anything if no space is needed
    cache_region.invalidate()
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=323000000000)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    assert len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))) == nb_files

    # Now put it over threshold and delete
    cache_region.invalidate()
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    assert len(list(replica_core.list_replicas(dids, rse_expression=rse_name))) == 200
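The arithmetic behind the final assertion, as a rough sketch (assuming the reaper frees just enough replicas to restore MinFreeSpace; the daemon's actual batching may differ):

# Hedged sketch, not part of the test itself.
nb_files, file_size, free = 250, 200, 1
min_free_space = 50 * file_size
needed = min_free_space - free             # bytes the reaper must free
files_to_delete = -(-needed // file_size)  # ceiling division -> 50 files
assert nb_files - files_to_delete == 200   # matches the assertion above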
Example #2
def test_reaper_multi_vo(vo, second_vo, scope_factory, caches_mock, file_config_mock):
    """ REAPER (DAEMON): Test the reaper daemon with multiple vo."""
    [cache_region] = caches_mock
    new_vo = second_vo
    _, [scope_tst, scope_new] = scope_factory(vos=[vo, new_vo])

    nb_files = 250
    file_size = 200  # bytes
    rse1_name, rse1_id, dids1 = __add_test_rse_and_replicas(vo=vo, scope=scope_tst, rse_name=rse_name_generator(),
                                                            names=['lfn' + generate_uuid() for _ in range(nb_files)], file_size=file_size)
    rse2_name, rse2_id, dids2 = __add_test_rse_and_replicas(vo=new_vo, scope=scope_new, rse_name=rse_name_generator(),
                                                            names=['lfn' + generate_uuid() for _ in range(nb_files)], file_size=file_size)

    rse_core.set_rse_limits(rse_id=rse1_id, name='MinFreeSpace', value=50 * file_size)
    rse_core.set_rse_limits(rse_id=rse2_id, name='MinFreeSpace', value=50 * file_size)

    # Check we reap all VOs by default
    cache_region.invalidate()
    rse_core.set_rse_usage(rse_id=rse1_id, source='storage', used=nb_files * file_size, free=1)
    rse_core.set_rse_usage(rse_id=rse2_id, source='storage', used=nb_files * file_size, free=1)
    both_rses = '%s|%s' % (rse1_name, rse2_name)
    reaper(once=True, rses=[], include_rses=both_rses, exclude_rses=None)
    reaper(once=True, rses=[], include_rses=both_rses, exclude_rses=None)
    assert len(list(replica_core.list_replicas(dids=dids1, rse_expression=both_rses))) == 200
    assert len(list(replica_core.list_replicas(dids=dids2, rse_expression=both_rses))) == 200
Example #3
    def test_add_list_bad_replicas(self):
        """ REPLICA (CORE): Add bad replicas and list them"""
        tmp_scope = 'mock'
        nbfiles = 5
        # Adding replicas to deterministic RSE
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for i in range(nbfiles)]
        rse_info = rsemgr.get_rse_info('MOCK')
        rse_id1 = rse_info['id']
        add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True)

        # Listing replicas on deterministic RSE
        replicas = []
        list_rep = []
        for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
            replicas.extend(replica['rses']['MOCK'])
            list_rep.append(replica)
        r = declare_bad_file_replicas(replicas, 'This is a good reason', 'root')
        assert_equal(r, {})
        bad_replicas = list_bad_replicas()
        nbbadrep = 0
        for rep in list_rep:
            for badrep in bad_replicas:
                if badrep['rse_id'] == rse_id1:
                    if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                        nbbadrep += 1
        assert_equal(len(replicas), nbbadrep)

        # Adding replicas to non-deterministic RSE
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb',
                  'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()), 'meta': {'events': 10}} for i in range(nbfiles)]
        rse_info = rsemgr.get_rse_info('MOCK2')
        rse_id2 = rse_info['id']
        add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)

        # Listing replicas on non-deterministic RSE
        replicas = []
        list_rep = []
        for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
            replicas.extend(replica['rses']['MOCK2'])
            list_rep.append(replica)
        r = declare_bad_file_replicas(replicas, 'This is a good reason', 'root')
        assert_equal(r, {})
        bad_replicas = list_bad_replicas()
        nbbadrep = 0
        for rep in list_rep:
            for badrep in bad_replicas:
                if badrep['rse_id'] == rse_id2:
                    if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                        nbbadrep += 1
        assert_equal(len(replicas), nbbadrep)

        # Now adding non-existing bad replicas
        files = ['srm://mock2.com/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()), ]
        r = declare_bad_file_replicas(files, 'This is a good reason', 'root')
        output = ['%s Unknown replica' % rep for rep in files]
        assert_equal(r, {'MOCK2': output})
Example #4
def test_add_list_bad_replicas(rse_factory, mock_scope, root_account):
    """ REPLICA (CORE): Add bad replicas and list them"""

    nbfiles = 5
    # Adding replicas to deterministic RSE
    _, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    files = [{'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
    add_replicas(rse_id=rse1_id, files=files, account=root_account, ignore_availability=True)

    # Listing replicas on deterministic RSE
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        replicas.extend(replica['rses'][rse1_id])
        list_rep.append(replica)
    r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account)
    assert r == {}
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse1_id:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep

    # Adding replicas to non-deterministic RSE
    _, rse2_id = rse_factory.make_srm_rse(deterministic=False)
    files = [{'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb',
              'pfn': 'srm://%s.cern.ch/srm/managerv2?SFN=/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()), 'meta': {'events': 10}} for _ in range(nbfiles)]
    add_replicas(rse_id=rse2_id, files=files, account=root_account, ignore_availability=True)

    # Listing replicas on non-deterministic RSE
    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        replicas.extend(replica['rses'][rse2_id])
        list_rep.append(replica)
    r = declare_bad_file_replicas(replicas, 'This is a good reason', root_account)
    assert r == {}
    bad_replicas = list_bad_replicas()
    nbbadrep = 0
    for rep in list_rep:
        for badrep in bad_replicas:
            if badrep['rse_id'] == rse2_id:
                if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                    nbbadrep += 1
    assert len(replicas) == nbbadrep

    # Now adding non-existing bad replicas
    files = ['srm://%s.cern.ch/test/%s/%s' % (rse2_id, mock_scope, generate_uuid()), ]
    r = declare_bad_file_replicas(files, 'This is a good reason', root_account)
    output = ['%s Unknown replica' % rep for rep in files]
    assert r == {rse2_id: output}
Example #5
def test_get_bad_replicas_backlog(rse_factory, mock_scope, root_account, file_config_mock):
    """ REPLICA (CORE): Check the behaviour of the necromancer in case of backlog on an RSE"""

    # Run necromancer once
    necromancer_run(threads=1, bulk=10000, once=True)

    nbfiles1 = 100
    nbfiles2 = 20
    # Adding replicas to deterministic RSE
    rse1, rse1_id = rse_factory.make_srm_rse(deterministic=True)
    _, rse2_id = rse_factory.make_srm_rse(deterministic=True)

    # Create bad replicas on rse1
    files = [{'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles1)]
    add_replicas(rse_id=rse1_id, files=files, account=root_account, ignore_availability=True)

    replicas = []
    list_rep = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        replicas.extend(replica['rses'][rse1_id])
        list_rep.append({'scope': replica['scope'], 'name': replica['name'], 'rse': rse1, 'rse_id': rse1_id})
    res = declare_bad_file_replicas(replicas, 'This is a good reason', root_account)
    assert res == {}

    result = get_bad_replicas_backlog(force_refresh=True)
    assert rse1_id in result
    assert result[rse1_id] == nbfiles1

    # Create more bad replicas on rse2
    files = [{'scope': mock_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles2)]
    add_replicas(rse_id=rse2_id, files=files, account=root_account, ignore_availability=True)

    repl = []
    for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
        repl.extend(replica['rses'][rse2_id])
    res = declare_bad_file_replicas(repl, 'This is a good reason', root_account)
    assert res == {}

    # List bad replicas on rse1
    bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}])
    assert len(bad_replicas) == nbfiles1
    for rep in bad_replicas:
        assert rep in list_rep

    # Run necromancer once, all the files on RSE2 should be gone, 80 files should stay on RSE1
    get_bad_replicas_backlog(force_refresh=True)
    necromancer_run(threads=1, bulk=20, once=True)

    bad_replicas = list_bad_replicas(rses=[{'id': rse1_id}, {'id': rse2_id}])
    assert len(bad_replicas) == 80
    for rep in bad_replicas:
        assert rep['rse_id'] == rse1_id
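A back-of-the-envelope check of the numbers asserted above (this assumes the bulk limit is applied per RSE within a single necromancer pass, which is what the assertions imply):

# Hedged sketch of the expected bookkeeping.
nbfiles1, nbfiles2, bulk = 100, 20, 20
remaining_rse1 = nbfiles1 - bulk   # 80 bad replicas left on RSE1
remaining_rse2 = nbfiles2 - bulk   # none left on RSE2
assert remaining_rse1 + remaining_rse2 == 80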
Example #6
def test_archive_of_deleted_dids(vo, did_factory, root_account, core_config_mock, caches_mock, file_config_mock):
    """ REAPER (DAEMON): Test that the options to keep the did and content history work."""
    [reaper_cache_region, _config_cache_region, _replica_cache_region] = caches_mock
    scope = InternalScope('data13_hip', vo=vo)
    account = root_account

    nb_files = 10
    file_size = 200  # bytes
    rse_name, rse_id, dids = __add_test_rse_and_replicas(vo=vo, scope=scope, rse_name=rse_name_generator(),
                                                         names=['lfn' + generate_uuid() for _ in range(nb_files)], file_size=file_size, epoch_tombstone=True)
    dataset = did_factory.make_dataset()
    did_core.attach_dids(dids=dids, account=account, **dataset)

    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=50 * file_size)
    assert len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))) == nb_files

    # With greedy mode enabled, the reaper deletes everything even though no space is needed
    reaper_cache_region.invalidate()
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=323000000000)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None, greedy=True)
    assert len(list(replica_core.list_replicas(dids=dids, rse_expression=rse_name))) == 0

    file_clause = []
    for did in dids:
        file_clause.append(and_(models.DeletedDataIdentifier.scope == did['scope'], models.DeletedDataIdentifier.name == did['name']))

    session = get_session()
    query = session.query(models.DeletedDataIdentifier.scope,
                          models.DeletedDataIdentifier.name,
                          models.DeletedDataIdentifier.did_type).\
        filter(or_(*file_clause))

    deleted_dids = list()
    for did in query.all():
        print(did)
        deleted_dids.append(did)
    assert len(deleted_dids) == len(dids)

    query = session.query(models.DataIdentifierAssociationHistory.child_scope,
                          models.DataIdentifierAssociationHistory.child_name,
                          models.DataIdentifierAssociationHistory.child_type).\
        filter(and_(models.DataIdentifierAssociationHistory.scope == dataset['scope'], models.DataIdentifierAssociationHistory.name == dataset['name']))

    deleted_dids = list()
    for did in query.all():
        print(did)
        deleted_dids.append(did)
    assert len(deleted_dids) == len(dids)
Example #7
def necromancer(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash, identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    param worker_number: The number of the worker (thread).
    param total_number: The total number of workers (threads).
    chunk_size: The chunk of the size to process.
    once: To run only once
    """
    sleep_time = 60
    while not graceful_stop.is_set():
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=chunk_size, worker_number=worker_number, total_workers=total_workers)
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info('Thread [%i/%i] : Working on %s:%s on %s' % (worker_number, total_workers, scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                if (not rep[0]['rses']) or (list(rep[0]['rses'].keys()) == [rse]):
                    logging.info('Thread [%i/%i] : File %s:%s has no other replicas, it will be marked as lost' % (worker_number, total_workers, scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
                else:
                    logging.info('Thread [%i/%i] : File %s:%s can be recovered. Available sources : %s' % (worker_number, total_workers, scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
            logging.info('Thread [%i/%i] : It took %s seconds to process %s replicas' % (worker_number, total_workers, str(time.time() - stime), str(len(replicas))))
        except Exception:
            # close the otherwise-unterminated try block so the snippet parses
            logging.exception('Thread [%i/%i] : Unexpected error' % (worker_number, total_workers))
        if once:
            break
        time.sleep(sleep_time)
Example #8
def list_replicas(dids,
                  schemes=None,
                  unavailable=False,
                  request_id=None,
                  ignore_availability=True,
                  all_states=False,
                  rse_expression=None):
    """
    List file replicas for a list of data identifiers.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param rse_expression: The RSE expression to restrict replicas on a set of RSEs.
    """
    validate_schema(name='r_dids', obj=dids)
    return replica.list_replicas(dids=dids,
                                 schemes=schemes,
                                 unavailable=unavailable,
                                 request_id=request_id,
                                 ignore_availability=ignore_availability,
                                 all_states=all_states,
                                 rse_expression=rse_expression)
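A minimal usage sketch for the gateway function above; the DID and the RSE expression are invented for illustration:

dids = [{'scope': 'mock', 'name': 'file_abc123'}]  # hypothetical DID
for rep in list_replicas(dids=dids, schemes=['srm'], rse_expression='MOCK|MOCK3'):
    # each result carries the DID plus an 'rses' mapping of RSE -> list of PFNs
    print(rep['scope'], rep['name'], rep['rses'])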
Example #9
    def test_add_list_replicas(self):
        """ REPLICA (CORE): Add and list file replicas """
        tmp_scope = 'mock'
        nbfiles = 13
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1,
                  'adler32': '0cc737eb', 'meta': {'events': 10}} for _ in range(nbfiles)]
        rses = ['MOCK', 'MOCK3']
        for rse in rses:
            add_replicas(rse=rse, files=files, account='root', ignore_availability=True)

        replica_cpt = 0
        for _ in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
            replica_cpt += 1

        assert_equal(nbfiles, replica_cpt)
Example #10
def list_replicas(dids,
                  schemes=None,
                  unavailable=False,
                  request_id=None,
                  ignore_availability=True,
                  all_states=False,
                  rse_expression=None,
                  client_location=None,
                  domain=None):
    """
    List file replicas for a list of data identifiers.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param rse_expression: The RSE expression to restrict replicas on a set of RSEs.
    :param client_location: Client location dictionary for PFN modification {'ip', 'fqdn', 'site'}
    :param domain: The network domain for the call, either None, 'wan' or 'lan'. Compatibility fallback: None falls back to 'wan'.
    """
    validate_schema(name='r_dids', obj=dids)
    return replica.list_replicas(dids=dids,
                                 schemes=schemes,
                                 unavailable=unavailable,
                                 request_id=request_id,
                                 ignore_availability=ignore_availability,
                                 all_states=all_states,
                                 rse_expression=rse_expression,
                                 client_location=client_location,
                                 domain=domain)
Example #11
    def cleanup(self, session=None):
        if not self.created_dids:
            return

        # Cleanup Transfers
        session.query(models.Source).filter(or_(and_(models.Source.scope == did['scope'],
                                                     models.Source.name == did['name'])
                                                for did in self.created_dids)).delete(synchronize_session=False)
        session.query(models.Request).filter(or_(and_(models.Request.scope == did['scope'],
                                                      models.Request.name == did['name'])
                                                 for did in self.created_dids)).delete(synchronize_session=False)

        # Cleanup Locks Rules
        query = session.query(models.ReplicationRule.id).filter(or_(and_(models.ReplicationRule.scope == did['scope'],
                                                                         models.ReplicationRule.name == did['name'])
                                                                    for did in self.created_dids))
        for rule_id, in query:
            rule_core.delete_rule(rule_id, session=session)

        # Cleanup Replicas and Parent Datasets
        dids_by_rse = {}
        replicas = list(replica_core.list_replicas(self.created_dids, all_states=True, session=session))
        for replica in replicas:
            for rse_id in replica['rses']:
                dids_by_rse.setdefault(rse_id, []).append({'scope': replica['scope'], 'name': replica['name']})
        for rse_id, dids in dids_by_rse.items():
            replica_core.delete_replicas(rse_id=rse_id, files=dids, session=session)
Example #12
    def test_update_replicas_paths(self):
        """ REPLICA (CORE): Force update the replica path """
        tmp_scope = 'mock'
        nbfiles = 5
        rse_info = rsemgr.get_rse_info('MOCK')
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(),
                  'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/does/not/really/matter/where',
                  'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10},
                  'rse_id': rse_info['id'],
                  'path': '/does/not/really/matter/where'} for _ in range(nbfiles)]
        add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)
        update_replicas_paths(files)
        for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
            # force the changed string - if we look it up from the DB, then we're not testing anything :-D
            assert_equal(replica['rses']['MOCK2'][0],
                         'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/does/not/really/matter/where')
Example #13
def list_replicas(dids, schemes=None, unavailable=False, request_id=None,
                  ignore_availability=True, all_states=False, rse_expression=None,
                  client_location=None, domain=None, lifetime=None, issuer=None):
    """
    List file replicas for a list of data identifiers.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param rse_expression: The RSE expression to restrict replicas on a set of RSEs.
    :param client_location: Client location dictionary for PFN modification {'ip', 'fqdn', 'site'}
    :param domain: The network domain for the call, either None, 'wan' or 'lan'. Compatibility fallback: None falls back to 'wan'.
    :param lifetime: If supported, in seconds, restrict the lifetime of the replica PFN.
    :param issuer: The issuer account.
    """
    validate_schema(name='r_dids', obj=dids)

    # Allow selected authenticated users to retrieve signed URLs.
    # Unauthenticated users, or permission-less users will get the raw URL without the signature.
    sign_urls = False
    if permission.has_permission(issuer=issuer, action='get_signed_urls', kwargs={}):
        sign_urls = True

    return replica.list_replicas(dids=dids, schemes=schemes, unavailable=unavailable,
                                 request_id=request_id,
                                 ignore_availability=ignore_availability,
                                 all_states=all_states, rse_expression=rse_expression,
                                 client_location=client_location, domain=domain,
                                 sign_urls=sign_urls, lifetime=lifetime)
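An illustrative call into this variant; the issuer value is a placeholder. Only accounts passing the 'get_signed_urls' permission check receive signed PFNs, everyone else gets the raw URLs:

for rep in list_replicas(dids=[{'scope': 'mock', 'name': 'file_abc123'}],
                         lifetime=3600, issuer='root'):  # hypothetical issuer
    print(rep['rses'])  # PFNs carry signatures only for privileged issuers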
Example #14
def list_replicas(dids, schemes=None, unavailable=False, request_id=None, ignore_availability=True, all_states=False):
    """
    List file replicas for a list of data identifiers.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    """
    validate_schema(name='r_dids', obj=dids)
    return replica.list_replicas(dids=dids, schemes=schemes, unavailable=unavailable, request_id=request_id, ignore_availability=ignore_availability, all_states=all_states)
Example #15
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """

    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        hb = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])

        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=bulk, thread=hb['assign_thread'], total_threads=hb['nr_threads'])

            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))

                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                if (not rep[0]['rses']) or (list(rep[0]['rses'].keys()) == [rse]):
                    logging.info(prepend_str + 'File %s:%s has no other replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

                else:
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s' % (scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            # close the otherwise-unterminated try block so the snippet parses
            logging.exception(prepend_str + 'Unexpected error')
        if once:
            break
        graceful_stop.wait(sleep_time)
Example #16
def get_source_rse(scope, name, src_url):
    try:
        scheme = src_url.split(":")[0]
        replications = replica.list_replicas([{'scope': scope, 'name': name, 'type': DIDType.FILE}], schemes=[scheme], unavailable=True)
        for source in replications:
            for source_rse in source['rses']:
                for pfn in source['rses'][source_rse]:
                    if pfn == src_url:
                        return source_rse
        # cannot find matched surl
        logging.warning('Cannot get correct RSE for source url: %s' % src_url)
        return None
    except Exception:
        logging.error('Cannot get correct RSE for source url: %s(%s)' % (src_url, sys.exc_info()[1]))
        return None
Example #17
def test_reaper():
    """ REAPER2 (DAEMON): Test the reaper2 daemon."""
    if config_get_bool('common', 'multi_vo', raise_exception=False, default=False):
        vo = {'vo': config_get('client', 'vo', raise_exception=False, default='tst')}
    else:
        vo = {}

    rse_name = rse_name_generator()
    rse_id = rse_core.add_rse(rse_name, **vo)

    mock_protocol = {'scheme': 'MOCK',
                     'hostname': 'localhost',
                     'port': 123,
                     'prefix': '/test/reaper',
                     'impl': 'rucio.rse.protocols.mock.Default',
                     'domains': {
                         'lan': {'read': 1,
                                 'write': 1,
                                 'delete': 1},
                         'wan': {'read': 1,
                                 'write': 1,
                                 'delete': 1}}}
    rse_core.add_protocol(rse_id=rse_id, parameter=mock_protocol)

    nb_files = 30
    file_size = 2147483648  # 2G

    file_names = []
    for i in range(nb_files):
        file_name = 'lfn' + generate_uuid()
        file_names.append(file_name)
        replica_core.add_replica(rse_id=rse_id, scope=InternalScope('data13_hip', **vo),
                                 name=file_name, bytes=file_size,
                                 tombstone=datetime.utcnow() - timedelta(days=1),
                                 account=InternalAccount('root', **vo), adler32=None, md5=None)

    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=nb_files * file_size, free=800)
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=10737418240)
    rse_core.set_rse_limits(rse_id=rse_id, name='MaxBeingDeletedFiles', value=10)

    if vo:
        reaper(once=True, rses=[], include_rses='vo=%s&(%s)' % (vo['vo'], rse_name), exclude_rses=[])
        reaper(once=True, rses=[], include_rses='vo=%s&(%s)' % (vo['vo'], rse_name), exclude_rses=[])
    else:
        reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=[])
        reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=[])

    assert len(list(replica_core.list_replicas(dids=[{'scope': InternalScope('data13_hip', **vo), 'name': n} for n in file_names], rse_expression=rse_name))) == nb_files - 5
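Why the assertion expects nb_files - 5, as a hedged back-of-the-envelope (assuming the reaper deletes just enough 2 GiB files to restore MinFreeSpace):

file_size = 2147483648           # 2 GiB per file
min_free_space = 10737418240     # 10 GiB MinFreeSpace limit
free = 800
needed = min_free_space - free
files_to_delete = -(-needed // file_size)  # ceiling division -> 5 files
assert 30 - files_to_delete == 25          # nb_files - 5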
Example #18
    def test_get_did_from_pfns_nondeterministic(self):
        """ REPLICA (CLIENT): Get list of DIDs associated to PFNs for non-deterministic sites"""
        rse = 'MOCK2'
        tmp_scope = 'mock'
        nbfiles = 3
        pfns = []
        input_map = {}
        rse_info = rsemgr.get_rse_info(rse)
        assert_equal(rse_info['deterministic'], False)
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb',
                  'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()),
                  'meta': {'events': 10}} for _ in range(nbfiles)]
        for f in files:
            input_map[f['pfn']] = {'scope': f['scope'], 'name': f['name']}
        add_replicas(rse=rse, files=files, account='root', ignore_availability=True)
        for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files],
                                     schemes=['srm'], ignore_availability=True):
            # do not shadow the outer 'rse' variable, which is reused below
            for rse_key in replica['rses']:
                pfns.extend(replica['rses'][rse_key])
        for result in self.replica_client.get_did_from_pfns(pfns, rse):
            pfn = list(result.keys())[0]
            assert_equal(input_map[pfn], list(result.values())[0])
Example #19
    def test_list_replicas_all_states(self):
        """ REPLICA (CORE): list file replicas with all_states"""
        tmp_scope = 'mock'
        nbfiles = 13
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for i in range(nbfiles)]
        rses = ['MOCK', 'MOCK3']
        for rse in rses:
            add_replicas(rse=rse, files=files, account='root', ignore_availability=True)

        for file in files:
            update_replica_state('MOCK', tmp_scope, file['name'], ReplicaState.COPYING)

        replica_cpt = 0
        for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm'], all_states=True):
            assert_in('states', replica)
            assert_equal(replica['states']['MOCK'], str(ReplicaState.COPYING))
            assert_equal(replica['states']['MOCK3'], str(ReplicaState.AVAILABLE))
            replica_cpt += 1

        assert_equal(nbfiles, replica_cpt)
Example #20
    def setUp(self):
        if config_get_bool('common', 'multi_vo', raise_exception=False, default=False):
            self.vo = {'vo': get_vo()}
        else:
            self.vo = {}

        self.replica_client = ReplicaClient()

        # Using two test RSEs
        self.rse4suspicious = 'MOCK_SUSPICIOUS'
        self.rse4suspicious_id = get_rse_id(self.rse4suspicious, **self.vo)
        self.rse4recovery = 'MOCK_RECOVERY'
        self.rse4recovery_id = get_rse_id(self.rse4recovery, **self.vo)
        self.scope = 'mock'
        self.internal_scope = InternalScope(self.scope, **self.vo)

        # For testing, we create 5 files and upload them with Rucio to two test RSEs.
        self.tmp_file1 = file_generator()
        self.tmp_file2 = file_generator()
        self.tmp_file3 = file_generator()
        self.tmp_file4 = file_generator()
        self.tmp_file5 = file_generator()

        self.listdids = [{'scope': self.internal_scope, 'name': path.basename(f), 'type': DIDType.FILE}
                         for f in [self.tmp_file1, self.tmp_file2, self.tmp_file3, self.tmp_file4, self.tmp_file5]]

        for rse in [self.rse4suspicious, self.rse4recovery]:
            cmd = 'rucio -v upload --rse {0} --scope {1} {2} {3} {4} {5} {6}'.format(rse, self.scope, self.tmp_file1, self.tmp_file2, self.tmp_file3, self.tmp_file4, self.tmp_file5)
            exitcode, out, err = execute(cmd)

            # checking if Rucio upload went OK
            assert exitcode == 0

        # Set fictional datatypes
        set_metadata(self.internal_scope, path.basename(self.tmp_file4), 'datatype', 'testtypedeclarebad')
        set_metadata(self.internal_scope, path.basename(self.tmp_file5), 'datatype', 'testtypenopolicy')

        # Allow for the RSEs to be affected by the suspicious file recovery daemon
        add_rse_attribute(self.rse4suspicious_id, "enable_suspicious_file_recovery", True)
        add_rse_attribute(self.rse4recovery_id, "enable_suspicious_file_recovery", True)

        # removing physical files from /tmp location - keeping only their DB info
        remove(self.tmp_file1)
        remove(self.tmp_file2)
        remove(self.tmp_file3)
        remove(self.tmp_file4)
        remove(self.tmp_file5)

        # Gather replica info
        replicalist = list_replicas(dids=self.listdids)

        # Changing the replica statuses as follows:
        # ----------------------------------------------------------------------------------------------------------------------------------
        # Name         State(s) declared on MOCK_RECOVERY       State(s) declared on MOCK_SUSPICIOUS        Metadata "datatype"
        # ----------------------------------------------------------------------------------------------------------------------------------
        # tmp_file1    available                                suspicious (available)
        # tmp_file2    available                                suspicious + bad (unavailable)
        # tmp_file3    unavailable                              suspicious (available)                      RAW
        # tmp_file4    unavailable                              suspicious (available)                      testtypedeclarebad
        # tmp_file5    unavailable                              suspicious (available)                      testtypenopolicy
        # ----------------------------------------------------------------------------------------------------------------------------------

        for replica in replicalist:
            suspicious_pfns = replica['rses'][self.rse4suspicious_id]
            for i in range(3):
                print("Declaring suspicious file replica: " + suspicious_pfns[0])
                self.replica_client.declare_suspicious_file_replicas([suspicious_pfns[0], ], 'This is a good reason.')
                sleep(1)
            if replica['name'] == path.basename(self.tmp_file2):
                print("Declaring bad file replica: " + suspicious_pfns[0])
                self.replica_client.declare_bad_file_replicas([suspicious_pfns[0], ], 'This is a good reason')
            if replica['name'] == path.basename(self.tmp_file3):
                print("Updating replica state as unavailable: " + replica['rses'][self.rse4recovery_id][0])
                update_replica_state(self.rse4recovery_id, self.internal_scope, path.basename(self.tmp_file3), ReplicaState.UNAVAILABLE)
            if replica['name'] == path.basename(self.tmp_file4):
                print("Updating replica state as unavailable: " + replica['rses'][self.rse4recovery_id][0])
                update_replica_state(self.rse4recovery_id, self.internal_scope, path.basename(self.tmp_file4), ReplicaState.UNAVAILABLE)
            if replica['name'] == path.basename(self.tmp_file5):
                print("Updating replica state as unavailable: " + replica['rses'][self.rse4recovery_id][0])
                update_replica_state(self.rse4recovery_id, self.internal_scope, path.basename(self.tmp_file5), ReplicaState.UNAVAILABLE)

        # Gather replica info after setting initial replica statuses
        replicalist = list_replicas(dids=self.listdids)

        # Checking if the status changes were effective
        for replica in replicalist:
            if replica['name'] == path.basename(self.tmp_file1):
                assert replica['states'][self.rse4suspicious_id] == 'AVAILABLE'
                assert replica['states'][self.rse4recovery_id] == 'AVAILABLE'
            if replica['name'] == path.basename(self.tmp_file2):
                assert (self.rse4suspicious_id in replica['states']) is False
                assert replica['states'][self.rse4recovery_id] == 'AVAILABLE'
            if replica['name'] == path.basename(self.tmp_file3):
                assert replica['states'][self.rse4suspicious_id] == 'AVAILABLE'
                assert (self.rse4recovery_id in replica['states']) is False
            if replica['name'] == path.basename(self.tmp_file4):
                assert replica['states'][self.rse4suspicious_id] == 'AVAILABLE'
                assert (self.rse4recovery_id in replica['states']) is False
            if replica['name'] == path.basename(self.tmp_file5):
                assert replica['states'][self.rse4suspicious_id] == 'AVAILABLE'
                assert (self.rse4recovery_id in replica['states']) is False

        # Checking if only self.tmp_file2 is declared as 'BAD'
        self.from_date = datetime.now() - timedelta(days=1)
        bad_replicas_list = list_bad_replicas_status(rse_id=self.rse4suspicious_id, younger_than=self.from_date, **self.vo)
        bad_checklist = [(badf['name'], badf['rse_id'], badf['state']) for badf in bad_replicas_list]

        assert (path.basename(self.tmp_file1), self.rse4suspicious_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file2), self.rse4suspicious_id, BadFilesStatus.BAD) in bad_checklist
        assert (path.basename(self.tmp_file3), self.rse4suspicious_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file4), self.rse4suspicious_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file5), self.rse4suspicious_id, BadFilesStatus.BAD) not in bad_checklist

        bad_replicas_list = list_bad_replicas_status(rse_id=self.rse4recovery_id, younger_than=self.from_date, **self.vo)
        bad_checklist = [(badf['name'], badf['rse_id'], badf['state']) for badf in bad_replicas_list]

        assert (path.basename(self.tmp_file1), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file2), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file3), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file4), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file5), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
Example #21
def process_output(output, sanity_check=True, compress=True):
    """Perform post-consistency-check actions.

    DARK files are put in the quarantined-replica table so that they
    may be deleted by the Dark Reaper.  LOST files are reported as
    suspicious so that they may be further checked by the cloud squads.

    ``output`` should be an ``str`` with the absolute path to the file
    produced by ``consistency()``.  It must maintain its naming
    convention.

    If ``sanity_check`` is ``True`` (default) and the number of entries
    in the output file is deemed excessive, the actions are aborted.

    If ``compress`` is ``True`` (default), the file is compressed with
    bzip2 after the actions are successfully performed.
    """
    logger = logging.getLogger('auditor-worker')
    dark_replicas = []
    lost_replicas = []
    try:
        with open(output) as f:
            for line in f:
                label, path = line.rstrip().split(',', 1)
                scope, name = guess_replica_info(path)
                if label == 'DARK':
                    dark_replicas.append({'path': path,
                                          'scope': InternalScope(scope),
                                          'name': name})
                elif label == 'LOST':
                    lost_replicas.append({'scope': InternalScope(scope),
                                          'name': name})
                else:
                    raise ValueError('unexpected label')
    # Since the file is read immediately after its creation, any error
    # exposes a bug in the Auditor.
    except Exception as error:
        logger.critical('Error processing "%s"', output, exc_info=True)
        raise error

    rse = os.path.basename(output[:output.rfind('_')])
    rse_id = get_rse_id(rse=rse)
    usage = get_rse_usage(rse_id=rse_id, source='rucio')[0]
    threshold = config.config_get('auditor', 'threshold', False, 0.2)

    # Perform a basic sanity check by comparing the number of entries
    # with the total number of files on the RSE.  If the percentage is
    # significant, there is most likely an issue with the site dump.
    found_error = False
    if len(dark_replicas) > threshold * usage['files']:
        logger.warning('Number of DARK files is exceeding threshold: "%s"',
                       output)
        found_error = True
    if len(lost_replicas) > threshold * usage['files']:
        logger.warning('Number of LOST files is exceeding threshold: "%s"',
                       output)
        found_error = True
    if found_error and sanity_check:
        raise AssertionError('sanity check failed')

    # While converting LOST replicas to PFNs, entries that do not
    # correspond to a replica registered in Rucio are silently dropped.
    lost_pfns = [r['rses'][rse_id][0] for r in list_replicas(lost_replicas)
                 if rse_id in r['rses']]

    add_quarantined_replicas(rse_id=rse_id, replicas=dark_replicas)
    logger.debug('Processed %d DARK files from "%s"', len(dark_replicas),
                 output)
    declare_bad_file_replicas(lost_pfns, reason='Reported by Auditor',
                              issuer=InternalAccount('root'), status=BadFilesStatus.SUSPICIOUS)
    logger.debug('Processed %d LOST files from "%s"', len(lost_replicas),
                 output)

    if compress:
        destination = bz2_compress_file(output)
        logger.debug('Compressed "%s"', destination)
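A hypothetical input sketch for process_output(); the paths and the dump name are invented, but they follow the 'LABEL,path' line format and the '<RSE>_<suffix>' naming convention required by the docstring:

sample = (
    "DARK,/rucio/mock/12/34/file_dark_example\n"   # hypothetical dark file
    "LOST,/rucio/mock/56/78/file_lost_example\n"   # hypothetical lost file
)
with open('/tmp/MOCK_20240101', 'w') as dump:      # basename encodes the RSE 'MOCK'
    dump.write(sample)
# process_output('/tmp/MOCK_20240101', sanity_check=False, compress=False)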
Example #22
    def test_replica_recoverer(self):
        """ REPLICA RECOVERER: Testing declaration of suspicious replicas as bad if they are found available on other RSEs.

            setUp function (above) is supposed to run first
            (nose does this automatically):

            - uploads 5 test files to two test RSEs ('MOCK_RECOVERY', 'MOCK_SUSPICIOUS')
            - prepares their statuses to be as follows:

            # ----------------------------------------------------------------------------------------------------------------------------------
            # Name         State(s) declared on MOCK_RECOVERY       State(s) declared on MOCK_SUSPICIOUS        Metadata "datatype"
            # ----------------------------------------------------------------------------------------------------------------------------------
            # tmp_file1    available                                suspicious (available)
            # tmp_file2    available                                suspicious + bad (unavailable)
            # tmp_file3    unavailable                              suspicious (available)                      RAW
            # tmp_file4    unavailable                              suspicious (available)                      testtypedeclarebad
            # tmp_file5    unavailable                              suspicious (available)                      testtypenopolicy
            # ----------------------------------------------------------------------------------------------------------------------------------

            - Explanation: Suspicious replicas that are the last remaining copy (unavailable on MOCK_RECOVERY) are handled differently depending
                            on their metadata "datatype". RAW files have the policy of being ignored. testtypedeclarebad files are of a fictional
                            type that has the policy of being declared bad. testtypenopolicy files are of a fictional type that doesn't have a
                            policy specified, meaning they should be ignored by default.

            Runs the Test:

            - running suspicious_replica_recoverer

            Concluding:

            - checks that tmp_file1 and tmp_file4 were declared as 'BAD' on 'MOCK_SUSPICIOUS'

        """

        # Run replica recoverer once
        try:
            run(once=True, younger_than=1, nattempts=2, limit_suspicious_files_on_rse=5)
        except KeyboardInterrupt:
            stop()

        # Checking the outcome:
        # we expect to see two changes: tmp_file1 and tmp_file4 declared as bad on MOCK_SUSPICIOUS
        # ----------------------------------------------------------------------------------------------------------------------------------
        # Name         State(s) declared on MOCK_RECOVERY       State(s) declared on MOCK_SUSPICIOUS        Metadata "datatype"
        # ----------------------------------------------------------------------------------------------------------------------------------
        # tmp_file1    available                                suspicious + bad (unavailable)
        # tmp_file2    available                                suspicious + bad (unavailable)
        # tmp_file3    unavailable                              suspicious (available)                      RAW
        # tmp_file4    unavailable                              suspicious + bad (unavailable)              testtypedeclarebad
        # tmp_file5    unavailable                              suspicious (available)                      testtypenopolicy
        # ----------------------------------------------------------------------------------------------------------------------------------

        # Gather replica info after replica_recoverer has run.
        replicalist = list_replicas(dids=self.listdids)

        for replica in replicalist:
            if replica['name'] == path.basename(self.tmp_file1) or replica['name'] == path.basename(self.tmp_file2):
                assert (self.rse4suspicious_id in replica['states']) is False
                assert replica['states'][self.rse4recovery_id] == 'AVAILABLE'
            if replica['name'] == path.basename(self.tmp_file3):
                assert replica['states'][self.rse4suspicious_id] == 'AVAILABLE'
                assert (self.rse4recovery_id in replica['states']) is False
            if replica['name'] == path.basename(self.tmp_file4):
                # The key 'states' only exists if the replica is available on at least one RSE. It shouldn't exist for tmp_file4.
                assert ('states' in replica) is False
            if replica['name'] == path.basename(self.tmp_file5):
                assert replica['states'][self.rse4suspicious_id] == 'AVAILABLE'
                assert (self.rse4recovery_id in replica['states']) is False

        # Checking if replicas declared as 'BAD'
        bad_replicas_list = list_bad_replicas_status(rse_id=self.rse4suspicious_id, younger_than=self.from_date, **self.vo)
        bad_checklist = [(badf['name'], badf['rse_id'], badf['state']) for badf in bad_replicas_list]

        assert (path.basename(self.tmp_file1), self.rse4suspicious_id, BadFilesStatus.BAD) in bad_checklist
        assert (path.basename(self.tmp_file2), self.rse4suspicious_id, BadFilesStatus.BAD) in bad_checklist
        assert (path.basename(self.tmp_file3), self.rse4suspicious_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file4), self.rse4suspicious_id, BadFilesStatus.BAD) in bad_checklist
        assert (path.basename(self.tmp_file5), self.rse4suspicious_id, BadFilesStatus.BAD) not in bad_checklist

        bad_replicas_list = list_bad_replicas_status(rse_id=self.rse4recovery_id, younger_than=self.from_date, **self.vo)
        bad_checklist = [(badf['name'], badf['rse_id'], badf['state']) for badf in bad_replicas_list]

        assert (path.basename(self.tmp_file1), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file2), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file3), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file4), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file5), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
def declare_suspicious_replicas_bad(once=False, younger_than=3, nattempts=10, vos=None, limit_suspicious_files_on_rse=5, sleep_time=300):
    """
    Main loop to check for available replicas which are labeled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None, all available VOs are used.
    :param limit_suspicious_files_on_rse: Maximum number of suspicious replicas on an RSE before that RSE
                                          is considered problematic and the suspicious replicas on that RSE
                                          are labeled as 'TEMPORARY_UNAVAILABLE'.
    :param sleep_time: The daemon should not run too often. If the daemon's runtime is quicker than sleep_time, then
                       it should sleep until sleep_time is over.
    :returns: None
    """

    # assembling the worker name identifier ('executable'), including the RSEs from <rse_expression>,
    # so that the start of a second instance with the same set of RSEs can be detected

    executable = argv[0]

    prepend_str = 'replica_recoverer: '
    logger = formatted_logger(logging.log, prepend_str + '%s')

    multi_vo = config_get_bool('common', 'multi_vo', raise_exception=False, default=False)
    if not multi_vo:
        if vos:
            logger(logging.WARNING, 'Ignoring argument vos, this is only applicable in a multi-VO setup.')
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - set([v['vo'] for v in list_vos()])
            if invalid:
                msg = 'VO{} {} cannot be found'.format('s' if len(invalid) > 1 else '', ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logger(logging.INFO, 'replica_recoverer: This instance will work on VO%s: %s' % ('s' if len(vos) > 1 else '', ', '.join([v for v in vos])))

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    prepend_str = 'replica_recoverer [%i/%i] : ' % (heartbeat['assign_thread'], heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logger(logging.ERROR, 'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.', socket.gethostname())
                die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
                break

            prepend_str = 'replica_recoverer[%s/%s]: ' % (worker_number, total_workers)
            logger = formatted_logger(logging.log, prepend_str + '%s')

            start = time.time()

            try:
                json_file = open("/opt/rucio/etc/suspicious_replica_recoverer.json")
            except OSError:
                logger(logging.WARNING, "An error occurred whilst trying to open the JSON file.")
                break

            try:
                json_data = json.load(json_file)
            except ValueError:
                logger(logging.WARNING, "No JSON object could be decoded.")

            # Checking that the JSON file is formatted properly.
            for i, entry in enumerate(json_data):
                if "datatype" not in entry or "action" not in entry:
                    logger(logging.ERROR, 'Entry %s in the json file is incomplete (missing either "datatype" or "action").', i)
                    break

            logger(logging.INFO, 'Ready to query replicas that were reported as suspicious in the last %s days at least %s times.', younger_than, nattempts)

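            # The excluded letters are bad-replica states; in Rucio's BadFilesStatus
            # enum they map to BAD, RECOVERED, DELETED, LOST and TEMPORARY_UNAVAILABLE
            # (assumption based on the enum values, not stated in this example), so
            # only replicas currently flagged SUSPICIOUS are returned.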
            getfileskwargs = {'younger_than': younger_than,
                              'nattempts': nattempts,
                              'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                              'is_suspicious': True}

            for vo in vos:
                logger(logging.INFO, 'Start replica recovery for VO: %s', vo)
                recoverable_replicas = {vo: {}}

                rse_list = sorted([rse for rse in parse_expression('enable_suspicious_file_recovery=true') if rse['vo'] == vo], key=lambda k: k['rse'])

                logger(logging.DEBUG, "List of RSEs with enable_suspicious_file_recovery = True:")
                for i in rse_list:
                    logger(logging.DEBUG, '%s', i)

                for rse in rse_list:
                    time_start_rse = time.time()
                    rse_expr = rse['rse']
                    cnt_surl_not_found = 0
                    if rse_expr not in recoverable_replicas[vo]:
                        recoverable_replicas[vo][rse_expr] = {}
                    # Get a dictionary of the suspicious replicas on the RSE that have available copies on other RSEs
                    suspicious_replicas_avail_elsewhere = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["EXIST_COPIES"].value, filter_={'vo': vo}, **getfileskwargs)
                    # Get the suspicious replicas that are the last remaining copies
                    suspicious_replicas_last_copy = get_suspicious_files(rse_expr, available_elsewhere=SuspiciousAvailability["LAST_COPY"].value, filter_={'vo': vo}, **getfileskwargs)

                    logger(logging.DEBUG, 'Suspicious replicas on %s:', rse_expr)
                    logger(logging.DEBUG, 'Replicas with copies on other RSEs (%s):', len(suspicious_replicas_avail_elsewhere))
                    for i in suspicious_replicas_avail_elsewhere:
                        logger(logging.DEBUG, '%s', i)
                    logger(logging.DEBUG, 'Replicas that are the last remaining copy (%s):', len(suspicious_replicas_last_copy))
                    for i in suspicious_replicas_last_copy:
                        logger(logging.DEBUG, '%s', i)

                    # RSEs that aren't available shouldn't have suspicious replicas showing up. Skip to next RSE.
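                    # (Assumption: 'availability' is the RSE availability bitmask with
                    # read=4, write=2, delete=1, so 4-7 are the values with read enabled.)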
                    if (rse['availability'] not in {4, 5, 6, 7}) and ((len(suspicious_replicas_avail_elsewhere) > 0) or (len(suspicious_replicas_last_copy) > 0)):
                        logger(logging.WARNING, "%s is not available (availability: %s), yet it has suspicious replicas. Please investigate. \n", rse_expr, rse['availability'])
                        continue

                    if suspicious_replicas_avail_elsewhere:
                        for replica in suspicious_replicas_avail_elsewhere:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    for rse_ in rep['rses']:
                                        if rse_ == rse_id:
                                            recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_][0], 'available_elsewhere': True}
                                            surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    if suspicious_replicas_last_copy:
                        for replica in suspicious_replicas_last_copy:
                            if vo == replica['scope'].vo:
                                scope = replica['scope']
                                rep_name = replica['name']
                                rse_id = replica['rse_id']
                                surl_not_found = True
                                # Should only return one RSE, as there is only one replica remaining
                                for rep in list_replicas([{'scope': scope, 'name': rep_name}]):
                                    if rse_id in rep['rses']:
                                        recoverable_replicas[vo][rse_expr][rep_name] = {'name': rep_name, 'rse_id': rse_id, 'scope': scope, 'surl': rep['rses'][rse_id][0], 'available_elsewhere': False}
                                        surl_not_found = False
                                if surl_not_found:
                                    cnt_surl_not_found += 1
                                    logger(logging.WARNING, 'Skipping suspicious replica %s on %s, no surls were found.', rep_name, rse_expr)

                    logger(logging.INFO, 'Suspicious replica query took %s seconds on %s and found %i suspicious replicas. The pfns for %s/%s replicas were found.',
                           time.time() - time_start_rse, rse_expr, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy),
                           len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) - cnt_surl_not_found, len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy))

                    if len(suspicious_replicas_avail_elsewhere) + len(suspicious_replicas_last_copy) != 0:
                        logger(logging.DEBUG, 'List of replicas on %s for which the pfns have been found:', rse_expr)
                        for i in recoverable_replicas[vo][rse_expr]:
                            logger(logging.DEBUG, '%s', i)

                # The log file is long and hard to read; add some spacing between sections
                logger(logging.INFO, 'All RSEs have been checked for suspicious replicas. Total time: %s seconds.', time.time() - start)
                logger(logging.INFO, 'Begin check for problematic RSEs.')
                time_start_check_probl = time.time()

                # If an RSE has more than *limit_suspicious_files_on_rse* suspicious files, then there might be a problem with the RSE.
                # The suspicious files are marked as temporarily unavailable.
                list_problematic_rses = []
                for rse_key in list(recoverable_replicas[vo].keys()):
                    if len(recoverable_replicas[vo][rse_key].values()) > limit_suspicious_files_on_rse:
                        list_problematic_rses.append(rse_key)
                        surls_list = []
                        for replica_value in recoverable_replicas[vo][rse_key].values():
                            surls_list.append(replica_value['surl'])

                        add_bad_pfns(pfns=surls_list, account=InternalAccount('root', vo=vo), state='TEMPORARY_UNAVAILABLE', expires_at=datetime.utcnow() + timedelta(days=3))

                        logger(logging.INFO, "%s is problematic (more than %s suspicious replicas). Send a Jira ticket for the RSE (to be implemented).", rse_key, limit_suspicious_files_on_rse)
                        logger(logging.INFO, "The following files on %s have been marked as TEMPORARILY UNAVAILABLE:", rse_key)
                        for rse_values in recoverable_replicas[vo][rse_key].values():
                            logger(logging.INFO, 'Scope: %s    Name: %s', rse_values['scope'], rse_values['name'])
                        # Remove the RSE from the dictionary as it has been dealt with.
                        del recoverable_replicas[vo][rse_key]

                logger(logging.INFO, "Following RSEs were deemed problematic (total: %s)", len(list_problematic_rses))
                for rse in list_problematic_rses:
                    logger(logging.INFO, "%s", rse)

                # Label suspicious replicas as bad if they have other copies on other RSEs (that aren't also marked as suspicious).
                # If they are the last remaining copies, deal with them differently.
                for rse_key in list(recoverable_replicas[vo].keys()):
                    files_to_be_declared_bad = []
                    files_to_be_ignored = []
                    # Remove RSEs from dictionary that don't have any suspicious replicas
                    if len(recoverable_replicas[vo][rse_key]) == 0:
                        del recoverable_replicas[vo][rse_key]
                        continue
                    # Read the rse_id from one of the suspicious replicas on that RSE
                    rse_id = list(recoverable_replicas[vo][rse_key].values())[0]['rse_id']
                    for replica_key in list(recoverable_replicas[vo][rse_key].keys()):
                        if recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is True:
                            # Replicas with other copies on at least one other RSE can safely be labeled as bad
                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                            # Remove replica from dictionary
                            del recoverable_replicas[vo][rse_key][replica_key]
                        elif recoverable_replicas[vo][rse_key][replica_key]['available_elsewhere'] is False:
                            if (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("log.")) or (recoverable_replicas[vo][rse_key][replica_key]['name'].startswith("user")):
                                # Don't keep log files or user files
                                files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                del recoverable_replicas[vo][rse_key][replica_key]
                            else:
                                # Deal with replicas based on their metadata.
                                file_metadata = get_metadata(recoverable_replicas[vo][rse_key][replica_key]["scope"], recoverable_replicas[vo][rse_key][replica_key]["name"])
                                if file_metadata["datatype"] is None:  # "None" type has no function "split()"
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                    continue
                                for i in json_data:
                                    if i["datatype"] == file_metadata["datatype"].split("_")[-1]:
                                        action = i["action"]
                                        if action == "ignore":
                                            files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        elif action == "declare bad":
                                            files_to_be_declared_bad.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])
                                        else:
                                            logger(logging.WARNING, "RSE: %s, replica name %s, surl %s: Match for the metadata 'datatype' (%s) of replica found in json file, but no match for 'action' (%s)",
                                                   rse_key, replica_key, recoverable_replicas[vo][rse_key][replica_key]['surl'], i["datatype"], i["action"])
                                        break
                                else:
                                    # No policy set for this datatype: default to ignoring the file (no action taken).
                                    files_to_be_ignored.append(recoverable_replicas[vo][rse_key][replica_key]['surl'])

                    logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be ignored:', rse_key)
                    for i in files_to_be_ignored:
                        logger(logging.INFO, '%s', i)
                    logger(logging.INFO, '(%s) Remaining replicas (pfns) that will be declared BAD:', rse_key)
                    for i in files_to_be_declared_bad:
                        logger(logging.INFO, '%s', i)

                    if files_to_be_declared_bad:
                        logger(logging.INFO, 'Ready to declare %s bad replica(s) on %s (RSE id: %s).', len(files_to_be_declared_bad), rse_key, str(rse_id))

                        declare_bad_file_replicas(pfns=files_to_be_declared_bad, reason='Suspicious. Automatic recovery.', issuer=InternalAccount('root', vo=vo), session=None)

                        logger(logging.INFO, 'Finished declaring bad replicas on %s.\n', rse_key)

                logger(logging.INFO, 'Finished checking for problematic RSEs and declaring bad replicas. Total time: %s seconds.', time.time() - time_start_check_probl)

                time_passed = time.time() - start
                logger(logging.INFO, 'Total time: %s seconds', time_passed)
                daemon_sleep(start_time=start, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])) or match('.*ORA-03135.*', str(err.args[0])):
                logger(logging.WARNING, traceback.format_exc())
            else:
                logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        except Exception as err:
            logger(logging.CRITICAL, traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        if once:
            break

    die(executable=executable, hostname=socket.gethostname(), pid=os.getpid(), thread=threading.current_thread())
    logger(logging.INFO, 'Graceful stop done.')
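
# A minimal usage sketch for the daemon above (hedged: the policy-file path is
# the one hard-coded in run(), while the datatype names and argument values are
# illustrative assumptions, not taken from this example). The JSON policy file
# maps datatypes to actions:
#
#     [
#         {"datatype": "RAW", "action": "ignore"},
#         {"datatype": "AOD", "action": "declare bad"}
#     ]
#
# A single pass of the daemon could then be driven with:
#
#     run(once=True, younger_than=3, nattempts=10, vos=None,
#         limit_suspicious_files_on_rse=5, sleep_time=60)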
Exemple #24
0
def test_reaper():
    """ REAPER (DAEMON): Test the reaper daemon."""
    if config_get_bool('common',
                       'multi_vo',
                       raise_exception=False,
                       default=False):
        vo = {
            'vo': config_get('client',
                             'vo',
                             raise_exception=False,
                             default='tst')
        }
        new_vo = {'vo': 'new'}
        if not vo_core.vo_exists(**new_vo):
            vo_core.add_vo(description='Test',
                           email='*****@*****.**',
                           **new_vo)
        if not scope_core.check_scope(InternalScope('data13_hip', **new_vo)):
            scope_core.add_scope(InternalScope('data13_hip', **new_vo),
                                 InternalAccount('root', **new_vo))
        nb_rses = 2
    else:
        vo = {}
        new_vo = {}
        nb_rses = 1

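    # Assumption (not stated in this example): the per-domain numbers in the
    # protocol dictionary below are operation priorities (1 = first choice),
    # and the MOCK scheme points to a mock implementation with no real I/O.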
    mock_protocol = {
        'scheme': 'MOCK',
        'hostname': 'localhost',
        'port': 123,
        'prefix': '/test/reaper',
        'impl': 'rucio.rse.protocols.mock.Default',
        'domains': {
            'lan': {
                'read': 1,
                'write': 1,
                'delete': 1
            },
            'wan': {
                'read': 1,
                'write': 1,
                'delete': 1
            }
        }
    }

    nb_files = 30
    file_size = 2147483648  # 2G

    rse_names = []
    all_file_names = []
    for j in range(nb_rses):
        rse_name = rse_name_generator()
        rse_names.append(rse_name)
        rse_id = rse_core.add_rse(rse_name, **vo)
        rse_core.add_protocol(rse_id=rse_id, parameter=mock_protocol)
        if new_vo:
            rse_id_new = rse_core.add_rse(rse_name, **new_vo)
            rse_core.add_protocol(rse_id=rse_id_new, parameter=mock_protocol)

        file_names = []
        for i in range(nb_files):
            file_name = 'lfn' + generate_uuid()
            file_names.append(file_name)
            replica_core.add_replica(rse_id=rse_id,
                                     scope=InternalScope('data13_hip', **vo),
                                     name=file_name,
                                     bytes=file_size,
                                     tombstone=datetime.utcnow() -
                                     timedelta(days=1),
                                     account=InternalAccount('root', **vo),
                                     adler32=None,
                                     md5=None)
            if new_vo:
                replica_core.add_replica(
                    rse_id=rse_id_new,
                    scope=InternalScope('data13_hip', **new_vo),
                    name=file_name,
                    bytes=file_size,
                    tombstone=datetime.utcnow() - timedelta(days=1),
                    account=InternalAccount('root', **new_vo),
                    adler32=None,
                    md5=None)

        all_file_names.append(file_names)
        rse_core.set_rse_usage(rse_id=rse_id,
                               source='storage',
                               used=nb_files * file_size,
                               free=800)
        rse_core.set_rse_limits(rse_id=rse_id,
                                name='MinFreeSpace',
                                value=10737418240)
        rse_core.set_rse_limits(rse_id=rse_id,
                                name='MaxBeingDeletedFiles',
                                value=10)
        if new_vo:
            rse_core.set_rse_usage(rse_id=rse_id_new,
                                   source='storage',
                                   used=nb_files * file_size,
                                   free=800)
            rse_core.set_rse_limits(rse_id=rse_id_new,
                                    name='MinFreeSpace',
                                    value=10737418240)
            rse_core.set_rse_limits(rse_id=rse_id_new,
                                    name='MaxBeingDeletedFiles',
                                    value=10)

    if not vo:
        reaper(once=True, rses=[], include_rses=rse_names[0], exclude_rses=[])
        reaper(once=True, rses=[], include_rses=rse_names[0], exclude_rses=[])
        assert len(
            list(
                replica_core.list_replicas(
                    dids=[{
                        'scope': InternalScope('data13_hip', **vo),
                        'name': n
                    } for n in all_file_names[0]],
                    rse_expression=rse_names[0]))) == nb_files - 5
    else:
        # Check we reap all VOs by default
        reaper(once=True, rses=[], include_rses=rse_names[0], exclude_rses=[])
        reaper(once=True, rses=[], include_rses=rse_names[0], exclude_rses=[])
        assert len(
            list(
                replica_core.list_replicas(
                    dids=[{
                        'scope': InternalScope('data13_hip', **vo),
                        'name': n
                    } for n in all_file_names[0]],
                    rse_expression=rse_names[0]))) == nb_files - 5
        assert len(
            list(
                replica_core.list_replicas(
                    dids=[{
                        'scope': InternalScope('data13_hip', **new_vo),
                        'name': n
                    } for n in all_file_names[0]],
                    rse_expression=rse_names[0]))) == nb_files - 5
        # Check we don't affect a second VO that isn't specified
        reaper(once=True,
               rses=[],
               include_rses=rse_names[1],
               exclude_rses=[],
               vos=['new'])
        reaper(once=True,
               rses=[],
               include_rses=rse_names[1],
               exclude_rses=[],
               vos=['new'])
        assert len(
            list(
                replica_core.list_replicas(
                    dids=[{
                        'scope': InternalScope('data13_hip', **vo),
                        'name': n
                    } for n in all_file_names[1]],
                    rse_expression=rse_names[1]))) == nb_files
        assert len(
            list(
                replica_core.list_replicas(
                    dids=[{
                        'scope': InternalScope('data13_hip', **new_vo),
                        'name': n
                    } for n in all_file_names[1]],
                    rse_expression=rse_names[1]))) == nb_files - 5
Exemple #25
0
            try:
                parsed_rses = parse_expression(source_replica_expression, session=None)
            except InvalidRSEExpression as e:
                logging.error("Invalid RSE exception %s for request %s: %s" % (source_replica_expression,
                                                                               req['request_id'],
                                                                               e))
                allowed_source_rses = []
            else:
                allowed_source_rses = [x['rse'] for x in parsed_rses]

    tmpsrc = []
    metadata = {}
    try:
        ts = time.time()
        replications = replica.list_replicas(dids=[{'scope': req['scope'],
                                                    'name': req['name'],
                                                    'type': DIDType.FILE}],
                                             schemes=[scheme, 'gsiftp'])
        record_timer('daemons.conveyor.submitter.list_replicas', (time.time() - ts) * 1000)

        # return gracefully if there are no replicas for a DID
        if not replications:
            return None, None

        for source in replications:

            metadata['filesize'] = int(source['bytes'])
            metadata['md5'] = source['md5']
            metadata['adler32'] = source['adler32']
            # TODO: Source protection

            # we need to know upfront if we are mixed DISK/TAPE source
Exemple #26
0
    def test_replica_recoverer(self):
        """ REPLICA RECOVERER: Testing declaration of suspicious replicas as bad if they are found available on other RSEs.

            setUp function (above) is supposed to run first
            (nose does this automatically):

            - uploads 3 test files to two test RSEs ('MOCK_RECOVERY', 'MOCK_SUSPICIOUS')
            - prepares their statuses to be as follows:

            # --------------------------------------------------------------------------------------------
            # Name         State(s) declared on MOCK_RECOVERY       State(s) declared on MOCK_SUSPICIOUS
            # --------------------------------------------------------------------------------------------
            # tmp_file1    available                                suspicious (available)
            # tmp_file2    available                                suspicious + bad (unavailable)
            # tmp_file3    unavailable                              suspicious (available)
            # --------------------------------------------------------------------------------------------

            Runs the Test:

            - running suspicious_replica_recoverer

            Concluding:

            - checks that the only change made is that tmp_file1 was declared as 'BAD on 'MOCK_SUSPICIOUS'

        """

        # Run replica recoverer once
        try:
            run(once=True, younger_than=1, nattempts=2, rse_expression='MOCK_SUSPICIOUS')
        except KeyboardInterrupt:
            stop()

        # Checking the outcome:
        # we expect to see only one change, i.e. tmp_file1 declared as bad on MOCK_SUSPICIOUS
        # --------------------------------------------------------------------------------------------
        # Name         State(s) declared on MOCK_RECOVERY       State(s) declared on MOCK_SUSPICIOUS
        # --------------------------------------------------------------------------------------------
        # tmp_file1    available                                suspicious + bad (unavailable)
        # tmp_file2    available                                suspicious + bad (unavailable)
        # tmp_file3    unavailable                              suspicious (available)
        # --------------------------------------------------------------------------------------------

        # Gather replica info after replica_recoverer has run.
        replicalist = list_replicas(dids=self.listdids)

        for replica in replicalist:
            if replica['name'] == path.basename(self.tmp_file1) or replica['name'] == path.basename(self.tmp_file2):
                assert (self.rse4suspicious_id in replica['states']) is False
                assert replica['states'][self.rse4recovery_id] == 'AVAILABLE'
            if replica['name'] == path.basename(self.tmp_file3):
                assert replica['states'][self.rse4suspicious_id] == 'AVAILABLE'
                assert (self.rse4recovery_id in replica['states']) is False

        # Checking if replicas declared as 'BAD'
        bad_replicas_list = list_bad_replicas_status(rse_id=self.rse4suspicious_id, younger_than=self.from_date, **self.vo)
        bad_checklist = [(badf['name'], badf['rse_id'], badf['state']) for badf in bad_replicas_list]

        assert (path.basename(self.tmp_file1), self.rse4suspicious_id, BadFilesStatus.BAD) in bad_checklist
        assert (path.basename(self.tmp_file2), self.rse4suspicious_id, BadFilesStatus.BAD) in bad_checklist
        assert (path.basename(self.tmp_file3), self.rse4suspicious_id, BadFilesStatus.BAD) not in bad_checklist

        bad_replicas_list = list_bad_replicas_status(rse_id=self.rse4recovery_id, younger_than=self.from_date, **self.vo)
        bad_checklist = [(badf['name'], badf['rse_id'], badf['state']) for badf in bad_replicas_list]

        assert (path.basename(self.tmp_file1), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file2), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
        assert (path.basename(self.tmp_file3), self.rse4recovery_id, BadFilesStatus.BAD) not in bad_checklist
Exemple #27
0
def test_archive_removal_impact_on_constituents(rse_factory, did_factory, mock_scope, root_account, caches_mock, file_config_mock):
    [cache_region] = caches_mock
    rse_name, rse_id = rse_factory.make_mock_rse()
    scope = mock_scope
    account = root_account

    # Create 2 archives and 4 files:
    # - One only exists in the first archive
    # - One in both, plus another replica, which is not in an archive
    # - One in both, plus another replica, which is not in an archive; and this replica has expired
    # - One in both, plus another replica, which is not in an archive; and this replica has expired; but a replication rule exists on this second replica
    # Also add these files to datasets, one of which will be removed at the end
    nb_constituents = 4
    nb_c_outside_archive = nb_constituents - 1
    constituent_size = 2000
    archive_size = 1000
    uuid = str(generate_uuid())
    constituents = [{'scope': scope, 'name': 'lfn.%s.%d' % (uuid, i)} for i in range(nb_constituents)]
    did_factory.register_dids(constituents)
    c_first_archive_only, c_with_replica, c_with_expired_replica, c_with_replica_and_rule = constituents

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size, **c_with_replica)

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size,
                             tombstone=datetime.utcnow() - timedelta(days=1), **c_with_expired_replica)

    replica_core.add_replica(rse_id=rse_id, account=account, bytes_=constituent_size,
                             tombstone=datetime.utcnow() - timedelta(days=1), **c_with_replica_and_rule)
    rule_core.add_rule(dids=[c_with_replica_and_rule], account=account, copies=1, rse_expression=rse_name, grouping='NONE',
                       weight=None, lifetime=None, locked=False, subscription_id=None)

    archive1, archive2 = [{'scope': scope, 'name': 'archive_%s.%d.zip' % (uuid, i)} for i in range(2)]
    replica_core.add_replica(rse_id=rse_id, bytes_=archive_size, account=account, **archive1)
    replica_core.add_replica(rse_id=rse_id, bytes_=archive_size, account=account, **archive2)
    did_core.attach_dids(dids=[{'scope': c['scope'], 'name': c['name'], 'bytes': constituent_size} for c in constituents],
                         account=account, **archive1)
    did_core.attach_dids(dids=[{'scope': c['scope'], 'name': c['name'], 'bytes': constituent_size} for c in [c_with_replica, c_with_expired_replica, c_with_replica_and_rule]],
                         account=account, **archive2)

    dataset1, dataset2 = [{'scope': scope, 'name': 'dataset_%s.%i' % (uuid, i)} for i in range(2)]
    did_core.add_did(did_type='DATASET', account=account, **dataset1)
    did_core.attach_dids(dids=constituents, account=account, **dataset1)
    did_core.add_did(did_type='DATASET', account=account, **dataset2)
    did_core.attach_dids(dids=[c_first_archive_only, c_with_expired_replica], account=account, **dataset2)

    @read_session
    def __get_archive_contents_history_count(archive, session=None):
        return session.query(ConstituentAssociationHistory).filter_by(**archive).count()

    # Run reaper the first time.
    # the expired non-archive replica of c_with_expired_replica must be removed,
    # but the did must not be removed, and it must still remain in the dataset because
    # it still has the replica from inside the archive
    assert replica_core.get_replica(rse_id=rse_id, **c_with_expired_replica)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=2 * archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=2 * archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    for did in constituents + [archive1, archive2]:
        assert did_core.get_did(**did)
    for did in [archive1, archive2, c_with_replica, c_with_replica_and_rule]:
        assert replica_core.get_replica(rse_id=rse_id, **did)
    with pytest.raises(ReplicaNotFound):
        # The replica is only on the archive, not on the constituent
        replica_core.get_replica(rse_id=rse_id, **c_first_archive_only)
    # The replica outside the archive was removed by the reaper
    nb_c_outside_archive -= 1
    with pytest.raises(ReplicaNotFound):
        replica_core.get_replica(rse_id=rse_id, **c_with_expired_replica)
    # Compared to get_replica, list_replicas resolves archives, must return replicas for all files
    assert len(list(replica_core.list_replicas(dids=constituents))) == 4
    assert len(list(did_core.list_content(**dataset1))) == 4
    assert len(list(did_core.list_archive_content(**archive1))) == 4
    assert len(list(did_core.list_archive_content(**archive2))) == 3
    assert __get_archive_contents_history_count(archive1) == 0
    assert __get_archive_contents_history_count(archive2) == 0

    # Expire the first archive and run reaper again
    # the archive will be removed; and c_first_archive_only must be removed from datasets
    # and from the did table.
    replica_core.set_tombstone(rse_id=rse_id, tombstone=datetime.utcnow() - timedelta(days=1), **archive1)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=2 * archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=2 * archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**archive1)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_first_archive_only)
    assert len(list(replica_core.list_replicas(dids=constituents))) == 3
    assert len(list(did_core.list_content(**dataset1))) == 3
    assert len(list(did_core.list_archive_content(**archive1))) == 0
    assert len(list(did_core.list_archive_content(**archive2))) == 3
    assert __get_archive_contents_history_count(archive1) == 4
    assert __get_archive_contents_history_count(archive2) == 0

    # Expire the second archive replica and run reaper another time
    # c_with_expired_replica is removed because its external replica got removed at previous step
    # and it exists only inside the archive now.
    # If not open, Dataset2 will be removed because it will be empty.
    did_core.set_status(open=False, **dataset2)
    replica_core.set_tombstone(rse_id=rse_id, tombstone=datetime.utcnow() - timedelta(days=1), **archive2)
    cache_region.invalidate()
    rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=archive_size + nb_c_outside_archive * constituent_size)
    rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=archive_size + nb_c_outside_archive * constituent_size, free=1)
    reaper(once=True, rses=[], include_rses=rse_name, exclude_rses=None)
    # The archive must be removed
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**archive2)
    # The DIDs which only existed in the archive are also removed
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_first_archive_only)
    with pytest.raises(DataIdentifierNotFound):
        assert did_core.get_did(**c_with_expired_replica)
    # If the DID has a non-expired replica outside the archive without rules on it, the DID is not removed
    assert did_core.get_did(**c_with_replica)
    # If the DID has an expired replica outside the archive, but has rules on that replica, the DID is not removed
    assert did_core.get_did(**c_with_replica_and_rule)
    assert len(list(replica_core.list_replicas(dids=constituents))) == 2
    assert len(list(did_core.list_content(**dataset1))) == 2
    with pytest.raises(DataIdentifierNotFound):
        did_core.get_did(**dataset2)
    assert len(list(did_core.list_content(**dataset2))) == 0
    assert len(list(did_core.list_archive_content(**archive2))) == 0
    assert __get_archive_contents_history_count(archive1) == 4
    assert __get_archive_contents_history_count(archive2) == 3
Exemple #28
0
        """ REPLICA (CORE): Force update the replica path """
        tmp_scope = 'mock'
        nbfiles = 5
        rse_info = rsemgr.get_rse_info('MOCK')
        files = [{'scope': tmp_scope,
                  'name': 'file_%s' % generate_uuid(),
                  'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests//does/not/really/matter/where',
                  'bytes': 1,
                  'adler32': '0cc737eb',
                  'meta': {'events': 10},
                  'rse_id': rse_info['id'],
                  'path': '/does/not/really/matter/where'} for i in range(nbfiles)]
        add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)
        update_replicas_paths(files)
        for replica in list_replicas(dids=[{'scope': f['scope'],
                                            'name': f['name'],
                                            'type': DIDType.FILE} for f in files],
                                     schemes=['srm']):
            # force the changed string - if we look it up from the DB, then we're not testing anything :-D
            assert_equal(replica['rses']['MOCK2'][0], 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests//does/not/really/matter/where')

    def test_add_list_bad_replicas(self):
        """ REPLICA (CORE): Add bad replicas and list them"""
        tmp_scope = 'mock'
        nbfiles = 5
        # Adding replicas to deterministic RSE
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1, 'adler32': '0cc737eb', 'meta': {'events': 10}} for i in range(nbfiles)]
        rse_info = rsemgr.get_rse_info('MOCK')
        rse_id1 = rse_info['id']
        add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True)

        # Listing replicas on deterministic RSE
Exemple #29
0
def test_reaper():
    """ REAPER (DAEMON): Test the reaper daemon."""
    rse_name = rse_name_generator()
    rse_id = rse_core.add_rse(rse_name)

    mock_protocol = {
        'scheme': 'MOCK',
        'hostname': 'localhost',
        'port': 123,
        'prefix': '/test/reaper',
        'impl': 'rucio.rse.protocols.mock.Default',
        'domains': {
            'lan': {
                'read': 1,
                'write': 1,
                'delete': 1
            },
            'wan': {
                'read': 1,
                'write': 1,
                'delete': 1
            }
        }
    }
    rse_core.add_protocol(rse_id=rse_id, parameter=mock_protocol)

    nb_files = 30
    file_size = 2147483648  # 2G

    file_names = []
    for i in range(nb_files):
        file_name = 'lfn' + generate_uuid()
        file_names.append(file_name)
        replica_core.add_replica(rse_id=rse_id,
                                 scope=InternalScope('data13_hip'),
                                 name=file_name,
                                 bytes=file_size,
                                 tombstone=datetime.utcnow() -
                                 timedelta(days=1),
                                 account=InternalAccount('root'),
                                 adler32=None,
                                 md5=None)

    rse_core.set_rse_usage(rse_id=rse_id,
                           source='storage',
                           used=nb_files * file_size,
                           free=800)
    rse_core.set_rse_limits(rse_id=rse_id,
                            name='MinFreeSpace',
                            value=10737418240)
    rse_core.set_rse_limits(rse_id=rse_id,
                            name='MaxBeingDeletedFiles',
                            value=10)

    rses = [
        rse_core.get_rse(rse_id),
    ]
    reaper(once=True, rses=rses)
    reaper(once=True, rses=rses)
    assert_equal(
        len(
            list(
                replica_core.list_replicas(dids=[{
                    'scope':
                    InternalScope('data13_hip'),
                    'name':
                    n
                } for n in file_names],
                                           rse_expression=rse_name))),
        nb_files - 10)
Exemple #30
0
            'bytes': 1,
            'adler32': '0cc737eb',
            'meta': {
                'events': 10
            },
            'rse_id': rse_info['id'],
            'path': '/does/not/really/matter/where'
        } for i in range(nbfiles)]
        add_replicas(rse='MOCK2',
                     files=files,
                     account='root',
                     ignore_availability=True)
        update_replicas_paths(files)
        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['srm']):
            # force the changed string - if we look it up from the DB, then we're not testing anything :-D
            assert_equal(
                replica['rses']['MOCK2'][0],
                'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/does/not/really/matter/where'
            )

    def test_add_list_bad_replicas(self):
        """ REPLICA (CORE): Add bad replicas and list them"""
        tmp_scope = 'mock'
        nbfiles = 5
        # Adding replicas to deterministic RSE
        files = [{
            'scope': tmp_scope,
Exemple #31
0
def test_source_avoid_deletion(vo, caches_mock, core_config_mock, rse_factory,
                               did_factory, root_account, file_factory):
    """ Test that sources on a file block it from deletion """

    _, reaper_region = caches_mock
    src_rse1, src_rse1_id = rse_factory.make_mock_rse()
    src_rse2, src_rse2_id = rse_factory.make_mock_rse()
    dst_rse, dst_rse_id = rse_factory.make_mock_rse()
    all_rses = [src_rse1_id, src_rse2_id, dst_rse_id]
    any_source = f'{src_rse1}|{src_rse2}'

    for rse_id in [src_rse1_id, src_rse2_id]:
        rse_core.set_rse_limits(rse_id=rse_id, name='MinFreeSpace', value=1)
        rse_core.set_rse_usage(rse_id=rse_id, source='storage', used=1, free=0)
    distance_core.add_distance(src_rse1_id, dst_rse_id, ranking=20)
    distance_core.add_distance(src_rse2_id, dst_rse_id, ranking=10)

    # Upload a test file to both rses without registering
    did = did_factory.random_did()

    # Register replica on one source RSE
    replica_core.add_replica(rse_id=src_rse1_id,
                             account=root_account,
                             bytes_=1,
                             tombstone=datetime(year=1970, month=1, day=1),
                             **did)
    rule_core.add_rule(dids=[did],
                       account=root_account,
                       copies=1,
                       rse_expression=dst_rse,
                       grouping='ALL',
                       weight=None,
                       lifetime=None,
                       locked=False,
                       subscription_id=None)

    # Reaper will not delete a file which only has one replica if there is any pending transfer for it
    reaper_region.invalidate()
    reaper(once=True, rses=[], include_rses=any_source, exclude_rses=None)
    replica = next(
        iter(replica_core.list_replicas(dids=[did],
                                        rse_expression=any_source)))
    assert len(replica['pfns']) == 1

    # Register replica on second source rse
    replica_core.add_replica(rse_id=src_rse2_id,
                             account=root_account,
                             bytes_=1,
                             tombstone=datetime(year=1970, month=1, day=1),
                             **did)
    replica = next(
        iter(replica_core.list_replicas(dids=[did],
                                        rse_expression=any_source)))
    assert len(replica['pfns']) == 2

    # Submit the transfer. This will create the sources.
    submitter(once=True,
              rses=[{
                  'id': rse_id
              } for rse_id in all_rses],
              partition_wait_time=None,
              transfertool='mock',
              transfertype='single',
              filter_transfertool=None)

    # None of the replicas will be removed. They are protected by an entry in the sources table
    reaper_region.invalidate()
    reaper(once=True, rses=[], include_rses=any_source, exclude_rses=None)
    replica = next(
        iter(replica_core.list_replicas(dids=[did],
                                        rse_expression=any_source)))
    assert len(replica['pfns']) == 2

    @transactional_session
    def __delete_sources(rse_id, scope, name, session=None):
        session.execute(
            delete(Source).where(Source.rse_id == rse_id,
                                 Source.scope == scope, Source.name == name))

    # Deletion succeeds for one replica (second still protected by existing request)
    __delete_sources(src_rse1_id, **did)
    __delete_sources(src_rse2_id, **did)
    reaper_region.invalidate()
    reaper(once=True, rses=[], include_rses=any_source, exclude_rses=None)
    replica = next(
        iter(replica_core.list_replicas(dids=[did],
                                        rse_expression=any_source)))
    assert len(replica['pfns']) == 1
Exemple #32
0
def declare_suspicious_replicas_bad(once=False,
                                    younger_than=3,
                                    nattempts=10,
                                    rse_expression='MOCK',
                                    max_replicas_per_rse=100):
    """
    Main loop to check for available replicas which are labeled as suspicious

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'.
    :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE.
                                 If more is found, processing is skipped and warning is printed.
    :returns: None
    """

    # assemble the worker name identifier ('executable') including the rses from <rse_expression>
    # so that a second instance started with the same set of RSEs can be detected

    executable = argv[0]
    rses = []
    for rse in parse_expression(expression=rse_expression):
        rses.append(rse['rse'])
    rses.sort()
    executable += ' --rse-expression ' + str(rses)

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    live(executable=executable,
         hostname=socket.gethostname(),
         pid=os.getpid(),
         thread=threading.current_thread())

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable,
                             hostname=socket.gethostname(),
                             pid=os.getpid(),
                             thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread']

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logging.error(
                    'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.',
                    socket.gethostname())
                die(executable=executable,
                    hostname=socket.gethostname(),
                    pid=os.getpid(),
                    thread=threading.current_thread())
                break

            start = time.time()

            logging.info(
                'replica_recoverer[%i/%i]: ready to query replicas at RSE %s,'
                +
                ' reported suspicious in the last %i days at least %i times which are available on other RSEs.',  # NOQA: W503
                worker_number,
                total_workers,
                rse_expression,
                younger_than,
                nattempts)

            getfileskwargs = {
                'younger_than': younger_than,
                'nattempts': nattempts,
                'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                'available_elsewhere': True,
                'is_suspicious': True
            }

            recoverable_replicas = get_suspicious_files(
                rse_expression, **getfileskwargs)

            logging.info(
                'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.',
                worker_number, total_workers,
                time.time() - start, len(recoverable_replicas))

            if not recoverable_replicas and not once:
                logging.info(
                    'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas. Sleeping for 60 seconds.',
                    worker_number, total_workers, len(recoverable_replicas))
                GRACEFUL_STOP.wait(60)
            else:
                logging.info(
                    'replica_recoverer[%i/%i]: looking for replica surls.',
                    worker_number, total_workers)

                start = time.time()
                surls_to_recover = {
                }  # dictionary of { rse1: [surl1, surl2, ... ], rse2: ... }
                cnt_surl_not_found = 0
                for replica in recoverable_replicas:
                    scope = replica['scope']
                    name = replica['name']
                    rse = replica['rse']
                    rse_id = replica['rse_id']
                    if GRACEFUL_STOP.is_set():
                        break
                    if rse_id not in surls_to_recover:
                        surls_to_recover[rse_id] = []
                    # for each suspicious replica, we get its surl through the list_replicas function
                    surl_not_found = True
                    for rep in list_replicas([{'scope': scope, 'name': name}]):
                        for site in rep['rses']:
                            if site == rse_id:
                                surls_to_recover[rse_id].append(
                                    rep['rses'][site][0])
                                surl_not_found = False
                    if surl_not_found:
                        cnt_surl_not_found += 1
                        logging.warning(
                            'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.',
                            worker_number, total_workers, name, rse)

                logging.info(
                    'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.',
                    worker_number, total_workers,
                    len(recoverable_replicas) - cnt_surl_not_found,
                    len(recoverable_replicas),
                    time.time() - start)

                for rse_id in surls_to_recover:
                    logging.info(
                        'replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on RSE with id %s: %s.',
                        worker_number, total_workers,
                        len(surls_to_recover[rse_id]), rse_id,
                        str(surls_to_recover[rse_id]))
                    if len(surls_to_recover[rse_id]) > max_replicas_per_rse:
                        logging.warning(
                            'replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on RSE with id %s. Please investigate.',
                            worker_number, total_workers, max_replicas_per_rse,
                            str(len(surls_to_recover[rse_id])), rse_id)
                    else:
                        declare_bad_file_replicas(
                            pfns=surls_to_recover[rse_id],
                            reason='Suspicious. Automatic recovery.',
                            issuer=InternalAccount('root'),
                            status=BadFilesStatus.BAD,
                            session=None)
                        logging.info(
                            'replica_recoverer[%i/%i]: finished declaring bad replicas on RSE with id %s.',
                            worker_number, total_workers, rse_id)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])) or match('.*ORA-03135.*', str(err.args[0])):
                logging.warning(traceback.format_exc())
            else:
                logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        except Exception as err:
            logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' + err.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=socket.gethostname(),
        pid=os.getpid(),
        thread=threading.current_thread())
    logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number,
                 total_workers)
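
# A hedged usage sketch for the daemon above (the argument values mirror the
# defaults in its signature; 'MOCK' is the placeholder RSE expression already
# used there):
#
#     declare_suspicious_replicas_bad(once=True, younger_than=3, nattempts=10,
#                                     rse_expression='MOCK',
#                                     max_replicas_per_rse=100)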
Exemple #33
0
    def test_list_replica_with_domain(self):
        """ REPLICA (CORE): Add and list file replicas forcing domain"""

        tmp_rse = rse_name_generator()
        add_rse(tmp_rse)

        protocols = [
            {
                'scheme': 'MOCK',
                'hostname': 'localhost',
                'port': 17,
                'prefix': '/i/prefer/the/lan',
                'impl': 'rucio.rse.protocols.mock.Default',
                'domains': {
                    'lan': {
                        'read': 1,
                        'write': 1,
                        'delete': 1
                    },
                    'wan': {
                        'read': 2,
                        'write': 2,
                        'delete': 2
                    }
                }
            },
            {
                'scheme': 'MOCK',
                'hostname': 'localhost',
                'port': 18,
                'prefix': '/i/prefer/the/wan',
                'impl': 'rucio.rse.protocols.mock.Default',
                'domains': {
                    'lan': {
                        'read': 2,
                        'write': 2,
                        'delete': 2
                    },
                    'wan': {
                        'read': 1,
                        'write': 1,
                        'delete': 1
                    }
                }
            },
        ]
        for p in protocols:
            add_protocol(tmp_rse, p)

        nbfiles = 3
        files = [{
            'scope': 'mock',
            'name': 'file_%s' % generate_uuid(),
            'bytes': 1234,
            'adler32': '01234567',
            'meta': {
                'events': 1234
            }
        } for _ in range(nbfiles)]

        add_replicas(rse=tmp_rse,
                     files=files,
                     account='root',
                     ignore_availability=True)

        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['MOCK'],
                                     domain='wan'):
            assert_in('/i/prefer/the/wan', list(replica['pfns'].keys())[0])

        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['MOCK'],
                                     domain='lan'):
            assert_in('/i/prefer/the/lan', list(replica['pfns'].keys())[0])

        # test old client behaviour - get all WAN answers
        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['MOCK']):
            cmd = 'rucio list-file-replicas --pfns %s:%s' % (replica['scope'],
                                                             replica['name'])
            _, stdout, _ = execute(cmd)
            assert_in('/i/prefer/the/wan', stdout)

        # # force all LAN
        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['MOCK'],
                                     domain='lan'):
            cmd = 'rucio list-file-replicas --pfns --domain=lan %s:%s' % (
                replica['scope'], replica['name'])
            errno, stdout, stderr = execute(cmd)
            assert_in('/i/prefer/the/lan', stdout)

        # # force all WAN
        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['MOCK'],
                                     domain='wan'):
            cmd = 'rucio list-file-replicas --pfns --domain=wan %s:%s' % (
                replica['scope'], replica['name'])
            errno, stdout, stderr = execute(cmd)
            assert_in('/i/prefer/the/wan', stdout)

        # # force both WAN and LAN
        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['MOCK'],
                                     domain='all'):
            cmd = 'rucio list-file-replicas --pfns --domain=all %s:%s' % (
                replica['scope'], replica['name'])
            errno, stdout, stderr = execute(cmd)
            assert_in('/i/prefer/the/wan', stdout)
            assert_in('/i/prefer/the/lan', stdout)
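The two MOCK protocols above differ only in their per-domain priorities, and the assertions check that list_replicas and the CLI honour them. As a rough, self-contained illustration of the idea (a hypothetical sketch, not Rucio's rsemgr code), the preferred protocol for a domain is simply the one with the lowest priority value:

# Minimal sketch of priority-based protocol selection; lower value = preferred.
# The 'protocols' structure mirrors the test data above, trimmed to what matters.
protocols = [
    {'port': 17, 'domains': {'lan': {'read': 1}, 'wan': {'read': 2}}},
    {'port': 18, 'domains': {'lan': {'read': 2}, 'wan': {'read': 1}}},
]

def pick_protocol(protocols, domain, operation='read'):
    # choose the protocol with the smallest priority for the given domain/operation
    return min(protocols, key=lambda p: p['domains'][domain][operation])

assert pick_protocol(protocols, 'lan')['port'] == 17  # the LAN-preferred protocol
assert pick_protocol(protocols, 'wan')['port'] == 18  # the WAN-preferred protocol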
Example #34
def list_replicas(dids,
                  schemes=None,
                  unavailable=False,
                  request_id=None,
                  ignore_availability=True,
                  all_states=False,
                  rse_expression=None,
                  client_location=None,
                  domain=None,
                  signature_lifetime=None,
                  resolve_archives=True,
                  resolve_parents=False,
                  updated_after=None,
                  issuer=None):
    """
    List file replicas for a list of data identifiers.

    :param dids: The list of data identifiers (DIDs).
    :param schemes: A list of schemes to filter the replicas. (e.g. file, http, ...)
    :param unavailable: Also include unavailable replicas in the list.
    :param request_id: ID associated with the request for debugging.
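    :param ignore_availability: If True (default), the availability of the RSEs is ignored when listing replicas.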
    :param all_states: Return all replicas whatever state they are in. Adds an extra 'states' entry in the result dictionary.
    :param rse_expression: The RSE expression to restrict replicas on a set of RSEs.
    :param client_location: Client location dictionary for PFN modification {'ip', 'fqdn', 'site'}
    :param domain: The network domain for the call, either None, 'wan' or 'lan'. Compatibility fallback: None falls back to 'wan'.
    :param signature_lifetime: If supported, in seconds, restrict the lifetime of the signed PFN.
    :param resolve_archives: When set to True, find archives which contain the replicas.
    :param resolve_parents: When set to True, find all parent datasets which contain the replicas.
    :param updated_after: datetime object (UTC time), only return replicas updated after this time
    :param issuer: The issuer account.
    """
    validate_schema(name='r_dids', obj=dids)

    # Allow selected authenticated users to retrieve signed URLs.
    # Unauthenticated users, or permission-less users will get the raw URL without the signature.
    sign_urls = False
    if permission.has_permission(issuer=issuer,
                                 action='get_signed_url',
                                 kwargs={}):
        sign_urls = True

    for d in dids:
        d['scope'] = InternalScope(d['scope'])

    replicas = replica.list_replicas(dids=dids,
                                     schemes=schemes,
                                     unavailable=unavailable,
                                     request_id=request_id,
                                     ignore_availability=ignore_availability,
                                     all_states=all_states,
                                     rse_expression=rse_expression,
                                     client_location=client_location,
                                     domain=domain,
                                     sign_urls=sign_urls,
                                     signature_lifetime=signature_lifetime,
                                     resolve_archives=resolve_archives,
                                     resolve_parents=resolve_parents,
                                     updated_after=updated_after)

    for rep in replicas:
        # 'rses' and 'states' use rse_id as the key. This needs updating to be rse.
        keys = ['rses', 'states']
        for k in keys:
            old_dict = rep.get(k, None)
            if old_dict is not None:
                new_dict = {}
                for rse_id in old_dict:
                    rse = get_rse_name(
                        rse_id=rse_id) if rse_id is not None else None
                    new_dict[rse] = old_dict[rse_id]
                rep[k] = new_dict

        rep['scope'] = rep['scope'].external
        if 'parents' in rep:
            new_parents = []
            for p in rep['parents']:
                scope, name = p.split(':')
                scope = InternalScope(scope, fromExternal=False).external
                new_parents.append('{}:{}'.format(scope, name))
            rep['parents'] = new_parents

        yield rep
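A hedged usage sketch of this API-layer generator (the DID values and the 'root' issuer below are placeholders, not real data):

# Hypothetical usage sketch; 'mock'/'file_abc' and the 'root' issuer are assumptions.
dids = [{'scope': 'mock', 'name': 'file_abc'}]
for rep in list_replicas(dids=dids, schemes=['http'], domain='wan', issuer='root'):
    # 'rses' is keyed by RSE name after the rse_id -> name translation above
    print(rep['scope'], rep['name'], list(rep.get('rses', {})))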
Example #35
def declare_suspicious_replicas_bad(once=False,
                                    younger_than=3,
                                    nattempts=10,
                                    rse_expression='MOCK',
                                    vos=None,
                                    max_replicas_per_rse=100,
                                    sleep_time=60):
    """
    Main loop to check for available replicas which are labelled as suspicious.

    Gets a list of suspicious replicas that are listed as AVAILABLE in the 'replicas' table
    and available on at least one other RSE. Finds the SURLs of these replicas and declares them bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days to look back in the bad_replicas table
                         for replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression')
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of occurrences in the bad_replicas table
                      required for a replica to appear in the resulting list of replicas for recovery.
    :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'.
    :param vos: VOs on which to look for RSEs. Only used in multi-VO mode.
                If None, all VOs are considered in multi-VO mode; in single-VO mode 'def' is used.
    :param max_replicas_per_rse: Maximum number of replicas allowed to be declared bad per RSE.
                                 If more are found, processing for that RSE is skipped and a warning is logged.
    :param sleep_time: Thread sleep time after each chunk of work.
    :returns: None
    """

    # assemble the worker name identifier ('executable'), including the RSEs from <rse_expression>,
    # so that the start of a second instance with the same set of RSEs can be detected

    executable = argv[0]

    multi_vo = config_get_bool('common',
                               'multi_vo',
                               raise_exception=False,
                               default=False)
    if not multi_vo:
        if vos:
            logging.warning(
                'Ignoring argument vos, this is only applicable in a multi-VO setup.'
            )
        vos = ['def']
    else:
        if vos:
            invalid = set(vos) - set([v['vo'] for v in list_vos()])
            if invalid:
                msg = 'VO{} {} cannot be found'.format(
                    's' if len(invalid) > 1 else '',
                    ', '.join([repr(v) for v in invalid]))
                raise VONotFound(msg)
        else:
            vos = [v['vo'] for v in list_vos()]
        logging.info('replica_recoverer: This instance will work on VO%s: %s',
                     's' if len(vos) > 1 else '', ', '.join(vos))

    # Don't require a result from the expression at each VO, only raise if we can't get a result from any of them
    rses = []
    exceptions_raised = 0
    for vo in vos:
        try:
            parsed_rses = parse_expression(expression=rse_expression,
                                           filter_={'vo': vo})
        except InvalidRSEExpression:
            exceptions_raised += 1
            parsed_rses = []
        for rse in parsed_rses:
            rses.append(rse['id'])
    if exceptions_raised == len(vos):
        raise InvalidRSEExpression('RSE Expression resulted in an empty set.')

    rses.sort()
    executable += ' --rse-expression ' + str(rses)

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    live(executable=executable,
         hostname=socket.gethostname(),
         pid=os.getpid(),
         thread=threading.current_thread())

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable,
                             hostname=socket.gethostname(),
                             pid=os.getpid(),
                             thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread']

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logging.error(
                    'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.',
                    socket.gethostname())
                die(executable=executable,
                    hostname=socket.gethostname(),
                    pid=os.getpid(),
                    thread=threading.current_thread())
                break

            start = time.time()

            logging.info(
                'replica_recoverer[%i/%i]: ready to query replicas at RSE %s,'
                ' reported suspicious in the last %i days at least %i times and available on other RSEs.',
                worker_number, total_workers, rse_expression, younger_than, nattempts)

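            # the excluded states 'B', 'R', 'D', 'L', 'T' are bad-file statuses that need
            # no recovery (assumed mapping to BAD, RECOVERED, DELETED, LOST,
            # TEMPORARY_UNAVAILABLE in Rucio's BadFilesStatus)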
            getfileskwargs = {
                'younger_than': younger_than,
                'nattempts': nattempts,
                'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                'available_elsewhere': True,
                'is_suspicious': True
            }

            # Don't require a result from the expression at each VO, only raise if we can't get a result from any of them
            recoverable_replicas = []
            exceptions_raised = 0
            for vo in vos:
                try:
                    recoverable_replicas.extend(
                        get_suspicious_files(rse_expression,
                                             filter_={'vo': vo},
                                             **getfileskwargs))
                except InvalidRSEExpression:
                    exceptions_raised += 1
            if exceptions_raised == len(vos):
                raise InvalidRSEExpression(
                    'RSE Expression resulted in an empty set.')

            logging.info(
                'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.',
                worker_number, total_workers,
                time.time() - start, len(recoverable_replicas))

            if not recoverable_replicas and not once:
                logging.info(
                    'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas.',
                    worker_number, total_workers, len(recoverable_replicas))
                daemon_sleep(start_time=start,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)
            else:
                logging.info(
                    'replica_recoverer[%i/%i]: looking for replica surls.',
                    worker_number, total_workers)

                start = time.time()
                # dictionary of {vo1: {rse1: [surl1, surl2, ...], rse2: ...}, vo2: ...}
                surls_to_recover = {}
                cnt_surl_not_found = 0
                for replica in recoverable_replicas:
                    scope = replica['scope']
                    name = replica['name']
                    vo = scope.vo
                    rse = replica['rse']
                    rse_id = replica['rse_id']
                    if GRACEFUL_STOP.is_set():
                        break
                    if vo not in surls_to_recover:
                        surls_to_recover[vo] = {}
                    if rse_id not in surls_to_recover[vo]:
                        surls_to_recover[vo][rse_id] = []
                    # for each suspicious replica, we get its surl through the list_replicas function
                    surl_not_found = True
                    for rep in list_replicas([{'scope': scope, 'name': name}]):
                        for site in rep['rses']:
                            if site == rse_id:
                                surls_to_recover[vo][rse_id].append(
                                    rep['rses'][site][0])
                                surl_not_found = False
                    if surl_not_found:
                        cnt_surl_not_found += 1
                        logging.warning(
                            'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.',
                            worker_number, total_workers, name, rse)

                logging.info(
                    'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.',
                    worker_number, total_workers,
                    len(recoverable_replicas) - cnt_surl_not_found,
                    len(recoverable_replicas),
                    time.time() - start)

                for vo in surls_to_recover:
                    for rse_id in surls_to_recover[vo]:
                        # note: log rse_id here; 'rse' still holds the value from the previous loop
                        logging.info(
                            'replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on RSE id %s: %s.',
                            worker_number, total_workers,
                            len(surls_to_recover[vo][rse_id]), rse_id,
                            str(surls_to_recover[vo][rse_id]))
                        if len(surls_to_recover[vo][rse_id]) > max_replicas_per_rse:
                            logging.warning(
                                'replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on RSE id %s. Please investigate.',
                                worker_number, total_workers,
                                max_replicas_per_rse,
                                str(len(surls_to_recover[vo][rse_id])), rse_id)
                        else:
                            declare_bad_file_replicas(
                                pfns=surls_to_recover[vo][rse_id],
                                reason='Suspicious. Automatic recovery.',
                                issuer=InternalAccount('root', vo=vo),
                                status=BadFilesStatus.BAD,
                                session=None)
                            logging.info(
                                'replica_recoverer[%i/%i]: finished declaring bad replicas on RSE id %s.',
                                worker_number, total_workers, rse_id)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])) or match('.*ORA-03135.*', str(err.args[0])):
                # transient connection-pool / lost-connection errors only warrant a warning
                logging.warning(traceback.format_exc())
            else:
                logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' +
                           err.__class__.__name__)
        except Exception as err:
            logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.' +
                           err.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=socket.gethostname(),
        pid=os.getpid(),
        thread=threading.current_thread())
    logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number,
                 total_workers)
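For a one-shot run, e.g. from a test, the daemon can be invoked with once=True; a minimal sketch (the 'MOCK' rse_expression is an assumption):

# One-shot invocation sketch; 'MOCK' is a placeholder RSE expression.
declare_suspicious_replicas_bad(once=True,
                                younger_than=3,
                                nattempts=10,
                                rse_expression='MOCK',
                                max_replicas_per_rse=100)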