Ejemplo n.º 1
0
    def test_add_list_bad_replicas(self):
        """ REPLICA (CORE): Add bad replicas and list them

        Registers files on the deterministic RSE 'MOCK' and on the
        non-deterministic RSE 'MOCK2', declares every listed srm replica as
        bad and checks that each one is reported by list_bad_replicas() for
        the matching RSE id. Finally declares a PFN that was never registered
        and expects it back as an unknown replica under 'MOCK2'.
        """
        scope = 'mock'
        file_count = 5

        def collect_srm_replicas(registered, rse_name):
            """Return (PFNs found under rse_name, raw list_replicas entries) for the given files."""
            dids = [{'scope': item['scope'], 'name': item['name'], 'type': DIDType.FILE} for item in registered]
            pfns, entries = [], []
            for entry in list_replicas(dids=dids, schemes=['srm']):
                pfns.extend(entry['rses'][rse_name])
                entries.append(entry)
            return pfns, entries

        def count_declared_bad(entries, rse_id):
            """Count (listed entry, bad replica) pairs on rse_id that share scope and name."""
            bad_replicas = list_bad_replicas()
            return sum(1
                       for entry in entries
                       for bad in bad_replicas
                       if bad['rse_id'] == rse_id and bad['scope'] == entry['scope'] and bad['name'] == entry['name'])

        # --- Deterministic RSE -------------------------------------------------
        files = [{'scope': scope,
                  'name': 'file_%s' % generate_uuid(),
                  'bytes': 1,
                  'adler32': '0cc737eb',
                  'meta': {'events': 10}} for _ in range(file_count)]
        mock_id = rsemgr.get_rse_info('MOCK')['id']
        add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True)

        pfns, entries = collect_srm_replicas(files, 'MOCK')
        outcome = declare_bad_file_replicas(pfns, 'This is a good reason', 'root')
        assert_equal(outcome, {})
        assert_equal(len(pfns), count_declared_bad(entries, mock_id))

        # --- Non-deterministic RSE: every file carries an explicit PFN ---------
        files = [{'scope': scope,
                  'name': 'file_%s' % generate_uuid(),
                  'bytes': 1,
                  'adler32': '0cc737eb',
                  'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (scope, generate_uuid()),
                  'meta': {'events': 10}} for _ in range(file_count)]
        mock2_id = rsemgr.get_rse_info('MOCK2')['id']
        add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)

        pfns, entries = collect_srm_replicas(files, 'MOCK2')
        outcome = declare_bad_file_replicas(pfns, 'This is a good reason', 'root')
        assert_equal(outcome, {})
        assert_equal(len(pfns), count_declared_bad(entries, mock2_id))

        # --- A PFN that was never registered must come back as unknown ---------
        unknown_pfns = ['srm://mock2.com/rucio/tmpdisk/rucio_tests/%s/%s' % (scope, generate_uuid())]
        outcome = declare_bad_file_replicas(unknown_pfns, 'This is a good reason', 'root')
        assert_equal(outcome, {'MOCK2': ['%s Unknown replica' % pfn for pfn in unknown_pfns]})
Ejemplo n.º 2
0
def test_add_list_bad_replicas(rse_factory, mock_scope, root_account):
    """ REPLICA (CORE): Add bad replicas and list them

    Creates one deterministic and one non-deterministic srm RSE through
    ``rse_factory``, registers files on each, declares every listed srm
    replica as bad and checks that list_bad_replicas() reports each of them
    for the matching RSE id. A PFN that was never registered must be
    returned as an unknown replica keyed by the second RSE id.
    """
    file_count = 5

    def collect_srm_replicas(registered, rse_id):
        """Return (PFNs found under rse_id, raw list_replicas entries) for the given files."""
        dids = [{'scope': item['scope'], 'name': item['name'], 'type': DIDType.FILE} for item in registered]
        pfns, entries = [], []
        for entry in list_replicas(dids=dids, schemes=['srm']):
            pfns.extend(entry['rses'][rse_id])
            entries.append(entry)
        return pfns, entries

    def count_declared_bad(entries, rse_id):
        """Count (listed entry, bad replica) pairs on rse_id that share scope and name."""
        bad_replicas = list_bad_replicas()
        return sum(1
                   for entry in entries
                   for bad in bad_replicas
                   if bad['rse_id'] == rse_id and bad['scope'] == entry['scope'] and bad['name'] == entry['name'])

    # --- Deterministic RSE -----------------------------------------------------
    _, det_rse_id = rse_factory.make_srm_rse(deterministic=True)
    files = [{'scope': mock_scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'meta': {'events': 10}} for _ in range(file_count)]
    add_replicas(rse_id=det_rse_id, files=files, account=root_account, ignore_availability=True)

    pfns, entries = collect_srm_replicas(files, det_rse_id)
    outcome = declare_bad_file_replicas(pfns, 'This is a good reason', root_account)
    assert outcome == {}
    assert len(pfns) == count_declared_bad(entries, det_rse_id)

    # --- Non-deterministic RSE: every file carries an explicit PFN -------------
    _, nondet_rse_id = rse_factory.make_srm_rse(deterministic=False)
    files = [{'scope': mock_scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'pfn': 'srm://%s.cern.ch/srm/managerv2?SFN=/test/%s/%s' % (nondet_rse_id, mock_scope, generate_uuid()),
              'meta': {'events': 10}} for _ in range(file_count)]
    add_replicas(rse_id=nondet_rse_id, files=files, account=root_account, ignore_availability=True)

    pfns, entries = collect_srm_replicas(files, nondet_rse_id)
    outcome = declare_bad_file_replicas(pfns, 'This is a good reason', root_account)
    assert outcome == {}
    assert len(pfns) == count_declared_bad(entries, nondet_rse_id)

    # --- A PFN that was never registered must come back as unknown -------------
    unknown_pfns = ['srm://%s.cern.ch/test/%s/%s' % (nondet_rse_id, mock_scope, generate_uuid())]
    outcome = declare_bad_file_replicas(unknown_pfns, 'This is a good reason', root_account)
    assert outcome == {nondet_rse_id: ['%s Unknown replica' % pfn for pfn in unknown_pfns]}
Ejemplo n.º 3
0
def test_get_bad_replicas_backlog(rse_factory, mock_scope, root_account, file_config_mock):
    """ REPLICA (CORE): Check the behaviour of the necromancer in case of backlog on an RSE

    Declares 100 bad replicas on a first RSE and 20 on a second one, checks
    the backlog reported for the first RSE, then runs the necromancer once
    with ``bulk=20`` and expects the remaining bad replicas (80) to all belong
    to the first RSE.
    """

    def make_files(count):
        """Build ``count`` file descriptions in ``mock_scope`` with random names."""
        return [{'scope': mock_scope,
                 'name': 'file_%s' % generate_uuid(),
                 'bytes': 1,
                 'adler32': '0cc737eb',
                 'meta': {'events': 10}} for _ in range(count)]

    def file_dids(registered):
        """Turn file descriptions into the DID dicts expected by list_replicas."""
        return [{'scope': item['scope'], 'name': item['name'], 'type': DIDType.FILE} for item in registered]

    # Run necromancer once
    necromancer_run(threads=1, bulk=10000, once=True)

    first_count = 100
    second_count = 20
    first_rse, first_rse_id = rse_factory.make_srm_rse(deterministic=True)
    _, second_rse_id = rse_factory.make_srm_rse(deterministic=True)

    # Bad replicas on the first RSE; remember what list_bad_replicas should return for them
    files = make_files(first_count)
    add_replicas(rse_id=first_rse_id, files=files, account=root_account, ignore_availability=True)

    first_pfns = []
    expected_bad = []
    for entry in list_replicas(dids=file_dids(files), schemes=['srm']):
        first_pfns.extend(entry['rses'][first_rse_id])
        expected_bad.append({'scope': entry['scope'],
                             'name': entry['name'],
                             'rse': first_rse,
                             'rse_id': first_rse_id})
    outcome = declare_bad_file_replicas(first_pfns, 'This is a good reason', root_account)
    assert outcome == {}

    backlog = get_bad_replicas_backlog(force_refresh=True)
    assert first_rse_id in backlog
    assert backlog[first_rse_id] == first_count

    # More bad replicas, this time on the second RSE
    files = make_files(second_count)
    add_replicas(rse_id=second_rse_id, files=files, account=root_account, ignore_availability=True)

    second_pfns = []
    for entry in list_replicas(dids=file_dids(files), schemes=['srm']):
        second_pfns.extend(entry['rses'][second_rse_id])
    outcome = declare_bad_file_replicas(second_pfns, 'This is a good reason', root_account)
    assert outcome == {}

    # Restricting the listing to the first RSE must return exactly what was declared there
    still_bad = list_bad_replicas(rses=[{'id': first_rse_id}])
    assert len(still_bad) == first_count
    for bad in still_bad:
        assert bad in expected_bad

    # Run necromancer once, all the files on RSE2 should be gone, 80 files should stay on RSE1
    get_bad_replicas_backlog(force_refresh=True)
    necromancer_run(threads=1, bulk=20, once=True)

    still_bad = list_bad_replicas(rses=[{'id': first_rse_id}, {'id': second_rse_id}])
    assert len(still_bad) == 80
    for bad in still_bad:
        assert bad['rse_id'] == first_rse_id
Ejemplo n.º 4
0
def necromancer(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash, identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    param worker_number: The number of the worker (thread).
    param total_number: The total number of workers (threads).
    chunk_size: The chunk of the size to process.
    once: To run only once
    """
    sleep_time = 60
    while not graceful_stop.is_set():
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=chunk_size, worker_number=worker_number, total_workers=total_workers)
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info('Thread [%i/%i] : Working on %s:%s on %s' % (worker_number, total_workers, scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info('Thread [%i/%i] : File %s:%s has no other replicas, it will be marked as lost' % (worker_number, total_workers, scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile',  delta=1)
                    except DatabaseException, e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
                else:
                    logging.info('Thread [%i/%i] : File %s:%s can be recovered. Available sources : %s' % (worker_number, total_workers, scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.recovering',  delta=1)
                    except DatabaseException, e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
            logging.info('Thread [%i/%i] : It took %s seconds to process %s replicas' % (worker_number, total_workers, str(time.time() - stime), str(len(replicas))))
Ejemplo n.º 5
0
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """

    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        hb = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])

        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=bulk, thread=hb['assign_thread'], total_threads=hb['nr_threads'])

            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))

                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info(prepend_str + 'File %s:%s has no other replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException, error:
                        logging.info(prepend_str + '%s' % (str(error)))

                else:
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s' % (scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException, error:
                        logging.info(prepend_str + '%s' % (str(error)))

            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
Ejemplo n.º 6
0
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    Each cycle renews the heartbeat, fetches up to ``bulk`` bad replicas for
    the thread slot assigned by the heartbeat and, per replica, either marks
    the file as lost (no AVAILABLE or TEMPORARY_UNAVAILABLE replica state is
    reported for it) or flags its rules for recovery. When more than an hour
    has passed since the last history update, the bad-replica history is
    refreshed in chunks of 1000. A cycle that processed fewer than ``bulk``
    replicas sleeps for the remainder of the 60 second cycle time.

    :param thread: Thread number at startup. Only used to prefix log messages
                   emitted when no heartbeat was taken (stop already requested).
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """

    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()
    history_chunk_size = 1000

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    # Fallback log prefix: the real one is built from the heartbeat inside the
    # loop, but the shutdown messages after the loop must also work when the
    # loop body never ran because a stop was requested before the first cycle.
    prepend_str = 'Thread [%s] : ' % (thread,)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

        stime = time.time()
        # Pre-set so the bulk comparison below still works if listing raised.
        replicas = []
        try:
            replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'])

            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))

                # Named so that it does not shadow the core list_replicas function.
                replica_states = get_replicas_state(scope=scope, name=name)
                if ReplicaState.AVAILABLE not in replica_states and ReplicaState.TEMPORARY_UNAVAILABLE not in replica_states:
                    logging.info(prepend_str + 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        # Best effort: the replica stays bad and is retried in a later cycle.
                        logging.info(prepend_str + '%s' % (str(error)))

                else:
                    rep = replica_states.get(ReplicaState.AVAILABLE, [])
                    unavailable_rep = replica_states.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        # Best effort: the replica stays bad and is retried in a later cycle.
                        logging.info(prepend_str + '%s' % (str(error)))

            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
        except Exception:
            # Top-level boundary of the worker: log the traceback and keep the daemon alive.
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(format_exception(exc_type, exc_value, exc_traceback)).strip())

        if once:
            break

        now = time.time()
        if (now - update_history_time) > update_history_threshold:
            logging.info(prepend_str + 'Last update of history table %s seconds ago. Running update.' % (now - update_history_time))
            bad_replicas = list_bad_replicas_history(limit=1000000,
                                                     thread=heart_beat['assign_thread'],
                                                     total_threads=heart_beat['nr_threads'])
            for rse_id in bad_replicas:
                # Integer ceiling division: correct under both true and floor
                # division semantics of "/", unlike ceil(len / size).
                nchunk = (len(bad_replicas[rse_id]) + history_chunk_size - 1) // history_chunk_size
                logging.debug(prepend_str + 'Update history for rse_id %s' % (rse_id))
                # Number chunks from 1 so the last message reads "chunk N/N".
                for cnt, chunk in enumerate(chunks(bad_replicas[rse_id], history_chunk_size), 1):
                    logging.debug(prepend_str + ' History for rse_id %s : chunk %i/%i' % (rse_id, cnt, nchunk))
                    update_bad_replicas_history(chunk, rse_id)
            logging.info(prepend_str + 'History table updated in %s seconds' % (time.time() - now))
            update_history_time = time.time()

        tottime = time.time() - stime
        if len(replicas) == bulk:
            logging.info(prepend_str + 'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle')
        elif tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
            time.sleep(sleep_time - tottime)

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
Ejemplo n.º 7
0
    def test_add_list_bad_replicas(self):
        """ REPLICA (CLIENT): Add bad replicas

        Same scenario as the core test but driven through the replica client:
        replicas on 'MOCK' and 'MOCK2' are declared bad and must be reported
        by list_bad_replicas(). After one necromancer run, attaching the files
        declared bad on 'MOCK' to a new dataset must raise
        UnsupportedOperation. A PFN that was never registered must be
        returned as an unknown replica under 'MOCK2'.
        """
        scope = 'mock'
        file_count = 5
        reason = 'This is a good reason'

        def collect_srm_replicas(registered, rse_name):
            """Return (PFNs found under rse_name, raw list_replicas entries) for the given files."""
            dids = [{'scope': item['scope'], 'name': item['name']} for item in registered]
            pfns, entries = [], []
            for entry in self.replica_client.list_replicas(dids=dids, schemes=['srm'], unavailable=True):
                pfns.extend(entry['rses'][rse_name])
                entries.append(entry)
            return pfns, entries

        def count_declared_bad(entries, rse_id):
            """Count (listed entry, bad replica) pairs on rse_id that share scope and name."""
            bad_replicas = list_bad_replicas()
            return sum(1
                       for entry in entries
                       for bad in bad_replicas
                       if bad['rse_id'] == rse_id and bad['scope'] == entry['scope'] and bad['name'] == entry['name'])

        # --- Deterministic RSE -------------------------------------------------
        files = [{'scope': scope,
                  'name': 'file_%s' % generate_uuid(),
                  'bytes': 1,
                  'adler32': '0cc737eb',
                  'meta': {'events': 10}} for _ in range(file_count)]
        mock_id = rsemgr.get_rse_info('MOCK')['id']
        self.replica_client.add_replicas(rse='MOCK', files=files)

        pfns, entries = collect_srm_replicas(files, 'MOCK')
        outcome = self.replica_client.declare_bad_file_replicas(pfns, reason)
        assert_equal(outcome, {})
        assert_equal(len(pfns), count_declared_bad(entries, mock_id))

        # Run necromancer once
        necromancer_run(threads=1, bulk=10000, once=True)

        # Try to attach a lost file
        dataset_name = 'dataset_%s' % generate_uuid()
        self.did_client.add_dataset(scope=scope, name=dataset_name)
        with assert_raises(UnsupportedOperation):
            self.did_client.add_files_to_dataset(scope, name=dataset_name, files=files, rse='MOCK')

        # --- Non-deterministic RSE: every file carries an explicit PFN ---------
        files = [{'scope': scope,
                  'name': 'file_%s' % generate_uuid(),
                  'bytes': 1,
                  'adler32': '0cc737eb',
                  'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (scope, generate_uuid()),
                  'meta': {'events': 10}} for _ in range(file_count)]
        mock2_id = rsemgr.get_rse_info('MOCK2')['id']
        self.replica_client.add_replicas(rse='MOCK2', files=files)

        pfns, entries = collect_srm_replicas(files, 'MOCK2')
        print(pfns, entries)
        outcome = self.replica_client.declare_bad_file_replicas(pfns, reason)
        print(outcome)
        assert_equal(outcome, {})
        assert_equal(len(pfns), count_declared_bad(entries, mock2_id))

        # --- A PFN that was never registered must come back as unknown ---------
        unknown_pfns = ['srm://mock2.com/rucio/tmpdisk/rucio_tests/%s/%s' % (scope, generate_uuid())]
        outcome = self.replica_client.declare_bad_file_replicas(unknown_pfns, reason)
        assert_equal(outcome, {'MOCK2': ['%s Unknown replica' % pfn for pfn in unknown_pfns]})
Ejemplo n.º 8
0
        # Listing replicas on deterministic RSE
        replicas = []
        list_rep = []
        for replica in list_replicas(dids=[{
                'scope': f['scope'],
                'name': f['name'],
                'type': DIDType.FILE
        } for f in files],
                                     schemes=['srm']):
            replicas.extend(replica['rses']['MOCK'])
            list_rep.append(replica)
        r = declare_bad_file_replicas(replicas, 'This is a good reason',
                                      'root')
        assert_equal(r, {})
        bad_replicas = list_bad_replicas()
        nbbadrep = 0
        for rep in list_rep:
            for badrep in bad_replicas:
                if badrep['rse_id'] == rse_id1:
                    if badrep['scope'] == rep['scope'] and badrep[
                            'name'] == rep['name']:
                        nbbadrep += 1
        assert_equal(len(replicas), nbbadrep)

        # Adding replicas to non-deterministic RSE
        files = [{
            'scope':
            tmp_scope,
            'name':
            'file_%s' % generate_uuid(),
Ejemplo n.º 9
0
def test_client_add_list_bad_replicas(rse_factory, replica_client, did_client):
    """ REPLICA (CLIENT): Add bad replicas

    Covers four things through the replica client:

    1. PFNs on a deterministic RSE are declared bad and then listed; after one
       necromancer run, attaching those files to a dataset must raise
       UnsupportedOperation.
    2. The same declaration and listing on a non-deterministic RSE, plus an
       unregistered PFN that must be reported as an unknown replica.
    3. Declaring bad replicas with scope/name/rse dicts instead of PFNs.
    4. Mixing dicts and PFNs in one call must raise InvalidType.
    """
    scope = 'mock'
    file_count = 5
    reason = 'This is a good reason'

    def collect_srm_replicas(registered, rse_name):
        """Return (PFNs found under rse_name, raw list_replicas entries) for the given files."""
        dids = [{'scope': item['scope'], 'name': item['name']} for item in registered]
        pfns, entries = [], []
        for entry in replica_client.list_replicas(dids=dids, schemes=['srm'], all_states=True):
            pfns.extend(entry['rses'][rse_name])
            entries.append(entry)
        return pfns, entries

    def count_declared_bad(entries, rse_id):
        """Count (entry, bad replica) pairs on rse_id that share external scope and name."""
        bad_replicas = list_bad_replicas()
        return sum(1
                   for entry in entries
                   for bad in bad_replicas
                   if bad['rse_id'] == rse_id
                   and bad['scope'].external == entry['scope']
                   and bad['name'] == entry['name'])

    # --- 1. Deterministic RSE, declared by PFN ---------------------------------
    files = [{'scope': scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'meta': {'events': 10}} for _ in range(file_count)]
    det_rse, det_rse_id = rse_factory.make_srm_rse(deterministic=True)
    replica_client.add_replicas(rse=det_rse, files=files)

    pfns, entries = collect_srm_replicas(files, det_rse)
    outcome = replica_client.declare_bad_file_replicas(pfns, reason)
    assert outcome == {}
    assert len(pfns) == count_declared_bad(entries, det_rse_id)

    # Run necromancer once
    necromancer_run(threads=1, bulk=10000, once=True)

    # Try to attach a lost file
    dataset_name = 'dataset_%s' % generate_uuid()
    did_client.add_dataset(scope=scope, name=dataset_name)
    with pytest.raises(UnsupportedOperation):
        did_client.add_files_to_dataset(scope, name=dataset_name, files=files, rse=det_rse)

    # --- 2. Non-deterministic RSE: every file carries an explicit PFN ----------
    nondet_rse, nondet_rse_id = rse_factory.make_srm_rse(deterministic=False)
    files = [{'scope': scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'pfn': 'srm://%s.cern.ch/srm/managerv2?SFN=/test_%s/%s/%s' % (nondet_rse_id, nondet_rse_id, scope, generate_uuid()),
              'meta': {'events': 10}} for _ in range(file_count)]
    replica_client.add_replicas(rse=nondet_rse, files=files)

    pfns, entries = collect_srm_replicas(files, nondet_rse)
    outcome = replica_client.declare_bad_file_replicas(pfns, reason)
    assert outcome == {}
    assert len(pfns) == count_declared_bad(entries, nondet_rse_id)

    # A PFN that was never registered must come back as unknown
    unknown_pfns = ['srm://%s.cern.ch/test_%s/%s/%s' % (nondet_rse_id, nondet_rse_id, scope, generate_uuid())]
    outcome = replica_client.declare_bad_file_replicas(unknown_pfns, reason)
    assert outcome == {nondet_rse: ['%s Unknown replica' % pfn for pfn in unknown_pfns]}

    # --- 3. Deterministic RSE, declared by scope/name/rse dicts ----------------
    dict_rse, dict_rse_id = rse_factory.make_srm_rse(deterministic=True)
    files = [{'scope': scope,
              'name': 'file_%s' % generate_uuid(),
              'bytes': 1,
              'adler32': '0cc737eb',
              'meta': {'events': 10}} for _ in range(file_count)]
    replica_client.add_replicas(rse=dict_rse, files=files)
    replica_dicts = [{'scope': item['scope'], 'name': item['name'], 'rse': dict_rse} for item in files]

    # The PFN listing is only used to know how many bad replicas to expect
    pfns, _ = collect_srm_replicas(files, dict_rse)
    outcome = replica_client.declare_bad_file_replicas(replica_dicts, reason)
    assert outcome == {}
    assert len(pfns) == count_declared_bad(replica_dicts, dict_rse_id)

    # --- 4. InvalidType is raised if the list mixes replica dicts and PFNs -----
    replica_dicts.extend(['srm://%s.cern.ch/test_%s/%s/%s' % (nondet_rse_id, nondet_rse_id, scope, generate_uuid())])
    with pytest.raises(InvalidType):
        replica_client.declare_bad_file_replicas(replica_dicts, reason)
Ejemplo n.º 10
0
        tmp_scope = 'mock'
        nbfiles = 5
        # Adding replicas to deterministic RSE
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb', 'meta': {'events': 10}} for i in xrange(nbfiles)]
        rse_info = rsemgr.get_rse_info('MOCK')
        rse_id1 = rse_info['id']
        add_replicas(rse='MOCK', files=files, account='root', ignore_availability=True)

        # Listing replicas on deterministic RSE
        replicas = []
        list_rep = []
        for replica in list_replicas(dids=[{'scope': f['scope'], 'name': f['name'], 'type': DIDType.FILE} for f in files], schemes=['srm']):
            replicas.extend(replica['rses']['MOCK'])
            list_rep.append(replica)
        declare_bad_file_replicas(replicas, 'MOCK')
        bad_replicas = list_bad_replicas()
        nbbadrep = 0
        for rep in list_rep:
            for badrep in bad_replicas:
                if badrep['rse_id'] == rse_id1:
                    if badrep['scope'] == rep['scope'] and badrep['name'] == rep['name']:
                        nbbadrep += 1
        assert_equal(len(replicas), nbbadrep)

        # Adding replicas to non-deterministic RSE
        files = [{'scope': tmp_scope, 'name': 'file_%s' % generate_uuid(), 'bytes': 1L, 'adler32': '0cc737eb',
                  'pfn': 'srm://mock2.com:8443/srm/managerv2?SFN=/rucio/tmpdisk/rucio_tests/%s/%s' % (tmp_scope, generate_uuid()), 'meta': {'events': 10}} for i in xrange(nbfiles)]
        rse_info = rsemgr.get_rse_info('MOCK2')
        rse_id2 = rse_info['id']
        add_replicas(rse='MOCK2', files=files, account='root', ignore_availability=True)
Ejemplo n.º 11
0
def _optional_int_option(option):
    """
    Read an integer option from the ``necromancer`` configuration section.

    :param option: Name of the option to read.
    :returns: The option converted with ``int``, or None when the option or
              section cannot be read or the value is not a valid integer.
    """
    try:
        return int(config_get('necromancer', option))
    except (NoOptionError, NoSectionError, RuntimeError, ValueError):
        return None


def _evaluator_backlog_alert(max_count, max_duration, expiration_time):
    """
    Tell whether the Judge evaluator backlog exceeds the configured limits.

    The backlog is only queried when at least one limit is set (truthy).

    :param max_count: Maximum tolerated backlog count, or None for no limit.
    :param max_duration: Maximum tolerated backlog age in minutes, or None.
    :param expiration_time: Forwarded to ``get_evaluation_backlog``.
    :returns: The error message to log when a limit is hit, otherwise None.
    """
    if not (max_count or max_duration):
        return None

    backlog = get_evaluation_backlog(expiration_time=expiration_time)
    # NOTE(review): the second element is compared against "now - max_duration",
    # so it is presumably the timestamp of the oldest pending entry - confirm.
    count, oldest = backlog[0], backlog[1]
    count_hit = bool(max_count and count and count > max_count)
    duration_hit = bool(max_duration and oldest
                        and oldest < datetime.utcnow() - timedelta(minutes=max_duration))

    if count_hit and duration_hit:
        return 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation'
    if count_hit:
        return 'Necromancer: Judge evaluator backlog count hit, stopping operation'
    if duration_hit:
        return 'Necromancer: Judge evaluator backlog duration hit, stopping operation'
    return None


def _group_rses_by_backlog(backlog, bulk, descending):
    """
    Split the RSEs of a backlog into groups of at least ``bulk`` bad replicas.

    RSEs are visited in order of their backlog size and accumulated into a
    group until the group's summed backlog reaches ``bulk``.

    :param backlog: Mapping of RSE id to number of queued bad replicas.
    :param bulk: Summed backlog at which a group is closed.
    :param descending: Visit the RSEs with the largest backlog first.
    :returns: List of groups, each a list of ``{'id': rse_id}`` dictionaries.
    """
    groups = []
    current = []
    pending = 0
    for rse_id in sorted(backlog, key=backlog.get, reverse=descending):
        current.append({'id': rse_id})
        pending += backlog[rse_id]
        if pending >= bulk:
            groups.append(current)
            current = []
            pending = 0
    # Keep the trailing RSEs whose summed backlog stays below ``bulk``;
    # otherwise they would never be handed to list_bad_replicas.
    if current:
        groups.append(current)
    return groups


def _is_lock_error(error):
    """
    Tell whether a database error carries one of the known lock markers.

    The markers are ``ORA-00054``, ``ORA-00060`` and ``ERROR 1205 (HY000)``
    (presumably resource busy, deadlock and lock wait timeout - confirm).

    :param error: The exception raised by the database layer.
    :returns: True if one of the markers is found in the error details.
    """
    # Exceptions without arguments or with a non-string first argument must
    # not break the classification, hence the str() fallbacks.
    details = str(error.args[0]) if error.args else str(error)
    return any(marker in details for marker in ('ORA-00054', 'ORA-00060', 'ERROR 1205 (HY000)'))


def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    Each cycle is skipped (after a 30 seconds wait) while the Judge evaluator
    backlog exceeds the configured limits. When the bad replicas backlog is
    larger than ``max_bad_replicas_backlog_count``, the work is shared between
    groups of RSEs instead of being requested in a single call.

    :param thread: Thread number at startup (not used in the body; the thread
                   assignment comes from the heartbeat).
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    # Fallback so that the shutdown messages below can still be logged when
    # the stop event is already set and the loop body never runs.
    logger = logging.log

    while not GRACEFUL_STOP.is_set():

        stime = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')
        logger(logging.DEBUG, 'Starting new cycle')

        # Check if there is a Judge Evaluator backlog
        max_evaluator_backlog_count = _optional_int_option('max_evaluator_backlog_count')
        max_evaluator_backlog_duration = _optional_int_option('max_evaluator_backlog_duration')
        alert = _evaluator_backlog_alert(max_evaluator_backlog_count, max_evaluator_backlog_duration, sleep_time)
        if alert:
            logger(logging.ERROR, alert)
            GRACEFUL_STOP.wait(30)
            continue

        # Check how many bad replicas are queued
        max_bad_replicas_backlog_count = _optional_int_option('max_bad_replicas_backlog_count')
        bad_replicas_backlog = REGION.get('bad_replicas_backlog')
        if bad_replicas_backlog is NO_VALUE:
            bad_replicas_backlog = get_bad_replicas_backlog()
            REGION.set('bad_replicas_backlog', bad_replicas_backlog)
        tot_bad_files = sum(bad_replicas_backlog.values())

        # If too many replica, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(bad_replicas_backlog) > 1:
            logger(logging.INFO, 'Backlog of bads replica too big. Apply some sharing between different RSEs')
            list_of_rses = _group_rses_by_backlog(bad_replicas_backlog, bulk, descending=True)
        else:
            # A single call without RSE restriction
            list_of_rses = [None]

        tot_processed = 0
        if tot_bad_files == 0:
            logger(logging.INFO, 'No bad replicas to process.')
        else:
            ttime = time.time()
            try:
                for rses in list_of_rses:

                    replicas = list_bad_replicas(limit=bulk, thread=heart_beat['assign_thread'], total_threads=heart_beat['nr_threads'], rses=rses)
                    for replica in replicas:
                        scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                        logger(logging.INFO, 'Working on %s:%s on %s' % (scope, name, rse))

                        replica_states = get_replicas_state(scope=scope, name=name)
                        if ReplicaState.AVAILABLE not in replica_states and ReplicaState.TEMPORARY_UNAVAILABLE not in replica_states:
                            logger(logging.INFO, 'File %s:%s has no other available or temporary available replicas, it will be marked as lost' % (scope, name))
                            update_rules = update_rules_for_lost_replica
                            counter = 'necromancer.badfiles.lostfile'
                        else:
                            rep = replica_states.get(ReplicaState.AVAILABLE, [])
                            unavailable_rep = replica_states.get(ReplicaState.TEMPORARY_UNAVAILABLE, [])
                            logger(logging.INFO, 'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s' % (scope, name, str(rep), str(unavailable_rep)))
                            update_rules = update_rules_for_bad_replica
                            counter = 'necromancer.badfiles.recovering'

                        try:
                            update_rules(scope=scope, name=name, rse_id=rse_id, nowait=True)
                            monitor.record_counter(name=counter)
                        except (DatabaseException, DatabaseError) as error:
                            # A locked row is expected with nowait=True: skip
                            # the replica and carry on with the next one.
                            if _is_lock_error(error):
                                logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                            else:
                                logger(logging.ERROR, str(error))

                    tot_processed += len(replicas)
                    # The elapsed time is measured from the start of the first group
                    logger(logging.INFO, 'It took %s seconds to process %s replicas' % (str(time.time() - ttime), str(len(replicas))))
            except Exception:
                # Top-level boundary of the cycle: log and keep the daemon alive
                logger(logging.CRITICAL, ''.join(format_exception(*exc_info())).strip())

        if once:
            break
        elif tot_processed == 0 or tot_bad_files == 0:
            # Only sleep when the cycle had nothing to do; otherwise start
            # the next cycle immediately.
            daemon_sleep(start_time=stime, sleep_time=sleep_time, graceful_stop=GRACEFUL_STOP, logger=logger)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
Ejemplo n.º 12
0
def _optional_int_option(option):
    """
    Read an integer option from the ``necromancer`` configuration section.

    :param option: Name of the option to read.
    :returns: The option converted with ``int``, or None when the option or
              section cannot be read or the value is not a valid integer.
    """
    try:
        return int(config_get('necromancer', option))
    except (NoOptionError, NoSectionError, RuntimeError, ValueError):
        return None


def _evaluator_backlog_alert(max_count, max_duration, expiration_time):
    """
    Tell whether the Judge evaluator backlog exceeds the configured limits.

    The backlog is only queried when at least one limit is set (truthy).

    :param max_count: Maximum tolerated backlog count, or None for no limit.
    :param max_duration: Maximum tolerated backlog age in minutes, or None.
    :param expiration_time: Forwarded to ``get_evaluation_backlog``.
    :returns: The error message to log when a limit is hit, otherwise None.
    """
    if not (max_count or max_duration):
        return None

    backlog = get_evaluation_backlog(expiration_time=expiration_time)
    # NOTE(review): the second element is compared against "now - max_duration",
    # so it is presumably the timestamp of the oldest pending entry - confirm.
    count, oldest = backlog[0], backlog[1]
    count_hit = bool(max_count and count and count > max_count)
    duration_hit = bool(max_duration and oldest
                        and oldest < datetime.utcnow() - timedelta(minutes=max_duration))

    if count_hit and duration_hit:
        return 'Necromancer: Judge evaluator backlog count and duration hit, stopping operation'
    if count_hit:
        return 'Necromancer: Judge evaluator backlog count hit, stopping operation'
    if duration_hit:
        return 'Necromancer: Judge evaluator backlog duration hit, stopping operation'
    return None


def _group_rses_by_backlog(backlog, bulk, descending):
    """
    Split the RSEs of a backlog into groups of at least ``bulk`` bad replicas.

    RSEs are visited in order of their backlog size and accumulated into a
    group until the group's summed backlog reaches ``bulk``.

    :param backlog: Mapping of RSE id to number of queued bad replicas.
    :param bulk: Summed backlog at which a group is closed.
    :param descending: Visit the RSEs with the largest backlog first.
    :returns: List of groups, each a list of ``{'id': rse_id}`` dictionaries.
    """
    groups = []
    current = []
    pending = 0
    for rse_id in sorted(backlog, key=backlog.get, reverse=descending):
        current.append({'id': rse_id})
        pending += backlog[rse_id]
        if pending >= bulk:
            groups.append(current)
            current = []
            pending = 0
    # Keep the trailing RSEs whose summed backlog stays below ``bulk``;
    # otherwise they would never be handed to list_bad_replicas.
    if current:
        groups.append(current)
    return groups


def necromancer(thread=0, bulk=5, once=False, sleep_time=60):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    Each cycle is skipped (after a 30 seconds wait) while the Judge evaluator
    backlog exceeds the configured limits. Unless ``once`` is set, the bad
    replicas history is also updated when the last update is older than one
    hour.

    :param thread: Thread number at startup (not used in the body; the thread
                   assignment comes from the heartbeat).
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Thread sleep time after each chunk of work.
    """

    update_history_threshold = 3600
    update_history_time = time.time()

    executable = 'necromancer'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    # Fallback so that the shutdown messages below can still be logged when
    # the stop event is already set and the loop body never runs.
    logger = logging.log

    while not GRACEFUL_STOP.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                        heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')

        # Check if there is a Judge Evaluator backlog
        max_evaluator_backlog_count = _optional_int_option(
            'max_evaluator_backlog_count')
        max_evaluator_backlog_duration = _optional_int_option(
            'max_evaluator_backlog_duration')
        alert = _evaluator_backlog_alert(max_evaluator_backlog_count,
                                         max_evaluator_backlog_duration,
                                         sleep_time)
        if alert:
            logger(logging.ERROR, alert)
            GRACEFUL_STOP.wait(30)
            continue

        # Check how many bad replicas are queued
        max_bad_replicas_backlog_count = _optional_int_option(
            'max_bad_replicas_backlog_count')
        bad_replicas_backlog = get_bad_replicas_backlog()
        tot_bad_files = sum(bad_replicas_backlog.values())

        # If too many replica, call list_bad_replicas with a list of RSEs
        if max_bad_replicas_backlog_count and tot_bad_files > max_bad_replicas_backlog_count and len(
                bad_replicas_backlog) > 1:
            logger(
                logging.INFO,
                'Backlog of bads replica too big. Apply some sharing between different RSEs'
            )
            list_of_rses = _group_rses_by_backlog(bad_replicas_backlog,
                                                  bulk,
                                                  descending=False)
        else:
            # A single call without RSE restriction
            list_of_rses = [None]

        stime = time.time()
        replicas = []
        try:
            for rses in list_of_rses:
                replicas = list_bad_replicas(
                    limit=bulk,
                    thread=heart_beat['assign_thread'],
                    total_threads=heart_beat['nr_threads'],
                    rses=rses)

                for replica in replicas:
                    scope, name, rse_id, rse = replica['scope'], replica[
                        'name'], replica['rse_id'], replica['rse']
                    logger(logging.INFO,
                           'Working on %s:%s on %s' % (scope, name, rse))

                    replica_states = get_replicas_state(scope=scope, name=name)
                    if ReplicaState.AVAILABLE not in replica_states and ReplicaState.TEMPORARY_UNAVAILABLE not in replica_states:
                        logger(
                            logging.INFO,
                            'File %s:%s has no other available or temporary available replicas, it will be marked as lost'
                            % (scope, name))
                        update_rules = update_rules_for_lost_replica
                        counter = 'necromancer.badfiles.lostfile'
                    else:
                        rep = replica_states.get(ReplicaState.AVAILABLE, [])
                        unavailable_rep = replica_states.get(
                            ReplicaState.TEMPORARY_UNAVAILABLE, [])
                        logger(
                            logging.INFO,
                            'File %s:%s can be recovered. Available sources : %s + Unavailable sources : %s'
                            % (scope, name, str(rep), str(unavailable_rep)))
                        update_rules = update_rules_for_bad_replica
                        counter = 'necromancer.badfiles.recovering'

                    try:
                        update_rules(scope=scope,
                                     name=name,
                                     rse_id=rse_id,
                                     nowait=True)
                        monitor.record_counter(name=counter)
                    except DatabaseException as error:
                        # Skip this replica and carry on with the next one
                        logger(logging.WARNING, str(error))

                # The elapsed time is measured from the start of the first group
                logger(
                    logging.INFO, 'It took %s seconds to process %s replicas' %
                    (str(time.time() - stime), str(len(replicas))))
        except Exception:
            # Top-level boundary of the cycle: log and keep the daemon alive
            logger(logging.CRITICAL,
                   ''.join(format_exception(*exc_info())).strip())

        if once:
            break

        now = time.time()
        if (now - update_history_time) > update_history_threshold:
            logger(
                logging.INFO,
                'Last update of history table %s seconds ago. Running update.'
                % (now - update_history_time))
            bad_replicas = list_bad_replicas_history(
                limit=1000000,
                thread=heart_beat['assign_thread'],
                total_threads=heart_beat['nr_threads'])
            chunk_size = 1000
            for rse_id in bad_replicas:
                history = bad_replicas[rse_id]
                # Integer ceiling division: exact whatever the semantics of
                # "/" are, and without a round trip through float.
                nchunk = (len(history) + chunk_size - 1) // chunk_size
                logger(logging.DEBUG,
                       'Update history for rse_id %s' % (rse_id))
                # Chunks are numbered from 1 so that the last one is logged
                # as "nchunk/nchunk".
                for cnt, chunk in enumerate(chunks(history, chunk_size),
                                            start=1):
                    logger(
                        logging.DEBUG,
                        ' History for rse_id %s : chunk %i/%i' %
                        (rse_id, cnt, nchunk))
                    update_bad_replicas_history(chunk, rse_id)
            logger(
                logging.INFO, 'History table updated in %s seconds' %
                (time.time() - now))
            update_history_time = time.time()

        # Only the size of the last fetched batch decides whether to sleep
        if len(replicas) == bulk:
            logger(
                logging.INFO,
                'Processed maximum number of replicas according to the bulk size. Restart immediately next cycle'
            )
        else:
            daemon_sleep(start_time=stime,
                         sleep_time=sleep_time,
                         graceful_stop=GRACEFUL_STOP)

    logger(logging.INFO, 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')