Ejemplo n.º 1
0
    def test_list_expired_dids_with_locked_rules(self):
        """ UNDERTAKER (CORE): Test that the undertaker does not list expired dids with locked rules"""
        tmp_scope = InternalScope('mock')
        jdoe = InternalAccount('jdoe')
        root = InternalAccount('root')

        # Add quota (NOTE(review): -1 presumably means "unlimited" -- confirm)
        set_local_account_limit(jdoe, get_rse_id('MOCK'), -1)

        # Dataset with a negative lifetime and a single *locked* rule.
        # NOTE(review): the negative lifetime presumably makes the DID
        # expired as soon as it is created -- confirm.
        dsn = {
            'name': 'dsn_%s' % generate_uuid(),
            'scope': tmp_scope,
            'type': 'DATASET',
            'lifetime': -1,
            'rules': [{
                'account': jdoe,
                'copies': 1,
                'rse_expression': 'MOCK',
                'locked': True,
                'grouping': 'DATASET'
            }]
        }

        add_dids(dids=[dsn], account=root)

        # A DID is identified by the (scope, name) pair, so compare the pair.
        # Checking "scope differs AND name differs" would wrongly fail on any
        # unrelated expired DID that merely lives in the same scope.
        for did in list_expired_dids(limit=1000):
            assert (did['scope'], did['name']) != (dsn['scope'], dsn['name'])
Ejemplo n.º 2
0
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.

    The share of expired dids handled by this worker is taken from the
    heartbeat returned by ``live`` on every iteration; ``worker_number`` and
    ``total_workers`` are only used in log messages.

    :param worker_number: Identifier of this worker (log messages only).
    :param total_workers: Total number of workers (log messages only).
    :param chunk_size: Number of dids handed to each ``delete_dids`` call.
    :param once: If True, run a single iteration and return.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    executable = 'undertaker'
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)

    # Dids whose deletion hit a database lock, mapped to the time after which
    # they may be retried.
    paused_dids = {}  # {(scope, name): datetime}

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, older_than=6000)
            logging.info('Undertaker(%s/%s): Live gives %s', worker_number, total_workers, heartbeat)

            # Refresh paused dids: drop every entry whose pause has elapsed.
            # Iterate over a snapshot of the keys so the dict can be mutated.
            now = datetime.utcnow()
            for key in list(paused_dids):
                if now > paused_dids[key]:
                    del paused_dids[key]

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'], total_workers=heartbeat['nr_threads'], limit=10000)

            dids = [did for did in dids if (did['scope'], did['name']) not in paused_dids]

            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.', worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete', worker_number, len(chunk))
                    delete_dids(dids=chunk, account=InternalAccount('root'), expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids', worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except (DatabaseException, DatabaseError, UnsupportedOperation) as e:
                    # An exception raised without arguments has an empty
                    # ``args``; fall back to str(e) instead of raising
                    # IndexError and abandoning the remaining chunks.
                    message = str(e.args[0]) if e.args else str(e)
                    # Error codes treated as "row/table is locked": pause the
                    # whole chunk for a random 10-40 minutes before retrying.
                    if match('.*ORA-00054.*', message) or match('.*55P03.*', message) or match('.*3572.*', message):
                        for did in chunk:
                            paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + timedelta(seconds=randint(600, 2400))
                        record_counter('undertaker.delete_dids.exceptions.LocksDetected')
                        logging.warning('undertaker[%s/%s]: Locks detected for chunk', heartbeat['assign_thread'], heartbeat['nr_threads'])
                    else:
                        logging.error('Undertaker(%s): Got database error %s.', worker_number, str(e))
        except Exception:
            # Top-level boundary of the daemon loop: log and keep running.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
Ejemplo n.º 3
0
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.

    :param worker_number: Identifier of this worker; passed to
                          ``list_expired_dids`` and used in log messages.
    :param total_workers: Total number of workers; passed to
                          ``list_expired_dids``.
    :param chunk_size: Number of dids handed to each ``delete_dids`` call.
    :param once: If True, run a single iteration and return.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    while not graceful_stop.is_set():
        try:
            dids = list_expired_dids(worker_number=worker_number, total_workers=total_workers, limit=10000)
            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.', worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete', worker_number, len(chunk))
                    delete_dids(dids=chunk, account='root')
                    logging.info('Undertaker(%s): Delete %s dids', worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                # "except X as e" is valid on Python 2.6+ and Python 3; the
                # old "except X, e" form is a SyntaxError on Python 3.
                except DatabaseException as e:
                    # A failing chunk is logged and skipped; the remaining
                    # chunks are still processed.
                    logging.error('Undertaker(%s): Got database error %s.', worker_number, str(e))
        except Exception:
            # Top-level boundary of the daemon loop: log and keep running.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            logging.error(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
Ejemplo n.º 4
0
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.

    The share of expired dids handled by this worker is taken from the
    heartbeat returned by ``live`` on every iteration; ``worker_number`` and
    ``total_workers`` are only used in log messages.

    :param worker_number: Identifier of this worker (log messages only).
    :param total_workers: Total number of workers (log messages only).
    :param chunk_size: Number of dids handed to each ``delete_dids`` call.
    :param once: If True, run a single iteration and return.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    # Single source for the name registered with the heartbeat calls below.
    executable = 'rucio-undertaker'
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             older_than=6000)
            logging.info('Undertaker(%s/%s): Live gives %s',
                         worker_number, total_workers, heartbeat)

            # NOTE(review): the "+ 1" presumably converts a 0-based
            # assign_thread into the 1-based worker number expected by
            # list_expired_dids -- confirm against its implementation.
            dids = list_expired_dids(worker_number=heartbeat['assign_thread'] + 1,
                                     total_workers=heartbeat['nr_threads'],
                                     limit=10000)
            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.',
                             worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete',
                                 worker_number, len(chunk))
                    delete_dids(dids=chunk, account='root', expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids',
                                 worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids',
                                   delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except DatabaseException as error:
                    # A failing chunk is logged and skipped; the remaining
                    # chunks are still processed.
                    logging.error('Undertaker(%s): Got database error %s.',
                                  worker_number, str(error))
        except Exception:
            # Top-level boundary of the daemon loop: log and keep running.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
Ejemplo n.º 5
0
def run_once(paused_dids: Dict[Tuple, datetime], chunk_size: int,
             heartbeat_handler: HeartbeatHandler, **_kwargs):
    """
    Run one undertaker iteration: list expired dids and delete them in chunks.

    :param paused_dids: Mapping ``(scope, name) -> datetime`` of dids whose
                        deletion is paused until the given time. Mutated in
                        place: elapsed entries are removed and dids that hit
                        a database lock are added.
    :param chunk_size: Number of dids handed to each ``delete_dids`` call.
    :param heartbeat_handler: Provides ``live()``, which returns the worker
                              number, the total number of workers and a
                              logger callable.
    :param _kwargs: Ignored.
    """
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        # Refresh paused dids: drop every entry whose pause has elapsed.
        # Iterate over a snapshot of the keys so the caller's dict can be
        # mutated in place.
        now = datetime.utcnow()
        for key in list(paused_dids):
            if now > paused_dids[key]:
                del paused_dids[key]

        dids = list_expired_dids(worker_number=worker_number,
                                 total_workers=total_workers,
                                 limit=10000)

        dids = [
            did for did in dids
            if (did['scope'], did['name']) not in paused_dids
        ]

        if not dids:
            logger(logging.INFO, 'did not get any work')
            return

        for chunk in chunks(dids, chunk_size):
            # live() is called again for every chunk and its logger is the
            # one used below (presumably this also keeps the heartbeat
            # fresh during long runs -- confirm).
            _, _, logger = heartbeat_handler.live()
            try:
                logger(logging.INFO, 'Receive %s dids to delete', len(chunk))
                delete_dids(dids=chunk,
                            account=InternalAccount('root', vo='def'),
                            expire_rules=True)
                logger(logging.INFO, 'Delete %s dids', len(chunk))
                record_counter(name='undertaker.delete_dids', delta=len(chunk))
            except RuleNotFound as error:
                logger(logging.ERROR, error)
            except (DatabaseException, DatabaseError,
                    UnsupportedOperation) as e:
                # An exception raised without arguments has an empty
                # ``args``; fall back to str(e) instead of raising IndexError
                # and abandoning the remaining chunks.
                message = str(e.args[0]) if e.args else str(e)
                # Error codes treated as "row/table is locked": pause the
                # whole chunk for a random 10-40 minutes before retrying.
                if (match('.*ORA-00054.*', message)
                        or match('.*55P03.*', message)
                        or match('.*3572.*', message)):
                    retry_after = timedelta(seconds=randint(600, 2400))
                    for did in chunk:
                        paused_dids[(did['scope'], did['name'])] = datetime.utcnow() + retry_after
                    record_counter(
                        'undertaker.delete_dids.exceptions.{exception}',
                        labels={'exception': 'LocksDetected'})
                    logger(logging.WARNING, 'Locks detected for chunk')
                else:
                    logger(logging.ERROR, 'Got database error %s.', str(e))
    except Exception:
        # Boundary of a single iteration: log and return normally.
        # KeyboardInterrupt/SystemExit are deliberately not swallowed.
        logging.critical(traceback.format_exc())