Example #1
def run(once=False, process=0, total_processes=1, threads_per_process=1):
    """
    Starts up the Judge-Clean threads.
    """

    try:
        ntpc = ntplib.NTPClient()
        response = ntpc.request('137.138.16.69', version=3)  # 137.138.16.69 CERN IP-TIME-1 NTP Server (Stratum 2)
        if response.offset > 60*60+10:  # 1 hour 10 seconds
            logging.critical('Offset between NTP server and system time too big. Stopping Cleaner')
            return
    except Exception:
        # NTP query failed; do not start the cleaner without a verified system clock.
        return

    for i in range(process * threads_per_process, max(0, process * threads_per_process + threads_per_process - 1)):
        record_gauge('rule.judge.cleaner.threads.%d' % i, 0)

    if once:
        logging.info('main: executing one iteration only')
        rule_cleaner(once)
    else:
        logging.info('main: starting threads')
        threads = [threading.Thread(target=rule_cleaner, kwargs={'process': process, 'total_processes': total_processes, 'once': once, 'thread': i, 'threads_per_process': threads_per_process}) for i in range(0, threads_per_process)]
        [t.start() for t in threads]
        logging.info('main: waiting for interrupts')
        # Interruptible joins require a timeout.
        while threads[0].is_alive():
            [t.join(timeout=3.14) for t in threads]
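
A note on the bookkeeping above: each daemon process owns a contiguous block of global thread indices, and a `rule.judge.cleaner.threads.%d` gauge is reset for every index it owns before the threads start. A minimal sketch of that arithmetic, assuming nothing beyond the parameters shown above (the helper names are illustrative, not Rucio API):

# Minimal sketch (not Rucio code) of the worker-number arithmetic used above.

def worker_numbers(process, threads_per_process):
    """Global thread indices owned by one daemon process."""
    first = process * threads_per_process
    return list(range(first, first + threads_per_process))


def total_workers(total_processes, threads_per_process):
    """Highest worker number, e.g. as passed to get_expired_rules(total_workers=...) in Example #8."""
    return total_processes * threads_per_process - 1


# Example: 2 processes with 3 threads each own indices 0-2 and 3-5.
assert worker_numbers(0, 3) == [0, 1, 2]
assert worker_numbers(1, 3) == [3, 4, 5]
assert total_workers(2, 3) == 5
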
Example #2
def run(once=False, process=0, total_processes=1, threads_per_process=1):
    """
    Starts up the Judge-Repairer threads.
    """
    for i in range(process * threads_per_process, max(0, process * threads_per_process + threads_per_process - 1)):
        record_gauge('rule.judge.repairer.threads.%d' % i, 0)

    if once:
        logging.info('main: executing one iteration only')
        rule_repairer(once)
    else:
        logging.info('main: starting threads')
        threads = [threading.Thread(target=rule_repairer, kwargs={'process': process, 'total_processes': total_processes, 'once': once, 'thread': i, 'threads_per_process': threads_per_process}) for i in range(0, threads_per_process)]
        [t.start() for t in threads]
        logging.info('main: waiting for interrupts')
        # Interruptible joins require a timeout.
        while threads[0].is_alive():
            [t.join(timeout=3.14) for t in threads]
Example #3
def __release_all_activities(stats, direction, rse_name, rse_id, logger, session):
    """
    Release requests if activities should be ignored.

    :param stats:          Request statistics
    :param direction:      Whether request statistics are based on source or destination RSEs.
    :param rse_name:       RSE name.
    :param rse_id:         RSE id.
    :param logger:         Logging function to use.
    :param session:        Database session in use.
    """
    threshold = stats['threshold']
    transfer = stats['transfer']
    waiting = stats['waiting']
    strategy = stats['strategy']
    if threshold is not None and transfer + waiting > threshold:
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', threshold, labels={'activity': 'all_activities', 'rse': rse_name, 'limit_attr': 'max_transfers'})
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', transfer, labels={'activity': 'all_activities', 'rse': rse_name, 'limit_attr': 'transfers'})
        record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', waiting, labels={'activity': 'all_activities', 'rse': rse_name, 'limit_attr': 'waiting'})
        if transfer < 0.8 * threshold:
            to_be_released = threshold - transfer
            if strategy == 'grouped_fifo':
                deadline = stats.get('deadline')
                volume = stats.get('volume')
                release_waiting_requests_grouped_fifo(rse_id, count=to_be_released, direction=direction, volume=volume, deadline=deadline, session=session)
            elif strategy == 'fifo':
                release_waiting_requests_fifo(rse_id, count=to_be_released, direction=direction, session=session)
        else:
            logger(logging.DEBUG, "Throttler has done nothing on rse %s (transfer > 0.8 * threshold)" % rse_name)
    elif waiting > 0 or not threshold:
        logger(logging.DEBUG, "Throttler remove limits(threshold: %s) and release all waiting requests, rse %s" % (threshold, rse_name))
        delete_rse_transfer_limits(rse_id, activity='all_activities', session=session)
        release_all_waiting_requests(rse_id, direction=direction, session=session)
        record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.{activity}.{rse}', labels={'activity': 'all_activities', 'rse': rse_name})
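
The branch guarded by `transfer < 0.8 * threshold` reduces to simple arithmetic: limits are only relaxed while active transfers sit below 80% of the threshold, and at most `threshold - transfer` waiting requests are then released. A minimal sketch of that decision, with a hypothetical helper name and no database calls (the under-threshold case, where the daemon deletes the limit and releases everything, is not modelled here):

# Illustrative sketch (no database calls) of the over-threshold release decision above.

def requests_to_release(threshold, transfer, waiting):
    """How many waiting requests may be released once the limit is exceeded."""
    if threshold is None or transfer + waiting <= threshold:
        return 0                     # limit not exceeded; the daemon handles this case separately
    if transfer >= 0.8 * threshold:
        return 0                     # too many active transfers; hold all waiting requests back
    return threshold - transfer      # free slots up to the configured threshold


assert requests_to_release(threshold=100, transfer=50, waiting=80) == 50
assert requests_to_release(threshold=100, transfer=90, waiting=40) == 0
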
Example #4
def test_record_gauge_message(self):
    """MONITOR (CORE): Send a gauge message to graphite """
    monitor.record_gauge('test.gauge', 10)
Example #5
def __release_per_activity(stats, direction, rse_name, rse_id, logger, session):
    """
    Release requests per activity.

    :param stats:          Request statistics
    :param direction:      Whether request statistics are based on source or destination RSEs.
    :param rse_name:       RSE name.
    :param rse_id:         RSE id.
    :param logger:         Logging function to use.
    :param session:        Database session in use.
    """
    for activity in stats['activities']:
        threshold = stats['activities'][activity]['threshold']
        transfer = stats['activities'][activity]['transfer']
        waiting = stats['activities'][activity]['waiting']
        if waiting:
            logger(logging.DEBUG, "Request status for %s at %s: %s" % (activity, rse_name,
                                                                       stats['activities'][activity]))
            if threshold is None:
                logger(logging.DEBUG, "Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse_id %s" % (threshold, activity, rse_id))
                delete_rse_transfer_limits(rse_id, activity=activity, session=session)
                release_all_waiting_requests(rse_id, activity=activity, direction=direction, session=session)
                record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.{activity}.{rse}', labels={'activity': activity, 'rse': rse_name})
            elif transfer + waiting > threshold:
                logger(logging.DEBUG, "Throttler set limits for activity %s, rse %s" % (activity, rse_name))
                set_rse_transfer_limits(rse_id, activity=activity, max_transfers=threshold, transfers=transfer, waitings=waiting, session=session)
                record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', threshold, labels={'activity': activity, 'rse': rse_name, 'limit_attr': 'max_transfers'})
                record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', transfer, labels={'activity': activity, 'rse': rse_name, 'limit_attr': 'transfers'})
                record_gauge('daemons.conveyor.throttler.set_rse_transfer_limits.{activity}.{rse}.{limit_attr}', waiting, labels={'activity': activity, 'rse': rse_name, 'limit_attr': 'waiting'})
                if transfer < 0.8 * threshold:
                    # release requests on account
                    nr_accounts = len(stats['activities'][activity]['accounts'])
                    if nr_accounts < 1:
                        nr_accounts = 1
                    to_release = threshold - transfer
                    threshold_per_account = math.ceil(threshold / nr_accounts)
                    to_release_per_account = math.ceil(to_release / nr_accounts)
                    accounts = stats['activities'][activity]['accounts']
                    for account in accounts:
                        if nr_accounts == 1:
                            logger(logging.DEBUG, "Throttler release %s waiting requests for activity %s, rse %s, account %s " % (to_release, activity, rse_name, account))
                            release_waiting_requests_fifo(rse_id, activity=activity, account=account, count=to_release, direction=direction, session=session)
                            record_gauge('daemons.conveyor.throttler.release_waiting_requests.{activity}.{rse}.{account}', to_release, labels={'activity': activity, 'rse': rse_name, 'account': account})
                        elif accounts[account]['transfer'] > threshold_per_account:
                            logger(logging.DEBUG, "Throttler will not release  %s waiting requests for activity %s, rse %s, account %s: It queued more transfers than its share " %
                                   (accounts[account]['waiting'], activity, rse_name, account))
                            nr_accounts -= 1
                            to_release_per_account = math.ceil(to_release / nr_accounts)
                        elif accounts[account]['waiting'] < to_release_per_account:
                            logger(logging.DEBUG, "Throttler release %s waiting requests for activity %s, rse %s, account %s " % (accounts[account]['waiting'], activity, rse_name, account))
                            release_waiting_requests_fifo(rse_id, activity=activity, account=account, count=accounts[account]['waiting'], direction=direction, session=session)
                            record_gauge('daemons.conveyor.throttler.release_waiting_requests.{activity}.{rse}.{account}', accounts[account]['waiting'], labels={'activity': activity, 'rse': rse_name, 'account': account})
                            to_release = to_release - accounts[account]['waiting']
                            nr_accounts -= 1
                            to_release_per_account = math.ceil(to_release / nr_accounts)
                        else:
                            logger(logging.DEBUG, "Throttler release %s waiting requests for activity %s, rse %s, account %s " % (to_release_per_account, activity, rse_name, account))
                            release_waiting_requests_fifo(rse_id, activity=activity, account=account, count=to_release_per_account, direction=direction, session=session)
                            record_gauge('daemons.conveyor.throttler.release_waiting_requests.{activity}.{rse}.{account}', to_release_per_account, labels={'activity': activity, 'rse': rse_name, 'account': account})
                            to_release = to_release - to_release_per_account
                            nr_accounts -= 1
                else:
                    logger(logging.DEBUG, "Throttler has done nothing for activity %s on rse %s (transfer > 0.8 * threshold)" % (activity, rse_name))
            elif waiting > 0:
                logger(logging.DEBUG, "Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse %s" % (threshold, activity, rse_name))
                delete_rse_transfer_limits(rse_id, activity=activity, session=session)
                release_all_waiting_requests(rse_id, activity=activity, direction=direction, session=session)
                record_counter('daemons.conveyor.throttler.delete_rse_transfer_limits.{activity}.{rse}', labels={'activity': activity, 'rse': rse_name})
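
The per-account loop above redistributes the releasable slots as a running fair share: accounts that already queued more transfers than their share get nothing, accounts with fewer waiting requests than their share are drained completely, and the remainder is split over the accounts still in the running. A self-contained sketch of just that split, assuming a hypothetical `split_release` helper and plain dictionaries instead of the stats structure:

import math

# Illustrative sketch of the per-account fair-share split used above (pure arithmetic).

def split_release(to_release, threshold, accounts):
    """accounts: {name: {'transfer': int, 'waiting': int}} -> {name: requests to release}."""
    nr_accounts = max(len(accounts), 1)
    threshold_per_account = math.ceil(threshold / nr_accounts)
    to_release_per_account = math.ceil(to_release / nr_accounts)
    released = {}
    for account, counts in accounts.items():
        if nr_accounts == 1:
            released[account] = to_release            # last account gets whatever is left
        elif counts['transfer'] > threshold_per_account:
            released[account] = 0                     # already queued more than its share
            nr_accounts -= 1
            to_release_per_account = math.ceil(to_release / nr_accounts)
        elif counts['waiting'] < to_release_per_account:
            released[account] = counts['waiting']     # drain this account completely
            to_release -= counts['waiting']
            nr_accounts -= 1
            to_release_per_account = math.ceil(to_release / nr_accounts)
        else:
            released[account] = to_release_per_account
            to_release -= to_release_per_account
            nr_accounts -= 1
    return released


shares = split_release(to_release=60, threshold=90,
                       accounts={'alice': {'transfer': 50, 'waiting': 10},
                                 'bob': {'transfer': 5, 'waiting': 100},
                                 'carol': {'transfer': 5, 'waiting': 100}})
assert shares == {'alice': 0, 'bob': 30, 'carol': 30}
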
Example #6
def test_monitor_record_gauge_message(self):
    """MONITOR (CORE): Send a gauge message to graphite """
    monitor.record_gauge('test.gauge', 10)
Example #7
def __schedule_requests():
    """
    Schedule requests
    """
    try:
        logging.info("Throttler retrieve requests statistics")
        results = get_stats_by_activity_dest_state(state=[
            RequestState.QUEUED, RequestState.SUBMITTING,
            RequestState.SUBMITTED, RequestState.WAITING
        ])
        result_dict = {}
        for activity, dest_rse_id, account, state, rse, counter in results:
            threshold = get_config_limit(activity, dest_rse_id)

            if threshold or (counter and (state == RequestState.WAITING)):
                if activity not in result_dict:
                    result_dict[activity] = {}
                if dest_rse_id not in result_dict[activity]:
                    result_dict[activity][dest_rse_id] = {
                        'waiting': 0,
                        'transfer': 0,
                        'threshold': threshold,
                        'accounts': {},
                        'rse': rse
                    }
                if account not in result_dict[activity][dest_rse_id][
                        'accounts']:
                    result_dict[activity][dest_rse_id]['accounts'][account] = {
                        'waiting': 0,
                        'transfer': 0
                    }
                if state == RequestState.WAITING:
                    result_dict[activity][dest_rse_id]['accounts'][account][
                        'waiting'] += counter
                    result_dict[activity][dest_rse_id]['waiting'] += counter
                else:
                    result_dict[activity][dest_rse_id]['accounts'][account][
                        'transfer'] += counter
                    result_dict[activity][dest_rse_id]['transfer'] += counter

        for activity in result_dict:
            for dest_rse_id in result_dict[activity]:
                threshold = result_dict[activity][dest_rse_id]['threshold']
                transfer = result_dict[activity][dest_rse_id]['transfer']
                waiting = result_dict[activity][dest_rse_id]['waiting']
                rse_name = result_dict[activity][dest_rse_id]['rse']
                if waiting:
                    logging.debug("Request status for %s at %s: %s" %
                                  (activity, rse_name,
                                   result_dict[activity][dest_rse_id]))

                if threshold is None:
                    logging.debug(
                        "Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse_id %s"
                        % (threshold, activity, dest_rse_id))
                    delete_rse_transfer_limits(rse=None,
                                               activity=activity,
                                               rse_id=dest_rse_id)
                    release_waiting_requests(rse=None,
                                             activity=activity,
                                             rse_id=dest_rse_id)
                    record_counter(
                        'daemons.conveyor.throttler.delete_rse_transfer_limits.%s.%s'
                        % (activity, rse_name))

                elif transfer + waiting > threshold:
                    logging.debug(
                        "Throttler set limits for activity %s, rse %s" %
                        (activity, rse_name))
                    set_rse_transfer_limits(rse=None,
                                            activity=activity,
                                            rse_id=dest_rse_id,
                                            max_transfers=threshold,
                                            transfers=transfer,
                                            waitings=waiting)
                    record_gauge(
                        'daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.max_transfers'
                        % (activity, rse_name), threshold)
                    record_gauge(
                        'daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.transfers'
                        % (activity, rse_name), transfer)
                    record_gauge(
                        'daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.waitings'
                        % (activity, rse_name), waiting)
                    if transfer < 0.8 * threshold:
                        # release requests on account
                        nr_accounts = len(
                            result_dict[activity][dest_rse_id]['accounts'])
                        if nr_accounts < 1:
                            nr_accounts = 1
                        to_release = threshold - transfer
                        threshold_per_account = math.ceil(threshold /
                                                          nr_accounts)
                        to_release_per_account = math.ceil(to_release /
                                                           nr_accounts)
                        accounts = result_dict[activity][dest_rse_id][
                            'accounts']
                        for account in accounts:
                            if nr_accounts == 1:
                                logging.debug(
                                    "Throttler release %s waiting requests for activity %s, rse %s, account %s "
                                    %
                                    (to_release, activity, rse_name, account))
                                release_waiting_requests(rse=None,
                                                         activity=activity,
                                                         rse_id=dest_rse_id,
                                                         account=account,
                                                         count=to_release)
                                record_gauge(
                                    'daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s'
                                    % (activity, rse_name, account),
                                    to_release)

                            elif accounts[account][
                                    'transfer'] > threshold_per_account:
                                logging.debug(
                                    "Throttler will not release  %s waiting requests for activity %s, rse %s, account %s: It queued more transfers than its share "
                                    % (accounts[account]['waiting'], activity,
                                       rse_name, account))
                                nr_accounts -= 1
                                to_release_per_account = math.ceil(to_release /
                                                                   nr_accounts)
                            elif accounts[account][
                                    'waiting'] < to_release_per_account:
                                logging.debug(
                                    "Throttler release %s waiting requests for activity %s, rse %s, account %s "
                                    % (accounts[account]['waiting'], activity,
                                       rse_name, account))
                                release_waiting_requests(
                                    rse=None,
                                    activity=activity,
                                    rse_id=dest_rse_id,
                                    account=account,
                                    count=accounts[account]['waiting'])
                                record_gauge(
                                    'daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s'
                                    % (activity, rse_name, account),
                                    accounts[account]['waiting'])

                                to_release = to_release - accounts[account][
                                    'waiting']
                                nr_accounts -= 1
                                to_release_per_account = math.ceil(to_release /
                                                                   nr_accounts)
                            else:
                                logging.debug(
                                    "Throttler release %s waiting requests for activity %s, rse %s, account %s "
                                    % (to_release_per_account, activity,
                                       rse_name, account))
                                release_waiting_requests(
                                    rse=None,
                                    activity=activity,
                                    rse_id=dest_rse_id,
                                    account=account,
                                    count=to_release_per_account)
                                record_gauge(
                                    'daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s'
                                    % (activity, rse_name, account),
                                    to_release_per_account)

                                to_release = to_release - to_release_per_account
                                nr_accounts -= 1
                    else:
                        logging.debug(
                            "Throttler has done nothing for activity %s on rse %s (transfer > 0.8 * threshold)"
                            % (activity, rse_name))

                elif waiting > 0:
                    logging.debug(
                        "Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse %s"
                        % (threshold, activity, rse_name))
                    delete_rse_transfer_limits(rse=None,
                                               activity=activity,
                                               rse_id=dest_rse_id)
                    release_waiting_requests(rse=None,
                                             activity=activity,
                                             rse_id=dest_rse_id)
                    record_counter(
                        'daemons.conveyor.throttler.delete_rse_transfer_limits.%s.%s'
                        % (activity, rse_name))
    except Exception:
        logging.critical("Failed to schedule requests, error: %s" %
                         (traceback.format_exc()))
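
The first half of `__schedule_requests` only folds the raw (activity, destination RSE, account, state, count) rows into a nested dictionary; throttling decisions come afterwards. A database-free sketch of that aggregation follows, with the `threshold` and `rse` bookkeeping omitted for brevity (the `aggregate` helper is illustrative, not Rucio API):

# Illustrative, database-free sketch of the aggregation step above ('threshold' and
# 'rse' bookkeeping omitted). Each row is (activity, dest_rse_id, account, state, count).

def aggregate(rows):
    result = {}
    for activity, dest_rse_id, account, state, count in rows:
        rse_stats = result.setdefault(activity, {}).setdefault(
            dest_rse_id, {'waiting': 0, 'transfer': 0, 'accounts': {}})
        acct = rse_stats['accounts'].setdefault(account, {'waiting': 0, 'transfer': 0})
        key = 'waiting' if state == 'waiting' else 'transfer'
        acct[key] += count
        rse_stats[key] += count
    return result


stats = aggregate([('user', 'rse1', 'alice', 'waiting', 3),
                   ('user', 'rse1', 'alice', 'submitted', 2),
                   ('user', 'rse1', 'bob', 'waiting', 1)])
assert stats['user']['rse1'] == {'waiting': 4, 'transfer': 2,
                                 'accounts': {'alice': {'waiting': 3, 'transfer': 2},
                                              'bob': {'waiting': 1, 'transfer': 0}}}
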
Example #8
def rule_cleaner(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check for expired replication rules
    """

    logging.info('rule_cleaner: starting')

    logging.info('rule_cleaner: started')

    paused_rules = {}  # {rule_id: datetime}

    while not graceful_stop.is_set():
        try:
            start = time.time()
            rules = get_expired_rules(total_workers=total_processes*threads_per_process-1,
                                      worker_number=process*threads_per_process+thread,
                                      limit=1000)
            logging.debug('rule_cleaner index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            # Remove paused rules from result set
            rules = [rule for rule in rules if rule[0] not in paused_rules]

            if not rules and not once:
                logging.info('rule_cleaner[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 1)
                for rule in rules:
                    rule_id = rule[0]
                    rule_expression = rule[1]
                    logging.info('rule_cleaner[%s/%s]: Deleting rule %s with expression %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, rule_expression))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        delete_rule(rule_id=rule_id, nowait=True)
                        logging.debug('rule_cleaner[%s/%s]: deletion of %s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError, AccessDenied) as e:
                        if isinstance(e.args[0], tuple):
                            if match('.*ORA-00054.*', e.args[0][0]):
                                paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                                record_counter('rule.judge.exceptions.LocksDetected')
                                logging.warning('rule_cleaner[%s/%s]: Locks detected for %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id))
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
        except Exception as e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            return
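
When `delete_rule` runs into an Oracle row-lock error (ORA-00054), the cleaner parks the rule in `paused_rules` with a random resume time between one and ten minutes and skips it until that time has passed. A minimal sketch of that pause-and-prune pattern, lifted out of the daemon loop (hypothetical helper names):

from datetime import datetime, timedelta
from random import randint

# Illustrative sketch of the paused_rules back-off pattern used above.

paused_rules = {}  # {rule_id: datetime at which the rule may be retried}


def pause(rule_id):
    """Park a rule for 1-10 minutes after a database lock (e.g. ORA-00054) was detected."""
    paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))


def prune_and_filter(rules):
    """Drop expired pauses, then remove still-paused rules from the work list."""
    for rule_id in list(paused_rules):
        if datetime.utcnow() > paused_rules[rule_id]:
            del paused_rules[rule_id]
    return [rule for rule in rules if rule[0] not in paused_rules]


pause('rule-a')
assert prune_and_filter([('rule-a', 'expr'), ('rule-b', 'expr')]) == [('rule-b', 'expr')]
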
Example #9
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
        except Exception as e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            return

    logging.info('rule_cleaner: graceful stop requested')
    record_gauge('rule.judge.cleaner.threads.%d' % (process*threads_per_process+thread), 0)
    logging.info('rule_cleaner: graceful stop done')


def stop(signum=None, frame=None):
    """
    Graceful exit.
    """

    graceful_stop.set()


def run(once=False, process=0, total_processes=1, threads_per_process=1):
    """
    Starts up the Judge-Clean threads.
    """
Example #10
def process_miss_files(path, scope, rse, latest_run, max_miss_fraction,
                       max_files_at_site, old_enough_run, force_proceed):
    """
    Process the Missing Replicas.
    """

    prefix = 'storage-consistency-actions (process_miss_files())'
    logger = formatted_logger(logging.log, prefix + '%s')

    latest_miss = re.sub('_stats.json$', '_M.list', latest_run)
    logger(logging.INFO, 'latest_missing = %s' % latest_miss)

    # Create a cc_miss section in the stats file

    t0 = time.time()
    stats_key = "cc_miss"
    cc_stats = stats = None
    stats = Stats(latest_run)
    cc_stats = {
        "start_time": t0,
        "end_time": None,
        "initial_miss_files": 0,
        "confirmed_miss_files": 0,
        "x-check_run": old_enough_run,
        "status": "started"
    }
    stats[stats_key] = cc_stats

    ###
    #   SAFEGUARD
    #   If a large fraction (larger than 'max_miss_fraction') of the files at a site are reported as
    #   'missing', do NOT proceed with the invalidation.
    #   Instead, put a warning in the _stats.json file, so that an operator can have a look.
    ###

    miss_files = sum(1 for line in open(latest_miss))
    logger(logging.INFO, 'miss_files = %d' % miss_files)
    logger(
        logging.INFO,
        'miss_files/max_files_at_site = %f' % (miss_files / max_files_at_site))
    logger(
        logging.INFO, 'max_miss_fraction configured for this RSE (in %%): %f' %
        max_miss_fraction)

    labels = {'rse': rse}
    record_gauge('storage.consistency.actions_miss_files_found',
                 miss_files,
                 labels=labels)

    if miss_files / max_files_at_site < max_miss_fraction or force_proceed is True:
        logger(logging.INFO, 'Can proceed with missing files retransfer')

        invalidated_files = 0
        issuer = InternalAccount('root')
        with open(latest_miss, 'r') as csvfile:
            reader = csv.reader(csvfile)
            reason = "invalidating damaged/missing replica"
            for name, in reader:
                logger(
                    logging.INFO,
                    'Processing invalid replica:\n RSE: %s Scope: %s Name: %s'
                    % (rse, scope, name))

                rse_id = get_rse_id(rse=rse)
                dids = [{'scope': scope, 'name': name}]
                declare_bad_file_replicas(dids=dids,
                                          rse_id=rse_id,
                                          reason=reason,
                                          issuer=issuer)
                invalidated_files += 1
                record_counter(
                    'storage.consistency.actions_miss_files_to_retransfer_counter',
                    delta=1,
                    labels=labels)


            # TODO: The stats updating can be refactored in a future version of the Stats class.
            # See: https://github.com/rucio/rucio/pull/5120#discussion_r792688019
            # Update the stats
            t1 = time.time()

            cc_stats.update({
                "end_time": t1,
                "initial_miss_files": miss_files,
                "confirmed_miss": invalidated_files,
                "status": "done"
            })
            stats[stats_key] = cc_stats
            record_gauge(
                'storage.consistency.actions_miss_files_to_retransfer',
                invalidated_files,
                labels=labels)

    else:
        missperc = 100. * miss_files / max_files_at_site
        logger(
            logging.WARNING, '\n Too many MISS files (%3.2f%%)!\n\
         Stopping and asking for operators help.' % missperc)

        # Update the stats
        t1 = time.time()

        cc_stats.update({
            "end_time": t1,
            "initial_miss_files": miss_files,
            "confirmed_miss_files": 0,
            "status": "ABORTED",
            "aborted_reason": "%3.2f%% miss" % missperc,
        })
        stats[stats_key] = cc_stats
        record_gauge('storage.consistency.actions_miss_files_to_retransfer',
                     0,
                     labels=labels)
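
The SAFEGUARD block above is a single ratio test: invalidation only proceeds when the missing files are a small enough fraction of the files at the site, unless `force_proceed` overrides it. A compact sketch of that check, assuming a hypothetical `may_proceed` helper with the same semantics as the inline condition:

# Illustrative sketch of the SAFEGUARD ratio check applied above before any invalidation.

def may_proceed(miss_files, max_files_at_site, max_miss_fraction, force_proceed=False):
    """True when the missing fraction is below the configured limit, or the run is forced."""
    return force_proceed or (miss_files / max_files_at_site) < max_miss_fraction


assert may_proceed(miss_files=50, max_files_at_site=100000, max_miss_fraction=0.01)
assert not may_proceed(miss_files=5000, max_files_at_site=100000, max_miss_fraction=0.01)
assert may_proceed(5000, 100000, 0.01, force_proceed=True)
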
Example #11
def process_dark_files(path, scope, rse, latest_run, max_dark_fraction,
                       max_files_at_site, old_enough_run, force_proceed):
    """
    Process the Dark Files.
    """

    prefix = 'storage-consistency-actions (process_dark_files())'
    logger = formatted_logger(logging.log, prefix + '%s')

    # Create a cc_dark section in the stats file

    t0 = time.time()
    stats_key = "cc_dark"
    cc_stats = stats = None
    stats = Stats(latest_run)
    cc_stats = {
        "start_time": t0,
        "end_time": None,
        "initial_dark_files": 0,
        "confirmed_dark_files": 0,
        "x-check_run": old_enough_run,
        "status": "started"
    }
    stats[stats_key] = cc_stats

    # Compare the two lists, and take only the dark files that are in both
    latest_dark = re.sub('_stats.json$', '_D.list', latest_run)
    old_enough_dark = re.sub('_stats.json$', '_D.list', old_enough_run)
    logger(logging.INFO, 'latest_dark = %s' % latest_dark)
    logger(logging.INFO, 'old_enough_dark = %s' % old_enough_dark)
    confirmed_dark = re.sub('_stats.json$', '_DeletionList.csv', latest_run)
    cmp2dark(new_list=latest_dark,
             old_list=old_enough_dark,
             comm_list=confirmed_dark,
             stats_file=latest_run)

    ###
    #   SAFEGUARD
    #   If a large fraction (larger than 'max_dark_fraction') of the files at a site
    #   are reported as 'dark', do NOT proceed with the deletion.
    #   Instead, put a warning in the _stats.json file, so that an operator can have a look.
    ###

    # Get the number of files recorded by the scanner
    dark_files = sum(1 for line in open(latest_dark))
    confirmed_dark_files = sum(1 for line in open(confirmed_dark))
    logger(logging.INFO, 'dark_files %d' % dark_files)
    logger(logging.INFO, 'confirmed_dark_files %d' % confirmed_dark_files)
    logger(
        logging.INFO, 'confirmed_dark_files/max_files_at_sit = %f' %
        (confirmed_dark_files / max_files_at_site))
    logger(logging.INFO,
           'max_dark_fraction configured for this RSE: %f' % max_dark_fraction)

    # Labels for the Prometheus counters/gauges
    labels = {'rse': rse}

    record_gauge('storage.consistency.actions_dark_files_found',
                 confirmed_dark_files,
                 labels=labels)
    record_gauge('storage.consistency.actions_dark_files_confirmed',
                 confirmed_dark_files,
                 labels=labels)

    deleted_files = 0
    if confirmed_dark_files / max_files_at_site < max_dark_fraction or force_proceed is True:
        logger(logging.INFO, 'Can proceed with dark files deletion')

        # Then, do the real deletion (code from DeleteReplicas.py)
        issuer = InternalAccount('root')
        with open(confirmed_dark, 'r') as csvfile:
            reader = csv.reader(csvfile)
            for name, in reader:
                logger(
                    logging.INFO,
                    'Processing a dark file:\n RSE %s  Scope: %s  Name: %s' %
                    (rse, scope, name))
                rse_id = get_rse_id(rse=rse)
                Intscope = InternalScope(scope=scope, vo=issuer.vo)
                lfns = [{'scope': scope, 'name': name}]

                attributes = get_rse_info(rse=rse)
                pfns = lfns2pfns(rse_settings=attributes,
                                 lfns=lfns,
                                 operation='delete')
                pfn_key = scope + ':' + name
                url = pfns[pfn_key]
                urls = [url]
                paths = parse_pfns(attributes, urls, operation='delete')
                replicas = [{
                    'scope': Intscope,
                    'rse_id': rse_id,
                    'name': name,
                    'path': paths[url]['path'] + paths[url]['name']
                }]
                add_quarantined_replicas(rse_id, replicas, session=None)
                deleted_files += 1
                labels = {'rse': rse}
                record_counter(
                    'storage.consistency.actions_dark_files_deleted_counter',
                    delta=1,
                    labels=labels)


        # Update the stats
        t1 = time.time()

        cc_stats.update({
            "end_time": t1,
            "initial_dark_files": dark_files,
            "confirmed_dark_files": deleted_files,
            "status": "done"
        })
        stats[stats_key] = cc_stats
        record_gauge('storage.consistency.actions_dark_files_deleted',
                     deleted_files,
                     labels=labels)

    else:
        darkperc = 100. * confirmed_dark_files / max_files_at_site
        logger(
            logging.WARNING, '\n ATTENTION: Too many DARK files! (%3.2f%%) \n\
               Stopping and asking for operators help.' % darkperc)

        # Update the stats
        t1 = time.time()

        cc_stats.update({
            "end_time": t1,
            "initial_dark_files": dark_files,
            "confirmed_dark_files": 0,
            "status": "ABORTED",
            "aborted_reason": "%3.2f%% dark" % darkperc,
        })
        stats[stats_key] = cc_stats
        record_gauge('storage.consistency.actions_dark_files_deleted',
                     0,
                     labels=labels)
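
Example #11 only acts on 'dark' files that appear both in the latest scan and in the older, 'old enough' scan; `cmp2dark` writes that intersection to the deletion list. Its implementation is not shown here, so the following is only a rough, file-based sketch of the same idea (a hypothetical `confirm_dark` helper, not the real `cmp2dark`, which also updates the stats file):

# Rough sketch of the "confirmed dark" idea: keep only entries present in both scans.
# This is NOT the real cmp2dark; the actual helper also updates the stats file.

def confirm_dark(new_list, old_list, comm_list):
    with open(new_list) as f:
        new_entries = {line.strip() for line in f if line.strip()}
    with open(old_list) as f:
        old_entries = {line.strip() for line in f if line.strip()}
    confirmed = sorted(new_entries & old_entries)
    with open(comm_list, 'w') as f:
        for entry in confirmed:
            f.write(entry + '\n')
    return confirmed
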
Example #12
def rule_repairer(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check for STUCK replication rules
    """

    logging.info('rule_repairer: starting')

    logging.info('rule_repairer: started')

    paused_rules = {}  # {rule_id: datetime}

    while not graceful_stop.is_set():
        try:
            # Select a bunch of rules for this worker to repair
            start = time.time()
            rules = get_stuck_rules(total_workers=total_processes*threads_per_process-1,
                                    worker_number=process*threads_per_process+thread,
                                    delta=-1 if once else 600)
            logging.debug('rule_repairer index query time %f fetch size is %d' % (time.time() - start, len(rules)))

            # Refresh paused rules
            iter_paused_rules = deepcopy(paused_rules)
            for key in iter_paused_rules:
                if datetime.utcnow() > paused_rules[key]:
                    del paused_rules[key]

            # Remove paused rules from result set
            rules = [rule for rule in rules if rule[0] not in paused_rules]

            if not rules and not once:
                logging.info('rule_repairer[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.repairer.threads.%d' % (process*threads_per_process+thread), 1)
                for rule_id in rules:
                    rule_id = rule_id[0]
                    logging.info('rule_repairer[%s/%s]: Repairing rule %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id))
                    if graceful_stop.is_set():
                        break
                    try:
                        start = time.time()
                        repair_rule(rule_id=rule_id)
                        logging.debug('rule_repairer[%s/%s]: repairing of %s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id, time.time() - start))
                    except (DatabaseException, DatabaseError) as e:
                        if match('.*ORA-00054.*', str(e.args[0])):
                            paused_rules[rule_id] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                            logging.warning('rule_repairer[%s/%s]: Locks detected for %s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, rule_id))
                            record_counter('rule.judge.exceptions.LocksDetected')
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)

                record_gauge('rule.judge.repairer.threads.%d' % (process*threads_per_process+thread), 0)
        except Exception as e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.repairer.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            return
        else:
            time.sleep(30)
Example #13
def re_evaluator(once=False, process=0, total_processes=1, thread=0, threads_per_process=1):
    """
    Main loop to check for dids requiring re-evaluation.
    """

    logging.info('re_evaluator: starting')

    logging.info('re_evaluator: started')

    paused_dids = {}  # {(scope, name): datetime}

    while not graceful_stop.is_set():
        try:
            # Select a bunch of dids for re evaluation for this worker
            start = time.time()  # NOQA
            dids = get_updated_dids(total_workers=total_processes*threads_per_process-1,
                                    worker_number=process*threads_per_process+thread,
                                    limit=1000)
            logging.debug('Re-Evaluation index query time %f fetch size is %d' % (time.time() - start, len(dids)))

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            # Remove paused dids from result set
            dids = [did for did in dids if (did.scope, did.name) not in paused_dids]

            # If the list is empty, send the worker to sleep
            if not dids and not once:
                logging.info('re_evaluator[%s/%s] did not get any work' % (process*threads_per_process+thread, total_processes*threads_per_process-1))
                time.sleep(10)
            else:
                record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 1)

                done_dids = {}
                for did in dids:
                    if graceful_stop.is_set():
                        break

                    # Try to delete all duplicate dids
                    # delete_duplicate_updated_dids(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action, id=did.id)

                    # Check if this did has already been operated on
                    if '%s:%s' % (did.scope, did.name) in done_dids:
                        if did.rule_evaluation_action in done_dids['%s:%s' % (did.scope, did.name)]:
                            continue
                    else:
                        done_dids['%s:%s' % (did.scope, did.name)] = []
                    done_dids['%s:%s' % (did.scope, did.name)].append(did.rule_evaluation_action)

                    try:
                        start_time = time.time()
                        re_evaluate_did(scope=did.scope, name=did.name, rule_evaluation_action=did.rule_evaluation_action)
                        logging.debug('re_evaluator[%s/%s]: evaluation of %s:%s took %f' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name, time.time() - start_time))
                        delete_updated_did(id=did.id)
                    except DataIdentifierNotFound as e:
                        delete_updated_did(id=did.id)
                    except (DatabaseException, DatabaseError) as e:
                        if isinstance(e.args[0], tuple):
                            if match('.*ORA-00054.*', e.args[0][0]):
                                paused_dids[(did.scope, did.name)] = datetime.utcnow() + timedelta(seconds=randint(60, 600))
                                logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
                                record_counter('rule.judge.exceptions.LocksDetected')
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
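
Within one batch the re-evaluator keys finished work on 'scope:name' plus the rule evaluation action, so the same did is not re-evaluated twice for the same action. A small sketch of just that `done_dids` bookkeeping, with a hypothetical helper and plain tuples instead of did objects:

# Illustrative sketch of the done_dids bookkeeping used by re_evaluator.

def select_new_work(updates):
    """updates: iterable of (scope, name, action) -> entries not yet handled in this batch."""
    done_dids = {}
    todo = []
    for scope, name, action in updates:
        key = '%s:%s' % (scope, name)
        if action in done_dids.setdefault(key, []):
            continue                  # same did and action already handled in this batch
        done_dids[key].append(action)
        todo.append((scope, name, action))
    return todo


batch = [('mc16', 'dataset_a', 'ATTACH'),
         ('mc16', 'dataset_a', 'ATTACH'),
         ('mc16', 'dataset_a', 'DETACH')]
assert select_new_work(batch) == [('mc16', 'dataset_a', 'ATTACH'),
                                  ('mc16', 'dataset_a', 'DETACH')]
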
Example #14
                                logging.warning('re_evaluator[%s/%s]: Locks detected for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
                                record_counter('rule.judge.exceptions.LocksDetected')
                            else:
                                logging.error(traceback.format_exc())
                                record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        else:
                            logging.error(traceback.format_exc())
                            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                    except ReplicationRuleCreationTemporaryFailed as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.warning('re_evaluator[%s/%s]: Replica Creation temporary failed, retrying later for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
                    except FlushError as e:
                        record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
                        logging.critical('re_evaluator[%s/%s]: Flush error for %s:%s' % (process*threads_per_process+thread, total_processes*threads_per_process-1, did.scope, did.name))
                        logging.critical(traceback.format_exc())
                record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 0)
        except Exception as e:
            record_counter('rule.judge.exceptions.%s' % e.__class__.__name__)
            record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 0)
            logging.critical(traceback.format_exc())
        if once:
            break

    logging.info('re_evaluator: graceful stop requested')
    record_gauge('rule.judge.re_evaluate.threads.%d' % (process*threads_per_process+thread), 0)

    logging.info('re_evaluator: graceful stop done')


def stop(signum=None, frame=None):
    """