Beispiel #1
0
def reaper(rses,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60):
    """
    Main loop to select and delete files.

    :param rses:           List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param chunk_size:     The size of chunk for deletion.
    :param once:           If True, only runs one iteration of the main loop.
    :param greedy:         If True, delete right away replicas with tombstone.
    :param scheme:         Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:  The delay to query replicas in BEING_DELETED state.
    :param sleep_time:     Time between two cycles.
    """

    try:
        max_deletion_thread = get('reaper', 'nb_workers_by_hostname')
    except ConfigNotFound as error:
        max_deletion_thread = 5
    hostname = socket.getfqdn()
    executable = sys.argv[0]
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info('%s Reaper starting', prepend_str)

    time.sleep(
        10
    )  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info('%s Reaper started', prepend_str)

    while not GRACEFUL_STOP.is_set():

        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] +
                                                 1, heart_beat['nr_threads'])
            tot_needed_free_space = 0
            for rse in rses:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blacklisted
                if rse['availability'] % 2 == 0:
                    logging.debug('%s RSE %s is blacklisted for delete',
                                  prepend_str, rse['rse'])
                    continue
                max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(
                    rse['rse'], rse['id'], prepend_str)
                # Check if greedy mode
                if greedy:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        1000000000000, max_being_deleted_files
                    ]
                    tot_needed_free_space += 1000000000000
                else:
                    if needed_free_space:
                        dict_rses[(rse['rse'], rse['id'])] = [
                            needed_free_space, max_being_deleted_files
                        ]
                        tot_needed_free_space += needed_free_space
                    else:
                        logging.debug('%s Nothing to delete on %s',
                                      prepend_str, rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=itemgetter(1), reverse=True))
            logging.debug(
                '%s List of RSEs to process ordered by needed space desc : %s',
                prepend_str, str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostanme, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logging.debug('%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversily with the number of workers
                # The ceil increase the weight of the RSE with small amount of files to delete
                max_workers = ceil(dict_rses[rse_key][0] /
                                   tot_needed_free_space * 1000 /
                                   heart_beat['nr_threads'])
                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            skip_until_next_run = []
            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                if rse_id in skip_until_next_run:
                    continue
                logging.debug(
                    '%s Working on %s. Percentage of the total space needed %.2f',
                    prepend_str, rse_name,
                    needed_free_space / tot_needed_free_space * 100)
                rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logging.debug('%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]

                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logging.debug(
                        '%s Too many deletion threads for %s on RSE %s. Back off',
                        prepend_str, rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue

                logging.info(
                    '%s Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    prepend_str, rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logging.debug('%s Total deletion workers for %s : %i',
                              prepend_str, rse_hostname,
                              tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            session=None)
                    logging.debug(
                        '%s list_and_mark_unlocked_replicas  on %s for %s bytes in %s seconds: %s replicas',
                        prepend_str, rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logging.info(
                            '%s Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            prepend_str, rse_name, chunk_size, len(replicas))
                        skip_until_next_run.append(rse_id)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logging.error('%s %s', prepend_str, str(error))
                    continue
                except Exception:
                    logging.critical('%s %s', prepend_str,
                                     str(traceback.format_exc()))

                # Physical  deletion will take place there
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme)
                    for file_replicas in chunks(replicas, 100):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    rsemgr.lfns2pfns(
                                        rse_settings=rse_info,
                                        lfns=[{
                                            'scope': replica['scope'],
                                            'name': replica['name'],
                                            'path': replica['path']
                                        }],
                                        operation='delete',
                                        scheme=scheme).values()[0])
                                time.sleep(random.uniform(0, 0.01))
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logging.warning(
                                    '%s Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s',
                                    prepend_str, replica['scope'],
                                    replica['name'], rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logging.critical('%s %s', prepend_str,
                                                 str(traceback.format_exc()))

                        deleted_files = delete_from_storage(
                            file_replicas, prot, rse_info, staging_areas,
                            prepend_str)
                        logging.info('%s %i files processed in %s seconds',
                                     prepend_str, len(file_replicas),
                                     time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logging.debug(
                            '%s delete_replicas successed on %s : %s replicas in %s seconds',
                            prepend_str, rse_name, len(deleted_files),
                            time.time() - del_start)
                        monitor.record_counter(counters='reaper.deletion.done',
                                               delta=len(deleted_files))

                except Exception as error:
                    logging.critical('%s %s', prepend_str,
                                     str(traceback.format_exc()))

            if once:
                break

            tottime = time.time() - start_time
            if tottime < sleep_time:
                logging.info('%s Will sleep for %s seconds', prepend_str,
                             sleep_time - tottime)
                time.sleep(sleep_time - tottime)

        except DatabaseException as error:
            logging.warning('%s Reaper:  %s', prepend_str, str(error))
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
    return
Beispiel #2
0
def reaper(rses,
           include_rses,
           exclude_rses,
           vos=None,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60,
           auto_exclude_threshold=100,
           auto_exclude_timeout=600):
    """
    Main loop to select and delete files.

    :param rses:                   List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param include_rses:           RSE expression to include RSEs.
    :param exclude_rses:           RSE expression to exclude RSEs from the Reaper.
    :param vos:                    VOs on which to look for RSEs. Only used in multi-VO mode.
                                   If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param chunk_size:             The size of chunk for deletion.
    :param once:                   If True, only runs one iteration of the main loop.
    :param greedy:                 If True, delete right away replicas with tombstone.
    :param scheme:                 Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:          The delay to query replicas in BEING_DELETED state.
    :param sleep_time:             Time between two cycles.
    :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded.
    :param auto_exclude_timeout:   Timeout for temporarily excluded RSEs.
    """
    hostname = socket.getfqdn()
    executable = 'reaper'
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                      heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Reaper starting')

    if not once:
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                      heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Reaper started')

    while not GRACEFUL_STOP.is_set():
        # try to get auto exclude parameters from the config table. Otherwise use CLI parameters.
        try:
            auto_exclude_threshold = config_get('reaper',
                                                'auto_exclude_threshold',
                                                default=auto_exclude_threshold)
            auto_exclude_timeout = config_get('reaper',
                                              'auto_exclude_timeout',
                                              default=auto_exclude_timeout)
        except (NoOptionError, NoSectionError, RuntimeError):
            pass

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = config_get(
                'reaper', 'max_evaluator_backlog_count')
        except (NoOptionError, NoSectionError, RuntimeError):
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = config_get(
                'reaper', 'max_evaluator_backlog_duration')
        except (NoOptionError, NoSectionError, RuntimeError):
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog()
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[
                    0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[
                    1] and backlog[1] < datetime.utcnow() - timedelta(
                        minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses,
                                              vos)
        if not rses_to_process:
            logger(logging.ERROR,
                   'Reaper: No RSEs found. Will sleep for 30 seconds')
            GRACEFUL_STOP.wait(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'reaper[%i/%i] ' % (heart_beat['assign_thread'],
                                              heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blocklisted
                if rse['availability'] % 2 == 0:
                    logger(logging.DEBUG, 'RSE %s is blocklisted for delete',
                           rse['rse'])
                    continue
                needed_free_space, only_delete_obsolete = __check_rse_usage(
                    rse['rse'], rse['id'], greedy=greedy, logger=logger)
                if needed_free_space:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                    tot_needed_free_space += needed_free_space
                elif only_delete_obsolete:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                else:
                    logger(logging.DEBUG, 'Nothing to delete on %s',
                           rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True))
            logger(logging.DEBUG,
                   'List of RSEs to process ordered by needed space desc: %s',
                   str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostanme, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversily with the number of workers
                # The ceil increase the weight of the RSE with small amount of files to delete
                if tot_needed_free_space:
                    max_workers = ceil(dict_rses[rse_key][0] /
                                       tot_needed_free_space * 1000 /
                                       heart_beat['nr_threads'])
                else:
                    max_workers = 1

                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            paused_rses = []
            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id,
                                    expiration_time=120)
                if result is not NO_VALUE:
                    paused_rses.append(rse_name)
                    logger(
                        logging.DEBUG,
                        'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                        rse_name)
                    continue
                result = REGION.get('temporary_exclude_%s' % rse_id,
                                    expiration_time=auto_exclude_timeout)
                if result is not NO_VALUE:
                    logger(
                        logging.WARNING,
                        'Too many failed attempts for %s in last cycle. RSE is temporarly excluded.',
                        rse_name)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    continue
                labels = {'rse': rse_name}
                EXCLUDED_RSE_GAUGE.labels(**labels).set(0)
                percent = 0
                if tot_needed_free_space:
                    percent = needed_free_space / tot_needed_free_space * 100
                logger(
                    logging.DEBUG,
                    'Working on %s. Percentage of the total space needed %.2f',
                    rse_name, percent)
                try:
                    rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                except KeyError:
                    logger(logging.DEBUG, "Hostname lookup for %s failed.",
                           rse_name)
                    REGION.set('pause_deletion_%s' % rse_id, True)
                    continue
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]
                max_deletion_thread = get_max_deletion_threads_by_hostname(
                    rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logger(
                        logging.DEBUG,
                        'Too many deletion threads for %s on RSE %s. Back off',
                        rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue
                logger(
                    logging.INFO,
                    'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logger(logging.DEBUG, 'Total deletion workers for %s : %i',
                       rse_hostname, tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                only_delete_obsolete = dict_rses[(rse_name, rse_id)][1]
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        if only_delete_obsolete:
                            logger(
                                logging.DEBUG,
                                'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                                rse_name)
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes_=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            only_delete_obsolete=only_delete_obsolete,
                            session=None)
                    logger(
                        logging.DEBUG,
                        'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logger(
                            logging.DEBUG,
                            'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logger(logging.ERROR, '%s', str(error))
                    continue
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)
                # Physical  deletion will take place there
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme,
                                                  logger=logger)
                    for file_replicas in chunks(replicas, chunk_size):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    list(
                                        rsemgr.lfns2pfns(
                                            rse_settings=rse_info,
                                            lfns=[{
                                                'scope':
                                                replica['scope'].external,
                                                'name': replica['name'],
                                                'path': replica['path']
                                            }],
                                            operation='delete',
                                            scheme=scheme).values())[0])
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logger(
                                    logging.WARNING,
                                    'Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s',
                                    replica['scope'], replica['name'],
                                    rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logger(logging.CRITICAL,
                                       'Exception',
                                       exc_info=True)

                        deleted_files = delete_from_storage(
                            file_replicas,
                            prot,
                            rse_info,
                            staging_areas,
                            auto_exclude_threshold,
                            logger=logger)
                        logger(logging.INFO,
                               '%i files processed in %s seconds',
                               len(file_replicas),
                               time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logger(
                            logging.DEBUG,
                            'delete_replicas successed on %s : %s replicas in %s seconds',
                            rse_name, len(deleted_files),
                            time.time() - del_start)
                        DELETION_COUNTER.inc(len(deleted_files))
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            if paused_rses:
                logger(logging.INFO,
                       'Deletion paused for a while for following RSEs: %s',
                       ', '.join(paused_rses))

            if once:
                break

            daemon_sleep(start_time=start_time,
                         sleep_time=sleep_time,
                         graceful_stop=GRACEFUL_STOP,
                         logger=logger)

        except DatabaseException as error:
            logger(logging.WARNING, 'Reaper:  %s', str(error))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
    return
Beispiel #3
0
def _run_once(rses_to_process, chunk_size, greedy, scheme, delay_seconds,
              auto_exclude_threshold, auto_exclude_timeout, heartbeat_handler,
              oidc_account, oidc_scope, oidc_audience, **_kwargs):

    dict_rses = {}
    _, total_workers, logger = heartbeat_handler.live()
    tot_needed_free_space = 0
    for rse in rses_to_process:
        # Check if RSE is blocklisted
        if rse.columns['availability'] % 2 == 0:
            logger(logging.DEBUG, 'RSE %s is blocklisted for delete', rse.name)
            continue
        rse.ensure_loaded(load_attributes=True)
        enable_greedy = rse.attributes.get('greedyDeletion', False) or greedy
        needed_free_space, only_delete_obsolete = __check_rse_usage_cached(
            rse, greedy=enable_greedy, logger=logger)
        if needed_free_space:
            dict_rses[rse] = [
                needed_free_space, only_delete_obsolete, enable_greedy
            ]
            tot_needed_free_space += needed_free_space
        elif only_delete_obsolete:
            dict_rses[rse] = [
                needed_free_space, only_delete_obsolete, enable_greedy
            ]
        else:
            logger(logging.DEBUG, 'Nothing to delete on %s', rse.name)

    rses_with_params = [
        (rse, needed_free_space, only_delete_obsolete, enable_greedy)
        for rse, (needed_free_space, only_delete_obsolete,
                  enable_greedy) in dict_rses.items()
    ]

    # Ordering the RSEs based on the needed free space
    sorted_rses = sorted(rses_with_params, key=lambda x: x[1], reverse=True)
    log_msg_str = ', '.join(
        f'{rse}:{needed_free_space}:{only_delete_obsolete}:{enable_greedy}'
        for rse, needed_free_space, only_delete_obsolete, enable_greedy in
        sorted_rses)
    logger(logging.DEBUG,
           'List of RSEs to process ordered by needed space desc: %s',
           log_msg_str)

    random.shuffle(rses_with_params)

    work_remaining_by_rse = {}
    paused_rses = []
    for rse, needed_free_space, only_delete_obsolete, enable_greedy in rses_with_params:
        result = REGION.get('pause_deletion_%s' % rse.id, expiration_time=120)
        if result is not NO_VALUE:
            paused_rses.append(rse.name)
            logger(
                logging.DEBUG,
                'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                rse.name)
            continue

        result = REGION.get('temporary_exclude_%s' % rse.id,
                            expiration_time=auto_exclude_timeout)
        if result is not NO_VALUE:
            logger(
                logging.WARNING,
                'Too many failed attempts for %s in last cycle. RSE is temporarly excluded.',
                rse.name)
            EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(1)
            continue
        EXCLUDED_RSE_GAUGE.labels(rse=rse.name).set(0)

        percent = 0
        if tot_needed_free_space:
            percent = needed_free_space / tot_needed_free_space * 100
        logger(logging.DEBUG,
               'Working on %s. Percentage of the total space needed %.2f',
               rse.name, percent)

        rse_hostname = _rse_deletion_hostname(rse)
        if not rse_hostname:
            logger(logging.WARNING, 'No default delete protocol for %s',
                   rse.name)
            REGION.set('pause_deletion_%s' % rse.id, True)
            continue

        hb_payload = __try_reserve_worker_slot(
            heartbeat_handler=heartbeat_handler,
            rse=rse,
            hostname=rse_hostname,
            logger=logger)
        if not hb_payload:
            # Might need to reschedule a try on this RSE later in the same cycle
            continue

        # List and mark BEING_DELETED the files to delete
        del_start_time = time.time()
        try:
            use_temp_tables = config_get_bool('core',
                                              'use_temp_tables',
                                              default=False)
            with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                if only_delete_obsolete:
                    logger(
                        logging.DEBUG,
                        'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                        rse.name)
                if use_temp_tables:
                    replicas = list_and_mark_unlocked_replicas(
                        limit=chunk_size,
                        bytes_=needed_free_space,
                        rse_id=rse.id,
                        delay_seconds=delay_seconds,
                        only_delete_obsolete=only_delete_obsolete,
                        session=None)
                else:
                    replicas = list_and_mark_unlocked_replicas_no_temp_table(
                        limit=chunk_size,
                        bytes_=needed_free_space,
                        rse_id=rse.id,
                        delay_seconds=delay_seconds,
                        only_delete_obsolete=only_delete_obsolete,
                        session=None)
            logger(
                logging.DEBUG,
                'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                rse.name, needed_free_space,
                time.time() - del_start_time, len(replicas))
            if (len(replicas) == 0
                    and enable_greedy) or (len(replicas) < chunk_size
                                           and not enable_greedy):
                logger(
                    logging.DEBUG,
                    'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                    rse.name, chunk_size, len(replicas))
                REGION.set('pause_deletion_%s' % rse.id, True)
                work_remaining_by_rse[rse] = False
            else:
                work_remaining_by_rse[rse] = True

        except (DatabaseException, IntegrityError, DatabaseError) as error:
            logger(logging.ERROR, '%s', str(error))
            continue
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
            continue
        # Physical  deletion will take place there
        try:
            rse.ensure_loaded(load_info=True, load_attributes=True)
            auth_token = None
            if oidc_account and rse.attributes.get('oidc_support', False):
                account = InternalAccount(oidc_account, vo=rse.columns['vo'])
                token_dict = get_token_for_account_operation(
                    account,
                    req_audience=oidc_audience,
                    req_scope=oidc_scope,
                    admin=True)
                if token_dict is not None and 'token' in token_dict:
                    auth_token = token_dict['token']
                    logger(logging.DEBUG,
                           'OIDC authentication used for deletion.')
            prot = rsemgr.create_protocol(rse.info,
                                          'delete',
                                          scheme=scheme,
                                          auth_token=auth_token,
                                          logger=logger)
            for file_replicas in chunks(replicas, chunk_size):
                # Refresh heartbeat
                _, total_workers, logger = heartbeat_handler.live(
                    payload=hb_payload)
                del_start_time = time.time()
                for replica in file_replicas:
                    try:
                        replica['pfn'] = str(
                            list(
                                rsemgr.lfns2pfns(rse_settings=rse.info,
                                                 lfns=[{
                                                     'scope':
                                                     replica['scope'].external,
                                                     'name':
                                                     replica['name'],
                                                     'path':
                                                     replica['path']
                                                 }],
                                                 operation='delete',
                                                 scheme=scheme).values())[0])
                    except (ReplicaUnAvailable, ReplicaNotFound) as error:
                        logger(
                            logging.WARNING,
                            'Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s',
                            replica['scope'], replica['name'], rse.name,
                            str(error))
                        replica['pfn'] = None

                    except Exception:
                        logger(logging.CRITICAL, 'Exception', exc_info=True)

                is_staging = rse.columns['staging_area']
                deleted_files = delete_from_storage(heartbeat_handler,
                                                    hb_payload,
                                                    file_replicas,
                                                    prot,
                                                    rse.info,
                                                    is_staging,
                                                    auto_exclude_threshold,
                                                    logger=logger)
                logger(logging.INFO, '%i files processed in %s seconds',
                       len(file_replicas),
                       time.time() - del_start_time)

                # Then finally delete the replicas
                del_start = time.time()
                with monitor.record_timer_block('reaper.delete_replicas'):
                    delete_replicas(rse_id=rse.id, files=deleted_files)
                logger(
                    logging.DEBUG,
                    'delete_replicas successed on %s : %s replicas in %s seconds',
                    rse.name, len(deleted_files),
                    time.time() - del_start)
                DELETION_COUNTER.inc(len(deleted_files))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)

    if paused_rses:
        logger(logging.INFO,
               'Deletion paused for a while for following RSEs: %s',
               ', '.join(paused_rses))

    rses_with_more_work = [
        rse for rse, has_more_work in work_remaining_by_rse.items()
        if has_more_work
    ]
    return rses_with_more_work