Example 1
def get(section, option, issuer=None):
    """
    Get an option value for the named section. Value can be auto-coerced to int, float, and bool; string otherwise.

    Caveat emptor: Strings matching 'on'/'off', 'true'/'false', 'yes'/'no', regardless of case, are converted to bool.
                   0/1 are converted to int, and not to bool.

    :param section: The name of the section.
    :param option: The name of the option.
    :param issuer: The issuer account.
    :returns: The auto-coerced value.
    """

    kwargs = {'issuer': issuer, 'section': section, 'option': option}
    if not permission.has_permission(issuer=issuer, action='config_get', kwargs=kwargs):
        raise exception.AccessDenied('%s cannot retrieve option %s from section %s' % (issuer, option, section))
    return config.get(section, option)
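
A quick illustration of the coercion rules described in the docstring (a standalone sketch of the documented behaviour, not the actual rucio config implementation):

def _coerce(value):
    # Sketch only: mirrors the rules stated in the docstring above.
    if value.lower() in ('true', 'yes', 'on'):
        return True
    if value.lower() in ('false', 'no', 'off'):
        return False
    try:
        return int(value)   # '0'/'1' become int, not bool
    except ValueError:
        pass
    try:
        return float(value)
    except ValueError:
        return value        # fall back to the raw string

assert _coerce('Yes') is True and _coerce('1') == 1 and _coerce('1.5') == 1.5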
Example 2
def __schedule_requests():
    """
    Schedule requests
    """
    try:
        logging.info("Throttler retrieve requests statistics")
        results = get_stats_by_activity_dest_state(state=[
            RequestState.QUEUED, RequestState.SUBMITTING,
            RequestState.SUBMITTED, RequestState.WAITING
        ])
        result_dict = {}
        for activity, dest_rse_id, account, state, rse, counter in results:
            threshold = get_config_limit(activity, dest_rse_id)
            if threshold or (counter and (state == RequestState.WAITING)):
                if dest_rse_id not in result_dict:
                    result_dict[dest_rse_id] = {
                        'waiting': 0,
                        'transfer': 0,
                        'threshold': get_config_limit('all_activities', dest_rse_id),
                        'rse': rse,
                        'activities': {}
                    }

                if activity not in result_dict[dest_rse_id]['activities']:
                    result_dict[dest_rse_id]['activities'][activity] = {
                        'waiting': 0,
                        'transfer': 0,
                        'threshold': threshold,
                        'accounts': {}
                    }
                if account not in result_dict[dest_rse_id]['activities'][
                        activity]['accounts']:
                    result_dict[dest_rse_id]['activities'][activity][
                        'accounts'][account] = {
                            'waiting': 0,
                            'transfer': 0
                        }
                if state == RequestState.WAITING:
                    result_dict[dest_rse_id]['activities'][activity][
                        'accounts'][account]['waiting'] += counter
                    result_dict[dest_rse_id]['activities'][activity][
                        'waiting'] += counter
                    result_dict[dest_rse_id]['waiting'] += counter
                else:
                    result_dict[dest_rse_id]['activities'][activity][
                        'accounts'][account]['transfer'] += counter
                    result_dict[dest_rse_id]['activities'][activity][
                        'transfer'] += counter
                    result_dict[dest_rse_id]['transfer'] += counter

        for dest_rse_id in result_dict:
            dest_rse_release_strategy = config_core.get(
                'throttler_release_strategy',
                'dest_%s' % dest_rse_id,
                default='fifo',
                use_cache=False)
            rse_name = result_dict[dest_rse_id]['rse']
            availability = get_rse(rse_name).availability
            if availability & 2:  # dest_rse is not blacklisted for write
                if dest_rse_release_strategy == 'grouped_fifo':
                    threshold = result_dict[dest_rse_id]['threshold']
                    transfer = result_dict[dest_rse_id]['transfer']
                    waiting = result_dict[dest_rse_id]['waiting']
                    if threshold and transfer + waiting > threshold:
                        record_gauge(
                            'daemons.conveyor.throttler.set_rse_transfer_limits.%s.max_transfers'
                            % (rse_name), threshold)
                        record_gauge(
                            'daemons.conveyor.throttler.set_rse_transfer_limits.%s.transfers'
                            % (rse_name), transfer)
                        record_gauge(
                            'daemons.conveyor.throttler.set_rse_transfer_limits.%s.waitings'
                            % (rse_name), waiting)
                        if transfer < 0.8 * threshold:
                            to_be_released = threshold - transfer
                            release_waiting_requests_grouped_fifo(
                                rse_name,
                                rse_id=dest_rse_id,
                                count=to_be_released)
                        else:
                            logging.debug(
                                "Throttler has done nothing on rse %s (transfer > 0.8 * threshold)"
                                % rse_name)
                    elif waiting > 0 or not threshold:
                        logging.debug(
                            "Throttler remove limits(threshold: %s) and release all waiting requests, rse %s"
                            % (threshold, rse_name))
                        delete_rse_transfer_limits(rse=None,
                                                   activity=activity,
                                                   rse_id=dest_rse_id)
                        release_all_waiting_requests(rse=None,
                                                     rse_id=dest_rse_id)
                        record_counter(
                            'daemons.conveyor.throttler.delete_rse_transfer_limits.%s'
                            % (rse_name))
                elif dest_rse_release_strategy == 'fifo':
                    for activity in result_dict[dest_rse_id]['activities']:
                        threshold = result_dict[dest_rse_id]['activities'][
                            activity]['threshold']
                        transfer = result_dict[dest_rse_id]['activities'][
                            activity]['transfer']
                        waiting = result_dict[dest_rse_id]['activities'][
                            activity]['waiting']
                        if waiting:
                            logging.debug(
                                "Request status for %s at %s: %s" %
                                (activity, rse_name, result_dict[dest_rse_id]
                                 ['activities'][activity]))
                        if threshold is None:
                            logging.debug(
                                "Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse_id %s"
                                % (threshold, activity, dest_rse_id))
                            delete_rse_transfer_limits(rse=None,
                                                       activity=activity,
                                                       rse_id=dest_rse_id)
                            release_all_waiting_requests(rse=None,
                                                         activity=activity,
                                                         rse_id=dest_rse_id)
                            record_counter(
                                'daemons.conveyor.throttler.delete_rse_transfer_limits.%s.%s'
                                % (activity, rse_name))
                        elif transfer + waiting > threshold:
                            logging.debug(
                                "Throttler set limits for activity %s, rse %s"
                                % (activity, rse_name))
                            set_rse_transfer_limits(rse=None,
                                                    activity=activity,
                                                    rse_id=dest_rse_id,
                                                    max_transfers=threshold,
                                                    transfers=transfer,
                                                    waitings=waiting)
                            record_gauge(
                                'daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.max_transfers'
                                % (activity, rse_name), threshold)
                            record_gauge(
                                'daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.transfers'
                                % (activity, rse_name), transfer)
                            record_gauge(
                                'daemons.conveyor.throttler.set_rse_transfer_limits.%s.%s.waitings'
                                % (activity, rse_name), waiting)
                            if transfer < 0.8 * threshold:
                                # release requests on account
                                nr_accounts = len(
                                    result_dict[dest_rse_id]['activities']
                                    [activity]['accounts'])
                                if nr_accounts < 1:
                                    nr_accounts = 1
                                to_release = threshold - transfer
                                threshold_per_account = math.ceil(threshold /
                                                                  nr_accounts)
                                to_release_per_account = math.ceil(to_release /
                                                                   nr_accounts)
                                accounts = result_dict[dest_rse_id][
                                    'activities'][activity]['accounts']
                                for account in accounts:
                                    if nr_accounts == 1:
                                        logging.debug(
                                            "Throttler release %s waiting requests for activity %s, rse %s, account %s "
                                            % (to_release, activity, rse_name,
                                               account))
                                        release_waiting_requests_fifo(
                                            rse=None,
                                            activity=activity,
                                            rse_id=dest_rse_id,
                                            account=account,
                                            count=to_release)
                                        record_gauge(
                                            'daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s'
                                            % (activity, rse_name, account),
                                            to_release)
                                    elif accounts[account][
                                            'transfer'] > threshold_per_account:
                                        logging.debug(
                                            "Throttler will not release  %s waiting requests for activity %s, rse %s, account %s: It queued more transfers than its share "
                                            % (accounts[account]['waiting'],
                                               activity, rse_name, account))
                                        nr_accounts -= 1
                                        to_release_per_account = math.ceil(
                                            to_release / nr_accounts)
                                    elif accounts[account][
                                            'waiting'] < to_release_per_account:
                                        logging.debug(
                                            "Throttler release %s waiting requests for activity %s, rse %s, account %s "
                                            % (accounts[account]['waiting'],
                                               activity, rse_name, account))
                                        release_waiting_requests_fifo(
                                            rse=None,
                                            activity=activity,
                                            rse_id=dest_rse_id,
                                            account=account,
                                            count=accounts[account]['waiting'])
                                        record_gauge(
                                            'daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s'
                                            % (activity, rse_name, account),
                                            accounts[account]['waiting'])
                                        to_release = to_release - accounts[
                                            account]['waiting']
                                        nr_accounts -= 1
                                        to_release_per_account = math.ceil(
                                            to_release / nr_accounts)
                                    else:
                                        logging.debug(
                                            "Throttler release %s waiting requests for activity %s, rse %s, account %s "
                                            % (to_release_per_account,
                                               activity, rse_name, account))
                                        release_waiting_requests_fifo(
                                            rse=None,
                                            activity=activity,
                                            rse_id=dest_rse_id,
                                            account=account,
                                            count=to_release_per_account)
                                        record_gauge(
                                            'daemons.conveyor.throttler.release_waiting_requests.%s.%s.%s'
                                            % (activity, rse_name, account),
                                            to_release_per_account)
                                        to_release = to_release - to_release_per_account
                                        nr_accounts -= 1
                            else:
                                logging.debug(
                                    "Throttler has done nothing for activity %s on rse %s (transfer > 0.8 * threshold)"
                                    % (activity, rse_name))

                        elif waiting > 0:
                            logging.debug(
                                "Throttler remove limits(threshold: %s) and release all waiting requests for activity %s, rse %s"
                                % (threshold, activity, rse_name))
                            delete_rse_transfer_limits(rse=None,
                                                       activity=activity,
                                                       rse_id=dest_rse_id)
                            release_all_waiting_requests(rse=None,
                                                         activity=activity,
                                                         rse_id=dest_rse_id)
                            record_counter(
                                'daemons.conveyor.throttler.delete_rse_transfer_limits.%s.%s'
                                % (activity, rse_name))
    except Exception:
        logging.critical("Failed to schedule requests, error: %s" %
                         (traceback.format_exc()))
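
The fifo branch above splits the releasable slots evenly across accounts and re-divides whenever an account needs less than its share. A minimal standalone sketch of that splitting (illustrative only, plain dicts instead of database state, and the over-quota branch left out for brevity):

import math

def split_release(to_release, accounts):
    """accounts: {account: waiting_count} -> {account: count_to_release}."""
    released = {}
    nr_accounts = max(len(accounts), 1)
    per_account = math.ceil(to_release / nr_accounts)
    for account, waiting in accounts.items():
        count = waiting if waiting < per_account else per_account
        released[account] = count
        to_release -= count
        nr_accounts = max(nr_accounts - 1, 1)
        per_account = math.ceil(to_release / nr_accounts)
    return released

print(split_release(10, {'alice': 2, 'bob': 7, 'carol': 9}))
# {'alice': 2, 'bob': 4, 'carol': 4}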
Example 3
def reaper(rses,
           include_rses,
           exclude_rses,
           vos=None,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60,
           auto_exclude_threshold=100,
           auto_exclude_timeout=600):
    """
    Main loop to select and delete files.

    :param rses:                   List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param include_rses:           RSE expression to include RSEs.
    :param exclude_rses:           RSE expression to exclude RSEs from the Reaper.
    :param vos:                    VOs on which to look for RSEs. Only used in multi-VO mode.
                                   If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param chunk_size:             The size of chunk for deletion.
    :param once:                   If True, only runs one iteration of the main loop.
    :param greedy:                 If True, delete right away replicas with tombstone.
    :param scheme:                 Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:          The delay to query replicas in BEING_DELETED state.
    :param sleep_time:             Time between two cycles.
    :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded.
    :param auto_exclude_timeout:   Timeout for temporarily excluded RSEs.
    """
    hostname = socket.getfqdn()
    executable = 'reaper2'
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                       heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Reaper starting')

    if not once:
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                       heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Reaper started')

    while not GRACEFUL_STOP.is_set():
        # try to get auto exclude parameters from the config table. Otherwise use CLI parameters.
        try:
            auto_exclude_threshold = get('reaper',
                                         'auto_exclude_threshold',
                                         default=auto_exclude_threshold)
            auto_exclude_timeout = get('reaper',
                                       'auto_exclude_timeout',
                                       default=auto_exclude_timeout)
        except ConfigNotFound:
            pass

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = get('reaper',
                                              'max_evaluator_backlog_count')
        except ConfigNotFound:
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = get(
                'reaper', 'max_evaluator_backlog_duration')
        except ConfigNotFound:
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog()
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[
                    0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[
                    1] and backlog[1] < datetime.utcnow() - timedelta(
                        minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses,
                                              vos)
        if not rses_to_process:
            logger(logging.ERROR,
                   'Reaper: No RSEs found. Will sleep for 30 seconds')
            GRACEFUL_STOP.wait(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                               heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blacklisted
                if rse['availability'] % 2 == 0:
                    logger(logging.DEBUG, 'RSE %s is blacklisted for delete',
                           rse['rse'])
                    continue
                max_being_deleted_files, needed_free_space, used, free, only_delete_obsolete = __check_rse_usage(
                    rse['rse'], rse['id'], logger)
                # Check if greedy mode
                if greedy:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        1000000000000, max_being_deleted_files,
                        only_delete_obsolete
                    ]
                    tot_needed_free_space += 1000000000000
                else:
                    if needed_free_space:
                        dict_rses[(rse['rse'], rse['id'])] = [
                            needed_free_space, max_being_deleted_files,
                            only_delete_obsolete
                        ]
                        tot_needed_free_space += needed_free_space
                    elif only_delete_obsolete:
                        dict_rses[(rse['rse'], rse['id'])] = [
                            needed_free_space, max_being_deleted_files,
                            only_delete_obsolete
                        ]
                    else:
                        logger(logging.DEBUG, 'Nothing to delete on %s',
                               rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=itemgetter(1), reverse=True))
            logger(logging.DEBUG,
                   'List of RSEs to process ordered by needed space desc: %s',
                   str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostname, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversely with the number of workers
                # The ceil increases the weight of RSEs with a small amount of files to delete
                if tot_needed_free_space:
                    max_workers = ceil(dict_rses[rse_key][0] /
                                       tot_needed_free_space * 1000 /
                                       heart_beat['nr_threads'])
                else:
                    max_workers = 1

                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id,
                                    expiration_time=120)
                if result is not NO_VALUE:
                    logger(
                        logging.INFO,
                        'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                        rse_name)
                    continue
                result = REGION.get('temporary_exclude_%s' % rse_id,
                                    expiration_time=auto_exclude_timeout)
                if result is not NO_VALUE:
                    logger(
                        logging.WARNING,
                        'Too many failed attempts for %s in the last cycle. RSE is temporarily excluded.',
                        rse_name)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    continue
                labels = {'rse': rse_name}
                EXCLUDED_RSE_GAUGE.labels(**labels).set(0)
                percent = 0
                if tot_needed_free_space:
                    percent = needed_free_space / tot_needed_free_space * 100
                logger(
                    logging.DEBUG,
                    'Working on %s. Percentage of the total space needed %.2f',
                    rse_name, percent)
                rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]

                max_deletion_thread = get_max_deletion_threads_by_hostname(
                    rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logger(
                        logging.DEBUG,
                        'Too many deletion threads for %s on RSE %s. Back off',
                        rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue

                logger(
                    logging.INFO,
                    'Number of workers on %s below the limit (current %i vs max %i). Starting new worker on RSE %s',
                    rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logger(logging.DEBUG, 'Total deletion workers for %s : %i',
                       rse_hostname, tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                only_delete_obsolete = dict_rses[(rse_name, rse_id)][2]
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        if only_delete_obsolete:
                            logger(
                                logging.DEBUG,
                                'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                                rse_name)
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            only_delete_obsolete=only_delete_obsolete,
                            session=None)
                    logger(
                        logging.DEBUG,
                        'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logger(
                            logging.DEBUG,
                            'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logger(logging.ERROR, '%s', str(error))
                    continue
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

                # Physical deletion takes place here
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme)
                    for file_replicas in chunks(replicas, 100):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    list(
                                        rsemgr.lfns2pfns(
                                            rse_settings=rse_info,
                                            lfns=[{
                                                'scope':
                                                replica['scope'].external,
                                                'name': replica['name'],
                                                'path': replica['path']
                                            }],
                                            operation='delete',
                                            scheme=scheme).values())[0])
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logger(
                                    logging.WARNING,
                                    'Failed to get pfn for UNAVAILABLE replica %s:%s on %s with error %s',
                                    replica['scope'], replica['name'],
                                    rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logger(logging.CRITICAL,
                                       'Exception',
                                       exc_info=True)

                        deleted_files = delete_from_storage(
                            file_replicas, prot, rse_info, staging_areas,
                            auto_exclude_threshold, logger)
                        logger(logging.INFO,
                               '%i files processed in %s seconds',
                               len(file_replicas),
                               time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logger(
                            logging.DEBUG,
                            'delete_replicas succeeded on %s: %s replicas in %s seconds',
                            rse_name, len(deleted_files),
                            time.time() - del_start)
                        monitor.record_counter(counters='reaper.deletion.done',
                                               delta=len(deleted_files))
                        DELETION_COUNTER.inc(len(deleted_files))
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            if once:
                break

            tottime = time.time() - start_time
            if tottime < sleep_time:
                logger(logging.INFO, 'Will sleep for %s seconds',
                       sleep_time - tottime)
                GRACEFUL_STOP.wait(sleep_time - tottime)

        except DatabaseException as error:
            logger(logging.WARNING, 'Reaper:  %s', str(error))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
    return
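
Several of these snippets test the RSE availability field as a bitmask (availability & 2 for write in the throttler, availability % 2 == 0 for delete here, availability & 4 for read in bb8 below). A small decoding helper, assuming the read=4 / write=2 / delete=1 layout those checks imply:

def decode_availability(availability):
    # Assumed bit layout, inferred from the checks in the surrounding snippets:
    # value 4 = read, value 2 = write, value 1 = delete.
    return {
        'read': bool(availability & 4),
        'write': bool(availability & 2),
        'delete': bool(availability & 1),  # availability % 2 == 0 means delete is disabled
    }

print(decode_availability(7))  # {'read': True, 'write': True, 'delete': True}
print(decode_availability(6))  # delete disabled, i.e. "blacklisted for delete" above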
Example 4
def add_exception(dids, account, pattern, comments, expires_at, session=None):
    """
    Add exceptions to Lifetime Model.

    :param dids:        The list of dids
    :param account:     The account of the requester.
    :param pattern:     The pattern.
    :param comments:    The comments associated to the exception.
    :param expires_at:  The expiration date of the exception.
    :param session:     The database session in use.

    :returns:           The id of the exception.
    """
    exception_id = generate_uuid()
    text = 'Account %s requested a lifetime extension for a list of DIDs that can be found below\n' % account
    reason = comments
    volume = None
    lifetime = None
    if comments.find('||||') > -1:
        reason, volume = comments.split('||||')
    text += 'The reason for the extension is "%s"\n' % reason
    text += 'It represents %s datasets\n' % len(dids)
    if volume:
        text += 'The estimated physical volume is %s\n' % volume
    if expires_at and isinstance(expires_at, string_types):
        lifetime = str_to_date(expires_at)
        text += 'The lifetime exception should expire on %s\n' % str(expires_at)
    elif isinstance(expires_at, datetime):
        lifetime = expires_at
        text += 'The lifetime exception should expire on %s\n' % str(expires_at)
    text += 'Link to approve or reject this request can be found at the end of the mail\n'
    text += '\n'
    text += 'DIDTYPE SCOPE NAME\n'
    text += '\n'
    truncated_message = False
    for did in dids:
        did_type = None
        if 'did_type' in did:
            if isinstance(did['did_type'], string_types):
                did_type = DIDType.from_sym(did['did_type'])
            else:
                did_type = did['did_type']
        new_exception = models.LifetimeExceptions(
            id=exception_id,
            scope=did['scope'],
            name=did['name'],
            did_type=did_type,
            account=account,
            pattern=pattern,
            comments=reason,
            state=LifetimeExceptionsState.WAITING,
            expires_at=lifetime)
        if len(text) < 3000:
            text += '%s %s %s\n' % (str(did_type), did['scope'], did['name'])
        else:
            truncated_message = True
        try:
            new_exception.save(session=session, flush=False)
        except IntegrityError as error:
            if match('.*ORA-00001.*', str(error.args[0]))\
               or match('.*IntegrityError.*UNIQUE constraint failed.*', str(error.args[0]))\
               or match('.*1062.*Duplicate entry.*for key.*', str(error.args[0]))\
               or match('.*sqlite3.IntegrityError.*are not unique.*', error.args[0]):
                raise LifetimeExceptionDuplicate()
            raise RucioException(error.args[0])
    if truncated_message:
        text += '...\n'
        text += 'List too long. Truncated\n'
    text += '\n'
    text += 'Approve:   https://rucio-ui.cern.ch/lifetime_exception?id=%s&action=approve\n' % str(
        exception_id)
    text += 'Deny:      https://rucio-ui.cern.ch/lifetime_exception?id=%s&action=deny\n' % str(
        exception_id)
    approvers_email = get('lifetime_model',
                          'approvers_email',
                          default=[],
                          session=session)
    if approvers_email:
        approvers_email = approvers_email.split(',')  # pylint: disable=no-member

    add_message(event_type='email',
                payload={
                    'body': text,
                    'to': approvers_email,
                    'subject': '[RUCIO] Request to approve lifetime exception %s' % str(exception_id)
                },
                session=session)
    return exception_id
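
A hedged usage sketch for add_exception (scope, names and dates are invented; the '||||' separator packing the reason and the estimated volume follows the parsing done above):

from datetime import datetime

dids = [{'scope': 'user.jdoe', 'name': 'dataset_1', 'did_type': 'DATASET'},
        {'scope': 'user.jdoe', 'name': 'dataset_2', 'did_type': 'DATASET'}]
comments = 'Needed for the reprocessing campaign||||150 TB'  # reason '||||' estimated volume
exception_id = add_exception(dids=dids,
                             account='jdoe',
                             pattern=None,
                             comments=comments,
                             expires_at=datetime(2024, 12, 31))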
Example 5
File: bb8.py Project: rak108/rucio
def rule_rebalancer(rse_expression,
                    move_subscriptions=False,
                    use_dump=False,
                    sleep_time=300,
                    once=True,
                    dry_run=False):
    """
    Main loop to rebalance rules automatically
    """

    total_rebalance_volume = 0
    executable = 'rucio-bb8'
    hostname = socket.gethostname()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'bb8[%i/%i] ' % (heart_beat['assign_thread'],
                                   heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.DEBUG, 'rse_expression: %s', rse_expression)
    logger(logging.INFO, 'BB8 started')

    while not GRACEFUL_STOP.is_set():
        logger(logging.INFO, 'Starting new cycle')
        heart_beat = live(executable, hostname, pid, hb_thread)
        start_time = time.time()
        total_rebalance_volume = 0
        tolerance = config_core.get('bb8', 'tolerance', default=0.05)
        max_total_rebalance_volume = config_core.get(
            'bb8', 'max_total_rebalance_volume', default=10 * 1E12)
        max_rse_rebalance_volume = config_core.get('bb8',
                                                   'max_rse_rebalance_volume',
                                                   default=500 * 1E9)
        min_total = config_core.get('bb8', 'min_total', default=20 * 1E9)
        payload_cnt = list_payload_counts(executable,
                                          older_than=600,
                                          hash_executable=None,
                                          session=None)
        if rse_expression in payload_cnt:
            logger(
                logging.WARNING,
                'One BB8 instance already running with the same RSE expression. Stopping'
            )
            break
        else:
            # List the RSEs represented by rse_expression
            try:
                rses = [rse for rse in parse_expression(rse_expression)]
                list_rses2 = [rse['rse'] for rse in rses]
            except InvalidRSEExpression as err:
                logger(logging.ERROR, err)
                break
            # List the RSEs represented by all the RSE expressions stored in heartbeat payload
            list_rses1 = []
            for rse_exp in payload_cnt:
                if rse_exp:
                    list_rses1 = [
                        rse['rse'] for rse in parse_expression(rse_exp)
                    ]
            for rse in list_rses2:
                if rse in list_rses1:
                    logger(logging.WARNING,
                           'Overlapping RSE expressions %s vs %s. Stopping',
                           rse_exp, rse_expression)
                    break

            logger(logging.INFO, 'Will process rebalancing on %s',
                   rse_expression)
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=max(600, sleep_time),
                              hash_executable=None,
                              payload=rse_expression,
                              session=None)
            total_primary = 0
            total_secondary = 0
            total_total = 0
            global_ratio = float(0)
            for rse in rses:
                logger(logging.DEBUG, 'Getting RSE usage on %s', rse['rse'])
                rse_usage = get_rse_usage(rse_id=rse['id'])
                usage_dict = {}
                for item in rse_usage:
                    # TODO Check last update
                    usage_dict[item['source']] = {
                        'used': item['used'],
                        'free': item['free'],
                        'total': item['total']
                    }

                try:
                    rse['primary'] = usage_dict['rucio']['used'] - usage_dict[
                        'expired']['used']
                    rse['secondary'] = usage_dict['expired']['used']
                    rse['total'] = usage_dict['storage']['total'] - usage_dict[
                        'min_free_space']['used']
                    rse['ratio'] = float(rse['primary']) / float(rse['total'])
                except KeyError as err:
                    logger(logging.ERROR,
                           'Missing source usage %s for RSE %s. Exiting', err,
                           rse['rse'])
                    break
                total_primary += rse['primary']
                total_secondary += rse['secondary']
                total_total += float(rse['total'])
                rse['receive_volume'] = 0  # Already rebalanced volume in this run
                global_ratio = float(total_primary) / float(total_total)
                logger(logging.INFO, 'Global ratio: %f' % (global_ratio))

            for rse in sorted(rses, key=lambda k: k['ratio']):
                logger(logging.INFO,
                       '%s Sec/Prim local ratio (%f) vs global %s', rse['rse'],
                       rse['ratio'], global_ratio)
            rses_over_ratio = sorted([
                rse for rse in rses
                if rse['ratio'] > global_ratio + global_ratio * tolerance
            ],
                                     key=lambda k: k['ratio'],
                                     reverse=True)
            rses_under_ratio = sorted([
                rse for rse in rses
                if rse['ratio'] < global_ratio - global_ratio * tolerance
            ],
                                      key=lambda k: k['ratio'],
                                      reverse=False)

            # Excluding RSEs
            logger(
                logging.DEBUG,
                'Excluding RSEs as destinations which are too small by size:')
            for des in list(rses_under_ratio):  # iterate over a copy since items are removed below
                if des['total'] < min_total:
                    logger(logging.DEBUG, 'Excluding %s', des['rse'])
                    rses_under_ratio.remove(des)
            logger(logging.DEBUG,
                   'Excluding RSEs as sources which are too small by size:')
            for src in list(rses_over_ratio):  # iterate over a copy since items are removed below
                if src['total'] < min_total:
                    logger(logging.DEBUG, 'Excluding %s', src['rse'])
                    rses_over_ratio.remove(src)
            logger(
                logging.DEBUG,
                'Excluding RSEs as destinations which are not available for write:'
            )
            for des in list(rses_under_ratio):  # copy, items removed below
                if des['availability'] & 2 == 0:
                    logger(logging.DEBUG, 'Excluding %s', des['rse'])
                    rses_under_ratio.remove(des)
            logger(
                logging.DEBUG,
                'Excluding RSEs as sources which are not available for read:')
            for src in list(rses_over_ratio):  # copy, items removed below
                if src['availability'] & 4 == 0:
                    logger(logging.DEBUG, 'Excluding %s', src['rse'])
                    rses_over_ratio.remove(src)

            # Gets the number of active transfers per location
            dict_locks = get_active_locks(session=None)

            # Loop over RSEs over the ratio
            for index, source_rse in enumerate(rses_over_ratio):

                # The volume that could be rebalanced from this source, not the real availability of the data:
                available_source_rebalance_volume = int(
                    (source_rse['primary'] -
                     global_ratio * source_rse['secondary']) /
                    (global_ratio + 1))
                if available_source_rebalance_volume > max_rse_rebalance_volume:
                    available_source_rebalance_volume = max_rse_rebalance_volume
                if available_source_rebalance_volume > max_total_rebalance_volume - total_rebalance_volume:
                    available_source_rebalance_volume = max_total_rebalance_volume - total_rebalance_volume

                # Select a target:
                for destination_rse in rses_under_ratio:
                    if available_source_rebalance_volume > 0:
                        vo_str = ' on VO {}'.format(
                            destination_rse['vo']
                        ) if destination_rse['vo'] != 'def' else ''
                        if index == 0 and destination_rse['id'] in dict_locks:
                            replicating_volume = dict_locks[
                                destination_rse['id']]['bytes']
                            logger(logging.DEBUG,
                                   'Already %f TB replicating to %s%s',
                                   replicating_volume / 1E12,
                                   destination_rse['rse'], vo_str)
                            destination_rse[
                                'receive_volume'] += replicating_volume
                        if destination_rse[
                                'receive_volume'] >= max_rse_rebalance_volume:
                            continue
                        available_target_rebalance_volume = max_rse_rebalance_volume - destination_rse[
                            'receive_volume']
                        if available_target_rebalance_volume >= available_source_rebalance_volume:
                            available_target_rebalance_volume = available_source_rebalance_volume

                        logger(logging.INFO,
                               'Rebalance %d TB from %s(%f) to %s(%f)%s',
                               available_target_rebalance_volume / 1E12,
                               source_rse['rse'], source_rse['ratio'],
                               destination_rse['rse'],
                               destination_rse['ratio'], vo_str)
                        expr = destination_rse['rse']
                        rebalance_rse(
                            rse_id=source_rse['id'],
                            max_bytes=available_target_rebalance_volume,
                            dry_run=dry_run,
                            comment='Background rebalancing',
                            force_expression=expr,
                            logger=logger)

                        destination_rse[
                            'receive_volume'] += available_target_rebalance_volume
                        total_rebalance_volume += available_target_rebalance_volume
                        available_source_rebalance_volume -= available_target_rebalance_volume

        if once:
            break

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logger(logging.INFO, 'Sleeping for a while : %f seconds',
                   sleep_time - time_diff)
            GRACEFUL_STOP.wait(sleep_time - time_diff)

    die(executable='rucio-bb8', hostname=hostname, pid=pid, thread=hb_thread)
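
The volume offered for rebalancing by each over-ratio RSE above comes from (primary - global_ratio * secondary) / (global_ratio + 1), capped per RSE and by the global budget. A worked example with invented numbers:

global_ratio = 0.8
primary, secondary = 900 * 1E12, 50 * 1E12    # bytes on one over-ratio RSE (invented)
max_rse_rebalance_volume = 500 * 1E9          # per-RSE cap (bb8 default above)
max_total_rebalance_volume = 10 * 1E12        # global cap (bb8 default above)
total_rebalance_volume = 9.8 * 1E12           # already scheduled this cycle (invented)

available = (primary - global_ratio * secondary) / (global_ratio + 1)
available = min(available,
                max_rse_rebalance_volume,
                max_total_rebalance_volume - total_rebalance_volume)
print('%.2f TB' % (available / 1E12))         # 0.20 TB: limited here by the global budget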
Example 6
def add_subscription(name,
                     account,
                     filter,
                     replication_rules,
                     comments,
                     lifetime,
                     retroactive,
                     dry_run,
                     priority=3,
                     session=None):
    """
    Adds a new subscription which will be verified against every new added file and dataset

    :param account: Account identifier
    :type account:  String
    :param name: Name of the subscription
    :type name:  String
    :param filter: Dictionary of attributes by which the input data should be filtered
                   **Example**: ``{'dsn': 'data11_hi*.express_express.*,data11_hi*physics_MinBiasOverlay*', 'account': 'tzero'}``
    :type filter:  Dict
    :param replication_rules: Replication rules to be set: Dictionary with keys such as copies, rse_expression and weight
    :type replication_rules:  Dict
    :param comments: Comments for the subscription
    :type comments:  String
    :param lifetime: Subscription's lifetime (days)
    :type lifetime:  Integer or None
    :param retroactive: Flag to know if the subscription should be applied on previous data
    :type retroactive:  Boolean
    :param dry_run: Just print the subscriptions actions without actually executing them (Useful if retroactive flag is set)
    :type dry_run:  Boolean
    :param priority: The priority of the subscription
    :type priority: Integer
    :param session: The database session in use.

    :returns: The subscriptionid
    """
    try:
        keep_history = get('subscriptions', 'keep_history')
    except ConfigNotFound:
        keep_history = False

    SubscriptionHistory = models.Subscription.__history_mapper__.class_
    retroactive = bool(retroactive)  # Force boolean type, necessary for strict SQL
    state = SubscriptionState.ACTIVE
    if retroactive:
        state = SubscriptionState.NEW
    if lifetime:
        lifetime = datetime.datetime.utcnow() + datetime.timedelta(days=lifetime)
    new_subscription = models.Subscription(name=name,
                                           filter=filter,
                                           account=account,
                                           replication_rules=replication_rules,
                                           state=state,
                                           lifetime=lifetime,
                                           retroactive=retroactive,
                                           policyid=priority,
                                           comments=comments)
    if keep_history:
        subscription_history = SubscriptionHistory(
            id=new_subscription.id,
            name=new_subscription.name,
            filter=new_subscription.filter,
            account=new_subscription.account,
            replication_rules=new_subscription.replication_rules,
            state=new_subscription.state,
            lifetime=new_subscription.lifetime,
            retroactive=new_subscription.retroactive,
            policyid=new_subscription.policyid,
            comments=new_subscription.comments)
    try:
        new_subscription.save(session=session)
        if keep_history:
            subscription_history.save(session=session)
    except IntegrityError as error:
        if re.match('.*IntegrityError.*ORA-00001: unique constraint.*SUBSCRIPTIONS_PK.*violated.*', error.args[0])\
           or re.match(".*IntegrityError.*UNIQUE constraint failed: subscriptions.name, subscriptions.account.*", error.args[0])\
           or re.match('.*IntegrityError.*columns? name.*account.*not unique.*', error.args[0]) \
           or re.match('.*IntegrityError.*ORA-00001: unique constraint.*SUBSCRIPTIONS_NAME_ACCOUNT_UQ.*violated.*', error.args[0])\
           or re.match('.*IntegrityError.*1062.*Duplicate entry.*', error.args[0]) \
           or re.match('.*IntegrityError.*duplicate key value violates unique constraint.*', error.args[0]) \
           or re.match('.*UniqueViolation.*duplicate key value violates unique constraint.*', error.args[0]):
            raise SubscriptionDuplicate(
                'Subscription \'%s\' owned by \'%s\' already exists!' %
                (name, account))
        raise RucioException(error.args)
    return new_subscription.id
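
A hedged call sketch built from the docstring's own filter example (the name, RSE expression and rule structure are placeholders; only the key names come from the docstring):

subscription_id = add_subscription(
    name='tzero_express',
    account='tzero',
    filter={'dsn': 'data11_hi*.express_express.*', 'account': 'tzero'},
    replication_rules=[{'copies': 2, 'rse_expression': 'tier=1', 'weight': None}],
    comments='Express stream replication',
    lifetime=365,          # days, per the docstring
    retroactive=False,
    dry_run=False,
    priority=3,
    session=session)       # an open database session is assumed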
Example 7
def update_subscription(name, account, metadata=None, session=None):
    """
    Updates a subscription

    :param name: Name of the subscription
    :type name:  String
    :param account: Account identifier
    :type account:  String
    :param metadata: Dictionary of metadata to update. Supported keys : filter, replication_rules, comments, lifetime, retroactive, dry_run, priority, last_processed
    :type metadata:  Dict
    :param session: The database session in use.
    :raises: SubscriptionNotFound if subscription is not found
    """
    try:
        keep_history = get('subscriptions', 'keep_history')
    except ConfigNotFound:
        keep_history = False
    values = {'state': SubscriptionState.UPDATED}
    if 'filter' in metadata and metadata['filter']:
        values['filter'] = dumps(metadata['filter'])
    if 'replication_rules' in metadata and metadata['replication_rules']:
        values['replication_rules'] = dumps(metadata['replication_rules'])
    if 'lifetime' in metadata and metadata['lifetime']:
        values['lifetime'] = datetime.datetime.utcnow() + datetime.timedelta(
            days=float(metadata['lifetime']))
    if 'retroactive' in metadata and metadata['retroactive']:
        values['retroactive'] = metadata['retroactive']
    if 'dry_run' in metadata and metadata['dry_run']:
        values['dry_run'] = metadata['dry_run']
    if 'comments' in metadata and metadata['comments']:
        values['comments'] = metadata['comments']
    if 'priority' in metadata and metadata['priority']:
        values['policyid'] = metadata['priority']
    if 'last_processed' in metadata and metadata['last_processed']:
        values['last_processed'] = metadata['last_processed']
    if 'state' in metadata and metadata['state'] == SubscriptionState.INACTIVE:
        values['state'] = SubscriptionState.INACTIVE
        values['expired_at'] = datetime.datetime.utcnow()

    SubscriptionHistory = models.Subscription.__history_mapper__.class_
    try:
        subscription = session.query(models.Subscription).filter_by(
            account=account, name=name).one()
        subscription.update(values)
        if keep_history:
            subscription_history = SubscriptionHistory(
                id=subscription.id,
                name=subscription.name,
                filter=subscription.filter,
                account=subscription.account,
                replication_rules=subscription.replication_rules,
                state=subscription.state,
                lifetime=subscription.lifetime,
                retroactive=subscription.retroactive,
                policyid=subscription.policyid,
                comments=subscription.comments,
                last_processed=subscription.last_processed,
                expired_at=subscription.expired_at,
                updated_at=subscription.updated_at,
                created_at=subscription.created_at)
            subscription_history.save(session=session)
    except NoResultFound:
        raise SubscriptionNotFound(
            "Subscription for account '%(account)s' named '%(name)s' not found"
            % locals())
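
A hedged usage sketch for update_subscription (account and name are placeholders; the metadata keys are those listed in the docstring):

update_subscription(name='tzero_express',
                    account='tzero',
                    metadata={'comments': 'Lower priority after reprocessing',
                              'priority': 5,
                              'lifetime': 30},   # days, converted to a datetime above
                    session=session)             # an open database session is assumed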