Ejemplo n.º 1
0
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(
        10
    )  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'] - 1,
                worker_number=heart_beat['assign_thread'],
                limit=1000)
            logging.info(prepend_str + '%s expired replicas returned' %
                         len(expired_replicas))
            logging.debug(prepend_str +
                          'List of expired replicas returned %s' %
                          str(expired_replicas))
            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                })
                bad_replicas.append({
                    'scope':
                    replica[0],
                    'name':
                    replica[1],
                    'rse_id':
                    replica[2],
                    'state':
                    BadFilesStatus.TEMPORARY_UNAVAILABLE
                })
            session = get_session()

            for chunk in chunks(replicas, chunk_size):
                # Process and update the replicas in chunks
                update_replicas_states(chunk, nowait=False, session=session)
            for chunk in chunks(bad_replicas, chunk_size):
                # Remove the replicas from bad_replicas table in chunks
                bulk_delete_bad_replicas(chunk, session=session)

            session.commit()  # pylint: disable=no-member

        except Exception as error:
            logging.error(prepend_str + '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
Ejemplo n.º 2
0
def stager(once=False,
           rses=None,
           mock=False,
           bulk=100,
           group_bulk=1,
           group_policy='rule',
           source_strategy=None,
           activities=None,
           sleep_time=600,
           retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None

    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" %
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = 'conveyor-stager'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Stager starting with bring_online %s seconds' %
                 (bring_online))

    time.sleep(
        10
    )  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Stager started')

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None

            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                logging.info(prepend_str +
                             'Starting to get stagein transfers for %s' %
                             (activity))
                start_time = time.time()
                transfers = __get_stagein_transfers(
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    mock=mock,
                    schemes=scheme,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts)
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.stager.get_stagein_transfers',
                               len(transfers))
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.transfers',
                    len(transfers))
                logging.info(prepend_str + 'Got %s stagein transfers for %s' %
                             (len(transfers), activity))

                # group transfers
                logging.info(prepend_str +
                             'Starting to group transfers for %s' % (activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk, source_strategy,
                                                   max_time_in_queue)
                record_timer('daemons.conveyor.stager.bulk_group_transfer',
                             (time.time() - start_time) * 1000 /
                             (len(transfers) if transfers else 1))

                logging.info(prepend_str +
                             'Starting to submit transfers for %s' %
                             (activity))
                # submit transfers
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        submit_transfer(external_host=external_host,
                                        job=job,
                                        submitter='transfer_submitter',
                                        logging_prepend_str=prepend_str)

                if len(transfers) < group_bulk:
                    logging.info(
                        prepend_str +
                        'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds'
                        % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time(
                        ) + sleep_time
        except Exception:
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
Ejemplo n.º 3
0
def reaper(rses=[],
           worker_number=1,
           total_workers=1,
           chunk_size=100,
           once=False,
           scheme=None):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param total_workers:  The total number of workers.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    """
    logging.info('Starting Dark Reaper %s-%s: Will work on RSEs: %s',
                 worker_number, total_workers, str(rses))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256(sys.argv[0] + ''.join(rses)).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            logging.info(
                'Dark Reaper({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'
                .format(locals()))
            nothing_to_do = True

            random.shuffle(rses)
            for rse_id in rses:
                rse = rse_core.get_rse_name(rse_id=rse_id)
                rse_info = rsemgr.get_rse_info(rse)
                replicas = list_quarantined_replicas(
                    rse_id=rse_id,
                    limit=chunk_size,
                    worker_number=worker_number,
                    total_workers=total_workers)

                rse_protocol = rse_core.get_rse_protocols(rse_id=rse_id)
                prot = rsemgr.create_protocol(rse_info,
                                              'delete',
                                              scheme=scheme)
                deleted_replicas = []
                try:
                    prot.connect()
                    for replica in replicas:
                        nothing_to_do = False
                        try:
                            pfn = str(
                                rsemgr.lfns2pfns(rse_settings=rse_info,
                                                 lfns=[{
                                                     'scope':
                                                     replica['scope'].external,
                                                     'name':
                                                     replica['name'],
                                                     'path':
                                                     replica['path']
                                                 }],
                                                 operation='delete',
                                                 scheme=scheme).values()[0])
                            logging.info(
                                'Dark Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s',
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logging.info(
                                'Dark Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse, duration)
                            add_message(
                                'deletion-done', {
                                    'scope': replica['scope'].external,
                                    'name': replica['name'],
                                    'rse': rse,
                                    'rse_id': rse_id,
                                    'file-size': replica.get('bytes') or 0,
                                    'bytes': replica.get('bytes') or 0,
                                    'url': pfn,
                                    'duration': duration,
                                    'protocol': prot.attributes['scheme']
                                })
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = 'Dark Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse)
                            logging.warning(err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            err_msg = 'Dark Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (
                                worker_number, total_workers, replica['scope'],
                                replica['name'], pfn, rse, str(error))
                            logging.warning(err_msg)
                            add_message(
                                'deletion-failed', {
                                    'scope': replica['scope'].external,
                                    'name': replica['name'],
                                    'rse': rse,
                                    'rse_id': rse_id,
                                    'file-size': replica['bytes'] or 0,
                                    'bytes': replica['bytes'] or 0,
                                    'url': pfn,
                                    'reason': str(error),
                                    'protocol': prot.attributes['scheme']
                                })

                        except:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id,
                                            replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logging.info(
                    'Dark Reaper %s-%s: Nothing to do. I will sleep for 60s',
                    worker_number, total_workers)
                time.sleep(60)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
Ejemplo n.º 4
0
def submitter(once=False, rses=None, partition_wait_time=10,
              bulk=100, group_bulk=1, group_policy='rule', source_strategy=None,
              activities=None, sleep_time=600, max_sources=4, retry_other_fts=False,
              filter_transfertool=FILTER_TRANSFERTOOL, transfertool=TRANSFER_TOOL, transfertype=TRANSFER_TYPE):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s", max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = "conveyor-submitter"
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if filter_transfertool:
        executable += ' --filter-transfertool ' + filter_transfertool

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Submitter starting with timeout %s', timeout)

    if partition_wait_time:
        time.sleep(partition_wait_time)  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Transfer submitter started')

    while not graceful_stop.is_set():
        if activities is None:
            activities = [None]
        if rses:
            rse_ids = [rse['id'] for rse in rses]
        else:
            rse_ids = None
        for activity in activities:
            try:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
                prefix = 'conveyor-submitter[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
                logger = formatted_logger(logging.log, prefix + '%s')

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logger(logging.INFO, 'CMS user transfer activity')
                    user_transfer = True

                logger(logging.INFO, 'Starting to get transfer transfers for %s', activity)
                start_time = time.time()
                transfers = __get_transfers(total_workers=heart_beat['nr_threads'],
                                            worker_number=heart_beat['assign_thread'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts,
                                            transfertool=filter_transfertool,
                                            logger=logger)

                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logger(logging.INFO, 'Got %s transfers for %s in %s seconds', len(transfers), activity, time.time() - start_time)

                # group transfers
                logger(logging.INFO, 'Starting to group transfers for %s', activity)
                start_time = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, source_strategy, max_time_in_queue, group_by_scope=user_transfer)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfers) if transfers else 1))

                logger(logging.INFO, 'Starting to submit transfers for %s', activity)

                if transfertool in ['fts3', 'mock']:
                    for external_host in grouped_jobs:
                        if not user_transfer:
                            for job in grouped_jobs[external_host]:
                                # submit transfers
                                submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                                timeout=timeout, logger=logger, transfertool=transfertool)
                        else:
                            for _, jobs in iteritems(grouped_jobs[external_host]):
                                # submit transfers
                                for job in jobs:
                                    submit_transfer(external_host=external_host, job=job, submitter='transfer_submitter',
                                                    timeout=timeout, user_transfer_job=user_transfer, logger=logger, transfertool=transfertool)
                elif transfertool == 'globus':
                    if transfertype == 'bulk':
                        # build bulk job file list per external host to send to submit_transfer
                        for external_host in grouped_jobs:
                            # pad the job with job_params; irrelevant for globus but needed for further rucio parsing
                            submitjob = {'files': [], 'job_params': grouped_jobs[''][0].get('job_params')}
                            for job in grouped_jobs[external_host]:
                                submitjob.get('files').append(job.get('files')[0])
                            logger(logging.DEBUG, 'submitjob: %s' % submitjob)
                            submit_transfer(external_host=external_host, job=submitjob, submitter='transfer_submitter',
                                            timeout=timeout, logger=logger, transfertool=transfertool)
                    else:
                        # build single job files and individually send to submit_transfer
                        job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs else None
                        for external_host in grouped_jobs:
                            for job in grouped_jobs[external_host]:
                                for file in job['files']:
                                    singlejob = {'files': [file], 'job_params': job_params}
                                    logger(logging.DEBUG, 'singlejob: %s' % singlejob)
                                    submit_transfer(external_host=external_host, job=singlejob, submitter='transfer_submitter',
                                                    timeout=timeout, logger=logger, transfertool=transfertool)
                else:
                    logger(logging.ERROR, 'Unknown transfer tool')

                if len(transfers) < group_bulk:
                    logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds', len(transfers), activity, group_bulk, sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                logger(logging.CRITICAL, 'Exception', exc_info=True)

        if once:
            break

    logger(logging.INFO, 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
    return
Ejemplo n.º 5
0
def transmogrifier(bulk=5, once=False, sleep_time=60):
    """
    Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash,
    identifies the subscriptions matching the DIDs and
    submit a replication rule for each DID matching a subscription.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'transmogrifier'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)

        dids, subscriptions = [], []
        tottime = 0
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])

        try:
            #  Get the new DIDs based on the is_new flag
            for did in list_new_dids(thread=heart_beat['assign_thread'],
                                     total_threads=heart_beat['nr_threads'],
                                     chunk_size=bulk,
                                     did_type=None):
                dids.append({
                    'scope': did['scope'],
                    'did_type': str(did['did_type']),
                    'name': did['name']
                })

            sub_dict = {3: []}
            #  Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest
            #  The priority is defined as 'policyid'
            for sub in list_subscriptions(None, None):
                if sub['state'] != SubscriptionState.INACTIVE and sub[
                        'lifetime'] and (datetime.now() > sub['lifetime']):
                    update_subscription(
                        name=sub['name'],
                        account=sub['account'],
                        metadata={'state': SubscriptionState.INACTIVE})

                elif sub['state'] in [
                        SubscriptionState.ACTIVE, SubscriptionState.UPDATED
                ]:
                    priority = 3
                    if 'policyid' in sub:
                        if int(sub['policyid']) not in sub_dict:
                            sub_dict[int(sub['policyid'])] = []
                        priority = int(sub['policyid'])
                    sub_dict[priority].append(sub)
            priorities = list(sub_dict.keys())
            priorities.sort()
            #  Order the subscriptions according to their priority
            for priority in priorities:
                subscriptions.extend(sub_dict[priority])
        except SubscriptionNotFound as error:
            logging.warning(prepend_str + 'No subscriptions defined: %s' %
                            (str(error)))
            time.sleep(10)
            continue
        except Exception as error:
            logging.error(
                prepend_str +
                'Failed to get list of new DIDs or subscriptions: %s' %
                (str(error)))

        try:
            results = {}
            start_time = time.time()
            blacklisted_rse_id = [
                rse['id'] for rse in list_rses({'availability_write': False})
            ]
            logging.debug(prepend_str + 'In transmogrifier worker')
            identifiers = []
            #  Loop over all the new dids
            for did in dids:
                did_success = True
                if did['did_type'] == str(
                        DIDType.DATASET) or did['did_type'] == str(
                            DIDType.CONTAINER):
                    did_tag = '%s:%s' % (did['scope'].internal, did['name'])
                    results[did_tag] = []
                    try:
                        metadata = get_metadata(did['scope'], did['name'])
                        # Loop over all the subscriptions
                        for subscription in subscriptions:
                            #  Check if the DID match the subscription
                            if is_matching_subscription(
                                    subscription, did, metadata) is True:
                                filter_string = loads(subscription['filter'])
                                split_rule = filter_string.get(
                                    'split_rule', False)
                                stime = time.time()
                                results[did_tag].append(subscription['id'])
                                logging.info(prepend_str +
                                             '%s:%s matches subscription %s' %
                                             (did['scope'], did['name'],
                                              subscription['name']))
                                rules = loads(
                                    subscription['replication_rules'])
                                created_rules = {}
                                cnt = 0
                                for rule_dict in rules:
                                    cnt += 1
                                    created_rules[cnt] = []
                                    # Get all the rule and subscription parameters
                                    grouping = rule_dict.get(
                                        'grouping', 'DATASET')
                                    lifetime = rule_dict.get('lifetime', None)
                                    ignore_availability = rule_dict.get(
                                        'ignore_availability', None)
                                    weight = rule_dict.get('weight', None)
                                    source_replica_expression = rule_dict.get(
                                        'source_replica_expression', None)
                                    locked = rule_dict.get('locked', None)
                                    if locked == 'True':
                                        locked = True
                                    else:
                                        locked = False
                                    purge_replicas = rule_dict.get(
                                        'purge_replicas', False)
                                    if purge_replicas == 'True':
                                        purge_replicas = True
                                    else:
                                        purge_replicas = False
                                    rse_expression = str(
                                        rule_dict['rse_expression'])
                                    comment = str(subscription['comments'])
                                    if 'comments' in rule_dict:
                                        comment = str(rule_dict['comments'])
                                    subscription_id = str(subscription['id'])
                                    account = subscription['account']
                                    copies = int(rule_dict['copies'])
                                    activity = rule_dict.get(
                                        'activity', 'User Subscriptions')
                                    try:
                                        validate_schema(name='activity',
                                                        obj=activity)
                                    except InputValidationError as error:
                                        logging.error(
                                            prepend_str +
                                            'Error validating the activity %s'
                                            % (str(error)))
                                        activity = 'User Subscriptions'
                                    if lifetime:
                                        lifetime = int(lifetime)

                                    str_activity = "".join(activity.split())
                                    success = False
                                    nattempt = 5
                                    attemptnr = 0
                                    skip_rule_creation = False

                                    selected_rses = []
                                    chained_idx = rule_dict.get(
                                        'chained_idx', None)
                                    if chained_idx:
                                        params = {}
                                        if rule_dict.get(
                                                'associated_site_idx', None):
                                            params[
                                                'associated_site_idx'] = rule_dict.get(
                                                    'associated_site_idx',
                                                    None)
                                        logging.debug(
                                            '%s Chained subscription identified. Will use %s',
                                            prepend_str,
                                            str(created_rules[chained_idx]))
                                        algorithm = rule_dict.get(
                                            'algorithm', None)
                                        selected_rses = select_algorithm(
                                            algorithm,
                                            created_rules[chained_idx], params)
                                    else:
                                        # In the case of chained subscription, don't use rseselector but use the rses returned by the algorithm
                                        if split_rule:
                                            vo = account.vo
                                            rses = parse_expression(
                                                rse_expression,
                                                filter={'vo': vo})
                                            list_of_rses = [
                                                rse['id'] for rse in rses
                                            ]
                                            # Check that some rule doesn't already exist for this DID and subscription
                                            preferred_rse_ids = []
                                            for rule in list_rules(
                                                    filters={
                                                        'subscription_id':
                                                        subscription_id,
                                                        'scope': did['scope'],
                                                        'name': did['name']
                                                    }):
                                                already_existing_rses = [
                                                    (rse['rse'], rse['id']) for
                                                    rse in parse_expression(
                                                        rule['rse_expression'],
                                                        filter={'vo': vo})
                                                ]
                                                for rse, rse_id in already_existing_rses:
                                                    if (rse_id in list_of_rses
                                                        ) and (
                                                            rse_id not in
                                                            preferred_rse_ids):
                                                        preferred_rse_ids.append(
                                                            rse_id)
                                            if len(preferred_rse_ids
                                                   ) >= copies:
                                                skip_rule_creation = True
                                            rse_id_dict = {}
                                            for rse in rses:
                                                rse_id_dict[
                                                    rse['id']] = rse['rse']
                                            try:
                                                rseselector = RSESelector(
                                                    account=account,
                                                    rses=rses,
                                                    weight=weight,
                                                    copies=copies -
                                                    len(preferred_rse_ids))
                                                selected_rses = [
                                                    rse_id_dict[rse_id]
                                                    for rse_id, _, _ in
                                                    rseselector.select_rse(
                                                        0,
                                                        preferred_rse_ids=
                                                        preferred_rse_ids,
                                                        copies=copies,
                                                        blacklist=
                                                        blacklisted_rse_id)
                                                ]
                                            except (InsufficientTargetRSEs,
                                                    InsufficientAccountLimit,
                                                    InvalidRuleWeight,
                                                    RSEOverQuota) as error:
                                                logging.warning(
                                                    prepend_str +
                                                    'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites'
                                                    % (subscription['name'],
                                                       account, str(error)))
                                                # Now including the blacklisted sites
                                                try:
                                                    rseselector = RSESelector(
                                                        account=account,
                                                        rses=rses,
                                                        weight=weight,
                                                        copies=copies -
                                                        len(preferred_rse_ids))
                                                    selected_rses = [
                                                        rse_id_dict[rse_id]
                                                        for rse_id, _, _ in
                                                        rseselector.select_rse(
                                                            0,
                                                            preferred_rse_ids=
                                                            preferred_rse_ids,
                                                            copies=copies,
                                                            blacklist=[])
                                                    ]
                                                    ignore_availability = True
                                                except (InsufficientTargetRSEs,
                                                        InsufficientAccountLimit,
                                                        InvalidRuleWeight,
                                                        RSEOverQuota) as error:
                                                    logging.error(
                                                        prepend_str +
                                                        'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.'
                                                        %
                                                        (subscription['name'],
                                                         account, str(error)))
                                                    monitor.record_counter(
                                                        counters=
                                                        'transmogrifier.addnewrule.errortype.%s'
                                                        % (str(error.__class__.
                                                               __name__)),
                                                        delta=1)
                                                    # The DID won't be reevaluated at the next cycle
                                                    did_success = did_success and True
                                                    continue

                                    for attempt in range(0, nattempt):
                                        attemptnr = attempt
                                        nb_rule = 0
                                        #  Try to create the rule
                                        try:
                                            if split_rule:
                                                if not skip_rule_creation:
                                                    for rse in selected_rses:
                                                        if isinstance(
                                                                selected_rses,
                                                                dict):
                                                            source_replica_expression = selected_rses[
                                                                rse].get(
                                                                    'source_replica_expression',
                                                                    None)
                                                            weight = selected_rses[
                                                                rse].get(
                                                                    'weight',
                                                                    None)
                                                        logging.info(
                                                            prepend_str +
                                                            'Will insert one rule for %s:%s on %s'
                                                            %
                                                            (did['scope'],
                                                             did['name'], rse))
                                                        rule_ids = add_rule(
                                                            dids=[{
                                                                'scope':
                                                                did['scope'],
                                                                'name':
                                                                did['name']
                                                            }],
                                                            account=account,
                                                            copies=1,
                                                            rse_expression=rse,
                                                            grouping=grouping,
                                                            weight=weight,
                                                            lifetime=lifetime,
                                                            locked=locked,
                                                            subscription_id=
                                                            subscription_id,
                                                            source_replica_expression
                                                            =source_replica_expression,
                                                            activity=activity,
                                                            purge_replicas=
                                                            purge_replicas,
                                                            ignore_availability=
                                                            ignore_availability,
                                                            comment=comment)
                                                        created_rules[
                                                            cnt].append(
                                                                rule_ids[0])
                                                        nb_rule += 1
                                                        if nb_rule == copies:
                                                            success = True
                                                            break
                                            else:
                                                rule_ids = add_rule(
                                                    dids=[{
                                                        'scope': did['scope'],
                                                        'name': did['name']
                                                    }],
                                                    account=account,
                                                    copies=copies,
                                                    rse_expression=
                                                    rse_expression,
                                                    grouping=grouping,
                                                    weight=weight,
                                                    lifetime=lifetime,
                                                    locked=locked,
                                                    subscription_id=
                                                    subscription['id'],
                                                    source_replica_expression=
                                                    source_replica_expression,
                                                    activity=activity,
                                                    purge_replicas=
                                                    purge_replicas,
                                                    ignore_availability=
                                                    ignore_availability,
                                                    comment=comment)
                                                created_rules[cnt].append(
                                                    rule_ids[0])
                                                nb_rule += 1
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.done',
                                                delta=nb_rule)
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.activity.%s'
                                                % str_activity,
                                                delta=nb_rule)
                                            success = True
                                            break
                                        except (InvalidReplicationRule,
                                                InvalidRuleWeight,
                                                InvalidRSEExpression,
                                                StagingAreaRuleRequiresLifetime,
                                                DuplicateRule) as error:
                                            # Errors that won't be retried
                                            success = True
                                            logging.error(prepend_str + '%s' %
                                                          (str(error)))
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.%s'
                                                % (str(
                                                    error.__class__.__name__)),
                                                delta=1)
                                            break
                                        except (ReplicationRuleCreationTemporaryFailed,
                                                InsufficientTargetRSEs,
                                                InsufficientAccountLimit,
                                                DatabaseException,
                                                RSEBlacklisted,
                                                RSEWriteBlocked) as error:
                                            # Errors to be retried
                                            logging.error(
                                                prepend_str +
                                                '%s Will perform an other attempt %i/%i'
                                                % (str(error), attempt + 1,
                                                   nattempt))
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.%s'
                                                % (str(
                                                    error.__class__.__name__)),
                                                delta=1)
                                        except Exception:
                                            # Unexpected errors
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.unknown',
                                                delta=1)
                                            exc_type, exc_value, exc_traceback = exc_info(
                                            )
                                            logging.critical(
                                                prepend_str + ''.join(
                                                    format_exception(
                                                        exc_type, exc_value,
                                                        exc_traceback)).strip(
                                                        ))

                                    did_success = (did_success and success)
                                    if (attemptnr +
                                            1) == nattempt and not success:
                                        logging.error(
                                            prepend_str +
                                            'Rule for %s:%s on %s cannot be inserted'
                                            % (did['scope'], did['name'],
                                               rse_expression))
                                    else:
                                        logging.info(
                                            prepend_str +
                                            '%s rule(s) inserted in %f seconds'
                                            % (str(nb_rule),
                                               time.time() - stime))
                    except DataIdentifierNotFound as error:
                        logging.warning(prepend_str + error)

                if did_success:
                    if did['did_type'] == str(DIDType.FILE):
                        monitor.record_counter(
                            counters='transmogrifier.did.file.processed',
                            delta=1)
                    elif did['did_type'] == str(DIDType.DATASET):
                        monitor.record_counter(
                            counters='transmogrifier.did.dataset.processed',
                            delta=1)
                    elif did['did_type'] == str(DIDType.CONTAINER):
                        monitor.record_counter(
                            counters='transmogrifier.did.container.processed',
                            delta=1)
                    monitor.record_counter(
                        counters='transmogrifier.did.processed', delta=1)
                    identifiers.append({
                        'scope':
                        did['scope'],
                        'name':
                        did['name'],
                        'did_type':
                        DIDType.from_sym(did['did_type'])
                    })

            time1 = time.time()

            #  Mark the DIDs as processed
            for identifier in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier, None)

            logging.info(prepend_str + 'Time to set the new flag : %f' %
                         (time.time() - time1))
            tottime = time.time() - start_time
            for sub in subscriptions:
                update_subscription(
                    name=sub['name'],
                    account=sub['account'],
                    metadata={'last_processed': datetime.now()})
            logging.info(prepend_str +
                         'It took %f seconds to process %i DIDs' %
                         (tottime, len(dids)))
            logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids)))
            monitor.record_counter(counters='transmogrifier.job.done', delta=1)
            monitor.record_timer(stat='transmogrifier.job.duration',
                                 time=1000 * tottime)
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(
                format_exception(exc_type, exc_value, exc_traceback)).strip())
            monitor.record_counter(counters='transmogrifier.job.error',
                                   delta=1)
            monitor.record_counter(counters='transmogrifier.addnewrule.error',
                                   delta=1)
        if once is True:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
Ejemplo n.º 6
0
def reaper(rses, include_rses, exclude_rses, chunk_size=100, once=False, greedy=False,
           scheme=None, delay_seconds=0, sleep_time=60):
    """
    Main loop to select and delete files.

    :param rses:           List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param exclude_rses:       RSE expression to exclude RSEs from the Reaper.
    :param include_rses:       RSE expression to include RSEs.
    :param chunk_size:     The size of chunk for deletion.
    :param once:           If True, only runs one iteration of the main loop.
    :param greedy:         If True, delete right away replicas with tombstone.
    :param scheme:         Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:  The delay to query replicas in BEING_DELETED state.
    :param sleep_time:     Time between two cycles.
    """
    hostname = socket.getfqdn()
    executable = sys.argv[0]
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info('%s Reaper starting', prepend_str)

    time.sleep(10)  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info('%s Reaper started', prepend_str)

    while not GRACEFUL_STOP.is_set():

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses)
        if not rses_to_process:
            logging.error('%s Reaper: No RSEs found. Will sleep for 30 seconds', prepend_str)
            time.sleep(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blacklisted
                if rse['availability'] % 2 == 0:
                    logging.debug('%s RSE %s is blacklisted for delete', prepend_str, rse['rse'])
                    continue
                max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse['rse'], rse['id'], prepend_str)
                # Check if greedy mode
                if greedy:
                    dict_rses[(rse['rse'], rse['id'])] = [1000000000000, max_being_deleted_files]
                    tot_needed_free_space += 1000000000000
                else:
                    if needed_free_space:
                        dict_rses[(rse['rse'], rse['id'])] = [needed_free_space, max_being_deleted_files]
                        tot_needed_free_space += needed_free_space
                    else:
                        logging.debug('%s Nothing to delete on %s', prepend_str, rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(sorted(dict_rses.items(), key=itemgetter(1), reverse=True))
            logging.debug('%s List of RSEs to process ordered by needed space desc : %s', prepend_str, str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostanme, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logging.debug('%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversily with the number of workers
                # The ceil increase the weight of the RSE with small amount of files to delete
                max_workers = ceil(dict_rses[rse_key][0] / tot_needed_free_space * 1000 / heart_beat['nr_threads'])
                list_rses_mult.extend([(rse_name, rse_id, dict_rses[rse_key][0], dict_rses[rse_key][1]) for _ in range(int(max_workers))])
            random.shuffle(list_rses_mult)

            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id, expiration_time=120)
                if result is not NO_VALUE:
                    logging.info('%s Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while', prepend_str, rse_name)
                    continue
                logging.debug('%s Working on %s. Percentage of the total space needed %.2f', prepend_str, rse_name, needed_free_space / tot_needed_free_space * 100)
                rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable, older_than=600, hash_executable=None, session=None)
                # logging.debug('%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]

                max_deletion_thread = get_max_deletion_threads_by_hostname(rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logging.debug('%s Too many deletion threads for %s on RSE %s. Back off', prepend_str, rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue

                logging.info('%s Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s', prepend_str, rse_hostname, tot_threads_for_hostname, max_deletion_thread, rse_name)
                live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None)
                logging.debug('%s Total deletion workers for %s : %i', prepend_str, rse_hostname, tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                try:
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_and_mark_unlocked_replicas(limit=chunk_size,
                                                                   bytes=needed_free_space,
                                                                   rse_id=rse_id,
                                                                   delay_seconds=delay_seconds,
                                                                   session=None)
                    logging.debug('%s list_and_mark_unlocked_replicas  on %s for %s bytes in %s seconds: %s replicas', prepend_str, rse_name, needed_free_space, time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logging.info('%s Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle', prepend_str, rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)

                except (DatabaseException, IntegrityError, DatabaseError) as error:
                    logging.error('%s %s', prepend_str, str(error))
                    continue
                except Exception:
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

                # Physical  deletion will take place there
                try:
                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for file_replicas in chunks(replicas, 100):
                        # Refresh heartbeat
                        live(executable, hostname, pid, hb_thread, older_than=600, hash_executable=None, payload=rse_hostname_key, session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                      lfns=[{'scope': replica['scope'], 'name': replica['name'], 'path': replica['path']}],
                                                                      operation='delete', scheme=scheme).values()[0])
                            except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                logging.warning('%s Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s', prepend_str, replica['scope'], replica['name'], rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

                        deleted_files = delete_from_storage(file_replicas, prot, rse_info, staging_areas, prepend_str)
                        logging.info('%s %i files processed in %s seconds', prepend_str, len(file_replicas), time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block('reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logging.debug('%s delete_replicas successed on %s : %s replicas in %s seconds', prepend_str, rse_name, len(deleted_files), time.time() - del_start)
                        monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))

                except Exception as error:
                    logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

            if once:
                break

            tottime = time.time() - start_time
            if tottime < sleep_time:
                logging.info('%s Will sleep for %s seconds', prepend_str, sleep_time - tottime)
                time.sleep(sleep_time - tottime)

        except DatabaseException as error:
            logging.warning('%s Reaper:  %s', prepend_str, str(error))
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
    return
Ejemplo n.º 7
0
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Minos Temporary Expiration starting')

    time.sleep(
        10
    )  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Minos Temporary Expiration started')

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info(prepend_str + 'Getting list of expired replicas')
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'],
                worker_number=heart_beat['assign_thread'],
                limit=1000)
            logging.info(prepend_str + '%s expired replicas returned' %
                         len(expired_replicas))
            logging.debug(prepend_str +
                          'List of expired replicas returned %s' %
                          str(expired_replicas))
            replicas = []
            bad_replicas = []
            for replica in expired_replicas:
                replicas.append({
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                })
                bad_replicas.append({
                    'scope':
                    replica[0],
                    'name':
                    replica[1],
                    'rse_id':
                    replica[2],
                    'state':
                    BadFilesStatus.TEMPORARY_UNAVAILABLE
                })
            session = get_session()

            nchunk = 0
            tot_chunk = int(math.ceil(len(replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                # Process and update the replicas in chunks
                replicas = [{
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': ReplicaState.AVAILABLE
                } for replica in chunk]
                # Remove the replicas from bad_replicas table in chunks
                bad_replicas = [{
                    'scope': replica[0],
                    'name': replica[1],
                    'rse_id': replica[2],
                    'state': BadFilesStatus.TEMPORARY_UNAVAILABLE
                } for replica in chunk]
                try:
                    nchunk += 1
                    logging.debug(prepend_str +
                                  'Running on %s chunk out of %s' %
                                  (nchunk, tot_chunk))
                    update_replicas_states(replicas,
                                           nowait=True,
                                           session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning(
                        prepend_str +
                        'One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s'
                        % str(error))
                    for idx in range(len(chunk)):
                        logging.debug(prepend_str + 'Working on %s' %
                                      (str(replicas[idx])))
                        try:
                            get_metadata(replicas[idx]['scope'],
                                         replicas[idx]['name'])
                            update_replicas_states([
                                replicas[idx],
                            ],
                                                   nowait=True,
                                                   session=session)
                            bulk_delete_bad_replicas([
                                bad_replicas[idx],
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                prepend_str +
                                'DID %s:%s does not exist anymore. ' %
                                (bad_replicas[idx]['scope'],
                                 bad_replicas[idx]['name']))
                            bulk_delete_bad_replicas([
                                bad_replicas[idx],
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                prepend_str +
                                '%s:%s on RSEID %s does not exist anymore. ' %
                                (replicas[idx]['scope'], replicas[idx]['name'],
                                 replicas[idx]['rse_id']))
                            bulk_delete_bad_replicas([
                                bad_replicas[idx],
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical(traceback.format_exc())
                    session = get_session()

        except Exception:
            logging.critical(traceback.format_exc())

        tottime = time.time() - start_time
        if once:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
Ejemplo n.º 8
0
def reaper(rses,
           include_rses,
           exclude_rses,
           vos=None,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0,
           sleep_time=60,
           auto_exclude_threshold=100,
           auto_exclude_timeout=600):
    """
    Main loop to select and delete files.

    :param rses:                   List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param include_rses:           RSE expression to include RSEs.
    :param exclude_rses:           RSE expression to exclude RSEs from the Reaper.
    :param vos:                    VOs on which to look for RSEs. Only used in multi-VO mode.
                                   If None, we either use all VOs if run from "def", or the current VO otherwise.
    :param chunk_size:             The size of chunk for deletion.
    :param once:                   If True, only runs one iteration of the main loop.
    :param greedy:                 If True, delete right away replicas with tombstone.
    :param scheme:                 Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds:          The delay to query replicas in BEING_DELETED state.
    :param sleep_time:             Time between two cycles.
    :param auto_exclude_threshold: Number of service unavailable exceptions after which the RSE gets temporarily excluded.
    :param auto_exclude_timeout:   Timeout for temporarily excluded RSEs.
    """
    hostname = socket.getfqdn()
    executable = 'reaper2'
    pid = os.getpid()
    hb_thread = threading.current_thread()
    sanity_check(executable=executable, hostname=hostname)
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                       heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')

    logger(logging.INFO, 'Reaper starting')

    if not once:
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time
    heart_beat = live(executable, hostname, pid, hb_thread)
    prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                       heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Reaper started')

    while not GRACEFUL_STOP.is_set():
        # try to get auto exclude parameters from the config table. Otherwise use CLI parameters.
        try:
            auto_exclude_threshold = get('reaper',
                                         'auto_exclude_threshold',
                                         default=auto_exclude_threshold)
            auto_exclude_timeout = get('reaper',
                                       'auto_exclude_timeout',
                                       default=auto_exclude_timeout)
        except ConfigNotFound:
            pass

        # Check if there is a Judge Evaluator backlog
        try:
            max_evaluator_backlog_count = get('reaper',
                                              'max_evaluator_backlog_count')
        except ConfigNotFound:
            max_evaluator_backlog_count = None
        try:
            max_evaluator_backlog_duration = get(
                'reaper', 'max_evaluator_backlog_duration')
        except ConfigNotFound:
            max_evaluator_backlog_duration = None
        if max_evaluator_backlog_count or max_evaluator_backlog_duration:
            backlog = get_evaluation_backlog()
            if max_evaluator_backlog_count and \
               backlog[0] and \
               max_evaluator_backlog_duration and \
               backlog[1] and \
               backlog[0] > max_evaluator_backlog_count and \
               backlog[1] < datetime.utcnow() - timedelta(minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count and duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_count and backlog[
                    0] and backlog[0] > max_evaluator_backlog_count:
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog count hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue
            elif max_evaluator_backlog_duration and backlog[
                    1] and backlog[1] < datetime.utcnow() - timedelta(
                        minutes=max_evaluator_backlog_duration):
                logger(
                    logging.ERROR,
                    'Reaper: Judge evaluator backlog duration hit, stopping operation'
                )
                GRACEFUL_STOP.wait(30)
                continue

        rses_to_process = get_rses_to_process(rses, include_rses, exclude_rses,
                                              vos)
        if not rses_to_process:
            logger(logging.ERROR,
                   'Reaper: No RSEs found. Will sleep for 30 seconds')
            GRACEFUL_STOP.wait(30)
            continue
        start_time = time.time()
        try:
            staging_areas = []
            dict_rses = {}
            heart_beat = live(executable,
                              hostname,
                              pid,
                              hb_thread,
                              older_than=3600)
            prepend_str = 'reaper2[%i/%i] ' % (heart_beat['assign_thread'],
                                               heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            tot_needed_free_space = 0
            for rse in rses_to_process:
                # Check if the RSE is a staging area
                if rse['staging_area']:
                    staging_areas.append(rse['rse'])
                # Check if RSE is blacklisted
                if rse['availability'] % 2 == 0:
                    logger(logging.DEBUG, 'RSE %s is blacklisted for delete',
                           rse['rse'])
                    continue
                needed_free_space, only_delete_obsolete = __check_rse_usage(
                    rse['rse'], rse['id'], greedy=greedy, logger=logger)
                if needed_free_space:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                    tot_needed_free_space += needed_free_space
                elif only_delete_obsolete:
                    dict_rses[(rse['rse'], rse['id'])] = [
                        needed_free_space, only_delete_obsolete
                    ]
                else:
                    logger(logging.DEBUG, 'Nothing to delete on %s',
                           rse['rse'])

            # Ordering the RSEs based on the needed free space
            sorted_dict_rses = OrderedDict(
                sorted(dict_rses.items(), key=lambda x: x[1][0], reverse=True))
            logger(logging.DEBUG,
                   'List of RSEs to process ordered by needed space desc: %s',
                   str(sorted_dict_rses))

            # Get the mapping between the RSE and the hostname used for deletion. The dictionary has RSE as key and (hostanme, rse_info) as value
            rses_hostname_mapping = get_rses_to_hostname_mapping()
            # logger(logging.DEBUG, '%s Mapping RSEs to hostnames used for deletion : %s', prepend_str, str(rses_hostname_mapping))

            list_rses_mult = []

            # Loop over the RSEs. rse_key = (rse, rse_id) and fill list_rses_mult that contains all RSEs to process with different multiplicity
            for rse_key in dict_rses:
                rse_name, rse_id = rse_key
                # The length of the deletion queue scales inversily with the number of workers
                # The ceil increase the weight of the RSE with small amount of files to delete
                if tot_needed_free_space:
                    max_workers = ceil(dict_rses[rse_key][0] /
                                       tot_needed_free_space * 1000 /
                                       heart_beat['nr_threads'])
                else:
                    max_workers = 1

                list_rses_mult.extend([
                    (rse_name, rse_id, dict_rses[rse_key][0],
                     dict_rses[rse_key][1]) for _ in range(int(max_workers))
                ])
            random.shuffle(list_rses_mult)

            for rse_name, rse_id, needed_free_space, max_being_deleted_files in list_rses_mult:
                result = REGION.get('pause_deletion_%s' % rse_id,
                                    expiration_time=120)
                if result is not NO_VALUE:
                    logger(
                        logging.INFO,
                        'Not enough replicas to delete on %s during the previous cycle. Deletion paused for a while',
                        rse_name)
                    continue
                result = REGION.get('temporary_exclude_%s' % rse_id,
                                    expiration_time=auto_exclude_timeout)
                if result is not NO_VALUE:
                    logger(
                        logging.WARNING,
                        'Too many failed attempts for %s in last cycle. RSE is temporarly excluded.',
                        rse_name)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    continue
                labels = {'rse': rse_name}
                EXCLUDED_RSE_GAUGE.labels(**labels).set(0)
                percent = 0
                if tot_needed_free_space:
                    percent = needed_free_space / tot_needed_free_space * 100
                logger(
                    logging.DEBUG,
                    'Working on %s. Percentage of the total space needed %.2f',
                    rse_name, percent)
                try:
                    rse_hostname, rse_info = rses_hostname_mapping[rse_id]
                except KeyError:
                    logger(logging.DEBUG, "Hostname lookup for %s failed.",
                           rse_name)
                    REGION.set('pause_deletion_%s' % rse_id, True)
                    continue
                rse_hostname_key = '%s,%s' % (rse_id, rse_hostname)
                payload_cnt = list_payload_counts(executable,
                                                  older_than=600,
                                                  hash_executable=None,
                                                  session=None)
                # logger(logging.DEBUG, '%s Payload count : %s', prepend_str, str(payload_cnt))
                tot_threads_for_hostname = 0
                tot_threads_for_rse = 0
                for key in payload_cnt:
                    if key and key.find(',') > -1:
                        if key.split(',')[1] == rse_hostname:
                            tot_threads_for_hostname += payload_cnt[key]
                        if key.split(',')[0] == str(rse_id):
                            tot_threads_for_rse += payload_cnt[key]
                max_deletion_thread = get_max_deletion_threads_by_hostname(
                    rse_hostname)
                if rse_hostname_key in payload_cnt and tot_threads_for_hostname >= max_deletion_thread:
                    logger(
                        logging.DEBUG,
                        'Too many deletion threads for %s on RSE %s. Back off',
                        rse_hostname, rse_name)
                    # Might need to reschedule a try on this RSE later in the same cycle
                    continue
                logger(
                    logging.INFO,
                    'Nb workers on %s smaller than the limit (current %i vs max %i). Starting new worker on RSE %s',
                    rse_hostname, tot_threads_for_hostname,
                    max_deletion_thread, rse_name)
                live(executable,
                     hostname,
                     pid,
                     hb_thread,
                     older_than=600,
                     hash_executable=None,
                     payload=rse_hostname_key,
                     session=None)
                logger(logging.DEBUG, 'Total deletion workers for %s : %i',
                       rse_hostname, tot_threads_for_hostname + 1)
                # List and mark BEING_DELETED the files to delete
                del_start_time = time.time()
                only_delete_obsolete = dict_rses[(rse_name, rse_id)][1]
                try:
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        if only_delete_obsolete:
                            logger(
                                logging.DEBUG,
                                'Will run list_and_mark_unlocked_replicas on %s. No space needed, will only delete EPOCH tombstoned replicas',
                                rse_name)
                        replicas = list_and_mark_unlocked_replicas(
                            limit=chunk_size,
                            bytes=needed_free_space,
                            rse_id=rse_id,
                            delay_seconds=delay_seconds,
                            only_delete_obsolete=only_delete_obsolete,
                            session=None)
                    logger(
                        logging.DEBUG,
                        'list_and_mark_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        rse_name, needed_free_space,
                        time.time() - del_start_time, len(replicas))
                    if len(replicas) < chunk_size:
                        logger(
                            logging.DEBUG,
                            'Not enough replicas to delete on %s (%s requested vs %s returned). Will skip any new attempts on this RSE until next cycle',
                            rse_name, chunk_size, len(replicas))
                        REGION.set('pause_deletion_%s' % rse_id, True)

                except (DatabaseException, IntegrityError,
                        DatabaseError) as error:
                    logger(logging.ERROR, '%s', str(error))
                    continue
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)
                # Physical  deletion will take place there
                try:
                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme,
                                                  logger=logger)
                    for file_replicas in chunks(replicas, 100):
                        # Refresh heartbeat
                        live(executable,
                             hostname,
                             pid,
                             hb_thread,
                             older_than=600,
                             hash_executable=None,
                             payload=rse_hostname_key,
                             session=None)
                        del_start_time = time.time()
                        for replica in file_replicas:
                            try:
                                replica['pfn'] = str(
                                    list(
                                        rsemgr.lfns2pfns(
                                            rse_settings=rse_info,
                                            lfns=[{
                                                'scope':
                                                replica['scope'].external,
                                                'name': replica['name'],
                                                'path': replica['path']
                                            }],
                                            operation='delete',
                                            scheme=scheme).values())[0])
                            except (ReplicaUnAvailable,
                                    ReplicaNotFound) as error:
                                logger(
                                    logging.WARNING,
                                    'Failed get pfn UNAVAILABLE replica %s:%s on %s with error %s',
                                    replica['scope'], replica['name'],
                                    rse_name, str(error))
                                replica['pfn'] = None

                            except Exception:
                                logger(logging.CRITICAL,
                                       'Exception',
                                       exc_info=True)

                        deleted_files = delete_from_storage(
                            file_replicas,
                            prot,
                            rse_info,
                            staging_areas,
                            auto_exclude_threshold,
                            logger=logger)
                        logger(logging.INFO,
                               '%i files processed in %s seconds',
                               len(file_replicas),
                               time.time() - del_start_time)

                        # Then finally delete the replicas
                        del_start = time.time()
                        with monitor.record_timer_block(
                                'reaper.delete_replicas'):
                            delete_replicas(rse_id=rse_id, files=deleted_files)
                        logger(
                            logging.DEBUG,
                            'delete_replicas successed on %s : %s replicas in %s seconds',
                            rse_name, len(deleted_files),
                            time.time() - del_start)
                        monitor.record_counter(counters='reaper.deletion.done',
                                               delta=len(deleted_files))
                        DELETION_COUNTER.inc(len(deleted_files))
                except Exception:
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            if once:
                break

            tottime = time.time() - start_time
            if tottime < sleep_time:
                logger(logging.INFO, 'Will sleep for %s seconds',
                       sleep_time - tottime)
                GRACEFUL_STOP.wait(sleep_time - tottime)

        except DatabaseException as error:
            logger(logging.WARNING, 'Reaper:  %s', str(error))
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        finally:
            if once:
                break

    die(executable=executable, hostname=hostname, pid=pid, thread=hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
    return
Ejemplo n.º 9
0
def fts_throttler(once=False, cycle_interval=3600):
    """
    Main loop to automatically configure FTS storages.
    """
    graceful_stop.clear()
    logging.info('FTS Throttler starting')

    executable = 'conveyor-fts-throttler'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    current_time = time.time()
    graceful_stop.wait(10)
    running_instance = False

    while not graceful_stop.is_set():
        heart_beat = heartbeat.live(executable,
                                    hostname,
                                    pid,
                                    hb_thread,
                                    older_than=3600)
        if heart_beat['nr_threads'] < 2:
            running_instance = True
            # this loop cannot be entered by more than one instance at a time.
            while not graceful_stop.is_set():

                try:
                    heart_beat = heartbeat.live(executable,
                                                hostname,
                                                pid,
                                                hb_thread,
                                                older_than=3600)
                    prepend_str = 'Thread [%i/%i] : ' % (
                        heart_beat['assign_thread'], heart_beat['nr_threads'])

                    logging.info(prepend_str + "fts_throttler start cycle")
                    if FTSThrottler().revert():
                        logging.info('revert was successful, now tuning')
                        FTSThrottler().tune()
                        logging.info('Tuning finished for this cycle')
                    else:
                        logging.warning(
                            'could not revert, cannot tune unless revert has been done, will try again next cycle.'
                        )

                    if once:
                        break
                    if time.time() < current_time + cycle_interval:
                        graceful_stop.wait(
                            int((current_time + cycle_interval) - time.time()))
                    current_time = time.time()
                except Exception:
                    logging.critical(prepend_str + 'fts_throttler crashed %s' %
                                     (traceback.format_exc()))

                if once:
                    break
        else:
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])
            logging.info(
                prepend_str +
                'another fts_throttler instance already exists. will wait')
            if time.time() < current_time + cycle_interval:
                graceful_stop.wait(
                    int((current_time + cycle_interval) - time.time()))
            current_time = time.time()

    logging.info(prepend_str + 'Throttler - graceful stop requested')

    # before we stop, try to revert, but only if this instance was running the cycles.
    # ! If the cycle info file information is shared between instances, then this implementation must be changed !
    if running_instance:
        try:
            FTSThrottler().revert()
        except Exception:
            logging.warning('could not revert changes before stopping')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Throttler - graceful stop done')
Ejemplo n.º 10
0
def run(once=False,
        scope=None,
        rses=None,
        sleep_time=60,
        default_dark_min_age=28,
        default_dark_threshold_percent=1.0,
        default_miss_threshold_percent=1.0,
        force_proceed=False,
        default_scanner_files_path="/var/cache/consistency-dump",
        threads=1):
    """
    Starts up the Consistency-Actions.
    """

    setup_logging()

    prefix = 'storage-consistency-actions (run())'
    logger = formatted_logger(logging.log, prefix + '%s')

    # TODO: These variables should be sourced from the RSE config in the future.
    # For now, they are passed as arguments, and to emphasize that fact, we are re-assigning them:
    dark_min_age = default_dark_min_age
    dark_threshold_percent = default_dark_threshold_percent
    miss_threshold_percent = default_miss_threshold_percent
    scanner_files_path = default_scanner_files_path

    if rses == []:
        logger(logging.INFO,
               'NO RSEs passed. Will loop over all writable RSEs.')

        rses = [rse['rse'] for rse in list_rses({'availability_write': True})]

# Could limit it only to Tier-2s:
#        rses = [rse['rse'] for rse in list_rses({'tier': 2, 'availability_write': True})]

    logging.info('\n RSEs: %s' % rses)
    logger(
        logging.INFO,
        '\n RSEs: %s  \n run once: %r \n Sleep time: %d  \n Dark min age (days): %d\
     \n Dark files threshold %%: %f  \n Missing files threshold %%: %f  \n Force proceed: %r\
     \n Scanner files path: %s ' %
        (rses, once, sleep_time, dark_min_age, dark_threshold_percent,
         miss_threshold_percent, force_proceed, scanner_files_path))

    executable = 'storage-consistency-actions'
    hostname = socket.gethostname()
    sanity_check(executable=executable, hostname=hostname)

    # It was decided that for the time being this daemon is best executed in a single thread
    # TODO: If this decicion is reversed in the future, the following line should be removed.
    threads = 1

    if once:
        actions_loop(once, scope, rses, sleep_time, dark_min_age,
                     dark_threshold_percent, miss_threshold_percent,
                     force_proceed, scanner_files_path)
    else:
        logging.info('Consistency Actions starting %s threads' % str(threads))
        threads = [
            threading.Thread(target=actions_loop,
                             kwargs={
                                 'once': once,
                                 'scope': scope,
                                 'rses': rses,
                                 'sleep_time': sleep_time,
                                 'dark_min_age': dark_min_age,
                                 'dark_threshold_percent':
                                 dark_threshold_percent,
                                 'miss_threshold_percent':
                                 miss_threshold_percent,
                                 'force_proceed': force_proceed,
                                 'scanner_files_path': scanner_files_path
                             }) for i in range(0, threads)
        ]
        logger(logging.INFO, 'Threads: %d' % len(threads))
        [t.start() for t in threads]
        # Interruptible joins require a timeout.
        while threads[0].is_alive():
            [t.join(timeout=3.14) for t in threads]
Ejemplo n.º 11
0
def reaper(rses,
           worker_number=1,
           child_number=1,
           total_children=1,
           chunk_size=100,
           once=False,
           greedy=False,
           scheme=None,
           delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() +
                 ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256(sys.argv[0] +
                                     ''.join(rse_names)).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            max_deleting_rate = 0
            for rse in sort_rses(rses):
                try:
                    if checkpoint_time + datetime.timedelta(
                            minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable,
                                         hostname=hostname,
                                         pid=pid,
                                         thread=thread,
                                         hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    if rse['id'] in nothing_to_do and nothing_to_do[
                            rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s',
                                 worker_number, child_number, rse['rse'],
                                 nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse['rse'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])

                    if not rse_protocol['availability_delete']:
                        logging.info(
                            'Reaper %s-%s: RSE %s is not available for deletion',
                            worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now(
                        ) + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol[
                                'impl'] == 'rucio.rse.protocols.srm.Default' or protocol[
                                    'impl'] == 'rucio.rse.protocols.gsiftp.Default':
                            protocol[
                                'impl'] = 'rucio.rse.protocols.gfal.Default'

                        if protocol[
                                'impl'] == 'rucio.rse.protocols.signeds3.Default':
                            protocol[
                                'impl'] = 'rucio.rse.protocols.s3es.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(
                            rse_id=rse['id'])
                        logging.info(
                            'Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s'
                            % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info(
                                'Reaper %s-%s: free space is above minimum limit for %s',
                                worker_number, child_number, rse['rse'])
                        else:
                            if total_children and total_children > 0:
                                needed_free_space_per_child = needed_free_space / float(
                                    total_children)

                    start = time.time()
                    with monitor.record_timer_block(
                            'reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(
                            rse_id=rse['id'],
                            bytes=needed_free_space_per_child,
                            limit=max_being_deleted_files,
                            worker_number=child_number,
                            total_workers=total_children,
                            delay_seconds=delay_seconds)
                    logging.debug(
                        'Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas',
                        worker_number, child_number, rse['rse'],
                        needed_free_space_per_child,
                        time.time() - start, len(replicas))

                    if not replicas:
                        nothing_to_do[rse['id']] = datetime.datetime.now(
                        ) + datetime.timedelta(minutes=30)
                        logging.info(
                            'Reaper %s-%s: No replicas to delete %s. The next check will occur at %s',
                            worker_number, child_number, rse['rse'],
                            nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info,
                                                  'delete',
                                                  scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s',
                                      worker_number, child_number, str(files))
                        try:
                            update_replicas_states(replicas=[
                                dict(replica.items() +
                                     [('state', ReplicaState.BEING_DELETED),
                                      ('rse_id', rse['id'])])
                                for replica in files
                            ],
                                                   nowait=True)
                            for replica in files:
                                try:
                                    replica['pfn'] = str(
                                        rsemgr.lfns2pfns(
                                            rse_settings=rse_info,
                                            lfns=[{
                                                'scope': replica['scope'],
                                                'name': replica['name'],
                                                'path': replica['path']
                                            }],
                                            operation='delete',
                                            scheme=scheme).values()[0])
                                except (ReplicaUnAvailable,
                                        ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (
                                        replica['scope'], replica['name'],
                                        rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s',
                                                    worker_number,
                                                    child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(
                                counters='reaper.deletion.being_deleted',
                                delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        logging.info(
                                            'Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s',
                                            worker_number, child_number,
                                            replica['scope'], replica['name'],
                                            replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse[
                                                'rse'].endswith("STAGING"):
                                            logging.warning(
                                                'Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                                                worker_number, child_number,
                                                replica['scope'],
                                                replica['name'],
                                                replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes[
                                                        'scheme'] == 'https' and rse_info[
                                                            'sign_url'] is not None:
                                                    pfn = get_signed_url(
                                                        rse_info['sign_url'],
                                                        'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning(
                                                    'Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s',
                                                    worker_number,
                                                    child_number,
                                                    replica['scope'],
                                                    replica['name'],
                                                    replica['pfn'], rse['rse'])
                                        monitor.record_timer(
                                            'daemons.reaper.delete.%s.%s' %
                                            (prot.attributes['scheme'],
                                             rse['rse']),
                                            (time.time() - start) * 1000)
                                        duration = time.time() - start

                                        deleted_files.append({
                                            'scope':
                                            replica['scope'],
                                            'name':
                                            replica['name']
                                        })

                                        add_message(
                                            'deletion-done', {
                                                'scope':
                                                replica['scope'],
                                                'name':
                                                replica['name'],
                                                'rse':
                                                rse_info['rse'],
                                                'rse_id':
                                                rse_info['id'],
                                                'file-size':
                                                replica['bytes'],
                                                'bytes':
                                                replica['bytes'],
                                                'url':
                                                replica['pfn'],
                                                'duration':
                                                duration,
                                                'protocol':
                                                prot.attributes['scheme']
                                            })
                                        logging.info(
                                            'Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                            worker_number, child_number,
                                            replica['scope'], replica['name'],
                                            replica['pfn'], rse['rse'],
                                            duration)
                                    except SourceNotFound:
                                        err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (
                                            worker_number, child_number,
                                            replica['scope'], replica['name'],
                                            replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({
                                            'scope':
                                            replica['scope'],
                                            'name':
                                            replica['name']
                                        })
                                        if replica[
                                                'state'] == ReplicaState.AVAILABLE:
                                            add_message(
                                                'deletion-failed', {
                                                    'scope':
                                                    replica['scope'],
                                                    'name':
                                                    replica['name'],
                                                    'rse':
                                                    rse_info['rse'],
                                                    'rse_id':
                                                    rse_info['id'],
                                                    'file-size':
                                                    replica['bytes'],
                                                    'bytes':
                                                    replica['bytes'],
                                                    'url':
                                                    replica['pfn'],
                                                    'reason':
                                                    str(err_msg),
                                                    'protocol':
                                                    prot.attributes['scheme']
                                                })
                                    except (ServiceUnavailable,
                                            RSEAccessDenied,
                                            ResourceTemporaryUnavailable
                                            ) as error:
                                        logging.warning(
                                            'Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s',
                                            worker_number, child_number,
                                            replica['scope'], replica['name'],
                                            replica['pfn'], rse['rse'],
                                            str(error))
                                        add_message(
                                            'deletion-failed', {
                                                'scope':
                                                replica['scope'],
                                                'name':
                                                replica['name'],
                                                'rse':
                                                rse_info['rse'],
                                                'rse_id':
                                                rse_info['id'],
                                                'file-size':
                                                replica['bytes'],
                                                'bytes':
                                                replica['bytes'],
                                                'url':
                                                replica['pfn'],
                                                'reason':
                                                str(error),
                                                'protocol':
                                                prot.attributes['scheme']
                                            })
                                    except Exception as error:
                                        logging.critical(
                                            'Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s',
                                            worker_number, child_number,
                                            replica['scope'], replica['name'],
                                            replica['pfn'], rse['rse'],
                                            str(traceback.format_exc()))
                                        add_message(
                                            'deletion-failed', {
                                                'scope':
                                                replica['scope'],
                                                'name':
                                                replica['name'],
                                                'rse':
                                                rse_info['rse'],
                                                'rse_id':
                                                rse_info['id'],
                                                'file-size':
                                                replica['bytes'],
                                                'bytes':
                                                replica['bytes'],
                                                'url':
                                                replica['pfn'],
                                                'reason':
                                                str(error),
                                                'protocol':
                                                prot.attributes['scheme']
                                            })
                                    except:
                                        logging.critical(
                                            'Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s',
                                            worker_number, child_number,
                                            replica['scope'], replica['name'],
                                            replica['pfn'], rse['rse'],
                                            str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied,
                                    ResourceTemporaryUnavailable) as error:
                                for replica in files:
                                    logging.warning(
                                        'Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s',
                                        worker_number, child_number,
                                        replica['scope'], replica['name'],
                                        replica['pfn'], rse['rse'], str(error))
                                    add_message(
                                        'deletion-failed', {
                                            'scope': replica['scope'],
                                            'name': replica['name'],
                                            'rse': rse_info['rse'],
                                            'rse_id': rse_info['id'],
                                            'file-size': replica['bytes'],
                                            'bytes': replica['bytes'],
                                            'url': replica['pfn'],
                                            'reason': str(error),
                                            'protocol':
                                            prot.attributes['scheme']
                                        })
                                    break
                            finally:
                                prot.close()
                            start = time.time()
                            with monitor.record_timer_block(
                                    'reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'],
                                                files=deleted_files)
                            logging.debug(
                                'Reaper %s-%s: delete_replicas successes %s %s %s',
                                worker_number, child_number, rse['rse'],
                                len(deleted_files),
                                time.time() - start)
                            monitor.record_counter(
                                counters='reaper.deletion.done',
                                delta=len(deleted_files))

                        except DatabaseException as error:
                            logging.warning(
                                'Reaper %s-%s: DatabaseException %s',
                                worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning(
                                'Reaper %s-%s: UnsupportedOperation %s',
                                worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())

                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s',
                                    worker_number, child_number, str(error))

                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            time.sleep(1)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
Ejemplo n.º 12
0
def deliver_emails(once=False, send_email=True, thread=0, bulk=1000, delay=10):
    '''
    Main loop to deliver emails via SMTP.
    '''
    logging.info('[email] starting - threads (%i) bulk (%i)', thread, bulk)

    executable = 'hermes [email]'
    hostname = socket.getfqdn()
    pid = os.getpid()
    heartbeat_thread = threading.current_thread()

    sanity_check(executable=executable, hostname=hostname)

    # Make an initial heartbeat so that all daemons have the correct worker number on the next try
    live(executable=executable,
         hostname=hostname,
         pid=pid,
         thread=heartbeat_thread)
    GRACEFUL_STOP.wait(1)

    email_from = config_get('messaging-hermes', 'email_from')

    while not GRACEFUL_STOP.is_set():

        heartbeat = live(executable, hostname, pid, heartbeat_thread)
        logging.debug('[email] %i:%i - bulk %i', heartbeat['assign_thread'],
                      heartbeat['nr_threads'], bulk)

        t_start = time.time()

        messages = retrieve_messages(bulk=bulk,
                                     thread=heartbeat['assign_thread'],
                                     total_threads=heartbeat['nr_threads'],
                                     event_type='email')

        if messages != []:
            to_delete = []
            for message in messages:
                logging.debug('[email] %i:%i - submitting: %s',
                              heartbeat['assign_thread'],
                              heartbeat['nr_threads'], str(message))

                msg = MIMEText(message['payload']['body'].encode('utf-8'))

                msg['From'] = email_from
                msg['To'] = ', '.join(message['payload']['to'])
                msg['Subject'] = message['payload']['subject'].encode('utf-8')

                if send_email:
                    smtp = smtplib.SMTP()
                    smtp.connect()
                    smtp.sendmail(msg['From'], message['payload']['to'],
                                  msg.as_string())
                    smtp.quit()

                to_delete.append({
                    'id': message['id'],
                    'created_at': message['created_at'],
                    'updated_at': message['created_at'],
                    'payload': str(message['payload']),
                    'event_type': 'email'
                })

                logging.debug('[email] %i:%i - submitting done: %s',
                              heartbeat['assign_thread'],
                              heartbeat['nr_threads'], str(message['id']))

            delete_messages(to_delete)
            logging.info('[email] %i:%i - submitted %i messages',
                         heartbeat['assign_thread'], heartbeat['nr_threads'],
                         len(to_delete))

        if once:
            break

        t_delay = delay - (time.time() - t_start)
        t_delay = t_delay if t_delay > 0 else 0
        if t_delay:
            logging.debug('[email] %i:%i - sleeping %s seconds',
                          heartbeat['assign_thread'], heartbeat['nr_threads'],
                          t_delay)
        time.sleep(t_delay)

    logging.debug('[email] %i:%i - graceful stop requested',
                  heartbeat['assign_thread'], heartbeat['nr_threads'])

    die(executable, hostname, pid, heartbeat_thread)

    logging.debug('[email] %i:%i - graceful stop done',
                  heartbeat['assign_thread'], heartbeat['nr_threads'])
Ejemplo n.º 13
0
def deliver_messages(once=False,
                     brokers_resolved=None,
                     thread=0,
                     bulk=1000,
                     delay=10,
                     broker_timeout=3,
                     broker_retry=3):
    '''
    Main loop to deliver messages to a broker.
    '''
    logging.info('[broker] starting - threads (%i) bulk (%i)', thread, bulk)

    if not brokers_resolved:
        logging.fatal('No brokers resolved.')
        return

    logging.info('[broker] checking authentication method')
    use_ssl = True
    try:
        use_ssl = config_get_bool('messaging-hermes', 'use_ssl')
    except:
        logging.info(
            '[broker] could not find use_ssl in configuration -- please update your rucio.cfg'
        )

    port = config_get_int('messaging-hermes', 'port')
    vhost = config_get('messaging-hermes',
                       'broker_virtual_host',
                       raise_exception=False)
    if not use_ssl:
        username = config_get('messaging-hermes', 'username')
        password = config_get('messaging-hermes', 'password')
        port = config_get_int('messaging-hermes', 'nonssl_port')

    conns = []
    for broker in brokers_resolved:
        if not use_ssl:
            logging.info(
                '[broker] setting up username/password authentication: %s' %
                broker)
            con = stomp.Connection12(host_and_ports=[(broker, port)],
                                     vhost=vhost,
                                     keepalive=True,
                                     timeout=broker_timeout)
        else:
            logging.info(
                '[broker] setting up ssl cert/key authentication: %s' % broker)
            con = stomp.Connection12(
                host_and_ports=[(broker, port)],
                use_ssl=True,
                ssl_key_file=config_get('messaging-hermes', 'ssl_key_file'),
                ssl_cert_file=config_get('messaging-hermes', 'ssl_cert_file'),
                vhost=vhost,
                keepalive=True,
                timeout=broker_timeout)

        con.set_listener(
            'rucio-hermes',
            HermesListener(con.transport._Transport__host_and_ports[0]))

        conns.append(con)
    destination = config_get('messaging-hermes', 'destination')

    executable = 'hermes [broker]'
    hostname = socket.getfqdn()
    pid = os.getpid()
    heartbeat_thread = threading.current_thread()

    # Make an initial heartbeat so that all daemons have the correct worker number on the next try
    sanity_check(executable=executable,
                 hostname=hostname,
                 pid=pid,
                 thread=heartbeat_thread)
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            t_start = time.time()

            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=heartbeat_thread)

            logging.debug('[broker] %i:%i - using: %s',
                          heartbeat['assign_thread'], heartbeat['nr_threads'],
                          [
                              conn.transport._Transport__host_and_ports[0][0]
                              for conn in conns
                          ])

            messages = retrieve_messages(bulk=bulk,
                                         thread=heartbeat['assign_thread'],
                                         total_threads=heartbeat['nr_threads'])

            if messages:

                logging.debug('[broker] %i:%i - retrieved %i messages',
                              heartbeat['assign_thread'],
                              heartbeat['nr_threads'], len(messages))
                to_delete = []
                for message in messages:
                    try:
                        conn = random.sample(conns, 1)[0]
                        if not conn.is_connected():
                            host_and_ports = conn.transport._Transport__host_and_ports[
                                0][0]
                            record_counter('daemons.hermes.reconnect.%s' %
                                           host_and_ports.split('.')[0])
                            conn.start()
                            if not use_ssl:
                                logging.info(
                                    '[broker] %i:%i - connecting with USERPASS to %s',
                                    heartbeat['assign_thread'],
                                    heartbeat['nr_threads'], host_and_ports)
                                conn.connect(username, password, wait=True)
                            else:
                                logging.info(
                                    '[broker] %i:%i - connecting with SSL to %s',
                                    heartbeat['assign_thread'],
                                    heartbeat['nr_threads'], host_and_ports)
                                conn.connect(wait=True)

                        conn.send(body=json.dumps({
                            'event_type':
                            str(message['event_type']).lower(),
                            'payload':
                            message['payload'],
                            'created_at':
                            str(message['created_at'])
                        }),
                                  destination=destination,
                                  headers={
                                      'persistent':
                                      'true',
                                      'event_type':
                                      str(message['event_type']).lower()
                                  })

                        to_delete.append({
                            'id':
                            message['id'],
                            'created_at':
                            message['created_at'],
                            'updated_at':
                            message['created_at'],
                            'payload':
                            json.dumps(message['payload']),
                            'event_type':
                            message['event_type']
                        })
                    except ValueError:
                        logging.warn('Cannot serialize payload to JSON: %s',
                                     str(message['payload']))
                        to_delete.append({
                            'id': message['id'],
                            'created_at': message['created_at'],
                            'updated_at': message['created_at'],
                            'payload': str(message['payload']),
                            'event_type': message['event_type']
                        })
                        continue
                    except stomp.exception.NotConnectedException as error:
                        logging.warn(
                            'Could not deliver message due to NotConnectedException: %s',
                            str(error))
                        continue
                    except stomp.exception.ConnectFailedException as error:
                        logging.warn(
                            'Could not deliver message due to ConnectFailedException: %s',
                            str(error))
                        continue
                    except Exception as error:
                        logging.warn('Could not deliver message: %s',
                                     str(error))
                        logging.critical(traceback.format_exc())
                        continue

                    if str(message['event_type']).lower().startswith(
                            'transfer') or str(message['event_type']).lower(
                            ).startswith('stagein'):
                        logging.debug(
                            '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, request-id: %s, transfer-id: %s, created_at: %s',
                            heartbeat['assign_thread'],
                            heartbeat['nr_threads'],
                            str(message['event_type']).lower(),
                            message['payload'].get('scope', None),
                            message['payload'].get('name', None),
                            message['payload'].get('dst-rse', None),
                            message['payload'].get('request-id', None),
                            message['payload'].get('transfer-id', None),
                            str(message['created_at']))

                    elif str(message['event_type']).lower().startswith(
                            'dataset'):
                        logging.debug(
                            '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, rule-id: %s, created_at: %s)',
                            heartbeat['assign_thread'],
                            heartbeat['nr_threads'],
                            str(message['event_type']).lower(),
                            message['payload']['scope'],
                            message['payload']['name'],
                            message['payload']['rse'],
                            message['payload']['rule_id'],
                            str(message['created_at']))

                    elif str(message['event_type']).lower().startswith(
                            'deletion'):
                        if 'url' not in message['payload']:
                            message['payload']['url'] = 'unknown'
                        logging.debug(
                            '[broker] %i:%i - event_type: %s, scope: %s, name: %s, rse: %s, url: %s, created_at: %s)',
                            heartbeat['assign_thread'],
                            heartbeat['nr_threads'],
                            str(message['event_type']).lower(),
                            message['payload']['scope'],
                            message['payload']['name'],
                            message['payload']['rse'],
                            message['payload']['url'],
                            str(message['created_at']))
                    else:
                        logging.debug('[broker] %i:%i - other message: %s',
                                      heartbeat['assign_thread'],
                                      heartbeat['nr_threads'], message)

                delete_messages(to_delete)
                logging.info('[broker] %i:%i - submitted %i messages',
                             heartbeat['assign_thread'],
                             heartbeat['nr_threads'], len(to_delete))

                if once:
                    break

        except NoResultFound:
            # silence this error: https://its.cern.ch/jira/browse/RUCIO-1699
            pass
        except:
            logging.critical(traceback.format_exc())

        t_delay = delay - (time.time() - t_start)
        t_delay = t_delay if t_delay > 0 else 0
        if t_delay:
            logging.debug('[broker] %i:%i - sleeping %s seconds',
                          heartbeat['assign_thread'], heartbeat['nr_threads'],
                          t_delay)
        time.sleep(t_delay)

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    logging.debug('[broker] %i:%i - graceful stop requested',
                  heartbeat['assign_thread'], heartbeat['nr_threads'])

    die(executable, hostname, pid, heartbeat_thread)

    logging.debug('[broker] %i:%i - graceful stop done',
                  heartbeat['assign_thread'], heartbeat['nr_threads'])
Ejemplo n.º 14
0
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, scope='tests', once=False, dataset_lifetime=None, set_metadata=False):
    sleep(sleep_time * (total_workers - worker_number) / total_workers)

    executable = 'automatix'
    hostname = socket.getfqdn()
    pid = getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers)
    while not GRACEFUL_STOP.is_set():
        heartbeat.live(executable, hostname, pid, hb_thread)
        starttime = time()
        prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers)
        logging.info('%s Getting data distribution', prepend_str)
        probabilities, data = get_data_distribution(inputfile)
        logging.debug('%s Probabilities %s', prepend_str, probabilities)
        totretries = 3
        status = False

        for site in sites:

            for retry in range(0, totretries):
                start_time = time()
                tmpdir = tempfile.mkdtemp()
                logging.info('%s Running on site %s', prepend_str, site)
                dic = choose_element(probabilities, data)
                metadata = dic['metadata']
                try:
                    nbfiles = dic['nbfiles']
                except KeyError:
                    nbfiles = 2
                    logging.warning('%s No nbfiles defined in the configuration, will use 2', prepend_str)
                try:
                    filesize = dic['filesize']
                except KeyError:
                    filesize = 1000000
                    logging.warning('%s No filesize defined in the configuration, will use 1M files', prepend_str)
                dsn = generate_didname(metadata, None, 'dataset')
                fnames = []
                lfns = []
                physical_fnames = []
                for _ in range(nbfiles):
                    fname = generate_didname(metadata=metadata, dsn=dsn, did_type='file')
                    lfns.append(fname)
                    logging.info('%s Generating file %s in dataset %s', prepend_str, fname, dsn)
                    physical_fname = '%s/%s' % (tmpdir, "".join(fname.split('/')))
                    physical_fnames.append(physical_fname)
                    generate_file(physical_fname, filesize)
                    fnames.append(fname)
                logging.info('%s Upload %s to %s', prepend_str, dsn, site)
                dsn = '%s:%s' % (scope, dsn)
                status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, worker_number=worker_number, total_workers=total_workers, dataset_lifetime=dataset_lifetime, did=dsn, set_metadata=set_metadata)
                for physical_fname in physical_fnames:
                    remove(physical_fname)
                rmdir(tmpdir)
                if status:
                    monitor.record_counter(counters='automatix.addnewdataset.done', delta=1)
                    monitor.record_counter(counters='automatix.addnewfile.done', delta=nbfiles)
                    monitor.record_timer('automatix.datasetinjection', (time() - start_time) * 1000)
                    break
                else:
                    logging.info('%s Failed to upload files. Will retry another time (attempt %s/%s)', prepend_str, str(retry + 1), str(totretries))
        if once is True:
            logging.info('%s Run with once mode. Exiting', prepend_str)
            break
        tottime = time() - starttime
        if status:
            logging.info('%s It took %s seconds to upload one dataset on %s', prepend_str, str(tottime), str(sites))
            if sleep_time > tottime:
                logging.info('%s Will sleep for %s seconds', prepend_str, str(sleep_time - tottime))
                sleep(sleep_time - tottime)
        else:
            logging.info('%s Retrying a new upload', prepend_str)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
Ejemplo n.º 15
0
def minos(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Worker that gets a list of bad PFNs,
    extract the scope, name and rse_id and fill the bad_replicas table.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Minos starting')

    time.sleep(
        10
    )  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')

    states_mapping = {
        BadPFNStatus.BAD: BadFilesStatus.BAD,
        BadPFNStatus.SUSPICIOUS: BadFilesStatus.SUSPICIOUS,
        BadPFNStatus.TEMPORARY_UNAVAILABLE:
        BadFilesStatus.TEMPORARY_UNAVAILABLE
    }
    logger(logging.INFO, 'Minos started')

    chunk_size = 10  # The chunk size used for the commits

    while not GRACEFUL_STOP.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prefix = 'minos[%i/%i] ' % (heart_beat['assign_thread'],
                                    heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prefix + '%s')
        pfns = []
        try:
            bad_replicas = {}
            temporary_unvailables = {}
            pfns = get_bad_pfns(thread=heart_beat['assign_thread'],
                                total_threads=heart_beat['nr_threads'],
                                limit=bulk)

            # Class the PFNs into bad_replicas and temporary_unavailable
            for pfn in pfns:
                path = pfn['pfn']
                account = pfn['account']
                reason = pfn['reason']
                expires_at = pfn['expires_at']
                state = states_mapping[pfn['state']]
                if state in [BadFilesStatus.BAD, BadFilesStatus.SUSPICIOUS]:
                    if (account, reason, state) not in bad_replicas:
                        bad_replicas[(account, reason, state)] = []
                    bad_replicas[(account, reason, state)].append(path)
                elif state == BadFilesStatus.TEMPORARY_UNAVAILABLE:
                    if (account, reason,
                            expires_at) not in temporary_unvailables:
                        temporary_unvailables[(account, reason,
                                               expires_at)] = []
                    temporary_unvailables[(account, reason,
                                           expires_at)].append(path)

            # Process the bad and suspicious files
            # The scope, name, rse_id are extracted and filled into the bad_replicas table
            for account, reason, state in bad_replicas:
                vo = account.vo
                pfns = bad_replicas[(account, reason, state)]
                logger(
                    logging.INFO,
                    'Declaring %s replicas with state %s and reason %s' %
                    (len(pfns), str(state), reason))
                session = get_session()
                schemes = {}
                dict_rse = {}
                unknown_replicas = []
                try:
                    # Splitting the PFNs by schemes
                    for pfn in pfns:
                        scheme = pfn.split(':')[0]
                        if scheme not in schemes:
                            schemes[scheme] = []
                        schemes[scheme].append(pfn)
                    for scheme in schemes:
                        _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                            schemes[scheme], vo=vo)
                        for rse_id in tmp_dict_rse:
                            if rse_id not in dict_rse:
                                dict_rse[rse_id] = []
                            dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))
                    # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                    if unknown_replicas:
                        logger(
                            logging.INFO,
                            'The following replicas are unknown and will be removed : %s'
                            % str(unknown_replicas))
                        bulk_delete_bad_pfns(pfns=unknown_replicas,
                                             session=None)

                    for rse_id in dict_rse:
                        vo_str = '' if vo == 'def' else ' on VO ' + vo
                        logger(
                            logging.DEBUG,
                            'Running on RSE %s%s with %s replicas' %
                            (get_rse_name(rse_id=rse_id), vo_str,
                             len(dict_rse[rse_id])))
                        nchunk = 0
                        tot_chunk = int(
                            math.ceil(len(dict_rse[rse_id]) / chunk_size))
                        for chunk in chunks(dict_rse[rse_id], chunk_size):
                            nchunk += 1
                            logger(
                                logging.DEBUG,
                                'Running on %s chunk out of %s' %
                                (nchunk, tot_chunk))
                            unknown_replicas = declare_bad_file_replicas(
                                pfns=chunk,
                                reason=reason,
                                issuer=account,
                                status=state,
                                session=session)
                            if unknown_replicas:
                                logger(
                                    logging.DEBUG, 'Unknown replicas : %s' %
                                    (str(unknown_replicas)))
                            bulk_delete_bad_pfns(pfns=chunk, session=session)
                            session.commit()  # pylint: disable=no-member
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logger(logging.CRITICAL, 'Exception', exc_info=True)

            # Now get the temporary unavailable and update the replicas states
            for account, reason, expires_at in temporary_unvailables:
                vo = account.vo
                pfns = temporary_unvailables[(account, reason, expires_at)]
                logger(
                    logging.INFO,
                    'Declaring %s replicas temporary available with timeout %s and reason %s'
                    % (len(pfns), str(expires_at), reason))
                logger(logging.DEBUG, 'Extracting RSEs')
                schemes = {}
                dict_rse = {}
                unknown_replicas = []

                # Splitting the PFNs by schemes
                for pfn in pfns:
                    scheme = pfn.split(':')[0]
                    if scheme not in schemes:
                        schemes[scheme] = []
                    schemes[scheme].append(pfn)
                for scheme in schemes:
                    _, tmp_dict_rse, tmp_unknown_replicas = get_pfn_to_rse(
                        schemes[scheme], vo=vo)
                    for rse_id in tmp_dict_rse:
                        if rse_id not in dict_rse:
                            dict_rse[rse_id] = []
                        dict_rse[rse_id].extend(tmp_dict_rse[rse_id])
                        unknown_replicas.extend(
                            tmp_unknown_replicas.get('unknown', []))

                # The replicas in unknown_replicas do not exist, so we flush them from bad_pfns
                if unknown_replicas:
                    logger(
                        logging.INFO,
                        'The following replicas are unknown and will be removed : %s'
                        % str(unknown_replicas))
                    bulk_delete_bad_pfns(pfns=unknown_replicas, session=None)

                for rse_id in dict_rse:
                    replicas = []
                    rse = get_rse_name(rse_id=rse_id, session=None)
                    rse_vo_str = rse if vo == 'def' else '{} on {}'.format(
                        rse, vo)
                    logger(logging.DEBUG, 'Running on RSE %s' % rse_vo_str)
                    for rep in get_did_from_pfns(pfns=dict_rse[rse_id],
                                                 rse_id=None,
                                                 vo=vo,
                                                 session=None):
                        for pfn in rep:
                            scope = rep[pfn]['scope']
                            name = rep[pfn]['name']
                            replicas.append({
                                'scope': scope,
                                'name': name,
                                'rse_id': rse_id,
                                'state': ReplicaState.TEMPORARY_UNAVAILABLE,
                                'pfn': pfn
                            })
                    # The following part needs to be atomic
                    # We update the replicas states to TEMPORARY_UNAVAILABLE
                    # then insert a row in the bad_replicas table. TODO Update the row if it already exists
                    # then delete the corresponding rows into the bad_pfns table
                    logger(
                        logging.DEBUG, 'Running on %s replicas on RSE %s' %
                        (len(replicas), rse_vo_str))
                    nchunk = 0
                    tot_chunk = int(
                        math.ceil(len(replicas) / float(chunk_size)))
                    session = get_session()
                    for chunk in chunks(replicas, chunk_size):
                        try:
                            nchunk += 1
                            logger(
                                logging.DEBUG,
                                'Running on %s chunk out of %s' %
                                (nchunk, tot_chunk))
                            update_replicas_states(chunk,
                                                   nowait=False,
                                                   session=session)
                            bulk_add_bad_replicas(
                                chunk,
                                account,
                                state=BadFilesStatus.TEMPORARY_UNAVAILABLE,
                                reason=reason,
                                expires_at=expires_at,
                                session=session)
                            pfns = [entry['pfn'] for entry in chunk]
                            bulk_delete_bad_pfns(pfns=pfns, session=session)
                            session.commit()  # pylint: disable=no-member
                        except (UnsupportedOperation,
                                ReplicaNotFound) as error:
                            session.rollback()  # pylint: disable=no-member
                            logger(
                                logging.ERROR,
                                'Problem to bulk update PFNs. PFNs will be updated individually. Error : %s'
                                % str(error))
                            for rep in chunk:
                                logger(logging.DEBUG,
                                       'Working on %s' % (str(rep)))
                                try:
                                    get_metadata(rep['scope'], rep['name'])
                                    unavailable_states = []
                                    rep_state = get_replicas_state(
                                        rep['scope'], rep['name'])
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.TEMPORARY_UNAVAILABLE,
                                            []))
                                    unavailable_states.extend(
                                        rep_state.get(
                                            ReplicaState.BEING_DELETED, []))
                                    unavailable_states.extend(
                                        rep_state.get(ReplicaState.BAD, []))
                                    # If the replica is already not available, it is removed from the bad PFNs table
                                    if rep['rse_id'] in unavailable_states:
                                        logger(
                                            logging.INFO,
                                            '%s is in unavailable state. Will be removed from the list of bad PFNs'
                                            % str(rep['pfn']))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    # If the expiration date of the TEMPORARY_UNAVAILABLE is in the past, it is removed from the bad PFNs table
                                    elif expires_at < datetime.now():
                                        logger(
                                            logging.INFO,
                                            'PFN %s expiration time (%s) is older than now and is not in unavailable state. Removing the PFNs from bad_pfns'
                                            % (str(rep['pfn']), expires_at))
                                        bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                             session=None)
                                    # Else update everything in the same transaction
                                    else:
                                        try:
                                            session = get_session()
                                            update_replicas_states(
                                                [rep],
                                                nowait=False,
                                                session=session)
                                            bulk_add_bad_replicas(
                                                [rep],
                                                account,
                                                state=BadFilesStatus.
                                                TEMPORARY_UNAVAILABLE,
                                                reason=reason,
                                                expires_at=expires_at,
                                                session=session)
                                            bulk_delete_bad_pfns(
                                                pfns=[rep['pfn']],
                                                session=session)
                                            session.commit()  # pylint: disable=no-member
                                        except Exception:
                                            logger(
                                                logging.ERROR,
                                                'Cannot update state of %s' %
                                                str(rep['pfn']))
                                except (DataIdentifierNotFound,
                                        ReplicaNotFound):
                                    logger(
                                        logging.ERROR,
                                        'Will remove %s from the list of bad PFNs'
                                        % str(rep['pfn']))
                                    bulk_delete_bad_pfns(pfns=[rep['pfn']],
                                                         session=None)
                            session = get_session()
                        except Exception:
                            session.rollback()  # pylint: disable=no-member
                            logger(logging.CRITICAL,
                                   'Exception',
                                   exc_info=True)
                            session = get_session()

        except Exception as error:
            logger(logging.ERROR, '%s' % (str(error)))

        tottime = time.time() - start_time
        if once:
            break
        if len(pfns) == bulk:
            logger(
                logging.INFO,
                'Processed maximum number of pfns according to the bulk size. Restart immediately next cycle'
            )
        elif tottime < sleep_time:
            logger(logging.INFO,
                   'Will sleep for %s seconds' % (sleep_time - tottime))
            time.sleep(sleep_time - tottime)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
Ejemplo n.º 16
0
def submitter(once=False, rses=[], mock=False,
              process=0, total_processes=1, total_threads=1,
              bulk=100, group_bulk=1, group_policy='rule', fts_source_strategy='auto',
              activities=None, sleep_time=600, max_sources=4, retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    logging.info('Transfer submitter starting - process (%i/%i) threads (%i)' % (process,
                                                                                 total_processes,
                                                                                 total_threads))

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" % max_time_in_queue)

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)

    logging.info('Transfer submitter started - process (%i/%i) threads (%i/%i) timeout (%s)' % (process, total_processes,
                                                                                                hb['assign_thread'], hb['nr_threads'],
                                                                                                timeout))

    threadPool = ThreadPool(total_threads)
    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None

            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                logging.info("%s:%s Starting to get transfer transfers for %s" % (process, hb['assign_thread'], activity))
                ts = time.time()
                transfers = __get_transfers(process=process,
                                            total_processes=total_processes,
                                            thread=hb['assign_thread'],
                                            total_threads=hb['nr_threads'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.per_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1))
                record_counter('daemons.conveyor.transfer_submitter.get_transfers', len(transfers))
                record_timer('daemons.conveyor.transfer_submitter.get_transfers.transfers', len(transfers))
                logging.info("%s:%s Got %s transfers for %s" % (process, hb['assign_thread'], len(transfers), activity))

                # group transfers
                logging.info("%s:%s Starting to group transfers for %s" % (process, hb['assign_thread'], activity))
                ts = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy, group_bulk, fts_source_strategy, max_time_in_queue)
                record_timer('daemons.conveyor.transfer_submitter.bulk_group_transfer', (time.time() - ts) * 1000 / (len(transfers) if len(transfers) else 1))

                logging.info("%s:%s Starting to submit transfers for %s" % (process, hb['assign_thread'], activity))
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        # job_requests = makeRequests(submit_transfer, args_list=[((external_host, job, 'transfer_submitter', process, thread), {})])
                        job_requests = makeRequests(submit_transfer, args_list=[((), {'external_host': external_host,
                                                                                      'job': job,
                                                                                      'submitter':
                                                                                      'transfer_submitter',
                                                                                      'process': process,
                                                                                      'thread': hb['assign_thread'],
                                                                                      'timeout': timeout})])
                        [threadPool.putRequest(job_req) for job_req in job_requests]
                threadPool.wait()

                if len(transfers) < group_bulk:
                    logging.info('%i:%i - only %s transfers for %s which is less than group bulk %s, sleep %s seconds' % (process, hb['assign_thread'], len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except:
            logging.critical('%s:%s %s' % (process, hb['assign_thread'], traceback.format_exc()))

        if once:
            break

    logging.info('%s:%s graceful stop requested' % (process, hb['assign_thread']))

    threadPool.dismissWorkers(total_threads, do_join=True)
    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s:%s graceful stop done' % (process, hb['assign_thread']))
    return
Ejemplo n.º 17
0
def automatix(sites,
              inputfile,
              sleep_time,
              account,
              worker_number=1,
              total_workers=1,
              once=False,
              dataset_lifetime=None):
    sleep(sleep_time * (total_workers - worker_number) / total_workers)

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():
        heartbeat.live(executable, hostname, pid, hb_thread)
        starttime = time()
        prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers)
        logging.info(prepend_str + 'Getting data distribution')
        probabilities, data = get_data_distribution(inputfile)
        logging.debug(prepend_str + 'Probabilities %s' % (probabilities))
        account = 'root'
        scope = 'tests'
        totretries = 3
        status = False
        for site in sites:
            for retry in xrange(0, totretries):
                start_time = time()
                tmpdir = tempfile.mkdtemp()
                logging.info(prepend_str + 'Running on site %s' % (site))
                dic = choose_element(probabilities, data)
                metadata = dic['metadata']
                metadata['version'] = str(random.randint(0, 1000))
                metadata['run_number'] = str(random.randint(0, 100000))
                metadata['stream_name'] = 'automatix_stream'
                metadata['campaign'] = 'automatix_campaign'
                try:
                    nbfiles = dic['nbfiles']
                except KeyError:
                    nbfiles = 2
                    logging.warning(
                        prepend_str +
                        'No nbfiles defined in the configuration, will use 2')
                try:
                    filesize = dic['filesize']
                except KeyError:
                    filesize = 1000000
                    logging.warning(
                        prepend_str +
                        'No filesize defined in the configuration, will use 1M files'
                    )
                dsn = 'tests:%s.%s.%s.%s.%s.%s' % (
                    metadata['project'], metadata['run_number'],
                    metadata['stream_name'], metadata['prod_step'],
                    metadata['datatype'], metadata['version'])
                fnames = []
                lfns = []
                for dummy_nbfile in xrange(nbfiles):
                    fname = '%s.%s' % (metadata['datatype'], generate_uuid())
                    lfns.append(fname)
                    fname = '%s/%s' % (tmpdir, fname)
                    logging.info(
                        prepend_str +
                        'Generating file %(fname)s in dataset %(dsn)s' %
                        locals())
                    generate_file(fname, filesize)
                    fnames.append(fname)
                logging.info(prepend_str + 'Upload %s to %s' % (dsn, site))
                status = upload(files=lfns,
                                scope=scope,
                                metadata=metadata,
                                rse=site,
                                account=account,
                                source_dir=tmpdir,
                                worker_number=worker_number,
                                total_workers=total_workers,
                                dataset_lifetime=dataset_lifetime,
                                did=dsn)
                for fname in fnames:
                    remove(fname)
                rmdir(tmpdir)
                if status:
                    monitor.record_counter(
                        counters='automatix.addnewdataset.done', delta=1)
                    monitor.record_counter(
                        counters='automatix.addnewfile.done', delta=nbfiles)
                    monitor.record_timer('automatix.datasetinjection',
                                         (time() - start_time) * 1000)
                    break
                else:
                    logging.info(
                        prepend_str +
                        'Failed to upload files. Will retry another time (attempt %s/%s)'
                        % (str(retry + 1), str(totretries)))
        if once is True:
            logging.info(prepend_str + 'Run with once mode. Exiting')
            break
        tottime = time() - starttime
        if status:
            logging.info(prepend_str +
                         'It took %s seconds to upload one dataset on %s' %
                         (str(tottime), str(sites)))
            if sleep_time > tottime:
                logging.info(prepend_str + 'Will sleep for %s seconds' %
                             (str(sleep_time - tottime)))
                sleep(sleep_time - tottime)
        else:
            logging.info(prepend_str + 'Retrying a new upload')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
Ejemplo n.º 18
0
def kronos_file(once=False,
                thread=0,
                brokers_resolved=None,
                dataset_queue=None,
                sleep_time=60):
    """
    Main loop to consume tracer reports.
    """

    logging.info('tracer consumer starting')

    hostname = gethostname()
    pid = getpid()
    thread = current_thread()

    chunksize = config_get_int('tracer-kronos', 'chunksize')
    prefetch_size = config_get_int('tracer-kronos', 'prefetch_size')
    subscription_id = config_get('tracer-kronos', 'subscription_id')
    try:
        bad_files_patterns = []
        pattern = get(section='kronos',
                      option='bad_files_patterns',
                      session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            bad_files_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        bad_files_patterns = []
    except Exception as error:
        logging.error('(kronos_file) Failed to get bad_file_patterns' +
                      str(error))
        bad_files_patterns = []

    use_ssl = True
    try:
        use_ssl = config_get_bool('tracer-kronos', 'use_ssl')
    except Exception:
        pass

    if not use_ssl:
        username = config_get('tracer-kronos', 'username')
        password = config_get('tracer-kronos', 'password')

    excluded_usrdns = set(
        config_get('tracer-kronos', 'excluded_usrdns').split(','))

    conns = []
    for broker in brokers_resolved:
        if not use_ssl:
            conns.append(
                Connection(host_and_ports=[
                    (broker, config_get_int('tracer-kronos', 'port'))
                ],
                           use_ssl=False,
                           reconnect_attempts_max=config_get_int(
                               'tracer-kronos', 'reconnect_attempts')))
        else:
            conns.append(
                Connection(host_and_ports=[
                    (broker, config_get_int('tracer-kronos', 'port'))
                ],
                           use_ssl=True,
                           ssl_key_file=config_get('tracer-kronos',
                                                   'ssl_key_file'),
                           ssl_cert_file=config_get('tracer-kronos',
                                                    'ssl_cert_file'),
                           ssl_version=PROTOCOL_TLSv1,
                           reconnect_attempts_max=config_get_int(
                               'tracer-kronos', 'reconnect_attempts')))

    logging.info('(kronos_file) tracer consumer started')

    sanity_check(executable='kronos-file', hostname=hostname)
    while not graceful_stop.is_set():
        start_time = time()
        live(executable='kronos-file',
             hostname=hostname,
             pid=pid,
             thread=thread)
        for conn in conns:
            if not conn.is_connected():
                logging.info('(kronos_file) connecting to %s' %
                             conn.transport._Transport__host_and_ports[0][0])
                record_counter('daemons.tracer.kronos.reconnect.%s' %
                               conn.transport._Transport__host_and_ports[0]
                               [0].split('.')[0])
                conn.set_listener(
                    'rucio-tracer-kronos',
                    AMQConsumer(
                        broker=conn.transport._Transport__host_and_ports[0],
                        conn=conn,
                        queue=config_get('tracer-kronos', 'queue'),
                        chunksize=chunksize,
                        subscription_id=subscription_id,
                        excluded_usrdns=excluded_usrdns,
                        dataset_queue=dataset_queue,
                        bad_files_patterns=bad_files_patterns))
                conn.start()
                if not use_ssl:
                    conn.connect(username, password)
                else:
                    conn.connect()
                conn.subscribe(
                    destination=config_get('tracer-kronos', 'queue'),
                    ack='client-individual',
                    id=subscription_id,
                    headers={'activemq.prefetchSize': prefetch_size})
        tottime = time() - start_time
        if tottime < sleep_time:
            logging.info('(kronos_file) Will sleep for %s seconds' %
                         (sleep_time - tottime))
            sleep(sleep_time - tottime)

    logging.info('(kronos_file) graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    die(executable='kronos-file', hostname=hostname, pid=pid, thread=thread)
    logging.info('(kronos_file) graceful stop done')
Ejemplo n.º 19
0
def minos_tu_expiration(bulk=1000, once=False, sleep_time=60):
    """
    Creates a Minos Temporary Unavailable Replicas Expiration Worker that
    gets the list of expired TU replicas and sets them back to AVAILABLE.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'minos-temporary-expiration'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info('%s Minos Temporary Expiration starting', prepend_str)

    time.sleep(
        10
    )  # To prevent running on the same partition if all the daemons restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    logging.info('%s Minos Temporary Expiration started', prepend_str)

    chunk_size = 10  # The chunk size used for the commits

    while not graceful_stop.is_set():
        start_time = time.time()
        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])
        try:
            # Get list of expired TU replicas
            logging.info('%s Getting list of expired replicas', prepend_str)
            expired_replicas = list_expired_temporary_unavailable_replicas(
                total_workers=heart_beat['nr_threads'],
                worker_number=heart_beat['assign_thread'],
                limit=1000)
            logging.info('%s %s expired replicas returned', prepend_str,
                         len(expired_replicas))
            logging.debug('%s List of expired replicas returned %s',
                          prepend_str, str(expired_replicas))
            replicas = []
            bad_replicas = []
            nchunk = 0
            tot_chunk = int(
                math.ceil(len(expired_replicas) / float(chunk_size)))
            session = get_session()
            for chunk in chunks(expired_replicas, chunk_size):
                skip_replica_update = []
                # Process and update the replicas in chunks
                for replica in chunk:
                    scope, name, rse_id = replica[0], replica[1], replica[2]
                    states_dictionary = get_replicas_state(scope=scope,
                                                           name=name,
                                                           session=session)
                    # Check if the replica is not declared bad
                    # If already declared bad don't update the replica state, but remove from bad_pfns
                    if not (ReplicaState.BAD in states_dictionary
                            and rse_id in states_dictionary[ReplicaState.BAD]):
                        replicas.append({
                            'scope': scope,
                            'name': name,
                            'rse_id': rse_id,
                            'state': ReplicaState.AVAILABLE
                        })
                    else:
                        skip_replica_update.append((scope, name))
                    # Remove the replicas from bad_replicas table in chunks
                    bad_replicas.append({
                        'scope':
                        scope,
                        'name':
                        name,
                        'rse_id':
                        rse_id,
                        'state':
                        BadFilesStatus.TEMPORARY_UNAVAILABLE
                    })
                try:
                    nchunk += 1
                    logging.debug('%s Running on %s chunk out of %s',
                                  prepend_str, nchunk, tot_chunk)
                    update_replicas_states(replicas,
                                           nowait=True,
                                           session=session)
                    bulk_delete_bad_replicas(bad_replicas, session=session)
                    session.commit()  # pylint: disable=no-member
                except (ReplicaNotFound, DataIdentifierNotFound) as error:
                    session.rollback()  # pylint: disable=no-member
                    logging.warning(
                        '%s One of the replicas does not exist anymore. Updating and deleting one by one. Error : %s',
                        prepend_str, str(error))
                    for replica in chunk:
                        scope, name, rse_id = replica[0], replica[1], replica[
                            2]
                        logging.debug('%s Working on %s:%s on %s', prepend_str,
                                      scope, name, rse_id)
                        try:
                            # First check if the DID exists
                            get_metadata(scope, name)
                            if (scope, name) not in skip_replica_update:
                                update_replicas_states([
                                    {
                                        'scope': scope,
                                        'name': name,
                                        'rse_id': rse_id,
                                        'state': ReplicaState.AVAILABLE
                                    },
                                ],
                                                       nowait=True,
                                                       session=session)
                            bulk_delete_bad_replicas([
                                {
                                    'scope': scope,
                                    'name': name,
                                    'rse_id': rse_id,
                                    'state':
                                    BadFilesStatus.TEMPORARY_UNAVAILABLE
                                },
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                        except DataIdentifierNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                '%s DID %s:%s does not exist anymore.',
                                prepend_str, scope, name)
                            bulk_delete_bad_replicas([
                                {
                                    'scope': scope,
                                    'name': name,
                                    'rse_id': rse_id,
                                    'state':
                                    BadFilesStatus.TEMPORARY_UNAVAILABLE
                                },
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                        except ReplicaNotFound:
                            session.rollback()  # pylint: disable=no-member
                            logging.warning(
                                '%s Replica %s:%s on RSEID %s does not exist anymore.',
                                prepend_str, scope, name, rse_id)
                            bulk_delete_bad_replicas([
                                {
                                    'scope': scope,
                                    'name': name,
                                    'rse_id': rse_id,
                                    'state':
                                    BadFilesStatus.TEMPORARY_UNAVAILABLE
                                },
                            ],
                                                     session=session)
                            session.commit()  # pylint: disable=no-member
                    session = get_session()
                except Exception:
                    session.rollback()  # pylint: disable=no-member
                    logging.critical('%s %s', prepend_str,
                                     str(traceback.format_exc()))
                    session = get_session()

        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        if once:
            break
        daemon_sleep(start_time=start_time,
                     sleep_time=sleep_time,
                     graceful_stop=graceful_stop)

    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info('%s Graceful stop requested', prepend_str)
    logging.info('%s Graceful stop done', prepend_str)
Ejemplo n.º 20
0
def poller(once=False,
           process=0, total_processes=1, thread=0, total_threads=1, activities=None, sleep_time=60,
           fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i) timeout (%s)' % (process, total_processes,
                                                                                              thread, total_threads,
                                                                                              db_bulk, timeout))

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)

    logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                hb['assign_thread'], hb['nr_threads'],
                                                                                db_bulk))

    activity_next_exe_time = defaultdict(time.time)
    threadPool = ThreadPool(total_threads)
    sleeping = False

    while not graceful_stop.is_set():

        try:
            hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            logging.debug('poller - thread (%i/%i)' % (hb['assign_thread'], hb['nr_threads']))

            if not sleeping:
                sleeping = True

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    time.sleep(1)
                    continue
                sleeping = False

                ts = time.time()
                logging.debug('%i:%i - start to poll transfers older than %i seconds for activity %s' % (process, hb['assign_thread'], older_than, activity))
                transfs = request.get_next_transfers(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                     state=[RequestState.SUBMITTED],
                                                     limit=db_bulk,
                                                     older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                                     process=process, total_processes=total_processes,
                                                     thread=hb['assign_thread'], total_threads=hb['nr_threads'],
                                                     activity=activity,
                                                     activity_shares=activity_shares)
                record_timer('daemons.conveyor.poller.000-get_next_transfers', (time.time() - ts) * 1000)

                if transfs:
                    logging.debug('%i:%i - polling %i transfers for activity %s' % (process, hb['assign_thread'], len(transfs), activity))

                xfers_ids = {}
                for transf in transfs:
                    if not transf['external_host'] in xfers_ids:
                        xfers_ids[transf['external_host']] = []
                    xfers_ids[transf['external_host']].append(transf['external_id'])

                for external_host in xfers_ids:
                    for xfers in chunks(xfers_ids[external_host], fts_bulk):
                        # poll transfers
                        # xfer_requests = makeRequests(common.poll_transfers, args_list=[((external_host, xfers, process, thread), {})])
                        xfer_requests = makeRequests(common.poll_transfers, args_list=[((), {'external_host': external_host, 'xfers': xfers, 'process': process, 'thread': hb['assign_thread'], 'timeout': timeout})])
                        [threadPool.putRequest(xfer_req) for xfer_req in xfer_requests]
                threadPool.wait()

                if len(transfs) < db_bulk / 2:
                    logging.info("%i:%i - only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (process, hb['assign_thread'], len(transfs), activity, db_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except:
            logging.critical("%i:%i - %s" % (process, hb['assign_thread'], traceback.format_exc()))

        if once:
            break

    logging.info('%i:%i - graceful stop requests' % (process, hb['assign_thread']))

    threadPool.dismissWorkers(total_threads, do_join=True)
    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%i:%i - graceful stop done' % (process, hb['assign_thread']))
Ejemplo n.º 21
0
def throttler(once=False, sleep_time=600):
    """
    Main loop to check rse transfer limits.
    """

    logging.info('Throttler starting')

    executable = 'throttler'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)

    logging.info('Throttler started - thread (%i/%i) timeout (%s)' %
                 (hb['assign_thread'], hb['nr_threads'], sleep_time))

    current_time = time.time()
    while not graceful_stop.is_set():

        try:
            hb = heartbeat.live(executable,
                                hostname,
                                pid,
                                hb_thread,
                                older_than=3600)
            logging.info('Throttler - thread (%i/%i)' %
                         (hb['assign_thread'], hb['nr_threads']))
            if hb['assign_thread'] != 0:
                logging.info(
                    'Throttler thread id is not 0, will sleep. Only thread 0 will work'
                )
                while time.time() < current_time + sleep_time:
                    time.sleep(1)
                    if graceful_stop.is_set() or once:
                        break
                current_time = time.time()
                continue

            logging.info("Throttler thread %s - schedule requests" %
                         hb['assign_thread'])
            schedule_requests()

            while time.time() < current_time + sleep_time:
                time.sleep(1)
                if graceful_stop.is_set() or once:
                    break
            current_time = time.time()
        except:
            logging.critical('Throtter thread %s - %s' %
                             (hb['assign_thread'], traceback.format_exc()))

        if once:
            break

    logging.info('Throtter thread %s - graceful stop requested' %
                 (hb['assign_thread']))

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('Throtter thread %s - graceful stop done' %
                 (hb['assign_thread']))
Ejemplo n.º 22
0
def reaper(rses, chunk_size=100, once=False, scheme=None, sleep_time=300):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param sleep_time: Thread sleep time after each chunk of work.
    """

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    hash_executable = hashlib.sha256(
        (sys.argv[0] + ''.join(rses)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    # heartbeat
    heartbeat = live(executable=executable,
                     hostname=hostname,
                     pid=pid,
                     thread=thread,
                     hash_executable=hash_executable)
    prepend_str = 'dark-reaper [%i/%i] : ' % (heartbeat['assign_thread'],
                                              heartbeat['nr_threads'])
    logger = formatted_logger(logging.log, prepend_str + '%s')
    logger(logging.INFO, 'Starting Dark Reaper on RSEs: %s', ', '.join(rses))

    if not once:
        logger(logging.INFO, 'Waiting for heartbeat synchonization')
        GRACEFUL_STOP.wait(
            10
        )  # To prevent running on the same partition if all the reapers restart at the same time

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable=executable,
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             hash_executable=hash_executable)
            prepend_str = 'dark-reaper [%i/%i] : ' % (
                heartbeat['assign_thread'], heartbeat['nr_threads'])
            logger = formatted_logger(logging.log, prepend_str + '%s')
            logger(logging.INFO, 'Live gives {0[heartbeat]}'.format(locals()))
            nothing_to_do = True
            start_time = time.time()

            rses_to_process = list(
                set(rses) & set(list_rses_with_quarantined_replicas()))
            random.shuffle(rses_to_process)
            for rse_id in rses_to_process:
                # The following query returns the list of real replicas (deleted_replicas) and list of dark replicas (dark_replicas)
                # Real replicas can be directly removed from the quarantine table
                deleted_replicas, dark_replicas = list_quarantined_replicas(
                    rse_id=rse_id,
                    limit=chunk_size,
                    worker_number=heartbeat['assign_thread'],
                    total_workers=heartbeat['nr_threads'])

                rse_info = rsemgr.get_rse_info(rse_id=rse_id)
                rse = rse_info['rse']
                prot = rsemgr.create_protocol(rse_info,
                                              'delete',
                                              scheme=scheme)

                heartbeat = live(executable=executable,
                                 hostname=hostname,
                                 pid=pid,
                                 thread=thread,
                                 hash_executable=hash_executable)
                prepend_str = 'dark-reaper [%i/%i] : ' % (
                    heartbeat['assign_thread'], heartbeat['nr_threads'])
                logger = formatted_logger(logging.log, prepend_str + '%s')
                try:
                    prot.connect()
                    for replica in dark_replicas:
                        nothing_to_do = False
                        scope = ''
                        if replica['scope']:
                            scope = replica['scope'].external
                        try:
                            pfn = str(
                                list(
                                    rsemgr.lfns2pfns(
                                        rse_settings=rse_info,
                                        lfns=[{
                                            'scope': scope,
                                            'name': replica['name'],
                                            'path': replica['path']
                                        }],
                                        operation='delete',
                                        scheme=scheme).values())[0])
                            logger(logging.INFO,
                                   'Deletion ATTEMPT of %s:%s as %s on %s',
                                   scope, replica['name'], pfn, rse)
                            start = time.time()
                            prot.delete(pfn)
                            duration = time.time() - start
                            logger(
                                logging.INFO,
                                'Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                                scope, replica['name'], pfn, rse, duration)
                            payload = {
                                'scope': scope,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica.get('bytes') or 0,
                                'bytes': replica.get('bytes') or 0,
                                'url': pfn,
                                'duration': duration,
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-done', payload)
                            deleted_replicas.append(replica)
                        except SourceNotFound:
                            err_msg = (
                                'Deletion NOTFOUND of %s:%s as %s on %s' %
                                (scope, replica['name'], pfn, rse))
                            logger(logging.WARNING, err_msg)
                            deleted_replicas.append(replica)
                        except (ServiceUnavailable, RSEAccessDenied,
                                ResourceTemporaryUnavailable) as error:
                            err_msg = (
                                'Deletion NOACCESS of %s:%s as %s on %s: %s' %
                                (scope, replica['name'], pfn, rse, str(error)))
                            logger(logging.WARNING, err_msg)
                            payload = {
                                'scope': scope,
                                'name': replica['name'],
                                'rse': rse,
                                'rse_id': rse_id,
                                'file-size': replica['bytes'] or 0,
                                'bytes': replica['bytes'] or 0,
                                'url': pfn,
                                'reason': str(error),
                                'protocol': prot.attributes['scheme']
                            }
                            if replica['scope'].vo != 'def':
                                payload['vo'] = replica['scope'].vo
                            add_message('deletion-failed', payload)

                        except Exception:
                            logging.critical(traceback.format_exc())
                finally:
                    prot.close()

                delete_quarantined_replicas(rse_id=rse_id,
                                            replicas=deleted_replicas)

                if once:
                    break

            if once:
                break

            if nothing_to_do:
                logger(logging.INFO, 'Nothing to do')
                daemon_sleep(start_time=start_time,
                             sleep_time=sleep_time,
                             graceful_stop=GRACEFUL_STOP)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except Exception:
            logging.critical(traceback.format_exc())

    die(executable=executable,
        hostname=hostname,
        pid=pid,
        thread=thread,
        hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
Ejemplo n.º 23
0
def hermes2(once=False, thread=0, bulk=1000, sleep_time=10):
    """
    Creates a Hermes2 Worker that can submit messages to different services (InfluXDB, ElasticSearch, ActiveMQ)
    The list of services need to be define in the config service in the hermes section.
    The list of endpoints need to be defined in rucio.cfg in the hermes section.

    :param once:       Run only once.
    :param thread:     Thread number at startup.
    :param bulk:       The number of requests to process.
    :param sleep_time: Time between two cycles.
    """

    executable = 'hermes2'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable,
                           hostname=hostname,
                           pid=pid,
                           thread=hb_thread)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])

    # Make an initial heartbeat so that all daemons have the correct worker number on the next try
    GRACEFUL_STOP.wait(10)
    heart_beat = heartbeat.live(executable,
                                hostname,
                                pid,
                                hb_thread,
                                older_than=3600)

    try:
        services_list = get('hermes', 'services_list')
        services_list = services_list.split(',')
    except ConfigNotFound:
        logging.debug('No services found, exiting')
        sys.exit(1)
    if 'influx' in services_list:
        try:
            influx_endpoint = config_get('hermes', 'influxdb_endpoint', False,
                                         None)
            if not influx_endpoint:
                logging.error(
                    'InfluxDB defined in the services list, but no endpoint can be find. Exiting'
                )
                sys.exit(1)
        except Exception as err:
            logging.error(err)
    if 'elastic' in services_list:
        try:
            elastic_endpoint = config_get('hermes', 'elastic_endpoint', False,
                                          None)
            if not elastic_endpoint:
                logging.error(
                    'Elastic defined in the services list, but no endpoint can be find. Exiting'
                )
                sys.exit(1)
        except Exception as err:
            logging.error(err)
    if 'activemq' in services_list:
        try:
            # activemq_endpoint = config_get('hermes', 'activemq_endpoint', False, None)
            conns, destination, username, password, use_ssl = setup_activemq(
                prepend_str)
            if not conns:
                logging.error(
                    'ActiveMQ defined in the services list, cannot be setup')
                sys.exit(1)
        except Exception as err:
            logging.error(err)

    while not GRACEFUL_STOP.is_set():
        message_status = copy.deepcopy(services_list)
        message_statuses = {}
        stime = time.time()
        try:
            start_time = time.time()
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])
            messages = retrieve_messages(
                bulk=bulk,
                thread=heart_beat['assign_thread'],
                total_threads=heart_beat['nr_threads'])

            if messages:
                for message in messages:
                    message_statuses[message['id']] = copy.deepcopy(
                        services_list)
                logging.debug(
                    '%s Retrieved %i messages retrieved in %s seconds',
                    prepend_str, len(messages),
                    time.time() - start_time)

                if 'influx' in message_status:
                    t_time = time.time()
                    logging.debug('%s Will submit to influxDB', prepend_str)
                    try:
                        state = aggregate_to_influx(messages=messages,
                                                    bin_size='1m',
                                                    endpoint=influx_endpoint,
                                                    prepend_str=prepend_str)
                    except Exception as error:
                        logging.error('%s Error sending to InfluxDB : %s',
                                      prepend_str, str(error))
                        state = 500
                    if state in [204, 200]:
                        logging.info(
                            '%s Messages successfully submitted to influxDB in %s seconds',
                            prepend_str,
                            time.time() - t_time)
                        for message in messages:
                            message_statuses[message['id']].remove('influx')
                    else:
                        logging.info('%s Failure to submit to influxDB',
                                     prepend_str)

                if 'elastic' in message_status:
                    t_time = time.time()
                    try:
                        state = submit_to_elastic(messages=messages,
                                                  endpoint=elastic_endpoint,
                                                  prepend_str=prepend_str)
                    except Exception as error:
                        logging.error('%s Error sending to Elastic : %s',
                                      prepend_str, str(error))
                        state = 500
                    if state in [200, 204]:
                        logging.info(
                            '%s Messages successfully submitted to elastic in %s seconds',
                            prepend_str,
                            time.time() - t_time)
                        for message in messages:
                            message_statuses[message['id']].remove('elastic')
                    else:
                        logging.info('%s Failure to submit to elastic',
                                     prepend_str)

                if 'emails' in message_status:
                    t_time = time.time()
                    try:
                        to_delete = deliver_emails(messages=messages,
                                                   prepend_str=prepend_str)
                        logging.info(
                            '%s Messages successfully submitted by emails in %s seconds',
                            prepend_str,
                            time.time() - t_time)
                        for message_id in to_delete:
                            message_statuses[message_id].remove('emails')
                    except Exception as error:
                        logging.error('%s Error sending email : %s',
                                      prepend_str, str(error))

                if 'activemq' in message_status:
                    t_time = time.time()
                    try:
                        to_delete = deliver_to_activemq(
                            messages=messages,
                            conns=conns,
                            destination=destination,
                            username=username,
                            password=password,
                            use_ssl=use_ssl,
                            prepend_str=prepend_str)
                        logging.info(
                            '%s Messages successfully submitted to ActiveMQ in %s seconds',
                            prepend_str,
                            time.time() - t_time)
                        for message_id in to_delete:
                            message_statuses[message_id].remove('activemq')
                    except Exception as error:
                        logging.error('%s Error sending to ActiveMQ : %s',
                                      prepend_str, str(error))

                to_delete = []
                to_update = {}
                for message in messages:
                    status = message_statuses[message['id']]
                    if not status:
                        to_delete.append({
                            'id': message['id'],
                            'created_at': message['created_at'],
                            'updated_at': message['created_at'],
                            'payload': str(message['payload']),
                            'event_type': message['event_type']
                        })
                    else:
                        status = ",".join(status)
                        if status not in to_update:
                            to_update[status] = []
                        to_update[status].append({
                            'id':
                            message['id'],
                            'created_at':
                            message['created_at'],
                            'updated_at':
                            message['created_at'],
                            'payload':
                            str(message['payload']),
                            'event_type':
                            message['event_type']
                        })
                logging.info('%s Deleting %s messages', prepend_str,
                             len(to_delete))
                delete_messages(messages=to_delete)
                for status in to_update:
                    logging.info(
                        '%s Failure to submit %s messages to %s. Will update the message status',
                        prepend_str, str(len(to_update[status])), status)
                    update_messages_services(messages=to_update[status],
                                             services=status)

            if once:
                break
            tottime = time.time() - stime
            if tottime < sleep_time:
                logging.info('%s Will sleep for %s seconds', prepend_str,
                             sleep_time - tottime)
                time.sleep(sleep_time - tottime)

        except:
            logging.critical(traceback.format_exc())
Ejemplo n.º 24
0
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable='rucio-undertaker', hostname=hostname)

    paused_dids = {}  # {(scope, name): datetime}

    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable='rucio-undertaker',
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             older_than=6000)
            logging.info(
                'Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'
                .format(locals()))

            # Refresh paused dids
            iter_paused_dids = deepcopy(paused_dids)
            for key in iter_paused_dids:
                if datetime.utcnow() > paused_dids[key]:
                    del paused_dids[key]

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'] +
                                     1,
                                     total_workers=heartbeat['nr_threads'],
                                     limit=10000)

            dids = [
                did for did in dids
                if (did['scope'], did['name']) not in paused_dids
            ]

            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.',
                             worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Receive %s dids to delete',
                                 worker_number, len(chunk))
                    delete_dids(dids=chunk,
                                account=InternalAccount('root'),
                                expire_rules=True)
                    logging.info('Undertaker(%s): Delete %s dids',
                                 worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids',
                                   delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except (DatabaseException, DatabaseError,
                        UnsupportedOperation) as e:
                    if match('.*ORA-00054.*', str(e.args[0])) or match(
                            '.*55P03.*', str(e.args[0])) or match(
                                '.*3572.*', str(e.args[0])):
                        for did in chunk:
                            paused_dids[(
                                did['scope'],
                                did['name'])] = datetime.utcnow() + timedelta(
                                    seconds=randint(600, 2400))
                        record_counter(
                            'undertaker.delete_dids.exceptions.LocksDetected')
                        logging.warning(
                            'undertaker[%s/%s]: Locks detected for chunk',
                            heartbeat['assign_thread'],
                            heartbeat['nr_threads'])
                    else:
                        logging.error('Undertaker(%s): Got database error %s.',
                                      worker_number, str(e))
        except:
            logging.critical(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    die(executable='rucio-undertaker',
        hostname=hostname,
        pid=pid,
        thread=thread)
    logging.info('Undertaker(%s): graceful stop requested', worker_number)
    logging.info('Undertaker(%s): graceful stop done', worker_number)
Ejemplo n.º 25
0
def atropos(thread,
            bulk,
            date_check,
            dry_run=True,
            grace_period=86400,
            once=True,
            unlock=False):
    """
    Creates an Atropos Worker that gets a list of rules which have an eol_at expired and delete them.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param grace_period: The grace_period for the rules.
    :param once: Run only once.
    """

    sleep_time = 60

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    now = datetime.datetime.now()
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    time.sleep(10)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1,
                                         hb['nr_threads'])
    logging.debug(prepend_str + 'Starting worker')
    summary = {}
    lifetime_exceptions = {}
    for excep in rucio.core.lifetime_exception.list_exceptions(
            exception_id=None,
            states=[
                LifetimeExceptionsState.APPROVED,
            ],
            session=None):
        if '%s:%s' % (excep['scope'],
                      excep['name']) not in lifetime_exceptions:
            lifetime_exceptions['%s:%s' %
                                (excep['scope'],
                                 excep['name'])] = excep['expires_at']
        elif lifetime_exceptions['%s:%s' %
                                 (excep['scope'],
                                  excep['name'])] < excep['expires_at']:
            lifetime_exceptions['%s:%s' %
                                (excep['scope'],
                                 excep['name'])] = excep['expires_at']
    logging.debug(prepend_str +
                  '%s active exceptions' % len(lifetime_exceptions))
    if not dry_run and date_check > now:
        logging.error(
            prepend_str +
            'Atropos cannot run in non-dry-run mode for date in the future')
    else:
        while not GRACEFUL_STOP.is_set():

            hb = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1,
                                                 hb['nr_threads'])

            stime = time.time()
            try:
                rules = get_rules_beyond_eol(date_check,
                                             thread,
                                             hb['nr_threads'] - 1,
                                             session=None)
                logging.info(prepend_str + '%s rules to process' %
                             (len(rules)))
                for rule_idx, rule in enumerate(rules, start=1):
                    did = '%s:%s' % (rule.scope, rule.name)
                    logging.debug(prepend_str +
                                  'Working on rule %s on DID %s on %s' %
                                  (rule.id, did, rule.rse_expression))

                    if (rule_idx % 1000) == 0:
                        logging.info(prepend_str + '%s/%s rules processed' %
                                     (rule_idx, len(rules)))
                    # We compute the expected eol_at
                    rses = parse_expression(rule.rse_expression)
                    eol_at = rucio.core.lifetime_exception.define_eol(
                        rule.scope, rule.name, rses)
                    if eol_at != rule.eol_at:
                        logging.warning(
                            prepend_str +
                            'The computed eol %s differs from the one recorded %s for rule %s on %s at %s'
                            % (eol_at, rule.eol_at, rule.id, did,
                               rule.rse_expression))
                        try:
                            update_rule(rule.id, options={'eol_at': eol_at})
                        except RuleNotFound:
                            logging.warning(prepend_str +
                                            'Cannot find rule %s on DID %s' %
                                            (rule.id, did))

                    # Check the exceptions
                    if did in lifetime_exceptions:
                        if eol_at > lifetime_exceptions[did]:
                            logging.info(
                                prepend_str +
                                'Rule %s on DID %s on %s has longer expiration date than the one requested : %s'
                                % (rule.id, did, rule.rse_expression,
                                   lifetime_exceptions[did]))
                        else:
                            # If eol_at < requested extension, update eol_at
                            logging.info(
                                prepend_str +
                                'Updating rule %s on DID %s on %s according to the exception till %s'
                                % (rule.id, did, rule.rse_expression,
                                   lifetime_exceptions[did]))
                            eol_at = lifetime_exceptions[did]
                            try:
                                update_rule(rule.id,
                                            options={
                                                'eol_at':
                                                lifetime_exceptions[did]
                                            })
                            except RuleNotFound:
                                logging.warning(
                                    prepend_str +
                                    'Cannot find rule %s on DID %s' %
                                    (rule.id, did))

                    # Now check that the new eol_at is expired
                    if eol_at and eol_at < date_check:
                        no_locks = True
                        for lock in get_dataset_locks(rule.scope, rule.name):
                            if lock['rule_id'] == rule[4]:
                                no_locks = False
                                if lock['rse'] not in summary:
                                    summary[lock['rse']] = {}
                                if did not in summary[lock['rse']]:
                                    summary[lock['rse']][did] = {
                                        'length': lock['length'] or 0,
                                        'bytes': lock['bytes'] or 0
                                    }
                        if no_locks:
                            logging.warning(
                                prepend_str +
                                'Cannot find a lock for rule %s on DID %s' %
                                (rule.id, did))
                        if not dry_run:
                            logging.info(
                                prepend_str +
                                'Setting %s seconds lifetime for rule %s' %
                                (grace_period, rule.id))
                            options = {'lifetime': grace_period}
                            if rule.locked and unlock:
                                logging.info(prepend_str + 'Unlocking rule %s',
                                             rule.id)
                                options['locked'] = False
                            try:
                                update_rule(rule.id, options=options)
                            except RuleNotFound:
                                logging.warning(
                                    prepend_str +
                                    'Cannot find rule %s on DID %s' %
                                    (rule.id, did))
            except Exception:
                exc_type, exc_value, exc_traceback = exc_info()
                logging.critical(''.join(
                    format_exception(exc_type, exc_value,
                                     exc_traceback)).strip())

            for rse in summary:
                tot_size, tot_files, tot_datasets = 0, 0, 0
                for did in summary[rse]:
                    tot_datasets += 1
                    tot_files += summary[rse][did].get('length', 0)
                    tot_size += summary[rse][did].get('bytes', 0)
                logging.info(
                    prepend_str +
                    'For RSE %s %s datasets will be deleted representing %s files and %s bytes'
                    % (rse, tot_datasets, tot_files, tot_size))

            if once:
                break
            else:
                tottime = time.time() - stime
                if tottime < sleep_time:
                    logging.info(prepend_str + 'Will sleep for %s seconds' %
                                 (str(sleep_time - tottime)))
                    time.sleep(sleep_time - tottime)
                    continue

        logging.info(prepend_str + 'Graceful stop requested')
        heartbeat.die(executable, hostname, pid, hb_thread)
        logging.info(prepend_str + 'Graceful stop done')
Ejemplo n.º 26
0
def receiver(id, total_threads=1, full_mode=False):
    """
    Main loop to consume messages from the FTS3 producer.
    """

    logging.info('receiver starting in full mode: %s' % full_mode)

    executable = 'conveyor-receiver'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()

    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heartbeat.live(executable, hostname, pid, hb_thread)

    brokers_alias = []
    brokers_resolved = []
    try:
        brokers_alias = [
            b.strip()
            for b in config_get('messaging-fts3', 'brokers').split(',')
        ]
    except Exception:
        raise Exception('Could not load brokers from configuration')

    logging.info('resolving broker dns alias: %s' % brokers_alias)

    brokers_resolved = []
    for broker in brokers_alias:
        addrinfos = socket.getaddrinfo(broker, 0, socket.AF_INET, 0,
                                       socket.IPPROTO_TCP)
        brokers_resolved.extend(ai[4][0] for ai in addrinfos)

    logging.info('brokers resolved to %s', brokers_resolved)

    logging.info('checking authentication method')
    use_ssl = True
    try:
        use_ssl = config_get_bool('messaging-fts3', 'use_ssl')
    except:
        logging.info(
            'could not find use_ssl in configuration -- please update your rucio.cfg'
        )

    port = config_get_int('messaging-fts3', 'port')
    vhost = config_get('messaging-fts3',
                       'broker_virtual_host',
                       raise_exception=False)
    if not use_ssl:
        username = config_get('messaging-fts3', 'username')
        password = config_get('messaging-fts3', 'password')
        port = config_get_int('messaging-fts3', 'nonssl_port')

    conns = []
    for broker in brokers_resolved:
        if not use_ssl:
            logging.info('setting up username/password authentication: %s' %
                         broker)
            con = stomp.Connection12(host_and_ports=[(broker, port)],
                                     use_ssl=False,
                                     vhost=vhost,
                                     reconnect_attempts_max=999)
        else:
            logging.info('setting up ssl cert/key authentication: %s' % broker)
            con = stomp.Connection12(
                host_and_ports=[(broker, port)],
                use_ssl=True,
                ssl_key_file=config_get('messaging-fts3', 'ssl_key_file'),
                ssl_cert_file=config_get('messaging-fts3', 'ssl_cert_file'),
                vhost=vhost,
                reconnect_attempts_max=999)
        conns.append(con)

    logging.info('receiver started')

    while not graceful_stop.is_set():

        heartbeat.live(executable, hostname, pid, hb_thread)

        for conn in conns:

            if not conn.is_connected():
                logging.info('connecting to %s' %
                             conn.transport._Transport__host_and_ports[0][0])
                record_counter('daemons.messaging.fts3.reconnect.%s' %
                               conn.transport._Transport__host_and_ports[0]
                               [0].split('.')[0])

                conn.set_listener(
                    'rucio-messaging-fts3',
                    Receiver(
                        broker=conn.transport._Transport__host_and_ports[0],
                        id=id,
                        total_threads=total_threads,
                        full_mode=full_mode))
                conn.start()
                if not use_ssl:
                    conn.connect(username, password, wait=True)
                else:
                    conn.connect(wait=True)
                conn.subscribe(destination=config_get('messaging-fts3',
                                                      'destination'),
                               id='rucio-messaging-fts3',
                               ack='auto')

        time.sleep(1)

    logging.info('receiver graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('receiver graceful stop done')
Ejemplo n.º 27
0
def declare_suspicious_replicas_bad(once=False,
                                    younger_than=3,
                                    nattempts=10,
                                    rse_expression='MOCK',
                                    max_replicas_per_rse=100):
    """
    Main loop to check for available replicas which are labeled as suspicious

    Gets a list of suspicious replicas that are listed as AVAILABLE in 'replicas' table
    and available on other RSE. Finds surls of these replicas and declares them as bad.

    :param once: If True, the loop is run just once, otherwise the daemon continues looping until stopped.
    :param younger_than: The number of days since which bad_replicas table will be searched
                         for finding replicas declared 'SUSPICIOUS' at a specific RSE ('rse_expression'),
                         but 'AVAILABLE' on other RSE(s).
    :param nattempts: The minimum number of appearances in the bad_replica DB table
                      in order to appear in the resulting list of replicas for recovery.
    :param rse_expression: Search for suspicious replicas on RSEs matching the 'rse_expression'.
    :param max_replicas_per_rse: Maximum number of replicas which are allowed to be labeled as bad per RSE.
                                 If more is found, processing is skipped and warning is printed.
    :returns: None
    """

    # assembling the worker name identifier ('executable') including the rses from <rse_expression>
    # in order to have the possibility to detect a start of a second instance with the same set of RSES

    executable = argv[0]
    rses = []
    for rse in parse_expression(expression=rse_expression):
        rses.append(rse['rse'])
    rses.sort()
    executable += ' --rse-expression ' + str(rses)

    sanity_check(executable=executable, hostname=socket.gethostname())

    # make an initial heartbeat - expected only one replica-recoverer thread on one node
    # heartbeat mechanism is used in this daemon only for information purposes
    # (due to expected low load, the actual DB query does not filter the result based on worker number)
    live(executable=executable,
         hostname=socket.gethostname(),
         pid=os.getpid(),
         thread=threading.current_thread())

    # wait a moment in case all workers started at the same time
    GRACEFUL_STOP.wait(1)

    while not GRACEFUL_STOP.is_set():
        try:
            # issuing the heartbeat for a second time to make all workers aware of each other (there is only 1 worker allowed for this daemon)
            heartbeat = live(executable=executable,
                             hostname=socket.gethostname(),
                             pid=os.getpid(),
                             thread=threading.current_thread())
            total_workers = heartbeat['nr_threads']
            worker_number = heartbeat['assign_thread'] + 1

            # there is only 1 worker allowed for this daemon
            if total_workers != 1:
                logging.error(
                    'replica_recoverer: Another running instance on %s has been detected. Stopping gracefully.',
                    socket.gethostname())
                die(executable=executable,
                    hostname=socket.gethostname(),
                    pid=os.getpid(),
                    thread=threading.current_thread())
                break

            start = time.time()

            logging.info(
                'replica_recoverer[%i/%i]: ready to query replicas at RSE %s,'
                +
                ' reported suspicious in the last %i days at least %i times which are available on other RSEs.',
                worker_number, total_workers, rse_expression, younger_than,
                nattempts)

            getfileskwargs = {
                'younger_than': younger_than,
                'nattempts': nattempts,
                'exclude_states': ['B', 'R', 'D', 'L', 'T'],
                'available_elsewhere': True,
                'is_suspicious': True
            }

            recoverable_replicas = get_suspicious_files(
                rse_expression, **getfileskwargs)

            logging.info(
                'replica_recoverer[%i/%i]: suspicious replica query took %.2f seconds, total of %i replicas were found.',
                worker_number, total_workers,
                time.time() - start, len(recoverable_replicas))

            if not recoverable_replicas and not once:
                logging.info(
                    'replica_recoverer[%i/%i]: found %i recoverable suspicious replicas. Sleeping for 60 seconds.',
                    worker_number, total_workers, len(recoverable_replicas))
                GRACEFUL_STOP.wait(60)
            else:
                logging.info(
                    'replica_recoverer[%i/%i]: looking for replica surls.',
                    worker_number, total_workers)

                start = time.time()
                surls_to_recover = {
                }  # dictionary of { rse1: [surl1, surl2, ... ], rse2: ... }
                cnt_surl_not_found = 0
                for replica in recoverable_replicas:
                    scope = replica['scope']
                    name = replica['name']
                    rse = replica['rse']
                    rse_id = replica['rse_id']
                    if GRACEFUL_STOP.is_set():
                        break
                    if rse_id not in surls_to_recover:
                        surls_to_recover[rse_id] = []
                    # for each suspicious replica, we get its surl through the list_replicas function
                    surl_not_found = True
                    for rep in list_replicas([{'scope': scope, 'name': name}]):
                        for site in rep['rses']:
                            if site == rse_id:
                                surls_to_recover[rse_id].append(
                                    rep['rses'][site][0])
                                surl_not_found = False
                    if surl_not_found:
                        cnt_surl_not_found += 1
                        logging.warning(
                            'replica_recoverer[%i/%i]: skipping suspicious replica %s on %s, no surls were found.',
                            worker_number, total_workers, name, rse)

                logging.info(
                    'replica_recoverer[%i/%i]: found %i/%i surls (took %.2f seconds), declaring them as bad replicas now.',
                    worker_number, total_workers,
                    len(recoverable_replicas) - cnt_surl_not_found,
                    len(recoverable_replicas),
                    time.time() - start)

                for rse_id in surls_to_recover:
                    logging.info(
                        'replica_recoverer[%i/%i]: ready to declare %i bad replica(s) on %s: %s.',
                        worker_number, total_workers,
                        len(surls_to_recover[rse_id]), rse,
                        str(surls_to_recover[rse_id]))
                    if len(surls_to_recover[rse_id]) > max_replicas_per_rse:
                        logging.warning(
                            'replica_recoverer[%i/%i]: encountered more than %i suspicious replicas (%s) on %s. Please investigate.',
                            worker_number, total_workers, max_replicas_per_rse,
                            str(len(surls_to_recover[rse_id])), rse)
                    else:
                        declare_bad_file_replicas(
                            pfns=surls_to_recover[rse_id],
                            reason='Suspicious. Automatic recovery.',
                            issuer=InternalAccount('root'),
                            status=BadFilesStatus.BAD,
                            session=None)
                        logging.info(
                            'replica_recoverer[%i/%i]: finished declaring bad replicas on %s.',
                            worker_number, total_workers, rse)

        except (DatabaseException, DatabaseError) as err:
            if match('.*QueuePool.*', str(err.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('replica.recoverer.exceptions.%s',
                               err.__class__.__name__)
            elif match('.*ORA-03135.*', str(err.args[0])):
                logging.warning(traceback.format_exc())
                record_counter('replica.recoverer.exceptions.%s',
                               err.__class__.__name__)
            else:
                logging.critical(traceback.format_exc())
                record_counter('replica.recoverer.exceptions.%s',
                               err.__class__.__name__)
        except Exception as err:
            logging.critical(traceback.format_exc())
            record_counter('replica.recoverer.exceptions.%s',
                           err.__class__.__name__)
        if once:
            break

    die(executable=executable,
        hostname=socket.gethostname(),
        pid=os.getpid(),
        thread=threading.current_thread())
    logging.info('replica_recoverer[%i/%i]: graceful stop done', worker_number,
                 total_workers)
Ejemplo n.º 28
0
def poller(once=False,
           activities=None,
           sleep_time=60,
           fts_bulk=100,
           db_bulk=1000,
           older_than=60,
           activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    executable = sys.argv[0]
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if activity_shares:
        activities.sort()
        executable += '--activity_shares' + str(activity_shares)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])
    logging.info(prepend_str +
                 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' %
                 (db_bulk, fts_bulk, timeout))

    time.sleep(
        10
    )  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1,
                                         heart_beat['nr_threads'])

    logging.info(prepend_str + 'Poller started')

    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] +
                                                 1, heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                start_time = time.time()
                logging.debug(
                    prepend_str +
                    'Start to poll transfers older than %i seconds for activity %s'
                    % (older_than, activity))
                transfs = request_core.get_next(
                    request_type=[
                        RequestType.TRANSFER, RequestType.STAGEIN,
                        RequestType.STAGEOUT
                    ],
                    state=[RequestState.SUBMITTED],
                    limit=db_bulk,
                    older_than=datetime.datetime.utcnow() -
                    datetime.timedelta(seconds=older_than),
                    total_workers=heart_beat['nr_threads'] - 1,
                    worker_number=heart_beat['assign_thread'],
                    mode_all=False,
                    hash_variable='id',
                    activity=activity,
                    activity_shares=activity_shares)

                record_timer('daemons.conveyor.poller.000-get_next',
                             (time.time() - start_time) * 1000)

                if transfs:
                    logging.debug(prepend_str +
                                  'Polling %i transfers for activity %s' %
                                  (len(transfs), activity))

                xfers_ids = {}
                for transf in transfs:
                    if not transf['external_host'] in xfers_ids:
                        xfers_ids[transf['external_host']] = []
                    xfers_ids[transf['external_host']].append(
                        (transf['external_id'], transf['request_id']))

                for external_host in xfers_ids:
                    external_ids = list(
                        {trf[0]
                         for trf in xfers_ids[external_host]})
                    request_ids = [trf[1] for trf in xfers_ids[external_host]]
                    for xfers in chunks(external_ids, fts_bulk):
                        # poll transfers
                        poll_transfers(external_host=external_host,
                                       xfers=xfers,
                                       prepend_str=prepend_str,
                                       request_ids=request_ids,
                                       timeout=timeout)

                if len(transfs) < fts_bulk / 2:
                    logging.info(
                        prepend_str +
                        "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds"
                        % (len(transfs), activity, fts_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time(
                        ) + sleep_time
        except Exception:
            logging.critical(prepend_str + "%s" % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
Ejemplo n.º 29
0
def finisher(once=False,
             sleep_time=60,
             activities=None,
             bulk=100,
             db_bulk=1000,
             partition_wait_time=10):
    """
    Main loop to update the replicas and rules based on finished requests.
    """
    try:
        conveyor_config = {item[0]: item[1] for item in items('conveyor')}
    except ConfigNotFound:
        logging.log(logging.INFO, 'No configuration found for conveyor')
        conveyor_config = {}

    # Get suspicious patterns
    suspicious_patterns = conveyor_config.get('suspicious_pattern', [])
    if suspicious_patterns:
        pattern = str(suspicious_patterns)
        patterns = pattern.split(",")
        suspicious_patterns = [re.compile(pat.strip()) for pat in patterns]
    logging.log(
        logging.DEBUG, "Suspicious patterns: %s" %
        [pat.pattern for pat in suspicious_patterns])

    retry_protocol_mismatches = conveyor_config.get(
        'retry_protocol_mismatches', False)

    executable = 'conveyor-finisher'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-finisher[%i/%i] : ' % (heart_beat['assign_thread'],
                                              heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Finisher starting - db_bulk(%i) bulk (%i)', db_bulk,
           bulk)

    if partition_wait_time:
        graceful_stop.wait(partition_wait_time)
    while not graceful_stop.is_set():

        start_time = time.time()
        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prefix = 'conveyor-finisher[%i/%i] : ' % (
                heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')
            logger(logging.DEBUG, 'Starting new cycle')
            if activities is None:
                activities = [None]

            for activity in activities:
                logger(logging.DEBUG, 'Working on activity %s', activity)
                time1 = time.time()
                reqs = request_core.get_next(
                    request_type=[
                        RequestType.TRANSFER, RequestType.STAGEIN,
                        RequestType.STAGEOUT
                    ],
                    state=[
                        RequestState.DONE, RequestState.FAILED,
                        RequestState.LOST, RequestState.SUBMITTING,
                        RequestState.SUBMISSION_FAILED,
                        RequestState.NO_SOURCES,
                        RequestState.ONLY_TAPE_SOURCES,
                        RequestState.MISMATCH_SCHEME
                    ],
                    limit=db_bulk,
                    older_than=datetime.datetime.utcnow(),
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    mode_all=True,
                    hash_variable='rule_id')
                record_timer('daemons.conveyor.finisher.000-get_next',
                             (time.time() - time1) * 1000)
                time2 = time.time()
                if reqs:
                    logger(logging.DEBUG,
                           'Updating %i requests for activity %s', len(reqs),
                           activity)

                for chunk in chunks(reqs, bulk):
                    try:
                        time3 = time.time()
                        __handle_requests(chunk,
                                          suspicious_patterns,
                                          retry_protocol_mismatches,
                                          logger=logger)
                        record_timer(
                            'daemons.conveyor.finisher.handle_requests',
                            (time.time() - time3) * 1000 /
                            (len(chunk) if chunk else 1))
                        record_counter(
                            'daemons.conveyor.finisher.handle_requests',
                            len(chunk))
                    except Exception as error:
                        logger(logging.WARNING, '%s', str(error))
                if reqs:
                    logger(
                        logging.DEBUG,
                        'Finish to update %s finished requests for activity %s in %s seconds',
                        len(reqs), activity,
                        time.time() - time2)

        except (DatabaseException, DatabaseError) as error:
            if re.match('.*ORA-00054.*', error.args[0]) or re.match(
                    '.*ORA-00060.*',
                    error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logger(logging.WARNING,
                       'Lock detected when handling request - skipping: %s',
                       str(error))
            else:
                logger(logging.ERROR, 'Exception', exc_info=True)
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)

        if once:
            break

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logger(logging.INFO, 'Sleeping for a while :  %s seconds',
                   (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)

    logger(logging.INFO, 'Graceful stop requests')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logger(logging.INFO, 'Graceful stop done')
Ejemplo n.º 30
0
def kronos_file(thread=0, dataset_queue=None, sleep_time=60):
    """
    Main loop to consume tracer reports.
    """

    logging.info('kronos_file[%i/?] starting', thread)

    executable = 'kronos-file'
    hostname = socket.gethostname()
    pid = getpid()
    hb_thread = current_thread()

    chunksize = config_get_int('tracer-kronos', 'chunksize')
    prefetch_size = config_get_int('tracer-kronos', 'prefetch_size')
    subscription_id = config_get('tracer-kronos', 'subscription_id')
    try:
        bad_files_patterns = []
        pattern = config_get(section='kronos',
                             option='bad_files_patterns',
                             session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            bad_files_patterns.append(re.compile(pat.strip()))
    except (NoOptionError, NoSectionError, RuntimeError):
        bad_files_patterns = []
    except Exception as error:
        logging.log(logging.ERROR,
                    'kronos_file[%i/?] Failed to get bad_file_patterns %s',
                    thread, str(error))
        bad_files_patterns = []

    use_ssl = True
    try:
        use_ssl = config_get_bool('tracer-kronos', 'use_ssl')
    except Exception:
        pass

    if not use_ssl:
        username = config_get('tracer-kronos', 'username')
        password = config_get('tracer-kronos', 'password')

    excluded_usrdns = set(
        config_get('tracer-kronos', 'excluded_usrdns').split(','))
    vhost = config_get('tracer-kronos',
                       'broker_virtual_host',
                       raise_exception=False)

    brokers_alias = [
        b.strip() for b in config_get('tracer-kronos', 'brokers').split(',')
    ]
    port = config_get_int('tracer-kronos', 'port')
    reconnect_attempts = config_get_int('tracer-kronos', 'reconnect_attempts')
    ssl_key_file = config_get('tracer-kronos',
                              'ssl_key_file',
                              raise_exception=False)
    ssl_cert_file = config_get('tracer-kronos',
                               'ssl_cert_file',
                               raise_exception=False)

    sanity_check(executable=executable, hostname=hostname)
    while not graceful_stop.is_set():
        start_time = time()
        heart_beat = live(executable, hostname, pid, hb_thread)
        prepend_str = 'kronos-file[%i/%i] ' % (heart_beat['assign_thread'],
                                               heart_beat['nr_threads'])
        logger = formatted_logger(logging.log, prepend_str + '%s')
        conns = get_stomp_brokers(brokers=brokers_alias,
                                  port=port,
                                  use_ssl=use_ssl,
                                  vhost=vhost,
                                  reconnect_attempts=reconnect_attempts,
                                  ssl_key_file=ssl_key_file,
                                  ssl_cert_file=ssl_cert_file,
                                  timeout=sleep_time,
                                  logger=logger)
        for conn in conns:
            if not conn.is_connected():
                logger(
                    logging.INFO, 'connecting to %s' %
                    str(conn.transport._Transport__host_and_ports[0]))
                record_counter(
                    'daemons.tracer.kronos.reconnect.{host}',
                    labels={
                        'host': conn.transport._Transport__host_and_ports[0][0]
                    })
                conn.set_listener(
                    'rucio-tracer-kronos',
                    AMQConsumer(
                        broker=conn.transport._Transport__host_and_ports[0],
                        conn=conn,
                        queue=config_get('tracer-kronos', 'queue'),
                        chunksize=chunksize,
                        subscription_id=subscription_id,
                        excluded_usrdns=excluded_usrdns,
                        dataset_queue=dataset_queue,
                        bad_files_patterns=bad_files_patterns,
                        logger=logger))
                if not use_ssl:
                    conn.connect(username, password)
                else:
                    conn.connect()
                conn.subscribe(
                    destination=config_get('tracer-kronos', 'queue'),
                    ack='client-individual',
                    id=subscription_id,
                    headers={'activemq.prefetchSize': prefetch_size})

        tottime = time() - start_time
        if tottime < sleep_time:
            logger(logging.INFO,
                   'Will sleep for %s seconds' % (sleep_time - tottime))
            sleep(sleep_time - tottime)

    logger(logging.INFO, 'graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except Exception:
            pass

    die(executable=executable, hostname=hostname, pid=pid, thread=thread)
    logger(logging.INFO, 'graceful stop done')