Example #1
0
    def on_message(self, headers, message):
        record_counter('daemons.conveyor.consumer.message_all')

        if 'vo' not in headers or headers['vo'] != 'atlas':
            return

        msg = json.loads(message[:-1])  # message always ends with an unparseable EOT character
        if 'job_metadata' in msg.keys() \
           and isinstance(msg['job_metadata'], dict) \
           and 'issuer' in msg['job_metadata'].keys() \
           and str(msg['job_metadata']['issuer']) == str('rucio'):

            if 'job_m_replica' in msg.keys() and 'job_state' in msg.keys() \
               and (str(msg['job_m_replica']) == str('false') or (str(msg['job_m_replica']) == str('true') and str(msg['job_state']) != str('ACTIVE'))):

                response = {'new_state': None,
                            'transfer_id': msg.get('tr_id').split("__")[-1],
                            'job_state': msg.get('t_final_transfer_state', None),
                            'src_url': msg.get('src_url', None),
                            'dst_url': msg.get('dst_url', None),
                            'duration': (float(msg.get('tr_timestamp_complete', 0)) - float(msg.get('tr_timestamp_start', 0)))/1000,
                            'reason': msg.get('t__error_message', None),
                            'scope': msg['job_metadata'].get('scope', None),
                            'name': msg['job_metadata'].get('name', None),
                            'src_rse': msg['job_metadata'].get('src_rse', None),
                            'dst_rse': msg['job_metadata'].get('dst_rse', None),
                            'request_id': msg['job_metadata'].get('request_id', None),
                            'activity': msg['job_metadata'].get('activity', None),
                            'dest_rse_id': msg['job_metadata'].get('dest_rse_id', None),
                            'previous_attempt_id': msg['job_metadata'].get('previous_attempt_id', None),
                            'adler32': msg['job_metadata'].get('adler32', None),
                            'md5': msg['job_metadata'].get('md5', None),
                            'filesize': msg['job_metadata'].get('filesize', None),
                            'external_host': msg.get('endpnt', None),
                            'job_m_replica': msg.get('job_m_replica', None),
                            'details': {'files': msg['job_metadata']}}

                record_counter('daemons.conveyor.consumer.message_rucio')
                if str(msg['t_final_transfer_state']) == str(FTSCompleteState.OK):
                    response['new_state'] = RequestState.DONE
                elif str(msg['t_final_transfer_state']) == str(FTSCompleteState.ERROR):
                    response['new_state'] = RequestState.FAILED

                try:
                    if response['new_state']:
                        logging.debug('DID %s:%s FROM %s TO %s STATE %s' % (msg['job_metadata']['scope'],
                                                                            msg['job_metadata']['name'],
                                                                            msg['job_metadata']['src_rse'],
                                                                            msg['job_metadata']['dst_rse'],
                                                                            response['new_state']))

                        update_request_state(response)
                        record_counter('daemons.conveyor.consumer.update_request_state')
                except:
                    logging.critical(traceback.format_exc())
Example #2
0
def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000, older_than=60):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                 thread, total_threads,
                                                                                 bulk))

    logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                thread, total_threads,
                                                                                bulk))

    while not graceful_stop.is_set():

        try:
            ts = time.time()

            logging.debug('%i:%i - start to poll requests older than %i seconds' % (process, thread, older_than))
            reqs = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                    state=RequestState.SUBMITTED,
                                    limit=bulk,
                                    older_than=datetime.datetime.utcnow()-datetime.timedelta(seconds=older_than),
                                    process=process, total_processes=total_processes,
                                    thread=thread, total_threads=total_threads)
            record_timer('daemons.conveyor.poller.000-get_next', (time.time()-ts)*1000)

            if reqs:
                logging.debug('%i:%i - polling %i requests' % (process, thread, len(reqs)))

            if not reqs or reqs == []:
                if once:
                    break
                logging.debug("no requests found. will sleep 60 seconds")
                time.sleep(60)  # Only sleep if there is nothing to do
                continue

            for xfers in chunks(reqs, bulk):
                try:
                    req_ids = {}
                    for req in xfers:
                        record_counter('daemons.conveyor.poller.query_request')
                        if not req['external_host'] in req_ids:
                            req_ids[req['external_host']] = []
                        req_ids[req['external_host']].append((req['request_id'], req['external_id']))

                    responses = {}
                    for external_host in req_ids:
                        ts = time.time()
                        resps = request.bulk_query_requests(external_host, req_ids[external_host], 'fts3')
                        record_timer('daemons.conveyor.poller.001-bulk_query_requests', (time.time()-ts)*1000/len(req_ids[external_host]))
                        responses = dict(responses.items() + resps.items())

                    for external_host in req_ids:
                        for request_id, external_id in req_ids[external_host]:
                            response = responses[request_id]
                            if isinstance(response, Exception):
                                logging.critical("Failed to poll request(%s) with FTS(%s) job (%s): %s" % (request_id, external_host, external_id, responses[request_id]))
                                record_counter('daemons.conveyor.poller.query_request_exception')
                                response = {'new_state': None, 'request_id': request_id, 'transfer_id': external_id, 'job_state': None}
                            common.update_request_state(response)
                            record_counter('daemons.conveyor.poller.update_request_state')
                            if response['new_state'] == RequestState.LOST:
                                record_counter('daemons.conveyor.poller.request_lost')
                except:
                    logging.critical(traceback.format_exc())

        except:
            logging.critical(traceback.format_exc())

        if once:
            return

    logging.debug('%i:%i - graceful stop requests' % (process, thread))

    logging.debug('%i:%i - graceful stop done' % (process, thread))
Example #3
0
    def on_message(self, headers, message):
        record_counter('daemons.conveyor.receiver.message_all')

        if 'vo' not in headers or headers['vo'] != 'atlas':
            return

        msg = json.loads(
            message[:-1]
        )  # message always ends with an unparseable EOT character
        if 'job_metadata' in msg.keys() \
           and isinstance(msg['job_metadata'], dict) \
           and 'issuer' in msg['job_metadata'].keys() \
           and str(msg['job_metadata']['issuer']) == str('rucio'):

            if 'job_m_replica' in msg.keys() and 'job_state' in msg.keys() \
               and (str(msg['job_m_replica']) == str('false') or (str(msg['job_m_replica']) == str('true') and str(msg['job_state']) != str('ACTIVE'))):

                if 'request_id' in msg['job_metadata']:
                    # submitted by old submitter
                    response = {
                        'new_state':
                        None,
                        'transfer_id':
                        msg.get('tr_id').split("__")[-1],
                        'job_state':
                        msg.get('t_final_transfer_state', None),
                        'src_url':
                        msg.get('src_url', None),
                        'dst_url':
                        msg.get('dst_url', None),
                        'transferred_at':
                        datetime.datetime.utcfromtimestamp(
                            float(msg.get('tr_timestamp_complete', 0)) / 1000),
                        'duration':
                        (float(msg.get('tr_timestamp_complete', 0)) -
                         float(msg.get('tr_timestamp_start', 0))) / 1000,
                        'reason':
                        msg.get('t__error_message', None),
                        'scope':
                        msg['job_metadata'].get('scope', None),
                        'name':
                        msg['job_metadata'].get('name', None),
                        'src_rse':
                        msg['job_metadata'].get('src_rse', None),
                        'dst_rse':
                        msg['job_metadata'].get('dst_rse', None),
                        'request_id':
                        msg['job_metadata'].get('request_id', None),
                        'activity':
                        msg['job_metadata'].get('activity', None),
                        'dest_rse_id':
                        msg['job_metadata'].get('dest_rse_id', None),
                        'previous_attempt_id':
                        msg['job_metadata'].get('previous_attempt_id', None),
                        'adler32':
                        msg['job_metadata'].get('adler32', None),
                        'md5':
                        msg['job_metadata'].get('md5', None),
                        'filesize':
                        msg['job_metadata'].get('filesize', None),
                        'external_host':
                        msg.get('endpnt', None),
                        'job_m_replica':
                        msg.get('job_m_replica', None),
                        'details': {
                            'files': msg['job_metadata']
                        }
                    }
                else:
                    # for new submitter, file_metadata replace the job_metadata
                    response = {
                        'new_state':
                        None,
                        'transfer_id':
                        msg.get('tr_id').split("__")[-1],
                        'job_state':
                        msg.get('t_final_transfer_state', None),
                        'src_url':
                        msg.get('src_url', None),
                        'dst_url':
                        msg.get('dst_url', None),
                        'started_at':
                        datetime.datetime.utcfromtimestamp(
                            float(msg.get('tr_timestamp_start', 0)) / 1000),
                        'transferred_at':
                        datetime.datetime.utcfromtimestamp(
                            float(msg.get('tr_timestamp_complete', 0)) / 1000),
                        'duration':
                        (float(msg.get('tr_timestamp_complete', 0)) -
                         float(msg.get('tr_timestamp_start', 0))) / 1000,
                        'reason':
                        msg.get('t__error_message', None),
                        'scope':
                        msg['file_metadata'].get('scope', None),
                        'name':
                        msg['file_metadata'].get('name', None),
                        'src_type':
                        msg['file_metadata'].get('src_type', None),
                        'dst_type':
                        msg['file_metadata'].get('dst_type', None),
                        'src_rse':
                        msg['file_metadata'].get('src_rse', None),
                        'dst_rse':
                        msg['file_metadata'].get('dst_rse', None),
                        'request_id':
                        msg['file_metadata'].get('request_id', None),
                        'activity':
                        msg['file_metadata'].get('activity', None),
                        'src_rse_id':
                        msg['file_metadata'].get('src_rse_id', None),
                        'dest_rse_id':
                        msg['file_metadata'].get('dest_rse_id', None),
                        'previous_attempt_id':
                        msg['file_metadata'].get('previous_attempt_id', None),
                        'adler32':
                        msg['file_metadata'].get('adler32', None),
                        'md5':
                        msg['file_metadata'].get('md5', None),
                        'filesize':
                        msg['file_metadata'].get('filesize', None),
                        'external_host':
                        msg.get('endpnt', None),
                        'job_m_replica':
                        msg.get('job_m_replica', None),
                        'details': {
                            'files': msg['file_metadata']
                        }
                    }

                record_counter('daemons.conveyor.receiver.message_rucio')
                if str(msg['t_final_transfer_state']) == str(
                        FTSCompleteState.OK):
                    response['new_state'] = RequestState.DONE
                elif str(msg['t_final_transfer_state']) == str(
                        FTSCompleteState.ERROR):
                    response['new_state'] = RequestState.FAILED

                try:
                    if response['new_state']:
                        logging.info(
                            'RECEIVED DID %s:%s FROM %s TO %s REQUEST %s TRANSFER_ID %s STATE %s'
                            % (response['scope'], response['name'],
                               response['src_rse'], response['dst_rse'],
                               response['request_id'], response['transfer_id'],
                               response['new_state']))

                        if self.__full_mode:
                            ret = common.update_request_state(response)
                            record_counter(
                                'daemons.conveyor.receiver.update_request_state.%s'
                                % ret)
                        else:
                            try:
                                logging.debug("Update request %s update time" %
                                              response['request_id'])
                                set_transfer_update_time(
                                    response['external_host'],
                                    response['transfer_id'],
                                    datetime.datetime.utcnow() -
                                    datetime.timedelta(hours=24))
                                record_counter(
                                    'daemons.conveyor.receiver.set_transfer_update_time'
                                )
                            except Exception, e:
                                logging.debug(
                                    "Failed to update transfer's update time: %s"
                                    % str(e))
                except:
                    logging.critical(traceback.format_exc())
Example #4
0
def poller_latest(external_hosts, once=False, last_nhours=1, fts_wait=1800):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()

    logging.info('polling latest %s hours on hosts: %s' %
                 (last_nhours, external_hosts))
    if external_hosts:
        if type(external_hosts) == str:
            external_hosts = [external_hosts]

    while not graceful_stop.is_set():

        try:
            heartbeat.live(executable, hostname, pid, hb_thread)

            start_time = time.time()
            for external_host in external_hosts:
                logging.debug('polling latest %s hours on host: %s' %
                              (last_nhours, external_host))
                ts = time.time()
                resps = None
                state = [
                    str(FTSState.FINISHED),
                    str(FTSState.FAILED),
                    str(FTSState.FINISHEDDIRTY),
                    str(FTSState.CANCELED)
                ]
                try:
                    resps = request.query_latest(external_host,
                                                 state=state,
                                                 last_nhours=last_nhours)
                except:
                    logging.error(traceback.format_exc())
                record_timer('daemons.conveyor.poller_latest.000-query_latest',
                             (time.time() - ts) * 1000)

                if resps:
                    logging.info('poller_latest - polling %i requests' %
                                 (len(resps)))

                if not resps or resps == []:
                    if once:
                        break
                    logging.info("no requests found. will sleep 60 seconds")
                    continue

                for resp in resps:
                    try:
                        ret = common.update_request_state(resp)
                        # if True, really update request content; if False, only touch request
                        record_counter(
                            'daemons.conveyor.poller_latest.update_request_state.%s'
                            % ret)
                    except:
                        logging.error(traceback.format_exc())
            if once:
                break

            time_left = fts_wait - abs(time.time() - start_time)
            if time_left > 0:
                logging.debug(
                    "Waiting %s seconds until next FTS terminal state retrieval"
                    % time_left)
                time.sleep(time_left)
        except RequestException, e:
            logging.error("Failed to contact FTS server: %s" % (str(e)))
        except: