def _perform_request_update(self, msg, session=None, logger=logging.log):
    external_host = msg.get('endpnt', None)
    request_id = msg['file_metadata'].get('request_id', None)
    try:
        tt_status_report = FTS3CompletionMessageTransferStatusReport(external_host, request_id=request_id, fts_message=msg)
        if tt_status_report.get_db_fields_to_update(session=session, logger=logger):
            logging.info('RECEIVED %s', tt_status_report)

            if self.__full_mode:
                ret = request_core.update_request_state(tt_status_report, session=session, logger=logger)
                record_counter('daemons.conveyor.receiver.update_request_state.{updated}', labels={'updated': ret})
            else:
                try:
                    logging.debug("Update request %s update time" % request_id)
                    set_transfer_update_time(external_host, tt_status_report.external_id,
                                             datetime.datetime.utcnow() - datetime.timedelta(hours=24),
                                             session=session)
                    record_counter('daemons.conveyor.receiver.set_transfer_update_time')
                except Exception as error:
                    logging.debug("Failed to update transfer's update time: %s" % str(error))
    except Exception:
        logging.critical(traceback.format_exc())
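# Minimal sketch (not part of the daemon): the two fields that
# _perform_request_update reads directly, with placeholder values; the rest of
# the FTS monitoring payload is simply forwarded as fts_message. 'receiver' is
# assumed to be an already-constructed instance of the enclosing class.
sample_msg = {
    'endpnt': 'https://fts3.example.org:8446',   # hypothetical FTS endpoint
    'file_metadata': {
        'request_id': 'aabbccdd00112233',        # hypothetical Rucio request id
        # remaining per-file metadata travels inside fts_message
    },
    # remaining raw FTS monitoring fields, also forwarded as fts_message
}
receiver._perform_request_update(sample_msg)     # session/logger fall back to defaults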
def poller_latest(external_hosts, once=False, last_nhours=1, fts_wait=1800):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()

    logging.info('polling latest %s hours on hosts: %s' % (last_nhours, external_hosts))
    if external_hosts:
        if isinstance(external_hosts, str):
            external_hosts = [external_hosts]

    while not graceful_stop.is_set():
        try:
            heartbeat.live(executable, hostname, pid, hb_thread)

            start_time = time.time()
            for external_host in external_hosts:
                logging.debug('polling latest %s hours on host: %s' % (last_nhours, external_host))
                ts = time.time()
                resps = None
                state = [str(FTSState.FINISHED), str(FTSState.FAILED), str(FTSState.FINISHEDDIRTY), str(FTSState.CANCELED)]
                try:
                    resps = transfer.query_latest(external_host, state=state, last_nhours=last_nhours)
                except Exception:
                    logging.error(traceback.format_exc())
                record_timer('daemons.conveyor.poller_latest.000-query_latest', (time.time() - ts) * 1000)

                if resps:
                    logging.info('poller_latest - polling %i requests' % (len(resps)))

                if not resps:
                    if once:
                        break
                    logging.info("no terminal requests found on %s" % external_host)
                    continue

                for resp in resps:
                    try:
                        ret = request.update_request_state(resp)
                        # if True, really update request content; if False, only touch request
                        record_counter('daemons.conveyor.poller_latest.update_request_state.%s' % ret)
                    except Exception:
                        logging.error(traceback.format_exc())

            if once:
                break

            time_left = fts_wait - abs(time.time() - start_time)
            if time_left > 0:
                logging.debug("Waiting %s seconds until next FTS terminal state retrieval" % time_left)
                graceful_stop.wait(time_left)
        except RequestException as error:
            logging.error("Failed to contact FTS server: %s" % (str(error)))
        except Exception:
            logging.critical(traceback.format_exc())
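# Hypothetical launcher for poller_latest, shown for illustration only: run the
# loop in a thread and stop it via the module-level graceful_stop event that
# the loop above waits on. The FTS host URL is a placeholder.
def run_poller_latest_example():
    poller = threading.Thread(target=poller_latest,
                              kwargs={'external_hosts': ['https://fts3.example.org:8446'],
                                      'last_nhours': 1,
                                      'fts_wait': 1800})
    poller.start()
    try:
        while poller.is_alive():
            poller.join(timeout=3.0)
    except KeyboardInterrupt:
        graceful_stop.set()   # wakes graceful_stop.wait() and ends the while loop
        poller.join()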
class Receiver(object):

    def __init__(self, broker, id, total_threads, full_mode=False):
        self.__broker = broker
        self.__id = id
        self.__total_threads = total_threads
        self.__full_mode = full_mode

    def on_error(self, headers, message):
        record_counter('daemons.conveyor.receiver.error')
        logging.error('[%s] %s' % (self.__broker, message))

    def on_message(self, headers, message):
        record_counter('daemons.conveyor.receiver.message_all')

        try:
            msg = json.loads(message)
        except Exception:
            # Note: I am not sure if this is needed anymore, this was due to an unparsable EOT character
            msg = json.loads(message[:-1])

        if 'vo' not in msg or msg['vo'] != get_policy():
            return

        if 'job_metadata' in msg \
           and isinstance(msg['job_metadata'], dict) \
           and 'issuer' in msg['job_metadata'] \
           and str(msg['job_metadata']['issuer']) == 'rucio':

            if 'job_m_replica' in msg and 'job_state' in msg \
               and (str(msg['job_m_replica']).lower() == 'false'
                    or (str(msg['job_m_replica']).lower() == 'true' and str(msg['job_state']) != 'ACTIVE')):

                if 'request_id' in msg['job_metadata']:
                    # submitted by old submitter
                    response = {'new_state': None,
                                'transfer_id': msg.get('tr_id').split("__")[-1],
                                'job_state': msg.get('t_final_transfer_state', None),
                                'src_url': msg.get('src_url', None),
                                'dst_url': msg.get('dst_url', None),
                                'transferred_at': datetime.datetime.utcfromtimestamp(float(msg.get('tr_timestamp_complete', 0)) / 1000),
                                'duration': (float(msg.get('tr_timestamp_complete', 0)) - float(msg.get('tr_timestamp_start', 0))) / 1000,
                                'reason': msg.get('t__error_message', None),
                                'scope': msg['job_metadata'].get('scope', None),
                                'name': msg['job_metadata'].get('name', None),
                                'src_rse': msg['job_metadata'].get('src_rse', None),
                                'dst_rse': msg['job_metadata'].get('dst_rse', None),
                                'request_id': msg['job_metadata'].get('request_id', None),
                                'activity': msg['job_metadata'].get('activity', None),
                                'dest_rse_id': msg['job_metadata'].get('dest_rse_id', None),
                                'previous_attempt_id': msg['job_metadata'].get('previous_attempt_id', None),
                                'adler32': msg['job_metadata'].get('adler32', None),
                                'md5': msg['job_metadata'].get('md5', None),
                                'filesize': msg['job_metadata'].get('filesize', None),
                                'external_host': msg.get('endpnt', None),
                                'job_m_replica': msg.get('job_m_replica', None),
                                'details': {'files': msg['job_metadata']}}
                else:
                    # for the new submitter, file_metadata replaces job_metadata
                    response = {'new_state': None,
                                'transfer_id': msg.get('tr_id').split("__")[-1],
                                'job_state': msg.get('t_final_transfer_state', None),
                                'src_url': msg.get('src_url', None),
                                'dst_url': msg.get('dst_url', None),
                                'started_at': datetime.datetime.utcfromtimestamp(float(msg.get('tr_timestamp_start', 0)) / 1000),
                                'transferred_at': datetime.datetime.utcfromtimestamp(float(msg.get('tr_timestamp_complete', 0)) / 1000),
                                'duration': (float(msg.get('tr_timestamp_complete', 0)) - float(msg.get('tr_timestamp_start', 0))) / 1000,
                                'reason': msg.get('t__error_message', None),
                                'scope': msg['file_metadata'].get('scope', None),
                                'name': msg['file_metadata'].get('name', None),
                                'src_type': msg['file_metadata'].get('src_type', None),
                                'dst_type': msg['file_metadata'].get('dst_type', None),
                                'src_rse': msg['file_metadata'].get('src_rse', None),
                                'dst_rse': msg['file_metadata'].get('dst_rse', None),
                                'request_id': msg['file_metadata'].get('request_id', None),
                                'activity': msg['file_metadata'].get('activity', None),
                                'src_rse_id': msg['file_metadata'].get('src_rse_id', None),
                                'dest_rse_id': msg['file_metadata'].get('dest_rse_id', None),
                                'previous_attempt_id': msg['file_metadata'].get('previous_attempt_id', None),
                                'adler32': msg['file_metadata'].get('adler32', None),
                                'md5': msg['file_metadata'].get('md5', None),
                                'filesize': msg['file_metadata'].get('filesize', None),
                                'external_host': msg.get('endpnt', None),
                                'job_m_replica': msg.get('job_m_replica', None),
                                'details': {'files': msg['file_metadata']}}

                record_counter('daemons.conveyor.receiver.message_rucio')
                if str(msg['t_final_transfer_state']) == str(FTSCompleteState.OK):
                    response['new_state'] = RequestState.DONE
                elif str(msg['t_final_transfer_state']) == str(FTSCompleteState.ERROR):
                    response['new_state'] = RequestState.FAILED

                try:
                    if response['new_state']:
                        logging.info('RECEIVED DID %s:%s FROM %s TO %s REQUEST %s TRANSFER_ID %s STATE %s'
                                     % (response['scope'], response['name'], response['src_rse'], response['dst_rse'],
                                        response['request_id'], response['transfer_id'], response['new_state']))

                        if self.__full_mode:
                            ret = request.update_request_state(response)
                            record_counter('daemons.conveyor.receiver.update_request_state.%s' % ret)
                        else:
                            try:
                                logging.debug("Update request %s update time" % response['request_id'])
                                set_transfer_update_time(response['external_host'], response['transfer_id'],
                                                         datetime.datetime.utcnow() - datetime.timedelta(hours=24))
                                record_counter('daemons.conveyor.receiver.set_transfer_update_time')
                            except Exception as error:
                                logging.debug("Failed to update transfer's update time: %s" % str(error))
                except Exception:
                    logging.critical(traceback.format_exc())
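# Sketch of wiring this Receiver to a message broker with stomp.py; illustrative
# only, assuming a stomp.py version whose listener callbacks take
# (headers, message) as above. Broker host, port, and topic are placeholders.
import stomp

conn = stomp.Connection([('broker.example.org', 61013)])
conn.set_listener('rucio-fts-receiver',
                  Receiver(broker='broker.example.org', id=0, total_threads=1, full_mode=False))
conn.connect(wait=True)
conn.subscribe(destination='/topic/transfer.fts_monitoring_complete', id='1', ack='auto')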
def poll_transfers(external_host, xfers, prepend_str='', request_ids=None, timeout=None):
    """
    Poll a list of transfers from an FTS server

    :param external_host: The FTS server to query from.
    :param xfers: List of transfers to poll.
    :param prepend_str: String to prepend to log messages.
    :param request_ids: Ids of the requests this worker is allowed to update.
    :param timeout: Timeout.
    """
    try:
        if TRANSFER_TOOL == 'mock':
            logging.debug(prepend_str + 'Setting %s transfer requests status to DONE per mock tool' % (len(xfers)))
            for task_id in xfers:
                ret = transfer_core.update_transfer_state(external_host=None, transfer_id=task_id, state=RequestState.DONE)
                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)
            return
        try:
            tss = time.time()
            logging.info(prepend_str + 'Polling %i transfers against %s with timeout %s' % (len(xfers), external_host, timeout))
            resps = transfer_core.bulk_query_transfers(external_host, xfers, TRANSFER_TOOL, timeout)
            record_timer('daemons.conveyor.poller.bulk_query_transfers', (time.time() - tss) * 1000 / len(xfers))
        except TransferToolTimeout as error:
            logging.error(prepend_str + str(error))
            return
        except TransferToolWrongAnswer as error:
            logging.error(prepend_str + str(error))
            logging.error(prepend_str + 'Problem querying %s on %s. All jobs are being checked individually' % (str(xfers), external_host))
            for xfer in xfers:
                try:
                    logging.debug(prepend_str + 'Checking %s on %s' % (xfer, external_host))
                    status = transfer_core.bulk_query_transfers(external_host, [xfer, ], TRANSFER_TOOL, timeout)
                    if xfer in status and isinstance(status[xfer], Exception):
                        logging.error(prepend_str + 'Problem querying %s on %s. Error returned: %s' % (xfer, external_host, str(status[xfer])))
                except Exception as err:
                    logging.error(prepend_str + 'Problem querying %s on %s. Error returned: %s' % (xfer, external_host, str(err)))
                    break
            return
        except RequestException as error:
            logging.error(prepend_str + "Failed to contact FTS server: %s" % (str(error)))
            return
        except Exception:
            logging.error(prepend_str + "Failed to query FTS info: %s" % (traceback.format_exc()))
            return

        logging.debug(prepend_str + 'Polled %s transfer requests status in %s seconds' % (len(xfers), (time.time() - tss)))
        tss = time.time()
        logging.debug(prepend_str + 'Updating %s transfer requests status' % (len(xfers)))
        cnt = 0

        if TRANSFER_TOOL == 'globus':
            for task_id in resps:
                ret = transfer_core.update_transfer_state(external_host=None, transfer_id=task_id, state=resps[task_id])
                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)
        else:
            for transfer_id in resps:
                try:
                    transf_resp = resps[transfer_id]
                    # transf_resp is None: Lost.
                    #              is Exception: Failed to get fts job status.
                    #              is {}: No terminated jobs.
                    #              is {request_id: {file_status}}: terminated jobs.
                    if transf_resp is None:
                        transfer_core.update_transfer_state(external_host, transfer_id, RequestState.LOST, logging_prepend_str=prepend_str)
                        record_counter('daemons.conveyor.poller.transfer_lost')
                    elif isinstance(transf_resp, Exception):
                        logging.warning(prepend_str + "Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp))
                        record_counter('daemons.conveyor.poller.query_transfer_exception')
                    else:
                        for request_id in transf_resp:
                            if request_id in request_ids:
                                ret = request_core.update_request_state(transf_resp[request_id], logging_prepend_str=prepend_str)
                                # if True, really update request content; if False, only touch request
                                if ret:
                                    cnt += 1
                                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)

                    # Touch the transfer in any case. Otherwise, if one bulk transfer includes
                    # many requests and one of them is not terminated, the transfer will be polled again.
                    transfer_core.touch_transfer(external_host, transfer_id)
                except (DatabaseException, DatabaseError) as error:
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logging.warning(prepend_str + "Lock detected when handling request %s - skipping" % request_id)
                    else:
                        logging.error(traceback.format_exc())
        logging.debug(prepend_str + 'Finished updating %s transfer requests status (%i requests state changed) in %s seconds'
                      % (len(xfers), cnt, (time.time() - tss)))
    except Exception:
        logging.error(traceback.format_exc())
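# Illustrative caller (not the daemon's actual run loop): poll transfers in
# fixed-size chunks so a single FTS query stays bounded. The chunk size, host,
# and the all_xfers/known_request_ids variables are assumptions for the sketch.
FTS_BULK = 100                       # hypothetical bulk limit
for chunk_start in range(0, len(all_xfers), FTS_BULK):
    chunk = all_xfers[chunk_start:chunk_start + FTS_BULK]
    poll_transfers(external_host='https://fts3.example.org:8446',
                   xfers=chunk,
                   prepend_str='[poller] ',
                   request_ids=known_request_ids,   # set of request ids owned by this worker
                   timeout=120)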
def on_message(self, frame):
    record_counter('daemons.conveyor.receiver.message_all')

    msg = json.loads(frame.body)

    if not self.__all_vos:
        if 'vo' not in msg or msg['vo'] != get_policy():
            return

    if 'job_metadata' in msg \
       and isinstance(msg['job_metadata'], dict) \
       and 'issuer' in msg['job_metadata'] \
       and str(msg['job_metadata']['issuer']) == 'rucio':

        if 'job_state' in msg \
           and (str(msg['job_state']) != 'ACTIVE'
                or (str(msg['job_state']) == 'ACTIVE'
                    and 'job_m_replica' in msg
                    and str(msg['job_m_replica']).lower() == 'true')):
            response = {'new_state': None,
                        'transfer_id': msg.get('tr_id').split("__")[-1],
                        'job_state': msg.get('t_final_transfer_state', None),
                        'src_url': msg.get('src_url', None),
                        'dst_url': msg.get('dst_url', None),
                        'started_at': datetime.datetime.utcfromtimestamp(float(msg.get('tr_timestamp_start', 0)) / 1000),
                        'transferred_at': datetime.datetime.utcfromtimestamp(float(msg.get('tr_timestamp_complete', 0)) / 1000),
                        'duration': (float(msg.get('tr_timestamp_complete', 0)) - float(msg.get('tr_timestamp_start', 0))) / 1000,
                        'reason': msg.get('t__error_message', None),
                        'scope': msg['file_metadata'].get('scope', None),
                        'name': msg['file_metadata'].get('name', None),
                        'src_type': msg['file_metadata'].get('src_type', None),
                        'dst_type': msg['file_metadata'].get('dst_type', None),
                        'src_rse': msg['file_metadata'].get('src_rse', None),
                        'dst_rse': msg['file_metadata'].get('dst_rse', None),
                        'request_id': msg['file_metadata'].get('request_id', None),
                        'activity': msg['file_metadata'].get('activity', None),
                        'src_rse_id': msg['file_metadata'].get('src_rse_id', None),
                        'dest_rse_id': msg['file_metadata'].get('dest_rse_id', None),
                        'previous_attempt_id': msg['file_metadata'].get('previous_attempt_id', None),
                        'adler32': msg['file_metadata'].get('adler32', None),
                        'md5': msg['file_metadata'].get('md5', None),
                        'filesize': msg['file_metadata'].get('filesize', None),
                        'external_host': msg.get('endpnt', None),
                        'job_m_replica': msg.get('job_m_replica', None),
                        'details': {'files': msg['file_metadata']}}

            record_counter('daemons.conveyor.receiver.message_rucio')
            if str(msg['t_final_transfer_state']) == FTS_COMPLETE_STATE.OK:  # pylint:disable=no-member
                response['new_state'] = RequestState.DONE
            elif str(msg['t_final_transfer_state']) == FTS_COMPLETE_STATE.ERROR:  # pylint:disable=no-member
                response['new_state'] = RequestState.FAILED

            try:
                if response['new_state']:
                    logging.info('RECEIVED DID %s:%s FROM %s TO %s REQUEST %s TRANSFER_ID %s STATE %s'
                                 % (response['scope'], response['name'], response['src_rse'], response['dst_rse'],
                                    response['request_id'], response['transfer_id'], response['new_state']))

                    if self.__full_mode:
                        ret = request.update_request_state(response)
                        record_counter('daemons.conveyor.receiver.update_request_state.%s' % ret)
                    else:
                        try:
                            logging.debug("Update request %s update time" % response['request_id'])
                            set_transfer_update_time(response['external_host'], response['transfer_id'],
                                                     datetime.datetime.utcnow() - datetime.timedelta(hours=24))
                            record_counter('daemons.conveyor.receiver.set_transfer_update_time')
                        except Exception as error:
                            logging.debug("Failed to update transfer's update time: %s" % str(error))
            except Exception:
                logging.critical(traceback.format_exc())
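# Minimal sketch for exercising on_message in isolation (e.g. in a test),
# assuming a stomp.py version where listeners receive a Frame object. All
# field values are invented; 'Ok' mirrors the FTS_COMPLETE_STATE.OK comparison
# above, and the 'vo' value must match get_policy() unless __all_vos is set.
from stomp.utils import Frame

body = json.dumps({
    'vo': 'atlas',
    'job_state': 'FINISHED',                      # non-ACTIVE, so the message is processed
    't_final_transfer_state': 'Ok',
    'tr_id': '1234__deadbeef',
    'tr_timestamp_start': 1600000000000,          # milliseconds since the epoch
    'tr_timestamp_complete': 1600000060000,
    'endpnt': 'https://fts3.example.org:8446',
    'job_metadata': {'issuer': 'rucio'},
    'file_metadata': {'request_id': 'aabbccdd', 'scope': 'mock', 'name': 'file.example'},
})
receiver.on_message(Frame('MESSAGE', {}, body))   # 'receiver' built elsewhere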
def _poll_transfers(external_host, transfers_by_eid, vo, timeout, logger):
    """
    Helper function for poll_transfers which performs the actual polling and database update.
    """
    is_bulk = len(transfers_by_eid) > 1
    try:
        tss = time.time()
        logger(logging.INFO, 'Polling %i transfers against %s with timeout %s' % (len(transfers_by_eid), external_host, timeout))
        resps = transfer_core.bulk_query_transfers(external_host, transfers_by_eid, TRANSFER_TOOL, vo, timeout, logger)
        duration = time.time() - tss
        record_timer('daemons.conveyor.poller.bulk_query_transfers', duration * 1000 / len(transfers_by_eid))
        logger(logging.DEBUG, 'Polled %s transfer requests status in %s seconds' % (len(transfers_by_eid), duration))
    except TransferToolTimeout as error:
        logger(logging.ERROR, str(error))
        return
    except TransferToolWrongAnswer as error:
        logger(logging.ERROR, str(error))
        if is_bulk:
            raise  # The calling context will retry transfers one-by-one
        else:
            return
    except RequestException as error:
        logger(logging.ERROR, "Failed to contact FTS server: %s" % (str(error)))
        return
    except Exception:
        logger(logging.ERROR, "Failed to query FTS info", exc_info=True)
        return

    tss = time.time()
    logger(logging.DEBUG, 'Updating %s transfer requests status' % (len(transfers_by_eid)))
    cnt = 0

    request_ids = {t['request_id'] for t in itertools.chain.from_iterable(transfers_by_eid.values())}
    for transfer_id in resps:
        try:
            transf_resp = resps[transfer_id]
            # transf_resp is None: Lost.
            #              is Exception: Failed to get fts job status.
            #              is {}: No terminated jobs.
            #              is {request_id: {file_status}}: terminated jobs.
            if transf_resp is None:
                for transfer in transfers_by_eid[transfer_id]:
                    resp = transfer_core.fake_transfertool_response(transfer, new_state=RequestState.LOST, reason="The FTS job lost")
                    request_core.update_request_state(resp, logger=logger)
                record_counter('daemons.conveyor.poller.transfer_lost')
            elif isinstance(transf_resp, Exception):
                logger(logging.WARNING, "Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp))
                record_counter('daemons.conveyor.poller.query_transfer_exception')
            else:
                for request_id in request_ids.intersection(transf_resp):
                    ret = request_core.update_request_state(transf_resp[request_id], logger=logger)
                    # if True, really update request content; if False, only touch request
                    if ret:
                        cnt += 1
                    record_counter('daemons.conveyor.poller.update_request_state.{updated}', labels={'updated': ret})

            # Touch the transfer in any case. Otherwise, if one bulk transfer includes
            # many requests and one of them is not terminated, the transfer will be polled again.
            transfer_core.touch_transfer(external_host, transfer_id)
        except (DatabaseException, DatabaseError) as error:
            if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logger(logging.WARNING, "Lock detected when handling request %s - skipping" % transfer_id)
            else:
                logger(logging.ERROR, 'Exception', exc_info=True)
    logger(logging.DEBUG, 'Finished updating %s transfer requests status (%i requests state changed) in %s seconds'
           % (len(transfers_by_eid), cnt, (time.time() - tss)))
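# Sketch of the retry contract implied by the 'raise' above: a hypothetical
# caller first polls in bulk, then falls back to one-by-one polling when FTS
# returns a wrong answer for the bulk query (is_bulk is False for singletons,
# so the retries return instead of re-raising).
def poll_with_fallback(external_host, transfers_by_eid, vo, timeout, logger):
    try:
        _poll_transfers(external_host, transfers_by_eid, vo, timeout, logger)
    except TransferToolWrongAnswer:
        for external_id, transfers in transfers_by_eid.items():
            _poll_transfers(external_host, {external_id: transfers}, vo, timeout, logger)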
def poll_transfers(external_host, xfers, prepend_str='', request_ids=None, timeout=None):
    """
    Poll a list of transfers from an FTS server

    :param external_host: The FTS server to query from.
    :param xfers: List of transfers to poll.
    :param prepend_str: String to prepend to log messages.
    :param request_ids: Ids of the requests this worker is allowed to update.
    :param timeout: Timeout.
    """
    try:
        try:
            tss = time.time()
            logging.info(prepend_str + 'Polling %i transfers against %s with timeout %s' % (len(xfers), external_host, timeout))
            resps = transfer_core.bulk_query_transfers(external_host, xfers, 'fts3', timeout)
            record_timer('daemons.conveyor.poller.bulk_query_transfers', (time.time() - tss) * 1000 / len(xfers))
        except RequestException as error:
            logging.error(prepend_str + "Failed to contact FTS server: %s" % (str(error)))
            return
        except Exception:
            logging.error(prepend_str + "Failed to query FTS info: %s" % (traceback.format_exc()))
            return

        logging.debug(prepend_str + 'Polled %s transfer requests status in %s seconds' % (len(xfers), (time.time() - tss)))
        tss = time.time()
        logging.debug(prepend_str + 'Updating %s transfer requests status' % (len(xfers)))
        cnt = 0
        for transfer_id in resps:
            try:
                transf_resp = resps[transfer_id]
                # transf_resp is None: Lost.
                #              is Exception: Failed to get fts job status.
                #              is {}: No terminated jobs.
                #              is {request_id: {file_status}}: terminated jobs.
                if transf_resp is None:
                    transfer_core.update_transfer_state(external_host, transfer_id, RequestState.LOST, logging_prepend_str=prepend_str)
                    record_counter('daemons.conveyor.poller.transfer_lost')
                elif isinstance(transf_resp, Exception):
                    logging.warning(prepend_str + "Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp))
                    record_counter('daemons.conveyor.poller.query_transfer_exception')
                else:
                    for request_id in transf_resp:
                        if request_id in request_ids:
                            ret = request_core.update_request_state(transf_resp[request_id], logging_prepend_str=prepend_str)
                            # if True, really update request content; if False, only touch request
                            if ret:
                                cnt += 1
                            record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)

                # Touch the transfer in any case. Otherwise, if one bulk transfer includes
                # many requests and one of them is not terminated, the transfer will be polled again.
                transfer_core.touch_transfer(external_host, transfer_id)
            except (DatabaseException, DatabaseError) as error:
                if isinstance(error.args[0], tuple) and (match('.*ORA-00054.*', error.args[0][0])
                                                         or match('.*ORA-00060.*', error.args[0][0])
                                                         or 'ERROR 1205 (HY000)' in error.args[0][0]):
                    logging.warning(prepend_str + "Lock detected when handling request %s - skipping" % request_id)
                else:
                    logging.error(traceback.format_exc())
        logging.debug(prepend_str + 'Finished updating %s transfer requests status (%i requests state changed) in %s seconds'
                      % (len(xfers), cnt, (time.time() - tss)))
    except Exception:
        logging.error(traceback.format_exc())