def poller(worker_number=1, total_workers=1, chunk_size=100): print '%(worker_number)s / %(total_workers)s: Poller started' % locals() while not graceful_stop.is_set(): try: s = time() transfer_requests = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT], state=RequestState.SUBMITTED, thread=worker_number, total_threads=total_workers, limit=10000) n = len(transfer_requests) print '%(worker_number)s / %(total_workers)s: get_next %(n)s requests' % locals() if not transfer_requests: sleep(0.01) continue for chunk in chunks(transfer_requests, chunk_size): try: s = time() fts3.query_all(transfer_requests=chunk) print 'fts3.query_all', time() - s s = time() common.update_requests_states(chunk) print 'update_requests_states', time() - s except: print traceback.format_exc() except: print traceback.format_exc()
def get_requests(rse_id=None, process=0, total_processes=1, thread=0, total_threads=1, mock=False, bulk=100, activity=None):
    """
    Fetch the next bulk of QUEUED transfer/stage-in/stage-out requests for this
    worker partition and record how long the database fetch took.

    :param rse_id: Restrict to a single RSE, or None for all.
    :param process/total_processes: Process partitioning of the request table.
    :param thread/total_threads: Thread partitioning of the request table.
    :param mock: Unused here; kept for interface compatibility with callers.
    :param bulk: Maximum number of requests to fetch.
    :param activity: Restrict to a single activity, or None for all.
    :returns: Whatever request.get_next returns (the fetched requests).
    """
    start = time.time()
    queued = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                              state=RequestState.QUEUED,
                              limit=bulk,
                              rse=rse_id,
                              activity=activity,
                              process=process,
                              total_processes=total_processes,
                              thread=thread,
                              total_threads=total_threads)
    elapsed_ms = (time.time() - start) * 1000
    record_timer('daemons.conveyor.submitter.get_next', elapsed_ms)
    return queued
def get_next(request_type, state, issuer, account):
    """
    Retrieve the next request matching the request type and state.

    :param request_type: Type of the request as a string.
    :param state: State of the request as a string.
    :param issuer: Issuing account as a string.
    :param account: Account identifier as a string.
    :raises AccessDenied: If the issuer lacks the 'get_next' permission.
    :returns: Request as a dictionary.
    """
    kwargs = {'account': account, 'issuer': issuer, 'request_type': request_type, 'state': state}
    allowed = permission.has_permission(issuer=issuer, action='get_next', kwargs=kwargs)
    if not allowed:
        # The format string pulls `account`, `request_type` and `state` out of locals().
        raise exception.AccessDenied('%(account)s cannot get the next request of type %(request_type)s in state %(state)s' % locals())
    return request.get_next(request_type, state)
def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000, older_than=60):
    """
    Main loop to check the status of a transfer primitive with a transfertool.

    :param once: If True, run a single cycle and exit.
    :param process/total_processes: Process partitioning of the request table.
    :param thread/total_threads: Thread partitioning of the request table.
    :param bulk: Maximum requests fetched per cycle and per FTS query chunk.
    :param older_than: Only poll requests submitted at least this many seconds ago.
    """
    logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, thread, total_threads, bulk))
    logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes, thread, total_threads, bulk))
    while not graceful_stop.is_set():
        try:
            ts = time.time()
            logging.debug('%i:%i - start to poll requests older than %i seconds' % (process, thread, older_than))
            reqs = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                    state=RequestState.SUBMITTED,
                                    limit=bulk,
                                    older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                    process=process,
                                    total_processes=total_processes,
                                    thread=thread,
                                    total_threads=total_threads)
            record_timer('daemons.conveyor.poller.000-get_next', (time.time() - ts) * 1000)
            if reqs:
                logging.debug('%i:%i - polling %i requests' % (process, thread, len(reqs)))
            if not reqs or reqs == []:
                if once:
                    break
                logging.debug("no requests found. will sleep 60 seconds")
                time.sleep(60)  # Only sleep if there is nothing to do
                continue
            for xfers in chunks(reqs, bulk):
                try:
                    # Group (request_id, external_id) pairs by FTS host so each
                    # host can be queried with one bulk call.
                    req_ids = {}
                    for req in xfers:
                        record_counter('daemons.conveyor.poller.query_request')
                        if not req['external_host'] in req_ids:
                            req_ids[req['external_host']] = []
                        req_ids[req['external_host']].append((req['request_id'], req['external_id']))
                    responses = {}
                    for external_host in req_ids:
                        ts = time.time()
                        resps = request.bulk_query_requests(external_host, req_ids[external_host], 'fts3')
                        record_timer('daemons.conveyor.poller.001-bulk_query_requests', (time.time() - ts) * 1000 / len(req_ids[external_host]))
                        # NOTE: dict(a.items() + b.items()) is a Python-2-only
                        # dict merge; this block predates Python 3.
                        responses = dict(responses.items() + resps.items())
                    for external_host in req_ids:
                        for request_id, external_id in req_ids[external_host]:
                            response = responses[request_id]
                            if isinstance(response, Exception):
                                # Polling failed for this request: record it and
                                # push a "no new state" update instead.
                                logging.critical("Failed to poll request(%s) with FTS(%s) job (%s): %s" % (request_id, external_host, external_id, responses[request_id]))
                                record_counter('daemons.conveyor.poller.query_request_exception')
                                response = {'new_state': None, 'request_id': request_id, 'transfer_id': external_id, 'job_state': None}
                            common.update_request_state(response)
                            record_counter('daemons.conveyor.poller.update_request_state')
                            if response['new_state'] == RequestState.LOST:
                                record_counter('daemons.conveyor.poller.request_lost')
                # NOTE(review): bare except also catches SystemExit/KeyboardInterrupt.
                except:
                    logging.critical(traceback.format_exc())
        except:
            logging.critical(traceback.format_exc())
    if once:
        return
    logging.debug('%i:%i - graceful stop requests' % (process, thread))
    logging.debug('%i:%i - graceful stop done' % (process, thread))
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.

    :param once: If True, run a single cycle and return.
    :param sleep_time: Target cycle duration in seconds; sleeps the remainder.
    :param activities: List of activities to process, or None for all.
    :param bulk: Chunk size handed to __handle_requests.
    :param db_bulk: Maximum requests fetched from the database per activity.
    """
    # Load the configured suspicious-file patterns; fall back to no patterns
    # when the config option is absent.
    try:
        suspicious_patterns = []
        pattern = get(section='conveyor', option='suspicious_pattern', session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            suspicious_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        suspicious_patterns = []
    logging.debug("Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns])

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info(prepend_str + 'Finisher starting - db_bulk(%i) bulk (%i)' % (db_bulk, bulk))

    graceful_stop.wait(10)
    while not graceful_stop.is_set():
        start_time = time.time()
        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
            logging.debug(prepend_str + 'Starting new cycle')
            if activities is None:
                activities = [None]
            for activity in activities:
                logging.debug(prepend_str + 'Working on activity %s' % activity)
                time1 = time.time()
                # NOTE(review): total_workers is nr_threads - 1 here, unlike the
                # other finisher variants in this file — confirm which is intended.
                reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                             state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING,
                                                    RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES],
                                             limit=db_bulk,
                                             older_than=datetime.datetime.utcnow(),
                                             total_workers=heart_beat['nr_threads'] - 1,
                                             worker_number=heart_beat['assign_thread'],
                                             mode_all=True,
                                             hash_variable='rule_id')
                record_timer('daemons.conveyor.finisher.000-get_next', (time.time() - time1) * 1000)
                time2 = time.time()
                if reqs:
                    logging.debug(prepend_str + 'Updating %i requests for activity %s' % (len(reqs), activity))
                for chunk in chunks(reqs, bulk):
                    try:
                        time3 = time.time()
                        __handle_requests(chunk, suspicious_patterns, prepend_str)
                        record_timer('daemons.conveyor.finisher.handle_requests', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1))
                        record_counter('daemons.conveyor.finisher.handle_requests', len(chunk))
                    except Exception as error:
                        logging.warn(prepend_str + str(error))
                if reqs:
                    logging.debug(prepend_str + 'Finish to update %s finished requests for activity %s in %s seconds' % (len(reqs), activity, time.time() - time2))
        except (DatabaseException, DatabaseError) as error:
            # ORA-00054/ORA-00060 (Oracle) and ERROR 1205 (MySQL) indicate row
            # locks/deadlocks: another worker holds the rows, so just skip.
            if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logging.warn(prepend_str + 'Lock detected when handling request - skipping: %s' % (str(error)))
            else:
                logging.error(prepend_str + '%s' % (traceback.format_exc()))
        except Exception as error:
            logging.critical(prepend_str + '%s' % (str(error)))
        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logging.info(prepend_str + 'Sleeping for a while : %s seconds' % (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)
        if once:
            # NOTE(review): single-cycle exit skips heartbeat.die below.
            return
    logging.info(prepend_str + 'Graceful stop requests')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.

    :param once: If True, run a single cycle and return.
    :param sleep_time: Target cycle duration in seconds; sleeps the remainder.
    :param activities: List of activities to process, or None for all.
    :param bulk: Chunk size handed to __handle_requests.
    :param db_bulk: Maximum requests fetched from the database per activity.
    """
    try:
        conveyor_config = {item[0]: item[1] for item in items('conveyor')}
    except ConfigNotFound:
        logging.log(logging.INFO, 'No configuration found for conveyor')
        conveyor_config = {}

    # Get suspicious patterns
    suspicious_patterns = conveyor_config.get('suspicious_pattern', [])
    if suspicious_patterns:
        pattern = str(suspicious_patterns)
        patterns = pattern.split(",")
        suspicious_patterns = [re.compile(pat.strip()) for pat in patterns]
    logging.log(logging.DEBUG, "Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns])

    retry_protocol_mismatches = conveyor_config.get('retry_protocol_mismatches', False)

    # Encode the activity selection into the executable name so heartbeats of
    # differently-configured finishers don't collide.
    executable = 'conveyor-finisher'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prefix = 'conveyor-finisher[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logger = formatted_logger(logging.log, prefix + '%s')
    logger(logging.INFO, 'Finisher starting - db_bulk(%i) bulk (%i)', db_bulk, bulk)

    graceful_stop.wait(10)
    while not graceful_stop.is_set():
        start_time = time.time()
        try:
            # Refresh the heartbeat each cycle; worker number/count may change.
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prefix = 'conveyor-finisher[%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
            logger = formatted_logger(logging.log, prefix + '%s')
            logger(logging.DEBUG, 'Starting new cycle')
            if activities is None:
                activities = [None]
            for activity in activities:
                logger(logging.DEBUG, 'Working on activity %s', activity)
                time1 = time.time()
                reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                             state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING,
                                                    RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES,
                                                    RequestState.MISMATCH_SCHEME],
                                             limit=db_bulk,
                                             older_than=datetime.datetime.utcnow(),
                                             total_workers=heart_beat['nr_threads'],
                                             worker_number=heart_beat['assign_thread'],
                                             mode_all=True,
                                             hash_variable='rule_id')
                record_timer('daemons.conveyor.finisher.000-get_next', (time.time() - time1) * 1000)
                time2 = time.time()
                if reqs:
                    logger(logging.DEBUG, 'Updating %i requests for activity %s', len(reqs), activity)
                for chunk in chunks(reqs, bulk):
                    try:
                        time3 = time.time()
                        __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, logger=logger)
                        record_timer('daemons.conveyor.finisher.handle_requests', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1))
                        record_counter('daemons.conveyor.finisher.handle_requests', len(chunk))
                    except Exception as error:
                        logger(logging.WARNING, '%s', str(error))
                if reqs:
                    logger(logging.DEBUG, 'Finish to update %s finished requests for activity %s in %s seconds', len(reqs), activity, time.time() - time2)
        except (DatabaseException, DatabaseError) as error:
            # ORA-00054/ORA-00060 (Oracle) and ERROR 1205 (MySQL) indicate row
            # locks/deadlocks held by another worker: skip this cycle.
            if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
            else:
                logger(logging.ERROR, 'Exception', exc_info=True)
        except Exception:
            logger(logging.CRITICAL, 'Exception', exc_info=True)
        if once:
            # NOTE(review): single-cycle exit skips heartbeat.die below.
            return
        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logger(logging.INFO, 'Sleeping for a while : %s seconds', (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)

    logger(logging.INFO, 'Graceful stop requests')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop done')
def poller(once=False, activities=None, sleep_time=60, fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None, partition_wait_time=10):
    """
    Main loop to check the status of a transfer primitive with a transfertool.

    :param once: If True, run a single cycle and stop.
    :param activities: List of activities to poll, or None for all.
    :param sleep_time: Per-activity back-off when the queue is nearly empty.
    :param fts_bulk: Number of transfers polled per FTS bulk call.
    :param db_bulk: Maximum requests fetched from the database per activity.
    :param older_than: Only poll requests submitted at least this many seconds ago (0/None disables).
    :param activity_shares: Activity share configuration passed through to get_next.
    :param partition_wait_time: Initial wait so restarted pollers spread over partitions.
    """
    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    multi_vo = config_get_bool('common', 'multi_vo', False, None)
    # Encode the configuration into the executable name so heartbeats of
    # differently-configured pollers don't collide.
    logger_prefix = executable = 'conveyor-poller'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if activity_shares:
        # NOTE(review): this sorts `activities`, not `activity_shares`, and
        # raises AttributeError if activity_shares is set while activities is
        # None — confirm the intended behavior.
        activities.sort()
        executable += '--activity_shares' + str(activity_shares)
    if FILTER_TRANSFERTOOL:
        executable += ' --filter-transfertool ' + FILTER_TRANSFERTOOL

    with HeartbeatHandler(executable=executable, logger_prefix=logger_prefix) as heartbeat_handler:
        logger = heartbeat_handler.logger
        logger(logging.INFO, 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' % (db_bulk, fts_bulk, timeout))

        # Earliest time each activity may be polled again (defaults to "now").
        activity_next_exe_time = defaultdict(time.time)

        if partition_wait_time:
            graceful_stop.wait(partition_wait_time)  # To prevent running on the same partition if all the poller restart at the same time
        while not graceful_stop.is_set():
            try:
                heart_beat, logger = heartbeat_handler.live(older_than=3600)
                if activities is None:
                    activities = [None]
                for activity in activities:
                    if activity_next_exe_time[activity] > time.time():
                        graceful_stop.wait(1)
                        continue

                    start_time = time.time()
                    logger(logging.DEBUG, 'Start to poll transfers older than %i seconds for activity %s using transfer tool: %s' % (older_than, activity, FILTER_TRANSFERTOOL))
                    transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                    state=[RequestState.SUBMITTED],
                                                    limit=db_bulk,
                                                    older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than) if older_than else None,
                                                    total_workers=heart_beat['nr_threads'],
                                                    worker_number=heart_beat['assign_thread'],
                                                    mode_all=True,
                                                    hash_variable='id',
                                                    activity=activity,
                                                    activity_shares=activity_shares,
                                                    transfertool=FILTER_TRANSFERTOOL)
                    record_timer('daemons.conveyor.poller.get_next', (time.time() - start_time) * 1000)

                    if TRANSFER_TOOL and not FILTER_TRANSFERTOOL:
                        # only keep transfers which don't have any transfertool set, or have one equal to TRANSFER_TOOL
                        transfs_tmp = [t for t in transfs if not t['transfertool'] or t['transfertool'] == TRANSFER_TOOL]
                        if len(transfs_tmp) != len(transfs):
                            logger(logging.INFO, 'Skipping %i transfers because of missmatched transfertool', len(transfs) - len(transfs_tmp))
                        transfs = transfs_tmp

                    if transfs:
                        logger(logging.DEBUG, 'Polling %i transfers for activity %s' % (len(transfs), activity))
                        # Sort by the same leading keys used by the groupby calls
                        # below so grouping by host/VO/external_id is contiguous.
                        transfs.sort(key=lambda t: (t['external_host'] or '', t['scope'].vo if multi_vo else '', t['external_id'] or '', t['request_id'] or ''))
                        for (external_host, vo), transfers_for_host in groupby(transfs, key=lambda t: (t['external_host'], t['scope'].vo if multi_vo else None)):
                            transfers_by_eid = {}
                            for external_id, xfers in groupby(transfers_for_host, key=lambda t: t['external_id']):
                                transfers_by_eid[external_id] = list(xfers)
                            for chunk in dict_chunks(transfers_by_eid, fts_bulk):
                                try:
                                    poll_transfers(external_host=external_host, transfers_by_eid=chunk, vo=vo, timeout=timeout, logger=logger)
                                except Exception:
                                    logger(logging.ERROR, 'Exception', exc_info=True)

                    if len(transfs) < fts_bulk / 2:
                        # Queue nearly empty: back off this activity for sleep_time.
                        logger(logging.INFO, "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (len(transfs), activity, fts_bulk, sleep_time))
                        if activity_next_exe_time[activity] < time.time():
                            activity_next_exe_time[activity] = time.time() + sleep_time
            except Exception:
                logger(logging.CRITICAL, "Exception", exc_info=True)
                if once:
                    raise
            if once:
                break
def finisher(once=False, process=0, total_processes=1, thread=0, total_threads=1, sleep_time=60, activities=None, bulk=100, db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.

    :param once: If True, run a single cycle and return.
    :param process/total_processes: Process partitioning of the request table.
    :param thread/total_threads: Initial thread partitioning (actual values come from heartbeat).
    :param sleep_time: Sleep length when a full cycle finds nothing to do.
    :param activities: List of activities to process, or None for all.
    :param bulk: Chunk size handed to common.handle_requests.
    :param db_bulk: Maximum requests fetched from the database per activity.
    """
    logging.info('finisher starting - process (%i/%i) thread (%i/%i) db_bulk(%i) bulk (%i)' % (process, total_processes, thread, total_threads, db_bulk, bulk))
    # Load the configured suspicious-file patterns; fall back to no patterns
    # when the config option is absent.
    try:
        suspicious_patterns = []
        pattern = get(section='conveyor', option='suspicious_pattern', session=None)
        pattern = str(pattern)
        patterns = pattern.split(",")
        for pat in patterns:
            suspicious_patterns.append(re.compile(pat.strip()))
    except ConfigNotFound:
        suspicious_patterns = []
    logging.debug("Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns])

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    hb = heartbeat.live(executable, hostname, pid, hb_thread)
    graceful_stop.wait(1)
    # `sleeping` is True only when the previous cycle found no work at all.
    sleeping = False
    while not graceful_stop.is_set():
        try:
            hb = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])
            if activities is None:
                activities = [None]
            if sleeping:
                logging.info(prepend_str + 'Nothing to do. will sleep %s seconds' % (sleep_time))
                time.sleep(sleep_time)
            # Assume an idle cycle; reset below as soon as any requests show up.
            sleeping = True
            for activity in activities:
                ts = time.time()
                reqs = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                        state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING,
                                               RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES],
                                        limit=db_bulk,
                                        older_than=datetime.datetime.utcnow(),
                                        activity=activity,
                                        process=process,
                                        total_processes=total_processes,
                                        thread=hb['assign_thread'],
                                        total_threads=hb['nr_threads'])
                record_timer('daemons.conveyor.finisher.000-get_next', (time.time() - ts) * 1000)
                if reqs:
                    logging.debug(prepend_str + 'Updating %i requests for activity %s' % (len(reqs), activity))
                    sleeping = False
                for chunk in chunks(reqs, bulk):
                    try:
                        ts = time.time()
                        common.handle_requests(chunk, suspicious_patterns)
                        record_timer('daemons.conveyor.finisher.handle_requests', (time.time() - ts) * 1000 / (len(chunk) if len(chunk) else 1))
                        record_counter('daemons.conveyor.finisher.handle_requests', len(chunk))
                    # NOTE(review): bare except also catches SystemExit/KeyboardInterrupt.
                    except:
                        logging.warn(str(traceback.format_exc()))
                if reqs:
                    logging.debug(prepend_str + 'Finish to update %s finished requests for activity %s' % (len(reqs), activity))
        except (DatabaseException, DatabaseError) as error:
            # ORA-00054 (Oracle) and ERROR 1205 (MySQL) indicate row locks held
            # by another worker: log and skip the cycle.
            if isinstance(error.args[0], tuple) and (re.match('.*ORA-00054.*', error.args[0][0]) or ('ERROR 1205 (HY000)' in error.args[0][0])):
                logging.warn(prepend_str + 'Lock detected when handling request - skipping: %s' % (str(error)))
            else:
                logging.error(prepend_str + '%s' % (traceback.format_exc()))
            sleeping = False
        except:
            sleeping = False
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))
        if once:
            # NOTE(review): single-cycle exit skips heartbeat.die below.
            return
    # NOTE(review): prepend_str is only bound inside the loop body; if the loop
    # never runs (graceful_stop already set) this raises NameError — confirm.
    logging.info(prepend_str + 'Graceful stop requests')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def run_once(bulk, db_bulk, suspicious_patterns, retry_protocol_mismatches, heartbeat_handler, activity):
    """
    Run one finisher cycle for the given activity: fetch terminal-state
    requests and hand them to __handle_requests in chunks.

    :param bulk: Chunk size handed to __handle_requests.
    :param db_bulk: Maximum requests fetched from the database.
    :param suspicious_patterns: Compiled regexes for suspicious-file detection.
    :param retry_protocol_mismatches: Whether protocol-mismatch failures are retried.
    :param heartbeat_handler: Provides worker partitioning and the logger.
    :param activity: Activity to process, or None for all.
    :returns: True (caller decides whether to sleep).
    """
    worker_number, total_workers, logger = heartbeat_handler.live()
    try:
        logger(logging.DEBUG, 'Working on activity %s', activity)
        fetch_started = time.time()
        terminal_states = [RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING,
                           RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES,
                           RequestState.MISMATCH_SCHEME]
        reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                     state=terminal_states,
                                     limit=db_bulk,
                                     older_than=datetime.datetime.utcnow(),
                                     total_workers=total_workers,
                                     worker_number=worker_number,
                                     mode_all=True,
                                     include_dependent=False,
                                     hash_variable='rule_id')
        record_timer('daemons.conveyor.finisher.get_next', (time.time() - fetch_started) * 1000)
        update_started = time.time()
        if reqs:
            logger(logging.DEBUG, 'Updating %i requests for activity %s', len(reqs), activity)
        for chunk in chunks(reqs, bulk):
            try:
                # Refresh partitioning/logger per chunk; handling can be slow.
                worker_number, total_workers, logger = heartbeat_handler.live()
                chunk_started = time.time()
                __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, logger=logger)
                per_request_ms = (time.time() - chunk_started) * 1000 / (len(chunk) if chunk else 1)
                record_timer('daemons.conveyor.finisher.handle_requests_time', per_request_ms)
                record_counter('daemons.conveyor.finisher.handle_requests', delta=len(chunk))
            except Exception as error:
                logger(logging.WARNING, '%s', str(error))
        if reqs:
            logger(logging.DEBUG, 'Finish to update %s finished requests for activity %s in %s seconds',
                   len(reqs), activity, time.time() - update_started)
    except (DatabaseException, DatabaseError) as error:
        # Row locks/deadlocks from a concurrent worker are expected: skip.
        message = error.args[0]
        locked = (re.match('.*ORA-00054.*', message)
                  or re.match('.*ORA-00060.*', message)
                  or 'ERROR 1205 (HY000)' in message)
        if locked:
            logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
        else:
            raise
    return True
def poller(once=False, activities=None, sleep_time=60, fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.

    :param once: If True, run a single cycle and stop.
    :param activities: List of activities to poll, or None for all.
    :param sleep_time: Per-activity back-off when the queue is nearly empty.
    :param fts_bulk: Number of transfer ids polled per FTS bulk call.
    :param db_bulk: Maximum requests fetched from the database per activity.
    :param older_than: Only poll requests submitted at least this many seconds ago.
    :param activity_shares: Activity share configuration passed through to get_next.
    """
    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    # Encode the configuration into the executable name so heartbeats of
    # differently-configured pollers don't collide.
    executable = 'conveyor-poller'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)
    if activity_shares:
        # NOTE(review): sorts `activities`, not `activity_shares`; raises
        # AttributeError if activities is None — confirm intent.
        activities.sort()
        executable += '--activity_shares' + str(activity_shares)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' % (db_bulk, fts_bulk, timeout))

    time.sleep(10)  # To prevent running on the same partition if all the poller restart at the same time

    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Poller started')

    # Earliest time each activity may be polled again (defaults to "now").
    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():
        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                start_time = time.time()
                logging.debug(prepend_str + 'Start to poll transfers older than %i seconds for activity %s' % (older_than, activity))
                transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                state=[RequestState.SUBMITTED],
                                                limit=db_bulk,
                                                older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                                total_workers=heart_beat['nr_threads'],
                                                worker_number=heart_beat['assign_thread'],
                                                mode_all=False,
                                                hash_variable='id',
                                                activity=activity,
                                                activity_shares=activity_shares)
                record_timer('daemons.conveyor.poller.000-get_next', (time.time() - start_time) * 1000)

                if transfs:
                    logging.debug(prepend_str + 'Polling %i transfers for activity %s' % (len(transfs), activity))
                    # Group (external_id, request_id) pairs by FTS host.
                    xfers_ids = {}
                    for transf in transfs:
                        if not transf['external_host'] in xfers_ids:
                            xfers_ids[transf['external_host']] = []
                        xfers_ids[transf['external_host']].append((transf['external_id'], transf['request_id']))

                    for external_host in xfers_ids:
                        # Deduplicate job ids; several requests can share one FTS job.
                        external_ids = list({trf[0] for trf in xfers_ids[external_host]})
                        # NOTE(review): the full request_ids list for the host is
                        # passed to every chunk of external_ids — confirm this is
                        # what poll_transfers expects.
                        request_ids = [trf[1] for trf in xfers_ids[external_host]]
                        for xfers in chunks(external_ids, fts_bulk):
                            # poll transfers
                            poll_transfers(external_host=external_host, xfers=xfers, prepend_str=prepend_str, request_ids=request_ids, timeout=timeout)

                if len(transfs) < fts_bulk / 2:
                    # Queue nearly empty: back off this activity for sleep_time.
                    logging.info(prepend_str + "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (len(transfs), activity, fts_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + "%s" % (traceback.format_exc()))
        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000, partition_wait_time=10):
    """
    Main loop to update the replicas and rules based on finished requests.

    :param once: If True, run a single cycle and stop.
    :param sleep_time: Target cycle duration in seconds (enforced via daemon_sleep).
    :param activities: List of activities to process, or None for all.
    :param bulk: Chunk size handed to __handle_requests.
    :param db_bulk: Maximum requests fetched from the database per activity.
    :param partition_wait_time: Initial wait so restarted finishers spread over partitions.
    """
    try:
        conveyor_config = {item[0]: item[1] for item in items('conveyor')}
    except ConfigNotFound:
        logging.log(logging.INFO, 'No configuration found for conveyor')
        conveyor_config = {}

    # Get suspicious patterns
    suspicious_patterns = conveyor_config.get('suspicious_pattern', [])
    if suspicious_patterns:
        pattern = str(suspicious_patterns)
        patterns = pattern.split(",")
        suspicious_patterns = [re.compile(pat.strip()) for pat in patterns]
    logging.log(logging.DEBUG, "Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns])

    retry_protocol_mismatches = conveyor_config.get('retry_protocol_mismatches', False)

    # Encode the activity selection into the executable name so heartbeats of
    # differently-configured finishers don't collide.
    logger_prefix = executable = 'conveyor-finisher'
    if activities:
        activities.sort()
        executable += '--activities ' + str(activities)

    with HeartbeatHandler(executable=executable, logger_prefix=logger_prefix) as heartbeat_handler:
        logger = heartbeat_handler.logger
        logger(logging.INFO, 'Finisher starting - db_bulk(%i) bulk (%i)', db_bulk, bulk)

        if partition_wait_time:
            graceful_stop.wait(partition_wait_time)
        while not graceful_stop.is_set():
            start_time = time.time()
            try:
                heart_beat, logger = heartbeat_handler.live(older_than=3600)
                if activities is None:
                    activities = [None]
                for activity in activities:
                    logger(logging.DEBUG, 'Working on activity %s', activity)
                    time1 = time.time()
                    reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                 state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING,
                                                        RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES,
                                                        RequestState.MISMATCH_SCHEME],
                                                 limit=db_bulk,
                                                 older_than=datetime.datetime.utcnow(),
                                                 total_workers=heart_beat['nr_threads'],
                                                 worker_number=heart_beat['assign_thread'],
                                                 mode_all=True,
                                                 hash_variable='rule_id')
                    record_timer('daemons.conveyor.finisher.get_next', (time.time() - time1) * 1000)
                    time2 = time.time()
                    if reqs:
                        logger(logging.DEBUG, 'Updating %i requests for activity %s', len(reqs), activity)
                    for chunk in chunks(reqs, bulk):
                        try:
                            time3 = time.time()
                            __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, logger=logger)
                            record_timer('daemons.conveyor.finisher.handle_requests_time', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1))
                            record_counter('daemons.conveyor.finisher.handle_requests', delta=len(chunk))
                        except Exception as error:
                            logger(logging.WARNING, '%s', str(error))
                    if reqs:
                        logger(logging.DEBUG, 'Finish to update %s finished requests for activity %s in %s seconds', len(reqs), activity, time.time() - time2)
            except (DatabaseException, DatabaseError) as error:
                # ORA-00054/ORA-00060 (Oracle) and ERROR 1205 (MySQL) indicate
                # row locks/deadlocks held by another worker: skip the cycle.
                if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                    logger(logging.WARNING, 'Lock detected when handling request - skipping: %s', str(error))
                else:
                    logger(logging.ERROR, 'Exception', exc_info=True)
            except Exception:
                logger(logging.CRITICAL, 'Exception', exc_info=True)
                if once:
                    raise
            if once:
                break

            daemon_sleep(start_time=start_time, sleep_time=sleep_time, graceful_stop=graceful_stop, logger=logger)
def run_once(fts_bulk, db_bulk, older_than, activity_shares, multi_vo, timeout, activity, heartbeat_handler):
    """
    Run one polling cycle for the given activity: fetch SUBMITTED requests,
    group them by FTS host / VO / external job id and poll each bulk with the
    configured transfertool.

    :param fts_bulk: Number of FTS jobs polled per bulk call.
    :param db_bulk: Maximum requests fetched from the database.
    :param older_than: Only poll requests at least this many seconds old (falsy disables).
    :param activity_shares: Activity share configuration passed to get_next.
    :param multi_vo: Whether VO is part of the grouping key.
    :param timeout: Poll timeout handed to poll_transfers.
    :param activity: Activity to poll, or None for all.
    :param heartbeat_handler: Provides worker partitioning and the logger.
    :returns: True when fewer than fts_bulk/2 transfers were found (queue looks empty).
    """
    worker_number, total_workers, logger = heartbeat_handler.live()
    cycle_started = time.time()
    logger(
        logging.DEBUG,
        'Start to poll transfers older than %i seconds for activity %s using transfer tool: %s' % (older_than, activity, FILTER_TRANSFERTOOL))

    min_submit_age = datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than) if older_than else None
    transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                    state=[RequestState.SUBMITTED],
                                    limit=db_bulk,
                                    older_than=min_submit_age,
                                    total_workers=total_workers,
                                    worker_number=worker_number,
                                    mode_all=True,
                                    hash_variable='id',
                                    activity=activity,
                                    activity_shares=activity_shares,
                                    transfertool=FILTER_TRANSFERTOOL)
    record_timer('daemons.conveyor.poller.get_next', (time.time() - cycle_started) * 1000)

    if TRANSFER_TOOL and not FILTER_TRANSFERTOOL:
        # only keep transfers which don't have any transfertool set, or have one equal to TRANSFER_TOOL
        kept = [t for t in transfs if not t['transfertool'] or t['transfertool'] == TRANSFER_TOOL]
        dropped = len(transfs) - len(kept)
        if dropped:
            logger(logging.INFO, 'Skipping %i transfers because of missmatched transfertool', dropped)
        transfs = kept

    if transfs:
        logger(logging.DEBUG, 'Polling %i transfers for activity %s' % (len(transfs), activity))

        def _sort_key(t):
            # Matches the groupby keys below so grouping is contiguous.
            return (t['external_host'] or '',
                    t['scope'].vo if multi_vo else '',
                    t['external_id'] or '',
                    t['request_id'] or '')

        transfs.sort(key=_sort_key)
        for (external_host, vo), host_transfers in groupby(transfs, key=lambda t: (t['external_host'], t['scope'].vo if multi_vo else None)):
            by_external_id = {}
            for external_id, same_job in groupby(host_transfers, key=lambda t: t['external_id']):
                by_external_id[external_id] = {t['request_id']: t for t in same_job}
            for chunk in dict_chunks(by_external_id, fts_bulk):
                try:
                    if TRANSFER_TOOL == 'globus':
                        transfertool_obj = GlobusTransferTool(external_host=None)
                    else:
                        transfertool_obj = FTS3Transfertool(external_host=external_host, vo=vo)
                    worker_number, total_workers, logger = heartbeat_handler.live()
                    poll_transfers(transfertool_obj=transfertool_obj, transfers_by_eid=chunk, timeout=timeout, logger=logger)
                except Exception:
                    logger(logging.ERROR, 'Exception', exc_info=True)

    queue_empty = False
    if len(transfs) < fts_bulk / 2:
        logger(logging.INFO, "Only %s transfers for activity %s, which is less than half of the bulk %s" % (len(transfs), activity, fts_bulk))
        queue_empty = True
    return queue_empty