def run(inFile,v_onlyTA,v_firstSubmission):
    """Load pickled jobs from a file and process them with a Setupper thread.

    Parameters:
        inFile: path of a file containing the pickled jobs object.
        v_onlyTA: forwarded to Setupper as ``onlyTA``.
        v_firstSubmission: forwarded to Setupper as ``firstSubmission``.

    Returns:
        None. If the file cannot be opened or unpickled, the error and its
        traceback are printed and the function returns without touching the
        database.
    """
    # imported locally so the error handler below never hits a NameError
    import traceback
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    try:
        # read Jobs from file
        # NOTE(review): unpickling executes arbitrary code embedded in the
        # file; inFile must only ever come from a trusted producer.
        # The context manager closes the handle even when load() raises.
        with open(inFile, 'rb') as f:
            jobs = pickle.load(f)
    except Exception as e:
        print("run() : %s %s" % (str(e), traceback.format_exc()))
        return
    # password
    from pandaserver.config import panda_config
    # initialize cx_Oracle using dummy connection
    from pandaserver.taskbuffer.Initializer import initializer
    initializer.init()
    # instantiate TB
    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1)
    # run Setupper and wait until it has finished
    from pandaserver.dataservice.Setupper import Setupper
    thr = Setupper(taskBuffer,jobs,onlyTA=v_onlyTA,firstSubmission=v_firstSubmission)
    thr.start()
    thr.join()
    return
#!/usr/bin/python """ entry point """ import datetime import types # config file from pandaserver.config import panda_config # initialize cx_Oracle using dummy connection from pandaserver.taskbuffer.Initializer import initializer initializer.init() # initialzie TaskBuffer from pandaserver.taskbuffer.TaskBuffer import taskBuffer taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,panda_config.nDBConnection,True) # initialize JobDispatcher from pandaserver.jobdispatcher.JobDispatcher import jobDispatcher if panda_config.nDBConnection != 0: jobDispatcher.init(taskBuffer) # initialize DataService from pandaserver.dataservice.DataService import dataService if panda_config.nDBConnection != 0: dataService.init(taskBuffer)
def main(backGround=False):
    """Fork a child that keeps STOMP listeners subscribed to the DDM queue.

    The parent installs ``catch_sig`` for SIGINT/SIGHUP/SIGTERM/SIGALRM, arms
    an alarm of ``overallTimeout`` seconds, forks, and then just waits for
    the child to exit.  The child checks the server certificate, initialises
    the TaskBuffer and SiteMapper, creates one STOMP connection per resolved
    broker address and, every 5 seconds, (re)connects and (re)subscribes any
    connection that is not connected.

    Parameters:
        backGround: accepted for interface compatibility; not used here.

    Returns:
        None in the parent once the child has exited.  The child loops until
        it is terminated (presumably by ``catch_sig`` - confirm against its
        definition).
    """
    _logger.debug('starting ...')
    # register signal handler
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    signal.alarm(overallTimeout)
    # forking
    pid = os.fork()
    if pid != 0:
        # watch child process
        os.wait()
        time.sleep(1)
        return
    # main loop (child process from here on)
    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    # check certificate
    certName = '%s/pandasv1_usercert.pem' % panda_config.certdir
    keyName = '%s/pandasv1_userkey.pem' % panda_config.certdir
    _logger.debug('checking certificate {0}'.format(certName))
    certOK, certMsg = DataServiceUtils.checkCertificate(certName)
    if not certOK:
        # only reported; the connection attempt below is still made
        _logger.error('bad certificate : {0}'.format(certMsg))
    # initialize cx_Oracle using dummy connection
    from pandaserver.taskbuffer.Initializer import initializer
    initializer.init()
    # instantiate TB
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # instantiate sitemapper
    siteMapper = SiteMapper(taskBuffer)
    # ActiveMQ params
    queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices'
    ssl_opts = {
        'use_ssl': True,
        'ssl_version': ssl.PROTOCOL_TLSv1,
        'ssl_cert_file': certName,
        'ssl_key_file': keyName,
    }
    # resolve multiple brokers
    brokerList = socket.gethostbyname_ex('atlas-mb.cern.ch')[-1]
    # host name is loop-invariant, so look it up only once
    fqdn = socket.getfqdn()
    subscription_id = 'panda-server-consumer-' + fqdn
    # set listener
    # Each entry keeps the connection together with its own client-id and
    # broker address, so the reconnect loop below never relies on variables
    # left over from the last iteration of this loop.
    connList = []
    for tmpBroker in brokerList:
        try:
            clientid = 'PANDA-' + fqdn + '-' + tmpBroker
            _logger.debug('setting listener %s' % clientid)
            conn = stomp.Connection(host_and_ports=[(tmpBroker, 61023)], **ssl_opts)
            connList.append((conn, clientid, tmpBroker))
        except Exception:
            errtype, errvalue = sys.exc_info()[:2]
            _logger.error("failed to connect to %s : %s %s" % (tmpBroker, errtype, errvalue))
            catch_sig(None, None)
    while True:
        for conn, clientid, tmpBroker in connList:
            try:
                if not conn.is_connected():
                    conn.set_listener(
                        'FileCallbackListener',
                        FileCallbackListener(conn, taskBuffer, siteMapper, subscription_id))
                    conn.start()
                    conn.connect(headers={'client-id': clientid})
                    conn.subscribe(destination=queue, id=subscription_id, ack='client-individual')
                    _logger.debug('listener %s is up and running' % clientid)
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to set listener on %s : %s %s" % (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
        time.sleep(5)
def daemon_loop(dem_config, msg_queue, pipe_conn, worker_lifetime, tbuf=None):
    """Main loop of a daemon worker process.

    The worker imports the script module of every configured daemon, then
    repeatedly takes a daemon name from ``msg_queue``, runs that daemon's
    ``main(argv=..., tbuf=...)`` and reports a status tuple
    ``(dem_name, has_run, last_run_start_ts, last_run_end_ts)`` back through
    ``pipe_conn``.  The loop ends when the worker lifetime is exceeded, when
    ``CMD_STOP`` is received on the pipe, or when a daemon raises.

    Parameters:
        dem_config: mapping of daemon name to an attribute dict; the keys
            read here are 'module', 'arguments', 'period', 'sync' and 'loop'.
        msg_queue: queue delivering daemon names; ``get(timeout=5)`` is used
            and ``queue.Empty`` is handled.
        pipe_conn: connection to the master; ``poll``/``recv`` are used for
            commands and ``send`` for status tuples.
        worker_lifetime: number of seconds after start at which the worker
            stops itself.
        tbuf: taskBuffer object; when None the global taskBuffer is imported
            and initialised with a single DB connection.

    Returns:
        None.
    """
    # pid of the worker
    my_pid = os.getpid()
    # identifier used for DB process locks: short host name, process group, pid
    my_full_pid = '{0}-{1}-{2}'.format(socket.getfqdn().split('.')[0], os.getpgrp(), my_pid)
    # logger to log in file
    base_logger = logger_utils.setup_logger('daemons')
    tmp_log = logger_utils.make_logger(base_logger, 'worker_pid={pid}'.format(pid=my_pid))
    tmp_log.info('daemon worker start')
    # signal handler: only logs the signal, it does not stop the worker itself
    def got_end_sig(sig, frame):
        tmp_log.warning('(got signal {sig})'.format(sig=sig))
    for sig in END_SIGNALS:
        signal.signal(sig, got_end_sig)
    # dict of all daemons and their script module object
    module_map = {}
    # package of daemon scripts
    mod_package = getattr(daemon_config, 'package')
    # start timestamp
    start_ts = time.time()
    # expiry time
    expiry_ts = start_ts + worker_lifetime
    # create taskBuffer object if not given
    if tbuf is None:
        # initialize cx_Oracle using dummy connection
        try:
            from pandaserver.taskbuffer.Initializer import initializer
            initializer.init()
        except Exception as e:
            tmp_log.error('failed to launch initializer with {err} ; terminated'.format(
                                err='{0}: {1}'.format(e.__class__.__name__, e)))
            return
        # taskBuffer object
        try:
            from pandaserver.taskbuffer.TaskBuffer import taskBuffer as tbuf
            tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
            tmp_log.debug('taskBuffer initialized')
        except Exception as e:
            tmp_log.error('failed to initialize taskBuffer with {err} ; terminated'.format(
                                err='{0}: {1}'.format(e.__class__.__name__, e)))
            return
    # import module of all daemons; a daemon whose module fails to import is
    # left out of module_map and its messages are later treated as invalid
    for dem_name, attrs in dem_config.items():
        mod_name = attrs['module']
        try:
            the_module = importlib.import_module('.{mod}'.format(mod=mod_name), mod_package)
            module_map[dem_name] = the_module
        except Exception as e:
            tmp_log.warning('for daemon {dem}, failed to import {mod} with {err} ; skipped it'.format(
                                dem=dem_name, mod=mod_name, err='{0}: {1}'.format(e.__class__.__name__, e)))
        else:
            # NOTE(review): repeats the assignment already made in the try body
            module_map[dem_name] = the_module
    tmp_log.debug('initialized, running')
    # loop
    while True:
        # stop the worker when it reaches its lifetime
        if time.time() > expiry_ts:
            tmp_log.info('worker reached its lifetime, stop this worker')
            break
        # get command from pipe
        if pipe_conn.poll():
            cmd = pipe_conn.recv()
            if cmd == CMD_STOP:
                # got stop command, stop the process
                tmp_log.info('got stop command, stop this worker')
                break
            else:
                tmp_log.debug('got invalid command "{cmd}" ; skipped it'.format(cmd=cmd))
        # clean up memory
        gc.collect()
        # get a message from queue
        tmp_log.debug('waiting for message...')
        keep_going = True
        one_msg = None
        # block on the queue in 5-second slices so expiry is still noticed
        while True:
            try:
                one_msg = msg_queue.get(timeout=5)
                break
            except queue.Empty:
                # timeout to get from queue, check whether to keep going
                if time.time() > expiry_ts:
                    # worker expired, do not keep going
                    keep_going = False
                    break
        # keep going; when expired, jump back to the lifetime check at the top
        if not keep_going:
            continue
        # process message
        if one_msg in module_map and one_msg is not None:
            # got a daemon name, get the module object and corresponding attributes
            dem_name = one_msg
            tmp_log.debug('got message of {dem}'.format(dem=dem_name))
            the_module = module_map[dem_name]
            attrs = dem_config[dem_name]
            mod_args = attrs['arguments']
            # argv passed to the daemon: this file's path followed by the configured arguments
            mod_argv = tuple([__file__] + mod_args)
            dem_period = attrs['period']
            # period is compared against time.time() deltas below (seconds);
            # the lock calls are given the value divided by 60
            dem_period_in_minute = dem_period/60.
            is_sync = attrs['sync']
            is_loop = attrs['loop']
            # initialize variables
            to_run_daemon = False
            has_run = False
            last_run_start_ts = 0
            last_run_end_ts = 0
            # component name in lock table
            component = 'pandaD.{dem}'.format(dem=dem_name)
            # whether the daemon should be synchronized among nodes
            if is_sync:
                # synchronized daemon, check process lock in DB
                ret_val, locked_time = tbuf.checkProcessLock_PANDA(component=component, pid=my_full_pid, time_limit=dem_period_in_minute)
                if ret_val:
                    # locked by some process on other nodes
                    # report the lock time as the start of the last run
                    last_run_start_ts = int((locked_time - EPOCH).total_seconds())
                    tmp_log.debug('found {dem} is locked by other process ; skipped it'.format(dem=dem_name))
                else:
                    # try to get the lock
                    got_lock = tbuf.lockProcess_PANDA(component=component, pid=my_full_pid, time_limit=dem_period_in_minute)
                    if got_lock:
                        # got the lock
                        to_run_daemon = True
                        tmp_log.debug('got lock of {dem}'.format(dem=dem_name))
                    else:
                        # did not get lock, skip
                        last_run_start_ts = int(time.time())
                        tmp_log.debug('did not get lock of {dem} ; skipped it'.format(dem=dem_name))
            else:
                to_run_daemon = True
            # run daemon
            if to_run_daemon:
                last_run_start_ts = int(time.time())
                try:
                    if is_loop:
                        # go looping the script until reaching daemon period
                        tmp_log.info('{dem} start looping'.format(dem=dem_name))
                        # NOTE: rebinds start_ts; expiry_ts was already computed from the worker start
                        start_ts = time.time()
                        while True:
                            ret_val = the_module.main(argv=mod_argv, tbuf=tbuf)
                            now_ts = time.time()
                            if not ret_val:
                                # daemon main function says stop the loop
                                break
                            if now_ts > start_ts + dem_period:
                                # longer than the period, stop the loop
                                break
                        tmp_log.info('{dem} finish looping'.format(dem=dem_name))
                    else:
                        # execute the module script with arguments
                        tmp_log.info('{dem} start'.format(dem=dem_name))
                        the_module.main(argv=mod_argv, tbuf=tbuf)
                        tmp_log.info('{dem} finish'.format(dem=dem_name))
                except Exception as e:
                    # with error
                    tb = traceback.format_exc()
                    tmp_log.error('failed to run daemon {dem} with {err} ; stop this worker'.format(
                                    dem=dem_name, err='{0}: {1}\n{2}\n'.format(e.__class__.__name__, e, tb)))
                    # daemon has run but failed
                    last_run_end_ts = int(time.time())
                    has_run = True
                    # send daemon status back to master
                    status_tuple = (dem_name, has_run, last_run_start_ts, last_run_end_ts)
                    pipe_conn.send(status_tuple)
                    # stop the worker
                    break
                else:
                    # daemon has run
                    last_run_end_ts = int(time.time())
                    has_run = True
            # send daemon status back to master
            # (sent for skipped daemons too, with has_run=False)
            status_tuple = (dem_name, has_run, last_run_start_ts, last_run_end_ts)
            pipe_conn.send(status_tuple)
            # FIXME: stop and spawn worker in every run for now since some script breaks the worker without exception
            # tmp_log.info('as script done, stop this worker')
            # break
        else:
            # got invalid message
            tmp_log.warning('got invalid message "{msg}", skipped it'.format(msg=one_msg))
        # sleep (2**-5 s = 31.25 ms) before the next iteration
        time.sleep(2**-5)