def main(tbuf=None, **kwargs):
    # logger
    tmpLog = LogWrapper(_logger)
    tmpLog.debug("================= start ==================")
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate MyProxy I/F
    my_proxy_interface_instance = panda_proxy_cache.MyProxyInterface()
    # roles
    if hasattr(panda_config, 'proxy_cache_roles'):
        roles = panda_config.proxy_cache_roles.split(',')
    else:
        roles = ['atlas', 'atlas:/atlas/Role=production', 'atlas:/atlas/Role=pilot']
    # get users
    sql = 'SELECT DISTINCT DN FROM ATLAS_PANDAMETA.users WHERE GRIDPREF LIKE :patt'
    varMap = {}
    varMap[':patt'] = '%p%'
    tmpStat, tmpRes = taskBuffer.querySQLS(sql, varMap)
    for realDN, in tmpRes:
        if realDN is None:
            continue
        realDN = CoreUtils.get_bare_dn(realDN, keep_digits=False)
        name = taskBuffer.cleanUserID(realDN)
        # check proxy for each role
        tmpLog.debug("check proxy cache for {}".format(name))
        for role in roles:
            my_proxy_interface_instance.checkProxy(realDN, role=role, name=name)
    tmpLog.debug("done")
def run(inFile, v_onlyTA, v_firstSubmission):
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    try:
        # read jobs from file
        with open(inFile, 'rb') as f:
            jobs = pickle.load(f)
    except Exception as e:
        print("run() : %s %s" % (str(e), traceback.format_exc()))
        return
    # password
    from pandaserver.config import panda_config
    # initialize cx_Oracle using dummy connection
    from pandaserver.taskbuffer.Initializer import initializer
    initializer.init()
    # instantiate TB
    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # run Setupper
    from pandaserver.dataservice.Setupper import Setupper
    thr = Setupper(taskBuffer, jobs, onlyTA=v_onlyTA, firstSubmission=v_firstSubmission)
    thr.start()
    thr.join()
    return
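# Hedged sketch (not part of the original file): the inFile consumed by run()
# is expected to hold a pickled list of job specs. The producer side would
# write it along these lines, `jobs` being a list of JobSpec objects and
# `inFile` a path under the server's cache/log area (both hypothetical here):
#
#     with open(inFile, 'wb') as f:
#         pickle.dump(jobs, f, protocol=pickle.HIGHEST_PROTOCOL)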
def __init__(self, site, cloud, nJobs):
    """Initialize the instance with a site, a cloud, and the number of jobs."""
    self.__site = site
    self.__cloud = cloud
    self.__nJobs = nJobs
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
def main(tbuf=None, **kwargs):
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # run
    WorkerSync(tbuf=taskBuffer).run()
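# Hedged usage sketch (not in the original script): main() follows the tbuf
# convention shared by these daemon entry points, so a daemon master can pass
# a pre-initialized TaskBuffer and avoid opening an extra DB connection.
# `my_tbuf` is a hypothetical, already-initialized TaskBuffer instance:
#
#     main(tbuf=my_tbuf)   # reuses the given connection
#     main()               # opens its own single DB connection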
def main(argv=tuple(), tbuf=None, **kwargs):
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # dbif session
    session = dbif.get_session()
    # if no argument, call the basic configurator
    if len(argv) == 1:
        _logger = logger_utils.make_logger(base_logger, 'Configurator')
        t1 = time.time()
        configurator = Configurator(session=session)
        if not configurator.run():
            _logger.critical('Configurator loop FAILED')
        t2 = time.time()
        _logger.debug('Configurator run took {0}s'.format(t2 - t1))
    # if --network argument, call the network configurator
    elif len(argv) == 2 and argv[1].lower() == '--network':
        _logger = logger_utils.make_logger(base_logger, 'NetworkConfigurator')
        t1 = time.time()
        network_configurator = NetworkConfigurator(taskBuffer=taskBuffer, session=session)
        if not network_configurator.run():
            _logger.critical('NetworkConfigurator loop FAILED')
        t2 = time.time()
        _logger.debug('NetworkConfigurator run took {0}s'.format(t2 - t1))
    # if --json_dump argument, call the JSON dumper
    elif len(argv) == 2 and argv[1].lower() == '--json_dump':
        _logger = logger_utils.make_logger(base_logger, 'JsonDumper')
        t1 = time.time()
        json_dumper = JsonDumper(taskBuffer=taskBuffer, session=session)
        out_msg = json_dumper.run()
        _logger.debug('JsonDumper finished with {0}'.format(out_msg))
        t2 = time.time()
        _logger.debug('JsonDumper run took {0}s'.format(t2 - t1))
    else:
        # define the logger here since no branch above has set it
        _logger = logger_utils.make_logger(base_logger, 'Configurator')
        _logger.error('Configurator called with wrong arguments. Use either no arguments or --network or --json_dump')
    # close dbif session
    session.close()
    dbif.engine_dispose()
def __init__(self): """ Initialization and configuration """ threading.Thread.__init__(self) taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1) if hasattr(panda_config, 'AGIS_URL_SCHEDCONFIG'): self.AGIS_URL_SCHEDCONFIG = panda_config.AGIS_URL_SCHEDCONFIG else: self.AGIS_URL_SCHEDCONFIG = 'http://atlas-agis-api.cern.ch/request/pandaqueue/query/list/?json&preset=schedconf.all&vo_name=atlas&state=ACTIVE' _logger.debug('Getting schedconfig dump...') self.schedconfig_dump = aux.get_dump(self.AGIS_URL_SCHEDCONFIG) _logger.debug('Done')
def __init__(self):
    threading.Thread.__init__(self)
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # NWS URL; use the config value if defined
    if hasattr(panda_config, 'NWS_URL'):
        self.NWS_URL = panda_config.NWS_URL
    else:
        self.NWS_URL = 'http://atlas-adc-netmetrics-lb.cern.ch/metrics/latest.json'
    _logger.debug('Getting NWS dump...')
    self.nws_dump = aux.get_dump(self.NWS_URL)
    _logger.debug('Done')
    # AGIS cost matrix URL; use the config value if defined
    if hasattr(panda_config, 'AGIS_URL_CM'):
        self.AGIS_URL_CM = panda_config.AGIS_URL_CM
    else:
        self.AGIS_URL_CM = 'http://atlas-agis-api.cern.ch/request/site/query/list_links/?json'
    _logger.debug('Getting AGIS cost matrix dump...')
    self.agis_cm_dump = aux.get_dump(self.AGIS_URL_CM)
    _logger.debug('Done')
def main(tbuf=None, **kwargs):
    _logger.debug("===================== start =====================")
    # overall timeout value
    overallTimeout = 300
    # prefix of evp files
    prefixEVP = 'evp.'
    # file pattern of evp files
    evpFilePatt = panda_config.cache_dir + '/' + prefixEVP + '*'
    # # kill old process
    # try:
    #     # time limit
    #     timeLimit = datetime.datetime.utcnow() - datetime.timedelta(minutes=overallTimeout)
    #     # get process list
    #     scriptName = sys.argv[0]
    #     out = commands_get_status_output('env TZ=UTC ps axo user,pid,lstart,args | grep %s' % scriptName)[-1]
    #     for line in out.split('\n'):
    #         items = line.split()
    #         # owned process
    #         if items[0] not in ['sm', 'atlpan', 'pansrv', 'root']:  # ['os.getlogin()']: doesn't work in cron
    #             continue
    #         # look for python
    #         if re.search('python', line) is None:
    #             continue
    #         # PID
    #         pid = items[1]
    #         # start time
    #         timeM = re.search(r'(\S+\s+\d+ \d+:\d+:\d+ \d+)', line)
    #         startTime = datetime.datetime(*time.strptime(timeM.group(1), '%b %d %H:%M:%S %Y')[:6])
    #         # kill old process
    #         if startTime < timeLimit:
    #             _logger.debug("old process : %s %s" % (pid, startTime))
    #             _logger.debug(line)
    #             commands_get_status_output('kill -9 %s' % pid)
    # except Exception:
    #     type, value, traceBack = sys.exc_info()
    #     _logger.error("kill process : %s %s" % (type, value))
    # instantiate PD2P
    # if tbuf is None:
    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    # else:
    #     taskBuffer = tbuf
    siteMapper = SiteMapper.SiteMapper(taskBuffer)

    # thread pool
    class ThreadPool:
        def __init__(self):
            self.lock = threading.Lock()
            self.list = []

        def add(self, obj):
            self.lock.acquire()
            self.list.append(obj)
            self.lock.release()

        def remove(self, obj):
            self.lock.acquire()
            self.list.remove(obj)
            self.lock.release()

        def join(self):
            self.lock.acquire()
            thrlist = tuple(self.list)
            self.lock.release()
            for thr in thrlist:
                thr.join()

    # thread to run event picking
    class EvpThr(threading.Thread):
        def __init__(self, lock, pool, aTaskBuffer, aSiteMapper, fileName, ignoreError):
            threading.Thread.__init__(self)
            self.lock = lock
            self.pool = pool
            self.fileName = fileName
            self.evp = EventPicker(aTaskBuffer, aSiteMapper, fileName, ignoreError)
            self.pool.add(self)

        def run(self):
            self.lock.acquire()
            retRun = self.evp.run()
            _logger.debug("%s : %s" % (retRun, self.fileName))
            self.pool.remove(self)
            self.lock.release()

    # get files
    _logger.debug("EVP session")
    timeNow = datetime.datetime.utcnow()
    timeInt = datetime.datetime.utcnow()
    fileList = glob.glob(evpFilePatt)
    fileList.sort()
    # create thread pool and semaphore
    adderLock = threading.Semaphore(1)
    adderThreadPool = ThreadPool()
    # add
    while len(fileList) != 0:
        # time limit to avoid too many pickers running at the same time
        if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout):
            _logger.debug("time over in EVP session")
            break
        # try to get semaphore
        adderLock.acquire()
        # refresh fileList periodically
        if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15):
            timeInt = datetime.datetime.utcnow()
            # get files
            fileList = glob.glob(evpFilePatt)
            fileList.sort()
        # choose a file
        fileName = fileList.pop(0)
        # release lock
        adderLock.release()
        if not os.path.exists(fileName):
            continue
        try:
            modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:7]))
            if (timeNow - modTime) > datetime.timedelta(hours=24):
                # last chance
                _logger.debug("Last event picking : %s" % fileName)
                thr = EvpThr(adderLock, adderThreadPool, taskBuffer, siteMapper, fileName, False)
                thr.start()
            elif (timeInt - modTime) > datetime.timedelta(minutes=1):
                # try
                _logger.debug("event picking : %s" % fileName)
                thr = EvpThr(adderLock, adderThreadPool, taskBuffer, siteMapper, fileName, True)
                thr.start()
            else:
                _logger.debug("%s : %s" % ((timeInt - modTime), fileName))
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            _logger.error("%s %s" % (errType, errValue))
    # join all threads
    adderThreadPool.join()
    _logger.debug("===================== end =====================")
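# A minimal, self-contained sketch (added for illustration; not part of the
# original script) of the ThreadPool + Semaphore(1) pattern used above: each
# worker registers itself in the pool when constructed, holds the semaphore
# while it works so only one file is processed at a time, and join() waits
# for whatever workers are still registered.
import threading
import time


class DemoPool:
    def __init__(self):
        self.lock = threading.Lock()
        self.list = []

    def add(self, obj):
        with self.lock:
            self.list.append(obj)

    def remove(self, obj):
        with self.lock:
            self.list.remove(obj)

    def join(self):
        with self.lock:
            thrlist = tuple(self.list)
        for thr in thrlist:
            thr.join()


class DemoThr(threading.Thread):
    def __init__(self, sem, pool, file_name):
        threading.Thread.__init__(self)
        self.sem = sem
        self.pool = pool
        self.file_name = file_name
        self.pool.add(self)

    def run(self):
        with self.sem:
            time.sleep(0.1)  # stand-in for EventPicker(...).run()
        self.pool.remove(self)


if __name__ == '__main__':
    sem = threading.Semaphore(1)
    pool = DemoPool()
    for i in range(3):
        DemoThr(sem, pool, 'evp.file-%d' % i).start()
    pool.join()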
import sys

try:
    # Python 2: urlopen must come from urllib2 to accept a Request object
    from urllib import urlencode
    from urllib2 import Request, urlopen
except ImportError:
    # Python 3
    from urllib.parse import urlencode
    from urllib.request import urlopen, Request

from pandaserver.taskbuffer.TaskBuffer import taskBuffer
from pandaserver.config import panda_config

taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

d = taskBuffer.queryDatasetWithMap({'name': sys.argv[1]})

node = {}
node['vuid'] = d.vuid
node['site'] = sys.argv[2]

url = 'https://localhost:25443/server/panda/datasetCompleted'
# the POST body must be bytes for urlopen on Python 3
rdata = urlencode(node).encode('utf-8')
req = Request(url)
fd = urlopen(req, rdata)
data = fd.read()
print(data)
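# Assumed usage, inferred from the sys.argv handling above (the script file
# name is hypothetical):
#
#     python datasetCompleted.py <datasetName> <siteName>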
def main(backGround=False):
    _logger.debug('starting ...')
    # register signal handlers
    signal.signal(signal.SIGINT, catch_sig)
    signal.signal(signal.SIGHUP, catch_sig)
    signal.signal(signal.SIGTERM, catch_sig)
    signal.signal(signal.SIGALRM, catch_sig)
    signal.alarm(overallTimeout)
    # fork
    pid = os.fork()
    if pid != 0:
        # watch child process
        os.wait()
        time.sleep(1)
    else:
        # main loop
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        # check certificate
        certName = '%s/pandasv1_usercert.pem' % panda_config.certdir
        keyName = '%s/pandasv1_userkey.pem' % panda_config.certdir
        _logger.debug('checking certificate {0}'.format(certName))
        certOK, certMsg = DataServiceUtils.checkCertificate(certName)
        if not certOK:
            _logger.error('bad certificate : {0}'.format(certMsg))
        # initialize cx_Oracle using dummy connection
        from pandaserver.taskbuffer.Initializer import initializer
        initializer.init()
        # instantiate TB
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
        # instantiate sitemapper
        siteMapper = SiteMapper(taskBuffer)
        # ActiveMQ params
        queue = '/queue/Consumer.PANDA.atlas.ddm.siteservices'
        ssl_opts = {'use_ssl': True,
                    'ssl_version': ssl.PROTOCOL_TLSv1,
                    'ssl_cert_file': certName,
                    'ssl_key_file': keyName}
        # resolve multiple brokers
        brokerList = socket.gethostbyname_ex('atlas-mb.cern.ch')[-1]
        # set up listeners
        connList = []
        for tmpBroker in brokerList:
            try:
                clientid = 'PANDA-' + socket.getfqdn() + '-' + tmpBroker
                subscription_id = 'panda-server-consumer-' + socket.getfqdn()
                _logger.debug('setting listener %s' % clientid)
                conn = stomp.Connection(host_and_ports=[(tmpBroker, 61023)], **ssl_opts)
                connList.append(conn)
            except Exception:
                errtype, errvalue = sys.exc_info()[:2]
                _logger.error("failed to connect to %s : %s %s" % (tmpBroker, errtype, errvalue))
                catch_sig(None, None)
        while True:
            for conn in connList:
                try:
                    if not conn.is_connected():
                        conn.set_listener('FileCallbackListener',
                                          FileCallbackListener(conn, taskBuffer, siteMapper, subscription_id))
                        conn.start()
                        conn.connect(headers={'client-id': clientid})
                        conn.subscribe(destination=queue, id=subscription_id, ack='client-individual')
                        _logger.debug('listener %s is up and running' % clientid)
                except Exception:
                    errtype, errvalue = sys.exc_info()[:2]
                    _logger.error("failed to set listener on %s : %s %s" % (tmpBroker, errtype, errvalue))
                    catch_sig(None, None)
            time.sleep(5)
def main(argv=tuple(), tbuf=None, **kwargs):
    try:
        long
    except NameError:
        long = int

    prelock_pid = GenericThread().get_pid()
    tmpLog = LogWrapper(_logger, "<pid={}>".format(prelock_pid))
    tmpLog.debug("===================== start =====================")

    # return value, true to run main again in next daemon loop
    ret_val = True

    # grace period
    try:
        gracePeriod = int(argv[1])
    except Exception:
        gracePeriod = 1

    # lock interval in minutes
    lock_interval = 10
    # retry interval in minutes
    retry_interval = 3

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # thread for adder
    class AdderThread(GenericThread):
        def __init__(self, taskBuffer, aSiteMapper, job_output_reports):
            GenericThread.__init__(self)
            self.taskBuffer = taskBuffer
            self.aSiteMapper = aSiteMapper
            self.job_output_reports = job_output_reports

        # main loop
        def run(self):
            # initialize
            taskBuffer = self.taskBuffer
            aSiteMapper = self.aSiteMapper
            # timestamps
            timeNow = datetime.datetime.utcnow()
            timeInt = datetime.datetime.utcnow()
            # unique pid (re-initialized so the spawned process gets its own)
            GenericThread.__init__(self)
            uniq_pid = self.get_pid()
            # log pid
            tmpLog.debug("pid={0} : run".format(uniq_pid))
            # stats
            n_processed = 0
            # loop
            while True:
                # get report
                one_jor = self.job_output_reports.pop()
                if not one_jor:
                    break
                # lock
                panda_id, job_status, attempt_nr, time_stamp = one_jor
                got_lock = taskBuffer.lockJobOutputReport(panda_id=panda_id, attempt_nr=attempt_nr,
                                                          pid=uniq_pid, time_limit=lock_interval)
                if not got_lock:
                    continue
                # add
                try:
                    modTime = time_stamp
                    if (timeNow - modTime) > datetime.timedelta(hours=24):
                        # last add
                        tmpLog.debug("pid={0} : last add job={1}.{2} st={3}".format(uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = False
                    else:
                        # usual add
                        tmpLog.debug("pid={0} : add job={1}.{2} st={3}".format(uniq_pid, panda_id, attempt_nr, job_status))
                        ignoreTmpError = True
                    # get adder
                    adder_gen = AdderGen(taskBuffer, panda_id, job_status, attempt_nr,
                                         ignoreTmpError=ignoreTmpError, siteMapper=aSiteMapper,
                                         pid=uniq_pid, prelock_pid=uniq_pid,
                                         lock_offset=lock_interval - retry_interval)
                    n_processed += 1
                    # execute
                    adder_gen.run()
                    del adder_gen
                except Exception as e:
                    tmpLog.error("pid={} : failed to run with {} {}".format(uniq_pid, str(e), traceback.format_exc()))
            # stats
            tmpLog.debug("pid={} : processed {}".format(uniq_pid, n_processed))

        # launcher, run with multiprocessing
        def proc_launch(self):
            # run
            self.process = multiprocessing.Process(target=self.run)
            self.process.start()

        # join of multiprocessing
        def proc_join(self):
            self.process.join()

    # TaskBuffer with more connections behind TaskBufferInterface
    tmpLog.debug("setup taskBufferIF")
    n_connections = 4
    _tbuf = TaskBuffer()
    _tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=n_connections)
    taskBufferIF = TaskBufferInterface()
    taskBufferIF.launch(_tbuf)

    # add files
    tmpLog.debug("run Adder")
    interval = 10
    nLoop = 10
    for iLoop in range(nLoop):
        tmpLog.debug('start iLoop={}/{}'.format(iLoop, nLoop))
        start_time = datetime.datetime.utcnow()
        adderThrList = []
        nThr = 10
        n_jors_per_batch = 1000
        jor_lists = WeightedLists(multiprocessing.Lock())
        # get some job output reports, weighted 3 (non-user) vs 7 (user)
        jor_list_others = taskBuffer.listJobOutputReport(only_unlocked=True, time_limit=lock_interval,
                                                         limit=n_jors_per_batch * nThr, grace_period=gracePeriod,
                                                         anti_labels=['user'])
        jor_lists.add(3, jor_list_others)
        jor_list_user = taskBuffer.listJobOutputReport(only_unlocked=True, time_limit=lock_interval,
                                                       limit=n_jors_per_batch * nThr, grace_period=gracePeriod,
                                                       labels=['user'])
        jor_lists.add(7, jor_list_user)
        # adder consumer processes
        _n_thr_with_tbuf = 0
        tbuf_list = []
        tmpLog.debug("got {} job reports".format(len(jor_lists)))
        for i in range(nThr):
            if i < _n_thr_with_tbuf:
                tbuf = TaskBuffer()
                tbuf_list.append(tbuf)
                tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
                thr = AdderThread(tbuf, aSiteMapper, jor_lists)
            else:
                thr = AdderThread(taskBufferIF.getInterface(), aSiteMapper, jor_lists)
            adderThrList.append(thr)
        # start all threads
        for thr in adderThrList:
            # thr.start()
            thr.proc_launch()
            time.sleep(0.25)
        # join all threads
        for thr in adderThrList:
            # thr.join()
            thr.proc_join()
        [tbuf.cleanup() for tbuf in tbuf_list]
        end_time = datetime.datetime.utcnow()
        sleep_time = interval - (end_time - start_time).seconds
        if sleep_time > 0 and iLoop + 1 < nLoop:
            sleep_time = random.randint(1, sleep_time)
            tmpLog.debug("sleep {} sec".format(sleep_time))
            time.sleep(sleep_time)

    # stop TaskBuffer IF
    taskBufferIF.stop()
    tmpLog.debug("===================== end =====================")
    # return
    return ret_val
def main(tbuf=None, **kwargs):
    _logger.debug("===================== start =====================")
    # overall timeout value
    overallTimeout = 300
    # prefix of the files
    if 'target' in kwargs and kwargs['target']:
        evpFilePatt = kwargs['target']
    else:
        prefixEVP = '/workflow.'
        # file pattern of evp files
        evpFilePatt = panda_config.cache_dir + '/' + prefixEVP + '*'

    from pandaserver.taskbuffer.TaskBuffer import taskBuffer
    taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)

    test_mode = kwargs.get('test_mode', False)
    dump_workflow = kwargs.get('dump_workflow', False)

    # thread pool
    class ThreadPool:
        def __init__(self):
            self.lock = threading.Lock()
            self.list = []

        def add(self, obj):
            self.lock.acquire()
            self.list.append(obj)
            self.lock.release()

        def remove(self, obj):
            self.lock.acquire()
            self.list.remove(obj)
            self.lock.release()

        def join(self):
            self.lock.acquire()
            thrlist = tuple(self.list)
            self.lock.release()
            for thr in thrlist:
                thr.join()

    # thread to process a workflow file
    class EvpThr(threading.Thread):
        def __init__(self, task_buffer, lock, pool, file_name, to_delete, get_log):
            threading.Thread.__init__(self)
            self.lock = lock
            self.pool = pool
            self.fileName = file_name
            self.to_delete = to_delete
            self.get_log = get_log
            self.pool.add(self)
            self.processor = WorkflowProcessor(task_buffer=task_buffer, log_stream=_logger)

        def run(self):
            self.lock.acquire()
            try:
                self.processor.process(self.fileName, self.to_delete, test_mode, self.get_log, dump_workflow)
            except Exception as e:
                _logger.error("{} {}".format(str(e), traceback.format_exc()))
            self.pool.remove(self)
            self.lock.release()

    # get files
    timeNow = datetime.datetime.utcnow()
    timeInt = datetime.datetime.utcnow()
    fileList = glob.glob(evpFilePatt)
    fileList.sort()

    # create thread pool and semaphore
    adderLock = threading.Semaphore(1)
    adderThreadPool = ThreadPool()

    # add
    while len(fileList) != 0:
        # time limit to avoid too many processors running at the same time
        if (datetime.datetime.utcnow() - timeNow) > datetime.timedelta(minutes=overallTimeout):
            _logger.debug("time over in main session")
            break
        # try to get semaphore
        adderLock.acquire()
        # refresh fileList periodically
        if (datetime.datetime.utcnow() - timeInt) > datetime.timedelta(minutes=15):
            timeInt = datetime.datetime.utcnow()
            # get files
            fileList = glob.glob(evpFilePatt)
            fileList.sort()
        # choose a file
        fileName = fileList.pop(0)
        # release lock
        adderLock.release()
        if not os.path.exists(fileName):
            continue
        try:
            modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(fileName))[:7]))
            to_go = True
            if test_mode:
                _logger.debug("Testing : %s" % fileName)
                to_delete = False
            elif (timeNow - modTime) > datetime.timedelta(hours=2):
                # last chance
                _logger.debug("Last attempt : %s" % fileName)
                to_delete = True
            elif (timeInt - modTime) > datetime.timedelta(seconds=5):
                # try
                _logger.debug("Normal attempt : %s" % fileName)
                to_delete = False
            else:
                _logger.debug("Wait %s : %s" % ((timeInt - modTime), fileName))
                to_go = False
            if to_go:
                thr = EvpThr(taskBuffer, adderLock, adderThreadPool, fileName, to_delete, False)
                thr.start()
        except Exception as e:
            _logger.error("{} {}".format(str(e), traceback.format_exc()))

    # join all threads
    adderThreadPool.join()
    _logger.debug("===================== end =====================")
def daemon_loop(dem_config, msg_queue, pipe_conn, worker_lifetime, tbuf=None):
    # pid of the worker
    my_pid = os.getpid()
    my_full_pid = '{0}-{1}-{2}'.format(socket.getfqdn().split('.')[0], os.getpgrp(), my_pid)
    # logger to log in file
    base_logger = logger_utils.setup_logger('daemons')
    tmp_log = logger_utils.make_logger(base_logger, 'worker_pid={pid}'.format(pid=my_pid))
    tmp_log.info('daemon worker start')

    # signal handler
    def got_end_sig(sig, frame):
        tmp_log.warning('(got signal {sig})'.format(sig=sig))
    for sig in END_SIGNALS:
        signal.signal(sig, got_end_sig)

    # dict of all daemons and their script module objects
    module_map = {}
    # package of daemon scripts
    mod_package = getattr(daemon_config, 'package')
    # start timestamp
    start_ts = time.time()
    # expiry time
    expiry_ts = start_ts + worker_lifetime
    # create taskBuffer object if not given
    if tbuf is None:
        # initialize cx_Oracle using dummy connection
        try:
            from pandaserver.taskbuffer.Initializer import initializer
            initializer.init()
        except Exception as e:
            tmp_log.error('failed to launch initializer with {err} ; terminated'.format(
                err='{0}: {1}'.format(e.__class__.__name__, e)))
            return
        # taskBuffer object
        try:
            from pandaserver.taskbuffer.TaskBuffer import taskBuffer as tbuf
            tbuf.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
            tmp_log.debug('taskBuffer initialized')
        except Exception as e:
            tmp_log.error('failed to initialize taskBuffer with {err} ; terminated'.format(
                err='{0}: {1}'.format(e.__class__.__name__, e)))
            return
    # import modules of all daemons
    for dem_name, attrs in dem_config.items():
        mod_name = attrs['module']
        try:
            the_module = importlib.import_module('.{mod}'.format(mod=mod_name), mod_package)
        except Exception as e:
            tmp_log.warning('for daemon {dem}, failed to import {mod} with {err} ; skipped it'.format(
                dem=dem_name, mod=mod_name, err='{0}: {1}'.format(e.__class__.__name__, e)))
        else:
            module_map[dem_name] = the_module
    tmp_log.debug('initialized, running')
    # loop
    while True:
        # stop the worker when it reaches its lifetime
        if time.time() > expiry_ts:
            tmp_log.info('worker reached its lifetime, stop this worker')
            break
        # get command from pipe
        if pipe_conn.poll():
            cmd = pipe_conn.recv()
            if cmd == CMD_STOP:
                # got stop command, stop the process
                tmp_log.info('got stop command, stop this worker')
                break
            else:
                tmp_log.debug('got invalid command "{cmd}" ; skipped it'.format(cmd=cmd))
        # clean up memory
        gc.collect()
        # get a message from queue
        tmp_log.debug('waiting for message...')
        keep_going = True
        one_msg = None
        while True:
            try:
                one_msg = msg_queue.get(timeout=5)
                break
            except queue.Empty:
                # timeout while getting from queue; check whether to keep going
                if time.time() > expiry_ts:
                    # worker expired, do not keep going
                    keep_going = False
                    break
        # keep going
        if not keep_going:
            continue
        # process message
        if one_msg in module_map and one_msg is not None:
            # got a daemon name; get the module object and corresponding attributes
            dem_name = one_msg
            tmp_log.debug('got message of {dem}'.format(dem=dem_name))
            the_module = module_map[dem_name]
            attrs = dem_config[dem_name]
            mod_args = attrs['arguments']
            mod_argv = tuple([__file__] + mod_args)
            dem_period = attrs['period']
            dem_period_in_minute = dem_period / 60.
            is_sync = attrs['sync']
            is_loop = attrs['loop']
            # initialize variables
            to_run_daemon = False
            has_run = False
            last_run_start_ts = 0
            last_run_end_ts = 0
            # component name in lock table
            component = 'pandaD.{dem}'.format(dem=dem_name)
            # whether the daemon should be synchronized among nodes
            if is_sync:
                # synchronized daemon, check process lock in DB
                ret_val, locked_time = tbuf.checkProcessLock_PANDA(component=component, pid=my_full_pid,
                                                                   time_limit=dem_period_in_minute)
                if ret_val:
                    # locked by some process on other nodes
                    last_run_start_ts = int((locked_time - EPOCH).total_seconds())
                    tmp_log.debug('found {dem} is locked by other process ; skipped it'.format(dem=dem_name))
                else:
                    # try to get the lock
                    got_lock = tbuf.lockProcess_PANDA(component=component, pid=my_full_pid,
                                                      time_limit=dem_period_in_minute)
                    if got_lock:
                        # got the lock
                        to_run_daemon = True
                        tmp_log.debug('got lock of {dem}'.format(dem=dem_name))
                    else:
                        # did not get lock, skip
                        last_run_start_ts = int(time.time())
                        tmp_log.debug('did not get lock of {dem} ; skipped it'.format(dem=dem_name))
            else:
                to_run_daemon = True
            # run daemon
            if to_run_daemon:
                last_run_start_ts = int(time.time())
                try:
                    if is_loop:
                        # keep looping the script until reaching the daemon period
                        tmp_log.info('{dem} start looping'.format(dem=dem_name))
                        start_ts = time.time()
                        while True:
                            ret_val = the_module.main(argv=mod_argv, tbuf=tbuf)
                            now_ts = time.time()
                            if not ret_val:
                                # daemon main function says stop the loop
                                break
                            if now_ts > start_ts + dem_period:
                                # longer than the period, stop the loop
                                break
                        tmp_log.info('{dem} finish looping'.format(dem=dem_name))
                    else:
                        # execute the module script with arguments
                        tmp_log.info('{dem} start'.format(dem=dem_name))
                        the_module.main(argv=mod_argv, tbuf=tbuf)
                        tmp_log.info('{dem} finish'.format(dem=dem_name))
                except Exception as e:
                    # with error
                    tb = traceback.format_exc()
                    tmp_log.error('failed to run daemon {dem} with {err} ; stop this worker'.format(
                        dem=dem_name, err='{0}: {1}\n{2}\n'.format(e.__class__.__name__, e, tb)))
                    # daemon has run but failed
                    last_run_end_ts = int(time.time())
                    has_run = True
                    # send daemon status back to master
                    status_tuple = (dem_name, has_run, last_run_start_ts, last_run_end_ts)
                    pipe_conn.send(status_tuple)
                    # stop the worker
                    break
                else:
                    # daemon has run
                    last_run_end_ts = int(time.time())
                    has_run = True
            # send daemon status back to master
            status_tuple = (dem_name, has_run, last_run_start_ts, last_run_end_ts)
            pipe_conn.send(status_tuple)
            # FIXME: stop and spawn worker in every run for now since some script breaks the worker without exception
            # tmp_log.info('as script done, stop this worker')
            # break
        else:
            # got invalid message
            tmp_log.warning('got invalid message "{msg}", skipped it'.format(msg=one_msg))
        # sleep
        time.sleep(2**-5)
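# Hedged sketch (not in the original module) of the master side of the
# protocol implemented by daemon_loop(): the master owns the message queue
# and one pipe per worker, pushes daemon names into the queue, and reads
# back (dem_name, has_run, last_run_start_ts, last_run_end_ts) tuples.
# `dummy_config` is a hypothetical dem_config entry with the attribute
# names accessed above ('module', 'arguments', 'period', 'sync', 'loop').
#
#     import multiprocessing
#
#     dummy_config = {'dummy_daemon': {'module': 'dummy_script', 'arguments': [],
#                                      'period': 300, 'sync': False, 'loop': False}}
#     msg_queue = multiprocessing.Queue()
#     master_conn, worker_conn = multiprocessing.Pipe()
#     worker = multiprocessing.Process(target=daemon_loop,
#                                      args=(dummy_config, msg_queue, worker_conn, 3600))
#     worker.start()
#     msg_queue.put('dummy_daemon')      # ask the worker to run one daemon
#     status_tuple = master_conn.recv()  # wait for the status report
#     master_conn.send(CMD_STOP)         # then tell the worker to stop
#     worker.join()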
""" import datetime import types # config file from pandaserver.config import panda_config # initialize cx_Oracle using dummy connection from pandaserver.taskbuffer.Initializer import initializer initializer.init() # initialzie TaskBuffer from pandaserver.taskbuffer.TaskBuffer import taskBuffer taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,panda_config.nDBConnection,True) # initialize JobDispatcher from pandaserver.jobdispatcher.JobDispatcher import jobDispatcher if panda_config.nDBConnection != 0: jobDispatcher.init(taskBuffer) # initialize DataService from pandaserver.dataservice.DataService import dataService if panda_config.nDBConnection != 0: dataService.init(taskBuffer) # initialize UserIF from pandaserver.userinterface.UserIF import userIF if panda_config.nDBConnection != 0: userIF.init(taskBuffer)
def main(taskBuffer=None, exec_options=None, log_stream=None, args_list=None):
    # options
    parser = argparse.ArgumentParser()
    if taskBuffer:
        parser.add_argument('--ds', action='store', dest='ds', default=None,
                            help='dataset name')
    else:
        parser.add_argument('--ds', action='store', dest='ds', default=None, required=True,
                            help='dataset name')
    parser.add_argument('--files', action='store', dest='files', default=None,
                        help='comma-separated list of lost file names. The list is deduced if this option is omitted')
    parser.add_argument('--noChildRetry', action='store_const', const=True, dest='noChildRetry', default=False,
                        help='not retry child tasks')
    parser.add_argument('--resurrectDS', action='store_const', const=True, dest='resurrectDS', default=False,
                        help='resurrect output and log datasets if they were already deleted')
    parser.add_argument('--dryRun', action='store_const', const=True, dest='dryRun', default=False,
                        help='dry run')
    parser.add_argument('--force', action='store_const', const=True, dest='force', default=False,
                        help='force retry even if no lost files')
    parser.add_argument('--reproduceParent', action='store_const', const=True, dest='reproduceParent', default=False,
                        help='reproduce the input files from which the lost files were produced. '
                             'Typically useful to recover merged files when unmerged files were already deleted')
    # parse options
    if taskBuffer:
        if args_list:
            options = parser.parse_args(args_list)
        else:
            options, unknown = parser.parse_known_args()
    else:
        if args_list:
            options = parser.parse_args(args_list)
        else:
            options = parser.parse_args()
    # executed via command-line
    givenTaskID = None
    dn = None
    if taskBuffer is None:
        # instantiate TB
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        # set options from dict
        if exec_options is None:
            exec_options = {}
        keys = set(vars(options).keys())
        for k in exec_options:
            if k in keys:
                setattr(options, k, exec_options[k])
        if 'jediTaskID' in exec_options:
            givenTaskID = exec_options['jediTaskID']
        if 'userName' in exec_options:
            dn = exec_options['userName']

    ds_files = {}
    if options.files is not None:
        files = options.files.split(',')
        ds_files[options.ds] = files
    else:
        # look for lost files
        if not givenTaskID:
            # get files from rucio
            st, files_rucio = get_files_from_rucio(options.ds, log_stream)
            if st is not True:
                return st, files_rucio
            # get files from panda
            dsName = options.ds.split(':')[-1]
            fd, fo = taskBuffer.querySQLS(
                'SELECT c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s AND d.datasetName=:name ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':name': dsName})
            for tmpLFN, in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(options.ds, [])
                    ds_files[options.ds].append(tmpLFN)
            # get taskID
            td, to = taskBuffer.querySQLS(
                'SELECT jediTaskID FROM ATLAS_PANDA.JEDI_Datasets '
                'WHERE datasetName=:datasetName AND type IN (:t1,:t2) ',
                {':t1': 'output', ':t2': 'log', ':datasetName': dsName})
            jediTaskID, = to[0]
        else:
            # get dataset names
            dd, do = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets '
                'WHERE jediTaskID=:jediTaskID AND type IN (:t1,:t2) ',
                {':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            # get files from rucio
            files_rucio = set()
            for tmpDS, in do:
                st, tmp_files_rucio = get_files_from_rucio(tmpDS, log_stream)
                if st is None:
                    return st, tmp_files_rucio
                # ignore unknown dataset
                if st:
                    files_rucio = files_rucio.union(tmp_files_rucio)
            # get files from panda
            fd, fo = taskBuffer.querySQLS(
                'SELECT d.datasetName,c.lfn FROM ATLAS_PANDA.JEDI_Datasets d,ATLAS_PANDA.JEDI_Dataset_Contents c '
                'WHERE d.jediTaskID=:jediTaskID AND c.jediTaskID=d.jediTaskID AND c.datasetID=d.datasetID AND '
                'd.type IN (:t1,:t2) AND c.status=:s ',
                {':s': 'finished', ':t1': 'output', ':t2': 'log', ':jediTaskID': givenTaskID})
            for tmpDS, tmpLFN in fo:
                if tmpLFN not in files_rucio:
                    ds_files.setdefault(tmpDS, [])
                    ds_files[tmpDS].append(tmpLFN)
        for tmpDS in ds_files:
            files = ds_files[tmpDS]
            msgStr = '{} has {} lost files -> {}'.format(tmpDS, len(files), ','.join(files))
            if log_stream:
                log_stream.info(msgStr)
            else:
                print(msgStr)

    # no lost files
    if not ds_files and not options.force:
        return True, "No lost files. Use --force to ignore this check"

    # reset file status
    s = False
    for tmpDS in ds_files:
        files = ds_files[tmpDS]
        if dn:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI(dn, False, tmpDS, files,
                                                                              options.reproduceParent,
                                                                              options.dryRun)
        else:
            ts, jediTaskID, lostInputFiles = taskBuffer.resetFileStatusInJEDI('', True, tmpDS, files,
                                                                              options.reproduceParent,
                                                                              options.dryRun)
        msgStr = 'reset file status for {} in the DB: done with {} for jediTaskID={}'.format(tmpDS, ts, jediTaskID)
        if log_stream:
            log_stream.info(msgStr)
        else:
            print(msgStr)
        s |= ts

    # recover parent
    if options.reproduceParent:
        # reproduce input
        for lostDS in lostInputFiles:
            com_args = ['--ds', lostDS, '--noChildRetry', '--resurrectDS']
            if options.dryRun:
                com_args.append('--dryRun')
            com_args += ['--files', ','.join(lostInputFiles[lostDS])]
            main(taskBuffer=taskBuffer, log_stream=log_stream, args_list=com_args)

    # go ahead
    if options.dryRun:
        return True, 'Done in the dry-run mode with {}'.format(s)
    if s or options.force:
        if options.resurrectDS:
            sd, so = taskBuffer.querySQLS(
                'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)',
                {':id': jediTaskID, ':t1': 'output', ':t2': 'log'})
            rc = RucioClient()
            for datasetName, in so:
                for i in range(3):
                    try:
                        scope, name = rucioAPI.extract_scope(datasetName)
                        rc.get_did(scope, name)
                        break
                    except DataIdentifierNotFound:
                        print('resurrect {0}'.format(datasetName))
                        rc.resurrect([{'scope': scope, 'name': name}])
                        try:
                            rc.set_metadata(scope, name, 'lifetime', None)
                        except Exception:
                            pass
        if not options.reproduceParent:
            msgStr = Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1]
        else:
            msgStr = Client.reloadInput(jediTaskID)[-1][-1]
        if log_stream:
            log_stream.info("Retried task with {}".format(msgStr))
            log_stream.info("Done")
        else:
            print("Retried task: done with {}".format(msgStr))
        return True, msgStr
    else:
        msgStr = 'failed'
        if log_stream:
            log_stream.error(msgStr)
        else:
            print(msgStr)
        return False, msgStr
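# Hedged usage sketch (not in the original script): when a TaskBuffer is
# already available, main() can be driven programmatically via args_list,
# mirroring the recursive call used above for --reproduceParent. It returns
# a (status, message) tuple. `my_tbuf` and the dataset name are hypothetical:
#
#     ok, msg = main(taskBuffer=my_tbuf,
#                    args_list=['--ds', 'mc16:mc16.some.dataset', '--dryRun'])
#     print(ok, msg)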
def main(tbuf=None, **kwargs):
    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # pid
    my_pid = os.getpid()
    my_full_pid = '{0}-{1}-{2}'.format(socket.getfqdn().split('.')[0], os.getpgrp(), my_pid)
    # go
    if DRY_RUN:
        # dry run: fetch regardless of lock and do not update DB
        fetcher = FetchData(taskBuffer)
        # loop over all fetch-data methods
        for metric_name, update_type, period in metric_list:
            main_logger.debug('(dry-run) start {metric_name}'.format(metric_name=metric_name))
            # fetch data
            the_method = getattr(fetcher, metric_name)
            fetched_data = the_method()
            if fetched_data is None:
                main_logger.warning('(dry-run) {metric_name} got no valid data'.format(metric_name=metric_name))
                continue
            main_logger.debug('(dry-run) done {metric_name}'.format(metric_name=metric_name))
    else:
        # real run, will update DB
        # instantiate
        mdb = MetricsDB(taskBuffer)
        fetcher = FetchData(taskBuffer)
        # loop over all fetch-data methods to run and update DB
        for metric_name, update_type, period in metric_list:
            # metric lock
            lock_component_name = 'pandaMetr.{0:.30}.{1:0x}'.format(
                metric_name, adler32(metric_name.encode('utf-8')))
            # try to get lock
            got_lock = taskBuffer.lockProcess_PANDA(component=lock_component_name,
                                                    pid=my_full_pid, time_limit=period)
            if got_lock:
                main_logger.debug('got lock of {metric_name}'.format(metric_name=metric_name))
            else:
                main_logger.debug('{metric_name} locked by other process; skipped...'.format(metric_name=metric_name))
                continue
            main_logger.debug('start {metric_name}'.format(metric_name=metric_name))
            # fetch data and update DB
            the_method = getattr(fetcher, metric_name)
            fetched_data = the_method()
            if fetched_data is None:
                main_logger.warning('{metric_name} got no valid data'.format(metric_name=metric_name))
                continue
            mdb.update(metric=metric_name, update_type=update_type, entity_dict=fetched_data)
            main_logger.debug('done {metric_name}'.format(metric_name=metric_name))
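# Illustration (added, not in the original script) of the lock-name scheme
# used above: the metric name is truncated to 30 characters, presumably to
# fit a bounded DB column, and suffixed with its adler32 checksum in hex so
# truncated names stay distinct. `metric_name` here is hypothetical.
from zlib import adler32

metric_name = 'example_metric_with_a_rather_long_name'
lock_component_name = 'pandaMetr.{0:.30}.{1:0x}'.format(
    metric_name, adler32(metric_name.encode('utf-8')))
print(lock_component_name)  # -> pandaMetr.<first 30 chars>.<hex checksum>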
def main(argv=tuple(), tbuf=None, **kwargs):
    try:
        long
    except NameError:
        long = int

    tmpLog = LogWrapper(_logger, None)
    tmpLog.debug("===================== start =====================")

    # current minute
    currentMinute = datetime.datetime.utcnow().minute

    # instantiate TB
    if tbuf is None:
        from pandaserver.taskbuffer.TaskBuffer import taskBuffer
        taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1)
    else:
        taskBuffer = tbuf
    # instantiate sitemapper
    aSiteMapper = SiteMapper(taskBuffer)

    # delete
    tmpLog.debug("Del session")
    status, retSel = taskBuffer.querySQLS("SELECT MAX(PandaID) FROM ATLAS_PANDA.jobsDefined4", {})
    if retSel is not None:
        try:
            maxID = retSel[0][0]
            tmpLog.debug("maxID : %s" % maxID)
            if maxID is not None:
                varMap = {}
                varMap[':maxID'] = maxID
                varMap[':jobStatus1'] = 'activated'
                varMap[':jobStatus2'] = 'waiting'
                varMap[':jobStatus3'] = 'failed'
                varMap[':jobStatus4'] = 'cancelled'
                status, retDel = taskBuffer.querySQLS(
                    "DELETE FROM ATLAS_PANDA.jobsDefined4 WHERE PandaID<:maxID AND jobStatus IN (:jobStatus1,:jobStatus2,:jobStatus3,:jobStatus4)",
                    varMap)
        except Exception:
            pass

    # count # of getJob/updateJob in dispatcher's log
    try:
        # don't update when logrotate is running
        timeNow = datetime.datetime.utcnow()
        logRotateTime = timeNow.replace(hour=3, minute=2, second=0, microsecond=0)
        if (timeNow > logRotateTime and (timeNow - logRotateTime) < datetime.timedelta(minutes=5)) or \
                (logRotateTime > timeNow and (logRotateTime - timeNow) < datetime.timedelta(minutes=5)):
            tmpLog.debug("skip pilotCounts session for logrotate")
        else:
            # log filename
            dispLogName = '%s/panda-PilotRequests.log' % panda_config.logdir
            # time limits
            timeLimit = datetime.datetime.utcnow() - datetime.timedelta(hours=3)
            timeLimitS = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
            # check if tgz is required
            com = 'head -1 %s' % dispLogName
            lostat, loout = commands_get_status_output(com)
            useLogTgz = True
            if lostat == 0:
                match = re.search(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}', loout)
                if match is not None:
                    startTime = datetime.datetime(*time.strptime(match.group(0), '%Y-%m-%d %H:%M:%S')[:6])
                    # current log contains all info
                    if startTime < timeLimit:
                        useLogTgz = False
            # log files
            dispLogNameList = [dispLogName]
            if useLogTgz:
                today = datetime.date.today()
                dispLogNameList.append('{0}-{1}.gz'.format(dispLogName, today.strftime('%Y%m%d')))
            # delete tmp
            commands_get_status_output('rm -f %s.tmp-*' % dispLogName)
            # tmp name
            tmpLogName = '%s.tmp-%s' % (dispLogName, datetime.datetime.utcnow().strftime('%Y-%m-%d-%H-%M-%S'))
            # loop over all files
            pilotCounts = {}
            pilotCountsS = {}
            for tmpDispLogName in dispLogNameList:
                # expand or copy
                if tmpDispLogName.endswith('.gz'):
                    com = 'gunzip -c %s > %s' % (tmpDispLogName, tmpLogName)
                else:
                    com = 'cp %s %s' % (tmpDispLogName, tmpLogName)
                lostat, loout = commands_get_status_output(com)
                if lostat != 0:
                    errMsg = 'failed to expand/copy %s with : %s' % (tmpDispLogName, loout)
                    raise RuntimeError(errMsg)
                # search string
                sStr = r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}).*'
                sStr += r'method=(.+),site=(.+),node=(.+),type=(.+)'
                # read
                logFH = open(tmpLogName)
                for line in logFH:
                    # check format
                    match = re.search(sStr, line)
                    if match is not None:
                        # check time range
                        timeStamp = datetime.datetime(*time.strptime(match.group(1), '%Y-%m-%d %H:%M:%S')[:6])
                        if timeStamp < timeLimit:
                            continue
                        tmpMethod = match.group(2)
                        tmpSite = match.group(3)
                        tmpNode = match.group(4)
                        tmpType = match.group(5)
                        # protection against corrupted entries from pilot,
                        # e.g. pilot reading site json from cvmfs while it was being updated
                        if tmpSite not in aSiteMapper.siteSpecList:
                            continue
                        # sum
                        pilotCounts.setdefault(tmpSite, {})
                        pilotCounts[tmpSite].setdefault(tmpMethod, {})
                        pilotCounts[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                        pilotCounts[tmpSite][tmpMethod][tmpNode] += 1
                        # short
                        if timeStamp > timeLimitS:
                            pilotCountsS.setdefault(tmpSite, dict())
                            pilotCountsS[tmpSite].setdefault(tmpMethod, dict())
                            pilotCountsS[tmpSite][tmpMethod].setdefault(tmpNode, 0)
                            pilotCountsS[tmpSite][tmpMethod][tmpNode] += 1
                # close
                logFH.close()
            # delete tmp
            commands_get_status_output('rm %s' % tmpLogName)
            # update
            hostID = panda_config.pserverhost.split('.')[0]
            tmpLog.debug("pilotCounts session")
            retPC = taskBuffer.updateSiteData(hostID, pilotCounts, interval=3)
            tmpLog.debug(retPC)
            retPC = taskBuffer.updateSiteData(hostID, pilotCountsS, interval=1)
            tmpLog.debug(retPC)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("updateJob/getJob : %s %s" % (errType, errValue))

    # nRunning
    tmpLog.debug("nRunning session")
    try:
        # integer division to keep the original (Python 2) semantics
        if (currentMinute // panda_config.nrun_interval) % panda_config.nrun_hosts == panda_config.nrun_snum:
            retNR = taskBuffer.insertnRunningInSiteData()
            tmpLog.debug(retNR)
    except Exception:
        errType, errValue = sys.exc_info()[:2]
        tmpLog.error("nRunning : %s %s" % (errType, errValue))

    # session for co-jumbo jobs
    tmpLog.debug("co-jumbo session")
    try:
        ret = taskBuffer.getCoJumboJobsToBeFinished(30, 0, 1000)
        if ret is None:
            tmpLog.debug("failed to get co-jumbo jobs to finish")
        else:
            coJumboA, coJumboD, coJumboW, coJumboTokill = ret
            tmpLog.debug("finish {0} co-jumbo jobs in Active".format(len(coJumboA)))
            if len(coJumboA) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboA, fromDefined=False, fromActive=True,
                                               fromArchived=False, fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False)
            tmpLog.debug("finish {0} co-jumbo jobs in Defined".format(len(coJumboD)))
            if len(coJumboD) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboD, fromDefined=True, fromActive=False,
                                               fromArchived=False, fromWaiting=False)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], True)
            tmpLog.debug("finish {0} co-jumbo jobs in Waiting".format(len(coJumboW)))
            if len(coJumboW) > 0:
                jobSpecs = taskBuffer.peekJobs(coJumboW, fromDefined=False, fromActive=False,
                                               fromArchived=False, fromWaiting=True)
                for jobSpec in jobSpecs:
                    fileCheckInJEDI = taskBuffer.checkInputFileStatusInJEDI(jobSpec)
                    if not fileCheckInJEDI:
                        jobSpec.jobStatus = 'closed'
                        jobSpec.jobSubStatus = 'cojumbo_wrong'
                        jobSpec.taskBufferErrorCode = pandaserver.taskbuffer.ErrorCode.EC_EventServiceInconsistentIn
                    taskBuffer.archiveJobs([jobSpec], False, True)
            tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format(len(coJumboTokill)))
            if len(coJumboTokill) > 0:
                jediJobs = list(coJumboTokill)
                nJob = 100
                iJob = 0
                while iJob < len(jediJobs):
                    tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob]))
                    Client.killJobs(jediJobs[iJob:iJob + nJob], 51, keepUnmerged=True)
                    iJob += nJob
    except Exception:
        errStr = traceback.format_exc()
        tmpLog.error(errStr)

    tmpLog.debug("Fork session")

    # thread for fork
    class ForkThr(threading.Thread):
        def __init__(self, fileName):
            threading.Thread.__init__(self)
            self.fileName = fileName

        def run(self):
            if 'VIRTUAL_ENV' in os.environ:
                prefix = os.environ['VIRTUAL_ENV']
            else:
                prefix = ''
            setupStr = 'source {0}/etc/sysconfig/panda_server; '.format(prefix)
            runStr = '%s/python -Wignore ' % panda_config.native_python
            runStr += panda_config.pandaPython_dir + '/dataservice/forkSetupper.py -i '
            runStr += self.fileName
            if self.fileName.split('/')[-1].startswith('set.NULL.'):
                runStr += ' -t'
            comStr = setupStr + runStr
            tmpLog.debug(comStr)
            commands_get_status_output(comStr)

    # get set.* files
    filePatt = panda_config.logdir + '/' + 'set.*'
    fileList = glob.glob(filePatt)

    # the max number of threads
    maxThr = 10
    nThr = 0

    # loop over all files
    forkThrList = []
    timeNow = datetime.datetime.utcnow()
    for tmpName in fileList:
        if not os.path.exists(tmpName):
            continue
        try:
            # only take care of recent files
            modTime = datetime.datetime(*(time.gmtime(os.path.getmtime(tmpName))[:7]))
            if (timeNow - modTime) > datetime.timedelta(minutes=1) and \
                    (timeNow - modTime) < datetime.timedelta(hours=1):
                cSt, cOut = commands_get_status_output('ps aux | grep fork | grep -v PYTH')
                # if no process is running for the file
                if cSt == 0 and tmpName not in cOut:
                    nThr += 1
                    thr = ForkThr(tmpName)
                    thr.start()
                    forkThrList.append(thr)
                    if nThr > maxThr:
                        break
        except Exception:
            errType, errValue = sys.exc_info()[:2]
            tmpLog.error("%s %s" % (errType, errValue))

    # join fork threads
    for thr in forkThrList:
        thr.join()

    # terminate TaskBuffer IF
    # taskBufferIF.terminate()

    tmpLog.debug("===================== end =====================")