def post_processing(self, workspec, jobspec_list, map_type):
    '''
    Take the jobReport placed by aCT in the access point and fill
    metadata attributes of the workspec.
    '''
    # get logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='post_processing')
    if not workspec.workAttributes:
        workspec.workAttributes = {}
    for pandaID in workspec.pandaid_list:
        workspec.workAttributes[pandaID] = {}
        # look for job report
        accessPoint = self.get_access_point(workspec, pandaID)
        jsonFilePath = os.path.join(accessPoint, jsonJobReport)
        tmpLog.debug('looking for job report file {0}'.format(jsonFilePath))
        if not os.path.exists(jsonFilePath):
            # not found
            tmpLog.debug('not found')
        else:
            try:
                with open(jsonFilePath) as jsonFile:
                    workspec.workAttributes[pandaID] = json.load(jsonFile)
                tmpLog.debug('got {0} kB of job report'.format(os.stat(jsonFilePath).st_size / 1024))
            except Exception:
                tmpLog.debug('failed to load {0}'.format(jsonFilePath))
        tmpLog.debug("pilot info for {0}: {1}".format(pandaID, workspec.workAttributes[pandaID]))
    return True
def __init__(self, **kwarg):
    PluginBase.__init__(self, **kwarg)
    # Set up aCT DB connection
    self.log = core_utils.make_logger(baseLogger, 'aCT submitter', method_name='__init__')
    self.actDB = aCTDBPanda(self.log)
    # Credential dictionary role: proxy file
    self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)],
                          list(harvester_config.credmanager.outCertFile)))
    # Map of role to aCT proxyid
    self.proxymap = {}

    # Get proxy info
    # TODO: better to send aCT the proxy file and let it handle it
    for role, proxy in self.certs.items():
        cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)
        uc = arc.UserConfig(cred_type)
        uc.ProxyPath(str(proxy))
        cred = arc.Credential(uc)
        dn = cred.GetIdentityName()
        self.log.info("Proxy {0} with DN {1} and role {2}".format(proxy, dn, role))

        actp = aCTProxy(self.log)
        attr = '/atlas/Role=' + role
        proxyid = actp.getProxyId(dn, attr)
        if not proxyid:
            raise Exception("Proxy with DN {0} and attribute {1} was not found in proxies table".format(dn, attr))

        self.proxymap[role] = proxyid
def is_alive(self, workspec, time_limit):
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='is_alive')
    # json file
    jsonFilePath = os.path.join(workspec.get_access_point(), heartbeatFile)
    tmpLog.debug('looking for heartbeat file {0}'.format(jsonFilePath))
    if not os.path.exists(jsonFilePath):
        # no heartbeat file was found
        tmpLog.debug('startTime: {0}, now: {1}'.format(workspec.startTime, datetime.datetime.utcnow()))
        if not workspec.startTime:
            # the worker didn't even have time to start
            tmpLog.debug('heartbeat not found, but no startTime yet for worker')
            return True
        elif datetime.datetime.utcnow() - workspec.startTime < datetime.timedelta(minutes=time_limit):
            # the worker is too young and maybe didn't have time to generate the heartbeat
            tmpLog.debug('heartbeat not found, but worker too young')
            return True
        else:
            # the worker is old and the heartbeat should be expected
            tmpLog.debug('not found')
            return None
    try:
        mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(jsonFilePath))
        tmpLog.debug('last modification time : {0}'.format(mtime))
        if datetime.datetime.utcnow() - mtime > datetime.timedelta(minutes=time_limit):
            tmpLog.debug('too old')
            return False
        tmpLog.debug('OK')
        return True
    except Exception:
        tmpLog.debug('failed to get mtime')
        return None
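# A minimal sketch (not part of the original module) of the other side of the heartbeat
# protocol checked by is_alive() above: a worker-side process periodically rewrites the
# heartbeat file so its mtime stays within the time limit. The file name
# "heartbeat.json" is an assumption; the real name comes from the module-level
# heartbeatFile constant.
import json
import os
import time


def touch_heartbeat(access_point, heartbeat_file='heartbeat.json'):
    """Rewrite the heartbeat file so its modification time is refreshed."""
    path = os.path.join(access_point, heartbeat_file)
    with open(path, 'w') as f:
        json.dump({'lastUpdate': time.time()}, f)


# Example usage from a worker wrapper, refreshing every few minutes while the payload runs:
# touch_heartbeat('/path/to/worker/access_point')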
def renew_session(self, retry=3, init=False):
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorClient.renew_session')
    # Clear security session if not initialization
    if not init:
        tmpLog.info('Renew condor session')
        self.secman.invalidateAllSessions()
    # Recreate collector and schedd object
    i_try = 1
    while i_try <= retry:
        try:
            tmpLog.info('Try {0}'.format(i_try))
            if self.condor_pool:
                self.collector = htcondor.Collector(self.condor_pool)
            else:
                self.collector = htcondor.Collector()
            if self.condor_schedd:
                self.scheddAd = self.collector.locate(htcondor.DaemonTypes.Schedd, self.condor_schedd)
            else:
                self.scheddAd = self.collector.locate(htcondor.DaemonTypes.Schedd)
            self.schedd = htcondor.Schedd(self.scheddAd)
            tmpLog.info('Success')
            break
        except Exception as e:
            tmpLog.warning('Recreate condor collector and schedd failed: {0}'.format(e))
            if i_try < retry:
                tmpLog.warning('Failed. Retry...')
            else:
                tmpLog.warning('Retry {0} times. Still failed. Skipped'.format(i_try))
            i_try += 1
            self.secman.invalidateAllSessions()
            time.sleep(3)
    # Sleep
    time.sleep(3)
def submit_with_python(self, jdl_list, use_spool=False):
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorJobSubmit.submit_with_python')
    # Start
    tmpLog.debug('Start')
    # Initialize
    errStr = ''
    batchIDs_list = []
    # Make list of jdl map with dummy submit objects
    jdl_map_list = [dict(htcondor.Submit(jdl).items()) for jdl in jdl_list]
    # Go
    submit_obj = htcondor.Submit()
    try:
        with self.schedd.transaction() as txn:
            # TODO: Currently spool is not supported in htcondor.Submit ...
            submit_result = submit_obj.queue_with_itemdata(txn, 1, iter(jdl_map_list))
            clusterid = submit_result.cluster()
            first_proc = submit_result.first_proc()
            num_proc = submit_result.num_procs()
            batchIDs_list.extend(['{0}.{1}'.format(clusterid, procid)
                                  for procid in range(first_proc, first_proc + num_proc)])
    except RuntimeError as e:
        errStr = '{0}: {1}'.format(e.__class__.__name__, e)
        tmpLog.error('submission failed: {0}'.format(errStr))
        raise
    if batchIDs_list:
        n_jobs = len(batchIDs_list)
        tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list)))
    elif not errStr:
        tmpLog.error('submitted nothing')
    tmpLog.debug('Done')
    # Return
    return (batchIDs_list, errStr)
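# Illustrative only: the kind of submit description an entry in jdl_list could contain.
# The attribute values (universe, paths) are placeholders, not taken from the source;
# the real JDLs are generated elsewhere from site templates.
example_jdl = '\n'.join([
    'universe = vanilla',
    'executable = /path/to/pilot_wrapper.sh',
    'output = worker.$(ClusterId).$(ProcId).out',
    'error = worker.$(ClusterId).$(ProcId).err',
    'log = worker.$(ClusterId).$(ProcId).log',
    'queue 1',
])
# batchIDs_list, errStr = condor_job_submit.submit_with_python([example_jdl])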
def feed_events(self, workspec, events_dict):
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='feed_events')
    retVal = True
    if workspec.mapType in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers]:
        # put the json just under the access point
        jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName)
        tmpLog.debug('feeding events to {0}'.format(jsonFilePath))
        try:
            with open(jsonFilePath, 'w') as jsonFile:
                json.dump(events_dict, jsonFile)
        except Exception:
            core_utils.dump_error_message(tmpLog)
            retVal = False
    elif workspec.mapType == WorkSpec.MT_MultiJobs:
        # TOBEFIXED
        pass
    # remove request file
    try:
        jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsRequestFileName)
        os.remove(jsonFilePath)
    except Exception:
        pass
    tmpLog.debug('done')
    return retVal
def kill_worker(self, workspec):
    """
    Mark the aCT job as tobekilled.

    :param workspec: worker specification
    :type workspec: WorkSpec
    :return: A tuple of return code (True for success, False otherwise) and error dialog
    :rtype: (bool, string)
    """
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='kill_worker')
    if workspec.batchID is None:
        tmpLog.info('workerID={0} has no batch ID, so assume it was not submitted - skipped'.format(
            workspec.workerID))
        return True, ''
    try:
        # Only kill jobs which are still active
        self.actDB.updateJobs("id={0} AND actpandastatus IN ('sent', 'starting', 'running')".format(workspec.batchID),
                              {'actpandastatus': 'tobekilled', 'pandastatus': None})
    except Exception as e:
        tmpLog.error('Failed to cancel job {0} in aCT: {1}'.format(workspec.batchID, str(e)))
        return False, str(e)
    tmpLog.info('Job {0} cancelled in aCT'.format(workspec.batchID))
    return True, ''
def wrapper(self, *args, **kwargs):
    if self.is_connected:
        return func(self, *args, **kwargs)
    else:
        tmpLog = core_utils.make_logger(_logger, method_name=func.__name__)
        tmpLog.warning('instance not alive; method {0} returns None'.format(func.__name__))
        return None
def __init__(self, submissionHost, *args, **kwargs):
    self.submissionHost = submissionHost
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorClient.__init__')
    # Initialize
    tmpLog.debug('Initializing client')
    self.lock = threading.Lock()
    self.condor_api = CONDOR_API
    self.condor_schedd = None
    self.condor_pool = None
    # Parse condor command remote options from workspec
    if self.submissionHost in ('LOCAL', 'None'):
        tmpLog.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(self.submissionHost))
    else:
        try:
            self.condor_schedd, self.condor_pool = self.submissionHost.split(',')[0:2]
        except ValueError:
            tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(self.submissionHost))
    # Use Python API or fall back to command
    if self.condor_api == 'python':
        try:
            self.secman = htcondor.SecMan()
            self.renew_session(init=True)
        except Exception as e:
            tmpLog.error('Error when using htcondor Python API. Exception {0}: {1}'.format(e.__class__.__name__, e))
            raise
    tmpLog.debug('Initialized client')
def _propagate_attributes(workspec, tmpVal):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='_propagate_attributes')
    (retVal, tmpDict) = tmpVal
    workspec.set_attributes_with_dict(tmpDict)
    tmpLog.debug('Done workspec attributes propagation')
    return retVal
def __init__(self, *args, **kwargs):
    self.submissionHost = str(kwargs.get('id'))
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger,
                                    'submissionHost={0} thrid={1} oid={2}'.format(self.submissionHost,
                                                                                  get_ident(), id(self)),
                                    method_name='CondorJobManage.__init__')
    # Initialize
    tmpLog.debug('Start')
    self.lock = threading.Lock()
    CondorClient.__init__(self, self.submissionHost, *args, **kwargs)
    tmpLog.debug('Initialize done')
def submit_workers(self, workspec_list):
    retList = []
    for workSpec in workspec_list:
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                        method_name='submit_workers')

        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
        prodSourceLabel = queueconfig.get_source_label()

        # If jobSpec is defined we are in push mode, if not pull mode
        # Both assume one to one worker to job mapping
        jobSpec = workSpec.get_jobspec_list()
        if jobSpec:
            jobSpec = jobSpec[0]
            tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))

        desc = {}
        desc['pandastatus'] = 'sent'
        desc['actpandastatus'] = 'sent'
        desc['siteName'] = workSpec.computingSite
        desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel == 'user' else 'production']
        desc['sendhb'] = 0
        metadata = {'harvesteraccesspoint': workSpec.get_access_point(),
                    'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)}
        desc['metadata'] = json.dumps(metadata)

        if jobSpec:
            # push mode: aCT takes the url-encoded job description (like it gets from panda server)
            pandaid = jobSpec.PandaID
            actjobdesc = urllib.urlencode(jobSpec.jobParams)
        else:
            # pull mode: just set pandaid (to workerid) and prodsourcelabel
            pandaid = workSpec.workerID
            actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, prodSourceLabel)

        tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
        try:
            batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()']
        except Exception as e:
            result = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
        else:
            tmpLog.info("aCT batch id {0}".format(batchid))
            workSpec.batchID = str(batchid)
            # Set log files in workSpec
            today = time.strftime('%Y-%m-%d', time.gmtime())
            logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today,
                               workSpec.computingSite, str(pandaid)])
            workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
            workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
            workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
            result = (True, '')
        retList.append(result)

    return retList
def _make_init_script(workspec, template_str):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='_make_init_script')
    # make init tempfile
    tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_init.sh',
                                          dir=workspec.get_access_point())
    new_template_str = _init_script_replace(template_str, **workspec.__dict__)
    tmpFile.write(new_template_str)
    tmpFile.close()
    tmpLog.debug('done')
    return tmpFile.name
def kill_requested(self, workspec):
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='kill_requested')
    tmpLog.debug('start')
    try:
        ret = self.conn.root.kill_requested(self.original_config, workspec)
    except Exception:
        core_utils.dump_error_message(tmpLog)
        ret = None
    else:
        tmpLog.debug('done')
    return ret
def feed_jobs(self, workspec, jobspec_list):
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='feed_jobs')
    retVal = True
    # get PFC
    pfc = core_utils.make_pool_file_catalog(jobspec_list)
    pandaIDs = []
    for jobSpec in jobspec_list:
        accessPoint = self.get_access_point(workspec, jobSpec.PandaID)
        jobSpecFilePath = os.path.join(accessPoint, jobSpecFileName)
        xmlFilePath = os.path.join(accessPoint, xmlPoolCatalogFileName)
        tmpLog.debug('feeding jobs to {0}'.format(jobSpecFilePath))
        try:
            # put job spec file
            with open(jobSpecFilePath, 'w') as jobSpecFile:
                jobParams = jobSpec.get_job_params(self.stripJobParams)
                if self.jobSpecFileFormat == 'cgi':
                    jobSpecFile.write(urlencode(jobParams))
                else:
                    json.dump({jobSpec.PandaID: jobParams}, jobSpecFile)
            # put PFC.xml
            with open(xmlFilePath, 'w') as pfcFile:
                pfcFile.write(pfc)
            # make symlink
            inFiles = jobSpec.get_input_file_attributes()
            for inLFN, inFile in iteritems(inFiles):
                dstPath = os.path.join(accessPoint, inLFN)
                if 'path' in inFile and inFile['path'] != dstPath:
                    # test if symlink exists; if so, remove it
                    if os.path.exists(dstPath):
                        os.unlink(dstPath)
                        tmpLog.debug("removing existing symlink %s" % dstPath)
                    os.symlink(inFile['path'], dstPath)
            pandaIDs.append(jobSpec.PandaID)
        except Exception:
            core_utils.dump_error_message(tmpLog)
            retVal = False
    # put PandaIDs file
    try:
        jsonFilePath = os.path.join(workspec.get_access_point(), pandaIDsFile)
        with open(jsonFilePath, 'w') as jsonPandaIDsFile:
            json.dump(pandaIDs, jsonPandaIDsFile)
    except Exception:
        core_utils.dump_error_message(tmpLog)
        retVal = False
    # remove request file
    try:
        reqFilePath = os.path.join(workspec.get_access_point(), jsonJobRequestFileName)
        os.remove(reqFilePath)
    except Exception:
        pass
    tmpLog.debug('done')
    return retVal
def acknowledge_events_files(self, workspec):
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='acknowledge_events_files')
    tmpLog.debug('start')
    try:
        ret = self.conn.root.acknowledge_events_files(self.original_config, workspec)
    except Exception:
        core_utils.dump_error_message(tmpLog)
        ret = None
    else:
        tmpLog.debug('done')
    return ret
def submit_with_command(self, jdl_list, use_spool=False, tmp_str='', keep_temp_sdf=False):
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorJobSubmit.submit_with_command')
    # Initialize
    errStr = ''
    batchIDs_list = []
    # make sdf temp file from jdls
    tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=(not keep_temp_sdf),
                                          suffix='_{0}_cluster_submit.sdf'.format(tmp_str))
    sdf_file = tmpFile.name
    tmpFile.write('\n\n'.join(jdl_list))
    tmpFile.flush()
    # make condor remote options
    name_opt = '-name {0}'.format(self.condor_schedd) if self.condor_schedd else ''
    pool_opt = '-pool {0}'.format(self.condor_pool) if self.condor_pool else ''
    spool_opt = '-remote -spool' if use_spool and self.condor_schedd else ''
    # command
    comStr = 'condor_submit -single-cluster {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format(
        sdf_file=sdf_file, name_opt=name_opt, pool_opt=pool_opt, spool_opt=spool_opt)
    # submit
    tmpLog.debug('submit with command: {0}'.format(comStr))
    try:
        p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # check return code
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception as e:
        stdOut = ''
        stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
        retCode = 1
        errStr = '{0}: {1}'.format(e.__class__.__name__, e)
    finally:
        tmpFile.close()
    tmpLog.debug('retCode={0}'.format(retCode))
    if retCode == 0:
        # extract clusterid and n_jobs
        job_id_match = None
        for tmp_line_str in stdOut.split('\n'):
            job_id_match = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', tmp_line_str)
            if job_id_match:
                break
        if job_id_match is not None:
            n_jobs = int(job_id_match.group(1))
            clusterid = job_id_match.group(2)
            batchIDs_list = ['{0}.{1}'.format(clusterid, procid) for procid in range(n_jobs)]
            tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list)))
        else:
            errStr = 'no job submitted: {0}'.format(errStr)
            tmpLog.error(errStr)
    else:
        tmpLog.error('submission failed: {0} ; {1}'.format(stdErr, errStr))
    # Return
    return (batchIDs_list, errStr)
def post_processing(self, workspec, jobspec_list, map_type):
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='post_processing')
    tmpLog.debug('start')
    try:
        ret = self.conn.root.post_processing(self.original_config, workspec, jobspec_list, map_type)
    except Exception:
        core_utils.dump_error_message(tmpLog)
        ret = None
    else:
        tmpLog.debug('done')
    return ret
def check_workers(self, workspec_list):
    tmpLog = core_utils.make_logger(_logger, method_name='check_workers')
    tmpLog.debug('start')
    try:
        ret = self.conn.root.check_workers(self.original_config, workspec_list)
    except Exception:
        core_utils.dump_error_message(tmpLog)
        ret = None
    else:
        tmpLog.debug('done')
    return ret
def is_alive(self, workspec, worker_heartbeat_limit):
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='is_alive')
    tmpLog.debug('start')
    try:
        ret = self.conn.root.is_alive(self.original_config, workspec, worker_heartbeat_limit)
    except Exception:
        core_utils.dump_error_message(tmpLog)
        ret = None
    else:
        tmpLog.debug('done')
    return ret
def _get_connection(self):
    tmpLog = core_utils.make_logger(_logger, method_name='_get_connection')
    tmpLog.debug('start')
    sshTunnelPool.make_tunnel_server(self.remoteHost, self.remotePort, self.remoteBindPort, self.numTunnels,
                                     ssh_username=self.sshUserName, ssh_password=self.sshPassword,
                                     private_key=self.privateKey, pass_phrase=self.passPhrase,
                                     jump_host=self.jumpHost, jump_port=self.jumpPort)
    tunnelHost, tunnelPort, tunnelCore = sshTunnelPool.get_tunnel(self.remoteHost, self.remotePort)
    self.conn = rpyc.connect(tunnelHost, tunnelPort,
                             config={"allow_all_attrs": True,
                                     "allow_setattr": True,
                                     "allow_delattr": True})
    tmpLog.debug('connected successfully to {0}:{1}'.format(tunnelHost, tunnelPort))
def kill_requested(self, workspec):
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='kill_requested')
    # look for the json just under the access point
    jsonFilePath = os.path.join(workspec.get_access_point(), killWorkerFile)
    tmpLog.debug('looking for kill request file {0}'.format(jsonFilePath))
    if not os.path.exists(jsonFilePath):
        # not found
        tmpLog.debug('not found')
        return False
    tmpLog.debug('kill requested')
    return True
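# A minimal sketch (not part of the original module) of the other side of this protocol:
# whichever component wants the worker killed simply creates the kill request file under
# the worker's access point, and kill_requested() above reacts to its existence.
# The file name "kill_worker.json" is an assumption; the real name comes from the
# module-level killWorkerFile constant.
import os


def request_kill(access_point, kill_file='kill_worker.json'):
    """Create an empty kill request file; its mere existence signals the kill."""
    open(os.path.join(access_point, kill_file), 'w').close()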
def remove_with_command(self, batchIDs_list=[]):
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorJobManage.remove_with_command')
    # if workspec.batchID is None:
    #     tmpLog.info('Found workerID={0} has submissionHost={1} batchID={2} . Cannot kill. Skipped '.format(
    #         workspec.workerID, workspec.submissionHost, workspec.batchID))
    #     ret_list.append((True, ''))
    #
    # ## Parse condor remote options
    # name_opt, pool_opt = '', ''
    # if workspec.submissionHost is None or workspec.submissionHost == 'LOCAL':
    #     pass
    # else:
    #     try:
    #         condor_schedd, condor_pool = workspec.submissionHost.split(',')[0:2]
    #     except ValueError:
    #         errStr = 'Invalid submissionHost: {0} . Skipped'.format(workspec.submissionHost)
    #         tmpLog.error(errStr)
    #         ret_list.append((False, errStr))
    #     name_opt = '-name {0}'.format(condor_schedd) if condor_schedd else ''
    #     pool_opt = '-pool {0}'.format(condor_pool) if condor_pool else ''
    #
    # ## Kill command
    # comStr = 'condor_rm {name_opt} {pool_opt} {batchID}'.format(name_opt=name_opt,
    #                                                             pool_opt=pool_opt,
    #                                                             batchID=workspec.batchID)
    # (retCode, stdOut, stdErr) = _runShell(comStr)
    # if retCode != 0:
    #     comStr = 'condor_q -l {name_opt} {pool_opt} {batchID}'.format(name_opt=name_opt,
    #                                                                   pool_opt=pool_opt,
    #                                                                   batchID=workspec.batchID)
    #     (retCode, stdOut, stdErr) = _runShell(comStr)
    #     if ('ClusterId = {0}'.format(workspec.batchID) in str(stdOut)
    #             and 'JobStatus = 3' not in str(stdOut)) or retCode != 0:
    #         ## Force to cancel if batch job not terminated first time
    #         comStr = 'condor_rm -forcex {name_opt} {pool_opt} {batchID}'.format(name_opt=name_opt,
    #                                                                             pool_opt=pool_opt,
    #                                                                             batchID=workspec.batchID)
    #         (retCode, stdOut, stdErr) = _runShell(comStr)
    #         if retCode != 0:
    #             ## Command failed to kill
    #             errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(comStr, retCode, stdOut, stdErr)
    #             tmpLog.error(errStr)
    #             ret_list.append((False, errStr))
    #     ## Found already killed
    #     tmpLog.info('Found workerID={0} submissionHost={1} batchID={2} already killed'.format(
    #         workspec.workerID, workspec.submissionHost, workspec.batchID))
    # else:
    #     tmpLog.info('Succeeded to kill workerID={0} submissionHost={1} batchID={2}'.format(
    #         workspec.workerID, workspec.submissionHost, workspec.batchID))
    raise NotImplementedError
def check_workers(self, workspec_list):
    retList = []
    for workSpec in workspec_list:
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                        method_name='check_workers')
        try:
            tmpLog.debug('Querying aCT for id {0}'.format(workSpec.batchID))
            columns = ['actpandastatus', 'pandastatus', 'computingElement', 'node']
            actjobs = self.actDB.getJobs("id={0}".format(workSpec.batchID), columns)
        except Exception as e:
            tmpLog.error("Failed to query aCT DB: {0}".format(str(e)))
            # send back current status
            retList.append((workSpec.status, ''))
            continue

        if not actjobs:
            tmpLog.error("Job with id {0} not found in aCT".format(workSpec.batchID))
            # send back current status
            retList.append((WorkSpec.ST_failed, "Job not found in aCT"))
            continue

        actstatus = actjobs[0]['actpandastatus']
        newStatus = WorkSpec.ST_running
        if actstatus in ['sent', 'starting']:
            newStatus = WorkSpec.ST_submitted
        elif actstatus == 'done':
            newStatus = self.check_pilot_status(workSpec, tmpLog)
        elif actstatus == 'donefailed':
            newStatus = WorkSpec.ST_failed
        elif actstatus == 'donecancelled':
            newStatus = WorkSpec.ST_cancelled

        if newStatus != workSpec.status:
            tmpLog.info('ID {0} updated status {1} -> {2} ({3})'.format(
                workSpec.batchID, workSpec.status, newStatus, actstatus))
        else:
            tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(actstatus, newStatus))

        if actjobs[0]['computingElement']:
            workSpec.computingElement = actjobs[0]['computingElement']
        if actjobs[0]['node']:
            try:
                pandaid = workSpec.get_jobspec_list()[0].PandaID
                workSpec.set_work_attributes({pandaid: {'node': actjobs[0]['node']}})
            except Exception:
                tmpLog.warning('Could not extract panda ID for worker {0}'.format(workSpec.batchID))

        retList.append((newStatus, ''))

    return True, retList
def events_to_update(self, workspec):
    # get logger
    tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='events_to_update')
    # look for the json just under the access point
    retDict = dict()
    for pandaID in workspec.pandaid_list:
        # look for the json just under the access point
        accessPoint = self.get_access_point(workspec, pandaID)
        jsonFilePath = os.path.join(accessPoint, jsonEventsUpdateFileName)
        readJsonPath = jsonFilePath + suffixReadJson
        # first look for json.read which is not yet acknowledged
        tmpLog.debug('looking for event update file {0}'.format(readJsonPath))
        if os.path.exists(readJsonPath):
            pass
        else:
            tmpLog.debug('looking for event update file {0}'.format(jsonFilePath))
            if not os.path.exists(jsonFilePath):
                # not found
                tmpLog.debug('not found')
                continue
            try:
                # rename to prevent from being overwritten
                os.rename(jsonFilePath, readJsonPath)
            except Exception:
                tmpLog.error('failed to rename json')
                continue
        # load json
        nData = 0
        try:
            with open(readJsonPath) as jsonFile:
                tmpOrigDict = json.load(jsonFile)
                newDict = dict()
                # change the key from str to int
                for tmpPandaID, tmpDict in iteritems(tmpOrigDict):
                    tmpPandaID = long(tmpPandaID)
                    retDict[tmpPandaID] = tmpDict
                    nData += len(tmpDict)
        except Exception:
            tmpLog.error('failed to load json')
        # delete empty file
        if nData == 0:
            try:
                os.remove(readJsonPath)
            except Exception:
                pass
        tmpLog.debug('got {0} events for PandaID={1}'.format(nData, pandaID))
    return retDict
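# Illustrative only: the shape of the event-update JSON that events_to_update() above
# expects, i.e. a map of PandaID (string key, converted to an integer type) to a list
# of event updates. The field names inside each event dict are assumptions based on
# typical event-service updates, not taken from this module.
example_events_update = {
    "1234567890": [
        {"eventRangeID": "1234567890-1-1-1", "eventStatus": "finished"},
        {"eventRangeID": "1234567890-1-2-1", "eventStatus": "failed"},
    ]
}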
def remove(self, batchIDs_list=[]):
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorJobManage.remove')
    # Remove
    tmpLog.debug('Start')
    if self.condor_api == 'python':
        try:
            retVal = self.remove_with_python(batchIDs_list)
        except Exception as e:
            tmpLog.error('Exception {0}: {1}'.format(e.__class__.__name__, e))
            raise
    else:
        retVal = self.remove_with_command(batchIDs_list)
    return retVal
def get_all(self, batchIDs_list=[], allJobs=False):
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorJobQuery.get_all')
    # Get all
    tmpLog.debug('Start')
    job_ads_all_dict = {}
    if self.condor_api == 'python':
        try:
            job_ads_all_dict = self.query_with_python(batchIDs_list, allJobs)
        except Exception as e:
            tmpLog.error('Exception {0}: {1}'.format(e.__class__.__name__, e))
            raise
    else:
        job_ads_all_dict = self.query_with_command(batchIDs_list)
    return job_ads_all_dict
def __init__(self, cacheEnable=False, cacheRefreshInterval=None, useCondorHistory=True, *args, **kwargs):
    self.submissionHost = str(kwargs.get('id'))
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger,
                                    'submissionHost={0} thrid={1} oid={2}'.format(self.submissionHost,
                                                                                  get_ident(), id(self)),
                                    method_name='CondorJobQuery.__init__')
    # Initialize
    with self.classLock:
        tmpLog.debug('Start')
        CondorClient.__init__(self, self.submissionHost, *args, **kwargs)
        # For condor_q cache
        self.cacheEnable = cacheEnable
        if self.cacheEnable:
            self.cache = ([], 0)
            self.cacheRefreshInterval = cacheRefreshInterval
        self.useCondorHistory = useCondorHistory
        tmpLog.debug('Initialize done')
def check_a_worker(workspec):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='check_a_worker')
    dummyFilePath = os.path.join(workspec.get_access_point(), 'status.txt')
    tmpLog.debug('look for {0}'.format(dummyFilePath))
    newStatus = WorkSpec.ST_finished
    try:
        with open(dummyFilePath) as dummyFile:
            newStatus = dummyFile.readline()
            newStatus = newStatus.strip()
    except Exception:
        pass
    tmpLog.debug('newStatus={0}'.format(newStatus))
    return (newStatus, '')
def remove_with_python(self, batchIDs_list=[]):
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost),
                                    method_name='CondorJobManage.remove_with_python')
    # Start
    tmpLog.debug('Start')
    # Acquire class lock
    with self.classLock:
        tmpLog.debug('Got class lock')
        # Initialize
        ret_list = []
        retMap = {}
        # Go
        n_jobs = len(batchIDs_list)
        act_ret = self.schedd.act(htcondor.JobAction.Remove, batchIDs_list)
        # Check if all jobs clear (off from schedd queue)
        is_all_clear = (n_jobs == act_ret['TotalAlreadyDone'] + act_ret['TotalNotFound'] + act_ret['TotalSuccess'])
        if act_ret and is_all_clear:
            tmpLog.debug('removed {0} jobs: {1}'.format(n_jobs, ','.join(batchIDs_list)))
            for batchid in batchIDs_list:
                condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid)
                retMap[condor_job_id] = (True, '')
        else:
            tmpLog.error('job removal failed; batchIDs_list={0}, got: {1}'.format(batchIDs_list, act_ret))
            # need to query queue for unterminated jobs not removed yet
            clusterids_set = set([get_job_id_tuple_from_batchid(batchid)[0] for batchid in batchIDs_list])
            clusterids_str = ','.join(list(clusterids_set))
            requirements = 'member(ClusterID, {{{0}}}) && JobStatus =!= 3 && JobStatus =!= 4'.format(clusterids_str)
            jobs_iter = self.schedd.xquery(requirements=requirements, projection=CONDOR_JOB_ADS_LIST)
            all_batchid_map = {}
            ok_batchid_list = []
            ng_batchid_list = []
            for job in jobs_iter:
                job_ads_dict = dict(job)
                batchid = get_batchid_from_job(job_ads_dict)
                all_batchid_map[batchid] = job_ads_dict
            for batchid in batchIDs_list:
                condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid)
                if batchid in all_batchid_map:
                    ng_batchid_list.append(batchid)
                    retMap[condor_job_id] = (False,
                                             'batchID={0} still unterminated in condor queue'.format(batchid))
                else:
                    ok_batchid_list.append(batchid)
                    retMap[condor_job_id] = (True, '')
            tmpLog.debug('removed {0} jobs: {1} ; failed to remove {2} jobs: {3}'.format(
                len(ok_batchid_list), ','.join(ok_batchid_list), len(ng_batchid_list), ','.join(ng_batchid_list)))
    tmpLog.debug('Done')
    # Return
    return retMap
def trigger_stage_out(self, jobspec):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                                    method_name='trigger_stage_out')
    tmpLog.debug('start')
    # loop over all files
    lifetime = 7 * 24 * 60 * 60
    allChecked = True
    ErrMsg = 'These files failed to upload : '
    zip_datasetName = 'harvester_stage_out.{0}'.format(str(uuid.uuid4()))
    fileAttrs = jobspec.get_output_file_attributes()
    for fileSpec in jobspec.outFiles:
        fileSpec.fileAttributes['transferID'] = None  # synchronous transfer
        # skip already done
        tmpLog.debug(' file: %s status: %s' % (fileSpec.lfn, fileSpec.status))
        if fileSpec.status in ['finished', 'failed']:
            continue
        # set destination RSE
        if fileSpec.fileType in ['es_output', 'zip_output', 'output']:
            dstRSE = self.dstRSE_Out
        elif fileSpec.fileType == 'log':
            dstRSE = self.dstRSE_Log
        else:
            errMsg = 'unsupported file type {0}'.format(fileSpec.fileType)
            tmpLog.error(errMsg)
            return (False, errMsg)
        # skip if destination is None
        if dstRSE is None:
            continue
        # get/set scope and dataset name
        if fileSpec.fileType != 'zip_output':
            scope = fileAttrs[fileSpec.lfn]['scope']
            datasetName = fileAttrs[fileSpec.lfn]['dataset']
        else:
            # use panda scope for zipped files
            scope = self.scopeForTmp
            datasetName = zip_datasetName
        # for now mimic behaviour and code of pilot v2 rucio copy tool (rucio download); change when needed
        executable = ['/usr/bin/env', 'rucio', '-v', 'upload']
        executable += ['--no-register']
        executable += ['--lifetime', ('%d' % lifetime)]
        executable += ['--rse', dstRSE]
        executable += ['--scope', scope]
        if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
            executable += ['--guid', fileSpec.fileAttributes['guid']]
        executable += [('%s:%s' % (scope, datasetName))]
        executable += [('%s' % fileSpec.path)]
        tmpLog.debug('rucio upload command: {0} '.format(executable))
        tmpLog.debug('rucio upload command (for human): %s ' % ' '.join(executable))
        process = subprocess.Popen(executable,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        stdout, stderr = process.communicate()
        if process.returncode == 0:
            fileSpec.status = 'finished'
            tmpLog.debug(stdout)
        else:
            # check what failed
            file_exists = False
            rucio_sessions_limit_error = False
            for line in stdout.split('\n'):
                if 'File name in specified scope already exists' in line:
                    file_exists = True
                    break
                elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line:
                    rucio_sessions_limit_error = True
            if file_exists:
                tmpLog.debug('file exists, marking transfer as finished')
                fileSpec.status = 'finished'
            elif rucio_sessions_limit_error:
                # do nothing
                tmpLog.warning('rucio returned error, will retry: stdout: %s' % stdout)
                # do not change fileSpec.status; Harvester will retry if this function returns False
                allChecked = False
                continue
            else:
                fileSpec.status = 'failed'
                tmpLog.error('rucio upload failed with stdout: %s' % stdout)
                ErrMsg += '%s failed with rucio error stdout="%s"' % (fileSpec.lfn, stdout)
                allChecked = False
        # force update
        fileSpec.force_update('status')
        tmpLog.debug('file: %s status: %s' % (fileSpec.lfn, fileSpec.status))
    # return
    tmpLog.debug('done')
    if allChecked:
        return True, ''
    else:
        return False, ErrMsg
def trigger_stage_out(self, jobspec):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                                    method_name='trigger_stage_out')
    tmpLog.debug('start')
    # loop over all files
    files = dict()
    transferIDs = dict()
    transferDatasets = dict()
    fileAttrs = jobspec.get_output_file_attributes()
    for fileSpec in jobspec.outFiles:
        # skip zipped files
        if fileSpec.zipFileID is not None:
            continue
        # skip if already processed
        if 'transferDataset' in fileSpec.fileAttributes:
            if fileSpec.fileType not in transferDatasets:
                transferDatasets[fileSpec.fileType] = fileSpec.fileAttributes['transferDataset']
            if fileSpec.fileType not in transferIDs:
                transferIDs[fileSpec.fileType] = fileSpec.fileAttributes['transferID']
            continue
        # set OS ID
        if fileSpec.fileType in ['es_output', 'zip_output']:
            fileSpec.objstoreID = self.objStoreID_ES
        # make path where file is copied for transfer
        if fileSpec.fileType != 'zip_output':
            scope = fileAttrs[fileSpec.lfn]['scope']
            datasetName = fileAttrs[fileSpec.lfn]['dataset']
        else:
            # use panda scope for zipped files
            scope = self.scopeForTmp
            datasetName = 'dummy'
        srcPath = fileSpec.path
        dstPath = mover_utils.construct_file_path(self.srcBasePath, scope, fileSpec.lfn)
        # remove
        if os.path.exists(dstPath):
            os.remove(dstPath)
        # copy
        tmpLog.debug('copy src={srcPath} dst={dstPath}'.format(srcPath=srcPath, dstPath=dstPath))
        dstDir = os.path.dirname(dstPath)
        if not os.path.exists(dstDir):
            os.makedirs(dstDir)
        shutil.copyfile(srcPath, dstPath)
        # collect files
        tmpFile = dict()
        tmpFile['scope'] = scope
        tmpFile['name'] = fileSpec.lfn
        tmpFile['bytes'] = fileSpec.fsize
        if fileSpec.fileType not in files:
            files[fileSpec.fileType] = []
        files[fileSpec.fileType].append(tmpFile)
    # loop over all file types to be registered to rucio
    rucioAPI = RucioClient()
    for fileType, fileList in iteritems(files):
        # set destination RSE
        if fileType in ['es_output', 'zip_output']:
            dstRSE = self.dstRSE_ES
        elif fileType == 'output':
            dstRSE = self.dstRSE_Out
        elif fileType == 'log':
            dstRSE = self.dstRSE_Log
        else:
            errMsg = 'unsupported file type {0}'.format(fileType)
            tmpLog.error(errMsg)
            return (False, errMsg)
        # skip if destination is None
        if dstRSE is None:
            continue
        # make datasets if missing
        if fileType not in transferDatasets:
            try:
                tmpScope = self.scopeForTmp
                tmpDS = 'panda.harvester_stage_out.{0}'.format(str(uuid.uuid4()))
                rucioAPI.add_dataset(tmpScope, tmpDS,
                                     meta={'hidden': True},
                                     lifetime=7 * 24 * 60 * 60,
                                     files=fileList,
                                     rse=self.srcRSE)
                transferDatasets[fileType] = tmpDS
                # add rule
                tmpDID = dict()
                tmpDID['scope'] = tmpScope
                tmpDID['name'] = tmpDS
                tmpRet = rucioAPI.add_replication_rule([tmpDID], 1, dstRSE,
                                                       lifetime=7 * 24 * 60 * 60)
                tmpTransferIDs = tmpRet[0]
                transferIDs[fileType] = tmpTransferIDs
                tmpLog.debug('register dataset {0} with rule {1}'.format(tmpDS, str(tmpTransferIDs)))
            except Exception:
                errMsg = core_utils.dump_error_message(tmpLog)
                return (False, errMsg)
        else:
            # add files to existing dataset
            try:
                tmpScope = self.scopeForTmp
                tmpDS = transferDatasets[fileType]
                rucioAPI.add_files_to_dataset(tmpScope, tmpDS, fileList, self.srcRSE)
                tmpLog.debug('added files to {0}'.format(tmpDS))
            except Exception:
                errMsg = core_utils.dump_error_message(tmpLog)
                return (False, errMsg)
    # set transfer datasets and rules
    for fileSpec in jobspec.outFiles:
        # skip zipped files
        if fileSpec.zipFileID is not None:
            continue
        # skip already done
        if fileSpec.status in ['finished', 'failed']:
            continue
        # skip if already processed
        if 'transferDataset' in fileSpec.fileAttributes:
            continue
        # no destination
        if fileSpec.fileType not in transferDatasets:
            fileSpec.status = 'finished'
            continue
        # set dataset
        fileSpec.fileAttributes['transferDataset'] = transferDatasets[fileSpec.fileType]
        # set rule
        fileSpec.fileAttributes['transferID'] = transferIDs[fileSpec.fileType]
        # force update
        fileSpec.force_update('fileAttributes')
    # return
    tmpLog.debug('done')
    return (True, '')
queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
initial_queueConfig_preparator = queueConfig.preparator
queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator'
queueConfig.preparator['name'] = 'GlobusBulkPreparator'
modified_queueConfig_preparator = queueConfig.preparator

pluginFactory = PluginFactory()
# get preparator plugin
preparatorCore = pluginFactory.get_plugin(queueConfig.preparator)

# logger
_logger = core_utils.setup_logger('further_testing_go_bulk_preparator')
tmpLog = core_utils.make_logger(_logger, method_name='further_testing_go_bulk_preparator')
tmpLog.debug('start')

for loggerName, loggerObj in logging.Logger.manager.loggerDict.items():
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
def submit_bag_of_workers(data_list):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, method_name='submit_bag_of_workers')
    # keep order of workers in data_list
    workerIDs_list = [data['workspec'].workerID for data in data_list]
    # initialization
    worker_retval_map = {}
    worker_data_map = {}
    host_jdl_list_workerid_map = {}
    # go
    for data in data_list:
        workspec = data['workspec']
        workerID = workspec.workerID
        worker_data_map[workerID] = data
        to_submit = data['to_submit']
        # no need to submit bad worker
        if not to_submit:
            errStr = '{0} not submitted due to incomplete data of the worker'.format(workerID)
            tmpLog.warning(errStr)
            tmpRetVal = (None, errStr)
            # return tmpRetVal, workspec.get_changed_attributes()
            worker_retval_map[workerID] = (tmpRetVal, workspec.get_changed_attributes())
        # attributes
        try:
            use_spool = data['use_spool']
        except KeyError:
            errStr = '{0} not submitted due to incomplete data of the worker'.format(workerID)
            tmpLog.warning(errStr)
            tmpRetVal = (None, errStr)
            # return tmpRetVal, workspec.get_changed_attributes()
            worker_retval_map[workerID] = (tmpRetVal, workspec.get_changed_attributes())
        else:
            workspec.reset_changed_list()
            # fill in host_jdl_list_workerid_map
            a_jdl, placeholder_map = make_a_jdl(**data)
            val = (workspec, a_jdl, placeholder_map)
            try:
                host_jdl_list_workerid_map[workspec.submissionHost].append(val)
            except KeyError:
                host_jdl_list_workerid_map[workspec.submissionHost] = [val]
    # loop over submissionHost
    for host, val_list in host_jdl_list_workerid_map.items():
        # make jdl string of workers
        jdl_list = [val[1] for val in val_list]
        # condor job submit object
        tmpLog.debug('submitting to submissionHost={0}'.format(host))
        # submit
        try:
            condor_job_submit = CondorJobSubmit(id=host)
            batchIDs_list, ret_err_str = condor_job_submit.submit(jdl_list, use_spool=use_spool)
        except Exception as e:
            batchIDs_list = None
            ret_err_str = 'Exception {0}: {1}'.format(e.__class__.__name__, e)
        # result
        if batchIDs_list:
            # submitted
            n_workers = len(val_list)
            tmpLog.debug('submitted {0} workers to submissionHost={1}'.format(n_workers, host))
            for val_i in range(n_workers):
                val = val_list[val_i]
                workspec = val[0]
                placeholder_map = val[2]
                # got batchID
                workspec.batchID = batchIDs_list[val_i]
                tmpLog.debug('workerID={0} submissionHost={1} batchID={2}'.format(
                    workspec.workerID, workspec.submissionHost, workspec.batchID))
                # get worker data
                data = worker_data_map[workspec.workerID]
                # set computingElement
                ce_info_dict = data['ce_info_dict']
                workspec.computingElement = ce_info_dict.get('ce_endpoint', '')
                # set log
                batch_log_dict = data['batch_log_dict']
                (clusterid, procid) = get_job_id_tuple_from_batchid(workspec.batchID)
                batch_log = _condor_macro_replace(batch_log_dict['batch_log'],
                                                  ClusterId=clusterid,
                                                  ProcId=procid).format(**placeholder_map)
                batch_stdout = _condor_macro_replace(batch_log_dict['batch_stdout'],
                                                     ClusterId=clusterid,
                                                     ProcId=procid).format(**placeholder_map)
                batch_stderr = _condor_macro_replace(batch_log_dict['batch_stderr'],
                                                     ClusterId=clusterid,
                                                     ProcId=procid).format(**placeholder_map)
                try:
                    batch_jdl = '{0}.jdl'.format(batch_stderr[:-4])
                except Exception:
                    batch_jdl = None
                workspec.set_log_file('batch_log', batch_log)
                workspec.set_log_file('stdout', batch_stdout)
                workspec.set_log_file('stderr', batch_stderr)
                workspec.set_log_file('jdl', batch_jdl)
                if not workspec.get_jobspec_list():
                    tmpLog.debug('No jobspec associated in the worker of workerID={0}'.format(workspec.workerID))
                else:
                    for jobSpec in workspec.get_jobspec_list():
                        # using batchLog and stdOut URL as pilotID and pilotLog
                        jobSpec.set_one_attribute('pilotID', workspec.workAttributes['stdOut'])
                        jobSpec.set_one_attribute('pilotLog', workspec.workAttributes['batchLog'])
                tmpLog.debug('Done set_log_file after submission of workerID={0}'.format(workspec.workerID))
                tmpRetVal = (True, '')
                worker_retval_map[workspec.workerID] = (tmpRetVal, workspec.get_changed_attributes())
        else:
            # failed
            tmpLog.debug('failed to submit workers to submissionHost={0} ; {1}'.format(host, ret_err_str))
            for val in val_list:
                workspec = val[0]
                errStr = 'submission failed: {0}'.format(ret_err_str)
                tmpLog.error(errStr)
                tmpRetVal = (None, errStr)
                worker_retval_map[workspec.workerID] = (tmpRetVal, workspec.get_changed_attributes())
    # make return list
    retValList = [worker_retval_map[w_id] for w_id in workerIDs_list]
    return retValList
def query_with_command(self, batchIDs_list=[]):
    # Make logger
    tmpLog = core_utils.make_logger(baseLogger, method_name='CondorJobQuery.query_with_command')
    # Start query
    tmpLog.debug('Start query')
    job_ads_all_dict = {}
    batchIDs_list = list(batchIDs_list)
    for orig_comStr in self.orig_comStr_list:
        # String of batchIDs
        batchIDs_str = ' '.join(batchIDs_list)
        # Command
        if 'condor_q' in orig_comStr or ('condor_history' in orig_comStr and batchIDs_list):
            name_opt = '-name {0}'.format(self.condor_schedd) if self.condor_schedd else ''
            pool_opt = '-pool {0}'.format(self.condor_pool) if self.condor_pool else ''
            ids = batchIDs_str
            comStr = '{cmd} {name_opt} {pool_opt} {ids}'.format(cmd=orig_comStr,
                                                                name_opt=name_opt,
                                                                pool_opt=pool_opt,
                                                                ids=ids)
        else:
            # tmpLog.debug('No batch job left to query in this cycle by this thread')
            continue
        tmpLog.debug('check with {0}'.format(comStr))
        (retCode, stdOut, stdErr) = _runShell(comStr)
        if retCode == 0:
            # Command succeeded
            job_ads_xml_str = '\n'.join(str(stdOut).split(self.badtext))
            if '<c>' in job_ads_xml_str:
                # Found at least one job
                # XML parsing
                xml_root = ET.fromstring(job_ads_xml_str)

                def _getAttribute_tuple(attribute_xml_element):
                    # Attribute name
                    _n = str(attribute_xml_element.get('n'))
                    # Attribute value text
                    _t = ' '.join(attribute_xml_element.itertext())
                    return (_n, _t)

                # Every batch job
                for _c in xml_root.findall('c'):
                    job_ads_dict = dict()
                    # Every attribute
                    attribute_iter = map(_getAttribute_tuple, _c.findall('a'))
                    job_ads_dict.update(attribute_iter)
                    batchid = str(job_ads_dict['ClusterId'])
                    condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid)
                    job_ads_all_dict[condor_job_id] = job_ads_dict
                    # Remove batch jobs already gotten from the list
                    if batchid in batchIDs_list:
                        batchIDs_list.remove(batchid)
            else:
                # Job not found
                tmpLog.debug('job not found with {0}'.format(comStr))
                continue
        else:
            # Command failed
            errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(comStr, retCode, stdOut, stdErr)
            tmpLog.error(errStr)
    if len(batchIDs_list) > 0:
        # Jobs not found via either condor_q or condor_history; mark them as unknown workers in harvester
        for batchid in batchIDs_list:
            condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid)
            job_ads_all_dict[condor_job_id] = dict()
        tmpLog.info('Unfound batch jobs of submissionHost={0}: {1}'.format(
            self.submissionHost, ' '.join(batchIDs_list)))
    # Return
    return job_ads_all_dict
def execute(self):
    # avoid too early check
    if not self.singleMode and datetime.datetime.utcnow() - self.startTime \
            < datetime.timedelta(seconds=harvester_config.watcher.checkInterval):
        return
    mainLog = core_utils.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='execute')
    mainLog.debug('start')
    # get file lock
    try:
        with core_utils.get_file_lock(lockFileName, harvester_config.watcher.checkInterval):
            logFileName = os.path.join(logDir, 'panda-db_proxy.log')
            timeNow = datetime.datetime.utcnow()
            if os.path.exists(logFileName):
                # get latest timestamp
                try:
                    p = subprocess.Popen(['tail', '-1', logFileName],
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
                    line = p.stdout.readline()
                    lastTime = datetime.datetime.strptime(line[:23], "%Y-%m-%d %H:%M:%S,%f")
                except Exception:
                    lastTime = None
                # get processing time for the last nMessages queries
                logDuration = None
                try:
                    p = subprocess.Popen('tail -{0} {1} | head -1'.format(harvester_config.watcher.nMessages,
                                                                          logFileName),
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE,
                                         shell=True)
                    line = p.stdout.readline()
                    firstTime = datetime.datetime.strptime(line[:23], "%Y-%m-%d %H:%M:%S,%f")
                    if lastTime is not None:
                        logDuration = lastTime - firstTime
                except Exception:
                    pass
                tmpMsg = 'last log message at {0}. '.format(lastTime)
                if logDuration is not None:
                    tmpMsg += '{0} messages took {1} sec'.format(harvester_config.watcher.nMessages,
                                                                 logDuration.total_seconds())
                mainLog.debug(tmpMsg)
                # check timestamp
                doAction = False
                if harvester_config.watcher.maxStalled > 0 and lastTime is not None and \
                        timeNow - lastTime > datetime.timedelta(seconds=harvester_config.watcher.maxStalled):
                    mainLog.warning('last log message is too old. seems to be stalled')
                    doAction = True
                elif harvester_config.watcher.maxDuration > 0 and logDuration is not None and \
                        logDuration.total_seconds() > harvester_config.watcher.maxDuration:
                    mainLog.warning('slow message generation. seems to be a performance issue')
                    doAction = True
                # take action
                if doAction:
                    # email
                    if 'email' in harvester_config.watcher.actions.split(','):
                        # get pass phrase
                        toSkip = False
                        mailUser = None
                        mailPass = None
                        if harvester_config.watcher.mailUser != '' and \
                                harvester_config.watcher.mailPassword != '':
                            envName = harvester_config.watcher.passphraseEnv
                            if envName not in os.environ:
                                tmpMsg = '{0} is undefined in etc/sysconfig/panda_harvester'.format(envName)
                                mainLog.error(tmpMsg)
                                toSkip = True
                            else:
                                key = os.environ[envName]
                                mailUser = core_utils.decrypt_string(key, harvester_config.watcher.mailUser)
                                mailPass = core_utils.decrypt_string(key, harvester_config.watcher.mailPassword)
                        if not toSkip:
                            # message
                            msgBody = 'harvester {0} '.format(harvester_config.master.harvester_id)
                            msgBody += 'is having a problem on {0} '.format(socket.getfqdn())
                            msgBody += 'at {0} (UTC)'.format(datetime.datetime.utcnow())
                            message = MIMEText(msgBody)
                            message['Subject'] = "Harvester Alarm"
                            message['From'] = harvester_config.watcher.mailFrom
                            message['To'] = harvester_config.watcher.mailTo
                            # send email
                            mainLog.debug('sending email to {0}'.format(harvester_config.watcher.mailTo))
                            server = smtplib.SMTP(harvester_config.watcher.mailServer,
                                                  harvester_config.watcher.mailPort)
                            if hasattr(harvester_config.watcher, 'mailUseSSL') and \
                                    harvester_config.watcher.mailUseSSL is True:
                                server.starttls()
                            if mailUser is not None and mailPass is not None:
                                server.login(mailUser, mailPass)
                            server.ehlo()
                            server.sendmail(harvester_config.watcher.mailFrom,
                                            harvester_config.watcher.mailTo.split(','),
                                            message.as_string())
                            server.quit()
                    # kill
                    if 'kill' in harvester_config.watcher.actions.split(','):
                        # send USR2 first
                        mainLog.debug('sending SIGUSR2')
                        os.killpg(os.getpgrp(), signal.SIGUSR2)
                        time.sleep(60)
                        mainLog.debug('sending SIGKILL')
                        os.killpg(os.getpgrp(), signal.SIGKILL)
            else:
                mainLog.debug('skip as {0} is missing'.format(logFileName))
    except IOError:
        mainLog.debug('skip as locked by another thread or too early to check')
    except Exception:
        core_utils.dump_error_message(mainLog)
    mainLog.debug('done')
def __init__(self, **kwarg):
    tmpLog = core_utils.make_logger(baseLogger, method_name='__init__')
    self.logBaseURL = None
    self.templateFile = None
    PluginBase.__init__(self, **kwarg)
    # number of processes
    try:
        self.nProcesses
    except AttributeError:
        self.nProcesses = 1
    else:
        if (not self.nProcesses) or (self.nProcesses < 1):
            self.nProcesses = 1
    # executable file
    try:
        self.executableFile
    except AttributeError:
        self.executableFile = None
    # condor log directory
    try:
        self.logDir
    except AttributeError:
        self.logDir = os.getenv('TMPDIR') or '/tmp'
    # Default x509 proxy for a queue
    try:
        self.x509UserProxy
    except AttributeError:
        self.x509UserProxy = os.getenv('X509_USER_PROXY')
    # x509 proxy for analysis jobs in grandly unified queues
    try:
        self.x509UserProxyAnalysis
    except AttributeError:
        self.x509UserProxyAnalysis = os.getenv('X509_USER_PROXY_ANAL')
    # ATLAS AGIS
    try:
        self.useAtlasAGIS = bool(self.useAtlasAGIS)
    except AttributeError:
        self.useAtlasAGIS = False
    # ATLAS Grid CE, requiring AGIS
    try:
        self.useAtlasGridCE = bool(self.useAtlasGridCE)
    except AttributeError:
        self.useAtlasGridCE = False
    finally:
        self.useAtlasAGIS = self.useAtlasAGIS or self.useAtlasGridCE
    # sdf template directories of CEs; ignored if templateFile is set
    try:
        self.CEtemplateDir
    except AttributeError:
        self.CEtemplateDir = ''
    # remote condor schedd and pool name (collector)
    try:
        self.condorSchedd
    except AttributeError:
        self.condorSchedd = None
    try:
        self.condorPool
    except AttributeError:
        self.condorPool = None
    # json config file of remote condor host: schedd/pool and weighting.
    # If set, condorSchedd and condorPool are overwritten
    try:
        self.condorHostConfig
    except AttributeError:
        self.condorHostConfig = False
    if self.condorHostConfig:
        try:
            self.condorSchedd = []
            self.condorPool = []
            self.condorHostWeight = []
            with open(self.condorHostConfig, 'r') as f:
                condor_host_config_map = json.load(f)
                for _schedd, _cm in condor_host_config_map.items():
                    _pool = _cm['pool']
                    _weight = int(_cm['weight'])
                    self.condorSchedd.append(_schedd)
                    self.condorPool.append(_pool)
                    self.condorHostWeight.append(_weight)
        except Exception as e:
            tmpLog.error('error when parsing condorHostConfig json file; {0}: {1}'.format(e.__class__.__name__, e))
            raise
    else:
        if isinstance(self.condorSchedd, list):
            self.condorHostWeight = [1] * len(self.condorSchedd)
        else:
            self.condorHostWeight = [1]
    # condor spool mechanism. If False, need shared FS across remote schedd
    try:
        self.useSpool
    except AttributeError:
        self.useSpool = False
    # number of workers less than this number will be bulkily submitted in only one schedd
    try:
        self.minBulkToRamdomizedSchedd
    except AttributeError:
        self.minBulkToRamdomizedSchedd = 20
    # record of information of CE statistics
    self.ceStatsLock = threading.Lock()
    self.ceStats = dict()
    # allowed associated parameters from AGIS
    self._allowed_agis_attrs = (
        'pilot_url',
    )
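# Illustrative only: the kind of JSON the condorHostConfig file parsed above is expected
# to contain, i.e. a map of schedd name to its pool (collector) and weight.
# The host names are placeholders.
example_condor_host_config = {
    "schedd1.example.org": {"pool": "collector1.example.org:9618", "weight": 2},
    "schedd2.example.org": {"pool": "collector2.example.org:9618", "weight": 1},
}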
def check_workers(self, workspec_list):
    # Make logger for batch job query
    tmpLog = core_utils.make_logger(baseLogger, '{0}'.format('batch job query'),
                                    method_name='check_workers')
    # Initialize a list of all batchIDs of workspec_list
    batchIDs_list = [str(_x.batchID) for _x in workspec_list]
    # Query commands
    orig_comStr_list = [
        'condor_q -xml',
        'condor_history -xml',
    ]
    # Record batch job query result to this dict, with key = batchID
    job_ads_all_dict = dict()
    # Start query
    for orig_comStr in orig_comStr_list:
        # String of batchIDs
        batchIDs_str = ' '.join(batchIDs_list)
        # Command
        if batchIDs_str:
            comStr = '{0} {1}'.format(orig_comStr, batchIDs_str)
        else:
            tmpLog.debug('No batch job left to query in this cycle by this thread')
            continue
        tmpLog.debug('check with {0}'.format(comStr))
        (retCode, stdOut, stdErr) = _runShell(comStr)
        if retCode == 0:
            # Command succeeded
            # Cut out redundant xml roots
            badtext = """
</classads>

<?xml version="1.0"?>
<!DOCTYPE classads SYSTEM "classads.dtd">
<classads>
"""
            job_ads_xml_str = '\n'.join(str(stdOut).split(badtext))
            if '<c>' in job_ads_xml_str:
                # Found at least one job
                # XML parsing
                xml_root = ET.fromstring(job_ads_xml_str)

                def _getAttribute_tuple(attribute_xml_element):
                    # Attribute name
                    _n = str(attribute_xml_element.get('n'))
                    # Attribute value text
                    _t = ' '.join(attribute_xml_element.itertext())
                    return (_n, _t)

                # Every batch job
                for _c in xml_root.findall('c'):
                    job_ads_dict = dict()
                    # Every attribute
                    attribute_iter = map(_getAttribute_tuple, _c.findall('a'))
                    job_ads_dict.update(attribute_iter)
                    batchid = str(job_ads_dict['ClusterId'])
                    job_ads_all_dict[batchid] = job_ads_dict
                    # Remove batch jobs already gotten from the list
                    if batchid in batchIDs_list:
                        batchIDs_list.remove(batchid)
            else:
                # Job not found
                tmpLog.debug('job not found with {0}'.format(comStr))
                continue
        else:
            # Command failed
            errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(comStr, retCode, stdOut, stdErr)
            tmpLog.error(errStr)
            return False, errStr
    if batchIDs_list:
        tmpLog.info('Unfound batch jobs: {0}'.format(' '.join(batchIDs_list)))
    # Check for all workers
    with Pool(self.nProcesses) as _pool:
        retList = _pool.map(lambda _x: _check_one_worker(_x, job_ads_all_dict), workspec_list)
    return True, retList
def create_job_from_yaml(self, yaml_content, work_spec, prod_source_label, container_image, executable, args, cert, cert_in_secret=True, cpu_adjust_ratio=100, memory_adjust_ratio=100, max_time=None): tmp_log = core_utils.make_logger(base_logger, method_name='create_job_from_yaml') # consider PULL mode as default, unless specified submit_mode = 'PULL' # create the configmap in push mode worker_id = None if work_spec.mapType != 'NoJob': submit_mode = 'PUSH' worker_id = str(work_spec.workerID) res = self.create_configmap(work_spec) if not res: # if the configmap creation failed, don't submit a job because the pod creation will hang return res, 'Failed to create a configmap' # retrieve panda queue information panda_queues_dict = PandaQueuesDict() queue_name = panda_queues_dict.get_panda_queue_name( work_spec.computingSite) # set the worker name yaml_content['metadata']['name'] = yaml_content['metadata'][ 'name'] + "-" + str(work_spec.workerID) # set the resource type and other metadata to filter the pods yaml_content['spec']['template'].setdefault('metadata', {}) yaml_content['spec']['template']['metadata'].update( {'labels': { 'resourceType': str(work_spec.resourceType) }}) # fill the container details. we can only handle one container (take the first, delete the rest) yaml_containers = yaml_content['spec']['template']['spec'][ 'containers'] del (yaml_containers[1:len(yaml_containers)]) container_env = yaml_containers[0] container_env.setdefault('resources', {}) # set the container image if 'image' not in container_env: container_env['image'] = container_image if 'command' not in container_env: container_env['command'] = executable container_env['args'] = args # set the resources (CPU and memory) we need for the container # note that predefined values in the yaml template will NOT be overwritten # Be familiar with QoS classes: https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod # The CPU & memory settings will affect the QoS for the pod container_env.setdefault('resources', {}) if work_spec.nCore > 0: # CPU limits container_env['resources'].setdefault('limits', {}) if 'cpu' not in container_env['resources']['limits']: container_env['resources']['limits']['cpu'] = str( work_spec.nCore) # CPU requests container_env['resources'].setdefault('requests', {}) if 'cpu' not in container_env['resources']['requests']: container_env['resources']['requests']['cpu'] = str( work_spec.nCore * cpu_adjust_ratio / 100.0) if work_spec.minRamCount > 4: # K8S minimum memory limit = 4 MB # memory limits # container_env['resources'].setdefault('limits', {}) # if 'memory' not in container_env['resources']['limits']: # container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M' # memory requests container_env['resources'].setdefault('requests', {}) if 'memory' not in container_env['resources']['requests']: container_env['resources']['requests']['memory'] = str( work_spec.minRamCount * memory_adjust_ratio / 100.0) + 'M' container_env.setdefault('env', []) # try to retrieve the stdout log file name try: log_file_name = work_spec.workAttributes['stdout'] except (KeyError, AttributeError): tmp_log.debug( 'work_spec does not have stdout workAttribute, using default') log_file_name = '' container_env['env'].extend([{ 'name': 'computingSite', 'value': work_spec.computingSite }, { 'name': 'pandaQueueName', 'value': queue_name }, { 'name': 'resourceType', 'value': work_spec.resourceType }, { 'name': 'prodSourceLabel', 'value': prod_source_label }, { 'name': 'jobType', 'value': 
work_spec.jobType }, { 'name': 'proxySecretPath', 'value': cert if cert_in_secret else None }, { 'name': 'proxyContent', 'value': None if cert_in_secret else self.set_proxy(cert) }, { 'name': 'workerID', 'value': str(work_spec.workerID) }, { 'name': 'logs_frontend_w', 'value': harvester_config.pandacon.pandaCacheURL_W }, { 'name': 'logs_frontend_r', 'value': harvester_config.pandacon.pandaCacheURL_R }, { 'name': 'stdout_name', 'value': log_file_name }, { 'name': 'PANDA_JSID', 'value': 'harvester-' + harvester_config.master.harvester_id }, { 'name': 'HARVESTER_WORKER_ID', 'value': str(work_spec.workerID) }, { 'name': 'HARVESTER_ID', 'value': harvester_config.master.harvester_id }, { 'name': 'submit_mode', 'value': submit_mode }]) # in push mode, add the configmap as a volume to the pod if submit_mode == 'PUSH' and worker_id: yaml_content['spec']['template']['spec'].setdefault('volumes', []) yaml_volumes = yaml_content['spec']['template']['spec']['volumes'] yaml_volumes.append({ 'name': 'job-config', 'configMap': { 'name': worker_id } }) # mount the volume to the filesystem container_env.setdefault('volumeMounts', []) container_env['volumeMounts'].append({ 'name': 'job-config', 'mountPath': CONFIG_DIR }) # set the affinity if 'affinity' not in yaml_content['spec']['template']['spec']: yaml_content = self.set_affinity(yaml_content) # set max_time to avoid having a pod running forever if 'activeDeadlineSeconds' not in yaml_content['spec']['template']['spec']: if not max_time: # 4 days max_time = 4 * 24 * 3600 yaml_content['spec']['template']['spec']['activeDeadlineSeconds'] = max_time tmp_log.debug('creating job {0}'.format(yaml_content)) rsp = self.batchv1.create_namespaced_job(body=yaml_content, namespace=self.namespace) return rsp, yaml_content
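The CPU and memory settings above decide the Kubernetes QoS class of the pod. The following standalone sketch restates the request/limit arithmetic of create_job_from_yaml with illustrative numbers only; the keys mirror the container 'resources' section:

def resource_sketch(n_core, min_ram_mb, cpu_adjust_ratio=100, memory_adjust_ratio=100):
    # Limits stay at the nominal core count; requests can be scaled down by the
    # adjust ratios, which lowers the pod QoS from Guaranteed to Burstable.
    resources = {'limits': {}, 'requests': {}}
    if n_core > 0:
        resources['limits']['cpu'] = str(n_core)
        resources['requests']['cpu'] = str(n_core * cpu_adjust_ratio / 100.0)
    if min_ram_mb > 4:  # K8S minimum memory limit is 4 MB
        resources['requests']['memory'] = str(min_ram_mb * memory_adjust_ratio / 100.0) + 'M'
    return resources

# e.g. an 8-core, 16000 MB worker with requests scaled to 90% CPU and 80% memory
print(resource_sketch(8, 16000, cpu_adjust_ratio=90, memory_adjust_ratio=80))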
def make_batch_script(workspec, template, n_core_per_node, log_dir, panda_queue_name, x509_user_proxy, ce_info_dict=dict(), batch_log_dict=dict(), special_par=''): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='make_batch_script') tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sdf', dir=workspec.get_access_point()) # Note: In workspec, unit of minRamCount and of maxDiskCount are both MB. # In HTCondor SDF, unit of request_memory is MB, and request_disk is KB. n_core_total = workspec.nCore if workspec.nCore else n_core_per_node request_ram = max( workspec.minRamCount, 1 * n_core_total) if workspec.minRamCount else 1 * n_core_total request_disk = workspec.maxDiskCount * 1024 if workspec.maxDiskCount else 1 request_walltime = workspec.maxWalltime if workspec.maxWalltime else 0 ce_info_dict = ce_info_dict.copy() batch_log_dict = batch_log_dict.copy() # possible override by AGIS special_par if special_par: special_par_attr_list = [ 'queue', 'maxWallTime', 'xcount', ] _match_special_par_dict = { attr: re.search('\({attr}=([^)]+)\)'.format(attr=attr), special_par) \ for attr in special_par_attr_list } for attr, _match in _match_special_par_dict.items(): if not _match: continue elif attr == 'queue': ce_info_dict['ce_queue_name'] = str(_match.group(1)) elif attr == 'maxWallTime': request_walltime = int(_match.group(1)) elif attr == 'xcount': n_core_total = int(_match.group(1)) tmpLog.debug( 'job attributes override by AGIS special_par: {0}={1}'.format( attr, str(_match.group(1)))) # derived job attributes n_node = _div_round_up(n_core_total, n_core_per_node) request_ram_per_core = _div_round_up(request_ram * n_node, n_core_total) request_cputime = request_walltime * n_core_total request_walltime_minute = _div_round_up(request_walltime, 60) request_cputime_minute = _div_round_up(request_cputime, 60) # fill in template tmpFile.write( template.format( nCorePerNode=n_core_per_node, nCoreTotal=n_core_total, nNode=n_node, requestRam=request_ram, requestRamPerCore=request_ram_per_core, requestDisk=request_disk, requestWalltime=request_walltime, requestWalltimeMinute=request_walltime_minute, requestCputime=request_cputime, requestCputimeMinute=request_cputime_minute, accessPoint=workspec.accessPoint, harvesterID=harvester_config.master.harvester_id, workerID=workspec.workerID, computingSite=workspec.computingSite, pandaQueueName=panda_queue_name, x509UserProxy=x509_user_proxy, ceEndpoint=ce_info_dict.get('ce_endpoint', ''), ceHostname=ce_info_dict.get('ce_hostname', ''), ceFlavour=ce_info_dict.get('ce_flavour', ''), ceJobmanager=ce_info_dict.get('ce_jobmanager', ''), ceQueueName=ce_info_dict.get('ce_queue_name', ''), ceVersion=ce_info_dict.get('ce_version', ''), logDir=log_dir, gtag=batch_log_dict.get('gtag', 'fake_GTAG_string'), )) tmpFile.close() tmpLog.debug('done') return tmpFile.name
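make_batch_script parses the AGIS special_par overrides with the pattern \({attr}=([^)]+)\). A small self-contained sketch of that parsing with a made-up special_par value; the attribute names are the ones listed in the code:

import re

def parse_special_par_sketch(special_par):
    # Each override is expected as "(attr=value)" somewhere in the string.
    overrides = {}
    for attr in ['queue', 'maxWallTime', 'xcount']:
        match = re.search(r'\({attr}=([^)]+)\)'.format(attr=attr), special_par)
        if match:
            overrides[attr] = match.group(1)
    return overrides

# Hypothetical AGIS value overriding queue name, walltime and core count
print(parse_special_par_sketch('(queue=long)(maxWallTime=5760)(xcount=8)'))
# -> {'queue': 'long', 'maxWallTime': '5760', 'xcount': '8'}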
def run(self): while True: sw = core_utils.get_stopwatch() mainLog = core_utils.make_logger(_logger, 'id={0}'.format(self.ident), method_name='run') mainLog.debug('getting jobs to propagate') jobSpecs = self.dbProxy.get_jobs_to_propagate( harvester_config.propagator.maxJobs, harvester_config.propagator.lockInterval, harvester_config.propagator.updateInterval, self.ident) mainLog.debug('got {0} jobs'.format(len(jobSpecs))) # update jobs in central database iJobs = 0 nJobs = harvester_config.propagator.nJobsInBulk hbSuppressMap = dict() while iJobs < len(jobSpecs): jobList = jobSpecs[iJobs:iJobs + nJobs] iJobs += nJobs # collect jobs to update or check jobListToSkip = [] jobListToUpdate = [] jobListToCheck = [] retList = [] for tmpJobSpec in jobList: if tmpJobSpec.computingSite not in hbSuppressMap: queueConfig = self.queueConfigMapper.get_queue( tmpJobSpec.computingSite) hbSuppressMap[ tmpJobSpec. computingSite] = queueConfig.get_no_heartbeat_status( ) # heartbeat is suppressed if tmpJobSpec.status in hbSuppressMap[ tmpJobSpec.computingSite]: # check running job to detect lost heartbeat if tmpJobSpec.status == 'running': jobListToCheck.append(tmpJobSpec) else: jobListToSkip.append(tmpJobSpec) retList.append({'StatusCode': 0, 'command': None}) else: jobListToUpdate.append(tmpJobSpec) retList += self.communicator.check_jobs(jobListToCheck) retList += self.communicator.update_jobs(jobListToUpdate) # logging for tmpJobSpec, tmpRet in zip( jobListToSkip + jobListToCheck + jobListToUpdate, retList): if tmpRet['StatusCode'] == 0: if tmpJobSpec in jobListToUpdate: mainLog.debug( 'updated PandaID={0} status={1}'.format( tmpJobSpec.PandaID, tmpJobSpec.status)) else: mainLog.debug( 'skip updating PandaID={0} status={1}'.format( tmpJobSpec.PandaID, tmpJobSpec.status)) # release job tmpJobSpec.propagatorLock = None if tmpJobSpec.is_final_status( ) and tmpJobSpec.status == tmpJobSpec.get_status(): # unset to disable further updating tmpJobSpec.propagatorTime = None tmpJobSpec.subStatus = 'done' else: # check event availability if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \ tmpJobSpec.subStatus != 'submitted': tmpEvStat, tmpEvRet = self.communicator.check_event_availability( tmpJobSpec) if tmpEvStat and tmpEvRet == 0: mainLog.debug( 'kill PandaID={0} due to no event'. 
format(tmpJobSpec.PandaID)) tmpRet['command'] = 'tobekilled' # got kill command if 'command' in tmpRet and tmpRet['command'] in [ 'tobekilled' ]: nWorkers = self.dbProxy.kill_workers_with_job( tmpJobSpec.PandaID) if nWorkers == 0: # no remaining workers tmpJobSpec.status = 'cancelled' tmpJobSpec.subStatus = 'killed' tmpJobSpec.stateChangeTime = datetime.datetime.utcnow( ) tmpJobSpec.trigger_propagation() self.dbProxy.update_job(tmpJobSpec, {'propagatorLock': self.ident}) else: mainLog.error( 'failed to update PandaID={0} status={1}'.format( tmpJobSpec.PandaID, tmpJobSpec.status)) mainLog.debug('getting workers to propagate') workSpecs = self.dbProxy.get_workers_to_propagate( harvester_config.propagator.maxWorkers, harvester_config.propagator.updateInterval) mainLog.debug('got {0} workers'.format(len(workSpecs))) # update workers in central database iWorkers = 0 nWorkers = harvester_config.propagator.nWorkersInBulk while iWorkers < len(workSpecs): workList = workSpecs[iWorkers:iWorkers + nJobs] iWorkers += nWorkers retList, tmpErrStr = self.communicator.update_workers(workList) # logging if retList is None: mainLog.error( 'failed to update workers with {0}'.format(tmpErrStr)) else: for tmpWorkSpec, tmpRet in zip(workList, retList): if tmpRet: mainLog.debug( 'updated workerID={0} status={1}'.format( tmpWorkSpec.workerID, tmpWorkSpec.status)) # update logs for logFilePath, logOffset, logSize, logRemoteName in \ tmpWorkSpec.get_log_files_to_upload(): with open(logFilePath, 'rb') as logFileObj: tmpStat, tmpErr = self.communicator.upload_file( logRemoteName, logFileObj, logOffset, logSize) if tmpStat: tmpWorkSpec.update_log_files_to_upload( logFilePath, logOffset + logSize) # disable further update if tmpWorkSpec.is_final_status(): tmpWorkSpec.disable_propagation() self.dbProxy.update_worker( tmpWorkSpec, {'workerID': tmpWorkSpec.workerID}) else: mainLog.error( 'failed to update workerID={0} status={1}'. format(tmpWorkSpec.workerID, tmpWorkSpec.status)) mainLog.debug('getting commands') commandSpecs = self.dbProxy.get_commands_for_receiver('propagator') mainLog.debug('got {0} commands'.format(len(commandSpecs))) for commandSpec in commandSpecs: if commandSpec.command.startswith( CommandSpec.COM_reportWorkerStats): # get worker stats siteName = commandSpec.command.split(':')[-1] workerStats = self.dbProxy.get_worker_stats(siteName) if len(workerStats) == 0: mainLog.error( 'failed to get worker stats for {0}'.format( siteName)) else: # report worker stats tmpRet, tmpStr = self.communicator.update_worker_stats( siteName, workerStats) if tmpRet: mainLog.debug( 'updated worker stats for {0}'.format( siteName)) else: mainLog.error( 'failed to update worker stats for {0} err={1}' .format(siteName, tmpStr)) mainLog.debug('done' + sw.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.propagator.sleepTime): mainLog.debug('terminated') return
def _handle_one_worker(workspec): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format( workspec.workerID), method_name='_handle_one_worker') # get default information from queue info n_core_per_node_from_queue = this_panda_queue_dict.get( 'corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1 ce_info_dict = dict() batch_log_dict = dict() special_par = '' if self.useAtlasGridCE: # If ATLAS Grid CE mode used tmpLog.debug('Using ATLAS Grid CE mode...') queues_from_queue_list = this_panda_queue_dict.get( 'queues', []) special_par = this_panda_queue_dict.get('special_par', '') ce_endpoint_from_queue = '' ce_flavour_str = '' ce_version_str = '' random.shuffle(queues_from_queue_list) for _queue_dict in queues_from_queue_list: if _queue_dict.get('ce_endpoint') and str( _queue_dict.get('ce_state', '')).upper() == 'ACTIVE': ce_flavour_str = str(_queue_dict.get('ce_flavour', '')).lower() ce_version_str = str(_queue_dict.get('ce_version', '')).lower() if ce_flavour_str in set( ['arc-ce', 'cream-ce', 'htcondor-ce']): ce_info_dict = _queue_dict.copy() ce_endpoint_from_queue = ce_info_dict.get( 'ce_endpoint', '') ce_info_dict['ce_hostname'] = re.sub( ':\w*', '', ce_endpoint_from_queue) break else: ce_flavour_str = '' tmpLog.debug( 'For site {0} got CE endpoint: "{1}", flavour: "{2}"'. format(self.queueName, ce_endpoint_from_queue, ce_flavour_str)) if os.path.isdir(self.CEtemplateDir) and ce_flavour_str: sdf_template_filename = '{ce_flavour_str}.sdf'.format( ce_flavour_str=ce_flavour_str) self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename) # template for batch script tmpFile = open(self.templateFile) sdf_template = tmpFile.read() tmpFile.close() # get batch_log, stdout, stderr filename for _line in sdf_template.split('\n'): if _line.startswith('#'): continue _match_batch_log = re.match('log = (.+)', _line) _match_stdout = re.match('output = (.+)', _line) _match_stderr = re.match('error = (.+)', _line) if _match_batch_log: batch_log_value = _match_batch_log.group(1) continue if _match_stdout: stdout_value = _match_stdout.group(1) continue if _match_stderr: stderr_value = _match_stderr.group(1) continue # get override requirements from queue configured try: n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue except AttributeError: n_core_per_node = n_core_per_node_from_queue # URLs for log files if not (self.logBaseURL is None): if workspec.batchID: batchID = workspec.batchID guess = False else: batchID = '' guess = True batch_log_filename = parse_batch_job_filename( value_str=batch_log_value, file_dir=self.logDir, batchID=batchID, guess=guess) stdout_path_file_name = parse_batch_job_filename( value_str=stdout_value, file_dir=self.logDir, batchID=batchID, guess=guess) stderr_path_filename = parse_batch_job_filename( value_str=stderr_value, file_dir=self.logDir, batchID=batchID, guess=guess) batch_log = '{0}/{1}'.format(self.logBaseURL, batch_log_filename) batch_stdout = '{0}/{1}'.format(self.logBaseURL, stdout_path_file_name) batch_stderr = '{0}/{1}'.format(self.logBaseURL, stderr_path_filename) workspec.set_log_file('batch_log', batch_log) workspec.set_log_file('stdout', batch_stdout) workspec.set_log_file('stderr', batch_stderr) batch_log_dict['batch_log'] = batch_log batch_log_dict['batch_stdout'] = batch_stdout batch_log_dict['batch_stderr'] = batch_stderr batch_log_dict['gtag'] = workspec.workAttributes['stdOut'] tmpLog.debug('Done set_log_file') if not workspec.get_jobspec_list(): tmpLog.debug( 'No 
jobspec associated in the worker of workerID={0}'. format(workspec.workerID)) else: for jobSpec in workspec.get_jobspec_list(): # using batchLog and stdOut URL as pilotID and pilotLog jobSpec.set_one_attribute( 'pilotID', workspec.workAttributes['stdOut']) jobSpec.set_one_attribute( 'pilotLog', workspec.workAttributes['batchLog']) tmpLog.debug('Done jobspec attribute setting') # set data dict data = { 'workspec': workspec, 'template': sdf_template, 'log_dir': self.logDir, 'n_core_per_node': n_core_per_node, 'panda_queue_name': panda_queue_name, 'x509_user_proxy': self.x509UserProxy, 'ce_info_dict': ce_info_dict, 'batch_log_dict': batch_log_dict, 'special_par': special_par, } return data
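parse_batch_job_filename is called here with value_str, file_dir, batchID and guess arguments but is not defined in this section. The sketch below is an assumption about its behaviour, namely turning the right-hand side of an SDF 'log = ...' line into a concrete file name; the real helper is likely more involved:

import os
import re

def parse_batch_job_filename_sketch(value_str, file_dir, batchID='', guess=False):
    # Assumption-based stand-in; file_dir is unused in this simplified version
    # (the real helper presumably scans it when guess=True and no batchID exists).
    filename = os.path.basename(value_str)
    if guess:
        return re.sub(r'\$\([A-Za-z]+\)', '*', filename)
    return re.sub(r'\$\(Cluster(?:Id)?\)', str(batchID), filename)

# e.g. the value of "log = /var/log/harvester/grid.$(Cluster).log" after submission
print(parse_batch_job_filename_sketch('/var/log/harvester/grid.$(Cluster).log',
                                      '/var/log/harvester', batchID='123456'))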
def submit_a_worker(data): workspec = data['workspec'] template = data['template'] log_dir = data['log_dir'] n_core_per_node = data['n_core_per_node'] panda_queue_name = data['panda_queue_name'] x509_user_proxy = data['x509_user_proxy'] ce_info_dict = data['ce_info_dict'] batch_log_dict = data['batch_log_dict'] special_par = data['special_par'] workspec.reset_changed_list() # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='submit_a_worker') # make batch script # batchFile = make_batch_script(workspec=workspec, template=template, n_core_per_node=n_core_per_node, log_dir=log_dir, # panda_queue_name=panda_queue_name, x509_user_proxy=x509_user_proxy, # ce_info_dict=ce_info_dict, batch_log_dict=batch_log_dict, special_par=special_par) batchFile = make_batch_script(**data) # command comStr = 'condor_submit {0}'.format(batchFile) # submit tmpLog.debug('submit with {0}'.format(batchFile)) try: p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode except: stdOut = '' stdErr = core_utils.dump_error_message(tmpLog, no_message=True) retCode = 1 tmpLog.debug('retCode={0}'.format(retCode)) if retCode == 0: # extract batchID job_id_match = None for tmp_line_str in stdOut.split('\n'): job_id_match = re.search( '^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', tmp_line_str) if job_id_match: break if job_id_match is not None: workspec.batchID = job_id_match.group(2) tmpLog.debug('batchID={0}'.format(workspec.batchID)) batch_log = _condor_macro_replace(batch_log_dict['batch_log'], ClusterId=workspec.batchID) batch_stdout = _condor_macro_replace( batch_log_dict['batch_stdout'], ClusterId=workspec.batchID) batch_stderr = _condor_macro_replace( batch_log_dict['batch_stderr'], ClusterId=workspec.batchID) workspec.set_log_file('batch_log', batch_log) workspec.set_log_file('stdout', batch_stdout) workspec.set_log_file('stderr', batch_stderr) tmpRetVal = (True, '') else: errStr = 'batchID cannot be found' tmpLog.error(errStr) tmpRetVal = (False, errStr) else: # failed errStr = '{0} \n {1}'.format(stdOut, stdErr) tmpLog.error(errStr) tmpRetVal = (False, errStr) return tmpRetVal, workspec.get_changed_attributes()
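After a successful condor_submit the log URLs are rewritten with _condor_macro_replace, which is defined elsewhere. A minimal sketch, assuming it simply substitutes Condor macros such as $(ClusterId) with the keyword values passed in:

import re

def condor_macro_replace_sketch(string, **kwarg):
    # Hypothetical stand-in: replace "$(Cluster)"/"$(ClusterId)" macros in a
    # path or URL with the concrete batch ID supplied as a keyword argument.
    new_string = string
    macro_map = {
        r'\$\(Cluster\)': str(kwarg.get('ClusterId', '')),
        r'\$\(ClusterId\)': str(kwarg.get('ClusterId', '')),
    }
    for macro, value in macro_map.items():
        new_string = re.sub(macro, value, new_string)
    return new_string

# e.g. fill in the batch ID extracted from the condor_submit output
print(condor_macro_replace_sketch('http://logs.example.org/grid.$(ClusterId).log',
                                  ClusterId='987654'))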
queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_preparator = queueConfig.preparator queueConfig.preparator['module'] = 'pandaharvester.harvesterpreparator.go_preparator' queueConfig.preparator['name'] = 'GoPreparator' modified_queueConfig_preparator = queueConfig.preparator pluginFactory = PluginFactory() # get stage-in (preparator) plugin preparatorCore = pluginFactory.get_plugin(queueConfig.preparator) # logger _logger = core_utils.setup_logger('stageInTest_go_preparator') tmpLog = core_utils.make_logger(_logger, method_name='stageInTest_go_preparator') tmpLog.debug('start') for loggerName, loggerObj in logging.Logger.manager.loggerDict.items(): # print("loggerName - {}".format(loggerName)) if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: continue if loggerName.split('.')[-1] in ['db_proxy']: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(preparatorCore.__class__.__name__) tmpLog.debug(msgStr)
def _check_one_worker(workspec, job_ads_all_dict): # Make logger for one single worker tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='_check_one_worker') ## Initialize newStatus newStatus = workspec.status errStr = '' name_opt, pool_opt = '', '' if workspec.submissionHost: try: condor_schedd, condor_pool = workspec.submissionHost.split( ',')[0:2] except ValueError: pass name_opt = '-name {0}'.format(condor_schedd) if condor_schedd else '' pool_opt = '-pool {0}'.format(condor_pool) if condor_pool else '' try: job_ads_dict = job_ads_all_dict[str(workspec.batchID)] except KeyError: got_job_ads = False except Exception as e: got_job_ads = False tmpLog.error('With error {0}'.format(e)) else: got_job_ads = True ## Parse job ads if got_job_ads: ## Check JobStatus try: batchStatus = job_ads_dict['JobStatus'] except KeyError: errStr = 'cannot get JobStatus of job batchID={0}. Regard the worker as canceled by default'.format( workspec.batchID) tmpLog.error(errStr) newStatus = WorkSpec.ST_cancelled else: # Propagate native condor job status workspec.nativeStatus = CONDOR_JOB_STATUS_MAP.get( batchStatus, 'unexpected') if batchStatus in ['2', '6']: # 2 running, 6 transferring output newStatus = WorkSpec.ST_running elif batchStatus in ['1', '7']: # 1 idle, 7 suspended newStatus = WorkSpec.ST_submitted elif batchStatus in ['3']: # 3 removed errStr = 'Condor HoldReason: {0} ; Condor RemoveReason: {1} '.format( job_ads_dict.get('LastHoldReason'), job_ads_dict.get('RemoveReason')) newStatus = WorkSpec.ST_cancelled elif batchStatus in ['5']: # 5 held if (job_ads_dict.get('HoldReason') == 'Job not found' or int(time.time()) - int(job_ads_dict.get('EnteredCurrentStatus', 0)) > 7200): # Kill the job if held too long or other reasons (retCode, stdOut, stdErr) = _runShell( 'condor_rm {name_opt} {pool_opt} {batchID}'.format( batchID=workspec.batchID, name_opt=name_opt, pool_opt=pool_opt, )) if retCode == 0: tmpLog.info('killed held job batchID={0}'.format( workspec.batchID)) else: newStatus = WorkSpec.ST_cancelled tmpLog.error( 'cannot kill held job batchID={0}. Force worker to be in cancelled status' .format(workspec.batchID)) # Mark the PanDA job as closed instead of failed workspec.set_pilot_closed() tmpLog.debug('Called workspec set_pilot_closed') else: newStatus = WorkSpec.ST_submitted elif batchStatus in ['4']: # 4 completed try: payloadExitCode = job_ads_dict['ExitCode'] except KeyError: errStr = 'cannot get ExitCode of job batchID={0}'.format( workspec.batchID) tmpLog.error(errStr) newStatus = WorkSpec.ST_failed else: # Propagate condor return code workspec.nativeExitCode = payloadExitCode if payloadExitCode in ['0']: # Payload should return 0 after successful run newStatus = WorkSpec.ST_finished else: # Other return codes are considered failed newStatus = WorkSpec.ST_failed errStr = 'Payload execution error: returned non-zero' tmpLog.debug(errStr) tmpLog.info('Payload return code = {0}'.format( payloadExitCode)) else: errStr = 'cannot get reasonable JobStatus of job batchID={0}. Regard the worker as failed by default'.format( workspec.batchID) tmpLog.error(errStr) newStatus = WorkSpec.ST_failed tmpLog.info( 'batchID={0} : batchStatus {1} -> workerStatus {2}'.format( workspec.batchID, batchStatus, newStatus)) else: tmpLog.error( 'condor job batchID={0} not found. 
Regard the worker as canceled by default' .format(workspec.batchID)) newStatus = WorkSpec.ST_cancelled tmpLog.info( 'batchID={0}: job ads not found -> workerStatus {1}'.format( workspec.batchID, newStatus)) ## Return return (newStatus, errStr)
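_check_one_worker stores a human-readable status in workspec.nativeStatus via CONDOR_JOB_STATUS_MAP, which is defined outside this section. A sketch of such a mapping, consistent with the numeric JobStatus codes handled above; the exact label strings are an assumption:

# HTCondor JobStatus codes appear as strings in the parsed classads.
CONDOR_JOB_STATUS_MAP = {
    '1': 'idle',
    '2': 'running',
    '3': 'removed',
    '4': 'completed',
    '5': 'held',
    '6': 'transferring_output',
    '7': 'suspended',
}

# e.g. a held job; 'unexpected' is used only for unknown codes
print(CONDOR_JOB_STATUS_MAP.get('5', 'unexpected'))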
def get_files_to_stage_out(self, workspec): # get logger tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format( workspec.workerID), method_name='get_files_to_stage_out') fileDict = dict() # look for the json just under the access point for pandaID in workspec.pandaid_list: # look for the json just under the access point accessPoint = self.get_access_point(workspec, pandaID) jsonFilePath = os.path.join(accessPoint, jsonOutputsFileName) readJsonPath = jsonFilePath + suffixReadJson # first look for json.read which is not yet acknowledged tmpLog.debug('looking for output file {0}'.format(readJsonPath)) if os.path.exists(readJsonPath): pass else: tmpLog.debug( 'looking for output file {0}'.format(jsonFilePath)) if not os.path.exists(jsonFilePath): # not found tmpLog.debug('not found') continue try: tmpLog.debug('found') # rename to prevent from being overwritten os.rename(jsonFilePath, readJsonPath) except Exception: tmpLog.error('failed to rename json') continue # load json toSkip = False loadDict = None try: with open(readJsonPath) as jsonFile: loadDict = json.load(jsonFile) except Exception: tmpLog.error('failed to load json') toSkip = True # test validity of data format (ie it should be a Dictionary) if not toSkip: if not isinstance(loadDict, dict): tmpLog.error('loaded data is not a dictionary') toSkip = True # collect files and events nData = 0 if not toSkip: sizeMap = dict() chksumMap = dict() eventsList = dict() for tmpPandaID, tmpEventMapList in iteritems(loadDict): tmpPandaID = long(tmpPandaID) # test if tmpEventMapList is a list if not isinstance(tmpEventMapList, list): tmpLog.error('loaded data item is not a list') toSkip = True break for tmpEventInfo in tmpEventMapList: try: nData += 1 if 'eventRangeID' in tmpEventInfo: tmpEventRangeID = tmpEventInfo['eventRangeID'] else: tmpEventRangeID = None tmpFileDict = dict() pfn = tmpEventInfo['path'] lfn = os.path.basename(pfn) tmpFileDict['path'] = pfn if pfn not in sizeMap: if 'fsize' in tmpEventInfo: sizeMap[pfn] = tmpEventInfo['fsize'] else: sizeMap[pfn] = os.stat(pfn).st_size tmpFileDict['fsize'] = sizeMap[pfn] tmpFileDict['type'] = tmpEventInfo['type'] if tmpEventInfo['type'] in ['log', 'output']: # disable zipping tmpFileDict['isZip'] = 0 elif tmpEventInfo['type'] == 'zip_output': # already zipped tmpFileDict['isZip'] = 1 elif 'isZip' in tmpEventInfo: tmpFileDict['isZip'] = tmpEventInfo['isZip'] # guid if 'guid' in tmpEventInfo: tmpFileDict['guid'] = tmpEventInfo['guid'] else: tmpFileDict['guid'] = str(uuid.uuid4()) # get checksum if pfn not in chksumMap: if 'chksum' in tmpEventInfo: chksumMap[pfn] = tmpEventInfo['chksum'] else: chksumMap[pfn] = core_utils.calc_adler32( pfn) tmpFileDict['chksum'] = chksumMap[pfn] if tmpPandaID not in fileDict: fileDict[tmpPandaID] = dict() if lfn not in fileDict[tmpPandaID]: fileDict[tmpPandaID][lfn] = [] fileDict[tmpPandaID][lfn].append(tmpFileDict) # skip if unrelated to events if tmpFileDict['type'] not in [ 'es_output', 'zip_output' ]: continue tmpFileDict['eventRangeID'] = tmpEventRangeID if tmpPandaID not in eventsList: eventsList[tmpPandaID] = list() eventsList[tmpPandaID].append({ 'eventRangeID': tmpEventRangeID, 'eventStatus': tmpEventInfo['eventStatus'] }) except Exception: core_utils.dump_error_message(tmpLog) # dump events if not toSkip: if len(eventsList) > 0: curName = os.path.join(accessPoint, jsonEventsUpdateFileName) newName = curName + '.new' f = open(newName, 'w') json.dump(eventsList, f) f.close() os.rename(newName, curName) # remove empty file if toSkip or nData == 0: 
try: os.remove(readJsonPath) except Exception: pass tmpLog.debug('got {0} files for PandaID={1}'.format( nData, pandaID)) return fileDict
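get_files_to_stage_out expects the jsonOutputs file to map each PanDA ID to a list of file/event dictionaries. An illustrative payload with made-up IDs and paths, limited to the keys the parser actually reads ('path', 'type', 'eventRangeID' and the optional 'fsize', 'chksum', 'guid', 'eventStatus'):

import json

# Hypothetical content of the jsonOutputs file written by the payload.
example_outputs = {
    "1234567890": [
        {
            "eventRangeID": "1234567890-1-2-3",
            "eventStatus": "finished",
            "path": "/data/harvester/access/HITS.pool.root.1",
            "type": "es_output",
            "fsize": 123456,
            "chksum": "ad:12345678",
            "guid": "0f1e2d3c-0000-1111-2222-333344445555"
        },
        {
            "path": "/data/harvester/access/pilotlog.txt",
            "type": "log"
        }
    ]
}
print(json.dumps(example_outputs, indent=2))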
def do_POST(self): # logger if self.tmpLog is None: self.tmpLog = core_utils.make_logger(_logger) toSkip = False form = None methodName = None dataStr = None message = '' # parse the form data posted try: form = self.get_form() except Exception: message = 'corrupted json' toSkip = True # check parameters if not toSkip: toSkip = True # method is not set if 'methodName' not in form: message = 'methodName is not given' self.send_response(400) elif 'workerID' not in form: message = 'workerID is not given' self.send_response(400) elif 'data' not in form: message = 'data is not given' self.send_response(400) else: toSkip = False # get worker if not toSkip: try: workerID = form['workerID'] workSpec = self.dbProxy.get_worker_with_id(workerID) if workSpec is None: message = 'workerID={0} not found in DB'.format(workerID) self.send_response(400) else: # chose file and operation for each action methodName = form['methodName'] opType = None filePath = '' if methodName == 'requestJobs': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.jsonJobRequestFileName) opType = 'w' elif methodName == 'getJobs': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.jobSpecFileName) opType = 'r' elif methodName == 'requestEventRanges': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.jsonEventsRequestFileName) opType = 'w' elif methodName == 'getEventRanges': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.jsonEventsFeedFileName) opType = 'r' elif methodName == 'updateJobs': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.jsonAttrsFileName) opType = 'w' elif methodName == 'uploadJobReport': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.jsonJobReport) opType = 'w' elif methodName == 'uploadEventOutputDump': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.jsonOutputsFileName) opType = 'w' elif methodName == 'setPandaIDs': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.pandaIDsFile) opType = 'w' elif methodName == 'killWorker': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.killWorkerFile) opType = 'w' elif methodName == 'heartbeat': filePath = os.path.join( workSpec.get_access_point(), shared_file_messenger.heartbeatFile) opType = 'w' else: self.send_response(501) message = 'method not implemented' toSkip = True # take action if not toSkip: # write actions if opType == 'w': # check if file exists. 
Methods such as heartbeat however need to overwrite the file if os.path.exists(filePath) and methodName not in [ 'heartbeat' ]: message = 'previous request is not yet processed' self.send_response(503) else: with open(filePath, 'w') as fileHandle: json.dump(form['data'], fileHandle) message = 'OK' self.send_response(200) else: # read actions if os.path.exists(filePath): with open(filePath) as fileHandle: try: _message = json.load(fileHandle) message = json.dumps(_message) self.send_header( 'Content-Type', 'application/json') except JSONDecodeError: _f_qs = open(filePath).read() # _message = dict(parse_qsl(_f_qs, keep_blank_values=True)) message = _f_qs self.send_header( 'Content-Type', 'text/plain') self.send_response(200) else: message = 'previous request is not yet processed' self.send_response(503) except Exception: self.send_response(500) message = core_utils.dump_error_message(_logger) if harvester_config.frontend.verbose: self.tmpLog.debug('method={0} json={1} msg={2}'.format( methodName, dataStr, message)) # set the response self.do_postprocessing(message) return
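A worker interacts with this frontend by POSTing a JSON body containing methodName, workerID and data. A minimal client-side sketch using the requests library; the URL, port and worker ID are made up, and the real pilot wrapper may use a different transport:

import requests

# Hypothetical harvester frontend endpoint, for illustration only.
frontend_url = 'http://harvester.example.org:25080'
payload = {
    'methodName': 'heartbeat',
    'workerID': 123,
    'data': {'siteName': 'EXAMPLE_SITE'},
}
resp = requests.post(frontend_url, json=payload, timeout=60)
print(resp.status_code, resp.text)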
def run(self): lockedBy = 'monitor-{0}'.format(self.ident) # init messengers for queueConfig in self.queueConfigMapper.get_all_queues().values(): # just import for module initialization self.pluginFactory.get_plugin(queueConfig.messenger) # main while True: sw = core_utils.get_stopwatch() mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting workers to monitor') workSpecsPerQueue = self.dbProxy.get_workers_to_update( harvester_config.monitor.maxWorkers, harvester_config.monitor.checkInterval, harvester_config.monitor.lockInterval, lockedBy) mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue))) # loop over all workers for queueName, workSpecsList in iteritems(workSpecsPerQueue): tmpQueLog = core_utils.make_logger(_logger, 'id={0} queue={1}'.format( lockedBy, queueName), method_name='run') # check queue if not self.queueConfigMapper.has_queue(queueName): tmpQueLog.error('config not found') continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) # get plugins monCore = self.pluginFactory.get_plugin(queueConfig.monitor) messenger = self.pluginFactory.get_plugin( queueConfig.messenger) # check workers allWorkers = [ item for sublist in workSpecsList for item in sublist ] tmpQueLog.debug('checking {0} workers'.format(len(allWorkers))) tmpRetMap = self.check_workers(monCore, messenger, allWorkers, queueConfig, tmpQueLog) # loop over all worker chunks tmpQueLog.debug('update jobs and workers') iWorker = 0 for workSpecs in workSpecsList: jobSpecs = None filesToStageOut = dict() pandaIDsList = [] eventsToUpdateList = [] filesToStageOutList = [] for workSpec in workSpecs: tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format( workSpec.workerID), method_name='run') tmpOut = tmpRetMap[workSpec.workerID] newStatus = tmpOut['newStatus'] monStatus = tmpOut['monStatus'] diagMessage = tmpOut['diagMessage'] workAttributes = tmpOut['workAttributes'] eventsToUpdate = tmpOut['eventsToUpdate'] filesToStageOut = tmpOut['filesToStageOut'] eventsRequestParams = tmpOut['eventsRequestParams'] nJobsToReFill = tmpOut['nJobsToReFill'] pandaIDs = tmpOut['pandaIDs'] tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} ' tmpStr += 'postProcessed={3} files={4}' tmpLog.debug( tmpStr.format(newStatus, monStatus, diagMessage, workSpec.is_post_processed(), str(filesToStageOut))) iWorker += 1 # check status if newStatus not in WorkSpec.ST_LIST: tmpLog.error( 'unknown status={0}'.format(newStatus)) continue # update worker workSpec.set_status(newStatus) workSpec.set_work_attributes(workAttributes) # request events if eventsRequestParams != {}: workSpec.eventsRequest = WorkSpec.EV_requestEvents workSpec.eventsRequestParams = eventsRequestParams # jobs to refill if nJobsToReFill is not None: workSpec.nJobsToReFill = nJobsToReFill # get associated jobs for the worker chunk if workSpec.hasJob == 1 and jobSpecs is None: jobSpecs = self.dbProxy.get_jobs_with_worker_id( workSpec.workerID, None, only_running=True) # pandaIDs for push pandaIDsList.append(pandaIDs) if len(eventsToUpdate) > 0: eventsToUpdateList.append(eventsToUpdate) if len(filesToStageOut) > 0: filesToStageOutList.append(filesToStageOut) # update jobs and workers if jobSpecs is not None: tmpQueLog.debug( 'updating {0} jobs with {1} workers'.format( len(jobSpecs), len(workSpecs))) core_utils.update_job_attributes_with_workers( queueConfig.mapType, jobSpecs, workSpecs, filesToStageOutList, eventsToUpdateList) for jobSpec in jobSpecs: tmpLog = core_utils.make_logger( _logger, 
'PandaID={0}'.format(jobSpec.PandaID), method_name='run') tmpLog.debug( 'new status={0} subStatus={1} status_in_metadata={2}' .format( jobSpec.status, jobSpec.subStatus, jobSpec.get_job_status_from_attributes())) # update local database tmpRet = self.dbProxy.update_jobs_workers( jobSpecs, workSpecs, lockedBy, pandaIDsList) if not tmpRet: for workSpec in workSpecs: tmpLog = core_utils.make_logger( _logger, 'workerID={0}'.format(workSpec.workerID), method_name='run') tmpLog.error( 'failed to update the DB. lockInterval may be too short' ) # send ACK to workers for events and files if len(eventsToUpdateList) > 0 or len( filesToStageOutList) > 0: for workSpec in workSpecs: messenger.acknowledge_events_files(workSpec) tmpQueLog.debug('done') mainLog.debug('done' + sw.get_elapsed_time()) # check if being terminated if self.terminated(harvester_config.monitor.sleepTime): mainLog.debug('terminated') return
def _handle_one_worker(workspec, to_submit=to_submit_any): # make logger tmpLog = core_utils.make_logger(baseLogger, 'site={0} workerID={1}'.format(self.queueName, workspec.workerID), method_name='_handle_one_worker') def _choose_proxy(workspec): """ Choose the proxy based on the job type """ job_type = workspec.jobType proxy = self.x509UserProxy if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis') and self.x509UserProxyAnalysis: tmpLog.debug('Taking analysis proxy') proxy = self.x509UserProxyAnalysis else: tmpLog.debug('Taking default proxy') return proxy # initialize ce_info_dict = dict() batch_log_dict = dict() data = {'workspec': workspec, 'to_submit': to_submit,} if to_submit: sdf_template_file = None if self.useAtlasGridCE: # choose a CE tmpLog.info('choose a CE...') ce_chosen = _choose_ce(ce_weighting) try: ce_info_dict = ce_auxilary_dict[ce_chosen].copy() except KeyError: tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint') ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy() # go on info of the CE; ignore protocol prefix in ce_endpoint ce_endpoint_from_queue = re.sub('^\w+://', '', ce_info_dict.get('ce_endpoint', '')) ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower() ce_version_str = str(ce_info_dict.get('ce_version', '')).lower() ce_info_dict['ce_hostname'] = re.sub(':\w*', '', ce_endpoint_from_queue) if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue: # add default port to ce_endpoint if missing default_port_map = { 'cream-ce': 8443, 'arc-ce': 2811, 'htcondor-ce': 9619, } if ce_flavour_str in default_port_map: default_port = default_port_map[ce_flavour_str] ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port) tmpLog.debug('Got pilot version: "{0}"; CE endpoint: "{1}", flavour: "{2}"'.format( pilot_version, ce_endpoint_from_queue, ce_flavour_str)) if self.templateFile: sdf_template_file = self.templateFile elif os.path.isdir(self.CEtemplateDir) and ce_flavour_str: sdf_template_filename = '{ce_flavour_str}{sdf_suffix_str}.sdf'.format( ce_flavour_str=ce_flavour_str, sdf_suffix_str=sdf_suffix_str) sdf_template_file = os.path.join(self.CEtemplateDir, sdf_template_filename) else: if self.templateFile: sdf_template_file = self.templateFile try: # Manually define site condor schedd as ceHostname and central manager as ceEndpoint if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0: if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0: ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(list(zip(self.ceHostname, self.ceEndpoint))) else: ce_info_dict['ce_hostname'] = random.choice(self.ceHostname) ce_info_dict['ce_endpoint'] = self.ceEndpoint else: ce_info_dict['ce_hostname'] = self.ceHostname ce_info_dict['ce_endpoint'] = self.ceEndpoint except AttributeError: pass try: # Manually define ceQueueName if self.ceQueueName: ce_info_dict['ce_queue_name'] = self.ceQueueName except AttributeError: pass # template for batch script try: tmpFile = open(sdf_template_file) sdf_template_raw = tmpFile.read() tmpFile.close() except AttributeError: tmpLog.error('No valid templateFile found. 
Maybe templateFile, CEtemplateDir invalid, or no valid CE found') to_submit = False return data else: # get batch_log, stdout, stderr filename, and remobe commented liness sdf_template_str_list = [] for _line in sdf_template_raw.split('\n'): if _line.startswith('#'): continue sdf_template_str_list.append(_line) _match_batch_log = re.match('log = (.+)', _line) _match_stdout = re.match('output = (.+)', _line) _match_stderr = re.match('error = (.+)', _line) if _match_batch_log: batch_log_value = _match_batch_log.group(1) continue if _match_stdout: stdout_value = _match_stdout.group(1) continue if _match_stderr: stderr_value = _match_stderr.group(1) continue sdf_template = '\n'.join(sdf_template_str_list) # Choose from Condor schedd and central managers condor_schedd, condor_pool = random.choice(schedd_pool_choice_list) # set submissionHost if not condor_schedd and not condor_pool: workspec.submissionHost = 'LOCAL' else: workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool) tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost)) # Log Base URL if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL: schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?', lambda matchobj: matchobj.group(1) if matchobj.group(1) else '', condor_schedd) log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL) else: log_base_url = self.logBaseURL # URLs for log files if not (log_base_url is None): if workspec.batchID: batchID = workspec.batchID guess = False else: batchID = '' guess = True batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename) batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name) batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename) workspec.set_log_file('batch_log', batch_log) workspec.set_log_file('stdout', batch_stdout) workspec.set_log_file('stderr', batch_stderr) batch_log_dict['batch_log'] = batch_log batch_log_dict['batch_stdout'] = batch_stdout batch_log_dict['batch_stderr'] = batch_stderr batch_log_dict['gtag'] = workspec.workAttributes['stdOut'] tmpLog.debug('Done set_log_file before submission') tmpLog.debug('Done jobspec attribute setting') # choose the x509 certificate based on the type of job (analysis or production) proxy = _choose_proxy(workspec) # set data dict data.update({ 'workspec': workspec, 'to_submit': to_submit, 'template': sdf_template, 'executable_file': self.executableFile, 'log_dir': self.logDir, 'log_subdir': log_subdir, 'n_core_per_node': n_core_per_node, 'panda_queue_name': panda_queue_name, 'x509_user_proxy': proxy, 'ce_info_dict': ce_info_dict, 'batch_log_dict': batch_log_dict, 'special_par': special_par, 'harvester_queue_config': harvester_queue_config, 'is_unified_queue': is_unified_queue, 'condor_schedd': condor_schedd, 'condor_pool': condor_pool, 'use_spool': self.useSpool, 'pilot_url': pilot_url, 'pilot_version': pilot_version, 'python_version': python_version, }) return data
def submit_a_worker(data): workspec = data['workspec'] to_submit = data['to_submit'] # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='submit_a_worker') # no need to submit bad worker if not to_submit: errStr = 'Not submitted, due to incomplete data of the worker' tmpLog.warning(errStr) tmpRetVal = (None, errStr) return tmpRetVal, workspec.get_changed_attributes() # attributes try: ce_info_dict = data['ce_info_dict'] batch_log_dict = data['batch_log_dict'] condor_schedd = data['condor_schedd'] condor_pool = data['condor_pool'] use_spool = data['use_spool'] except KeyError: errStr = 'Not submitted, due to incomplete data of the worker' tmpLog.warning(errStr) tmpRetVal = (None, errStr) return tmpRetVal, workspec.get_changed_attributes() else: workspec.reset_changed_list() # make batch script batchFile = make_batch_script(**data) # make condor remote options name_opt = '-name {0}'.format(condor_schedd) if condor_schedd else '' pool_opt = '-pool {0}'.format(condor_pool) if condor_pool else '' spool_opt = '-spool'.format( use_spool) if use_spool and condor_schedd else '' # command comStr = 'condor_submit {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format( sdf_file=batchFile, name_opt=name_opt, pool_opt=pool_opt, spool_opt=spool_opt) # submit tmpLog.debug('submit with command: {0}'.format(comStr)) try: p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # check return code stdOut, stdErr = p.communicate() retCode = p.returncode except Exception: stdOut = '' stdErr = core_utils.dump_error_message(tmpLog, no_message=True) retCode = 1 tmpLog.debug('retCode={0}'.format(retCode)) if retCode == 0: # extract batchID job_id_match = None for tmp_line_str in stdOut.split('\n'): job_id_match = re.search( '^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', tmp_line_str) if job_id_match: break if job_id_match is not None: workspec.batchID = job_id_match.group(2) # set submissionHost if not condor_schedd and not condor_pool: workspec.submissionHost = None else: workspec.submissionHost = '{0},{1}'.format( condor_schedd, condor_pool) tmpLog.debug('submissionHost={0} batchID={1}'.format( workspec.submissionHost, workspec.batchID)) # set computingElement workspec.computingElement = ce_info_dict.get('ce_endpoint', '') # set log batch_log = _condor_macro_replace(batch_log_dict['batch_log'], ClusterId=workspec.batchID) batch_stdout = _condor_macro_replace( batch_log_dict['batch_stdout'], ClusterId=workspec.batchID) batch_stderr = _condor_macro_replace( batch_log_dict['batch_stderr'], ClusterId=workspec.batchID) workspec.set_log_file('batch_log', batch_log) workspec.set_log_file('stdout', batch_stdout) workspec.set_log_file('stderr', batch_stderr) if not workspec.get_jobspec_list(): tmpLog.debug( 'No jobspec associated in the worker of workerID={0}'. format(workspec.workerID)) else: for jobSpec in workspec.get_jobspec_list(): # using batchLog and stdOut URL as pilotID and pilotLog jobSpec.set_one_attribute( 'pilotID', workspec.workAttributes['stdOut']) jobSpec.set_one_attribute( 'pilotLog', workspec.workAttributes['batchLog']) tmpLog.debug('Done set_log_file after submission') tmpRetVal = (True, '') else: errStr = 'batchID cannot be found' tmpLog.error(errStr) tmpRetVal = (None, errStr) else: # failed errStr = '{0} \n {1}'.format(stdOut, stdErr) tmpLog.error(errStr) tmpRetVal = (None, errStr) return tmpRetVal, workspec.get_changed_attributes()
def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, executable_file, x509_user_proxy, log_subdir=None, ce_info_dict=dict(), batch_log_dict=dict(), pilot_url=None, special_par='', harvester_queue_config=None, is_unified_queue=False, pilot_version='unknown', python_version='unknown', **kwarg): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='make_a_jdl') # Note: In workspec, unit of minRamCount and of maxDiskCount are both MB. # In HTCondor SDF, unit of request_memory is MB, and request_disk is KB. n_core_total = workspec.nCore if workspec.nCore else n_core_per_node request_ram = max(workspec.minRamCount, 1 * n_core_total) if workspec.minRamCount else 1 * n_core_total request_disk = workspec.maxDiskCount * 1024 if workspec.maxDiskCount else 1 request_walltime = workspec.maxWalltime if workspec.maxWalltime else 0 io_intensity = workspec.ioIntensity if workspec.ioIntensity else 0 ce_info_dict = ce_info_dict.copy() batch_log_dict = batch_log_dict.copy() # possible override by AGIS special_par if special_par: special_par_attr_list = ['queue', 'maxWallTime', 'xcount', ] _match_special_par_dict = { attr: re.search('\({attr}=([^)]+)\)'.format(attr=attr), special_par) \ for attr in special_par_attr_list } for attr, _match in _match_special_par_dict.items(): if not _match: continue elif attr == 'queue': ce_info_dict['ce_queue_name'] = str(_match.group(1)) elif attr == 'maxWallTime': request_walltime = int(_match.group(1)) elif attr == 'xcount': n_core_total = int(_match.group(1)) tmpLog.debug('job attributes override by AGIS special_par: {0}={1}'.format(attr, str(_match.group(1)))) # derived job attributes n_node = _div_round_up(n_core_total, n_core_per_node) request_ram_per_core = _div_round_up(request_ram * n_node, n_core_total) request_cputime = request_walltime * n_core_total request_walltime_minute = _div_round_up(request_walltime, 60) request_cputime_minute = _div_round_up(request_cputime, 60) # decide prodSourceLabel pilot_opt_dict = submitter_common.get_complicated_pilot_options(workspec.pilotType, pilot_url=pilot_url) if pilot_opt_dict is None: prod_source_label = harvester_queue_config.get_source_label(workspec.jobType) pilot_type_opt = workspec.pilotType pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else '' else: prod_source_label = pilot_opt_dict['prod_source_label'] pilot_type_opt = pilot_opt_dict['pilot_type_opt'] pilot_url_str = pilot_opt_dict['pilot_url_str'] # open tmpfile as submit description file tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sdf', dir=workspec.get_access_point()) # placeholder map placeholder_map = { 'sdfPath': tmpFile.name, 'executableFile': executable_file, 'nCorePerNode': n_core_per_node, 'nCoreTotal': n_core_total, 'nNode': n_node, 'requestRam': request_ram, 'requestRamPerCore': request_ram_per_core, 'requestDisk': request_disk, 'requestWalltime': request_walltime, 'requestWalltimeMinute': request_walltime_minute, 'requestCputime': request_cputime, 'requestCputimeMinute': request_cputime_minute, 'accessPoint': workspec.accessPoint, 'harvesterID': harvester_config.master.harvester_id, 'workerID': workspec.workerID, 'computingSite': workspec.computingSite, 'pandaQueueName': panda_queue_name, 'x509UserProxy': x509_user_proxy, 'ceEndpoint': ce_info_dict.get('ce_endpoint', ''), 'ceHostname': ce_info_dict.get('ce_hostname', ''), 'ceFlavour': ce_info_dict.get('ce_flavour', ''), 'ceJobmanager': ce_info_dict.get('ce_jobmanager', 
''), 'ceQueueName': ce_info_dict.get('ce_queue_name', ''), 'ceVersion': ce_info_dict.get('ce_version', ''), 'logDir': log_dir, 'logSubdir': log_subdir, 'gtag': batch_log_dict.get('gtag', 'fake_GTAG_string'), 'prodSourceLabel': prod_source_label, 'jobType': workspec.jobType, 'resourceType': _get_resource_type(workspec.resourceType, is_unified_queue), 'pilotResourceTypeOption': _get_resource_type(workspec.resourceType, is_unified_queue, True), 'ioIntensity': io_intensity, 'pilotType': pilot_type_opt, 'pilotUrlOption': pilot_url_str, 'pilotVersion': pilot_version, 'pilotPythonOption': submitter_common.get_python_version_option(python_version, prod_source_label), 'submissionHost': workspec.submissionHost, 'submissionHostShort': workspec.submissionHost.split('.')[0], } # fill in template string jdl_str = template.format(**placeholder_map) # save jdl to submit description file tmpFile.write(jdl_str) tmpFile.close() tmpLog.debug('saved sdf at {0}'.format(tmpFile.name)) tmpLog.debug('done') return jdl_str, placeholder_map
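make_a_jdl fills the SDF template by calling template.format(**placeholder_map). A tiny self-contained illustration with a cut-down template that uses only a few of the placeholders defined above and made-up values; the real templates carry many more lines:

# Minimal illustration of the template filling done in make_a_jdl.
sdf_template = (
    'executable = {executableFile}\n'
    'request_cpus = {nCoreTotal}\n'
    'request_memory = {requestRam}\n'
    '+harvesterID = "{harvesterID}"\n'
    '+workerID = "{workerID}"\n'
    'queue 1\n'
)
placeholder_map = {
    'executableFile': '/usr/local/bin/runpilot-wrapper.sh',  # illustrative path
    'nCoreTotal': 8,
    'requestRam': 16000,
    'harvesterID': 'harvester_example',
    'workerID': 123456,
}
print(sdf_template.format(**placeholder_map))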
def _handle_one_worker(workspec): # make logger tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format( workspec.workerID), method_name='_handle_one_worker') # get default information from queue info to_submit = True n_core_per_node_from_queue = this_panda_queue_dict.get( 'corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1 is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \ or this_panda_queue_dict.get('capability', '') == 'ucore' ce_info_dict = dict() batch_log_dict = dict() special_par = '' data = { 'workspec': workspec, 'to_submit': to_submit, } if self.useAtlasGridCE: # If ATLAS Grid CE mode used tmpLog.debug('Using ATLAS Grid CE mode...') queues_from_queue_list = this_panda_queue_dict.get( 'queues', []) special_par = this_panda_queue_dict.get('special_par', '') ce_auxilary_dict = {} for _queue_dict in queues_from_queue_list: if not (_queue_dict.get('ce_endpoint') and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE' and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])): continue ce_endpoint = _queue_dict.get('ce_endpoint') if (ce_endpoint in ce_auxilary_dict and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'): pass else: ce_auxilary_dict[ce_endpoint] = _queue_dict # qualified CEs from AGIS info n_qualified_ce = len(ce_auxilary_dict) queue_status_dict = self.dbInterface.get_queue_status( self.queueName) worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats( self.queueName) ce_weight_dict = _get_ce_weight_dict( ce_endpoint_list=list(ce_auxilary_dict.keys()), queue_status_dict=queue_status_dict, worker_ce_stats_dict=worker_ce_stats_dict) # good CEs which can be submitted to, duplicate by weight good_ce_weighted_list = [] for _ce_endpoint in ce_auxilary_dict.keys(): good_ce_weighted_list.extend( [_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0)) tmpLog.debug( 'queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}' .format(queue_status_dict, worker_ce_stats_dict, ce_weight_dict)) try: if len(good_ce_weighted_list) > 0: ce_info_dict = ce_auxilary_dict[random.choice( good_ce_weighted_list)].copy() else: tmpLog.info( 'No good CE endpoint left. Choose an arbitrary CE endpoint' ) ce_info_dict = random.choice( list(ce_auxilary_dict.values())).copy() except IndexError: tmpLog.error('No valid CE endpoint found') ce_info_dict = {} to_submit = False else: ce_endpoint_from_queue = ce_info_dict.get( 'ce_endpoint', '') ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower() ce_version_str = str(ce_info_dict.get('ce_version', '')).lower() ce_info_dict['ce_hostname'] = re.sub( ':\w*', '', ce_endpoint_from_queue) tmpLog.debug( 'For site {0} got CE endpoint: "{1}", flavour: "{2}"'. 
format(self.queueName, ce_endpoint_from_queue, ce_flavour_str)) if os.path.isdir(self.CEtemplateDir) and ce_flavour_str: sdf_template_filename = '{ce_flavour_str}.sdf'.format( ce_flavour_str=ce_flavour_str) self.templateFile = os.path.join( self.CEtemplateDir, sdf_template_filename) else: try: # Manually define site condor schedd as ceHostname and central manager as ceEndpoint if self.ceHostname and isinstance( self.ceHostname, list) and len(self.ceHostname) > 0: if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0: ce_info_dict['ce_hostname'], ce_info_dict[ 'ce_endpoint'] = random.choice( list(zip(self.ceHostname, self.ceEndpoint))) else: ce_info_dict['ce_hostname'] = random.choice( self.ceHostname) ce_info_dict['ce_endpoint'] = self.ceEndpoint else: ce_info_dict['ce_hostname'] = self.ceHostname ce_info_dict['ce_endpoint'] = self.ceEndpoint except AttributeError: pass # Choose from Condor schedd and central managers if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0: if isinstance(self.condorPool, list) and len(self.condorPool) > 0: condor_schedd, condor_pool = random.choice( list(zip(self.condorSchedd, self.condorPool))) else: condor_schedd = random.choice(self.condorSchedd) condor_pool = self.condorPool else: condor_schedd = self.condorSchedd condor_pool = self.condorPool # Log Base URL if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL: schedd_hostname = re.sub( r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?', lambda matchobj: matchobj.group(1) if matchobj.group(1) else '', condor_schedd) log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL) else: log_base_url = self.logBaseURL # template for batch script try: tmpFile = open(self.templateFile) sdf_template = tmpFile.read() tmpFile.close() except AttributeError: tmpLog.error( 'No valid templateFile found. 
Maybe templateFile, CEtemplateDir invalid, or no valid CE found' ) to_submit = False else: # get batch_log, stdout, stderr filename for _line in sdf_template.split('\n'): if _line.startswith('#'): continue _match_batch_log = re.match('log = (.+)', _line) _match_stdout = re.match('output = (.+)', _line) _match_stderr = re.match('error = (.+)', _line) if _match_batch_log: batch_log_value = _match_batch_log.group(1) continue if _match_stdout: stdout_value = _match_stdout.group(1) continue if _match_stderr: stderr_value = _match_stderr.group(1) continue # get override requirements from queue configured try: n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue except AttributeError: n_core_per_node = n_core_per_node_from_queue # URLs for log files if not (log_base_url is None): if workspec.batchID: batchID = workspec.batchID guess = False else: batchID = '' guess = True batch_log_filename = parse_batch_job_filename( value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) stdout_path_file_name = parse_batch_job_filename( value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) stderr_path_filename = parse_batch_job_filename( value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess) batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename) batch_stdout = '{0}/{1}/{2}'.format( log_base_url, log_subdir, stdout_path_file_name) batch_stderr = '{0}/{1}/{2}'.format( log_base_url, log_subdir, stderr_path_filename) workspec.set_log_file('batch_log', batch_log) workspec.set_log_file('stdout', batch_stdout) workspec.set_log_file('stderr', batch_stderr) batch_log_dict['batch_log'] = batch_log batch_log_dict['batch_stdout'] = batch_stdout batch_log_dict['batch_stderr'] = batch_stderr batch_log_dict['gtag'] = workspec.workAttributes['stdOut'] tmpLog.debug('Done set_log_file before submission') tmpLog.debug('Done jobspec attribute setting') # set data dict data.update({ 'workspec': workspec, 'to_submit': to_submit, 'template': sdf_template, 'log_dir': self.logDir, 'log_subdir': log_subdir, 'n_core_per_node': n_core_per_node, 'panda_queue_name': panda_queue_name, 'x509_user_proxy': self.x509UserProxy, 'ce_info_dict': ce_info_dict, 'batch_log_dict': batch_log_dict, 'special_par': special_par, 'harvester_queue_config': harvester_queue_config, 'is_unified_queue': is_unified_queue, 'condor_schedd': condor_schedd, 'condor_pool': condor_pool, 'use_spool': self.useSpool, }) return data
def __init__(self, **kwarg): PluginBase.__init__(self, **kwarg) tmpLog = core_utils.make_logger(baseLogger, method_name='__init__') tmpLog.info("[{0}] SAGA adaptor will be used".format(self.adaptor))
end_job_id = int(sys.argv[3]) queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue(queueName) initial_queueConfig_stager = queueConfig.stager queueConfig.stager['module'] = 'pandaharvester.harvesterstager.go_bulk_stager' queueConfig.stager['name'] = 'GlobusBulkStager' modified_queueConfig_stager = queueConfig.stager pluginFactory = PluginFactory() # get stage-out plugin stagerCore = pluginFactory.get_plugin(queueConfig.stager) # logger _logger = core_utils.setup_logger('stageOutTest_go_bulk_stager') tmpLog = core_utils.make_logger(_logger, method_name='stageOutTest_go_bulk_stager') tmpLog.debug('start') for loggerName, loggerObj in logging.Logger.manager.loggerDict.items(): # print("loggerName - {}".format(loggerName)) if loggerName.startswith('panda.log'): if len(loggerObj.handlers) == 0: continue if loggerName.split('.')[-1] in ['db_proxy']: continue stdoutHandler = logging.StreamHandler(sys.stdout) stdoutHandler.setFormatter(loggerObj.handlers[0].formatter) loggerObj.addHandler(stdoutHandler) msgStr = "plugin={0}".format(stagerCore.__class__.__name__) tmpLog.debug(msgStr)
def define_num_workers(self, static_num_workers, site_name): tmpLog = core_utils.make_logger(_logger, 'site={0}'.format(site_name), method_name='define_num_workers') tmpLog.debug('start') dyn_num_workers = copy.copy(static_num_workers) try: # get queue status queueStat = self.dbProxy.get_cache("panda_queues.json", None) if queueStat is None: queueStat = dict() else: queueStat = queueStat.data # define num of new workers for queueName, tmpVal in iteritems(static_num_workers): # set 0 to num of new workers when the queue is disabled if queueName in queueStat and queueStat[queueName][ 'status'] in ['offline']: dyn_num_workers[queueName]['nNewWorkers'] = 0 retMsg = 'set nNewWorkers=0 since status={0}'.format( queueStat[queueName]['status']) tmpLog.debug(retMsg) continue # get queue queueConfig = self.queueConfigMapper.get_queue(queueName) # get throttler if queueName not in self.throttlerMap: if hasattr(queueConfig, 'throttler'): throttler = self.pluginFactory.get_plugin( queueConfig.throttler) else: throttler = None self.throttlerMap[queueName] = throttler # check throttler throttler = self.throttlerMap[queueName] if throttler is not None: toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig) if toThrottle: dyn_num_workers[queueName]['nNewWorkers'] = 0 retMsg = 'set nNewWorkers=0 by {0}:{1}'.format( throttler.__class__.__name__, tmpMsg) tmpLog.debug(retMsg) continue # check stats nQueue = tmpVal['nQueue'] nReady = tmpVal['nReady'] nRunning = tmpVal['nRunning'] nQueueLimit = queueConfig.nQueueLimitWorker maxWorkers = queueConfig.maxWorkers if queueConfig.runMode == 'slave': nNewWorkersDef = tmpVal['nNewWorkers'] if nNewWorkersDef == 0: dyn_num_workers[queueName]['nNewWorkers'] = 0 retMsg = 'set nNewWorkers=0 by panda in slave mode' tmpLog.debug(retMsg) continue else: nNewWorkersDef = None # define num of new workers based on static site config nNewWorkers = 0 if nQueueLimit > 0 and nQueue >= nQueueLimit: # enough queued workers retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimit({1})'.format( nQueue, nQueueLimit) tmpLog.debug(retMsg) pass elif maxWorkers > 0 and (nQueue + nReady + nRunning) >= maxWorkers: # enough workers in the system retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format( nQueue, nReady, nRunning) retMsg += '>= maxWorkers({0})'.format(maxWorkers) tmpLog.debug(retMsg) pass else: # get max number of queued workers maxQueuedWorkers = 0 if nQueueLimit > 0: maxQueuedWorkers = nQueueLimit if maxQueuedWorkers == 0: if nNewWorkersDef is not None: # slave mode maxQueuedWorkers = nNewWorkersDef + nQueue else: # use default value maxQueuedWorkers = 1 # new workers nNewWorkers = max(maxQueuedWorkers - nQueue, 0) if maxWorkers > 0: nNewWorkers = min( nNewWorkers, max(maxWorkers - nQueue - nReady - nRunning, 0)) if queueConfig.maxNewWorkersPerCycle > 0: nNewWorkers = min(nNewWorkers, queueConfig.maxNewWorkersPerCycle) dyn_num_workers[queueName]['nNewWorkers'] = nNewWorkers # dump tmpLog.debug('defined {0}'.format(str(dyn_num_workers))) return dyn_num_workers except: # dump error errMsg = core_utils.dump_error_message(tmpLog) return None
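For the normal (non-slave, non-throttled) case the sizing logic above reduces to a few max/min operations. A standalone restatement with example numbers, which may make the bounds easier to follow; the values are illustrative only:

def n_new_workers_sketch(n_queue, n_ready, n_running,
                         n_queue_limit, max_workers, max_new_per_cycle):
    # Simplified restatement of define_num_workers for the normal case.
    if n_queue_limit > 0 and n_queue >= n_queue_limit:
        return 0  # enough queued workers
    if max_workers > 0 and (n_queue + n_ready + n_running) >= max_workers:
        return 0  # enough workers in the system
    max_queued_workers = n_queue_limit if n_queue_limit > 0 else 1
    n_new = max(max_queued_workers - n_queue, 0)
    if max_workers > 0:
        n_new = min(n_new, max(max_workers - n_queue - n_ready - n_running, 0))
    if max_new_per_cycle > 0:
        n_new = min(n_new, max_new_per_cycle)
    return n_new

# e.g. 3 queued, 1 ready, 40 running workers with nQueueLimitWorker=10,
# maxWorkers=100 and maxNewWorkersPerCycle=20 -> 7 new workers
print(n_new_workers_sketch(3, 1, 40, n_queue_limit=10, max_workers=100, max_new_per_cycle=20))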
def check_status(self, jobspec): # make logger tmpLog = core_utils.make_logger(_logger, 'PandaID={0}'.format(jobspec.PandaID), method_name='check_status') tmpLog.debug('start') # default return tmpRetVal = (True, '') # check that jobspec.computingSite is defined if jobspec.computingSite is None: # not found tmpLog.error('jobspec.computingSite is not defined') return False, 'jobspec.computingSite is not defined' else: tmpLog.debug('jobspec.computingSite : {0}'.format( jobspec.computingSite)) # test we have a Globus Transfer Client if not self.tc: errStr = 'failed to get Globus Transfer Client' tmpLog.error(errStr) return False, errStr # set transferID to None transferID = None # get transfer groups groups = jobspec.get_groups_of_output_files() tmpLog.debug( 'jobspec.get_groups_of_output_files() = : {0}'.format(groups)) # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests if self.dummy_transfer_id in groups: # lock for 120 sec if not self.have_db_lock: tmpLog.debug( 'attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}' .format(self.id, self.dummy_transfer_id)) self.have_db_lock = self.dbInterface.get_object_lock( self.dummy_transfer_id, lock_interval=120) if not self.have_db_lock: # escape since locked by another thread msgStr = 'escape since locked by another thread' tmpLog.debug(msgStr) return None, msgStr # refresh group information since that could have been updated by another thread before getting the lock self.dbInterface.refresh_file_group_info(jobspec) # get transfer groups again with refreshed info groups = jobspec.get_groups_of_output_files() # the dummy transfer ID is still there if self.dummy_transfer_id in groups: groupUpdateTime = groups[ self.dummy_transfer_id]['groupUpdateTime'] # get files with the dummy transfer ID across jobs fileSpecs = self.dbInterface.get_files_with_group_id( self.dummy_transfer_id) # submit transfer if there are more than 10 files or the group was made before more than 10 min msgStr = 'self.dummy_transfer_id = {0} number of files = {1}'.format( self.dummy_transfer_id, len(fileSpecs)) tmpLog.debug(msgStr) if len(fileSpecs) >= 10 or \ groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10): tmpLog.debug('prepare to transfer files') # submit transfer and get a real transfer ID # set the Globus destination Endpoint id and path will get them from Agis eventually from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper queueConfigMapper = QueueConfigMapper() queueConfig = queueConfigMapper.get_queue( jobspec.computingSite) #self.Globus_srcPath = queueConfig.stager['Globus_srcPath'] self.srcEndpoint = queueConfig.stager['srcEndpoint'] self.Globus_srcPath = self.basePath self.Globus_dstPath = queueConfig.stager['Globus_dstPath'] self.dstEndpoint = queueConfig.stager['dstEndpoint'] # Test the endpoints and create the transfer data class errMsg = None try: # Test endpoints for activation tmpStatsrc, srcStr = globus_utils.check_endpoint_activation( tmpLog, self.tc, self.srcEndpoint) tmpStatdst, dstStr = globus_utils.check_endpoint_activation( tmpLog, self.tc, self.dstEndpoint) if tmpStatsrc and tmpStatdst: errStr = 'source Endpoint and destination Endpoint activated' tmpLog.debug(errStr) else: errMsg = '' if not tmpStatsrc: errMsg += ' source Endpoint not activated ' if not tmpStatdst: errMsg += ' destination Endpoint not activated ' # release process lock tmpLog.debug( 'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}' .format(self.id, 
self.dummy_transfer_id)) self.have_db_lock = self.dbInterface.release_object_lock( self.dummy_transfer_id) if not self.have_db_lock: errMsg += ' - Could not release DB lock for {}'.format( self.dummy_transfer_id) tmpLog.error(errMsg) tmpRetVal = (None, errMsg) return tmpRetVal # both endpoints activated now prepare to transfer data tdata = TransferData(self.tc, self.srcEndpoint, self.dstEndpoint, sync_level="checksum") except: errStat, errMsg = globus_utils.handle_globus_exception( tmpLog) # release process lock tmpLog.debug( 'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}' .format(self.id, self.dummy_transfer_id)) self.have_db_lock = self.dbInterface.release_object_lock( self.dummy_transfer_id) if not self.have_db_lock: errMsg += ' - Could not release DB lock for {}'.format( self.dummy_transfer_id) tmpLog.error(errMsg) tmpRetVal = (errStat, errMsg) return tmpRetVal # loop over all files for fileSpec in fileSpecs: attrs = jobspec.get_output_file_attributes() msgStr = "len(jobSpec.get_output_file_attributes()) = {0} type - {1}".format( len(attrs), type(attrs)) tmpLog.debug(msgStr) for key, value in attrs.iteritems(): msgStr = "output file attributes - {0} {1}".format( key, value) tmpLog.debug(msgStr) msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format( fileSpec.lfn, fileSpec.scope) tmpLog.debug(msgStr) scope = fileSpec.scope hash = hashlib.md5() hash.update('%s:%s' % (scope, fileSpec.lfn)) hash_hex = hash.hexdigest() correctedscope = "/".join(scope.split('.')) srcURL = fileSpec.path dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format( endPoint=self.Globus_dstPath, scope=correctedscope, hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=fileSpec.lfn) tmpLog.debug('src={srcURL} dst={dstURL}'.format( srcURL=srcURL, dstURL=dstURL)) # add files to transfer object - tdata if os.access(srcURL, os.R_OK): tmpLog.debug("tdata.add_item({},{})".format( srcURL, dstURL)) tdata.add_item(srcURL, dstURL) else: errMsg = "source file {} does not exist".format( srcURL) # release process lock tmpLog.debug( 'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}' .format(self.id, self.dummy_transfer_id)) self.have_db_lock = self.dbInterface.release_object_lock( self.dummy_transfer_id) if not self.have_db_lock: errMsg += ' - Could not release DB lock for {}'.format( self.dummy_transfer_id) tmpLog.error(errMsg) tmpRetVal = (False, errMsg) return tmpRetVal # submit transfer try: transfer_result = self.tc.submit_transfer(tdata) # check status code and message tmpLog.debug(str(transfer_result)) if transfer_result['code'] == "Accepted": # succeeded # set transfer ID which are used for later lookup transferID = transfer_result['task_id'] tmpLog.debug( 'successfully submitted id={0}'.format( transferID)) # set status for files self.dbInterface.set_file_group( fileSpecs, transferID, 'running') msgStr = 'submitted transfer with ID={0}'.format( transferID) tmpLog.debug(msgStr) else: # release process lock tmpLog.debug( 'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}' .format(self.id, self.dummy_transfer_id)) self.have_db_lock = self.dbInterface.release_object_lock( self.dummy_transfer_id) if not self.have_db_lock: errMsg = 'Could not release DB lock for {}'.format( self.dummy_transfer_id) tmpLog.error(errMsg) tmpRetVal = (None, transfer_result['message']) return tmpRetVal except Exception as e: errStat, errMsg = globus_utils.handle_globus_exception( tmpLog) # release process lock tmpLog.debug( 'attempt to release DB lock for self.id - {0} 
self.dummy_transfer_id - {1}' .format(self.id, self.dummy_transfer_id)) self.have_db_lock = self.dbInterface.release_object_lock( self.dummy_transfer_id) if not self.have_db_lock: errMsg += ' - Could not release DB lock for {}'.format( self.dummy_transfer_id) tmpLog.error(errMsg) return errStat, errMsg else: msgStr = 'wait until enough files are pooled' tmpLog.debug(msgStr) # release the lock tmpLog.debug( 'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}' .format(self.id, self.dummy_transfer_id)) self.have_db_lock = self.dbInterface.release_object_lock( self.dummy_transfer_id) if not self.have_db_lock: msgStr += ' - Could not release DB lock for {}'.format( self.dummy_transfer_id) tmpLog.error(msgStr) # return None to retry later return None, msgStr # check transfer with real transfer IDs # get transfer groups groups = jobspec.get_groups_of_output_files() for transferID in groups: if transferID != self.dummy_transfer_id: # get transfer task tmpStat, transferTasks = globus_utils.get_transfer_task_by_id( tmpLog, self.tc, transferID) # return a temporary error when failed to get task if not tmpStat: errStr = 'failed to get transfer task' tmpLog.error(errStr) return None, errStr # return a temporary error when task is missing if transferID not in transferTasks: errStr = 'transfer task ID - {} is missing'.format( transferID) tmpLog.error(errStr) return None, errStr # succeeded in finding a transfer task by tranferID if transferTasks[transferID]['status'] == 'SUCCEEDED': tmpLog.debug( 'transfer task {} succeeded'.format(transferID)) self.set_FileSpec_status(jobspec, 'finished') return True, '' # failed if transferTasks[transferID]['status'] == 'FAILED': errStr = 'transfer task {} failed'.format(transferID) tmpLog.error(errStr) self.set_FileSpec_status(jobspec, 'failed') return False, errStr # another status tmpStr = 'transfer task {0} status: {1}'.format( transferID, transferTasks[transferID]['status']) tmpLog.debug(tmpStr) return None, ''
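The destination URL built in check_status follows an md5-based directory layout: the first two and next two hex digits of md5("<scope>:<lfn>") become directory levels, and the scope's dots become path components. A small stand-alone sketch of just that path construction, with made-up endpoint, scope and LFN values and an .encode() added for Python 3, where hashlib wants bytes:

import hashlib

def md5_layout_dst(dst_base, scope, lfn):
    hash_hex = hashlib.md5('{0}:{1}'.format(scope, lfn).encode()).hexdigest()
    corrected_scope = '/'.join(scope.split('.'))
    return '{endPoint}/{scope}/{hash1}/{hash2}/{lfn}'.format(
        endPoint=dst_base, scope=corrected_scope,
        hash1=hash_hex[0:2], hash2=hash_hex[2:4], lfn=lfn)

# made-up example values
print(md5_layout_dst('/atlas/rucio', 'mc16_13TeV', 'EVNT.12345._000001.pool.root.1'))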
def submit_workers(self, workspec_list):
    retList = []
    for workSpec in workspec_list:
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                        method_name='submit_workers')

        queueconfigmapper = QueueConfigMapper()
        queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
        prodSourceLabel = queueconfig.get_source_label()

        # If jobSpec is defined we are in push mode, if not pull mode
        # Both assume one to one worker to job mapping
        jobSpec = workSpec.get_jobspec_list()
        if jobSpec:
            jobSpec = jobSpec[0]
            tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))

        desc = {}
        # If we need to prefetch events, set aCT status waiting.
        # feed_events in act_messenger will fill events and release the job
        if queueconfig.prefetchEvents:
            desc['pandastatus'] = 'waiting'
            desc['actpandastatus'] = 'waiting'
            desc['arcjobid'] = -1  # dummy id to prevent submission
        else:
            desc['pandastatus'] = 'sent'
            desc['actpandastatus'] = 'sent'
        desc['siteName'] = workSpec.computingSite
        desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel == 'user' else 'production']
        desc['sendhb'] = 0
        metadata = {'harvesteraccesspoint': workSpec.get_access_point(),
                    'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)}
        desc['metadata'] = json.dumps(metadata)

        if jobSpec:
            # push mode: aCT takes the url-encoded job description (like it gets from panda server)
            pandaid = jobSpec.PandaID
            actjobdesc = urllib.urlencode(jobSpec.jobParams)
        else:
            # pull mode: just set pandaid (to workerid) and prodsourcelabel
            pandaid = workSpec.workerID
            actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, prodSourceLabel)

        tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
        try:
            batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()']
        except Exception as e:
            result = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
        else:
            tmpLog.info("aCT batch id {0}".format(batchid))
            workSpec.batchID = str(batchid)
            workSpec.submissionHost = self.hostname
            workSpec.nativeStatus = desc['actpandastatus']

            # Set log files in workSpec
            today = time.strftime('%Y-%m-%d', time.gmtime())
            logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today,
                               workSpec.computingSite, str(pandaid)])
            workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
            workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
            workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
            workSpec.set_log_file('jdl', '{0}.jdl'.format(logurl))
            result = (True, '')

        retList.append(result)
    return retList
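A minimal sketch of the two aCT job-description flavours assembled above (push vs. pull), detached from the workspec and aCT DB objects. It uses urllib.parse from Python 3; the IDs, parameters and access point below are invented examples.

import json
from urllib.parse import urlencode

def act_job_description(job_params=None, worker_id=None, prod_source_label='managed'):
    if job_params is not None:
        # push mode: url-encode the full PanDA job parameters
        return urlencode(job_params)
    # pull mode: only a placeholder PandaID (the worker ID) and the source label
    return 'PandaID=%d&prodSourceLabel=%s' % (worker_id, prod_source_label)

metadata = json.dumps({'harvesteraccesspoint': '/var/harvester/access/1234',
                       'schedulerid': 'harvester-example'})
print(act_job_description(worker_id=42, prod_source_label='managed'))
print(metadata)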
def run(self): lockedBy = 'submitter-{0}'.format(self.ident) while True: mainLog = core_utils.make_logger(_logger, 'id={0}'.format(lockedBy), method_name='run') mainLog.debug('getting queues to submit workers') # get queues associated to a site to submit workers curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit( harvester_config.submitter.nQueues, harvester_config.submitter.lookupTime, harvester_config.submitter.lockInterval) if siteName is not None: mainLog.debug('got {0} queues for site {1}'.format( len(curWorkers), siteName)) # get commands comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers, siteName) commandSpecs = self.dbProxy.get_commands_for_receiver( 'submitter', comStr) mainLog.debug('got {0} commands'.format(len(commandSpecs))) for commandSpec in commandSpecs: newLimits = self.dbProxy.set_queue_limit( siteName, commandSpec.params) for tmpResource, tmpNewVal in iteritems(newLimits): # if available, overwrite new worker value with the command from panda server if tmpResource in resMap: tmpQueueName = resMap[tmpResource] if tmpQueueName in curWorkers: curWorkers[tmpQueueName][tmpResource][ 'nNewWorkers'] = tmpNewVal # define number of new workers if len(curWorkers) == 0: n_workers_per_queue_and_rt = dict() else: n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers( curWorkers, siteName) if n_workers_per_queue_and_rt is None: mainLog.error( 'WorkerAdjuster failed to define the number of workers' ) elif len(n_workers_per_queue_and_rt) == 0: pass else: # loop over all queues and resource types for queueName in n_workers_per_queue_and_rt: for resource_type, tmpVal in iteritems( n_workers_per_queue_and_rt[queueName]): tmpLog = core_utils.make_logger( _logger, 'id={0} queue={1} resource_type={2}'.format( lockedBy, queueName, resource_type), method_name='run') tmpLog.debug('start') nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady'] nReady = tmpVal['nReady'] # check queue if not self.queueConfigMapper.has_queue(queueName): tmpLog.error('config not found') continue # no new workers if nWorkers == 0: tmpLog.debug( 'skipped since no new worker is needed based on current stats' ) continue # get queue queueConfig = self.queueConfigMapper.get_queue( queueName) # actions based on mapping type if queueConfig.mapType == WorkSpec.MT_NoJob: # workers without jobs jobChunks = [] for i in range(nWorkers): jobChunks.append([]) elif queueConfig.mapType == WorkSpec.MT_OneToOne: # one worker per one job jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, 1, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy) elif queueConfig.mapType == WorkSpec.MT_MultiJobs: # one worker for multiple jobs nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker( queueConfig, nWorkers, resource_type) tmpLog.debug('nJobsPerWorker={0}'.format( nJobsPerWorker)) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, nJobsPerWorker, None, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, harvester_config.submitter.lockInterval, lockedBy, queueConfig.allowJobMixture) elif queueConfig.mapType == WorkSpec.MT_MultiWorkers: # multiple workers for one job nWorkersPerJob = self.workerMaker.get_num_workers_per_job( queueConfig, nWorkers, resource_type) jobChunks = self.dbProxy.get_job_chunks_for_workers( queueName, nWorkers, nReady, None, nWorkersPerJob, queueConfig.useJobLateBinding, harvester_config.submitter.checkInterval, 
harvester_config.submitter.lockInterval, lockedBy) else: tmpLog.error('unknown mapType={0}'.format( queueConfig.mapType)) continue tmpLog.debug('got {0} job chunks'.format( len(jobChunks))) if len(jobChunks) == 0: continue # make workers okChunks, ngChunks = self.workerMaker.make_workers( jobChunks, queueConfig, nReady, resource_type) if len(ngChunks) == 0: tmpLog.debug( 'successfully made {0} workers'.format( len(okChunks))) else: tmpLog.debug( 'made {0} workers, while {1} workers failed' .format(len(okChunks), len(ngChunks))) timeNow = datetime.datetime.utcnow() # NG (=not good) for ngJobs in ngChunks: for jobSpec in ngJobs: jobSpec.status = 'failed' jobSpec.subStatus = 'failedtomake' jobSpec.stateChangeTime = timeNow jobSpec.lockedBy = None jobSpec.trigger_propagation() self.dbProxy.update_job( jobSpec, { 'lockedBy': lockedBy, 'subStatus': 'prepared' }) # OK pandaIDs = set() workSpecList = [] if len(okChunks) > 0: for workSpec, okJobs in okChunks: # has job if (queueConfig.useJobLateBinding and workSpec.workerID is None) \ or queueConfig.mapType == WorkSpec.MT_NoJob: workSpec.hasJob = 0 else: workSpec.hasJob = 1 if workSpec.nJobsToReFill in [None, 0]: workSpec.set_jobspec_list(okJobs) else: # refill free slots during the worker is running workSpec.set_jobspec_list( okJobs[:workSpec. nJobsToReFill]) workSpec.nJobsToReFill = None for jobSpec in okJobs[ workSpec.nJobsToReFill:]: pandaIDs.add(jobSpec.PandaID) # map type workSpec.mapType = queueConfig.mapType # queue name workSpec.computingSite = queueConfig.queueName # set access point workSpec.accessPoint = queueConfig.messenger[ 'accessPoint'] # events if len(okJobs) > 0 and ( 'eventService' in okJobs[0].jobParams or 'cloneJob' in okJobs[0].jobParams): workSpec.eventsRequest = WorkSpec.EV_useEvents workSpecList.append(workSpec) if len(workSpecList) > 0: # get plugin for submitter submitterCore = self.pluginFactory.get_plugin( queueConfig.submitter) if submitterCore is None: # not found tmpLog.error( 'submitter plugin for {0} not found'. format(jobSpec.computingSite)) continue # get plugin for messenger messenger = self.pluginFactory.get_plugin( queueConfig.messenger) if messenger is None: # not found tmpLog.error( 'messenger plugin for {0} not found'. 
format(jobSpec.computingSite)) continue # setup access points messenger.setup_access_points(workSpecList) # feed jobs for workSpec in workSpecList: if workSpec.hasJob == 1: tmpStat = messenger.feed_jobs( workSpec, workSpec.get_jobspec_list()) if tmpStat is False: tmpLog.error( 'failed to send jobs to workerID={0}' .format(workSpec.workerID)) else: tmpLog.debug( 'sent jobs to workerID={0} with {1}' .format( workSpec.workerID, tmpStat)) # insert workers self.dbProxy.insert_workers( workSpecList, lockedBy) # submit tmpLog.info('submitting {0} workers'.format( len(workSpecList))) workSpecList, tmpRetList, tmpStrList = self.submit_workers( submitterCore, workSpecList) for iWorker, (tmpRet, tmpStr) in enumerate( zip(tmpRetList, tmpStrList)): workSpec, jobList = okChunks[iWorker] # use associated job list since it can be truncated for re-filling jobList = workSpec.get_jobspec_list() # set status if not tmpRet: # failed submission tmpLog.error( 'failed to submit a workerID={0} with {1}' .format(workSpec.workerID, tmpStr)) workSpec.set_status(WorkSpec.ST_missed) workSpec.set_dialog_message(tmpStr) jobList = [] elif queueConfig.useJobLateBinding and workSpec.hasJob == 1: # directly go to running after feeding jobs for late biding workSpec.set_status( WorkSpec.ST_running) else: # normal successful submission workSpec.set_status( WorkSpec.ST_submitted) workSpec.submitTime = timeNow workSpec.modificationTime = timeNow # prefetch events if tmpRet and workSpec.hasJob == 1 and workSpec.eventsRequest == WorkSpec.EV_useEvents: workSpec.eventsRequest = WorkSpec.EV_requestEvents eventsRequestParams = dict() for jobSpec in jobList: eventsRequestParams[ jobSpec.PandaID] = { 'pandaID': jobSpec.PandaID, 'taskID': jobSpec.taskID, 'jobsetID': jobSpec. jobParams['jobsetID'], 'nRanges': jobSpec. jobParams['coreCount'], } workSpec.eventsRequestParams = eventsRequestParams # register worker tmpStat = self.dbProxy.register_worker( workSpec, jobList, lockedBy) if jobList is not None: for jobSpec in jobList: pandaIDs.add(jobSpec.PandaID) if tmpStat: tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}' tmpLog.info( tmpStr.format( workSpec.workerID, jobSpec.PandaID, workSpec.batchID)) else: tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}' tmpLog.error( tmpStr.format( jobSpec.PandaID, workSpec.batchID)) # release jobs self.dbProxy.release_jobs(pandaIDs, lockedBy) tmpLog.info('done') mainLog.debug('done') # define sleep interval if siteName is None: sleepTime = harvester_config.submitter.sleepTime else: sleepTime = 0 # check if being terminated if self.terminated(sleepTime): mainLog.debug('terminated') return
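For event-service jobs the loop above packs a per-job request into eventsRequestParams, keyed by PandaID with nRanges taken from the job's coreCount. A pure-data sketch of that structure, using invented job records instead of JobSpec objects:

def build_events_request_params(job_list):
    # one entry per PanDA job, mirroring the dict assembled in the run() loop
    params = {}
    for job in job_list:
        params[job['PandaID']] = {'pandaID': job['PandaID'],
                                  'taskID': job['taskID'],
                                  'jobsetID': job['jobParams']['jobsetID'],
                                  'nRanges': job['jobParams']['coreCount']}
    return params

# made-up example job record
jobs = [{'PandaID': 4321, 'taskID': 99, 'jobParams': {'jobsetID': 7, 'coreCount': 8}}]
print(build_events_request_params(jobs))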