Code example #1
    def post_processing(self, workspec, jobspec_list, map_type):
        '''
        Take the jobReport placed by aCT in the access point and fill metadata
        attributes of the workspec.
        '''

        # get logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='post_processing')
        if not workspec.workAttributes:
            workspec.workAttributes = {}

        for pandaID in workspec.pandaid_list:
            workspec.workAttributes[pandaID] = {}
            # look for job report
            accessPoint = self.get_access_point(workspec, pandaID)
            jsonFilePath = os.path.join(accessPoint, jsonJobReport)
            tmpLog.debug('looking for job report file {0}'.format(jsonFilePath))
            if not os.path.exists(jsonFilePath):
                # not found
                tmpLog.debug('not found')
            else:
                try:
                    with open(jsonFilePath) as jsonFile:
                        workspec.workAttributes[pandaID] = json.load(jsonFile)
                    tmpLog.debug('got {0} kB of job report'.format(os.stat(jsonFilePath).st_size / 1024))
                except Exception:
                    tmpLog.debug('failed to load {0}'.format(jsonFilePath))
            tmpLog.debug("pilot info for {0}: {1}".format(pandaID, workspec.workAttributes[pandaID]))
        return True
Code example #2
    def __init__(self, **kwarg):
        PluginBase.__init__(self, **kwarg)

        # Set up aCT DB connection
        self.log = core_utils.make_logger(baseLogger, 'aCT submitter', method_name='__init__')
        self.actDB = aCTDBPanda(self.log)
        # Credential dictionary role: proxy file
        self.certs = dict(zip([r.split('=')[1] for r in list(harvester_config.credmanager.voms)],
                              list(harvester_config.credmanager.outCertFile)))
        # Map of role to aCT proxyid
        self.proxymap = {}

        # Get proxy info
        # TODO: better to send aCT the proxy file and let it handle it
        for role, proxy in self.certs.items():
            cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)
            uc = arc.UserConfig(cred_type)
            uc.ProxyPath(str(proxy))
            cred = arc.Credential(uc)
            dn = cred.GetIdentityName()
            self.log.info("Proxy {0} with DN {1} and role {2}".format(proxy, dn, role))
    
            actp = aCTProxy(self.log)
            attr = '/atlas/Role='+role
            proxyid = actp.getProxyId(dn, attr)
            if not proxyid:
                raise Exception("Proxy with DN {0} and attribute {1} was not found in proxies table".format(dn, attr))

            self.proxymap[role] = proxyid
Code example #3
 def is_alive(self, workspec, time_limit):
     # get logger
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                     method_name='is_alive')
     # json file
     jsonFilePath = os.path.join(workspec.get_access_point(), heartbeatFile)
     tmpLog.debug('looking for heartbeat file {0}'.format(jsonFilePath))
     if not os.path.exists(jsonFilePath): # no heartbeat file was found
         tmpLog.debug('startTime: {0}, now: {1}'.format(workspec.startTime, datetime.datetime.utcnow()))
         if not workspec.startTime:
             # the worker didn't even have time to start
             tmpLog.debug('heartbeat not found, but no startTime yet for worker')
             return True
         elif datetime.datetime.utcnow() - workspec.startTime < datetime.timedelta(minutes=time_limit):
             # the worker is too young and maybe didn't have time to generate the heartbeat
             tmpLog.debug('heartbeat not found, but worker too young')
             return True
         else:
             # the worker is old and the heartbeat should be expected
             tmpLog.debug('not found')
             return None
     try:
         mtime = datetime.datetime.utcfromtimestamp(os.path.getmtime(jsonFilePath))
         tmpLog.debug('last modification time : {0}'.format(mtime))
         if datetime.datetime.utcnow() - mtime > datetime.timedelta(minutes=time_limit):
             tmpLog.debug('too old')
             return False
         tmpLog.debug('OK')
         return True
     except Exception:
         tmpLog.debug('failed to get mtime')
         return None
Code example #4
 def renew_session(self, retry=3, init=False):
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorClient.renew_session')
     # Clear security session if not initialization
     if not init:
         tmpLog.info('Renew condor session')
         self.secman.invalidateAllSessions()
     # Recreate collector and schedd object
     i_try = 1
     while i_try <= retry:
         try:
             tmpLog.info('Try {0}'.format(i_try))
             if self.condor_pool:
                 self.collector = htcondor.Collector(self.condor_pool)
             else:
                 self.collector = htcondor.Collector()
             if self.condor_schedd:
                 self.scheddAd = self.collector.locate(htcondor.DaemonTypes.Schedd, self.condor_schedd)
             else:
                 self.scheddAd = self.collector.locate(htcondor.DaemonTypes.Schedd)
             self.schedd = htcondor.Schedd(self.scheddAd)
             tmpLog.info('Success')
             break
         except Exception as e:
             tmpLog.warning('Recreate condor collector and schedd failed: {0}'.format(e))
             if i_try < retry:
                 tmpLog.warning('Failed. Retry...')
             else:
                 tmpLog.warning('Retry {0} times. Still failed. Skipped'.format(i_try))
             i_try += 1
             self.secman.invalidateAllSessions()
             time.sleep(3)
     # Sleep
     time.sleep(3)
Code example #5
 def submit_with_python(self, jdl_list, use_spool=False):
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobSubmit.submit_with_python')
     # Start
     tmpLog.debug('Start')
     # Initialize
     errStr = ''
     batchIDs_list = []
     # Make list of jdl map with dummy submit objects
     jdl_map_list = [ dict(htcondor.Submit(jdl).items()) for jdl in jdl_list ]
     # Go
     submit_obj = htcondor.Submit()
     try:
         with self.schedd.transaction() as txn:
             # TODO: Currently spool is not supported in htcondor.Submit ...
             submit_result = submit_obj.queue_with_itemdata(txn, 1, iter(jdl_map_list))
             clusterid = submit_result.cluster()
             first_proc = submit_result.first_proc()
             num_proc = submit_result.num_procs()
             batchIDs_list.extend(['{0}.{1}'.format(clusterid, procid)
                                     for procid in range(first_proc, first_proc + num_proc)])
     except RuntimeError as e:
         errStr = '{0}: {1}'.format(e.__class__.__name__, e)
         tmpLog.error('submission failed: {0}'.format(errStr))
         raise
     if batchIDs_list:
         n_jobs = len(batchIDs_list)
         tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list)))
     elif not errStr:
         tmpLog.error('submitted nothing')
     tmpLog.debug('Done')
     # Return
     return (batchIDs_list, errStr)
Code example #6
 def feed_events(self, workspec, events_dict):
     # get logger
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                     method_name='feed_events')
     retVal = True
     if workspec.mapType in [WorkSpec.MT_OneToOne, WorkSpec.MT_MultiWorkers]:
         # put the json just under the access point
         jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsFeedFileName)
         tmpLog.debug('feeding events to {0}'.format(jsonFilePath))
         try:
             with open(jsonFilePath, 'w') as jsonFile:
                 json.dump(events_dict, jsonFile)
         except Exception:
             core_utils.dump_error_message(tmpLog)
             retVal = False
     elif workspec.mapType == WorkSpec.MT_MultiJobs:
         # TOBEFIXED
         pass
     # remove request file
     try:
         jsonFilePath = os.path.join(workspec.get_access_point(), jsonEventsRequestFileName)
         os.remove(jsonFilePath)
     except Exception:
         pass
     tmpLog.debug('done')
     return retVal
Code example #7
    def kill_worker(self, workspec):
        """ Mark aCT job as tobekilled.

        :param workspec: worker specification
        :type workspec: WorkSpec
        :return: A tuple of return code (True for success, False otherwise) and error dialog
        :rtype: (bool, string)
        """
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='kill_worker')

        if workspec.batchID is None:
            tmpLog.info('workerID={0} has no batch ID so assume was not submitted - skipped'.format(
                        workspec.workerID))
            return True, ''

        try:
            # Only kill jobs which are still active
            self.actDB.updateJobs("id={0} AND actpandastatus IN ('sent', 'starting', 'running')".format(workspec.batchID),
                                  {'actpandastatus': 'tobekilled', 'pandastatus': None})
        except Exception as e:
            tmpLog.error('Failed to cancel job {0} in aCT: {1}'.format(workspec.batchID, str(e)))
            return False, str(e)

        tmpLog.info('Job {0} cancelled in aCT'.format(workspec.batchID))
        return True, ''
Code example #8
 def wrapper(self, *args, **kwargs):
     if self.is_connected:
         return func(self, *args, **kwargs)
     else:
         tmpLog = core_utils.make_logger(_logger, method_name=func.__name__)
         tmpLog.warning('instance not alive; method {0} returns None'.format(func.__name__))
         return None
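
The wrapper above is only the inner function of a method decorator: func and self.is_connected come from the enclosing scope. A minimal sketch of how such a decorator could be assembled is shown below; the decorator name require_alive is an assumption made here for illustration, while core_utils and _logger are the module-level objects used throughout these examples.

    import functools

    def require_alive(func):
        # hypothetical decorator name; guards an instance method so that it
        # only runs while the client reports itself as connected
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            if self.is_connected:
                return func(self, *args, **kwargs)
            tmpLog = core_utils.make_logger(_logger, method_name=func.__name__)
            tmpLog.warning('instance not alive; method {0} returns None'.format(func.__name__))
            return None
        return wrapper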
Code example #9
 def __init__(self, submissionHost, *args, **kwargs):
     self.submissionHost = submissionHost
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorClient.__init__')
     # Initialize
     tmpLog.debug('Initializing client')
     self.lock = threading.Lock()
     self.condor_api = CONDOR_API
     self.condor_schedd = None
     self.condor_pool = None
     # Parse condor command remote options from workspec
     if self.submissionHost in ('LOCAL', 'None'):
         tmpLog.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(self.submissionHost))
     else:
         try:
             self.condor_schedd, self.condor_pool = self.submissionHost.split(',')[0:2]
         except ValueError:
             tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(self.submissionHost))
     # Use Python API or fall back to command
     if self.condor_api == 'python':
         try:
             self.secman = htcondor.SecMan()
             self.renew_session(init=True)
         except Exception as e:
             tmpLog.error('Error when using htcondor Python API. Exception {0}: {1}'.format(e.__class__.__name__, e))
             raise
     tmpLog.debug('Initialized client')
Code example #10
 def _propagate_attributes(workspec, tmpVal):
     # make logger
     tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                     method_name='_propagate_attributes')
     (retVal, tmpDict) = tmpVal
     workspec.set_attributes_with_dict(tmpDict)
     tmpLog.debug('Done workspec attributes propagation')
     return retVal
Code example #11
 def __init__(self, *args, **kwargs):
     self.submissionHost = str(kwargs.get('id'))
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0} thrid={1} oid={2}'.format(self.submissionHost, get_ident(), id(self)), method_name='CondorJobManage.__init__')
     # Initialize
     tmpLog.debug('Start')
     self.lock = threading.Lock()
     CondorClient.__init__(self, self.submissionHost, *args, **kwargs)
     tmpLog.debug('Initialize done')
Code example #12
    def submit_workers(self, workspec_list):
        retList = []
        for workSpec in workspec_list:

            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                            method_name='submit_workers')

            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
            prodSourceLabel = queueconfig.get_source_label()

            # If jobSpec is defined we are in push mode, if not pull mode
            # Both assume one to one worker to job mapping
            jobSpec = workSpec.get_jobspec_list()
            if jobSpec:
                jobSpec = jobSpec[0]
                tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))

            desc = {}
            desc['pandastatus'] = 'sent'
            desc['actpandastatus'] = 'sent'
            desc['siteName'] = workSpec.computingSite
            desc['proxyid'] = self.proxymap['pilot' if prodSourceLabel == 'user' else 'production']
            desc['sendhb'] = 0
            metadata = {'harvesteraccesspoint': workSpec.get_access_point(),
                        'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)}
            desc['metadata'] = json.dumps(metadata)

            if jobSpec:
                # push mode: aCT takes the url-encoded job description (like it gets from panda server)
                pandaid = jobSpec.PandaID
                actjobdesc = urllib.urlencode(jobSpec.jobParams)
            else:
                # pull mode: just set pandaid (to workerid) and prodsourcelabel
                pandaid = workSpec.workerID
                actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (pandaid, prodSourceLabel)

            tmpLog.info("Inserting job {0} into aCT DB: {1}".format(pandaid, str(desc)))
            try:
                batchid = self.actDB.insertJob(pandaid, actjobdesc, desc)['LAST_INSERT_ID()']
            except Exception as e:
                result = (False, "Failed to insert job into aCT DB: {0}".format(str(e)))
            else:
                tmpLog.info("aCT batch id {0}".format(batchid))
                workSpec.batchID = str(batchid)
                # Set log files in workSpec
                today = time.strftime('%Y-%m-%d', time.gmtime())
                logurl = '/'.join([queueconfig.submitter.get('logBaseURL'), today, workSpec.computingSite, str(pandaid)])
                workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
                workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
                workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
                result = (True, '')
            retList.append(result)

        return retList
Code example #13
def _make_init_script(workspec, template_str):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID), method_name='_make_init_script')

    # make init tempfile
    tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_init.sh', dir=workspec.get_access_point())
    new_template_str = _init_script_replace(template_str, **workspec.__dict__)
    tmpFile.write(new_template_str)
    tmpFile.close()
    tmpLog.debug('done')
    return tmpFile.name
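
The helper _init_script_replace called above is not shown in this example. A purely illustrative sketch is given below, assuming simple ${attribute} placeholders in the template string; the real helper in the source may use a different placeholder syntax.

    def _init_script_replace(template_str, **kwargs):
        # illustrative sketch only: substitute ${attr}-style placeholders with
        # the corresponding workspec attributes passed as keyword arguments
        result = template_str
        for key, value in kwargs.items():
            result = result.replace('${' + key + '}', str(value))
        return result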
Code example #14
 def kill_requested(self, workspec):
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='kill_requested')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.kill_requested(self.original_config, workspec)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Code example #15
 def feed_jobs(self, workspec, jobspec_list):
     # get logger
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                     method_name='feed_jobs')
     retVal = True
     # get PFC
     pfc = core_utils.make_pool_file_catalog(jobspec_list)
     pandaIDs = []
     for jobSpec in jobspec_list:
         accessPoint = self.get_access_point(workspec, jobSpec.PandaID)
         jobSpecFilePath = os.path.join(accessPoint, jobSpecFileName)
         xmlFilePath = os.path.join(accessPoint, xmlPoolCatalogFileName)
         tmpLog.debug('feeding jobs to {0}'.format(jobSpecFilePath))
         try:
             # put job spec file
             with open(jobSpecFilePath, 'w') as jobSpecFile:
                 jobParams = jobSpec.get_job_params(self.stripJobParams)
                 if self.jobSpecFileFormat == 'cgi':
                     jobSpecFile.write(urlencode(jobParams))
                 else:
                     json.dump({jobSpec.PandaID: jobParams}, jobSpecFile)
             # put PFC.xml
             with open(xmlFilePath, 'w') as pfcFile:
                 pfcFile.write(pfc)
             # make symlink
             inFiles = jobSpec.get_input_file_attributes()
             for inLFN, inFile in iteritems(inFiles):
                 dstPath = os.path.join(accessPoint, inLFN)
                 if 'path' in inFile and inFile['path'] != dstPath:
                     # test if symlink exists if so remove it
                     if os.path.exists(dstPath):
                         os.unlink(dstPath)
                         tmpLog.debug("removing existing symlink %s" % dstPath)
                     os.symlink(inFile['path'], dstPath)
             pandaIDs.append(jobSpec.PandaID)
         except Exception:
             core_utils.dump_error_message(tmpLog)
             retVal = False
     # put PandaIDs file
     try:
         jsonFilePath = os.path.join(workspec.get_access_point(), pandaIDsFile)
         with open(jsonFilePath, 'w') as jsonPandaIDsFile:
             json.dump(pandaIDs, jsonPandaIDsFile)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         retVal = False
     # remove request file
     try:
         reqFilePath = os.path.join(workspec.get_access_point(), jsonJobRequestFileName)
         os.remove(reqFilePath)
     except Exception:
         pass
     tmpLog.debug('done')
     return retVal
Code example #16
 def acknowledge_events_files(self, workspec):
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='acknowledge_events_files')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.acknowledge_events_files(self.original_config, workspec)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Code example #17
 def submit_with_command(self, jdl_list, use_spool=False, tmp_str='', keep_temp_sdf=False):
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobSubmit.submit_with_command')
     # Initialize
     errStr = ''
     batchIDs_list = []
     # make sdf temp file from jdls
     tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=(not keep_temp_sdf),
                                 suffix='_{0}_cluster_submit.sdf'.format(tmp_str))
     sdf_file = tmpFile.name
     tmpFile.write('\n\n'.join(jdl_list))
     tmpFile.flush()
     # make condor remote options
     name_opt = '-name {0}'.format(self.condor_schedd) if self.condor_schedd else ''
     pool_opt = '-pool {0}'.format(self.condor_pool) if self.condor_pool else ''
     spool_opt = '-remote -spool' if use_spool and self.condor_schedd else ''
     # command
     comStr = 'condor_submit -single-cluster {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format(
                 sdf_file=sdf_file, name_opt=name_opt, pool_opt=pool_opt, spool_opt=spool_opt)
     # submit
     tmpLog.debug('submit with command: {0}'.format(comStr))
     try:
         p = subprocess.Popen(comStr.split(), shell=False, universal_newlines=True,
                              stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         # check return code
         stdOut, stdErr = p.communicate()
         retCode = p.returncode
     except Exception as e:
         stdOut = ''
         stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
         retCode = 1
         errStr = '{0}: {1}'.format(e.__class__.__name__, e)
     finally:
         tmpFile.close()
     tmpLog.debug('retCode={0}'.format(retCode))
     if retCode == 0:
         # extract clusterid and n_jobs
         job_id_match = None
         for tmp_line_str in stdOut.split('\n'):
              job_id_match = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', tmp_line_str)
             if job_id_match:
                 break
         if job_id_match is not None:
             n_jobs = int(job_id_match.group(1))
             clusterid = job_id_match.group(2)
             batchIDs_list = ['{0}.{1}'.format(clusterid, procid) for procid in range(n_jobs)]
             tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list)))
         else:
             errStr = 'no job submitted: {0}'.format(errStr)
             tmpLog.error(errStr)
     else:
         tmpLog.error('submission failed: {0} ; {1}'.format(stdErr, errStr))
     # Return
     return (batchIDs_list, errStr)
Code example #18
 def post_processing(self, workspec, jobspec_list, map_type):
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='post_processing')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.post_processing(self.original_config, workspec, jobspec_list, map_type)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Code example #19
 def check_workers(self, workspec_list):
     tmpLog = core_utils.make_logger(_logger, method_name='check_workers')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.check_workers(self.original_config, workspec_list)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Code example #20
 def is_alive(self, workspec, worker_heartbeat_limit):
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID), method_name='is_alive')
     tmpLog.debug('start')
     try:
         ret = self.conn.root.is_alive(self.original_config, workspec, worker_heartbeat_limit)
     except Exception:
         core_utils.dump_error_message(tmpLog)
         ret = None
     else:
         tmpLog.debug('done')
     return ret
Code example #21
 def _get_connection(self):
     tmpLog = core_utils.make_logger(_logger, method_name='_get_connection')
     tmpLog.debug('start')
     sshTunnelPool.make_tunnel_server(self.remoteHost, self.remotePort, self.remoteBindPort, self.numTunnels,
                                  ssh_username=self.sshUserName, ssh_password=self.sshPassword,
                                  private_key=self.privateKey, pass_phrase=self.passPhrase,
                                  jump_host=self.jumpHost, jump_port=self.jumpPort)
     tunnelHost, tunnelPort, tunnelCore = sshTunnelPool.get_tunnel(self.remoteHost, self.remotePort)
     self.conn = rpyc.connect(tunnelHost, tunnelPort, config={"allow_all_attrs": True,
                                                                 "allow_setattr": True,
                                                                 "allow_delattr": True})
     tmpLog.debug('connected successfully to {0}:{1}'.format(tunnelHost, tunnelPort))
Code example #22
 def kill_requested(self, workspec):
     # get logger
     tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                     method_name='kill_requested')
     # look for the json just under the access point
     jsonFilePath = os.path.join(workspec.get_access_point(), killWorkerFile)
     tmpLog.debug('looking for kill request file {0}'.format(jsonFilePath))
     if not os.path.exists(jsonFilePath):
         # not found
         tmpLog.debug('not found')
         return False
     tmpLog.debug('kill requested')
     return True
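
kill_requested only polls for the presence of a file under the worker access point. The counterpart that requests the kill simply has to drop that file; below is a minimal sketch, assuming killWorkerFile resolves to 'kill_worker.json' (the constant is defined elsewhere in the messenger module).

    import json
    import os

    def request_kill(workspec, kill_worker_file='kill_worker.json'):
        # the file content is irrelevant; its existence is the kill signal
        json_file_path = os.path.join(workspec.get_access_point(), kill_worker_file)
        with open(json_file_path, 'w') as json_file:
            json.dump({}, json_file)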
Code example #23
 def remove_with_command(self, batchIDs_list=[]):
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobManage.remove_with_command')
     # if workspec.batchID is None:
     #     tmpLog.info('Found workerID={0} has submissionHost={1} batchID={2} . Cannot kill. Skipped '.format(
     #                     workspec.workerID, workspec.submissionHost, workspec.batchID))
     #     ret_list.append((True, ''))
     #
     # ## Parse condor remote options
     # name_opt, pool_opt = '', ''
     # if workspec.submissionHost is None or workspec.submissionHost == 'LOCAL':
     #     pass
     # else:
     #     try:
     #         condor_schedd, condor_pool = workspec.submissionHost.split(',')[0:2]
     #     except ValueError:
     #         errStr = 'Invalid submissionHost: {0} . Skipped'.format(workspec.submissionHost)
     #         tmpLog.error(errStr)
     #         ret_list.append((False, errStr))
     #     name_opt = '-name {0}'.format(condor_schedd) if condor_schedd else ''
     #     pool_opt = '-pool {0}'.format(condor_pool) if condor_pool else ''
     #
     # ## Kill command
     # comStr = 'condor_rm {name_opt} {pool_opt} {batchID}'.format(name_opt=name_opt,
     #                                                             pool_opt=pool_opt,
     #                                                             batchID=workspec.batchID)
     # (retCode, stdOut, stdErr) = _runShell(comStr)
     # if retCode != 0:
     #     comStr = 'condor_q -l {name_opt} {pool_opt} {batchID}'.format(name_opt=name_opt,
     #                                                                 pool_opt=pool_opt,
     #                                                                 batchID=workspec.batchID)
     #     (retCode, stdOut, stdErr) = _runShell(comStr)
     #     if ('ClusterId = {0}'.format(workspec.batchID) in str(stdOut) \
     #         and 'JobStatus = 3' not in str(stdOut)) or retCode != 0:
     #         ## Force to cancel if batch job not terminated first time
     #         comStr = 'condor_rm -forcex {name_opt} {pool_opt} {batchID}'.format(name_opt=name_opt,
     #                                                                     pool_opt=pool_opt,
     #                                                                     batchID=workspec.batchID)
     #         (retCode, stdOut, stdErr) = _runShell(comStr)
     #         if retCode != 0:
     #             ## Command failed to kill
     #             errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(comStr, retCode, stdOut, stdErr)
     #             tmpLog.error(errStr)
     #             ret_list.append((False, errStr))
     #     ## Found already killed
     #     tmpLog.info('Found workerID={0} submissionHost={1} batchID={2} already killed'.format(
     #                     workspec.workerID, workspec.submissionHost, workspec.batchID))
     # else:
     #     tmpLog.info('Succeeded to kill workerID={0} submissionHost={1} batchID={2}'.format(
     #                     workspec.workerID, workspec.submissionHost, workspec.batchID))
     raise NotImplementedError
Code example #24
    def check_workers(self, workspec_list):
        retList = []
        for workSpec in workspec_list:
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                            method_name='check_workers')
            try:
                tmpLog.debug('Querying aCT for id {0}'.format(workSpec.batchID))
                columns = ['actpandastatus', 'pandastatus', 'computingElement', 'node']
                actjobs = self.actDB.getJobs("id={0}".format(workSpec.batchID), columns)
            except Exception as e:
                tmpLog.error("Failed to query aCT DB: {0}".format(str(e)))
                # send back current status
                retList.append((workSpec.status, ''))
                continue

            if not actjobs:
                tmpLog.error("Job with id {0} not found in aCT".format(workSpec.batchID))
                # send back current status
                retList.append((WorkSpec.ST_failed, "Job not found in aCT"))
                continue

            actstatus = actjobs[0]['actpandastatus']
            newStatus = WorkSpec.ST_running
            if actstatus in ['sent', 'starting']:
                newStatus = WorkSpec.ST_submitted
            elif actstatus == 'done':
                newStatus = self.check_pilot_status(workSpec, tmpLog)
            elif actstatus == 'donefailed':
                newStatus = WorkSpec.ST_failed
            elif actstatus == 'donecancelled':
                newStatus = WorkSpec.ST_cancelled

            if newStatus != workSpec.status:
                tmpLog.info('ID {0} updated status {1} -> {2} ({3})'.format(workSpec.batchID, workSpec.status, newStatus, actstatus))
            else:
                tmpLog.debug('batchStatus {0} -> workerStatus {1}'.format(actstatus, newStatus))

            if actjobs[0]['computingElement']:
                workSpec.computingElement = actjobs[0]['computingElement']
            if actjobs[0]['node']:
                try:
                    pandaid = workSpec.get_jobspec_list()[0].PandaID
                    workSpec.set_work_attributes({pandaid: {'node': actjobs[0]['node']}})
                except Exception:
                    tmpLog.warning('Could not extract panda ID for worker {0}'.format(workSpec.batchID))

            retList.append((newStatus, ''))

        return True, retList
Code example #25
    def events_to_update(self, workspec):
        # get logger
        tmpLog = core_utils.make_logger(_logger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='events_to_update')
        # look for the json just under the access point
        retDict = dict()
        for pandaID in workspec.pandaid_list:
            # look for the json just under the access point
            accessPoint = self.get_access_point(workspec, pandaID)

            jsonFilePath = os.path.join(accessPoint, jsonEventsUpdateFileName)
            readJsonPath = jsonFilePath + suffixReadJson
            # first look for json.read which is not yet acknowledged
            tmpLog.debug('looking for event update file {0}'.format(readJsonPath))
            if os.path.exists(readJsonPath):
                pass
            else:
                tmpLog.debug('looking for event update file {0}'.format(jsonFilePath))
                if not os.path.exists(jsonFilePath):
                    # not found
                    tmpLog.debug('not found')
                    continue
                try:
                    # rename to prevent from being overwritten
                    os.rename(jsonFilePath, readJsonPath)
                except Exception:
                    tmpLog.error('failed to rename json')
                    continue
            # load json
            nData = 0
            try:
                with open(readJsonPath) as jsonFile:
                    tmpOrigDict = json.load(jsonFile)
                    newDict = dict()
                    # change the key from str to int
                    for tmpPandaID, tmpDict in iteritems(tmpOrigDict):
                        tmpPandaID = long(tmpPandaID)
                        retDict[tmpPandaID] = tmpDict
                        nData += len(tmpDict)
            except Exception:
                tmpLog.error('failed to load json')
            # delete empty file
            if nData == 0:
                try:
                    os.remove(readJsonPath)
                except Exception:
                    pass
            tmpLog.debug('got {0} events for PandaID={1}'.format(nData, pandaID))
        return retDict
Code example #26
 def remove(self, batchIDs_list=[]):
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobManage.remove')
     # Get all
     tmpLog.debug('Start')
     job_ads_all_dict = {}
     if self.condor_api == 'python':
         try:
             retVal = self.remove_with_python(batchIDs_list)
         except Exception as e:
             tmpLog.error('Exception {0}: {1}'.format(e.__class__.__name__, e))
             raise
     else:
         retVal = self.remove_with_command(batchIDs_list)
     return retVal
Code example #27
 def get_all(self, batchIDs_list=[], allJobs=False):
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobQuery.get_all')
     # Get all
     tmpLog.debug('Start')
     job_ads_all_dict = {}
     if self.condor_api == 'python':
         try:
             job_ads_all_dict = self.query_with_python(batchIDs_list, allJobs)
         except Exception as e:
             tmpLog.error('Exception {0}: {1}'.format(e.__class__.__name__, e))
             raise
     else:
         job_ads_all_dict = self.query_with_command(batchIDs_list)
     return job_ads_all_dict
Code example #28
 def __init__(self, cacheEnable=False, cacheRefreshInterval=None, useCondorHistory=True, *args, **kwargs):
     self.submissionHost = str(kwargs.get('id'))
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0} thrid={1} oid={2}'.format(self.submissionHost, get_ident(), id(self)), method_name='CondorJobQuery.__init__')
     # Initialize
     with self.classLock:
         tmpLog.debug('Start')
         CondorClient.__init__(self, self.submissionHost, *args, **kwargs)
         # For condor_q cache
         self.cacheEnable = cacheEnable
         if self.cacheEnable:
             self.cache = ([], 0)
             self.cacheRefreshInterval = cacheRefreshInterval
         self.useCondorHistory = useCondorHistory
         tmpLog.debug('Initialize done')
Code example #29
def check_a_worker(workspec):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='check_a_worker')
    dummyFilePath = os.path.join(workspec.get_access_point(), 'status.txt')
    tmpLog.debug('look for {0}'.format(dummyFilePath))
    newStatus = WorkSpec.ST_finished
    try:
        with open(dummyFilePath) as dummyFile:
            newStatus = dummyFile.readline()
            newStatus = newStatus.strip()
    except Exception:
        pass
    tmpLog.debug('newStatus={0}'.format(newStatus))
    return (newStatus, '')
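
check_a_worker falls back to WorkSpec.ST_finished when status.txt is missing or unreadable; otherwise the first line of the file is taken as the new status. The producer side is correspondingly simple, e.g. in a test harness (illustrative sketch, not taken from the source).

    import os

    def write_dummy_status(workspec, status='running'):
        # write the desired worker status as the first line of status.txt
        status_file = os.path.join(workspec.get_access_point(), 'status.txt')
        with open(status_file, 'w') as f:
            f.write(status + '\n')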
Code example #30
 def remove_with_python(self, batchIDs_list=[]):
     # Make logger
     tmpLog = core_utils.make_logger(baseLogger, 'submissionHost={0}'.format(self.submissionHost), method_name='CondorJobManage.remove_with_python')
     # Start
     tmpLog.debug('Start')
     # Acquire class lock
     with self.classLock:
         tmpLog.debug('Got class lock')
         # Initialize
         ret_list = []
         retMap = {}
         # Go
         n_jobs = len(batchIDs_list)
         act_ret = self.schedd.act(htcondor.JobAction.Remove, batchIDs_list)
         # Check if all jobs clear (off from schedd queue)
         is_all_clear = (n_jobs == act_ret['TotalAlreadyDone'] + act_ret['TotalNotFound'] + act_ret['TotalSuccess'])
         if act_ret and is_all_clear:
             tmpLog.debug('removed {0} jobs: {1}'.format(n_jobs, ','.join(batchIDs_list)))
             for batchid in batchIDs_list:
                 condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid)
                 retMap[condor_job_id] = (True, '')
         else:
             tmpLog.error('job removal failed; batchIDs_list={0}, got: {1}'.format(batchIDs_list, act_ret))
             # need to query queue for unterminated jobs not removed yet
             clusterids_set = set([ get_job_id_tuple_from_batchid(batchid)[0] for batchid in batchIDs_list ])
             clusterids_str = ','.join(list(clusterids_set))
             requirements = 'member(ClusterID, {{{0}}}) && JobStatus =!= 3 && JobStatus =!= 4'.format(clusterids_str)
             jobs_iter = self.schedd.xquery(requirements=requirements, projection=CONDOR_JOB_ADS_LIST)
             all_batchid_map = {}
             ok_batchid_list = []
             ng_batchid_list = []
             for job in jobs_iter:
                 job_ads_dict = dict(job)
                 batchid = get_batchid_from_job(job_ads_dict)
                 all_batchid_map[batchid] = job_ads_dict
             for batchid in batchIDs_list:
                 condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid)
                 if batchid in all_batchid_map:
                     ng_batchid_list.append(batchid)
                     retMap[condor_job_id] = (False, 'batchID={0} still unterminated in condor queue'.format(batchid))
                 else:
                     ok_batchid_list.append(batchid)
                     retMap[condor_job_id] = (True, '')
             tmpLog.debug('removed {0} jobs: {1} ; failed to remove {2} jobs: {3}'.format(
                             len(ok_batchid_list), ','.join(ok_batchid_list), len(ng_batchid_list), ','.join(ng_batchid_list)))
     tmpLog.debug('Done')
     # Return
     return retMap
Code example #31
    def trigger_stage_out(self, jobspec):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger,
                                        'PandaID={0}'.format(jobspec.PandaID),
                                        method_name='trigger_stage_out')
        tmpLog.debug('start')
        # loop over all files
        lifetime = 7 * 24 * 60 * 60
        allChecked = True
        ErrMsg = 'These files failed to upload : '
        zip_datasetName = 'harvester_stage_out.{0}'.format(str(uuid.uuid4()))
        fileAttrs = jobspec.get_output_file_attributes()
        for fileSpec in jobspec.outFiles:
            fileSpec.fileAttributes['transferID'] = None  # synchronous transfer
            # skip already done
            tmpLog.debug(' file: %s status: %s' %
                         (fileSpec.lfn, fileSpec.status))
            if fileSpec.status in ['finished', 'failed']:
                continue
            # set destination RSE
            if fileSpec.fileType in ['es_output', 'zip_output', 'output']:
                dstRSE = self.dstRSE_Out
            elif fileSpec.fileType == 'log':
                dstRSE = self.dstRSE_Log
            else:
                errMsg = 'unsupported file type {0}'.format(fileSpec.fileType)
                tmpLog.error(errMsg)
                return (False, errMsg)
            # skip if destination is None
            if dstRSE is None:
                continue

            # get/set scope and dataset name
            if fileSpec.fileType != 'zip_output':
                scope = fileAttrs[fileSpec.lfn]['scope']
                datasetName = fileAttrs[fileSpec.lfn]['dataset']
            else:
                # use panda scope for zipped files
                scope = self.scopeForTmp
                datasetName = zip_datasetName

            # for now mimic behaviour and code of pilot v2 rucio copy tool (rucio download) change when needed

            executable = ['/usr/bin/env', 'rucio', '-v', 'upload']
            executable += ['--no-register']
            executable += ['--lifetime', ('%d' % lifetime)]
            executable += ['--rse', dstRSE]
            executable += ['--scope', scope]
            if fileSpec.fileAttributes is not None and 'guid' in fileSpec.fileAttributes:
                executable += ['--guid', fileSpec.fileAttributes['guid']]
            executable += [('%s:%s' % (scope, datasetName))]
            executable += [('%s' % fileSpec.path)]

            #print executable

            tmpLog.debug('rucio upload command: {0} '.format(executable))
            tmpLog.debug('rucio upload command (for human): %s ' %
                         ' '.join(executable))

            process = subprocess.Popen(executable,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT)

            stdout, stderr = process.communicate()

            if process.returncode == 0:
                fileSpec.status = 'finished'
                tmpLog.debug(stdout)
            else:
                # check what failed
                file_exists = False
                rucio_sessions_limit_error = False
                for line in stdout.split('\n'):
                    if 'File name in specified scope already exists' in line:
                        file_exists = True
                        break
                    elif 'exceeded simultaneous SESSIONS_PER_USER limit' in line:
                        rucio_sessions_limit_error = True
                if file_exists:
                    tmpLog.debug('file exists, marking transfer as finished')
                    fileSpec.status = 'finished'
                elif rucio_sessions_limit_error:
                    # do nothing
                    tmpLog.warning(
                        'rucio returned error, will retry: stdout: %s' %
                        stdout)
                    # do not change fileSpec.status and Harvester will retry if this function returns False
                    allChecked = False
                    continue
                else:
                    fileSpec.status = 'failed'
                    tmpLog.error('rucio upload failed with stdout: %s' %
                                 stdout)
                    ErrMsg += '%s failed with rucio error stdout="%s"' % (
                        fileSpec.lfn, stdout)
                    allChecked = False

            # force update
            fileSpec.force_update('status')

            tmpLog.debug('file: %s status: %s' %
                         (fileSpec.lfn, fileSpec.status))

        # return
        tmpLog.debug('done')
        if allChecked:
            return True, ''
        else:
            return False, ErrMsg
Code example #32
 def trigger_stage_out(self, jobspec):
     # make logger
     tmpLog = core_utils.make_logger(baseLogger,
                                     'PandaID={0}'.format(jobspec.PandaID),
                                     method_name='trigger_stage_out')
     tmpLog.debug('start')
     # loop over all files
     files = dict()
     transferIDs = dict()
     transferDatasets = dict()
     fileAttrs = jobspec.get_output_file_attributes()
     for fileSpec in jobspec.outFiles:
         # skip zipped files
         if fileSpec.zipFileID is not None:
             continue
         # skip if already processed
         if 'transferDataset' in fileSpec.fileAttributes:
             if fileSpec.fileType not in transferDatasets:
                 transferDatasets[
                     fileSpec.
                     fileType] = fileSpec.fileAttributes['transferDataset']
             if fileSpec.fileType not in transferIDs:
                 transferIDs[fileSpec.fileType] = fileSpec.fileAttributes[
                     'transferID']
             continue
         # set OS ID
          if fileSpec.fileType in ['es_output', 'zip_output']:
             fileSpec.objstoreID = self.objStoreID_ES
         # make path where file is copied for transfer
         if fileSpec.fileType != 'zip_output':
             scope = fileAttrs[fileSpec.lfn]['scope']
             datasetName = fileAttrs[fileSpec.lfn]['dataset']
         else:
             # use panda scope for zipped files
             scope = self.scopeForTmp
             datasetName = 'dummy'
         srcPath = fileSpec.path
         dstPath = mover_utils.construct_file_path(self.srcBasePath, scope,
                                                   fileSpec.lfn)
         # remove
         if os.path.exists(dstPath):
             os.remove(dstPath)
         # copy
         tmpLog.debug('copy src={srcPath} dst={dstPath}'.format(
             srcPath=srcPath, dstPath=dstPath))
         dstDir = os.path.dirname(dstPath)
         if not os.path.exists(dstDir):
             os.makedirs(dstDir)
         shutil.copyfile(srcPath, dstPath)
         # collect files
         tmpFile = dict()
         tmpFile['scope'] = scope
         tmpFile['name'] = fileSpec.lfn
         tmpFile['bytes'] = fileSpec.fsize
         if fileSpec.fileType not in files:
             files[fileSpec.fileType] = []
         files[fileSpec.fileType].append(tmpFile)
     # loop over all file types to be registered to rucio
     rucioAPI = RucioClient()
     for fileType, fileList in iteritems(files):
         # set destination RSE
         if fileType in ['es_output', 'zip_output']:
             dstRSE = self.dstRSE_ES
         elif fileType == 'output':
             dstRSE = self.dstRSE_Out
         elif fileType == 'log':
             dstRSE = self.dstRSE_Log
         else:
             errMsg = 'unsupported file type {0}'.format(fileType)
             tmpLog.error(errMsg)
             return (False, errMsg)
         # skip if destination is None
         if dstRSE is None:
             continue
         # make datasets if missing
         if fileType not in transferDatasets:
             try:
                 tmpScope = self.scopeForTmp
                 tmpDS = 'panda.harvester_stage_out.{0}'.format(
                     str(uuid.uuid4()))
                 rucioAPI.add_dataset(tmpScope,
                                      tmpDS,
                                      meta={'hidden': True},
                                      lifetime=7 * 24 * 60 * 60,
                                      files=fileList,
                                      rse=self.srcRSE)
                 transferDatasets[fileType] = tmpDS
                 # add rule
                 tmpDID = dict()
                 tmpDID['scope'] = tmpScope
                 tmpDID['name'] = tmpDS
                 tmpRet = rucioAPI.add_replication_rule([tmpDID],
                                                        1,
                                                        dstRSE,
                                                        lifetime=7 * 24 *
                                                        60 * 60)
                 tmpTransferIDs = tmpRet[0]
                 transferIDs[fileType] = tmpTransferIDs
                 tmpLog.debug('register dataset {0} with rule {1}'.format(
                     tmpDS, str(tmpTransferIDs)))
              except Exception:
                 errMsg = core_utils.dump_error_message(tmpLog)
                 return (False, errMsg)
         else:
             # add files to existing dataset
             try:
                 tmpScope = self.scopeForTmp
                 tmpDS = transferDatasets[fileType]
                 rucioAPI.add_files_to_dataset(tmpScope, tmpDS, fileList,
                                               self.srcRSE)
                 tmpLog.debug('added files to {0}'.format(tmpDS))
              except Exception:
                 errMsg = core_utils.dump_error_message(tmpLog)
                 return (False, errMsg)
     # set transfer datasets and rules
     for fileSpec in jobspec.outFiles:
         # skip zipped files
         if fileSpec.zipFileID is not None:
             continue
         # skip already done
         if fileSpec.status in ['finished', 'failed']:
             continue
         # skip if already processed
         if 'transferDataset' in fileSpec.fileAttributes:
             continue
         # no destination
         if fileSpec.fileType not in transferDatasets:
             fileSpec.status = 'finished'
             continue
         # set dataset
         fileSpec.fileAttributes['transferDataset'] = transferDatasets[
             fileSpec.fileType]
         # set rule
         fileSpec.fileAttributes['transferID'] = transferIDs[
             fileSpec.fileType]
         # force update
         fileSpec.force_update('fileAttributes')
     # return
     tmpLog.debug('done')
     return (True, '')
Code example #33
queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
initial_queueConfig_preparator = queueConfig.preparator
queueConfig.preparator[
    'module'] = 'pandaharvester.harvesterpreparator.go_bulk_preparator'
queueConfig.preparator['name'] = 'GlobusBulkPreparator'
modified_queueConfig_preparator = queueConfig.preparator

pluginFactory = PluginFactory()
# get preparator plugin
preparatorCore = pluginFactory.get_plugin(queueConfig.preparator)

# logger
_logger = core_utils.setup_logger('further_testing_go_bulk_preparator')
tmpLog = core_utils.make_logger(
    _logger, method_name='further_testing_go_bulk_preparator')
tmpLog.debug('start')

for loggerName, loggerObj in logging.Logger.manager.loggerDict.iteritems():
    #print "loggerName - {}".format(loggerName)
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
Code example #34
def submit_bag_of_workers(data_list):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, method_name='submit_bag_of_workers')
    # keep order of workers in data_list
    workerIDs_list = [ data['workspec'].workerID for data in data_list ]
    # initialization
    worker_retval_map = {}
    worker_data_map = {}
    host_jdl_list_workerid_map = {}
    # go
    for data in data_list:
        workspec = data['workspec']
        workerID = workspec.workerID
        worker_data_map[workerID] = data
        to_submit = data['to_submit']
        # no need to submit bad worker
        if not to_submit:
            errStr = '{0} not submitted due to incomplete data of the worker'.format(workerID)
            tmpLog.warning(errStr)
            tmpRetVal = (None, errStr)
            # return tmpRetVal, workspec.get_changed_attributes()
            worker_retval_map[workerID] = (tmpRetVal, workspec.get_changed_attributes())
        # attributes
        try:
            use_spool = data['use_spool']
        except KeyError:
            errStr = '{0} not submitted due to incomplete data of the worker'.format(workerID)
            tmpLog.warning(errStr)
            tmpRetVal = (None, errStr)
            # return tmpRetVal, workspec.get_changed_attributes()
            worker_retval_map[workerID] = (tmpRetVal, workspec.get_changed_attributes())
        else:
            workspec.reset_changed_list()
            # fill in host_jdl_list_workerid_map
            a_jdl, placeholder_map = make_a_jdl(**data)
            val = (workspec, a_jdl, placeholder_map)
            try:
                host_jdl_list_workerid_map[workspec.submissionHost].append(val)
            except KeyError:
                host_jdl_list_workerid_map[workspec.submissionHost] = [val]
    # loop over submissionHost
    for host, val_list in host_jdl_list_workerid_map.items():
        # make jdl string of workers
        jdl_list = [ val[1] for val in val_list ]
        # condor job submit object
        tmpLog.debug('submitting to submissionHost={0}'.format(host))
        # submit
        try:
            condor_job_submit = CondorJobSubmit(id=host)
            batchIDs_list, ret_err_str = condor_job_submit.submit(jdl_list, use_spool=use_spool)
        except Exception as e:
            batchIDs_list = None
            ret_err_str = 'Exception {0}: {1}'.format(e.__class__.__name__, e)
        # result
        if batchIDs_list:
            # submitted
            n_workers = len(val_list)
            tmpLog.debug('submitted {0} workers to submissionHost={1}'.format(n_workers, host))
            for val_i in range(n_workers):
                val = val_list[val_i]
                workspec = val[0]
                placeholder_map = val[2]
                # got batchID
                workspec.batchID = batchIDs_list[val_i]
                tmpLog.debug('workerID={0} submissionHost={1} batchID={2}'.format(
                                workspec.workerID, workspec.submissionHost, workspec.batchID))
                # get worker data
                data = worker_data_map[workspec.workerID]
                # set computingElement
                ce_info_dict = data['ce_info_dict']
                workspec.computingElement = ce_info_dict.get('ce_endpoint', '')
                # set log
                batch_log_dict = data['batch_log_dict']
                (clusterid, procid) = get_job_id_tuple_from_batchid(workspec.batchID)
                batch_log = _condor_macro_replace(batch_log_dict['batch_log'], ClusterId=clusterid, ProcId=procid).format(**placeholder_map)
                batch_stdout = _condor_macro_replace(batch_log_dict['batch_stdout'], ClusterId=clusterid, ProcId=procid).format(**placeholder_map)
                batch_stderr = _condor_macro_replace(batch_log_dict['batch_stderr'], ClusterId=clusterid, ProcId=procid).format(**placeholder_map)
                try:
                    batch_jdl = '{0}.jdl'.format(batch_stderr[:-4])
                except Exception:
                    batch_jdl = None
                workspec.set_log_file('batch_log', batch_log)
                workspec.set_log_file('stdout', batch_stdout)
                workspec.set_log_file('stderr', batch_stderr)
                workspec.set_log_file('jdl', batch_jdl)
                if not workspec.get_jobspec_list():
                    tmpLog.debug('No jobspec associated in the worker of workerID={0}'.format(workspec.workerID))
                else:
                    for jobSpec in workspec.get_jobspec_list():
                        # using batchLog and stdOut URL as pilotID and pilotLog
                        jobSpec.set_one_attribute('pilotID', workspec.workAttributes['stdOut'])
                        jobSpec.set_one_attribute('pilotLog', workspec.workAttributes['batchLog'])
                tmpLog.debug('Done set_log_file after submission of workerID={0}'.format(workspec.workerID))
                tmpRetVal = (True, '')
                worker_retval_map[workspec.workerID] = (tmpRetVal, workspec.get_changed_attributes())
        else:
            # failed
            tmpLog.debug('failed to submit workers to submissionHost={0} ; {1}'.format(host, ret_err_str))
            for val in val_list:
                workspec = val[0]
                errStr = 'submission failed: {0}'.format(ret_err_str)
                tmpLog.error(errStr)
                tmpRetVal = (None, errStr)
                worker_retval_map[workspec.workerID] = (tmpRetVal, workspec.get_changed_attributes())
    # make return list
    retValList = [ worker_retval_map[w_id] for w_id in workerIDs_list ]
    return retValList
Code example #35
    def query_with_command(self, batchIDs_list=[]):
        # Make logger
        tmpLog = core_utils.make_logger(
            baseLogger, method_name='CondorJobQuery.query_with_command')
        ## Start query
        tmpLog.debug('Start query')
        job_ads_all_dict = {}
        batchIDs_list = list(batchIDs_list)
        for orig_comStr in self.orig_comStr_list:
            ## String of batchIDs
            batchIDs_str = ' '.join(batchIDs_list)
            ## Command
            if 'condor_q' in orig_comStr or ('condor_history' in orig_comStr
                                             and batchIDs_list):
                name_opt = '-name {0}'.format(
                    self.condor_schedd) if self.condor_schedd else ''
                pool_opt = '-pool {0}'.format(
                    self.condor_pool) if self.condor_pool else ''
                ids = batchIDs_str
                comStr = '{cmd} {name_opt} {pool_opt} {ids}'.format(
                    cmd=orig_comStr,
                    name_opt=name_opt,
                    pool_opt=pool_opt,
                    ids=ids)
            else:
                # tmpLog.debug('No batch job left to query in this cycle by this thread')
                continue
            tmpLog.debug('check with {0}'.format(comStr))
            (retCode, stdOut, stdErr) = _runShell(comStr)
            if retCode == 0:
                ## Command succeeded
                job_ads_xml_str = '\n'.join(str(stdOut).split(self.badtext))
                if '<c>' in job_ads_xml_str:
                    ## Found at least one job
                    ## XML parsing
                    xml_root = ET.fromstring(job_ads_xml_str)

                    def _getAttribute_tuple(attribute_xml_element):
                        ## Attribute name
                        _n = str(attribute_xml_element.get('n'))
                        ## Attribute value text
                        _t = ' '.join(attribute_xml_element.itertext())
                        return (_n, _t)

                    ## Every batch job
                    for _c in xml_root.findall('c'):
                        job_ads_dict = dict()
                        ## Every attribute
                        attribute_iter = map(_getAttribute_tuple,
                                             _c.findall('a'))
                        job_ads_dict.update(attribute_iter)
                        batchid = str(job_ads_dict['ClusterId'])
                        condor_job_id = '{0}#{1}'.format(
                            self.submissionHost, batchid)
                        job_ads_all_dict[condor_job_id] = job_ads_dict
                        ## Remove batch jobs already gotten from the list
                        if batchid in batchIDs_list:
                            batchIDs_list.remove(batchid)
                else:
                    ## Job not found
                    tmpLog.debug('job not found with {0}'.format(comStr))
                    continue
            else:
                ## Command failed
                errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(
                    comStr, retCode, stdOut, stdErr)
                tmpLog.error(errStr)
        if len(batchIDs_list) > 0:
            ## Jobs not found via either condor_q or condor_history; mark them as unknown workers in harvester
            for batchid in batchIDs_list:
                condor_job_id = '{0}#{1}'.format(self.submissionHost, batchid)
                job_ads_all_dict[condor_job_id] = dict()
            tmpLog.info('Batch jobs not found on submissionHost={0}: {1}'.format(
                self.submissionHost, ' '.join(batchIDs_list)))
        ## Return
        return job_ads_all_dict
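
For context, condor_q -xml and condor_history -xml print each job ClassAd as a <c> element whose attributes are <a n="..."> elements, which is the structure the parsing above relies on. A small self-contained illustration of that parsing step, using a hand-written and heavily trimmed XML sample (the attribute values are invented):

import xml.etree.ElementTree as ET

sample_xml = """<classads>
  <c>
    <a n="ClusterId"><i>1234567</i></a>
    <a n="ProcId"><i>0</i></a>
    <a n="JobStatus"><i>2</i></a>
  </c>
</classads>"""

root = ET.fromstring(sample_xml)
for c in root.findall('c'):
    # same attribute extraction as _getAttribute_tuple above
    ad = {str(a.get('n')): ' '.join(a.itertext()) for a in c.findall('a')}
    print(ad['ClusterId'], ad['JobStatus'])   # -> 1234567 2
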
Code example #36
File: watcher.py  Project: rukmarr/panda-harvester
 def execute(self):
     # avoid too early check
     if not self.singleMode and datetime.datetime.utcnow() - self.startTime \
             < datetime.timedelta(seconds=harvester_config.watcher.checkInterval):
         return
     mainLog = core_utils.make_logger(_logger,
                                      'id={0}'.format(self.get_pid()),
                                      method_name='execute')
     mainLog.debug('start')
     # get file lock
     try:
         with core_utils.get_file_lock(
                 lockFileName, harvester_config.watcher.checkInterval):
             logFileName = os.path.join(logDir, 'panda-db_proxy.log')
             timeNow = datetime.datetime.utcnow()
             if os.path.exists(logFileName):
                 # get latest timestamp
                 try:
                     p = subprocess.Popen(['tail', '-1', logFileName],
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
                     line = p.stdout.readline()
                     lastTime = datetime.datetime.strptime(
                         line[:23], "%Y-%m-%d %H:%M:%S,%f")
                 except Exception:
                     lastTime = None
                  # get processing time for the last nMessages log messages
                 logDuration = None
                 try:
                     p = subprocess.Popen('tail -{0} {1} | head -1'.format(
                         harvester_config.watcher.nMessages, logFileName),
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE,
                                          shell=True)
                     line = p.stdout.readline()
                     firstTime = datetime.datetime.strptime(
                         line[:23], "%Y-%m-%d %H:%M:%S,%f")
                     if lastTime is not None:
                         logDuration = lastTime - firstTime
                 except Exception:
                     pass
                 tmpMsg = 'last log message at {0}. '.format(lastTime)
                 if logDuration is not None:
                     tmpMsg += '{0} messages took {1} sec'.format(
                         harvester_config.watcher.nMessages,
                         logDuration.total_seconds())
                 mainLog.debug(tmpMsg)
                 # check timestamp
                 doAction = False
                 if harvester_config.watcher.maxStalled > 0 and lastTime is not None and \
                         timeNow - lastTime > datetime.timedelta(seconds=harvester_config.watcher.maxStalled):
                     mainLog.warning(
                         'last log message is too old. seems to be stalled')
                     doAction = True
                 elif harvester_config.watcher.maxDuration > 0 and logDuration is not None and \
                         logDuration.total_seconds() > harvester_config.watcher.maxDuration:
                     mainLog.warning(
                         'slow message generation. seems to be a performance issue'
                     )
                     doAction = True
                 # take action
                 if doAction:
                     # email
                     if 'email' in harvester_config.watcher.actions.split(
                             ','):
                         # get pass phrase
                         toSkip = False
                         mailUser = None
                         mailPass = None
                         if harvester_config.watcher.mailUser != '' and \
                                 harvester_config.watcher.mailPassword != '':
                             envName = harvester_config.watcher.passphraseEnv
                             if envName not in os.environ:
                                 tmpMsg = '{0} is undefined in etc/sysconfig/panda_harvester'.format(
                                     envName)
                                 mainLog.error(tmpMsg)
                                 toSkip = True
                             else:
                                 key = os.environ[envName]
                                 mailUser = core_utils.decrypt_string(
                                     key, harvester_config.watcher.mailUser)
                                 mailPass = core_utils.decrypt_string(
                                     key,
                                     harvester_config.watcher.mailPassword)
                         if not toSkip:
                             # message
                             msgBody = 'harvester {0} '.format(
                                 harvester_config.master.harvester_id)
                             msgBody += 'is having a problem on {0} '.format(
                                 socket.getfqdn())
                             msgBody += 'at {0} (UTC)'.format(
                                 datetime.datetime.utcnow())
                             message = MIMEText(msgBody)
                             message['Subject'] = "Harvester Alarm"
                             message[
                                 'From'] = harvester_config.watcher.mailFrom
                             message['To'] = harvester_config.watcher.mailTo
                             # send email
                             mainLog.debug('sending email to {0}'.format(
                                 harvester_config.watcher.mailTo))
                             server = smtplib.SMTP(
                                 harvester_config.watcher.mailServer,
                                 harvester_config.watcher.mailPort)
                             if hasattr(harvester_config.watcher, 'mailUseSSL') and \
                                     harvester_config.watcher.mailUseSSL is True:
                                 server.starttls()
                             if mailUser is not None and mailPass is not None:
                                 server.login(mailUser, mailPass)
                             server.ehlo()
                             server.sendmail(
                                 harvester_config.watcher.mailFrom,
                                 harvester_config.watcher.mailTo.split(','),
                                 message.as_string())
                             server.quit()
                     # kill
                     if 'kill' in harvester_config.watcher.actions.split(
                             ','):
                          # send SIGUSR2 first
                         mainLog.debug('sending SIGUSR2')
                         os.killpg(os.getpgrp(), signal.SIGUSR2)
                         time.sleep(60)
                         mainLog.debug('sending SIGKILL')
                         os.killpg(os.getpgrp(), signal.SIGKILL)
             else:
                 mainLog.debug('skip as {0} is missing'.format(logFileName))
     except IOError:
         mainLog.debug(
             'skip as locked by another thread or too early to check')
     except Exception:
         core_utils.dump_error_message(mainLog)
     mainLog.debug('done')
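
The watcher above derives all timing information from the first 23 characters of a log line, which must follow the %Y-%m-%d %H:%M:%S,%f layout written by the standard logging formatter. A quick standalone check of that parsing step (the sample line is invented):

import datetime

sample_line = '2024-05-01 12:34:56,789 panda.log.db_proxy: DEBUG some message'
last_time = datetime.datetime.strptime(sample_line[:23], "%Y-%m-%d %H:%M:%S,%f")
print(last_time)   # 2024-05-01 12:34:56.789000
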
Code example #37
 def __init__(self, **kwarg):
     tmpLog = core_utils.make_logger(baseLogger, method_name='__init__')
     self.logBaseURL = None
     self.templateFile = None
     PluginBase.__init__(self, **kwarg)
     # number of processes
     try:
         self.nProcesses
     except AttributeError:
         self.nProcesses = 1
     else:
         if (not self.nProcesses) or (self.nProcesses < 1):
             self.nProcesses = 1
     # executable file
     try:
         self.executableFile
     except AttributeError:
         self.executableFile = None
     # condor log directory
     try:
         self.logDir
     except AttributeError:
         self.logDir = os.getenv('TMPDIR') or '/tmp'
     # Default x509 proxy for a queue
     try:
         self.x509UserProxy
     except AttributeError:
         self.x509UserProxy = os.getenv('X509_USER_PROXY')
     # x509 proxy for analysis jobs in grandly unified queues
     try:
         self.x509UserProxyAnalysis
     except AttributeError:
         self.x509UserProxyAnalysis = os.getenv('X509_USER_PROXY_ANAL')
     # ATLAS AGIS
     try:
         self.useAtlasAGIS = bool(self.useAtlasAGIS)
     except AttributeError:
         self.useAtlasAGIS = False
     # ATLAS Grid CE, requiring AGIS
     try:
         self.useAtlasGridCE = bool(self.useAtlasGridCE)
     except AttributeError:
         self.useAtlasGridCE = False
     finally:
         self.useAtlasAGIS = self.useAtlasAGIS or self.useAtlasGridCE
     # sdf template directories of CEs; ignored if templateFile is set
     try:
         self.CEtemplateDir
     except AttributeError:
         self.CEtemplateDir = ''
     # remote condor schedd and pool name (collector)
     try:
         self.condorSchedd
     except AttributeError:
         self.condorSchedd = None
     try:
         self.condorPool
     except AttributeError:
         self.condorPool = None
     # json config file of remote condor host: schedd/pool and weighting. If set, condorSchedd and condorPool are overwritten
     try:
         self.condorHostConfig
     except AttributeError:
         self.condorHostConfig = False
     if self.condorHostConfig:
         try:
             self.condorSchedd = []
             self.condorPool = []
             self.condorHostWeight = []
             with open(self.condorHostConfig, 'r') as f:
                 condor_host_config_map = json.load(f)
                 for _schedd, _cm in condor_host_config_map.items():
                     _pool = _cm['pool']
                     _weight = int(_cm['weight'])
                     self.condorSchedd.append(_schedd)
                     self.condorPool.append(_pool)
                     self.condorHostWeight.append(_weight)
         except Exception as e:
             tmpLog.error('error when parsing condorHostConfig json file; {0}: {1}'.format(e.__class__.__name__, e))
             raise
     else:
         if isinstance(self.condorSchedd, list):
             self.condorHostWeight = [1] * len(self.condorSchedd)
         else:
             self.condorHostWeight = [1]
     # condor spool mechanism. If False, need shared FS across remote schedd
     try:
         self.useSpool
     except AttributeError:
         self.useSpool = False
      # if the number of workers is below this threshold, submit them in bulk to a single schedd
     try:
         self.minBulkToRamdomizedSchedd
     except AttributeError:
         self.minBulkToRamdomizedSchedd = 20
     # record of information of CE statistics
     self.ceStatsLock = threading.Lock()
     self.ceStats = dict()
     # allowed associated parameters from AGIS
     self._allowed_agis_attrs = (
             'pilot_url',
         )
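
The condorHostConfig JSON parsed above is expected to map each remote schedd name to its pool (collector) and an integer weight. A hypothetical example of such a file, written out with Python for illustration (the host names are invented):

import json

# Hypothetical condorHostConfig content: one entry per remote schedd,
# each carrying its pool (collector) address and an integer weight.
condor_host_config_map = {
    'schedd01.example.org': {'pool': 'collector.example.org:9618', 'weight': 2},
    'schedd02.example.org': {'pool': 'collector.example.org:9618', 'weight': 1},
}
with open('/tmp/condor_host_config.json', 'w') as f:
    json.dump(condor_host_config_map, f, indent=2)
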
Code example #38
    def check_workers(self, workspec_list):
        ## Make logger for batch job query
        tmpLog = core_utils.make_logger(baseLogger, '{0}'.format('batch job query'),
                                        method_name='check_workers')

        ## Initialize a list of all batchIDs in workspec_list; it must be a real
        ## list (not an iterator) since entries are removed from it below
        batchIDs_list = [str(_x.batchID) for _x in workspec_list]

        ## Query commands
        orig_comStr_list = [
            'condor_q -xml',
            'condor_history -xml',
        ]

        ## Record batch job query result to this dict, with key = batchID
        job_ads_all_dict = dict()

        ## Start query
        for orig_comStr in orig_comStr_list:
            ## String of batchIDs
            batchIDs_str = ' '.join(batchIDs_list)

            ## Command
            if batchIDs_str:
                comStr = '{0} {1}'.format(orig_comStr, batchIDs_str)
            else:
                tmpLog.debug('No batch job left to query in this cycle by this thread')
                continue

            tmpLog.debug('check with {0}'.format(comStr))
            (retCode, stdOut, stdErr) = _runShell(comStr)

            if retCode == 0:
                ## Command succeeded

                ## Kill out redundant xml roots
#                 badtext = """
# <?xml version="1.0"?>
# <!DOCTYPE classads SYSTEM "classads.dtd">
# <classads>
#
# </classads>
# """
                badtext = """
</classads>

<?xml version="1.0"?>
<!DOCTYPE classads SYSTEM "classads.dtd">
<classads>
"""

                # badtext_re_str = '<?xml version="1.0"?>\W+<!DOCTYPE classads SYSTEM "classads.dtd">\W+<classads>\W+</classads>'

                job_ads_xml_str = '\n'.join(str(stdOut).split(badtext))
                # job_ads_xml_str = re.sub(badtext_re_str, '\n', str(stdOut))
                # tmpLog.debug(job_ads_xml_str)
                # debugging aid: dump the cleaned-up XML to a temporary file
                import time
                with open('/tmp/jobads-{0}.xml'.format(time.time()), 'w') as _f:
                    _f.write(job_ads_xml_str)

                if '<c>' in job_ads_xml_str:
                    ## Found at least one job

                    ## XML parsing
                    xml_root = ET.fromstring(job_ads_xml_str)

                    def _getAttribute_tuple(attribute_xml_element):
                        ## Attribute name
                        _n = str(attribute_xml_element.get('n'))
                        ## Attribute value text
                        _t = ' '.join(attribute_xml_element.itertext())
                        return (_n, _t)


                    ## Every batch job
                    for _c in xml_root.findall('c'):
                        job_ads_dict = dict()
                        ## Every attribute
                        attribute_iter = map(_getAttribute_tuple, _c.findall('a'))
                        job_ads_dict.update(attribute_iter)
                        batchid = str(job_ads_dict['ClusterId'])
                        job_ads_all_dict[batchid] = job_ads_dict
                        ## Remove batch jobs already obtained from the list
                        if batchid in batchIDs_list:
                            batchIDs_list.remove(batchid)

                else:
                    ## Job not found
                    tmpLog.debug('job not found with {0}'.format(comStr))
                    continue

            else:
                ## Command failed
                errStr = 'command "{0}" failed, retCode={1}, error: {2} {3}'.format(comStr, retCode, stdOut, stdErr)
                tmpLog.error(errStr)
                return False, errStr

        if batchIDs_list:
            tmpLog.info('Batch jobs not found: {0}'.format(' '.join(batchIDs_list)))

        ## Check for all workers
        with Pool(self.nProcesses) as _pool:
            retList = _pool.map(lambda _x: _check_one_worker(_x, job_ads_all_dict), workspec_list)


        return True, retList
Code example #39
    def create_job_from_yaml(self,
                             yaml_content,
                             work_spec,
                             prod_source_label,
                             container_image,
                             executable,
                             args,
                             cert,
                             cert_in_secret=True,
                             cpu_adjust_ratio=100,
                             memory_adjust_ratio=100,
                             max_time=None):

        tmp_log = core_utils.make_logger(base_logger,
                                         method_name='create_job_from_yaml')

        # consider PULL mode as default, unless specified
        submit_mode = 'PULL'

        # create the configmap in push mode
        worker_id = None
        if work_spec.mapType != 'NoJob':
            submit_mode = 'PUSH'
            worker_id = str(work_spec.workerID)
            res = self.create_configmap(work_spec)
            if not res:  # if the configmap creation failed, don't submit a job because the pod creation will hang
                return res, 'Failed to create a configmap'

        # retrieve panda queue information
        panda_queues_dict = PandaQueuesDict()
        queue_name = panda_queues_dict.get_panda_queue_name(
            work_spec.computingSite)

        # set the worker name
        yaml_content['metadata']['name'] = yaml_content['metadata'][
            'name'] + "-" + str(work_spec.workerID)

        # set the resource type and other metadata to filter the pods
        yaml_content['spec']['template'].setdefault('metadata', {})
        yaml_content['spec']['template']['metadata'].update(
            {'labels': {
                'resourceType': str(work_spec.resourceType)
            }})

        # fill the container details. we can only handle one container (take the first, delete the rest)
        yaml_containers = yaml_content['spec']['template']['spec'][
            'containers']
        del (yaml_containers[1:len(yaml_containers)])

        container_env = yaml_containers[0]

        container_env.setdefault('resources', {})
        # set the container image
        if 'image' not in container_env:
            container_env['image'] = container_image

        if 'command' not in container_env:
            container_env['command'] = executable
            container_env['args'] = args

        # set the resources (CPU and memory) we need for the container
        # note that predefined values in the yaml template will NOT be overwritten
        # Be familiar with QoS classes: https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod
        # The CPU & memory settings will affect the QoS for the pod
        container_env.setdefault('resources', {})
        if work_spec.nCore > 0:

            # CPU limits
            container_env['resources'].setdefault('limits', {})
            if 'cpu' not in container_env['resources']['limits']:
                container_env['resources']['limits']['cpu'] = str(
                    work_spec.nCore)
            # CPU requests
            container_env['resources'].setdefault('requests', {})
            if 'cpu' not in container_env['resources']['requests']:
                container_env['resources']['requests']['cpu'] = str(
                    work_spec.nCore * cpu_adjust_ratio / 100.0)

        if work_spec.minRamCount > 4:  # K8S minimum memory limit = 4 MB
            # memory limits
            # container_env['resources'].setdefault('limits', {})
            # if 'memory' not in container_env['resources']['limits']:
            #     container_env['resources']['limits']['memory'] = str(work_spec.minRamCount) + 'M'
            # memory requests
            container_env['resources'].setdefault('requests', {})
            if 'memory' not in container_env['resources']['requests']:
                container_env['resources']['requests']['memory'] = str(
                    work_spec.minRamCount * memory_adjust_ratio / 100.0) + 'M'

        container_env.setdefault('env', [])
        # try to retrieve the stdout log file name
        try:
            log_file_name = work_spec.workAttributes['stdout']
        except (KeyError, AttributeError):
            tmp_log.debug(
                'work_spec does not have stdout workAttribute, using default')
            log_file_name = ''

        container_env['env'].extend([{
            'name': 'computingSite',
            'value': work_spec.computingSite
        }, {
            'name': 'pandaQueueName',
            'value': queue_name
        }, {
            'name': 'resourceType',
            'value': work_spec.resourceType
        }, {
            'name': 'prodSourceLabel',
            'value': prod_source_label
        }, {
            'name': 'jobType',
            'value': work_spec.jobType
        }, {
            'name': 'proxySecretPath',
            'value': cert if cert_in_secret else None
        }, {
            'name':
            'proxyContent',
            'value':
            None if cert_in_secret else self.set_proxy(cert)
        }, {
            'name': 'workerID',
            'value': str(work_spec.workerID)
        }, {
            'name':
            'logs_frontend_w',
            'value':
            harvester_config.pandacon.pandaCacheURL_W
        }, {
            'name':
            'logs_frontend_r',
            'value':
            harvester_config.pandacon.pandaCacheURL_R
        }, {
            'name': 'stdout_name',
            'value': log_file_name
        }, {
            'name':
            'PANDA_JSID',
            'value':
            'harvester-' + harvester_config.master.harvester_id
        }, {
            'name': 'HARVESTER_WORKER_ID',
            'value': str(work_spec.workerID)
        }, {
            'name':
            'HARVESTER_ID',
            'value':
            harvester_config.master.harvester_id
        }, {
            'name': 'submit_mode',
            'value': submit_mode
        }])

        # in push mode, add the configmap as a volume to the pod
        if submit_mode == 'PUSH' and worker_id:
            yaml_content['spec']['template']['spec'].setdefault('volumes', [])
            yaml_volumes = yaml_content['spec']['template']['spec']['volumes']
            yaml_volumes.append({
                'name': 'job-config',
                'configMap': {
                    'name': worker_id
                }
            })
            # mount the volume to the filesystem
            container_env.setdefault('volumeMounts', [])
            container_env['volumeMounts'].append({
                'name': 'job-config',
                'mountPath': CONFIG_DIR
            })

        # set the affinity
        if 'affinity' not in yaml_content['spec']['template']['spec']:
            yaml_content = self.set_affinity(yaml_content)

        # set max_time to avoid having a pod running forever
        if 'activeDeadlineSeconds' not in yaml_content['spec']['template'][
                'spec']:
            if not max_time:  # default to 4 days
                max_time = 4 * 24 * 3600
            yaml_content['spec']['template']['spec'][
                'activeDeadlineSeconds'] = max_time

        tmp_log.debug('creating job {0}'.format(yaml_content))

        rsp = self.batchv1.create_namespaced_job(body=yaml_content,
                                                 namespace=self.namespace)
        return rsp, yaml_content
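
For reference, create_job_from_yaml only needs a few fields to be present in the template: metadata.name, spec.template.spec.containers (the first container is used, the rest are dropped), and optionally affinity, volumes and activeDeadlineSeconds. A minimal template is sketched below as the Python structure that yaml.safe_load() would produce; all values are illustrative, not taken from the original project.

# Minimal job template sketch; create_job_from_yaml() appends the workerID to
# metadata.name and fills in image/command/args/resources when they are missing.
minimal_yaml_content = {
    'apiVersion': 'batch/v1',
    'kind': 'Job',
    'metadata': {'name': 'atlas-worker'},
    'spec': {
        'template': {
            'spec': {
                'containers': [
                    {'name': 'pilot'},
                ],
                'restartPolicy': 'Never',
            }
        }
    }
}
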
Code example #40
def make_batch_script(workspec,
                      template,
                      n_core_per_node,
                      log_dir,
                      panda_queue_name,
                      x509_user_proxy,
                      ce_info_dict=dict(),
                      batch_log_dict=dict(),
                      special_par=''):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger,
                                    'workerID={0}'.format(workspec.workerID),
                                    method_name='make_batch_script')
    tmpFile = tempfile.NamedTemporaryFile(mode='w',
                                          delete=False,
                                          suffix='_submit.sdf',
                                          dir=workspec.get_access_point())

    # Note: In workspec, unit of minRamCount and of maxDiskCount are both MB.
    #       In HTCondor SDF, unit of request_memory is MB, and request_disk is KB.
    n_core_total = workspec.nCore if workspec.nCore else n_core_per_node
    request_ram = max(
        workspec.minRamCount, 1 *
        n_core_total) if workspec.minRamCount else 1 * n_core_total
    request_disk = workspec.maxDiskCount * 1024 if workspec.maxDiskCount else 1
    request_walltime = workspec.maxWalltime if workspec.maxWalltime else 0
    ce_info_dict = ce_info_dict.copy()
    batch_log_dict = batch_log_dict.copy()

    # possible override by AGIS special_par
    if special_par:
        special_par_attr_list = [
            'queue',
            'maxWallTime',
            'xcount',
        ]
        _match_special_par_dict = { attr: re.search('\({attr}=([^)]+)\)'.format(attr=attr), special_par) \
                                        for attr in special_par_attr_list }
        for attr, _match in _match_special_par_dict.items():
            if not _match:
                continue
            elif attr == 'queue':
                ce_info_dict['ce_queue_name'] = str(_match.group(1))
            elif attr == 'maxWallTime':
                request_walltime = int(_match.group(1))
            elif attr == 'xcount':
                n_core_total = int(_match.group(1))
            tmpLog.debug(
                'job attributes override by AGIS special_par: {0}={1}'.format(
                    attr, str(_match.group(1))))

    # derived job attributes
    n_node = _div_round_up(n_core_total, n_core_per_node)
    request_ram_per_core = _div_round_up(request_ram * n_node, n_core_total)
    request_cputime = request_walltime * n_core_total
    request_walltime_minute = _div_round_up(request_walltime, 60)
    request_cputime_minute = _div_round_up(request_cputime, 60)

    # fill in template
    tmpFile.write(
        template.format(
            nCorePerNode=n_core_per_node,
            nCoreTotal=n_core_total,
            nNode=n_node,
            requestRam=request_ram,
            requestRamPerCore=request_ram_per_core,
            requestDisk=request_disk,
            requestWalltime=request_walltime,
            requestWalltimeMinute=request_walltime_minute,
            requestCputime=request_cputime,
            requestCputimeMinute=request_cputime_minute,
            accessPoint=workspec.accessPoint,
            harvesterID=harvester_config.master.harvester_id,
            workerID=workspec.workerID,
            computingSite=workspec.computingSite,
            pandaQueueName=panda_queue_name,
            x509UserProxy=x509_user_proxy,
            ceEndpoint=ce_info_dict.get('ce_endpoint', ''),
            ceHostname=ce_info_dict.get('ce_hostname', ''),
            ceFlavour=ce_info_dict.get('ce_flavour', ''),
            ceJobmanager=ce_info_dict.get('ce_jobmanager', ''),
            ceQueueName=ce_info_dict.get('ce_queue_name', ''),
            ceVersion=ce_info_dict.get('ce_version', ''),
            logDir=log_dir,
            gtag=batch_log_dict.get('gtag', 'fake_GTAG_string'),
        ))
    tmpFile.close()
    tmpLog.debug('done')
    return tmpFile.name
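
The template argument is expanded with str.format, so an SDF template may reference any of the keyword names passed in the call above (keywords not referenced in the template are simply ignored by format). A heavily trimmed sketch of such a template follows; a real one carries many more HTCondor commands, and the actual file comes from the queue configuration or CEtemplateDir. Note that code example #42 later greps the `log =`, `output =` and `error =` lines out of such a template.

# Trimmed SDF template sketch (placeholders must match the keywords used in
# make_batch_script); $(Cluster)/$(Process) are expanded later by HTCondor.
sdf_template_sketch = """
executable = pilot_wrapper.sh
log = {logDir}/{harvesterID}.$(Cluster).$(Process).log
output = {logDir}/{harvesterID}.$(Cluster).$(Process).out
error = {logDir}/{harvesterID}.$(Cluster).$(Process).err
request_cpus = {nCoreTotal}
request_memory = {requestRam}
request_disk = {requestDisk}
queue 1
"""
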
Code example #41
 def run(self):
     while True:
         sw = core_utils.get_stopwatch()
         mainLog = core_utils.make_logger(_logger,
                                          'id={0}'.format(self.ident),
                                          method_name='run')
         mainLog.debug('getting jobs to propagate')
         jobSpecs = self.dbProxy.get_jobs_to_propagate(
             harvester_config.propagator.maxJobs,
             harvester_config.propagator.lockInterval,
             harvester_config.propagator.updateInterval, self.ident)
         mainLog.debug('got {0} jobs'.format(len(jobSpecs)))
         # update jobs in central database
         iJobs = 0
         nJobs = harvester_config.propagator.nJobsInBulk
         hbSuppressMap = dict()
         while iJobs < len(jobSpecs):
             jobList = jobSpecs[iJobs:iJobs + nJobs]
             iJobs += nJobs
             # collect jobs to update or check
             jobListToSkip = []
             jobListToUpdate = []
             jobListToCheck = []
             retList = []
             for tmpJobSpec in jobList:
                 if tmpJobSpec.computingSite not in hbSuppressMap:
                     queueConfig = self.queueConfigMapper.get_queue(
                         tmpJobSpec.computingSite)
                     hbSuppressMap[
                         tmpJobSpec.
                         computingSite] = queueConfig.get_no_heartbeat_status(
                         )
                 # heartbeat is suppressed
                 if tmpJobSpec.status in hbSuppressMap[
                         tmpJobSpec.computingSite]:
                     # check running job to detect lost heartbeat
                     if tmpJobSpec.status == 'running':
                         jobListToCheck.append(tmpJobSpec)
                     else:
                         jobListToSkip.append(tmpJobSpec)
                         retList.append({'StatusCode': 0, 'command': None})
                 else:
                     jobListToUpdate.append(tmpJobSpec)
             retList += self.communicator.check_jobs(jobListToCheck)
             retList += self.communicator.update_jobs(jobListToUpdate)
             # logging
             for tmpJobSpec, tmpRet in zip(
                     jobListToSkip + jobListToCheck + jobListToUpdate,
                     retList):
                 if tmpRet['StatusCode'] == 0:
                     if tmpJobSpec in jobListToUpdate:
                         mainLog.debug(
                             'updated PandaID={0} status={1}'.format(
                                 tmpJobSpec.PandaID, tmpJobSpec.status))
                     else:
                         mainLog.debug(
                             'skip updating PandaID={0} status={1}'.format(
                                 tmpJobSpec.PandaID, tmpJobSpec.status))
                     # release job
                     tmpJobSpec.propagatorLock = None
                     if tmpJobSpec.is_final_status(
                     ) and tmpJobSpec.status == tmpJobSpec.get_status():
                         # unset to disable further updating
                         tmpJobSpec.propagatorTime = None
                         tmpJobSpec.subStatus = 'done'
                     else:
                         # check event availability
                         if tmpJobSpec.status == 'starting' and 'eventService' in tmpJobSpec.jobParams and \
                                 tmpJobSpec.subStatus != 'submitted':
                             tmpEvStat, tmpEvRet = self.communicator.check_event_availability(
                                 tmpJobSpec)
                             if tmpEvStat and tmpEvRet == 0:
                                 mainLog.debug(
                                     'kill PandaID={0} due to no event'.
                                     format(tmpJobSpec.PandaID))
                                 tmpRet['command'] = 'tobekilled'
                         # got kill command
                         if 'command' in tmpRet and tmpRet['command'] in [
                                 'tobekilled'
                         ]:
                             nWorkers = self.dbProxy.kill_workers_with_job(
                                 tmpJobSpec.PandaID)
                             if nWorkers == 0:
                                 # no remaining workers
                                 tmpJobSpec.status = 'cancelled'
                                 tmpJobSpec.subStatus = 'killed'
                                 tmpJobSpec.stateChangeTime = datetime.datetime.utcnow(
                                 )
                                 tmpJobSpec.trigger_propagation()
                     self.dbProxy.update_job(tmpJobSpec,
                                             {'propagatorLock': self.ident})
                 else:
                     mainLog.error(
                         'failed to update PandaID={0} status={1}'.format(
                             tmpJobSpec.PandaID, tmpJobSpec.status))
         mainLog.debug('getting workers to propagate')
         workSpecs = self.dbProxy.get_workers_to_propagate(
             harvester_config.propagator.maxWorkers,
             harvester_config.propagator.updateInterval)
         mainLog.debug('got {0} workers'.format(len(workSpecs)))
         # update workers in central database
         iWorkers = 0
         nWorkers = harvester_config.propagator.nWorkersInBulk
         while iWorkers < len(workSpecs):
              workList = workSpecs[iWorkers:iWorkers + nWorkers]
             iWorkers += nWorkers
             retList, tmpErrStr = self.communicator.update_workers(workList)
             # logging
             if retList is None:
                 mainLog.error(
                     'failed to update workers with {0}'.format(tmpErrStr))
             else:
                 for tmpWorkSpec, tmpRet in zip(workList, retList):
                     if tmpRet:
                         mainLog.debug(
                             'updated workerID={0} status={1}'.format(
                                 tmpWorkSpec.workerID, tmpWorkSpec.status))
                         # update logs
                         for logFilePath, logOffset, logSize, logRemoteName in \
                                 tmpWorkSpec.get_log_files_to_upload():
                             with open(logFilePath, 'rb') as logFileObj:
                                 tmpStat, tmpErr = self.communicator.upload_file(
                                     logRemoteName, logFileObj, logOffset,
                                     logSize)
                                 if tmpStat:
                                     tmpWorkSpec.update_log_files_to_upload(
                                         logFilePath, logOffset + logSize)
                         # disable further update
                         if tmpWorkSpec.is_final_status():
                             tmpWorkSpec.disable_propagation()
                         self.dbProxy.update_worker(
                             tmpWorkSpec,
                             {'workerID': tmpWorkSpec.workerID})
                     else:
                         mainLog.error(
                             'failed to update workerID={0} status={1}'.
                             format(tmpWorkSpec.workerID,
                                    tmpWorkSpec.status))
         mainLog.debug('getting commands')
         commandSpecs = self.dbProxy.get_commands_for_receiver('propagator')
         mainLog.debug('got {0} commands'.format(len(commandSpecs)))
         for commandSpec in commandSpecs:
             if commandSpec.command.startswith(
                     CommandSpec.COM_reportWorkerStats):
                 # get worker stats
                 siteName = commandSpec.command.split(':')[-1]
                 workerStats = self.dbProxy.get_worker_stats(siteName)
                 if len(workerStats) == 0:
                     mainLog.error(
                         'failed to get worker stats for {0}'.format(
                             siteName))
                 else:
                     # report worker stats
                     tmpRet, tmpStr = self.communicator.update_worker_stats(
                         siteName, workerStats)
                     if tmpRet:
                         mainLog.debug(
                             'updated worker stats for {0}'.format(
                                 siteName))
                     else:
                         mainLog.error(
                             'failed to update worker stats for {0} err={1}'
                             .format(siteName, tmpStr))
         mainLog.debug('done' + sw.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.propagator.sleepTime):
             mainLog.debug('terminated')
             return
Code example #42
        def _handle_one_worker(workspec):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger,
                                            'workerID={0}'.format(
                                                workspec.workerID),
                                            method_name='_handle_one_worker')

            # get default information from queue info
            n_core_per_node_from_queue = this_panda_queue_dict.get(
                'corecount', 1) if this_panda_queue_dict.get('corecount',
                                                             1) else 1
            ce_info_dict = dict()
            batch_log_dict = dict()
            special_par = ''

            if self.useAtlasGridCE:
                # If ATLAS Grid CE mode used
                tmpLog.debug('Using ATLAS Grid CE mode...')
                queues_from_queue_list = this_panda_queue_dict.get(
                    'queues', [])
                special_par = this_panda_queue_dict.get('special_par', '')
                ce_endpoint_from_queue = ''
                ce_flavour_str = ''
                ce_version_str = ''
                random.shuffle(queues_from_queue_list)
                for _queue_dict in queues_from_queue_list:
                    if _queue_dict.get('ce_endpoint') and str(
                            _queue_dict.get('ce_state',
                                            '')).upper() == 'ACTIVE':
                        ce_flavour_str = str(_queue_dict.get('ce_flavour',
                                                             '')).lower()
                        ce_version_str = str(_queue_dict.get('ce_version',
                                                             '')).lower()
                        if ce_flavour_str in set(
                            ['arc-ce', 'cream-ce', 'htcondor-ce']):
                            ce_info_dict = _queue_dict.copy()
                            ce_endpoint_from_queue = ce_info_dict.get(
                                'ce_endpoint', '')
                            ce_info_dict['ce_hostname'] = re.sub(
                                ':\w*', '', ce_endpoint_from_queue)
                            break
                        else:
                            ce_flavour_str = ''
                tmpLog.debug(
                    'For site {0} got CE endpoint: "{1}", flavour: "{2}"'.
                    format(self.queueName, ce_endpoint_from_queue,
                           ce_flavour_str))
                if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}.sdf'.format(
                        ce_flavour_str=ce_flavour_str)
                    self.templateFile = os.path.join(self.CEtemplateDir,
                                                     sdf_template_filename)

            # template for batch script
            tmpFile = open(self.templateFile)
            sdf_template = tmpFile.read()
            tmpFile.close()

            # get batch_log, stdout, stderr filename
            for _line in sdf_template.split('\n'):
                if _line.startswith('#'):
                    continue
                _match_batch_log = re.match('log = (.+)', _line)
                _match_stdout = re.match('output = (.+)', _line)
                _match_stderr = re.match('error = (.+)', _line)
                if _match_batch_log:
                    batch_log_value = _match_batch_log.group(1)
                    continue
                if _match_stdout:
                    stdout_value = _match_stdout.group(1)
                    continue
                if _match_stderr:
                    stderr_value = _match_stderr.group(1)
                    continue

            # get override requirements from queue configured
            try:
                n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
            except AttributeError:
                n_core_per_node = n_core_per_node_from_queue

            # URLs for log files
            if not (self.logBaseURL is None):
                if workspec.batchID:
                    batchID = workspec.batchID
                    guess = False
                else:
                    batchID = ''
                    guess = True
                batch_log_filename = parse_batch_job_filename(
                    value_str=batch_log_value,
                    file_dir=self.logDir,
                    batchID=batchID,
                    guess=guess)
                stdout_path_file_name = parse_batch_job_filename(
                    value_str=stdout_value,
                    file_dir=self.logDir,
                    batchID=batchID,
                    guess=guess)
                stderr_path_filename = parse_batch_job_filename(
                    value_str=stderr_value,
                    file_dir=self.logDir,
                    batchID=batchID,
                    guess=guess)
                batch_log = '{0}/{1}'.format(self.logBaseURL,
                                             batch_log_filename)
                batch_stdout = '{0}/{1}'.format(self.logBaseURL,
                                                stdout_path_file_name)
                batch_stderr = '{0}/{1}'.format(self.logBaseURL,
                                                stderr_path_filename)
                workspec.set_log_file('batch_log', batch_log)
                workspec.set_log_file('stdout', batch_stdout)
                workspec.set_log_file('stderr', batch_stderr)
                batch_log_dict['batch_log'] = batch_log
                batch_log_dict['batch_stdout'] = batch_stdout
                batch_log_dict['batch_stderr'] = batch_stderr
                batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                tmpLog.debug('Done set_log_file')
                if not workspec.get_jobspec_list():
                    tmpLog.debug(
                        'No jobspec associated in the worker of workerID={0}'.
                        format(workspec.workerID))
                else:
                    for jobSpec in workspec.get_jobspec_list():
                        # using batchLog and stdOut URL as pilotID and pilotLog
                        jobSpec.set_one_attribute(
                            'pilotID', workspec.workAttributes['stdOut'])
                        jobSpec.set_one_attribute(
                            'pilotLog', workspec.workAttributes['batchLog'])
            tmpLog.debug('Done jobspec attribute setting')

            # set data dict
            data = {
                'workspec': workspec,
                'template': sdf_template,
                'log_dir': self.logDir,
                'n_core_per_node': n_core_per_node,
                'panda_queue_name': panda_queue_name,
                'x509_user_proxy': self.x509UserProxy,
                'ce_info_dict': ce_info_dict,
                'batch_log_dict': batch_log_dict,
                'special_par': special_par,
            }

            return data
Code example #43
def submit_a_worker(data):
    workspec = data['workspec']
    template = data['template']
    log_dir = data['log_dir']
    n_core_per_node = data['n_core_per_node']
    panda_queue_name = data['panda_queue_name']
    x509_user_proxy = data['x509_user_proxy']
    ce_info_dict = data['ce_info_dict']
    batch_log_dict = data['batch_log_dict']
    special_par = data['special_par']
    workspec.reset_changed_list()
    # make logger
    tmpLog = core_utils.make_logger(baseLogger,
                                    'workerID={0}'.format(workspec.workerID),
                                    method_name='submit_a_worker')
    # make batch script
    # batchFile = make_batch_script(workspec=workspec, template=template, n_core_per_node=n_core_per_node, log_dir=log_dir,
    #                                 panda_queue_name=panda_queue_name, x509_user_proxy=x509_user_proxy,
    #                                 ce_info_dict=ce_info_dict, batch_log_dict=batch_log_dict, special_par=special_par)
    batchFile = make_batch_script(**data)
    # command
    comStr = 'condor_submit {0}'.format(batchFile)
    # submit
    tmpLog.debug('submit with {0}'.format(batchFile))
    try:
        p = subprocess.Popen(comStr.split(),
                             shell=False,
                             universal_newlines=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # check return code
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception:
        stdOut = ''
        stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
        retCode = 1
    tmpLog.debug('retCode={0}'.format(retCode))
    if retCode == 0:
        # extract batchID
        job_id_match = None
        for tmp_line_str in stdOut.split('\n'):
            job_id_match = re.search(
                '^(\d+) job[(]s[)] submitted to cluster (\d+)\.$',
                tmp_line_str)
            if job_id_match:
                break
        if job_id_match is not None:
            workspec.batchID = job_id_match.group(2)
            tmpLog.debug('batchID={0}'.format(workspec.batchID))
            batch_log = _condor_macro_replace(batch_log_dict['batch_log'],
                                              ClusterId=workspec.batchID)
            batch_stdout = _condor_macro_replace(
                batch_log_dict['batch_stdout'], ClusterId=workspec.batchID)
            batch_stderr = _condor_macro_replace(
                batch_log_dict['batch_stderr'], ClusterId=workspec.batchID)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            tmpRetVal = (True, '')
        else:
            errStr = 'batchID cannot be found'
            tmpLog.error(errStr)
            tmpRetVal = (False, errStr)
    else:
        # failed
        errStr = '{0} \n {1}'.format(stdOut, stdErr)
        tmpLog.error(errStr)
        tmpRetVal = (False, errStr)
    return tmpRetVal, workspec.get_changed_attributes()
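
The batchID extraction above depends on the summary line that condor_submit prints on success ('N job(s) submitted to cluster M.'). A quick standalone check of the regular expression against such a line (the cluster id is made up):

import re

sample_line = '1 job(s) submitted to cluster 1234567.'
m = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', sample_line)
print(m.group(2))   # -> 1234567, stored as workspec.batchID
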
Code example #44
queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
initial_queueConfig_preparator = queueConfig.preparator
queueConfig.preparator[
    'module'] = 'pandaharvester.harvesterpreparator.go_preparator'
queueConfig.preparator['name'] = 'GoPreparator'
modified_queueConfig_preparator = queueConfig.preparator

pluginFactory = PluginFactory()
# get preparator (stage-in) plugin
preparatorCore = pluginFactory.get_plugin(queueConfig.preparator)

# logger
_logger = core_utils.setup_logger('stageInTest_go_preparator')
tmpLog = core_utils.make_logger(_logger,
                                method_name='stageInTest_go_preparator')
tmpLog.debug('start')

for loggerName, loggerObj in logging.Logger.manager.loggerDict.items():
    #print "loggerName - {}".format(loggerName)
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
Code example #45
def _check_one_worker(workspec, job_ads_all_dict):
    # Make logger for one single worker
    tmpLog = core_utils.make_logger(baseLogger,
                                    'workerID={0}'.format(workspec.workerID),
                                    method_name='_check_one_worker')

    ## Initialize newStatus
    newStatus = workspec.status
    errStr = ''

    name_opt, pool_opt = '', ''
    condor_schedd, condor_pool = None, None
    if workspec.submissionHost:
        try:
            condor_schedd, condor_pool = workspec.submissionHost.split(',')[0:2]
        except ValueError:
            pass
        name_opt = '-name {0}'.format(condor_schedd) if condor_schedd else ''
        pool_opt = '-pool {0}'.format(condor_pool) if condor_pool else ''

    try:
        job_ads_dict = job_ads_all_dict[str(workspec.batchID)]
    except KeyError:
        got_job_ads = False
    except Exception as e:
        got_job_ads = False
        tmpLog.error('With error {0}'.format(e))
    else:
        got_job_ads = True

        ## Parse job ads
        if got_job_ads:
            ## Check JobStatus
            try:
                batchStatus = job_ads_dict['JobStatus']
            except KeyError:
                errStr = 'cannot get JobStatus of job batchID={0}. Regard the worker as canceled by default'.format(
                    workspec.batchID)
                tmpLog.error(errStr)
                newStatus = WorkSpec.ST_cancelled
            else:
                # Propagate native condor job status
                workspec.nativeStatus = CONDOR_JOB_STATUS_MAP.get(
                    batchStatus, 'unexpected')
                if batchStatus in ['2', '6']:
                    # 2 running, 6 transferring output
                    newStatus = WorkSpec.ST_running
                elif batchStatus in ['1', '7']:
                    # 1 idle, 7 suspended
                    newStatus = WorkSpec.ST_submitted
                elif batchStatus in ['3']:
                    # 3 removed
                    errStr = 'Condor HoldReason: {0} ; Condor RemoveReason: {1} '.format(
                        job_ads_dict.get('LastHoldReason'),
                        job_ads_dict.get('RemoveReason'))
                    newStatus = WorkSpec.ST_cancelled
                elif batchStatus in ['5']:
                    # 5 held
                    if (job_ads_dict.get('HoldReason') == 'Job not found'
                            or int(time.time()) -
                            int(job_ads_dict.get('EnteredCurrentStatus', 0)) >
                            7200):
                        # Kill the job if held too long or other reasons
                        (retCode, stdOut, stdErr) = _runShell(
                            'condor_rm {name_opt} {pool_opt} {batchID}'.format(
                                batchID=workspec.batchID,
                                name_opt=name_opt,
                                pool_opt=pool_opt,
                            ))
                        if retCode == 0:
                            tmpLog.info('killed held job batchID={0}'.format(
                                workspec.batchID))
                        else:
                            newStatus = WorkSpec.ST_cancelled
                            tmpLog.error(
                                'cannot kill held job batchID={0}. Force worker to be in cancelled status'
                                .format(workspec.batchID))
                        # Mark the PanDA job as closed instead of failed
                        workspec.set_pilot_closed()
                        tmpLog.debug('Called workspec set_pilot_closed')
                    else:
                        newStatus = WorkSpec.ST_submitted
                elif batchStatus in ['4']:
                    # 4 completed
                    try:
                        payloadExitCode = job_ads_dict['ExitCode']
                    except KeyError:
                        errStr = 'cannot get ExitCode of job batchID={0}'.format(
                            workspec.batchID)
                        tmpLog.error(errStr)
                        newStatus = WorkSpec.ST_failed
                    else:
                        # Propagate condor return code
                        workspec.nativeExitCode = payloadExitCode
                        if payloadExitCode in ['0']:
                            # Payload should return 0 after successful run
                            newStatus = WorkSpec.ST_finished
                        else:
                            # Other return codes are considered failed
                            newStatus = WorkSpec.ST_failed
                            errStr = 'Payload execution error: returned non-zero'
                            tmpLog.debug(errStr)

                        tmpLog.info('Payload return code = {0}'.format(
                            payloadExitCode))
                else:
                    errStr = 'cannot get reasonable JobStatus of job batchID={0}. Regard the worker as failed by default'.format(
                        workspec.batchID)
                    tmpLog.error(errStr)
                    newStatus = WorkSpec.ST_failed

                tmpLog.info(
                    'batchID={0} : batchStatus {1} -> workerStatus {2}'.format(
                        workspec.batchID, batchStatus, newStatus))

        else:
            tmpLog.error(
                'condor job batchID={0} not found. Regard the worker as canceled by default'
                .format(workspec.batchID))
            newStatus = WorkSpec.ST_cancelled
            tmpLog.info(
                'batchID={0}: batch job not found -> workerStatus {1}'.format(
                    workspec.batchID, newStatus))

    ## Return
    return (newStatus, errStr)
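
CONDOR_JOB_STATUS_MAP and _runShell are referenced above but are not part of this excerpt. A plausible definition of the former, assuming it simply maps the numeric HTCondor JobStatus codes (as strings, since the ClassAd values are parsed as text) to readable labels; the exact labels used in the original code may differ.

# Hypothetical mapping; the keys follow the standard HTCondor JobStatus codes.
CONDOR_JOB_STATUS_MAP = {
    '1': 'idle',
    '2': 'running',
    '3': 'removed',
    '4': 'completed',
    '5': 'held',
    '6': 'transferring_output',
    '7': 'suspended',
}
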
Code example #46
 def get_files_to_stage_out(self, workspec):
     # get logger
     tmpLog = core_utils.make_logger(_logger,
                                     'workerID={0}'.format(
                                         workspec.workerID),
                                     method_name='get_files_to_stage_out')
     fileDict = dict()
     # look for the json just under the access point
     for pandaID in workspec.pandaid_list:
         # look for the json just under the access point
         accessPoint = self.get_access_point(workspec, pandaID)
         jsonFilePath = os.path.join(accessPoint, jsonOutputsFileName)
         readJsonPath = jsonFilePath + suffixReadJson
         # first look for json.read which is not yet acknowledged
         tmpLog.debug('looking for output file {0}'.format(readJsonPath))
         if os.path.exists(readJsonPath):
             pass
         else:
             tmpLog.debug(
                 'looking for output file {0}'.format(jsonFilePath))
             if not os.path.exists(jsonFilePath):
                 # not found
                 tmpLog.debug('not found')
                 continue
             try:
                 tmpLog.debug('found')
                 # rename to prevent from being overwritten
                 os.rename(jsonFilePath, readJsonPath)
             except Exception:
                 tmpLog.error('failed to rename json')
                 continue
         # load json
         toSkip = False
         loadDict = None
         try:
             with open(readJsonPath) as jsonFile:
                 loadDict = json.load(jsonFile)
         except Exception:
             tmpLog.error('failed to load json')
             toSkip = True
         # test validity of data format (ie it should be a Dictionary)
         if not toSkip:
             if not isinstance(loadDict, dict):
                 tmpLog.error('loaded data is not a dictionary')
                 toSkip = True
         # collect files and events
         nData = 0
         if not toSkip:
             sizeMap = dict()
             chksumMap = dict()
             eventsList = dict()
             for tmpPandaID, tmpEventMapList in iteritems(loadDict):
                 tmpPandaID = long(tmpPandaID)
                 # test if tmpEventMapList is a list
                 if not isinstance(tmpEventMapList, list):
                     tmpLog.error('loaded data item is not a list')
                     toSkip = True
                     break
                 for tmpEventInfo in tmpEventMapList:
                     try:
                         nData += 1
                         if 'eventRangeID' in tmpEventInfo:
                             tmpEventRangeID = tmpEventInfo['eventRangeID']
                         else:
                             tmpEventRangeID = None
                         tmpFileDict = dict()
                         pfn = tmpEventInfo['path']
                         lfn = os.path.basename(pfn)
                         tmpFileDict['path'] = pfn
                         if pfn not in sizeMap:
                             if 'fsize' in tmpEventInfo:
                                 sizeMap[pfn] = tmpEventInfo['fsize']
                             else:
                                 sizeMap[pfn] = os.stat(pfn).st_size
                         tmpFileDict['fsize'] = sizeMap[pfn]
                         tmpFileDict['type'] = tmpEventInfo['type']
                         if tmpEventInfo['type'] in ['log', 'output']:
                             # disable zipping
                             tmpFileDict['isZip'] = 0
                         elif tmpEventInfo['type'] == 'zip_output':
                             # already zipped
                             tmpFileDict['isZip'] = 1
                         elif 'isZip' in tmpEventInfo:
                             tmpFileDict['isZip'] = tmpEventInfo['isZip']
                         # guid
                         if 'guid' in tmpEventInfo:
                             tmpFileDict['guid'] = tmpEventInfo['guid']
                         else:
                             tmpFileDict['guid'] = str(uuid.uuid4())
                         # get checksum
                         if pfn not in chksumMap:
                             if 'chksum' in tmpEventInfo:
                                 chksumMap[pfn] = tmpEventInfo['chksum']
                             else:
                                 chksumMap[pfn] = core_utils.calc_adler32(
                                     pfn)
                         tmpFileDict['chksum'] = chksumMap[pfn]
                         if tmpPandaID not in fileDict:
                             fileDict[tmpPandaID] = dict()
                         if lfn not in fileDict[tmpPandaID]:
                             fileDict[tmpPandaID][lfn] = []
                         fileDict[tmpPandaID][lfn].append(tmpFileDict)
                         # skip if unrelated to events
                         if tmpFileDict['type'] not in [
                                 'es_output', 'zip_output'
                         ]:
                             continue
                         tmpFileDict['eventRangeID'] = tmpEventRangeID
                         if tmpPandaID not in eventsList:
                             eventsList[tmpPandaID] = list()
                         eventsList[tmpPandaID].append({
                             'eventRangeID':
                             tmpEventRangeID,
                             'eventStatus':
                             tmpEventInfo['eventStatus']
                         })
                     except Exception:
                         core_utils.dump_error_message(tmpLog)
             # dump events
             if not toSkip:
                 if len(eventsList) > 0:
                     curName = os.path.join(accessPoint,
                                            jsonEventsUpdateFileName)
                     newName = curName + '.new'
                     f = open(newName, 'w')
                     json.dump(eventsList, f)
                     f.close()
                     os.rename(newName, curName)
         # remove empty file
         if toSkip or nData == 0:
             try:
                 os.remove(readJsonPath)
             except Exception:
                 pass
         tmpLog.debug('got {0} files for PandaID={1}'.format(
             nData, pandaID))
     return fileDict
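
For orientation, here is a minimal, hypothetical example of the outputs JSON that get_files_to_stage_out parses; the field names follow the keys accessed above, while the PandaID, paths and values are made up:

# Hypothetical content of jsonOutputsFileName, keyed by PandaID (values are illustrative only)
example_outputs = {
    "1234567890": [
        {"eventRangeID": "1234567890-1-1-1",   # omitted for plain output/log files
         "path": "/path/to/access_point/HITS.pool.root.1",
         "type": "es_output",                  # 'output', 'log', 'zip_output' or 'es_output'
         "eventStatus": "finished",            # only read for es_output/zip_output entries
         "fsize": 123456,                      # optional; os.stat is used when missing
         "chksum": "ad:0a1b2c3d",              # optional; adler32 is computed when missing
         "guid": "hypothetical-guid"}          # optional; a uuid4 is generated when missing
    ]
}
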
Code example #47
0
 def do_POST(self):
     # logger
     if self.tmpLog is None:
         self.tmpLog = core_utils.make_logger(_logger)
     toSkip = False
     form = None
     methodName = None
     dataStr = None
     message = ''
     # parse the form data posted
     try:
         form = self.get_form()
     except Exception:
         message = 'corrupted json'
         toSkip = True
     # check parameters
     if not toSkip:
         toSkip = True
         # method is not set
         if 'methodName' not in form:
             message = 'methodName is not given'
             self.send_response(400)
         elif 'workerID' not in form:
             message = 'workerID is not given'
             self.send_response(400)
         elif 'data' not in form:
             message = 'data is not given'
             self.send_response(400)
         else:
             toSkip = False
     # get worker
     if not toSkip:
         try:
             workerID = form['workerID']
             workSpec = self.dbProxy.get_worker_with_id(workerID)
             if workSpec is None:
                 message = 'workerID={0} not found in DB'.format(workerID)
                 self.send_response(400)
             else:
                 # chose file and operation for each action
                 methodName = form['methodName']
                 opType = None
                 filePath = ''
                 if methodName == 'requestJobs':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonJobRequestFileName)
                     opType = 'w'
                 elif methodName == 'getJobs':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jobSpecFileName)
                     opType = 'r'
                 elif methodName == 'requestEventRanges':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonEventsRequestFileName)
                     opType = 'w'
                 elif methodName == 'getEventRanges':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonEventsFeedFileName)
                     opType = 'r'
                 elif methodName == 'updateJobs':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonAttrsFileName)
                     opType = 'w'
                 elif methodName == 'uploadJobReport':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonJobReport)
                     opType = 'w'
                 elif methodName == 'uploadEventOutputDump':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.jsonOutputsFileName)
                     opType = 'w'
                 elif methodName == 'setPandaIDs':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.pandaIDsFile)
                     opType = 'w'
                 elif methodName == 'killWorker':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.killWorkerFile)
                     opType = 'w'
                 elif methodName == 'heartbeat':
                     filePath = os.path.join(
                         workSpec.get_access_point(),
                         shared_file_messenger.heartbeatFile)
                     opType = 'w'
                 else:
                     self.send_response(501)
                     message = 'method not implemented'
                     toSkip = True
                 # take action
                 if not toSkip:
                     # write actions
                     if opType == 'w':
                         # check if file exists. Methods such as heartbeat however need to overwrite the file
                         if os.path.exists(filePath) and methodName not in [
                                 'heartbeat'
                         ]:
                             message = 'previous request is not yet processed'
                             self.send_response(503)
                         else:
                             with open(filePath, 'w') as fileHandle:
                                 json.dump(form['data'], fileHandle)
                                 message = 'OK'
                                 self.send_response(200)
                     else:
                         # read actions
                         if os.path.exists(filePath):
                             with open(filePath) as fileHandle:
                                 try:
                                     _message = json.load(fileHandle)
                                     message = json.dumps(_message)
                                     self.send_header(
                                         'Content-Type', 'application/json')
                                 except JSONDecodeError:
                                     _f_qs = open(filePath).read()
                                     # _message = dict(parse_qsl(_f_qs, keep_blank_values=True))
                                     message = _f_qs
                                     self.send_header(
                                         'Content-Type', 'text/plain')
                                 self.send_response(200)
                         else:
                             message = 'previous request is not yet processed'
                             self.send_response(503)
         except Exception:
             self.send_response(500)
             message = core_utils.dump_error_message(_logger)
     if harvester_config.frontend.verbose:
         self.tmpLog.debug('method={0} json={1} msg={2}'.format(
             methodName, dataStr, message))
     # set the response
     self.do_postprocessing(message)
     return
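
As a usage illustration of this frontend handler, a worker-side client could post a heartbeat roughly as sketched below; the host, port and data payload are assumptions, only the field names (methodName, workerID, data) come from the handler above:

# Hypothetical client-side call; URL and payload contents are made up for illustration
import requests

payload = {'methodName': 'heartbeat',
           'workerID': 123,
           'data': {'status': 'running'}}
resp = requests.post('http://harvester-frontend.example.com:25080', json=payload)
print(resp.status_code, resp.text)   # expected: 200 and 'OK' once the heartbeat file is written
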
Code example #48
0
 def run(self):
     lockedBy = 'monitor-{0}'.format(self.ident)
     # init messengers
     for queueConfig in self.queueConfigMapper.get_all_queues().values():
         # just import for module initialization
         self.pluginFactory.get_plugin(queueConfig.messenger)
     # main
     while True:
         sw = core_utils.get_stopwatch()
         mainLog = core_utils.make_logger(_logger,
                                          'id={0}'.format(lockedBy),
                                          method_name='run')
         mainLog.debug('getting workers to monitor')
         workSpecsPerQueue = self.dbProxy.get_workers_to_update(
             harvester_config.monitor.maxWorkers,
             harvester_config.monitor.checkInterval,
             harvester_config.monitor.lockInterval, lockedBy)
         mainLog.debug('got {0} queues'.format(len(workSpecsPerQueue)))
         # loop over all workers
         for queueName, workSpecsList in iteritems(workSpecsPerQueue):
             tmpQueLog = core_utils.make_logger(_logger,
                                                'id={0} queue={1}'.format(
                                                    lockedBy, queueName),
                                                method_name='run')
             # check queue
             if not self.queueConfigMapper.has_queue(queueName):
                 tmpQueLog.error('config not found')
                 continue
             # get queue
             queueConfig = self.queueConfigMapper.get_queue(queueName)
             # get plugins
             monCore = self.pluginFactory.get_plugin(queueConfig.monitor)
             messenger = self.pluginFactory.get_plugin(
                 queueConfig.messenger)
             # check workers
             allWorkers = [
                 item for sublist in workSpecsList for item in sublist
             ]
             tmpQueLog.debug('checking {0} workers'.format(len(allWorkers)))
             tmpRetMap = self.check_workers(monCore, messenger, allWorkers,
                                            queueConfig, tmpQueLog)
             # loop over all worker chunks
             tmpQueLog.debug('update jobs and workers')
             iWorker = 0
             for workSpecs in workSpecsList:
                 jobSpecs = None
                 filesToStageOut = dict()
                 pandaIDsList = []
                 eventsToUpdateList = []
                 filesToStageOutList = []
                 for workSpec in workSpecs:
                     tmpLog = core_utils.make_logger(_logger,
                                                     'workerID={0}'.format(
                                                         workSpec.workerID),
                                                     method_name='run')
                     tmpOut = tmpRetMap[workSpec.workerID]
                     newStatus = tmpOut['newStatus']
                     monStatus = tmpOut['monStatus']
                     diagMessage = tmpOut['diagMessage']
                     workAttributes = tmpOut['workAttributes']
                     eventsToUpdate = tmpOut['eventsToUpdate']
                     filesToStageOut = tmpOut['filesToStageOut']
                     eventsRequestParams = tmpOut['eventsRequestParams']
                     nJobsToReFill = tmpOut['nJobsToReFill']
                     pandaIDs = tmpOut['pandaIDs']
                     tmpStr = 'newStatus={0} monitoredStatus={1} diag={2} '
                     tmpStr += 'postProcessed={3} files={4}'
                     tmpLog.debug(
                         tmpStr.format(newStatus, monStatus, diagMessage,
                                       workSpec.is_post_processed(),
                                       str(filesToStageOut)))
                     iWorker += 1
                     # check status
                     if newStatus not in WorkSpec.ST_LIST:
                         tmpLog.error(
                             'unknown status={0}'.format(newStatus))
                         continue
                     # update worker
                     workSpec.set_status(newStatus)
                     workSpec.set_work_attributes(workAttributes)
                     # request events
                     if eventsRequestParams != {}:
                         workSpec.eventsRequest = WorkSpec.EV_requestEvents
                         workSpec.eventsRequestParams = eventsRequestParams
                     # jobs to refill
                     if nJobsToReFill is not None:
                         workSpec.nJobsToReFill = nJobsToReFill
                     # get associated jobs for the worker chunk
                     if workSpec.hasJob == 1 and jobSpecs is None:
                         jobSpecs = self.dbProxy.get_jobs_with_worker_id(
                             workSpec.workerID, None, only_running=True)
                     # pandaIDs for push
                     pandaIDsList.append(pandaIDs)
                     if len(eventsToUpdate) > 0:
                         eventsToUpdateList.append(eventsToUpdate)
                     if len(filesToStageOut) > 0:
                         filesToStageOutList.append(filesToStageOut)
                 # update jobs and workers
                 if jobSpecs is not None:
                     tmpQueLog.debug(
                         'updating {0} jobs with {1} workers'.format(
                             len(jobSpecs), len(workSpecs)))
                     core_utils.update_job_attributes_with_workers(
                         queueConfig.mapType, jobSpecs, workSpecs,
                         filesToStageOutList, eventsToUpdateList)
                     for jobSpec in jobSpecs:
                         tmpLog = core_utils.make_logger(
                             _logger,
                             'PandaID={0}'.format(jobSpec.PandaID),
                             method_name='run')
                         tmpLog.debug(
                             'new status={0} subStatus={1} status_in_metadata={2}'
                             .format(
                                 jobSpec.status, jobSpec.subStatus,
                                 jobSpec.get_job_status_from_attributes()))
                 # update local database
                 tmpRet = self.dbProxy.update_jobs_workers(
                     jobSpecs, workSpecs, lockedBy, pandaIDsList)
                 if not tmpRet:
                     for workSpec in workSpecs:
                         tmpLog = core_utils.make_logger(
                             _logger,
                             'workerID={0}'.format(workSpec.workerID),
                             method_name='run')
                         tmpLog.error(
                             'failed to update the DB. lockInterval may be too short'
                         )
                 # send ACK to workers for events and files
                 if len(eventsToUpdateList) > 0 or len(
                         filesToStageOutList) > 0:
                     for workSpec in workSpecs:
                         messenger.acknowledge_events_files(workSpec)
             tmpQueLog.debug('done')
         mainLog.debug('done' + sw.get_elapsed_time())
         # check if being terminated
         if self.terminated(harvester_config.monitor.sleepTime):
             mainLog.debug('terminated')
             return
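
To make the per-worker bookkeeping in the loop above easier to follow, here is a hypothetical shape of one tmpRetMap entry returned by check_workers; the keys mirror the lookups in the code, the values are examples only:

# Hypothetical check_workers() result for one workerID (values are illustrative)
from pandaharvester.harvestercore.work_spec import WorkSpec  # import path assumed from the harvester package layout

tmpRetMap = {
    101: {'newStatus': WorkSpec.ST_running,
          'monStatus': WorkSpec.ST_running,
          'diagMessage': '',
          'workAttributes': {},
          'eventsToUpdate': [],
          'filesToStageOut': {},
          'eventsRequestParams': {},
          'nJobsToReFill': None,
          'pandaIDs': [1234567890]},
}
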
Code example #49
0
        def _handle_one_worker(workspec, to_submit=to_submit_any):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger, 'site={0} workerID={1}'.format(self.queueName, workspec.workerID),
                                            method_name='_handle_one_worker')
            def _choose_proxy(workspec):
                """
                Choose the proxy based on the job type
                """
                job_type = workspec.jobType
                proxy = self.x509UserProxy
                if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis') and self.x509UserProxyAnalysis:
                    tmpLog.debug('Taking analysis proxy')
                    proxy = self.x509UserProxyAnalysis
                else:
                    tmpLog.debug('Taking default proxy')
                return proxy
            # initialize
            ce_info_dict = dict()
            batch_log_dict = dict()
            data = {'workspec': workspec,
                    'to_submit': to_submit,}
            if to_submit:
                sdf_template_file = None
                if self.useAtlasGridCE:
                    # choose a CE
                    tmpLog.info('choose a CE...')
                    ce_chosen = _choose_ce(ce_weighting)
                    try:
                        ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                    except KeyError:
                        tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                        ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                    # collect info of the chosen CE; ignore protocol prefix in ce_endpoint
                    ce_endpoint_from_queue = re.sub('^\w+://', '', ce_info_dict.get('ce_endpoint', ''))
                    ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                    ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                    ce_info_dict['ce_hostname'] = re.sub(':\w*', '',  ce_endpoint_from_queue)
                    if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                        # add default port to ce_endpoint if missing
                        default_port_map = {
                                'cream-ce': 8443,
                                'arc-ce': 2811,
                                'htcondor-ce': 9619,
                            }
                        if ce_flavour_str in default_port_map:
                            default_port = default_port_map[ce_flavour_str]
                            ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                    tmpLog.debug('Got pilot version: "{0}"; CE endpoint: "{1}", flavour: "{2}"'.format(
                                    pilot_version, ce_endpoint_from_queue, ce_flavour_str))
                    if self.templateFile:
                        sdf_template_file = self.templateFile
                    elif os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                        sdf_template_filename = '{ce_flavour_str}{sdf_suffix_str}.sdf'.format(
                                                    ce_flavour_str=ce_flavour_str, sdf_suffix_str=sdf_suffix_str)
                        sdf_template_file = os.path.join(self.CEtemplateDir, sdf_template_filename)
                else:
                    if self.templateFile:
                        sdf_template_file = self.templateFile
                    try:
                        # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                        if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                            if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                                ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(list(zip(self.ceHostname, self.ceEndpoint)))
                            else:
                                ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                                ce_info_dict['ce_endpoint'] = self.ceEndpoint
                        else:
                            ce_info_dict['ce_hostname'] = self.ceHostname
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    except AttributeError:
                        pass
                    try:
                        # Manually define ceQueueName
                        if self.ceQueueName:
                            ce_info_dict['ce_queue_name'] = self.ceQueueName
                    except AttributeError:
                        pass
                # template for batch script
                try:
                    tmpFile = open(sdf_template_file)
                    sdf_template_raw = tmpFile.read()
                    tmpFile.close()
                except AttributeError:
                    tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                    to_submit = False
                    return data
                else:
                    # get batch_log, stdout, stderr filenames, and remove commented lines
                    sdf_template_str_list = []
                    for _line in sdf_template_raw.split('\n'):
                        if _line.startswith('#'):
                            continue
                        sdf_template_str_list.append(_line)
                        _match_batch_log = re.match('log = (.+)', _line)
                        _match_stdout = re.match('output = (.+)', _line)
                        _match_stderr = re.match('error = (.+)', _line)
                        if _match_batch_log:
                            batch_log_value = _match_batch_log.group(1)
                            continue
                        if _match_stdout:
                            stdout_value = _match_stdout.group(1)
                            continue
                        if _match_stderr:
                            stderr_value = _match_stderr.group(1)
                            continue
                    sdf_template = '\n'.join(sdf_template_str_list)
                    # Choose from Condor schedd and central managers
                    condor_schedd, condor_pool = random.choice(schedd_pool_choice_list)
                    # set submissionHost
                    if not condor_schedd and not condor_pool:
                        workspec.submissionHost = 'LOCAL'
                    else:
                        workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
                    tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost))
                    # Log Base URL
                    if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                        schedd_hostname = re.sub(r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                                                    lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                                                    condor_schedd)
                        log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                    else:
                        log_base_url = self.logBaseURL
                    # URLs for log files
                    if not (log_base_url is None):
                        if workspec.batchID:
                            batchID = workspec.batchID
                            guess = False
                        else:
                            batchID = ''
                            guess = True
                        batch_log_filename = parse_batch_job_filename(value_str=batch_log_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        stderr_path_filename = parse_batch_job_filename(value_str=stderr_value, file_dir=log_subdir_path, batchID=batchID, guess=guess)
                        batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                        batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                        batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                        workspec.set_log_file('batch_log', batch_log)
                        workspec.set_log_file('stdout', batch_stdout)
                        workspec.set_log_file('stderr', batch_stderr)
                        batch_log_dict['batch_log'] = batch_log
                        batch_log_dict['batch_stdout'] = batch_stdout
                        batch_log_dict['batch_stderr'] = batch_stderr
                        batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                        tmpLog.debug('Done set_log_file before submission')
                    tmpLog.debug('Done jobspec attribute setting')

                # choose the x509 certificate based on the type of job (analysis or production)
                proxy = _choose_proxy(workspec)

                # set data dict
                data.update({
                        'workspec': workspec,
                        'to_submit': to_submit,
                        'template': sdf_template,
                        'executable_file': self.executableFile,
                        'log_dir': self.logDir,
                        'log_subdir': log_subdir,
                        'n_core_per_node': n_core_per_node,
                        'panda_queue_name': panda_queue_name,
                        'x509_user_proxy': proxy,
                        'ce_info_dict': ce_info_dict,
                        'batch_log_dict': batch_log_dict,
                        'special_par': special_par,
                        'harvester_queue_config': harvester_queue_config,
                        'is_unified_queue': is_unified_queue,
                        'condor_schedd': condor_schedd,
                        'condor_pool': condor_pool,
                        'use_spool': self.useSpool,
                        'pilot_url': pilot_url,
                        'pilot_version': pilot_version,
                        'python_version': python_version,
                        })
            return data
Code example #50
0
def submit_a_worker(data):
    workspec = data['workspec']
    to_submit = data['to_submit']
    # make logger
    tmpLog = core_utils.make_logger(baseLogger,
                                    'workerID={0}'.format(workspec.workerID),
                                    method_name='submit_a_worker')
    # no need to submit bad worker
    if not to_submit:
        errStr = 'Not submitted, due to incomplete data of the worker'
        tmpLog.warning(errStr)
        tmpRetVal = (None, errStr)
        return tmpRetVal, workspec.get_changed_attributes()
    # attributes
    try:
        ce_info_dict = data['ce_info_dict']
        batch_log_dict = data['batch_log_dict']
        condor_schedd = data['condor_schedd']
        condor_pool = data['condor_pool']
        use_spool = data['use_spool']
    except KeyError:
        errStr = 'Not submitted, due to incomplete data of the worker'
        tmpLog.warning(errStr)
        tmpRetVal = (None, errStr)
        return tmpRetVal, workspec.get_changed_attributes()
    else:
        workspec.reset_changed_list()
    # make batch script
    batchFile = make_batch_script(**data)
    # make condor remote options
    name_opt = '-name {0}'.format(condor_schedd) if condor_schedd else ''
    pool_opt = '-pool {0}'.format(condor_pool) if condor_pool else ''
    spool_opt = '-spool' if use_spool and condor_schedd else ''
    # command
    comStr = 'condor_submit {spool_opt} {name_opt} {pool_opt} {sdf_file}'.format(
        sdf_file=batchFile,
        name_opt=name_opt,
        pool_opt=pool_opt,
        spool_opt=spool_opt)
    # submit
    tmpLog.debug('submit with command: {0}'.format(comStr))
    try:
        p = subprocess.Popen(comStr.split(),
                             shell=False,
                             universal_newlines=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # check return code
        stdOut, stdErr = p.communicate()
        retCode = p.returncode
    except Exception:
        stdOut = ''
        stdErr = core_utils.dump_error_message(tmpLog, no_message=True)
        retCode = 1
    tmpLog.debug('retCode={0}'.format(retCode))
    if retCode == 0:
        # extract batchID
        job_id_match = None
        for tmp_line_str in stdOut.split('\n'):
            job_id_match = re.search(
                '^(\d+) job[(]s[)] submitted to cluster (\d+)\.$',
                tmp_line_str)
            if job_id_match:
                break
        if job_id_match is not None:
            workspec.batchID = job_id_match.group(2)
            # set submissionHost
            if not condor_schedd and not condor_pool:
                workspec.submissionHost = None
            else:
                workspec.submissionHost = '{0},{1}'.format(
                    condor_schedd, condor_pool)
            tmpLog.debug('submissionHost={0} batchID={1}'.format(
                workspec.submissionHost, workspec.batchID))
            # set computingElement
            workspec.computingElement = ce_info_dict.get('ce_endpoint', '')
            # set log
            batch_log = _condor_macro_replace(batch_log_dict['batch_log'],
                                              ClusterId=workspec.batchID)
            batch_stdout = _condor_macro_replace(
                batch_log_dict['batch_stdout'], ClusterId=workspec.batchID)
            batch_stderr = _condor_macro_replace(
                batch_log_dict['batch_stderr'], ClusterId=workspec.batchID)
            workspec.set_log_file('batch_log', batch_log)
            workspec.set_log_file('stdout', batch_stdout)
            workspec.set_log_file('stderr', batch_stderr)
            if not workspec.get_jobspec_list():
                tmpLog.debug(
                    'No jobspec associated in the worker of workerID={0}'.
                    format(workspec.workerID))
            else:
                for jobSpec in workspec.get_jobspec_list():
                    # using batchLog and stdOut URL as pilotID and pilotLog
                    jobSpec.set_one_attribute(
                        'pilotID', workspec.workAttributes['stdOut'])
                    jobSpec.set_one_attribute(
                        'pilotLog', workspec.workAttributes['batchLog'])
            tmpLog.debug('Done set_log_file after submission')
            tmpRetVal = (True, '')

        else:
            errStr = 'batchID cannot be found'
            tmpLog.error(errStr)
            tmpRetVal = (None, errStr)
    else:
        # failed
        errStr = '{0} \n {1}'.format(stdOut, stdErr)
        tmpLog.error(errStr)
        tmpRetVal = (None, errStr)
    return tmpRetVal, workspec.get_changed_attributes()
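
As a quick sanity check of the batchID extraction above: condor_submit normally ends its output with a line like "1 job(s) submitted to cluster 12345.", and the regex keeps the cluster number as the batchID (the cluster number below is made up):

import re

line = '1 job(s) submitted to cluster 12345.'
m = re.search(r'^(\d+) job[(]s[)] submitted to cluster (\d+)\.$', line)
if m:
    print(m.group(2))   # '12345' -> stored as workspec.batchID
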
Code example #51
0
def make_a_jdl(workspec, template, n_core_per_node, log_dir, panda_queue_name, executable_file,
                x509_user_proxy, log_subdir=None, ce_info_dict=dict(), batch_log_dict=dict(), pilot_url=None,
                special_par='', harvester_queue_config=None, is_unified_queue=False,
                pilot_version='unknown', python_version='unknown', **kwarg):
    # make logger
    tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                    method_name='make_a_jdl')
    # Note: In workspec, unit of minRamCount and of maxDiskCount are both MB.
    #       In HTCondor SDF, unit of request_memory is MB, and request_disk is KB.
    n_core_total = workspec.nCore if workspec.nCore else n_core_per_node
    request_ram = max(workspec.minRamCount, 1 * n_core_total) if workspec.minRamCount else 1 * n_core_total
    request_disk = workspec.maxDiskCount * 1024 if workspec.maxDiskCount else 1
    request_walltime = workspec.maxWalltime if workspec.maxWalltime else 0
    io_intensity = workspec.ioIntensity if workspec.ioIntensity else 0
    ce_info_dict = ce_info_dict.copy()
    batch_log_dict = batch_log_dict.copy()
    # possible override by AGIS special_par
    if special_par:
        special_par_attr_list = ['queue', 'maxWallTime', 'xcount', ]
        _match_special_par_dict = { attr: re.search('\({attr}=([^)]+)\)'.format(attr=attr), special_par) \
                                        for attr in special_par_attr_list }
        for attr, _match in _match_special_par_dict.items():
            if not _match:
                continue
            elif attr == 'queue':
                ce_info_dict['ce_queue_name'] = str(_match.group(1))
            elif attr == 'maxWallTime':
                request_walltime = int(_match.group(1))
            elif attr == 'xcount':
                n_core_total = int(_match.group(1))
            tmpLog.debug('job attributes override by AGIS special_par: {0}={1}'.format(attr, str(_match.group(1))))
    # derived job attributes
    n_node = _div_round_up(n_core_total, n_core_per_node)
    request_ram_per_core = _div_round_up(request_ram * n_node, n_core_total)
    request_cputime = request_walltime * n_core_total
    request_walltime_minute = _div_round_up(request_walltime, 60)
    request_cputime_minute = _div_round_up(request_cputime, 60)
    # decide prodSourceLabel
    pilot_opt_dict = submitter_common.get_complicated_pilot_options(workspec.pilotType, pilot_url=pilot_url)
    if pilot_opt_dict is None:
        prod_source_label = harvester_queue_config.get_source_label(workspec.jobType)
        pilot_type_opt = workspec.pilotType
        pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
    else:
        prod_source_label = pilot_opt_dict['prod_source_label']
        pilot_type_opt = pilot_opt_dict['pilot_type_opt']
        pilot_url_str = pilot_opt_dict['pilot_url_str']
    # open tmpfile as submit description file
    tmpFile = tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='_submit.sdf', dir=workspec.get_access_point())
    # placeholder map
    placeholder_map = {
            'sdfPath': tmpFile.name,
            'executableFile': executable_file,
            'nCorePerNode': n_core_per_node,
            'nCoreTotal': n_core_total,
            'nNode': n_node,
            'requestRam': request_ram,
            'requestRamPerCore': request_ram_per_core,
            'requestDisk': request_disk,
            'requestWalltime': request_walltime,
            'requestWalltimeMinute': request_walltime_minute,
            'requestCputime': request_cputime,
            'requestCputimeMinute': request_cputime_minute,
            'accessPoint': workspec.accessPoint,
            'harvesterID': harvester_config.master.harvester_id,
            'workerID': workspec.workerID,
            'computingSite': workspec.computingSite,
            'pandaQueueName': panda_queue_name,
            'x509UserProxy': x509_user_proxy,
            'ceEndpoint': ce_info_dict.get('ce_endpoint', ''),
            'ceHostname': ce_info_dict.get('ce_hostname', ''),
            'ceFlavour': ce_info_dict.get('ce_flavour', ''),
            'ceJobmanager': ce_info_dict.get('ce_jobmanager', ''),
            'ceQueueName': ce_info_dict.get('ce_queue_name', ''),
            'ceVersion': ce_info_dict.get('ce_version', ''),
            'logDir': log_dir,
            'logSubdir': log_subdir,
            'gtag': batch_log_dict.get('gtag', 'fake_GTAG_string'),
            'prodSourceLabel': prod_source_label,
            'jobType': workspec.jobType,
            'resourceType': _get_resource_type(workspec.resourceType, is_unified_queue),
            'pilotResourceTypeOption': _get_resource_type(workspec.resourceType, is_unified_queue, True),
            'ioIntensity': io_intensity,
            'pilotType': pilot_type_opt,
            'pilotUrlOption': pilot_url_str,
            'pilotVersion': pilot_version,
            'pilotPythonOption': submitter_common.get_python_version_option(python_version, prod_source_label),
            'submissionHost': workspec.submissionHost,
            'submissionHostShort': workspec.submissionHost.split('.')[0],
        }
    # fill in template string
    jdl_str = template.format(**placeholder_map)
    # save jdl to submit description file
    tmpFile.write(jdl_str)
    tmpFile.close()
    tmpLog.debug('saved sdf at {0}'.format(tmpFile.name))
    tmpLog.debug('done')
    return jdl_str, placeholder_map
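
For context, the template string handed to make_a_jdl is expected to contain str.format placeholders named after the keys of placeholder_map; a minimal, hypothetical fragment could look like the following (real templates live under CEtemplateDir and are more elaborate):

# Hypothetical SDF template fragment; every {name} must match a key of placeholder_map above
sdf_template_example = '''
executable = {executableFile}
arguments = "{pandaQueueName} {prodSourceLabel} {pilotType} {pilotUrlOption} {pilotPythonOption}"
x509userproxy = {x509UserProxy}
request_cpus = {nCoreTotal}
request_memory = {requestRam}
request_disk = {requestDisk}
log = {logDir}/{logSubdir}/grid.{workerID}.log
queue 1
'''
# make_a_jdl then fills it with: jdl_str = template.format(**placeholder_map)
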
Code example #52
0
        def _handle_one_worker(workspec):
            # make logger
            tmpLog = core_utils.make_logger(baseLogger,
                                            'workerID={0}'.format(
                                                workspec.workerID),
                                            method_name='_handle_one_worker')

            # get default information from queue info
            to_submit = True
            n_core_per_node_from_queue = this_panda_queue_dict.get(
                'corecount', 1) if this_panda_queue_dict.get('corecount',
                                                             1) else 1
            is_unified_queue = 'unifiedPandaQueue' in this_panda_queue_dict.get('catchall', '').split(',') \
                               or this_panda_queue_dict.get('capability', '') == 'ucore'
            ce_info_dict = dict()
            batch_log_dict = dict()
            special_par = ''
            data = {
                'workspec': workspec,
                'to_submit': to_submit,
            }

            if self.useAtlasGridCE:
                # If ATLAS Grid CE mode used
                tmpLog.debug('Using ATLAS Grid CE mode...')
                queues_from_queue_list = this_panda_queue_dict.get(
                    'queues', [])
                special_par = this_panda_queue_dict.get('special_par', '')
                ce_auxilary_dict = {}
                for _queue_dict in queues_from_queue_list:
                    if not (_queue_dict.get('ce_endpoint')
                            and str(_queue_dict.get('ce_state',
                                                    '')).upper() == 'ACTIVE'
                            and str(_queue_dict.get('ce_flavour', '')).lower()
                            in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                        continue
                    ce_endpoint = _queue_dict.get('ce_endpoint')
                    if (ce_endpoint in ce_auxilary_dict
                            and str(_queue_dict.get('ce_queue_name',
                                                    '')).lower() == 'default'):
                        pass
                    else:
                        ce_auxilary_dict[ce_endpoint] = _queue_dict
                # qualified CEs from AGIS info
                n_qualified_ce = len(ce_auxilary_dict)
                queue_status_dict = self.dbInterface.get_queue_status(
                    self.queueName)
                worker_ce_stats_dict = self.dbInterface.get_worker_ce_stats(
                    self.queueName)
                ce_weight_dict = _get_ce_weight_dict(
                    ce_endpoint_list=list(ce_auxilary_dict.keys()),
                    queue_status_dict=queue_status_dict,
                    worker_ce_stats_dict=worker_ce_stats_dict)
                # good CEs which can be submitted to, duplicate by weight
                good_ce_weighted_list = []
                for _ce_endpoint in ce_auxilary_dict.keys():
                    good_ce_weighted_list.extend(
                        [_ce_endpoint] * ce_weight_dict.get(_ce_endpoint, 0))
                tmpLog.debug(
                    'queue_status_dict: {0} ; worker_ce_stats_dict: {1} ; ce_weight_dict: {2}'
                    .format(queue_status_dict, worker_ce_stats_dict,
                            ce_weight_dict))
                try:
                    if len(good_ce_weighted_list) > 0:
                        ce_info_dict = ce_auxilary_dict[random.choice(
                            good_ce_weighted_list)].copy()
                    else:
                        tmpLog.info(
                            'No good CE endpoint left. Choose an arbitrary CE endpoint'
                        )
                        ce_info_dict = random.choice(
                            list(ce_auxilary_dict.values())).copy()
                except IndexError:
                    tmpLog.error('No valid CE endpoint found')
                    ce_info_dict = {}
                    to_submit = False
                else:
                    ce_endpoint_from_queue = ce_info_dict.get(
                        'ce_endpoint', '')
                    ce_flavour_str = str(ce_info_dict.get('ce_flavour',
                                                          '')).lower()
                    ce_version_str = str(ce_info_dict.get('ce_version',
                                                          '')).lower()
                    ce_info_dict['ce_hostname'] = re.sub(
                        ':\w*', '', ce_endpoint_from_queue)
                    tmpLog.debug(
                        'For site {0} got CE endpoint: "{1}", flavour: "{2}"'.
                        format(self.queueName, ce_endpoint_from_queue,
                               ce_flavour_str))
                    if os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                        sdf_template_filename = '{ce_flavour_str}.sdf'.format(
                            ce_flavour_str=ce_flavour_str)
                        self.templateFile = os.path.join(
                            self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # Manually define site condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(
                            self.ceHostname,
                            list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint,
                                      list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict[
                                'ce_endpoint'] = random.choice(
                                    list(zip(self.ceHostname,
                                             self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(
                                self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass

            # Choose from Condor schedd and central managers
            if isinstance(self.condorSchedd,
                          list) and len(self.condorSchedd) > 0:
                if isinstance(self.condorPool,
                              list) and len(self.condorPool) > 0:
                    condor_schedd, condor_pool = random.choice(
                        list(zip(self.condorSchedd, self.condorPool)))
                else:
                    condor_schedd = random.choice(self.condorSchedd)
                    condor_pool = self.condorPool
            else:
                condor_schedd = self.condorSchedd
                condor_pool = self.condorPool

            # Log Base URL
            if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                schedd_hostname = re.sub(
                    r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                    lambda matchobj: matchobj.group(1)
                    if matchobj.group(1) else '', condor_schedd)
                log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname,
                                      self.logBaseURL)
            else:
                log_base_url = self.logBaseURL

            # template for batch script
            try:
                tmpFile = open(self.templateFile)
                sdf_template = tmpFile.read()
                tmpFile.close()
            except AttributeError:
                tmpLog.error(
                    'No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found'
                )
                to_submit = False
            else:
                # get batch_log, stdout, stderr filename
                for _line in sdf_template.split('\n'):
                    if _line.startswith('#'):
                        continue
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue

                # get override requirements from queue configured
                try:
                    n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
                except AttributeError:
                    n_core_per_node = n_core_per_node_from_queue

                # URLs for log files
                if not (log_base_url is None):
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(
                        value_str=batch_log_value,
                        file_dir=log_subdir_path,
                        batchID=batchID,
                        guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(
                        value_str=stdout_value,
                        file_dir=log_subdir_path,
                        batchID=batchID,
                        guess=guess)
                    stderr_path_filename = parse_batch_job_filename(
                        value_str=stderr_value,
                        file_dir=log_subdir_path,
                        batchID=batchID,
                        guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir,
                                                     batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(
                        log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(
                        log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')

                tmpLog.debug('Done jobspec attribute setting')

                # set data dict
                data.update({
                    'workspec': workspec,
                    'to_submit': to_submit,
                    'template': sdf_template,
                    'log_dir': self.logDir,
                    'log_subdir': log_subdir,
                    'n_core_per_node': n_core_per_node,
                    'panda_queue_name': panda_queue_name,
                    'x509_user_proxy': self.x509UserProxy,
                    'ce_info_dict': ce_info_dict,
                    'batch_log_dict': batch_log_dict,
                    'special_par': special_par,
                    'harvester_queue_config': harvester_queue_config,
                    'is_unified_queue': is_unified_queue,
                    'condor_schedd': condor_schedd,
                    'condor_pool': condor_pool,
                    'use_spool': self.useSpool,
                })

            return data
Code example #53
0
File: saga_sweeper.py  Project: wguanicedew/harvester
 def __init__(self, **kwarg):
     PluginBase.__init__(self, **kwarg)
     tmpLog = core_utils.make_logger(baseLogger, method_name='__init__')
     tmpLog.info("[{0}] SAGA adaptor will be used".format(self.adaptor))
Code example #54
0
    end_job_id = int(sys.argv[3])

queueConfigMapper = QueueConfigMapper()
queueConfig = queueConfigMapper.get_queue(queueName)
initial_queueConfig_stager = queueConfig.stager
queueConfig.stager['module'] = 'pandaharvester.harvesterstager.go_bulk_stager'
queueConfig.stager['name'] = 'GlobusBulkStager'
modified_queueConfig_stager = queueConfig.stager

pluginFactory = PluginFactory()
# get stage-out plugin
stagerCore = pluginFactory.get_plugin(queueConfig.stager)

# logger
_logger = core_utils.setup_logger('stageOutTest_go_bulk_stager')
tmpLog = core_utils.make_logger(_logger,
                                method_name='stageOutTest_go_bulk_stager')
tmpLog.debug('start')

for loggerName, loggerObj in logging.Logger.manager.loggerDict.items():
    #print "loggerName - {}".format(loggerName)
    if loggerName.startswith('panda.log'):
        if len(loggerObj.handlers) == 0:
            continue
        if loggerName.split('.')[-1] in ['db_proxy']:
            continue
        stdoutHandler = logging.StreamHandler(sys.stdout)
        stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
        loggerObj.addHandler(stdoutHandler)

msgStr = "plugin={0}".format(stagerCore.__class__.__name__)
tmpLog.debug(msgStr)
Code example #55
0
 def define_num_workers(self, static_num_workers, site_name):
     tmpLog = core_utils.make_logger(_logger,
                                     'site={0}'.format(site_name),
                                     method_name='define_num_workers')
     tmpLog.debug('start')
     dyn_num_workers = copy.copy(static_num_workers)
     try:
         # get queue status
         queueStat = self.dbProxy.get_cache("panda_queues.json", None)
         if queueStat is None:
             queueStat = dict()
         else:
             queueStat = queueStat.data
         # define num of new workers
         for queueName, tmpVal in iteritems(static_num_workers):
             # set 0 to num of new workers when the queue is disabled
             if queueName in queueStat and queueStat[queueName][
                     'status'] in ['offline']:
                 dyn_num_workers[queueName]['nNewWorkers'] = 0
                 retMsg = 'set nNewWorkers=0 since status={0}'.format(
                     queueStat[queueName]['status'])
                 tmpLog.debug(retMsg)
                 continue
             # get queue
             queueConfig = self.queueConfigMapper.get_queue(queueName)
             # get throttler
             if queueName not in self.throttlerMap:
                 if hasattr(queueConfig, 'throttler'):
                     throttler = self.pluginFactory.get_plugin(
                         queueConfig.throttler)
                 else:
                     throttler = None
                 self.throttlerMap[queueName] = throttler
             # check throttler
             throttler = self.throttlerMap[queueName]
             if throttler is not None:
                 toThrottle, tmpMsg = throttler.to_be_throttled(queueConfig)
                 if toThrottle:
                     dyn_num_workers[queueName]['nNewWorkers'] = 0
                     retMsg = 'set nNewWorkers=0 by {0}:{1}'.format(
                         throttler.__class__.__name__, tmpMsg)
                     tmpLog.debug(retMsg)
                     continue
             # check stats
             nQueue = tmpVal['nQueue']
             nReady = tmpVal['nReady']
             nRunning = tmpVal['nRunning']
             nQueueLimit = queueConfig.nQueueLimitWorker
             maxWorkers = queueConfig.maxWorkers
             if queueConfig.runMode == 'slave':
                 nNewWorkersDef = tmpVal['nNewWorkers']
                 if nNewWorkersDef == 0:
                     dyn_num_workers[queueName]['nNewWorkers'] = 0
                     retMsg = 'set nNewWorkers=0 by panda in slave mode'
                     tmpLog.debug(retMsg)
                     continue
             else:
                 nNewWorkersDef = None
             # define num of new workers based on static site config
             nNewWorkers = 0
             if nQueueLimit > 0 and nQueue >= nQueueLimit:
                 # enough queued workers
                 retMsg = 'No nNewWorkers since nQueue({0})>=nQueueLimit({1})'.format(
                     nQueue, nQueueLimit)
                 tmpLog.debug(retMsg)
                 pass
             elif maxWorkers > 0 and (nQueue + nReady +
                                      nRunning) >= maxWorkers:
                 # enough workers in the system
                 retMsg = 'No nNewWorkers since nQueue({0}) + nReady({1}) + nRunning({2}) '.format(
                     nQueue, nReady, nRunning)
                 retMsg += '>= maxWorkers({0})'.format(maxWorkers)
                 tmpLog.debug(retMsg)
                 pass
             else:
                 # get max number of queued workers
                 maxQueuedWorkers = 0
                 if nQueueLimit > 0:
                     maxQueuedWorkers = nQueueLimit
                 if maxQueuedWorkers == 0:
                     if nNewWorkersDef is not None:
                         # slave mode
                         maxQueuedWorkers = nNewWorkersDef + nQueue
                     else:
                         # use default value
                         maxQueuedWorkers = 1
                 # new workers
                 nNewWorkers = max(maxQueuedWorkers - nQueue, 0)
                 if maxWorkers > 0:
                     nNewWorkers = min(
                         nNewWorkers,
                         max(maxWorkers - nQueue - nReady - nRunning, 0))
             if queueConfig.maxNewWorkersPerCycle > 0:
                 nNewWorkers = min(nNewWorkers,
                                   queueConfig.maxNewWorkersPerCycle)
             dyn_num_workers[queueName]['nNewWorkers'] = nNewWorkers
         # dump
         tmpLog.debug('defined {0}'.format(str(dyn_num_workers)))
         return dyn_num_workers
     except Exception:
         # dump error
         errMsg = core_utils.dump_error_message(tmpLog)
         return None
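
To make the worker-sizing arithmetic above concrete, here is a small worked example with made-up numbers (slave mode and throttling aside):

# Hypothetical inputs
nQueueLimit, maxWorkers = 10, 20
nQueue, nReady, nRunning = 4, 1, 8

maxQueuedWorkers = nQueueLimit                               # 10, since nQueueLimit > 0
nNewWorkers = max(maxQueuedWorkers - nQueue, 0)              # 10 - 4 = 6
nNewWorkers = min(nNewWorkers,
                  max(maxWorkers - nQueue - nReady - nRunning, 0))   # min(6, 7) = 6
print(nNewWorkers)   # 6, before any maxNewWorkersPerCycle cap is applied
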
Code example #56
0
 def check_status(self, jobspec):
     # make logger
     tmpLog = core_utils.make_logger(_logger,
                                     'PandaID={0}'.format(jobspec.PandaID),
                                     method_name='check_status')
     tmpLog.debug('start')
     # default return
     tmpRetVal = (True, '')
     # check that jobspec.computingSite is defined
     if jobspec.computingSite is None:
         # not found
         tmpLog.error('jobspec.computingSite is not defined')
         return False, 'jobspec.computingSite is not defined'
     else:
         tmpLog.debug('jobspec.computingSite : {0}'.format(
             jobspec.computingSite))
     # test we have a Globus Transfer Client
     if not self.tc:
         errStr = 'failed to get Globus Transfer Client'
         tmpLog.error(errStr)
         return False, errStr
     # set transferID to None
     transferID = None
     # get transfer groups
     groups = jobspec.get_groups_of_output_files()
     tmpLog.debug(
         'jobspec.get_groups_of_output_files() = : {0}'.format(groups))
     # lock if the dummy transfer ID is used to avoid submitting duplicated transfer requests
     if self.dummy_transfer_id in groups:
         # lock for 120 sec
         if not self.have_db_lock:
             tmpLog.debug(
                 'attempt to set DB lock for self.id - {0} self.dummy_transfer_id - {1}'
                 .format(self.id, self.dummy_transfer_id))
             self.have_db_lock = self.dbInterface.get_object_lock(
                 self.dummy_transfer_id, lock_interval=120)
         if not self.have_db_lock:
             # escape since locked by another thread
             msgStr = 'escape since locked by another thread'
             tmpLog.debug(msgStr)
             return None, msgStr
         # refresh group information since that could have been updated by another thread before getting the lock
         self.dbInterface.refresh_file_group_info(jobspec)
         # get transfer groups again with refreshed info
         groups = jobspec.get_groups_of_output_files()
         # the dummy transfer ID is still there
         if self.dummy_transfer_id in groups:
             groupUpdateTime = groups[
                 self.dummy_transfer_id]['groupUpdateTime']
             # get files with the dummy transfer ID across jobs
             fileSpecs = self.dbInterface.get_files_with_group_id(
                 self.dummy_transfer_id)
             # submit transfer if there are more than 10 files or the group was made before more than 10 min
             msgStr = 'self.dummy_transfer_id = {0}  number of files = {1}'.format(
                 self.dummy_transfer_id, len(fileSpecs))
             tmpLog.debug(msgStr)
             if len(fileSpecs) >= 10 or \
                     groupUpdateTime < datetime.datetime.utcnow() - datetime.timedelta(minutes=10):
                 tmpLog.debug('prepare to transfer files')
                 # submit transfer and get a real transfer ID
                 # set the Globus destination Endpoint id and path will get them from Agis eventually
                 from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
                 queueConfigMapper = QueueConfigMapper()
                 queueConfig = queueConfigMapper.get_queue(
                     jobspec.computingSite)
                 #self.Globus_srcPath = queueConfig.stager['Globus_srcPath']
                 self.srcEndpoint = queueConfig.stager['srcEndpoint']
                 self.Globus_srcPath = self.basePath
                 self.Globus_dstPath = queueConfig.stager['Globus_dstPath']
                 self.dstEndpoint = queueConfig.stager['dstEndpoint']
                 # Test the endpoints and create the transfer data class
                 errMsg = None
                 try:
                     # Test endpoints for activation
                     tmpStatsrc, srcStr = globus_utils.check_endpoint_activation(
                         tmpLog, self.tc, self.srcEndpoint)
                     tmpStatdst, dstStr = globus_utils.check_endpoint_activation(
                         tmpLog, self.tc, self.dstEndpoint)
                     if tmpStatsrc and tmpStatdst:
                         errStr = 'source Endpoint and destination Endpoint activated'
                         tmpLog.debug(errStr)
                     else:
                         errMsg = ''
                         if not tmpStatsrc:
                             errMsg += ' source Endpoint not activated '
                         if not tmpStatdst:
                             errMsg += ' destination Endpoint not activated '
                         # release process lock
                         tmpLog.debug(
                             'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'
                             .format(self.id, self.dummy_transfer_id))
                         self.have_db_lock = self.dbInterface.release_object_lock(
                             self.dummy_transfer_id)
                         if not self.have_db_lock:
                             errMsg += ' - Could not release DB lock for {}'.format(
                                 self.dummy_transfer_id)
                         tmpLog.error(errMsg)
                         tmpRetVal = (None, errMsg)
                         return tmpRetVal
                     # both endpoints activated now prepare to transfer data
                     tdata = TransferData(self.tc,
                                          self.srcEndpoint,
                                          self.dstEndpoint,
                                          sync_level="checksum")
                 except:
                     errStat, errMsg = globus_utils.handle_globus_exception(
                         tmpLog)
                     # release process lock
                     tmpLog.debug(
                         'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'
                         .format(self.id, self.dummy_transfer_id))
                     self.have_db_lock = self.dbInterface.release_object_lock(
                         self.dummy_transfer_id)
                     if not self.have_db_lock:
                         errMsg += ' - Could not release DB lock for {}'.format(
                             self.dummy_transfer_id)
                     tmpLog.error(errMsg)
                     tmpRetVal = (errStat, errMsg)
                     return tmpRetVal
                 # loop over all files
                 for fileSpec in fileSpecs:
                     attrs = jobspec.get_output_file_attributes()
                     msgStr = "len(jobSpec.get_output_file_attributes()) = {0} type - {1}".format(
                         len(attrs), type(attrs))
                     tmpLog.debug(msgStr)
                     for key, value in attrs.items():
                         msgStr = "output file attributes - {0} {1}".format(
                             key, value)
                         tmpLog.debug(msgStr)
                     msgStr = "fileSpec.lfn - {0} fileSpec.scope - {1}".format(
                         fileSpec.lfn, fileSpec.scope)
                     tmpLog.debug(msgStr)
                     scope = fileSpec.scope
                     hash_obj = hashlib.md5()
                     hash_obj.update(('%s:%s' % (scope, fileSpec.lfn)).encode('utf-8'))
                     hash_hex = hash_obj.hexdigest()
                     correctedscope = "/".join(scope.split('.'))
                     srcURL = fileSpec.path
                     dstURL = "{endPoint}/{scope}/{hash1}/{hash2}/{lfn}".format(
                         endPoint=self.Globus_dstPath,
                         scope=correctedscope,
                         hash1=hash_hex[0:2],
                         hash2=hash_hex[2:4],
                         lfn=fileSpec.lfn)
                     tmpLog.debug('src={srcURL} dst={dstURL}'.format(
                         srcURL=srcURL, dstURL=dstURL))
                     # add files to transfer object - tdata
                     if os.access(srcURL, os.R_OK):
                         tmpLog.debug("tdata.add_item({},{})".format(
                             srcURL, dstURL))
                         tdata.add_item(srcURL, dstURL)
                     else:
                         errMsg = "source file {} does not exist".format(
                             srcURL)
                         # release process lock
                         tmpLog.debug(
                             'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'
                             .format(self.id, self.dummy_transfer_id))
                         self.have_db_lock = self.dbInterface.release_object_lock(
                             self.dummy_transfer_id)
                         if not self.have_db_lock:
                             errMsg += ' - Could not release DB lock for {}'.format(
                                 self.dummy_transfer_id)
                         tmpLog.error(errMsg)
                         tmpRetVal = (False, errMsg)
                         return tmpRetVal
                 # submit transfer
                 try:
                     transfer_result = self.tc.submit_transfer(tdata)
                     # check status code and message
                     tmpLog.debug(str(transfer_result))
                     if transfer_result['code'] == "Accepted":
                         # succeeded
                         # set transfer ID which are used for later lookup
                         transferID = transfer_result['task_id']
                         tmpLog.debug(
                             'successfully submitted id={0}'.format(
                                 transferID))
                         # set status for files
                         self.dbInterface.set_file_group(
                             fileSpecs, transferID, 'running')
                         msgStr = 'submitted transfer with ID={0}'.format(
                             transferID)
                         tmpLog.debug(msgStr)
                     else:
                         # release process lock
                         tmpLog.debug(
                             'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'
                             .format(self.id, self.dummy_transfer_id))
                         self.have_db_lock = self.dbInterface.release_object_lock(
                             self.dummy_transfer_id)
                         if not self.have_db_lock:
                             errMsg = 'Could not release DB lock for {}'.format(
                                 self.dummy_transfer_id)
                             tmpLog.error(errMsg)
                         tmpRetVal = (None, transfer_result['message'])
                         return tmpRetVal
                 except Exception as e:
                     errStat, errMsg = globus_utils.handle_globus_exception(
                         tmpLog)
                     # release process lock
                     tmpLog.debug(
                         'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'
                         .format(self.id, self.dummy_transfer_id))
                     self.have_db_lock = self.dbInterface.release_object_lock(
                         self.dummy_transfer_id)
                     if not self.have_db_lock:
                         errMsg += ' - Could not release DB lock for {}'.format(
                             self.dummy_transfer_id)
                     tmpLog.error(errMsg)
                     return errStat, errMsg
             else:
                 msgStr = 'wait until enough files are pooled'
                 tmpLog.debug(msgStr)
             # release the lock
             tmpLog.debug(
                 'attempt to release DB lock for self.id - {0} self.dummy_transfer_id - {1}'
                 .format(self.id, self.dummy_transfer_id))
             self.have_db_lock = self.dbInterface.release_object_lock(
                 self.dummy_transfer_id)
             if not self.have_db_lock:
                 msgStr += ' - Could not release DB lock for {}'.format(
                     self.dummy_transfer_id)
                 tmpLog.error(msgStr)
             # return None to retry later
             return None, msgStr
     # check transfer with real transfer IDs
     # get transfer groups
     groups = jobspec.get_groups_of_output_files()
     for transferID in groups:
         if transferID != self.dummy_transfer_id:
             # get transfer task
             tmpStat, transferTasks = globus_utils.get_transfer_task_by_id(
                 tmpLog, self.tc, transferID)
             # return a temporary error when failed to get task
             if not tmpStat:
                 errStr = 'failed to get transfer task'
                 tmpLog.error(errStr)
                 return None, errStr
             # return a temporary error when task is missing
             if transferID not in transferTasks:
                 errStr = 'transfer task ID - {} is missing'.format(
                     transferID)
                 tmpLog.error(errStr)
                 return None, errStr
             # succeeded in finding a transfer task by tranferID
             if transferTasks[transferID]['status'] == 'SUCCEEDED':
                 tmpLog.debug(
                     'transfer task {} succeeded'.format(transferID))
                 self.set_FileSpec_status(jobspec, 'finished')
                 return True, ''
             # failed
             if transferTasks[transferID]['status'] == 'FAILED':
                 errStr = 'transfer task {} failed'.format(transferID)
                 tmpLog.error(errStr)
                 self.set_FileSpec_status(jobspec, 'failed')
                 return False, errStr
             # another status
             tmpStr = 'transfer task {0} status: {1}'.format(
                 transferID, transferTasks[transferID]['status'])
             tmpLog.debug(tmpStr)
             return None, ''
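
The destination URL built in the file loop above follows the usual Rucio-style deterministic layout: two directory levels are taken from the MD5 of "scope:lfn", and dots in the scope become path components. A small self-contained sketch of that mapping (the function name is illustrative and not part of the plugin):

import hashlib

def rucio_style_dst(dst_base, scope, lfn):
    # two directory levels come from md5("scope:lfn")
    hash_hex = hashlib.md5(('%s:%s' % (scope, lfn)).encode('utf-8')).hexdigest()
    scope_path = '/'.join(scope.split('.'))
    return '{0}/{1}/{2}/{3}/{4}'.format(dst_base, scope_path,
                                        hash_hex[0:2], hash_hex[2:4], lfn)

# e.g. rucio_style_dst('/atlasscratchdisk/rucio', 'mc16_13TeV', 'EVNT.01234.pool.root')
# -> '/atlasscratchdisk/rucio/mc16_13TeV/xx/yy/EVNT.01234.pool.root'
# where xx/yy are the first four hex digits of the hash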
Code example #57
    def submit_workers(self, workspec_list):
        retList = []
        for workSpec in workspec_list:

            tmpLog = core_utils.make_logger(baseLogger,
                                            'workerID={0}'.format(
                                                workSpec.workerID),
                                            method_name='submit_workers')

            queueconfigmapper = QueueConfigMapper()
            queueconfig = queueconfigmapper.get_queue(workSpec.computingSite)
            prodSourceLabel = queueconfig.get_source_label()

            # If jobSpec is defined we are in push mode, if not pull mode
            # Both assume one to one worker to job mapping
            jobSpec = workSpec.get_jobspec_list()
            if jobSpec:
                jobSpec = jobSpec[0]
                tmpLog.debug("JobSpec: {0}".format(jobSpec.values_map()))

            desc = {}
            # If we need to prefetch events, set aCT status waiting.
            # feed_events in act_messenger will fill events and release the job
            if queueconfig.prefetchEvents:
                desc['pandastatus'] = 'waiting'
                desc['actpandastatus'] = 'waiting'
                desc['arcjobid'] = -1  # dummy id to prevent submission
            else:
                desc['pandastatus'] = 'sent'
                desc['actpandastatus'] = 'sent'
            desc['siteName'] = workSpec.computingSite
            role = 'pilot' if prodSourceLabel == 'user' else 'production'
            desc['proxyid'] = self.proxymap[role]
            desc['sendhb'] = 0
            metadata = {
                'harvesteraccesspoint': workSpec.get_access_point(),
                'schedulerid': 'harvester-{}'.format(harvester_config.master.harvester_id)
            }
            desc['metadata'] = json.dumps(metadata)

            if jobSpec:
                # push mode: aCT takes the url-encoded job description (like it gets from panda server)
                pandaid = jobSpec.PandaID
                actjobdesc = urllib.urlencode(jobSpec.jobParams)
            else:
                # pull mode: just set pandaid (to workerid) and prodsourcelabel
                pandaid = workSpec.workerID
                actjobdesc = 'PandaID=%d&prodSourceLabel=%s' % (
                    pandaid, prodSourceLabel)

            tmpLog.info("Inserting job {0} into aCT DB: {1}".format(
                pandaid, str(desc)))
            try:
                batchid = self.actDB.insertJob(pandaid, actjobdesc,
                                               desc)['LAST_INSERT_ID()']
            except Exception as e:
                result = (False,
                          "Failed to insert job into aCT DB: {0}".format(
                              str(e)))
            else:
                tmpLog.info("aCT batch id {0}".format(batchid))
                workSpec.batchID = str(batchid)
                workSpec.submissionHost = self.hostname
                workSpec.nativeStatus = desc['actpandastatus']
                # Set log files in workSpec
                today = time.strftime('%Y-%m-%d', time.gmtime())
                logurl = '/'.join([
                    queueconfig.submitter.get('logBaseURL'), today,
                    workSpec.computingSite,
                    str(pandaid)
                ])
                workSpec.set_log_file('batch_log', '{0}.log'.format(logurl))
                workSpec.set_log_file('stdout', '{0}.out'.format(logurl))
                workSpec.set_log_file('stderr', '{0}.err'.format(logurl))
                workSpec.set_log_file('jdl', '{0}.jdl'.format(logurl))
                result = (True, '')
            retList.append(result)

        return retList
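
In pull mode the job description above is assembled by hand; the same string can also be produced with urlencode, which is what the push branch does for the full jobParams. A small illustration with made-up values:

try:
    from urllib.parse import urlencode   # Python 3
except ImportError:
    from urllib import urlencode         # Python 2

# pull mode: only the worker-derived PandaID and the source label are needed
actjobdesc = urlencode({'PandaID': 12345, 'prodSourceLabel': 'managed'})
# -> 'PandaID=12345&prodSourceLabel=managed'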
Code example #58
    def run(self):
        lockedBy = 'submitter-{0}'.format(self.ident)
        while True:
            mainLog = core_utils.make_logger(_logger,
                                             'id={0}'.format(lockedBy),
                                             method_name='run')
            mainLog.debug('getting queues to submit workers')

            # get queues associated to a site to submit workers
            curWorkers, siteName, resMap = self.dbProxy.get_queues_to_submit(
                harvester_config.submitter.nQueues,
                harvester_config.submitter.lookupTime,
                harvester_config.submitter.lockInterval)
            if siteName is not None:
                mainLog.debug('got {0} queues for site {1}'.format(
                    len(curWorkers), siteName))

                # get commands
                comStr = '{0}:{1}'.format(CommandSpec.COM_setNWorkers,
                                          siteName)
                commandSpecs = self.dbProxy.get_commands_for_receiver(
                    'submitter', comStr)
                mainLog.debug('got {0} commands'.format(len(commandSpecs)))
                for commandSpec in commandSpecs:
                    newLimits = self.dbProxy.set_queue_limit(
                        siteName, commandSpec.params)
                    for tmpResource, tmpNewVal in iteritems(newLimits):
                        # if available, overwrite new worker value with the command from panda server
                        if tmpResource in resMap:
                            tmpQueueName = resMap[tmpResource]
                            if tmpQueueName in curWorkers:
                                curWorkers[tmpQueueName][tmpResource][
                                    'nNewWorkers'] = tmpNewVal

                # define number of new workers
                if len(curWorkers) == 0:
                    n_workers_per_queue_and_rt = dict()
                else:
                    n_workers_per_queue_and_rt = self.workerAdjuster.define_num_workers(
                        curWorkers, siteName)

                if n_workers_per_queue_and_rt is None:
                    mainLog.error(
                        'WorkerAdjuster failed to define the number of workers'
                    )
                elif len(n_workers_per_queue_and_rt) == 0:
                    pass
                else:
                    # loop over all queues and resource types
                    for queueName in n_workers_per_queue_and_rt:
                        for resource_type, tmpVal in iteritems(
                                n_workers_per_queue_and_rt[queueName]):

                            tmpLog = core_utils.make_logger(
                                _logger,
                                'id={0} queue={1} resource_type={2}'.format(
                                    lockedBy, queueName, resource_type),
                                method_name='run')
                            tmpLog.debug('start')
                            nWorkers = tmpVal['nNewWorkers'] + tmpVal['nReady']
                            nReady = tmpVal['nReady']

                            # check queue
                            if not self.queueConfigMapper.has_queue(queueName):
                                tmpLog.error('config not found')
                                continue

                            # no new workers
                            if nWorkers == 0:
                                tmpLog.debug(
                                    'skipped since no new worker is needed based on current stats'
                                )
                                continue
                            # get queue
                            queueConfig = self.queueConfigMapper.get_queue(
                                queueName)

                            # actions based on mapping type
                            if queueConfig.mapType == WorkSpec.MT_NoJob:
                                # workers without jobs
                                jobChunks = []
                                for i in range(nWorkers):
                                    jobChunks.append([])
                            elif queueConfig.mapType == WorkSpec.MT_OneToOne:
                                # one worker per one job
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady, 1, None,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy)
                            elif queueConfig.mapType == WorkSpec.MT_MultiJobs:
                                # one worker for multiple jobs
                                nJobsPerWorker = self.workerMaker.get_num_jobs_per_worker(
                                    queueConfig, nWorkers, resource_type)
                                tmpLog.debug('nJobsPerWorker={0}'.format(
                                    nJobsPerWorker))
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady,
                                    nJobsPerWorker, None,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy, queueConfig.allowJobMixture)
                            elif queueConfig.mapType == WorkSpec.MT_MultiWorkers:
                                # multiple workers for one job
                                nWorkersPerJob = self.workerMaker.get_num_workers_per_job(
                                    queueConfig, nWorkers, resource_type)
                                jobChunks = self.dbProxy.get_job_chunks_for_workers(
                                    queueName, nWorkers, nReady, None,
                                    nWorkersPerJob,
                                    queueConfig.useJobLateBinding,
                                    harvester_config.submitter.checkInterval,
                                    harvester_config.submitter.lockInterval,
                                    lockedBy)
                            else:
                                tmpLog.error('unknown mapType={0}'.format(
                                    queueConfig.mapType))
                                continue

                            tmpLog.debug('got {0} job chunks'.format(
                                len(jobChunks)))
                            if len(jobChunks) == 0:
                                continue
                            # make workers
                            okChunks, ngChunks = self.workerMaker.make_workers(
                                jobChunks, queueConfig, nReady, resource_type)
                            if len(ngChunks) == 0:
                                tmpLog.debug(
                                    'successfully made {0} workers'.format(
                                        len(okChunks)))
                            else:
                                tmpLog.debug(
                                    'made {0} workers, while {1} workers failed'
                                    .format(len(okChunks), len(ngChunks)))
                            timeNow = datetime.datetime.utcnow()
                            # NG (=not good)
                            for ngJobs in ngChunks:
                                for jobSpec in ngJobs:
                                    jobSpec.status = 'failed'
                                    jobSpec.subStatus = 'failedtomake'
                                    jobSpec.stateChangeTime = timeNow
                                    jobSpec.lockedBy = None
                                    jobSpec.trigger_propagation()
                                    self.dbProxy.update_job(
                                        jobSpec, {
                                            'lockedBy': lockedBy,
                                            'subStatus': 'prepared'
                                        })
                            # OK
                            pandaIDs = set()
                            workSpecList = []
                            if len(okChunks) > 0:
                                for workSpec, okJobs in okChunks:
                                    # has job
                                    if (queueConfig.useJobLateBinding and workSpec.workerID is None) \
                                            or queueConfig.mapType == WorkSpec.MT_NoJob:
                                        workSpec.hasJob = 0
                                    else:
                                        workSpec.hasJob = 1
                                        if workSpec.nJobsToReFill in [None, 0]:
                                            workSpec.set_jobspec_list(okJobs)
                                        else:
                                            # refill free slots during the worker is running
                                            workSpec.set_jobspec_list(
                                                okJobs[:workSpec.nJobsToReFill])
                                            # jobs not kept for refilling are released below
                                            for jobSpec in okJobs[workSpec.nJobsToReFill:]:
                                                pandaIDs.add(jobSpec.PandaID)
                                            workSpec.nJobsToReFill = None
                                    # map type
                                    workSpec.mapType = queueConfig.mapType
                                    # queue name
                                    workSpec.computingSite = queueConfig.queueName
                                    # set access point
                                    workSpec.accessPoint = queueConfig.messenger[
                                        'accessPoint']
                                    # events
                                    if len(okJobs) > 0 and (
                                            'eventService'
                                            in okJobs[0].jobParams or
                                            'cloneJob' in okJobs[0].jobParams):
                                        workSpec.eventsRequest = WorkSpec.EV_useEvents
                                    workSpecList.append(workSpec)
                            if len(workSpecList) > 0:
                                # get plugin for submitter
                                submitterCore = self.pluginFactory.get_plugin(
                                    queueConfig.submitter)
                                if submitterCore is None:
                                    # not found
                                    tmpLog.error(
                                        'submitter plugin for {0} not found'.format(queueName))
                                    continue
                                # get plugin for messenger
                                messenger = self.pluginFactory.get_plugin(
                                    queueConfig.messenger)
                                if messenger is None:
                                    # not found
                                    tmpLog.error(
                                        'messenger plugin for {0} not found'.format(queueName))
                                    continue
                                # setup access points
                                messenger.setup_access_points(workSpecList)
                                # feed jobs
                                for workSpec in workSpecList:
                                    if workSpec.hasJob == 1:
                                        tmpStat = messenger.feed_jobs(
                                            workSpec,
                                            workSpec.get_jobspec_list())
                                        if tmpStat is False:
                                            tmpLog.error(
                                                'failed to send jobs to workerID={0}'
                                                .format(workSpec.workerID))
                                        else:
                                            tmpLog.debug(
                                                'sent jobs to workerID={0} with {1}'
                                                .format(
                                                    workSpec.workerID,
                                                    tmpStat))
                                # insert workers
                                self.dbProxy.insert_workers(
                                    workSpecList, lockedBy)
                                # submit
                                tmpLog.info('submitting {0} workers'.format(
                                    len(workSpecList)))
                                workSpecList, tmpRetList, tmpStrList = self.submit_workers(
                                    submitterCore, workSpecList)
                                for iWorker, (tmpRet, tmpStr) in enumerate(
                                        zip(tmpRetList, tmpStrList)):
                                    workSpec, jobList = okChunks[iWorker]
                                    # use associated job list since it can be truncated for re-filling
                                    jobList = workSpec.get_jobspec_list()
                                    # set status
                                    if not tmpRet:
                                        # failed submission
                                        tmpLog.error(
                                            'failed to submit a workerID={0} with {1}'
                                            .format(workSpec.workerID, tmpStr))
                                        workSpec.set_status(WorkSpec.ST_missed)
                                        workSpec.set_dialog_message(tmpStr)
                                        jobList = []
                                    elif queueConfig.useJobLateBinding and workSpec.hasJob == 1:
                                        # directly go to running after feeding jobs for late biding
                                        workSpec.set_status(
                                            WorkSpec.ST_running)
                                    else:
                                        # normal successful submission
                                        workSpec.set_status(
                                            WorkSpec.ST_submitted)
                                    workSpec.submitTime = timeNow
                                    workSpec.modificationTime = timeNow
                                    # prefetch events
                                    if tmpRet and workSpec.hasJob == 1 and workSpec.eventsRequest == WorkSpec.EV_useEvents:
                                        workSpec.eventsRequest = WorkSpec.EV_requestEvents
                                        eventsRequestParams = dict()
                                        for jobSpec in jobList:
                                            eventsRequestParams[jobSpec.PandaID] = {
                                                'pandaID': jobSpec.PandaID,
                                                'taskID': jobSpec.taskID,
                                                'jobsetID': jobSpec.jobParams['jobsetID'],
                                                'nRanges': jobSpec.jobParams['coreCount'],
                                            }
                                        workSpec.eventsRequestParams = eventsRequestParams
                                    # register worker
                                    tmpStat = self.dbProxy.register_worker(
                                        workSpec, jobList, lockedBy)
                                    if jobList is not None:
                                        for jobSpec in jobList:
                                            pandaIDs.add(jobSpec.PandaID)
                                            if tmpStat:
                                                tmpStr = 'submitted a workerID={0} for PandaID={1} with batchID={2}'
                                                tmpLog.info(
                                                    tmpStr.format(
                                                        workSpec.workerID,
                                                        jobSpec.PandaID,
                                                        workSpec.batchID))
                                            else:
                                                tmpStr = 'failed to register a worker for PandaID={0} with batchID={1}'
                                                tmpLog.error(
                                                    tmpStr.format(
                                                        jobSpec.PandaID,
                                                        workSpec.batchID))
                            # release jobs
                            self.dbProxy.release_jobs(pandaIDs, lockedBy)
                            tmpLog.info('done')
            mainLog.debug('done')
            # define sleep interval
            if siteName is None:
                sleepTime = harvester_config.submitter.sleepTime
            else:
                sleepTime = 0
            # check if being terminated
            if self.terminated(sleepTime):
                mainLog.debug('terminated')
                return
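
The mapType dispatch in the middle of the loop decides how jobs are grouped into workers. A condensed sketch of that decision, assuming the WorkSpec constants used above; the import path follows the package layout seen in these snippets (queue_config_mapper is imported from pandaharvester.harvestercore), and the helper itself is illustrative rather than part of the agent:

from pandaharvester.harvestercore.work_spec import WorkSpec

def chunking_plan(map_type, n_workers, n_jobs_per_worker=1, n_workers_per_job=1):
    # returns (number of chunks, jobs per chunk, workers per job)
    if map_type == WorkSpec.MT_NoJob:
        return n_workers, 0, 1            # empty chunks: workers without jobs
    if map_type == WorkSpec.MT_OneToOne:
        return n_workers, 1, 1            # one job per worker
    if map_type == WorkSpec.MT_MultiJobs:
        return n_workers, n_jobs_per_worker, 1
    if map_type == WorkSpec.MT_MultiWorkers:
        return n_workers, 1, n_workers_per_job
    raise ValueError('unknown mapType={0}'.format(map_type))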