def __init__(self, communicator, queue_config_mapper, single_mode=False):
    """Initialise the agent with its communicator, queue configuration and DB access.

    :param communicator: object kept as ``self.communicator``
    :param queue_config_mapper: object kept as ``self.queueConfigMapper``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    # collaborators handed in by the caller
    self.communicator = communicator
    self.queueConfigMapper = queue_config_mapper
    # resources created locally
    self.db_proxy = DBProxy()
    self.nodeName = socket.gethostname()
    # starts out unset
    self.lastHeartbeat = None
def kill_workers(arguments):
    """Request the kill of all workers matching the command-line filters.

    Each filter list on ``arguments`` (``status``, ``sites``, ``ces``,
    ``submissionhosts``) is passed to ``DBProxy.kill_workers_by_query``; a list
    holding only the single entry ``'ALL'`` is collapsed to the plain string
    ``'ALL'``. The outcome is printed on success or logged as critical when the
    query returns ``None``.

    :param arguments: parsed command-line namespace carrying the four filter lists
    """
    def _collapse_all(values):
        # ['ALL'] -> 'ALL'; any other list is passed through untouched
        if len(values) == 1 and values[0] == 'ALL':
            return 'ALL'
        return values

    status_in = _collapse_all(arguments.status)
    computingSite_in = _collapse_all(arguments.sites)
    computingElement_in = _collapse_all(arguments.ces)
    submissionHost_in = _collapse_all(arguments.submissionhosts)

    query = {
        'status': status_in,
        'computingSite': computingSite_in,
        'computingElement': computingElement_in,
        'submissionHost': submissionHost_in,
    }
    n_workers = DBProxy().kill_workers_by_query(query)

    if n_workers is None:
        mainLogger.critical('Failed to kill workers. See panda-db_proxy.log')
        return

    summary = ('Sweeper will soon kill {n_workers} workers, with '
               'status in {status_in}, '
               'computingSite in {computingSite_in}, '
               'computingElement in {computingElement_in}, '
               'submissionHost in {submissionHost_in}')
    print(summary.format(n_workers=n_workers,
                         status_in=status_in,
                         computingSite_in=computingSite_in,
                         computingElement_in=computingElement_in,
                         submissionHost_in=submissionHost_in))
def __init__(self, communicator, queue_config_mapper, single_mode=False):
    """Initialise the agent with DB access, communicator and queue configuration.

    :param communicator: object kept as ``self.communicator``
    :param queue_config_mapper: object kept as ``self.queueConfigMapper``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    self.dbProxy = DBProxy()
    self.communicator = communicator
    self.queueConfigMapper = queue_config_mapper
    # both "last update" markers start out unset
    self._last_stats_update = self._last_metrics_update = None
def __init__(self, communicator, queue_config_mapper, single_mode=False):
    """Initialise the agent with DB access, communicator, host name and plugin factory.

    :param communicator: object kept as ``self.communicator``
    :param queue_config_mapper: object kept as ``self.queueConfigMapper``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    # collaborators handed in by the caller
    self.communicator = communicator
    self.queueConfigMapper = queue_config_mapper
    # resources created locally
    self.dbProxy = DBProxy()
    self.nodeName = socket.gethostname()
    self.pluginFactory = PluginFactory()
def __init__(self, **kwarg):
    """Copy every keyword argument onto the instance, then attach host/process/DB info.

    The fixed attributes (``hostname``, ``os_pid``, ``dbProxy``, ``dbInterface``)
    are assigned after the keyword arguments, so they win over same-named keys.

    :param kwarg: arbitrary attribute name/value pairs
    """
    for attr_name, attr_value in iteritems(kwarg):
        setattr(self, attr_name, attr_value)
    # identity of the process that owns this object
    self.hostname = socket.gethostname()
    self.os_pid = os.getpid()
    # database access objects
    self.dbProxy = DBProxy()
    self.dbInterface = DBInterface()
def __init__(self, single_mode=False):
    """Build one credential-manager plugin per configured module/class entry.

    The per-plugin settings are read from ``harvester_config.credmanager`` as
    parallel lists (normalised through ``self.get_list``) and combined
    position by position with ``zip``; the resulting plugins are collected in
    ``self.exeCores``.

    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()

    cred_conf = harvester_config.credmanager
    # module and class names of the plugins
    module_names = self.get_list(cred_conf.moduleName)
    class_names = self.get_list(cred_conf.className)
    # original certificates: prefer inCertFile, fall back to certFile
    in_cert_source = cred_conf.inCertFile if hasattr(cred_conf, 'inCertFile') else cred_conf.certFile
    in_cert_files = self.get_list(in_cert_source)
    # certificates to generate: prefer outCertFile, otherwise reuse the
    # certificate file name configured for the panda connection
    if hasattr(cred_conf, 'outCertFile'):
        out_cert_source = cred_conf.outCertFile
    else:
        out_cert_source = harvester_config.pandacon.cert_file
    out_cert_files = self.get_list(out_cert_source)
    # VOMS
    voms_list = self.get_list(cred_conf.voms)

    # instantiate the plugins
    self.exeCores = []
    settings = zip(module_names, class_names, in_cert_files, out_cert_files, voms_list)
    for module_name, class_name, in_cert, out_cert, voms in settings:
        plugin_params = {
            'module': module_name,
            'name': class_name,
            'inCertFile': in_cert,
            'outCertFile': out_cert,
            'voms': voms,
        }
        self.exeCores.append(self.pluginFactory.get_plugin(plugin_params))
def HarvesterReport(self):
    """Log a summary of Harvester worker counts per site/resource.

    Worker statistics are read with ``DBProxyPool.get_worker_stats_bulk(None)``
    and written through ``self.log``: one line per ``site-resource`` listing
    ``submitted/running`` for each prodSourceLabel, followed by a totals line.

    The report is best effort: any failure (missing ``VIRTUAL_ENV``, Harvester
    not importable, DB error, ...) is reported through ``self.log`` instead of
    being raised, so the caller is never interrupted by it.
    """
    try:
        try:
            from distutils.sysconfig import get_python_lib  # pylint: disable=import-error
            site_packages = get_python_lib()
        except ImportError:
            # distutils no longer ships with Python >= 3.12
            import sysconfig
            site_packages = sysconfig.get_path('purelib')
        panda_common = site_packages + '/pandacommon'
        # this method can be called repeatedly; do not grow sys.path each time
        if panda_common not in sys.path:
            sys.path.append(panda_common)
        os.environ['PANDA_HOME'] = os.environ['VIRTUAL_ENV']

        from collections import defaultdict  # pylint: disable=import-error
        from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy  # pylint: disable=import-error

        self.dbProxy = DBProxy()
        workers = self.dbProxy.get_worker_stats_bulk(None)

        rep = defaultdict(dict)   # 'site-resource' -> {prodSourceLabel: {state: count}}
        rtot = defaultdict(int)   # state -> count summed over everything
        for site, prodsourcelabels in workers.items():
            for prodsourcelabel, resources in prodsourcelabels.items():
                for resource, jobs in resources.items():
                    rep[f'{site}-{resource}'][prodsourcelabel or 'empty'] = jobs
                    for state, count in jobs.items():
                        rtot[state] += count

        self.log(f"All Harvester jobs: {sum(rtot.values())}       prodSourceLabel: submitted/running")
        for k in sorted(rep):
            log = f"{k:>28.28}:"
            for psl, jobs in rep[k].items():
                # a state that is absent from the statistics counts as zero
                log += f"{psl:>10}: {jobs.get('submitted', 0)}/{jobs.get('running', 0)}"
            self.log(log)
        log = f"{'Totals':>28}:  submitted: {rtot['submitted']}  running: {rtot['running']}"
        self.log(log + '\n\n')
    except Exception as exc:  # best effort: report the reason, never propagate
        try:
            self.log(f"Harvester report unavailable: {exc!r}")
        except Exception:
            # the logging channel itself is broken; nothing more can be done here
            pass
def __init__(self, queue_config_mapper, single_mode=False):
    """Initialise the agent with DB access, worker maker/adjuster and plugin factory.

    :param queue_config_mapper: kept as ``self.queueConfigMapper`` and also
        handed to ``WorkerAdjuster``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    # helpers created locally
    self.dbProxy = DBProxy()
    self.workerMaker = WorkerMaker()
    self.workerAdjuster = WorkerAdjuster(self.queueConfigMapper)
    self.pluginFactory = PluginFactory()
def qconf_purge(arguments):
    """Purge one queue from the harvester DB and report the outcome.

    Success is printed to stdout; a falsy result from ``DBProxy.purge_pq`` is
    logged as critical.

    :param arguments: parsed command-line namespace; ``arguments.queue`` names the queue
    """
    queue_name = arguments.queue
    purged = DBProxy().purge_pq(queue_name)
    if not purged:
        mainLogger.critical('Failed to purge {0} . See panda-db_proxy.log'.format(queue_name))
        return
    print('Purged {0} from harvester DB'.format(queue_name))
def __init__(self, queue_config_mapper, single_mode=False):
    """Initialise the agent with DB access, plugin factory, monitor FIFO and Apfmon.

    :param queue_config_mapper: kept as ``self.queueConfigMapper`` and also
        handed to ``Apfmon``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.pluginFactory = PluginFactory()
    # wall-clock time at which this object was created
    self.startTimestamp = time.time()
    self.monitor_fifo = MonitorFIFO()
    self.apfmon = Apfmon(queue_config_mapper)
def __init__(self, queue_config_mapper):
    """Initialise helpers and read the optional cap on new workers.

    ``self.maxNewWorkers`` is taken from ``harvester_config.submitter.maxNewWorkers``
    and is ``None`` when that lookup raises ``AttributeError``.

    :param queue_config_mapper: kept as ``self.queue_configMapper`` and also
        handed to ``Apfmon``
    """
    self.queue_configMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    self.throttlerMap = {}
    self.apf_mon = Apfmon(queue_config_mapper)
    try:
        max_new_workers = harvester_config.submitter.maxNewWorkers
    except AttributeError:
        # option not configured
        max_new_workers = None
    self.maxNewWorkers = max_new_workers
def __init__(self, single_mode=False, stop_event=None, daemon_mode=True):
    """Store the run-mode flags, create the shared pools and make the DB tables.

    The harvester modules are imported inside the method, as in the original
    layout, rather than at module import time.

    :param single_mode: kept as ``self.singleMode``
    :param stop_event: kept as ``self.stopEvent``
    :param daemon_mode: kept as ``self.daemonMode``
    """
    # run-mode flags
    self.singleMode = single_mode
    self.stopEvent = stop_event
    self.daemonMode = daemon_mode

    # communicator pool
    from pandaharvester.harvestercore.communicator_pool import CommunicatorPool
    self.communicatorPool = CommunicatorPool()

    # queue configuration
    from pandaharvester.harvestercore.queue_config_mapper import QueueConfigMapper
    self.queueConfigMapper = QueueConfigMapper()

    # database tables
    from pandaharvester.harvestercore.db_proxy_pool import DBProxyPool as DBProxy
    DBProxy().make_tables(self.queueConfigMapper)
def __init__(self, queue_config_mapper, single_mode=False):
    """Initialise helpers and load the plugin cores.

    The core lists start empty and are then populated by
    ``get_cores_from_harvester_config`` followed by
    ``update_cores_from_queue_config``.

    :param queue_config_mapper: kept as ``self.queue_config_mapper``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    self.queue_config_mapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    # plugin cores: start empty, then fill from the two configuration sources
    self.exeCores, self.queue_exe_cores = [], []
    self.get_cores_from_harvester_config()
    self.update_cores_from_queue_config()
def __init__(self, **kwarg):
    '''Set up DB connection and credentials'''
    PluginBase.__init__(self, **kwarg)
    self.dbproxy = DBProxy()
    self.schedulerid = harvester_config.master.harvester_id

    # Credential dictionary role: proxy file.
    # The role is whatever follows the first '=' of each configured VOMS entry;
    # roles and output certificate files are paired position by position.
    roles = [voms_entry.split('=')[1]
             for voms_entry in list(harvester_config.credmanager.voms)]
    proxy_files = list(harvester_config.credmanager.outCertFile)
    self.certs = dict(zip(roles, proxy_files))

    skip_credentials = arc.initializeCredentialsType.SkipCredentials
    self.cred_type = arc.initializeCredentialsType(skip_credentials)
def __init__(self, pid_file, single_mode=False):
    """Locate the master process and snapshot its child processes.

    :param pid_file: path of the pid file; when ``None`` the value of
        ``harvester_config.service_monitor.pidfile`` is used, or ``None`` if
        reading that setting fails
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    self.db_proxy = DBProxy()

    if pid_file is None:
        # fall back to the configured location, if there is one
        try:
            pid_file = harvester_config.service_monitor.pidfile
        except Exception:
            pid_file = None
    self.pid_file = pid_file

    self.pid = self.get_master_pid()
    self.master_process = psutil.Process(self.pid)
    # children are captured once, at construction time
    self.children = self.master_process.children(recursive=True)
    self.cpu_count = multiprocessing.cpu_count()
def __init__(self, queue_config_mapper, single_mode=False):
    """Initialise the agent, its FIFOs and the optional event-based plugins.

    The event FIFO is only created when the monitor FIFO reports ``enabled``.
    Event-based plugins are only loaded when
    ``harvester_config.monitor.eventBasedEnable`` is set and truthy.

    :param queue_config_mapper: kept as ``self.queueConfigMapper`` and also
        handed to ``Apfmon``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    self.queueConfigMapper = queue_config_mapper
    self.dbProxy = DBProxy()
    self.pluginFactory = PluginFactory()
    self.startTimestamp = time.time()

    self.monitor_fifo = MonitorFIFO()
    self.monitor_event_fifo = MonitorEventFIFO() if self.monitor_fifo.enabled else None
    self.apfmon = Apfmon(self.queueConfigMapper)

    self.eventBasedMonCoreList = []
    if not getattr(harvester_config.monitor, 'eventBasedEnable', False):
        return
    for plugin_conf in harvester_config.monitor.eventBasedPlugins:
        # a separate factory instance is used for every plugin
        factory = PluginFactory()
        self.eventBasedMonCoreList.append(factory.get_plugin(plugin_conf))
def __init__(self, queue_config_mapper):
    """Store the queue configuration and create the helper objects.

    :param queue_config_mapper: kept as ``self.queueConfigMapper``
    """
    self.queueConfigMapper = queue_config_mapper
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
    # starts out empty
    self.throttlerMap = {}
def check_workers(self, workspec_list):
    """Check status of workers.

    This method takes a list of WorkSpecs as input argument and returns a list of worker's statuses.
    For each worker with a batchID the SAGA job is looked up, its state is translated
    to a harvester state and the work spec is updated (status, native status, times,
    exit code). Two additional actions may be taken:

    * a PENDING worker queued longer than ``self.maxqueuetime`` (when that attribute
      exists) is cancelled;
    * a RUNNING worker whose jobs are all in a final status is cancelled and marked finished.

    If the SAGA job service cannot be created the call sleeps 10 seconds and retries.

    NOTE(review): workers without a batchID add no entry to the returned list, so the
    list can be shorter than ``workspec_list`` -- confirm callers cope with that.

    :param workspec_list: a list of work specs instances
    :return: A tuple of return code (always True here) and a list of
             (harvester status, error string) tuples, one per worker with a batchID.
    :rtype: (bool, [(string, string),])
    """
    try:
        job_service = rs.job.Service(self.adaptor)
    except rs.SagaException:
        # Retry after a pause and hand the retry's result back. Without the
        # ``return`` the result was dropped and execution continued with
        # ``job_service`` unbound.
        time.sleep(10)
        return self.check_workers(workspec_list)

    retList = []
    tmpLog = None  # stays None when workspec_list is empty
    for workSpec in workspec_list:
        # make logger
        errStr = ''
        tmpLog = self.make_logger(baseLogger, 'workerID={0}'.format(workSpec.workerID),
                                  method_name='check_workers')
        tmpLog.debug("SAGA monitor started")
        if not workSpec.batchID:
            tmpLog.debug("SAGA monitor found worker [{0}] without batchID".format(workSpec.workerID))
            continue

        saga_submission_id = '[{0}]-[{1}]'.format(self.adaptor, workSpec.batchID)
        try:
            worker = job_service.get_job(saga_submission_id)
            tmpLog.debug(
                'SAGA State for submission with batchid: {0} is: {1}'.format(workSpec.batchID, worker.state))
            harvester_job_state = SAGASubmitter.status_translator(worker.state)
            workSpec.nativeStatus = worker.state
            workSpec.set_status(harvester_job_state)
            tmpLog.debug(
                'Worker state with batchid: {0} is: {1} exit code: {2}'.format(workSpec.batchID,
                                                                               harvester_job_state,
                                                                               worker.exit_code))
            # SAGA timestamps are stored as naive UTC datetimes
            if worker.created:
                tmpLog.debug("Worker created (SAGA): {0}".format(worker.created))
                workSpec.submitTime = datetime.utcfromtimestamp(worker.created)
            if worker.started:
                tmpLog.debug("Worker started (SAGA): {0}".format(worker.started))
                workSpec.startTime = datetime.utcfromtimestamp(worker.started)
            if worker.finished:
                tmpLog.debug("Worker finished (SAGA): {0}".format(worker.finished))
                workSpec.endTime = datetime.utcfromtimestamp(worker.finished)

            if workSpec.is_final_status():
                workSpec.nativeExitCode = worker.exit_code
                tmpLog.info("Worker in final status [{0}] exit code: {1}".format(workSpec.status,
                                                                                 workSpec.nativeExitCode))
                if workSpec.nativeExitCode != 0:
                    # let's try to find exit code, exit message etc...
                    tmpLog.info("Deep check to find exit code and exit status required")
                    harvester_job_state, errStr = self._apply_deep_check(workSpec)
                tmpLog.info('Worker {2} with BatchID={0} finished with exit code {1} and state {3}'.format(
                    workSpec.batchID, worker.exit_code, workSpec.workerID, worker.state))
                tmpLog.debug('Started: [{0}] finished: [{1}]'.format(worker.started, worker.finished))

            if worker.state == rs.job.PENDING:
                # submitTime is UTC (see above), so compare against UTC "now"
                queue_time = (datetime.utcnow() - workSpec.submitTime).total_seconds()
                tmpLog.info("Worker queued for {0} sec.".format(queue_time))
                if hasattr(self, 'maxqueuetime') and queue_time > self.maxqueuetime:
                    tmpLog.info(
                        "Queue time {0} is longer than limit {1} worker will be canceled".format(
                            queue_time, self.maxqueuetime))
                    worker.cancel()
                    worker.wait()
                    workSpec.nativeExitCode = worker.exit_code
                    # UTC, consistent with every other timestamp set in this method
                    cur_time = datetime.utcnow()
                    workSpec.startTime = cur_time
                    workSpec.endTime = cur_time
                    workSpec.set_pilot_closed()
                    workSpec.set_status(workSpec.ST_cancelled)
                    harvester_job_state = workSpec.ST_cancelled
                    tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                 workSpec.nativeExitCode))

            # proper processing of jobs for worker will be required, to avoid 'fake' fails
            if worker.state == rs.job.RUNNING:
                tmpLog.info("Going to check that all jobs of the worker are in the final status.")
                dbProxy = DBProxy()
                job_spec_list = dbProxy.get_jobs_with_worker_id(workSpec.workerID, None,
                                                                only_running=False, slim=False)
                allFinal = all(job_spec.is_final_status() for job_spec in job_spec_list)
                if not allFinal:
                    tmpLog.info("Not all jobs are in the final status, skip till the next monitoring cycle.")
                else:
                    tmpLog.info("All jobs are in the final status, going to cancel the worker.")
                    worker.cancel()
                    worker.wait()
                    workSpec.nativeExitCode = 0
                    workSpec.endTime = datetime.utcnow()
                    jsonFilePath = os.path.join(workSpec.get_access_point(),
                                                harvester_config.payload_interaction.killWorkerFile)
                    tmpLog.debug('Going to request kill worker via file {0}.'.format(jsonFilePath))
                    # touch the kill file, creating it when it does not exist yet
                    try:
                        os.utime(jsonFilePath, None)
                    except OSError:
                        open(jsonFilePath, 'a').close()
                    workSpec.set_status(workSpec.ST_finished)
                    harvester_job_state = workSpec.ST_finished
                    tmpLog.info("Worker state: {0} worker exit code: {1}".format(harvester_job_state,
                                                                                 workSpec.nativeExitCode))

        except rs.SagaException as ex:
            tmpLog.info('An exception occured during retriving worker information {0}'.format(workSpec.batchID))
            tmpLog.info(ex.get_message())
            # probably 'finished' is not the proper state in this case, 'undefined' looks a bit better
            # some more work for SAGA to get proper state
            harvester_job_state, errStr = self._apply_deep_check(workSpec)
            tmpLog.debug('Worker state set to: {0} ({1})'.format(workSpec.status, harvester_job_state))

        retList.append((harvester_job_state, errStr))
        # for compatibility with dummy monitor
        with open(os.path.join(workSpec.accessPoint, 'status.txt'), 'w') as f:
            f.write(workSpec.status)

    job_service.close()
    if tmpLog is not None:
        tmpLog.debug('Results: {0}'.format(retList))

    return True, retList


def _apply_deep_check(self, workSpec):
    """Run ``deep_checkjob`` for a worker and copy its findings onto the work spec.

    An empty state from the deep check is treated as ``ST_finished``; the start
    time is only filled in when the spec has none, the end time only when the
    deep check supplied one.

    :param workSpec: work spec to inspect and update
    :return: tuple of (harvester status, error string)
    """
    (harvester_job_state, workSpec.nativeExitCode, workSpec.nativeStatus,
     starttime, endtime, errStr) = self.deep_checkjob(workSpec.batchID, workSpec.workerID)
    if harvester_job_state == "":
        harvester_job_state = workSpec.ST_finished
    if not workSpec.startTime:
        workSpec.startTime = starttime
    if endtime:
        workSpec.endTime = endtime
    workSpec.set_status(harvester_job_state)
    return harvester_job_state, errStr
def __init__(self, communicator, single_mode=False):
    """Initialise the agent with DB access and the given communicator.

    :param communicator: object kept as ``self.communicator``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    self.communicator = communicator
    self.dbProxy = DBProxy()
def __init__(self, *args, **kwargs):
    """Prepare DB access and the logger slot, then delegate to the base handler.

    The attributes are assigned before ``BaseHTTPRequestHandler.__init__`` is
    called, exactly as in the original order.
    NOTE(review): presumably because the base constructor already processes
    the request and needs them -- confirm.

    :param args: positional arguments forwarded to ``BaseHTTPRequestHandler.__init__``
    :param kwargs: keyword arguments forwarded to ``BaseHTTPRequestHandler.__init__``
    """
    self.dbProxy = DBProxy()
    # no logger attached yet
    self.tmpLog = None
    BaseHTTPRequestHandler.__init__(self, *args, **kwargs)
def __init__(self, **kwarg):
    """Set the default rule-combination logic, run the base setup and open DB access.

    :param kwarg: keyword arguments forwarded to ``PluginBase.__init__``
    """
    # logic type : AND: throttled if all rules are satisfied, OR: throttled if one rule is satisfied
    # The default is assigned before PluginBase.__init__ runs.
    # NOTE(review): presumably so a value passed in kwarg can replace it -- confirm.
    self.logicType = 'OR'
    PluginBase.__init__(self, **kwarg)
    self.dbProxy = DBProxy()
def __init__(self, communicator, queue_config_mapper, single_mode=False):
    """Initialise the agent with DB access, communicator and queue configuration.

    :param communicator: object kept as ``self.communicator``
    :param queue_config_mapper: object kept as ``self.queueConfigMapper``
    :param single_mode: forwarded unchanged to ``AgentBase.__init__``
    """
    AgentBase.__init__(self, single_mode)
    # collaborators handed in by the caller
    self.communicator = communicator
    self.queueConfigMapper = queue_config_mapper
    # resource created locally
    self.dbProxy = DBProxy()
def trigger_stage_out(self, jobspec):
    """Trigger the stage-out procedure for the job.

    Output files are available through jobspec.get_output_file_specs(skip_done=True),
    which this method iterates over. Each file is copied with ``shutil.copyfile``
    to a destination directory and its status is set to 'finished' or 'failed'.

    Destination directory:

    * files whose name contains ".log.tgz" go to ``queue_config.seprodpath``;
    * every other file goes to a path derived from the job parameters via
      ``self.getSWPathAndNameAndFilename`` (some files are renamed as well).

    NOTE(review): a non-log file matching none of the known names keeps an empty
    destination path, i.e. is copied to "/<lfn>" -- confirm this is intended.

    :param jobspec: job specifications
    :type jobspec: JobSpec
    :return: A tuple of return code (True: all files copied, False: at least one
             file failed) and error dialog
    :rtype: (bool, string)
    """
    # make logger (tagged with this method's own name)
    tmpLog = self.make_logger(baseLogger, 'PandaID={0}'.format(jobspec.PandaID),
                              method_name='trigger_stage_out')
    tmpLog.debug('start')
    allChecked = True
    ErrMsg = 'These files failed to upload: '

    tmpLog.debug('Getting seprodpath from queue_config')
    queue_config = self.queue_config_mapper.get_queue(self.queueName)

    tmpLog.debug('Requesting full spec of the job {0}' . format(jobspec.PandaID))
    proxy = DBProxy()
    jobSpec_full = proxy.get_job(jobspec.PandaID)

    for fileSpec in jobspec.get_output_file_specs(skip_done=True):
        destination = queue_config.seprodpath
        filename = fileSpec.lfn

        if ".log.tgz" not in fileSpec.lfn:
            se_path = ''
            tmpLog.debug('Getting sw path, name and hist filename from jobPars')
            (sw_prefix, sw_path, prod_name, prodSlt, TMPMDSTFILE, TMPHISTFILE, EVTDUMPFILE,
             MERGEDMDSTFILE, MERGEDHISTFILE, MERGEDDUMPFILE, PRODSOFT,
             MCGENFILEOUT) = self.getSWPathAndNameAndFilename(jobSpec_full.jobParams['jobPars'])

            tmpLog.debug('sw_prefix: {0}' . format(sw_prefix))
            tmpLog.debug('sw_path: {0}' . format(sw_path))
            tmpLog.debug('prod_name: {0}' . format(prod_name))
            tmpLog.debug('prodSlt: {0}' . format(prodSlt))
            tmpLog.debug('TMPMDSTFILE: {0}' . format(TMPMDSTFILE))
            tmpLog.debug('TMPHISTFILE: {0}' . format(TMPHISTFILE))
            tmpLog.debug('EVTDUMPFILE: {0}' . format(EVTDUMPFILE))
            tmpLog.debug('MERGEDMDSTFILE: {0}' . format(MERGEDMDSTFILE))
            tmpLog.debug('MERGEDHISTFILE: {0}' . format(MERGEDHISTFILE))
            tmpLog.debug('MERGEDDUMPFILE: {0}' . format(MERGEDDUMPFILE))
            tmpLog.debug('PRODSOFT: {0}' . format(PRODSOFT))
            tmpLog.debug('MCGENFILEOUT: {0}' . format(MCGENFILEOUT))

            # The checks below are deliberately independent ``if`` statements
            # (not ``elif``): when several match, the last one wins.
            # prod
            if fileSpec.lfn == TMPMDSTFILE:
                se_path = sw_prefix + sw_path + PRODSOFT + '/mDST.chunks'
            if fileSpec.lfn == TMPHISTFILE:
                se_path = sw_prefix + sw_path + PRODSOFT + '/TRAFDIC'
            if fileSpec.lfn == "testevtdump.raw":
                se_path = sw_prefix + sw_path + PRODSOFT + '/evtdump/slot' + prodSlt
                filename = EVTDUMPFILE
            if fileSpec.lfn == "payload_stdout.out.gz":
                se_path = sw_prefix + sw_path + PRODSOFT + '/logFiles'
                filename = prod_name + '.' + TMPHISTFILE.replace('.root', '.stdout.gz')
            if fileSpec.lfn == "payload_stderr.out.gz":
                se_path = sw_prefix + sw_path + PRODSOFT + '/logFiles'
                filename = prod_name + '.' + TMPHISTFILE.replace('.root', '.stderr.gz')

            # merge
            if fileSpec.lfn == MERGEDMDSTFILE:
                se_path = sw_prefix + sw_path + PRODSOFT + '/mDST'
            if fileSpec.lfn == MERGEDHISTFILE:
                se_path = sw_prefix + sw_path + PRODSOFT + '/histos'
            if fileSpec.lfn == MERGEDDUMPFILE:
                se_path = sw_prefix + sw_path + PRODSOFT + '/mergedDump/slot' + prodSlt

            # mc generation
            if fileSpec.lfn == MCGENFILEOUT:
                se_path = sw_prefix + '/mc/' + sw_path + PRODSOFT + '/mcgen'
                filename = MCGENFILEOUT

            destination = se_path

        surl = "{0}/{1}" . format(destination, filename)
        dst_gpfn = "{0}/{1}" . format(destination, filename)
        lfcdir = destination

        tmpLog.debug('fileSpec.path = {0}' . format(fileSpec.path))
        tmpLog.debug('SURL = {0}' . format(surl))
        tmpLog.debug('dst_gpfn = {0}' . format(dst_gpfn))
        tmpLog.debug('lfcdir = {0}' . format(lfcdir))

        # A copy error must mark this file as failed, not abort the whole
        # stage-out with an exception.
        copied = False
        try:
            tmpLog.debug('Create if does not exist {0}' . format(lfcdir))
            if not os.path.exists(lfcdir):
                os.makedirs(lfcdir)
            tmpLog.debug('Copy {0} to {1}' . format(fileSpec.path, dst_gpfn))
            shutil.copyfile(fileSpec.path, dst_gpfn)
            copied = os.path.exists(dst_gpfn)
        except (OSError, IOError, shutil.Error) as exc:
            tmpLog.error('Failed to copy {0} to {1}: {2}' . format(fileSpec.path, dst_gpfn, exc))

        if copied:
            fileSpec.status = 'finished'
        else:
            fileSpec.status = 'failed'
            allChecked = False
            ErrMsg += '{0} ' . format(fileSpec.lfn)

        # force update
        fileSpec.force_update('status')

        tmpLog.debug('Status of file {0} is {1}' . format(fileSpec.path, fileSpec.status))

    tmpLog.debug('done')

    if allChecked:
        return True, ''
    return False, ErrMsg
def __init__(self):
    """Create the plugin factory and the DB proxy used by this object."""
    self.pluginFactory = PluginFactory()
    self.dbProxy = DBProxy()
# Script fragment: the names loggerObj, tmpLog, preparatorCore and the two
# *_queueConfig_preparator values are defined before this excerpt.

# Mirror log output on stdout, reusing the formatter of the first existing handler.
stdoutHandler = logging.StreamHandler(sys.stdout)
stdoutHandler.setFormatter(loggerObj.handlers[0].formatter)
loggerObj.addHandler(stdoutHandler)

# Report which preparator plugin is in use and how its queue configuration looks.
msgStr = "plugin={0}".format(preparatorCore.__class__.__name__)
tmpLog.debug(msgStr)
msgStr = "Initial queueConfig.preparator = {}".format(
    initial_queueConfig_preparator)
tmpLog.debug(msgStr)
msgStr = "Modified queueConfig.preparator = {}".format(
    modified_queueConfig_preparator)
tmpLog.debug(msgStr)

scope = 'panda'

# DB access, plus one run of the cacher in single mode.
proxy = DBProxy()
communicator = CommunicatorPool()
cacher = Cacher(communicator, single_mode=True)
cacher.run()

tmpLog.debug("plugin={0}".format(preparatorCore.__class__.__name__))
tmpLog.debug("BasePath from preparator configuration: %s " % preparatorCore.basePath)

# get all jobs in table in a preparing substate
# (second argument is 2000; NOTE(review): presumably the maximum number of jobs -- confirm)
tmpLog.debug('try to get all jobs in a preparing substate')
jobSpec_list = proxy.get_jobs_in_sub_status('preparing', 2000, None, None, None, None, None, None)
tmpLog.debug('got {0} jobs'.format(len(jobSpec_list)))
# loop over all found jobs (the body of this branch lies outside the visible excerpt)
if len(jobSpec_list) > 0:
def __init__(self, **kwarg):
    """Copy every keyword argument onto the instance and open DB access.

    ``dbProxy`` is assigned after the keyword arguments, so it wins over a
    same-named key.

    :param kwarg: arbitrary attribute name/value pairs
    """
    for attr_name, attr_value in iteritems(kwarg):
        setattr(self, attr_name, attr_value)
    self.dbProxy = DBProxy()