def submit_workers(self, workspec_list):
    tmpLog = self.make_logger(baseLogger, method_name='submit_workers')

    nWorkers = len(workspec_list)
    tmpLog.debug('start nWorkers={0}'.format(nWorkers))

    # whether to submit any worker
    to_submit_any = True

    # get log subdirectory name from timestamp
    timeNow = datetime.datetime.utcnow()
    log_subdir = timeNow.strftime('%y-%m-%d_%H')
    log_subdir_path = os.path.join(self.logDir, log_subdir)
    if self.condorSchedd is None or not self.useSpool:
        try:
            os.mkdir(log_subdir_path)
        except OSError as e:
            if e.errno != errno.EEXIST:
                raise
            else:
                pass

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # get queue info from AGIS by cacher in db
    is_grandly_unified_queue = False
    if self.useAtlasAGIS:
        panda_queues_dict = PandaQueuesDict()
        panda_queue_name = panda_queues_dict.get_panda_queue_name(self.queueName)
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        # tmpLog.debug('panda_queues_name and queue_info: {0}, {1}'.format(self.queueName, panda_queues_dict[self.queueName]))
    else:
        panda_queues_dict = dict()
        panda_queue_name = self.queueName
        this_panda_queue_dict = dict()

    # get default information from queue info
    n_core_per_node_from_queue = this_panda_queue_dict.get('corecount', 1) if this_panda_queue_dict.get('corecount', 1) else 1
    is_unified_queue = this_panda_queue_dict.get('capability', '') == 'ucore'
    pilot_version_orig = str(this_panda_queue_dict.get('pilot_version', ''))
    pilot_version_suffix_str = '_pilot2' if pilot_version_orig == '2' else ''

    # get override requirements from queue config
    try:
        n_core_per_node = self.nCorePerNode if self.nCorePerNode else n_core_per_node_from_queue
    except AttributeError:
        n_core_per_node = n_core_per_node_from_queue

    # deal with Condor schedds and central managers; make a randomized list to choose from
    n_bulks = _div_round_up(nWorkers, self.minBulkToRamdomizedSchedd)
    if isinstance(self.condorSchedd, list) and len(self.condorSchedd) > 0:
        if isinstance(self.condorPool, list) and len(self.condorPool) > 0:
            orig_list = list(zip(self.condorSchedd, self.condorPool))
        else:
            orig_list = [(_schedd, self.condorPool) for _schedd in self.condorSchedd]
        if n_bulks < len(orig_list):
            schedd_pool_choice_list = random.sample(orig_list, n_bulks)
        else:
            schedd_pool_choice_list = orig_list
    else:
        schedd_pool_choice_list = [(self.condorSchedd, self.condorPool)]

    # deal with CE
    special_par = ''
    ce_weighting = None
    if self.useAtlasGridCE:
        # ATLAS Grid CE mode
        tmpLog.debug('Using ATLAS Grid CE mode...')
        queues_from_queue_list = this_panda_queue_dict.get('queues', [])
        special_par = this_panda_queue_dict.get('special_par', '')
        ce_auxilary_dict = {}
        for _queue_dict in queues_from_queue_list:
            if not (_queue_dict.get('ce_endpoint')
                    and str(_queue_dict.get('ce_state', '')).upper() == 'ACTIVE'
                    and str(_queue_dict.get('ce_flavour', '')).lower() in set(['arc-ce', 'cream-ce', 'htcondor-ce'])):
                continue
            ce_endpoint = _queue_dict.get('ce_endpoint')
            if (ce_endpoint in ce_auxilary_dict
                    and str(_queue_dict.get('ce_queue_name', '')).lower() == 'default'):
                pass
            else:
                ce_auxilary_dict[ce_endpoint] = _queue_dict
        # qualified CEs from AGIS info
        n_qualified_ce = len(ce_auxilary_dict)
        if n_qualified_ce > 0:
            # get CE weighting
            tmpLog.debug('Get CE weighting')
            worker_ce_all_tuple = self.get_ce_statistics(self.queueName, nWorkers)
            ce_weighting = _get_ce_weighting(ce_endpoint_list=list(ce_auxilary_dict.keys()),
                                             worker_ce_all_tuple=worker_ce_all_tuple)
            stats_weighting_display_str = _get_ce_stats_weighting_display(
                ce_auxilary_dict.keys(), worker_ce_all_tuple, ce_weighting)
            tmpLog.debug('CE stats and weighting: {0}'.format(stats_weighting_display_str))
        else:
            tmpLog.error('No valid CE endpoint found')
            to_submit_any = False

    def _handle_one_worker(workspec, to_submit=to_submit_any):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_handle_one_worker')
        ce_info_dict = dict()
        batch_log_dict = dict()
        data = {
            'workspec': workspec,
            'to_submit': to_submit,
        }
        if to_submit:
            if self.useAtlasGridCE:
                # choose a CE
                tmpLog.info('choose a CE...')
                ce_chosen = _choose_ce(ce_weighting)
                try:
                    ce_info_dict = ce_auxilary_dict[ce_chosen].copy()
                except KeyError:
                    tmpLog.info('Problem choosing CE with weighting. Choose an arbitrary CE endpoint')
                    ce_info_dict = random.choice(list(ce_auxilary_dict.values())).copy()
                # gather info of the CE; ignore protocol prefix in ce_endpoint
                ce_endpoint_from_queue = re.sub(r'^\w+://', '', ce_info_dict.get('ce_endpoint', ''))
                ce_flavour_str = str(ce_info_dict.get('ce_flavour', '')).lower()
                ce_version_str = str(ce_info_dict.get('ce_version', '')).lower()
                ce_info_dict['ce_hostname'] = re.sub(r':\w*', '', ce_endpoint_from_queue)
                if ce_info_dict['ce_hostname'] == ce_endpoint_from_queue:
                    # add default port to ce_endpoint if missing
                    default_port_map = {
                        'cream-ce': 8443,
                        'arc-ce': 2811,
                        'htcondor-ce': 9619,
                    }
                    if ce_flavour_str in default_port_map:
                        default_port = default_port_map[ce_flavour_str]
                        ce_info_dict['ce_endpoint'] = '{0}:{1}'.format(ce_endpoint_from_queue, default_port)
                tmpLog.debug('For site {0} got pilot version: "{1}"; CE endpoint: "{2}", flavour: "{3}"'.format(
                    self.queueName, pilot_version_orig, ce_endpoint_from_queue, ce_flavour_str))
                if not self.templateFile and os.path.isdir(self.CEtemplateDir) and ce_flavour_str:
                    sdf_template_filename = '{ce_flavour_str}{pilot_version_suffix_str}.sdf'.format(
                        ce_flavour_str=ce_flavour_str,
                        pilot_version_suffix_str=pilot_version_suffix_str)
                    self.templateFile = os.path.join(self.CEtemplateDir, sdf_template_filename)
            else:
                try:
                    # manually define site condor schedd as ceHostname and central manager as ceEndpoint
                    if self.ceHostname and isinstance(self.ceHostname, list) and len(self.ceHostname) > 0:
                        if isinstance(self.ceEndpoint, list) and len(self.ceEndpoint) > 0:
                            ce_info_dict['ce_hostname'], ce_info_dict['ce_endpoint'] = random.choice(
                                list(zip(self.ceHostname, self.ceEndpoint)))
                        else:
                            ce_info_dict['ce_hostname'] = random.choice(self.ceHostname)
                            ce_info_dict['ce_endpoint'] = self.ceEndpoint
                    else:
                        ce_info_dict['ce_hostname'] = self.ceHostname
                        ce_info_dict['ce_endpoint'] = self.ceEndpoint
                except AttributeError:
                    pass
            try:
                # manually define ceQueueName
                if self.ceQueueName:
                    ce_info_dict['ce_queue_name'] = self.ceQueueName
            except AttributeError:
                pass
            # template for batch script
            try:
                tmpFile = open(self.templateFile)
                sdf_template_raw = tmpFile.read()
                tmpFile.close()
            except AttributeError:
                tmpLog.error('No valid templateFile found. Maybe templateFile, CEtemplateDir invalid, or no valid CE found')
                to_submit = False
                return data
            else:
                # get batch_log, stdout, stderr filenames, and remove commented lines
                sdf_template_str_list = []
                for _line in sdf_template_raw.split('\n'):
                    if _line.startswith('#'):
                        continue
                    sdf_template_str_list.append(_line)
                    _match_batch_log = re.match('log = (.+)', _line)
                    _match_stdout = re.match('output = (.+)', _line)
                    _match_stderr = re.match('error = (.+)', _line)
                    if _match_batch_log:
                        batch_log_value = _match_batch_log.group(1)
                        continue
                    if _match_stdout:
                        stdout_value = _match_stdout.group(1)
                        continue
                    if _match_stderr:
                        stderr_value = _match_stderr.group(1)
                        continue
                sdf_template = '\n'.join(sdf_template_str_list)
                # choose from Condor schedds and central managers
                condor_schedd, condor_pool = random.choice(schedd_pool_choice_list)
                # set submissionHost
                if not condor_schedd and not condor_pool:
                    workspec.submissionHost = 'LOCAL'
                else:
                    workspec.submissionHost = '{0},{1}'.format(condor_schedd, condor_pool)
                tmpLog.debug('set submissionHost={0}'.format(workspec.submissionHost))
                # log base URL
                if self.logBaseURL and '[ScheddHostname]' in self.logBaseURL:
                    schedd_hostname = re.sub(
                        r'(?:[a-zA-Z0-9_.\-]*@)?([a-zA-Z0-9.\-]+)(?::[0-9]+)?',
                        lambda matchobj: matchobj.group(1) if matchobj.group(1) else '',
                        condor_schedd)
                    log_base_url = re.sub(r'\[ScheddHostname\]', schedd_hostname, self.logBaseURL)
                else:
                    log_base_url = self.logBaseURL
                # URLs for log files
                if log_base_url is not None:
                    if workspec.batchID:
                        batchID = workspec.batchID
                        guess = False
                    else:
                        batchID = ''
                        guess = True
                    batch_log_filename = parse_batch_job_filename(value_str=batch_log_value,
                                                                  file_dir=log_subdir_path,
                                                                  batchID=batchID, guess=guess)
                    stdout_path_file_name = parse_batch_job_filename(value_str=stdout_value,
                                                                     file_dir=log_subdir_path,
                                                                     batchID=batchID, guess=guess)
                    stderr_path_filename = parse_batch_job_filename(value_str=stderr_value,
                                                                    file_dir=log_subdir_path,
                                                                    batchID=batchID, guess=guess)
                    batch_log = '{0}/{1}/{2}'.format(log_base_url, log_subdir, batch_log_filename)
                    batch_stdout = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stdout_path_file_name)
                    batch_stderr = '{0}/{1}/{2}'.format(log_base_url, log_subdir, stderr_path_filename)
                    workspec.set_log_file('batch_log', batch_log)
                    workspec.set_log_file('stdout', batch_stdout)
                    workspec.set_log_file('stderr', batch_stderr)
                    batch_log_dict['batch_log'] = batch_log
                    batch_log_dict['batch_stdout'] = batch_stdout
                    batch_log_dict['batch_stderr'] = batch_stderr
                    batch_log_dict['gtag'] = workspec.workAttributes['stdOut']
                    tmpLog.debug('Done set_log_file before submission')
                tmpLog.debug('Done jobspec attribute setting')
                # choose the x509 certificate based on the type of job (analysis or production)
                proxy = _choose_proxy(workspec)
                # set data dict
                data.update({
                    'workspec': workspec,
                    'to_submit': to_submit,
                    'template': sdf_template,
                    'executable_file': self.executableFile,
                    'log_dir': self.logDir,
                    'log_subdir': log_subdir,
                    'n_core_per_node': n_core_per_node,
                    'panda_queue_name': panda_queue_name,
                    'x509_user_proxy': proxy,
                    'ce_info_dict': ce_info_dict,
                    'batch_log_dict': batch_log_dict,
                    'special_par': special_par,
                    'harvester_queue_config': harvester_queue_config,
                    'is_unified_queue': is_unified_queue,
                    'condor_schedd': condor_schedd,
                    'condor_pool': condor_pool,
                    'use_spool': self.useSpool,
                    'pilot_version': pilot_version_orig,
                })
        return data

    def _choose_proxy(workspec):
        """
        Choose the proxy based on the job type
        """
        job_type = workspec.jobType
        proxy = self.x509UserProxy
        if is_grandly_unified_queue and job_type in ('user', 'panda', 'analysis') \
                and self.x509UserProxyAnalysis:
            tmpLog.debug('Taking analysis proxy')
            proxy = self.x509UserProxyAnalysis
        else:
            tmpLog.debug('Taking default proxy')
        return proxy

    def _propagate_attributes(workspec, tmpVal):
        # make logger
        tmpLog = core_utils.make_logger(baseLogger, 'workerID={0}'.format(workspec.workerID),
                                        method_name='_propagate_attributes')
        (retVal, tmpDict) = tmpVal
        workspec.set_attributes_with_dict(tmpDict)
        tmpLog.debug('Done workspec attributes propagation')
        return retVal

    tmpLog.debug('finished preparing worker attributes')

    # handle workers in parallel threads
    with ThreadPoolExecutor(self.nProcesses * 4) as thread_pool:
        dataIterator = thread_pool.map(_handle_one_worker, workspec_list)
        tmpLog.debug('{0} workers handled'.format(nWorkers))

    # submit
    retValList = submit_bag_of_workers(list(dataIterator))
    tmpLog.debug('{0} workers submitted'.format(nWorkers))

    # propagate changed attributes
    with ThreadPoolExecutor(self.nProcesses) as thread_pool:
        retIterator = thread_pool.map(lambda _wv_tuple: _propagate_attributes(*_wv_tuple),
                                      zip(workspec_list, retValList))
    retList = list(retIterator)

    tmpLog.debug('done')
    return retList
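# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module). submit_workers()
# above calls helpers defined elsewhere in this module, e.g. _div_round_up()
# to size the schedd bulks and _choose_ce() to pick a CE from ce_weighting.
# Below is a minimal, self-contained guess at what such helpers could look
# like; the real implementations may differ, and _choose_ce_sketch assumes
# ce_weighting maps CE endpoint -> non-negative weight. Relies on the
# module-level `random` import already used above.

def _div_round_up_sketch(numerator, denominator):
    # integer division rounded up, e.g. _div_round_up_sketch(7, 3) == 3
    return (numerator + denominator - 1) // denominator


def _choose_ce_sketch(ce_weighting):
    # weighted random choice: draw each endpoint with probability
    # proportional to its weight; fall back to a uniform choice when all
    # weights are zero
    endpoints = list(ce_weighting.keys())
    weights = [ce_weighting[ce] for ce in endpoints]
    total = sum(weights)
    if total <= 0:
        return random.choice(endpoints)
    threshold = random.uniform(0, total)
    cumulative = 0
    for ce, weight in zip(endpoints, weights):
        cumulative += weight
        if threshold <= cumulative:
            return ce
    return endpoints[-1]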
def submit_k8s_worker(self, work_spec):
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
    # TODO: consider if we want to upload the yaml file to PanDA cache

    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(
            container_image, executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        this_panda_queue_dict = panda_queues_dict.get(self.queueName, dict())
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            err_str = 'No proxy specified in proxySecretPath. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit
        try:
            max_time = this_panda_queue_dict['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        associated_params_dict = {}
        for key, val in panda_queues_dict.get_harvester_params(self.queueName).items():
            if key in self._allowed_agis_attrs:
                associated_params_dict[key] = val

        pilot_url = associated_params_dict.get('pilot_url')
        pilot_version = str(this_panda_queue_dict.get('pilot_version', 'current'))
        python_version = str(this_panda_queue_dict.get('python_version', '2'))

        # prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
        pilot_opt_dict = submitter_common.get_complicated_pilot_options(work_spec.pilotType)
        if pilot_opt_dict is None:
            prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)
            pilot_type = work_spec.pilotType
            pilot_url_str = '--piloturl {0}'.format(pilot_url) if pilot_url else ''
        else:
            prod_source_label = pilot_opt_dict['prod_source_label']
            pilot_type = pilot_opt_dict['pilot_type_opt']
            pilot_url_str = pilot_opt_dict['pilot_url_str']

        pilot_python_option = submitter_common.get_python_version_option(python_version, prod_source_label)

        # submit the worker
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
            yaml_content, work_spec, prod_source_label, pilot_type, pilot_url_str,
            pilot_python_option, container_image, executable, args, cert,
            cpu_adjust_ratio=self.cpuAdjustRatio, memory_adjust_ratio=self.memoryAdjustRatio,
            max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')

    return tmp_return_value
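# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module). The pilot-option
# fallback in submit_k8s_worker() above reads as: if
# submitter_common.get_complicated_pilot_options() returns None (an ordinary
# pilot), derive prod_source_label from the queue config and append
# --piloturl only when AGIS supplies one; otherwise take the pre-computed
# option set wholesale. A standalone rendering of that decision, with
# hypothetical values in the usage comment:

def resolve_pilot_options_sketch(pilot_opt_dict, default_label, pilot_type, pilot_url):
    if pilot_opt_dict is None:
        return {
            'prod_source_label': default_label,
            'pilot_type_opt': pilot_type,
            'pilot_url_str': '--piloturl {0}'.format(pilot_url) if pilot_url else '',
        }
    return pilot_opt_dict

# Usage (hypothetical values):
#   resolve_pilot_options_sketch(None, 'managed', 'PR', None)
#   -> {'prod_source_label': 'managed', 'pilot_type_opt': 'PR', 'pilot_url_str': ''}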
def submit_k8s_worker(self, work_spec):
    tmp_log = self.make_logger(base_logger, method_name='submit_k8s_worker')

    # get info from harvester queue config
    _queueConfigMapper = QueueConfigMapper()
    harvester_queue_config = _queueConfigMapper.get_queue(self.queueName)
    prod_source_label = harvester_queue_config.get_source_label(work_spec.jobType)

    # set the stdout log file
    log_file_name = '{0}_{1}.out'.format(harvester_config.master.harvester_id, work_spec.workerID)
    work_spec.set_log_file('stdout', '{0}/{1}'.format(self.logBaseURL, log_file_name))
    # TODO: consider if we want to upload the yaml file to PanDA cache

    yaml_content = self.k8s_client.read_yaml_file(self.k8s_yaml_file)
    try:
        # read the job configuration (if available, only push model)
        job_fields, job_pars_parsed = self.read_job_configuration(work_spec)

        # decide container image and executable to run. In pull mode, defaults are provided
        container_image = self.decide_container_image(job_fields, job_pars_parsed)
        executable, args = self.build_executable(job_fields, job_pars_parsed)
        tmp_log.debug('container_image: "{0}"; executable: "{1}"; args: "{2}"'.format(
            container_image, executable, args))

        # choose the appropriate proxy
        panda_queues_dict = PandaQueuesDict()
        is_grandly_unified_queue = panda_queues_dict.is_grandly_unified_queue(self.queueName)
        cert, use_secret = self._choose_proxy(work_spec, is_grandly_unified_queue)
        if not cert:
            err_str = 'No proxy specified in proxySecretPath or x509UserProxy. Not submitted'
            tmp_return_value = (False, err_str)
            return tmp_return_value

        # get the walltime limit
        try:
            max_time = panda_queues_dict.get(self.queueName)['maxtime']
        except Exception as e:
            tmp_log.warning('Could not retrieve maxtime field for queue {0}'.format(self.queueName))
            max_time = None

        # submit the worker
        rsp, yaml_content_final = self.k8s_client.create_job_from_yaml(
            yaml_content, work_spec, prod_source_label, container_image, executable, args, cert,
            cert_in_secret=use_secret, cpu_adjust_ratio=self.cpuAdjustRatio,
            memory_adjust_ratio=self.memoryAdjustRatio, max_time=max_time)
    except Exception as _e:
        tmp_log.error(traceback.format_exc())
        err_str = 'Failed to create a JOB; {0}'.format(_e)
        tmp_return_value = (False, err_str)
    else:
        work_spec.batchID = yaml_content['metadata']['name']
        tmp_log.debug('Created worker {0} with batchID={1}'.format(work_spec.workerID, work_spec.batchID))
        tmp_return_value = (True, '')

    return tmp_return_value
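# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module). In the variant
# above, _choose_proxy() returns a (cert, use_secret) pair: a proxy mounted
# from a Kubernetes secret is preferred over a plain x509UserProxy file on
# disk. A minimal guess at such a helper, using the attribute names implied
# by the error message (proxySecretPath, x509UserProxy); a full
# implementation would also switch to an analysis proxy for user jobs on
# grandly unified queues, as _choose_proxy() does in the HTCondor submitter.

def _choose_proxy_sketch(submitter, work_spec, is_grandly_unified_queue):
    if getattr(submitter, 'proxySecretPath', None):
        return submitter.proxySecretPath, True
    if getattr(submitter, 'x509UserProxy', None):
        return submitter.x509UserProxy, False
    return None, False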
def run(self):
    while True:
        mainLog = self.make_logger(_logger, 'id={0}'.format(self.get_pid()), method_name='run')
        mainLog.debug('getting number of jobs to be fetched')
        # get number of jobs to be fetched
        nJobsPerQueue = self.dbProxy.get_num_jobs_to_fetch(harvester_config.jobfetcher.nQueues,
                                                           harvester_config.jobfetcher.lookupTime)
        mainLog.debug('got {0} queues'.format(len(nJobsPerQueue)))
        # get up to date queue configuration
        pandaQueueDict = PandaQueuesDict()
        # loop over all queues
        for queueName, nJobs in iteritems(nJobsPerQueue):
            # check queue
            if not self.queueConfigMapper.has_queue(queueName):
                continue
            tmpLog = self.make_logger(_logger, 'queueName={0}'.format(queueName), method_name='run')
            # get queue
            queueConfig = self.queueConfigMapper.get_queue(queueName)
            siteName = queueConfig.siteName
            # upper limit
            if nJobs > harvester_config.jobfetcher.maxJobs:
                nJobs = harvester_config.jobfetcher.maxJobs
            # get jobs
            try:
                is_grandly_unified_queue = pandaQueueDict.is_grandly_unified_queue(siteName)
            except Exception:
                is_grandly_unified_queue = False
            default_prodSourceLabel = queueConfig.get_source_label(is_gu=is_grandly_unified_queue)
            pdpm = getattr(queueConfig, 'prodSourceLabelRandomWeightsPermille', {})
            choice_list = core_utils.make_choice_list(pdpm=pdpm, default=default_prodSourceLabel)
            prodSourceLabel = random.choice(choice_list)
            tmpLog.debug('getting {0} jobs for prodSourceLabel {1}'.format(nJobs, prodSourceLabel))
            sw = core_utils.get_stopwatch()
            jobs, errStr = self.communicator.get_jobs(siteName, self.nodeName, prodSourceLabel,
                                                      self.nodeName, nJobs, queueConfig.getJobCriteria)
            tmpLog.info('got {0} jobs with {1} {2}'.format(len(jobs), errStr, sw.get_elapsed_time()))
            # convert to JobSpec
            if len(jobs) > 0:
                # get extractor plugin
                if hasattr(queueConfig, 'extractor'):
                    extractorCore = self.pluginFactory.get_plugin(queueConfig.extractor)
                else:
                    extractorCore = None
                jobSpecs = []
                fileStatMap = dict()
                sw_startconvert = core_utils.get_stopwatch()
                for job in jobs:
                    timeNow = datetime.datetime.utcnow()
                    jobSpec = JobSpec()
                    jobSpec.convert_job_json(job)
                    jobSpec.computingSite = queueName
                    jobSpec.status = 'starting'
                    jobSpec.subStatus = 'fetched'
                    jobSpec.creationTime = timeNow
                    jobSpec.stateChangeTime = timeNow
                    jobSpec.configID = queueConfig.configID
                    jobSpec.set_one_attribute('schedulerID',
                                              'harvester-{0}'.format(harvester_config.master.harvester_id))
                    if queueConfig.zipPerMB is not None and jobSpec.zipPerMB is None:
                        jobSpec.zipPerMB = queueConfig.zipPerMB
                    fileGroupDictList = [jobSpec.get_input_file_attributes()]
                    if extractorCore is not None:
                        fileGroupDictList.append(extractorCore.get_aux_inputs(jobSpec))
                    for fileGroupDict in fileGroupDictList:
                        for tmpLFN, fileAttrs in iteritems(fileGroupDict):
                            # make file spec
                            fileSpec = FileSpec()
                            fileSpec.PandaID = jobSpec.PandaID
                            fileSpec.taskID = jobSpec.taskID
                            fileSpec.lfn = tmpLFN
                            fileSpec.endpoint = queueConfig.ddmEndpointIn
                            fileSpec.scope = fileAttrs['scope']
                            if 'INTERNAL_FileType' in fileAttrs:
                                fileSpec.fileType = fileAttrs['INTERNAL_FileType']
                                jobSpec.auxInput = JobSpec.AUX_hasAuxInput
                            else:
                                fileSpec.fileType = 'input'
                            # check file status
                            if tmpLFN not in fileStatMap:
                                fileStatMap[tmpLFN] = self.dbProxy.get_file_status(
                                    tmpLFN, fileSpec.fileType, queueConfig.ddmEndpointIn, 'starting')
                            # set preparing to skip stage-in if the file is (being) taken care of by another job
                            if [x for x in ['ready', 'preparing', 'to_prepare', 'triggered']
                                    if x in fileStatMap[tmpLFN]]:
                                fileSpec.status = 'preparing'
                            else:
                                fileSpec.status = 'to_prepare'
                            fileStatMap[tmpLFN].setdefault(fileSpec.status, None)
                            if 'INTERNAL_URL' in fileAttrs:
                                fileSpec.url = fileAttrs['INTERNAL_URL']
                            jobSpec.add_in_file(fileSpec)
                    jobSpec.trigger_propagation()
                    jobSpecs.append(jobSpec)
                # insert to DB
                tmpLog.debug('Converting of {0} jobs {1}'.format(len(jobs), sw_startconvert.get_elapsed_time()))
                sw_insertdb = core_utils.get_stopwatch()
                self.dbProxy.insert_jobs(jobSpecs)
                tmpLog.debug('Insert of {0} jobs {1}'.format(len(jobSpecs), sw_insertdb.get_elapsed_time()))
        mainLog.debug('done')
        # check if being terminated
        if self.terminated(harvester_config.jobfetcher.sleepTime):
            mainLog.debug('terminated')
            return
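# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module). run() above draws
# prodSourceLabel via core_utils.make_choice_list(), which presumably expands
# permille weights into a 1000-element list so that random.choice() returns
# each label with the configured probability, with the default label filling
# the remainder. A minimal sketch of that expansion (the real helper lives in
# harvester's core_utils and may differ):

def make_choice_list_sketch(pdpm=None, default=None):
    # pdpm maps label -> weight in permille; e.g. pdpm={'rc_test2': 100}
    # yields 'rc_test2' with ~10% probability and the default with ~90%
    pdpm = pdpm or {}
    choice_list = []
    remaining = 1000
    for label, permille in pdpm.items():
        choice_list += [label] * permille
        remaining -= permille
    choice_list += [default] * max(remaining, 0)
    return choice_list

# Usage (hypothetical):
#   random.choice(make_choice_list_sketch({'rc_test2': 100}, 'managed'))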