def executeInternal(self, apmon, *args, **kwargs):
    #Marco: I guess these value errors only happen for development instances
    if 'task' not in kwargs:
        raise ValueError("No task specified.")
    self.task = kwargs['task']
    if 'tm_taskname' not in self.task:
        raise ValueError("No taskname specified.")
    self.workflow = self.task['tm_taskname']
    if 'user_proxy' not in self.task:
        raise ValueError("No proxy provided.")
    self.proxy = self.task['user_proxy']

    self.logger.info("About to kill workflow: %s." % self.workflow)

    self.workflow = str(self.workflow)
    if not WORKFLOW_RE.match(self.workflow):
        raise Exception("Invalid workflow name.")

    # Query HTCondor for information about running jobs and update Dashboard appropriately
    if self.task['tm_collector']:
        self.backendurls['htcondorPool'] = self.task['tm_collector']
    loc = HTCondorLocator.HTCondorLocator(self.backendurls)

    address = ""
    try:
        self.schedd, address = loc.getScheddObjNew(self.task['tm_schedd'])
    except Exception as exp:
        msg = "The CRAB server backend was not able to contact the Grid scheduler."
        msg += " Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " Message from the scheduler: %s" % (str(exp))
        self.logger.exception("%s: %s" % (self.workflow, msg))
        raise TaskWorkerException(msg)

    try:
        hostname = socket.getfqdn()
    except Exception:
        hostname = ''

    const = 'CRAB_ReqName =?= %s && TaskType =?= "Job"' % HTCondorUtils.quote(self.workflow)
    try:
        for ad in list(self.schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry'])):
            if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                continue
            jobid = str(ad.eval('CRAB_Id'))
            jobretry = str(ad.eval('CRAB_Retry'))
            jinfo = {'broker': hostname,
                     'bossId': jobid,
                     'StatusValue': 'killed',
                    }
            insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
            self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
            apmon.sendToML(jinfo)
    except Exception:
        self.logger.exception("Failed to notify Dashboard of job kills")  # warning

    # Note that we can not send kills for jobs not in queue at this time; we'll need the
    # DAG FINAL node to be fixed and the node status to include retry number.
    return self.killAll(const)

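# --- Illustrative sketch (not part of the original code) ---------------------
# A minimal, standalone example of the constraint/projection query pattern used
# in executeInternal() above, assuming the classad and htcondor Python bindings
# of that era are available. The workflow name is made up; HTCondorUtils.quote
# is a thin wrapper around classad.quote, which is used here directly.
import classad
import htcondor

def list_task_jobs(workflow):
    # Build a ClassAd constraint that matches only this task's jobs.
    const = 'CRAB_ReqName =?= %s && TaskType =?= "Job"' % classad.quote(workflow)
    schedd = htcondor.Schedd()  # local schedd, for illustration only
    # Project only the two attributes we need; skip ads missing either one.
    for ad in schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry']):
        if 'CRAB_Id' not in ad or 'CRAB_Retry' not in ad:
            continue
        yield str(ad.eval('CRAB_Id')), str(ad.eval('CRAB_Retry'))
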
def update_dashboard(self, crab_retry):
    """ Upload job monitoring information for this task to the Dashboard. """
    if not self.task_ad:
        return
    params = {'tool': 'crab3',
              'SubmissionType': 'crab3',
              'JSToolVersion': '3.3.0',
              'tool_ui': os.environ.get('HOSTNAME', ''),
              'scheduler': 'GLIDEIN',
              'GridName': self.task_ad['CRAB_UserDN'],
              'ApplicationVersion': self.task_ad['CRAB_JobSW'],
              'taskType': self.task_ad.get("CRAB_DashboardTaskType", 'analysistest'),
              'vo': 'cms',
              'CMSUser': self.task_ad['CRAB_UserHN'],
              'user': self.task_ad['CRAB_UserHN'],
              'taskId': self.task_ad['CRAB_ReqName'],
              'datasetFull': self.task_ad['DESIRED_CMSDataset'],
              'resubmitter': self.task_ad['CRAB_UserHN'],
              'exe': 'cmsRun',
              'broker': self.backend,
              'bossId': str(self.job_id),
              'localId': '',
              'SyncGridJobId': 'https://glidein.cern.ch/%s/%s' % (self.job_id, self.task_ad['CRAB_ReqName'].replace("_", ":")),
             }
    if not self.userWebDirPrx:
        storage_rules = htcondor.param['CRAB_StorageRules']
        self.userWebDirPrx = getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)
    self.logger.info("User web dir: %s", self.userWebDirPrx)

    insertJobIdSid(params, self.job_id, self.task_ad['CRAB_ReqName'], crab_retry)

def update_dashboard(self, crab_retry):
    """ Upload job monitoring information for this task to the Dashboard. """
    if not self.task_ad:
        return
    params = {'tool': 'crab3',
              'SubmissionType': 'crab3',
              'JSToolVersion': '3.3.0',
              'tool_ui': os.environ.get('HOSTNAME', ''),
              'scheduler': 'GLIDEIN',
              'GridName': self.task_ad['CRAB_UserDN'],
              'ApplicationVersion': self.task_ad['CRAB_JobSW'],
              'taskType': self.task_ad.get("CRAB_DashboardTaskType", 'analysistest'),
              'vo': 'cms',
              'CMSUser': self.task_ad['CRAB_UserHN'],
              'user': self.task_ad['CRAB_UserHN'],
              'taskId': self.task_ad['CRAB_ReqName'],
              'datasetFull': self.task_ad['DESIRED_CMSDataset'],
              'resubmitter': self.task_ad['CRAB_UserHN'],
              'exe': 'cmsRun',
              'broker': self.backend,
              'bossId': str(self.job_id),
              'localId': '',
              'SyncGridJobId': 'https://glidein.cern.ch/%s/%s' % (self.job_id, self.task_ad['CRAB_ReqName'].replace("_", ":")),
             }
    storage_rules = htcondor.param['CRAB_StorageRules']
    userWebDir = getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)
    userWebDirPrx = ""
    try:
        with open('proxied_webdir') as fd:
            userWebDirPrx = fd.read()
    except IOError as e:
        self.logger.error("I/O error(%s): %s, when looking for the proxied_webdir file."
                          " Might be normal if the schedd does not have a proxiedurl in"
                          " the REST external config." % (e.errno, e.strerror))

    self.logger.info("User web dir proxy: " + userWebDirPrx)
    self.logger.info("User web dir: " + userWebDir)

    if userWebDirPrx:
        setDashboardLogs(params, userWebDirPrx, self.job_id, crab_retry)
    elif userWebDir:
        setDashboardLogs(params, userWebDir, self.job_id, crab_retry)
    else:
        self.logger.warning("Not setting Dashboard log files: could not find CRAB_UserWebDir nor CRAB_UserWebDirPrx.")

    insertJobIdSid(params, self.job_id, self.task_ad['CRAB_ReqName'], crab_retry)

    apmon = ApmonIf()
    self.logger.debug("Dashboard task info: %s" % str(params))
    apmon.sendToML(params)
    apmon.free()

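# --- Illustrative sketch (not part of the original code) ---------------------
# The fallback rule implemented in update_dashboard() above, reduced to a pure
# function: prefer the proxied web directory when the marker file exists,
# otherwise fall back to the plain webdir, otherwise publish nothing. The file
# name mirrors the code above; everything else is illustrative.
def choose_webdir(proxied_webdir_file='proxied_webdir', plain_webdir=''):
    try:
        with open(proxied_webdir_file) as fd:
            proxied = fd.read().strip()
    except IOError:
        # Normal when the schedd has no proxied URL in the REST external config.
        proxied = ''
    return proxied or plain_webdir or None
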
def createSubdag(self, splitterResult, **kwargs):
    startjobid = 0
    dagSpecs = []

    if hasattr(self.config.TaskWorker, 'stageoutPolicy'):
        kwargs['task']['stageoutpolicy'] = ",".join(self.config.TaskWorker.stageoutPolicy)
    else:
        kwargs['task']['stageoutpolicy'] = "local,remote"

    ## In the future this parameter may be set by the user in the CRAB configuration
    ## file and we would take it from the Task DB.
    kwargs['task']['numautomjobretries'] = getattr(self.config.TaskWorker, 'numAutomJobRetries', 2)

    info = self.makeJobSubmit(kwargs['task'])

    outfiles = kwargs['task']['tm_outfiles'] + kwargs['task']['tm_tfile_outfiles'] + kwargs['task']['tm_edm_outfiles']

    os.chmod("CMSRunAnalysis.sh", 0o755)

    # This config setting acts as a global blacklist
    global_blacklist = set(self.getBlacklistedSites())
    self.logger.debug("CRAB site blacklist: %s" % (list(global_blacklist)))
    # This is needed for Site Metrics
    # It should not block any site for Site Metrics and, if needed, for other activities
    # self.config.TaskWorker.ActivitiesToRunEverywhere = ['hctest', 'hcdev']
    if hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and \
            kwargs['task']['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere:
        global_blacklist = set()
        self.logger.debug("Ignoring the CRAB site blacklist.")

    sitead = classad.ClassAd()
    siteinfo = {'group_sites': {}, 'group_datasites': {}}
    blocksWithNoLocations = set()

    siteWhitelist = set(kwargs['task']['tm_site_whitelist'])
    siteBlacklist = set(kwargs['task']['tm_site_blacklist'])
    self.logger.debug("Site whitelist: %s" % (list(siteWhitelist)))
    self.logger.debug("Site blacklist: %s" % (list(siteBlacklist)))

    if siteWhitelist & global_blacklist:
        msg = "The following sites from the user site whitelist are blacklisted by the CRAB server: %s." % (list(siteWhitelist & global_blacklist))
        msg += " Since the CRAB server blacklist has precedence, these sites are not considered in the user whitelist."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    if siteBlacklist & siteWhitelist:
        msg = "The following sites appear in both the user site blacklist and whitelist: %s." % (list(siteBlacklist & siteWhitelist))
        msg += " Since the whitelist has precedence, these sites are not considered in the blacklist."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    ignoreLocality = kwargs['task']['tm_ignore_locality'] == 'T'
    self.logger.debug("Ignore locality: %s" % (ignoreLocality))

    for jobgroup in splitterResult:
        jobs = jobgroup.getJobs()

        blocks = set()
        for job in jobs:
            for inputfile in job['input_files']:
                blocks.add(inputfile['block'])
        self.logger.debug("Blocks: %s" % list(blocks))

        if not jobs:
            locations = set()
        else:
            locations = set(jobs[0]['input_files'][0]['locations'])
        self.logger.debug("Locations: %s" % (list(locations)))

        ## Discard the blocks that have no locations. This can happen when a block is
        ## still open in PhEDEx. Newly created datasets from T0 (at least) have a large
        ## chance of having some block which is closed in DBS but not in PhEDEx.
        ## Open blocks in PhEDEx can have a location; it is WMCore that is returning no
        ## location.
        ## This is how a block is constructed during data taking:
        ## 1) an open block in T0 is injected into PhEDEx;
        ## 2) files are being added to the block in T0;
        ## 3) data are transferred by PhEDEx if a subscription is present;
        ## 4) once the block is finished:
        ##   a) the block is inserted into DBS as a closed block (before this, DBS has
        ##      no knowledge about the block);
        ##   b) the block is closed in PhEDEx.
        if not locations and not ignoreLocality:
            blocksWithNoLocations = blocksWithNoLocations.union(blocks)
            continue

        if ignoreLocality:
            sbj = SiteDB.SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                                     "cert": self.config.TaskWorker.cmscert})
            try:
                possiblesites = set(sbj.getAllCMSNames())
            except Exception as ex:
                msg = "The CRAB3 server backend could not contact SiteDB to get the list of all CMS sites."
                msg += " This could be a temporary SiteDB glitch."
                msg += " Please try to submit a new task (resubmit will not work)"
                msg += " and contact the experts if the error persists."
                msg += "\nError reason: %s" % (str(ex)) #TODO add the SiteDB url so the user can check themselves!
                raise TaskWorker.WorkerExceptions.TaskWorkerException(msg)
        else:
            possiblesites = locations
        ## At this point 'possiblesites' should never be empty.
        self.logger.debug("Possible sites: %s" % (list(possiblesites)))

        ## Apply the global site blacklist.
        availablesites = possiblesites - global_blacklist

        ## TODO: The messages below do not clarify that only the part of the dataset
        ## that passed the lumi-mask/run-range selection matters here.
        ## Abort the submission of the task if (part of?) the dataset is available only
        ## on sites that are blacklisted by the CRAB server.
        ## Or should we submit at least the jobs on the part of the dataset that
        ## survives the blacklisting? Comment S.Belforte Sep,2015: So far DDM policy
        ## is to replicate entire datasets, not scatter them around. Once we have
        ## very large datasets that can happen, but it is not the case now.
        if not availablesites:
            msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
            msg += " No site available for submission of task %s" % (kwargs['task']['tm_taskname'])
            msg += "\n\t\t\t\tThe sites available for submission of task %s are blacklisted by the CRAB3 server." % (kwargs['task']['tm_taskname'])
            msg += "\n\t\t\t\tThis is the list of in principle available sites: %s" % (list(possiblesites))
            msg += "\n\t\t\t\tThis is the list of sites that are blacklisted by the CRAB3 server: %s" % (list(global_blacklist))
            raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

        ## Abort the submission of the task if (part of?) the dataset is available only
        ## on sites that are removed after applying the user site blacklist/whitelist.
        ## Or should we submit at least the jobs on the part of the dataset that
        ## survives the blacklisting/whitelisting? (See S.Belforte comment above.)
        # NOTE: Users can still shoot themselves in the foot with the resubmit blacklist.
        # However, this is the last chance we have to warn the users about an impossible task at submit time.
        available = set(availablesites)
        if siteWhitelist:
            available &= siteWhitelist
            if not available:
                msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
                msg += " You put %s as site whitelist," % (list(siteWhitelist))
                msg += " but the input dataset '%s' can only be accessed at these sites: %s." % (kwargs['task']['tm_input_dataset'], list(availablesites))
                msg += " Please check your site whitelist."
                raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)
        available -= (siteBlacklist - siteWhitelist)
        if not available:
            msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
            msg += " You put %s as site blacklist," % (list(siteBlacklist - siteWhitelist))
            msg += " when the input dataset '%s' can actually only be accessed at these sites: %s." % (kwargs['task']['tm_input_dataset'], list(availablesites))
            msg += " Please check in DAS the locations of the input dataset."
            msg += " Hint: the ignoreLocality option might help."
            raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

        availablesites = [str(i) for i in availablesites]
        datasites = jobs[0]['input_files'][0]['locations']
        self.logger.info("Resulting available sites: %s" % (list(availablesites)))

        if siteWhitelist or siteBlacklist:
            msg = "The site whitelist and blacklist will be applied by the pre-job."
            msg += " This is expected to result in DESIRED_SITES = %s" % (list(available))
            self.logger.debug(msg)

        jobgroupDagSpecs, startjobid = self.makeDagSpecs(kwargs['task'], sitead, siteinfo,
                                                         jobgroup, list(blocks)[0], availablesites,
                                                         datasites, outfiles, startjobid)
        dagSpecs += jobgroupDagSpecs

    if not dagSpecs:
        msg = "No jobs created for task %s." % (kwargs['task']['tm_taskname'])
        if blocksWithNoLocations:
            msg = "The CRAB server backend refuses to send jobs to the Grid scheduler."
            msg += " No locations found for dataset '%s'" % (kwargs['task']['tm_input_dataset'])
            msg += " (or at least for the part of the dataset that passed the lumi-mask and/or run-range selection)."
        raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

    if blocksWithNoLocations:
        msg = "The following blocks from dataset '%s' were skipped," % (kwargs['task']['tm_input_dataset'])
        msg += " because they have no locations: %s." % (sorted(list(blocksWithNoLocations)))
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    ## Write down the DAG as needed by DAGMan.
    dag = DAG_HEADER % {'resthost': kwargs['task']['resthost'],
                        'resturiwfdb': kwargs['task']['resturinoapi'] + '/workflowdb'}
    for dagSpec in dagSpecs:
        dag += DAG_FRAGMENT % dagSpec

    ## Create a tarball with all the job lumi files.
    run_and_lumis_tar = tarfile.open("run_and_lumis.tar.gz", "w:gz")
    ## Also create a tarball with the dataset input files.
    ## Each .txt file in the tarball contains a list of dataset files to be used for the job.
    input_files_tar = tarfile.open("input_files.tar.gz", "w:gz")
    for dagSpec in dagSpecs:
        job_lumis_file = 'job_lumis_' + str(dagSpec['count']) + '.json'
        job_input_file_list = 'job_input_file_list_' + str(dagSpec['count']) + '.txt'
        with open(job_lumis_file, "w") as fd:
            fd.write(str(dagSpec['runAndLumiMask']))
        with open(job_input_file_list, "w") as fd:
            fd.write(str(dagSpec['inputFiles']))
        run_and_lumis_tar.add(job_lumis_file)
        input_files_tar.add(job_input_file_list)
        os.remove(job_lumis_file)
        os.remove(job_input_file_list)
    run_and_lumis_tar.close()
    input_files_tar.close()

    ## Save the DAG into a file.
    with open("RunJobs.dag", "w") as fd:
        fd.write(dag)

    with open("site.ad", "w") as fd:
        fd.write(str(sitead))

    with open("site.ad.json", "w") as fd:
        json.dump(siteinfo, fd)

    task_name = kwargs['task'].get('CRAB_ReqName', kwargs['task'].get('tm_taskname', ''))
    userdn = kwargs['task'].get('CRAB_UserDN', kwargs['task'].get('tm_user_dn', ''))

    info["jobcount"] = len(dagSpecs)

    maxpost = getattr(self.config.TaskWorker, 'maxPost', 20)
    if maxpost == -1:
        maxpost = info['jobcount']
    elif maxpost == 0:
        maxpost = int(max(20, info['jobcount'] * .1))
    info['maxpost'] = maxpost

    if info.get('faillimit') is None:
        info['faillimit'] = -1
        #if info['jobcount'] > 200:
        #    info['faillimit'] = 100
        #else:
        #    info['faillimit'] = -1
    elif info.get('faillimit') < 0:
        info['faillimit'] = -1

    # Info for ML:
    target_se = ''
    max_len_target_se = 900
    for site in map(str, availablesites):
        if len(target_se) > max_len_target_se:
            target_se += ',Many_More'
            break
        if len(target_se):
            target_se += ','
        target_se += site
    ml_info = info.setdefault('apmon', [])
    for idx in range(1, info['jobcount'] + 1):
        taskid = kwargs['task']['tm_taskname']
        jinfo = {'broker': os.environ.get('HOSTNAME', ''),
                 'bossId': str(idx),
                 'TargetSE': target_se,
                 'localId': '',
                 'StatusValue': 'pending',
                }
        insertJobIdSid(jinfo, idx, taskid, 0)
        ml_info.append(jinfo)

    # When running in standalone mode, we want to record the number of jobs in the task.
    if ('CRAB_ReqName' in kwargs['task']) and ('CRAB_UserDN' in kwargs['task']):
        const = 'TaskType =?= "ROOT" && CRAB_ReqName =?= "%s" && CRAB_UserDN =?= "%s"' % (task_name, userdn)
        cmd = "condor_qedit -const '%s' CRAB_JobCount %d" % (const, len(dagSpecs))
        self.logger.debug("+ %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        if status:
            self.logger.error(output)
            self.logger.error("Failed to record the number of jobs.")
            return 1

    return info, splitterResult

def createSubdag(self, splitterResult, **kwargs):
    startjobid = kwargs.get('startjobid', 0)
    subjob = kwargs.get('subjob', None)
    stage = kwargs.get('stage', 'conventional')
    self.logger.debug('starting createSubdag, kwargs are:')
    self.logger.debug(str(kwargs))
    dagSpecs = []
    subdags = []

    if hasattr(self.config.TaskWorker, 'stageoutPolicy'):
        kwargs['task']['stageoutpolicy'] = ",".join(self.config.TaskWorker.stageoutPolicy)
    else:
        kwargs['task']['stageoutpolicy'] = "local,remote"

    ## In the future this parameter may be set by the user in the CRAB configuration
    ## file and we would take it from the Task DB.
    kwargs['task']['numautomjobretries'] = getattr(self.config.TaskWorker, 'numAutomJobRetries', 2)

    kwargs['task']['max_runtime'] = kwargs['task']['tm_split_args'].get('seconds_per_job', -1)
    if kwargs['task']['tm_split_algo'] == 'Automatic' and stage == 'conventional':
        kwargs['task']['max_runtime'] = getattr(self.config.TaskWorker, 'splittingPilotRuntime', 15 * 60)
        kwargs['task']['completion_jobs'] = getattr(self.config.TaskWorker, 'completionJobs', False)
        outfiles = []
        stage = 'probe'
    if stage == 'process' and not kwargs['task']['completion_jobs']:
        kwargs['task']['max_runtime'] = -1
    if stage == 'probe':
        parent = None
        startjobid = -1
    else:
        parent = startjobid

    info = self.makeJobSubmit(kwargs['task'])

    outfiles = kwargs['task']['tm_outfiles'] + kwargs['task']['tm_tfile_outfiles'] + kwargs['task']['tm_edm_outfiles']

    os.chmod("CMSRunAnalysis.sh", 0o755)

    # This config setting acts as a global blacklist
    global_blacklist = set(self.getBlacklistedSites())
    self.logger.debug("CRAB site blacklist: %s" % (list(global_blacklist)))
    # This is needed for Site Metrics
    # It should not block any site for Site Metrics and, if needed, for other activities
    # self.config.TaskWorker.ActivitiesToRunEverywhere = ['hctest', 'hcdev']
    # The other case where the blacklist is ignored is if the user set this explicitly
    # in their configuration.
    if self.isGlobalBlacklistIgnored(kwargs) or \
            (hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and
             kwargs['task']['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere):
        global_blacklist = set()
        self.logger.debug("Ignoring the CRAB site blacklist.")

    sitead = classad.ClassAd()
    siteinfo = {'group_sites': {}, 'group_datasites': {}}

    blocksWithNoLocations = set()
    blocksWithBannedLocations = set()
    allblocks = set()

    siteWhitelist = set(kwargs['task']['tm_site_whitelist'])
    siteBlacklist = set(kwargs['task']['tm_site_blacklist'])
    self.logger.debug("Site whitelist: %s" % (list(siteWhitelist)))
    self.logger.debug("Site blacklist: %s" % (list(siteBlacklist)))

    if siteWhitelist & global_blacklist:
        msg = "The following sites from the user site whitelist are blacklisted by the CRAB server: %s." % (list(siteWhitelist & global_blacklist))
        msg += " Since the CRAB server blacklist has precedence, these sites are not considered in the user whitelist."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    if siteBlacklist & siteWhitelist:
        msg = "The following sites appear in both the user site blacklist and whitelist: %s." % (list(siteBlacklist & siteWhitelist))
        msg += " Since the whitelist has precedence, these sites are not considered in the blacklist."
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    ignoreLocality = kwargs['task']['tm_ignore_locality'] == 'T'
    self.logger.debug("Ignore locality: %s" % (ignoreLocality))

    for jobgroup in splitterResult[0]:
        self.logger.debug(dir(jobgroup))
        self.logger.debug(type(jobgroup))
        jobs = jobgroup.getJobs()

        jgblocks = set()  # job group blocks
        for job in jobs:
            for inputfile in job['input_files']:
                jgblocks.add(inputfile['block'])
                allblocks.add(inputfile['block'])
        self.logger.debug("Blocks: %s" % list(jgblocks))

        if not jobs:
            locations = set()
        else:
            locations = set(jobs[0]['input_files'][0]['locations'])
        self.logger.debug("Locations: %s" % (list(locations)))

        ## Discard the jgblocks that have no locations. This can happen when a block is
        ## still open in PhEDEx. Newly created datasets from T0 (at least) have a large
        ## chance of having some block which is closed in DBS but not in PhEDEx.
        ## Open blocks in PhEDEx can have a location; it is WMCore that is returning no
        ## location.
        ## This is how a block is constructed during data taking:
        ## 1) an open block in T0 is injected into PhEDEx;
        ## 2) files are being added to the block in T0;
        ## 3) data are transferred by PhEDEx if a subscription is present;
        ## 4) once the block is finished:
        ##   a) the block is inserted into DBS as a closed block (before this, DBS has
        ##      no knowledge about the block);
        ##   b) the block is closed in PhEDEx.
        if not locations and not ignoreLocality:
            blocksWithNoLocations = blocksWithNoLocations.union(jgblocks)
            continue

        if ignoreLocality:
            sbj = SiteDB.SiteDBJSON({"key": self.config.TaskWorker.cmskey,
                                     "cert": self.config.TaskWorker.cmscert})
            try:
                possiblesites = set(sbj.getAllCMSNames())
            except Exception as ex:
                msg = "The CRAB3 server backend could not contact SiteDB to get the list of all CMS sites."
                msg += " This could be a temporary SiteDB glitch."
                msg += " Please try to submit a new task (resubmit will not work)"
                msg += " and contact the experts if the error persists."
                msg += "\nError reason: %s" % (str(ex)) #TODO add the SiteDB url so the user can check themselves!
                raise TaskWorker.WorkerExceptions.TaskWorkerException(msg)
        else:
            possiblesites = locations
        ## At this point 'possiblesites' should never be empty.
        self.logger.debug("Possible sites: %s" % (list(possiblesites)))

        ## Apply the global site blacklist.
        availablesites = possiblesites - global_blacklist

        ## See https://github.com/dmwm/CRABServer/issues/5241
        ## for a discussion about blocksWithBannedLocations.
        if not availablesites:
            blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
            continue

        # NOTE: Users can still shoot themselves in the foot with the resubmit blacklist.
        # However, this is the last chance we have to warn the users about an impossible task at submit time.
        available = set(availablesites)
        if siteWhitelist:
            available &= siteWhitelist
            if not available:
                blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
                continue
        available -= (siteBlacklist - siteWhitelist)
        if not available:
            blocksWithBannedLocations = blocksWithBannedLocations.union(jgblocks)
            continue

        availablesites = [str(i) for i in availablesites]
        datasites = jobs[0]['input_files'][0]['locations']
        self.logger.info("Resulting available sites: %s" % (list(availablesites)))

        if siteWhitelist or siteBlacklist:
            msg = "The site whitelist and blacklist will be applied by the pre-job."
            msg += " This is expected to result in DESIRED_SITES = %s" % (list(available))
            self.logger.debug(msg)

        jobgroupDagSpecs, startjobid = self.makeDagSpecs(kwargs['task'], sitead, siteinfo,
                                                         jobgroup, list(jgblocks)[0], availablesites,
                                                         datasites, outfiles, startjobid,
                                                         subjob=subjob, stage=stage)
        dagSpecs += jobgroupDagSpecs

    def getBlacklistMsg():
        tmp = ""
        if len(global_blacklist) != 0:
            tmp += " Global CRAB3 blacklist is %s.\n" % global_blacklist
        if len(siteBlacklist) != 0:
            tmp += " User blacklist is %s.\n" % siteBlacklist
        if len(siteWhitelist) != 0:
            tmp += " User whitelist is %s.\n" % siteWhitelist
        return tmp

    if not dagSpecs:
        msg = "No jobs created for task %s." % (kwargs['task']['tm_taskname'])
        if blocksWithNoLocations or blocksWithBannedLocations:
            msg = "The CRAB server backend refuses to send jobs to the Grid scheduler. "
            msg += "No locations found for dataset '%s' " % (kwargs['task']['tm_input_dataset'])
            msg += "(or at least for the part of the dataset that passed the lumi-mask and/or run-range selection).\n"
            if blocksWithBannedLocations:
                msg += " Found %s (out of %s) blocks present only at blacklisted sites." % (len(blocksWithBannedLocations), len(allblocks))
                msg += getBlacklistMsg()
        raise TaskWorker.WorkerExceptions.NoAvailableSite(msg)

    msg = "Some blocks from dataset '%s' were skipped" % (kwargs['task']['tm_input_dataset'])
    if blocksWithNoLocations:
        msgBlocklist = sorted(list(blocksWithNoLocations))[:10] + ['...']
        msg += " because they have no locations.\n List is (first 10 elements only): %s.\n" % msgBlocklist
    if blocksWithBannedLocations:
        msg += " because they are only present at blacklisted sites.\n List is: %s.\n" % (sorted(list(blocksWithBannedLocations)))
        msg += getBlacklistMsg()
    if blocksWithNoLocations or blocksWithBannedLocations:
        msg += (" Dataset processing will be incomplete because %s (out of %s) blocks"
                " have no locations or are only present at blacklisted site(s)." %
                (len(blocksWithNoLocations) + len(blocksWithBannedLocations), len(allblocks)))
        self.uploadWarning(msg, kwargs['task']['user_proxy'], kwargs['task']['tm_taskname'])
        self.logger.warning(msg)

    ## Write down the DAG as needed by DAGMan.
    dag = DAG_HEADER.format(nodestate='' if not parent else '.{0}'.format(parent),
                            resthost=kwargs['task']['resthost'],
                            resturiwfdb=kwargs['task']['resturinoapi'] + '/workflowdb')
    if stage == 'probe':
        # We want only one probe job.
        dagSpecs = dagSpecs[:1]
    for dagSpec in dagSpecs:
        dag += DAG_FRAGMENT.format(**dagSpec)
        if stage == 'probe' or (stage == 'process' and kwargs['task']['completion_jobs']):
            dag += SUBDAG_FRAGMENT.format(**dagSpec)
            subdag = "RunJobs{0}.subdag".format(dagSpec['count'])
            with open(subdag, "w") as fd:
                fd.write("")
            subdags.append(subdag)

    ## Create a tarball with all the job lumi files.
    with getLock('splitting_data'):
        self.logger.debug("Acquired lock on run and lumi tarball")
        try:
            tempDir = tempfile.mkdtemp()
            tempDir2 = tempfile.mkdtemp()
            try:
                tfd = tarfile.open('run_and_lumis.tar.gz', 'r:gz')
                tfd.extractall(tempDir)
                tfd.close()
            except (tarfile.ReadError, IOError):
                self.logger.debug("First iteration: creating run and lumi tarball from scratch")
            try:
                tfd2 = tarfile.open('input_files.tar.gz', 'r:gz')
                tfd2.extractall(tempDir2)
                tfd2.close()
            except (tarfile.ReadError, IOError):
                self.logger.debug("First iteration: creating input files tarball from scratch")
            tfd = tarfile.open('run_and_lumis.tar.gz', 'w:gz')
            tfd2 = tarfile.open('input_files.tar.gz', 'w:gz')
            for dagSpec in dagSpecs:
                job_lumis_file = os.path.join(tempDir, 'job_lumis_' + str(dagSpec['count']) + '.json')
                with open(job_lumis_file, "w") as fd:
                    fd.write(str(dagSpec['runAndLumiMask']))
                ## Also create a tarball with the dataset input files.
                ## Each .txt file in the tarball contains a list of dataset files to be used for the job.
                job_input_file_list = os.path.join(tempDir2, 'job_input_file_list_' + str(dagSpec['count']) + '.txt')
                with open(job_input_file_list, "w") as fd2:
                    fd2.write(str(dagSpec['inputFiles']))
        finally:
            tfd.add(tempDir, arcname='')
            tfd.close()
            shutil.rmtree(tempDir)
            tfd2.add(tempDir2, arcname='')
            tfd2.close()
            shutil.rmtree(tempDir2)

    if stage in ('probe', 'conventional'):
        name = "RunJobs.dag"
        ## Cache data discovery
        with open("datadiscovery.pkl", "wb") as fd:
            pickle.dump(splitterResult[1], fd)
        ## Cache task information
        with open("taskinformation.pkl", "wb") as fd:
            pickle.dump(kwargs['task'], fd)
    else:
        name = "RunJobs{0}.subdag".format(parent)
    if stage != 'tail':
        ## Cache site information
        with open("site.ad", "w") as fd:
            fd.write(str(sitead))
        with open("site.ad.json", "w") as fd:
            json.dump(siteinfo, fd)

    ## Save the DAG into a file.
    with open(name, "w") as fd:
        fd.write(dag)

    task_name = kwargs['task'].get('CRAB_ReqName', kwargs['task'].get('tm_taskname', ''))
    userdn = kwargs['task'].get('CRAB_UserDN', kwargs['task'].get('tm_user_dn', ''))

    info["jobcount"] = len(dagSpecs)

    maxpost = getattr(self.config.TaskWorker, 'maxPost', 20)
    if maxpost == -1:
        maxpost = info['jobcount']
    elif maxpost == 0:
        maxpost = int(max(20, info['jobcount'] * .1))
    info['maxpost'] = maxpost

    if info.get('faillimit') is None:
        info['faillimit'] = -1
        #if info['jobcount'] > 200:
        #    info['faillimit'] = 100
        #else:
        #    info['faillimit'] = -1
    elif info.get('faillimit') < 0:
        info['faillimit'] = -1

    # Info for ML:
    target_se = ''
    max_len_target_se = 900
    for site in map(str, availablesites):
        if len(target_se) > max_len_target_se:
            target_se += ',Many_More'
            break
        if len(target_se):
            target_se += ','
        target_se += site
    ml_info = info.setdefault('apmon', [])
    shift = 0 if stage == 'probe' else 1
    for idx in range(shift, info['jobcount'] + shift):
        taskid = kwargs['task']['tm_taskname']
        jinfo = {'broker': os.environ.get('HOSTNAME', ''),
                 'bossId': str(idx),
                 'TargetSE': target_se,
                 'localId': '',
                 'StatusValue': 'pending',
                }
        insertJobIdSid(jinfo, idx, taskid, 0)
        ml_info.append(jinfo)

    # When running in standalone mode, we want to record the number of jobs in the task.
    if ('CRAB_ReqName' in kwargs['task']) and ('CRAB_UserDN' in kwargs['task']):
        const = 'TaskType =?= "ROOT" && CRAB_ReqName =?= "%s" && CRAB_UserDN =?= "%s"' % (task_name, userdn)
        cmd = "condor_qedit -const '%s' CRAB_JobCount %d" % (const, len(dagSpecs))
        self.logger.debug("+ %s" % cmd)
        status, output = commands.getstatusoutput(cmd)
        if status:
            self.logger.error(output)
            self.logger.error("Failed to record the number of jobs.")
            return 1

    return info, splitterResult, subdags

def killTransfers(self, apmon):
    self.logger.info("About to kill transfers from workflow %s." % self.workflow)
    asourl = self.task.get('tm_asourl', None)
    # Default asodb to 'asynctransfer'; for old tasks this field is empty!
    # Probably tm_asodb is always there and the get is not necessary, but let's not assume this.
    asodb = self.task.get('tm_asodb', 'asynctransfer') or 'asynctransfer'
    if not asourl:
        self.logger.info("ASO URL not set; will not kill transfers")
        return False

    try:
        hostname = socket.getfqdn()
    except Exception:
        hostname = ''

    server = CMSCouch.CouchServer(dburl=asourl, ckey=self.proxy, cert=self.proxy)
    try:
        db = server.connectDatabase(asodb)
    except Exception:
        msg = "Error while connecting to asynctransfer CouchDB"
        self.logger.exception(msg)
        raise TaskWorkerException(msg)
    self.queryKill = {'reduce': False, 'key': self.workflow, 'include_docs': True}
    try:
        filesKill = db.loadView('AsyncTransfer', 'forKill', self.queryKill)['rows']
    except Exception:
        msg = "Error while querying the asynctransfer CouchDB"
        self.logger.exception(msg)
        raise TaskWorkerException(msg)
    if len(filesKill) == 0:
        self.logger.warning('No files to kill found')
    for idt in filesKill:
        now = str(datetime.datetime.now())
        docid = idt['value']
        data = {'end_time': now,
                'state': 'killed',
                'last_update': time.time(),
                'retry': now,
               }
        updateUri = "/%s/_design/AsyncTransfer/_update/updateJobs/%s?%s" % (db.name, docid, urllib.urlencode(data))
        jobid = idt.get('jobid')
        jobretry = idt.get('job_retry_count')
        if not self.task['kill_all']:
            if idt.get("jobid") not in self.task['kill_ids']:
                continue
        self.logger.info("Killing transfer %s (job ID %s; job retry %s)." % (docid, str(jobid), str(jobretry)))
        if jobid is not None and jobretry is not None:
            jobid = str(jobid)
            jobretry = str(jobretry)
            jinfo = {'broker': hostname,
                     'bossId': jobid,
                     'StatusValue': 'killed',
                    }
            insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
            self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
            apmon.sendToML(jinfo)
        try:
            db.makeRequest(uri=updateUri, type="PUT", decode=False)
        except Exception as ex:
            msg = "Error updating document in CouchDB: "
            msg += str(ex)
            msg += str(traceback.format_exc())
            raise TaskWorkerException(msg)
    return True

def killTransfers(self, apmon):
    self.logger.info("About to kill transfers from workflow %s." % self.workflow)
    ASOURL = self.task.get('tm_asourl', None)
    if not ASOURL:
        self.logger.info("ASO URL not set; will not kill transfers")
        return False

    try:
        hostname = socket.getfqdn()
    except Exception:
        hostname = ''

    server = CMSCouch.CouchServer(dburl=ASOURL, ckey=self.proxy, cert=self.proxy)
    try:
        db = server.connectDatabase('asynctransfer')
    except Exception:
        msg = "Error while connecting to asynctransfer CouchDB"
        self.logger.exception(msg)
        raise TaskWorkerException(msg)
    self.queryKill = {'reduce': False, 'key': self.workflow, 'include_docs': True}
    try:
        filesKill = db.loadView('AsyncTransfer', 'forKill', self.queryKill)['rows']
    except Exception:
        msg = "Error while querying the asynctransfer CouchDB"
        self.logger.exception(msg)
        raise TaskWorkerException(msg)
    if len(filesKill) == 0:
        self.logger.warning('No files to kill found')
    for idt in filesKill:
        now = str(datetime.datetime.now())
        docid = idt['value']
        data = {'end_time': now,
                'state': 'killed',
                'last_update': time.time(),
                'retry': now,
               }
        updateUri = "/%s/_design/AsyncTransfer/_update/updateJobs/%s?%s" % (db.name, docid, urllib.urlencode(data))
        jobid = idt.get('jobid')
        jobretry = idt.get('job_retry_count')
        if not self.task['kill_all']:
            if idt.get("jobid") not in self.task['kill_ids']:
                continue
        self.logger.info("Killing transfer %s (job ID %s; job retry %s)." % (docid, str(jobid), str(jobretry)))
        if jobid is not None and jobretry is not None:
            jobid = str(jobid)
            jobretry = str(jobretry)
            jinfo = {'broker': hostname,
                     'bossId': jobid,
                     'StatusValue': 'killed',
                    }
            insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
            self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
            apmon.sendToML(jinfo)
        try:
            db.makeRequest(uri=updateUri, type="PUT", decode=False)
        except Exception as ex:
            msg = "Error updating document in CouchDB: "
            msg += str(ex)
            msg += str(traceback.format_exc())
            raise TaskWorkerException(msg)
    return True

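# --- Illustrative sketch (not part of the original code) ---------------------
# How the CouchDB update-handler URI in killTransfers() is assembled, using
# only the standard library. The database name, document ID, and field values
# are examples; the actual request goes through WMCore's CMSCouch client.
import datetime
import time
try:
    from urllib import urlencode        # Python 2, as in the code above
except ImportError:
    from urllib.parse import urlencode  # Python 3

now = str(datetime.datetime.now())
data = {'end_time': now,
        'state': 'killed',
        'last_update': time.time(),
        'retry': now,
       }
doc_id = 'abc123'  # hypothetical transfer document ID
update_uri = "/%s/_design/AsyncTransfer/_update/updateJobs/%s?%s" % ('asynctransfer', doc_id, urlencode(data))
print(update_uri)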