def master_prepare(self,app,appmasterconfig):
    # PandaTools
    from pandatools import Client
    from pandatools import AthenaUtils
    from taskbuffer.JobSpec import JobSpec
    from taskbuffer.FileSpec import FileSpec

    job = app._getParent()
    logger.debug('AthenaMCPandaRTHandler master_prepare called for %s', job.getFQID('.'))

    usertag = configDQ2['usertag']
    #usertag='user09'
    nickname = getNickname(allowMissingNickname=True)
    self.libDataset = '%s.%s.ganga.%s_%d.lib._%06d' % (usertag,nickname,commands.getoutput('hostname').split('.')[0],int(time.time()),job.id)
    # self.userprefix='%s.%s.ganga' % (usertag,gridProxy.identity())
    sources = 'sources.%s.tar.gz' % commands.getoutput('uuidgen 2> /dev/null')
    self.library = '%s.lib.tgz' % self.libDataset

    # check DBRelease
    # if job.backend.dbRelease != '' and job.backend.dbRelease.find(':') == -1:
    #     raise ApplicationConfigurationError(None,"ERROR : invalid argument for backend.dbRelease. Must be 'DatasetName:FileName'")

    # unpack library
    logger.debug('Creating source tarball ...')
    tmpdir = '/tmp/%s' % commands.getoutput('uuidgen 2> /dev/null')
    os.mkdir(tmpdir)

    inputbox = []
    if os.path.exists(app.transform_archive):
        # must add a condition on size.
        inputbox += [ File(app.transform_archive) ]
    if app.evgen_job_option:
        self.evgen_job_option = app.evgen_job_option
        if os.path.exists(app.evgen_job_option):
            # locally modified job option file to add to the input sand box
            inputbox += [ File(app.evgen_job_option) ]
            self.evgen_job_option = app.evgen_job_option.split("/")[-1]

    # add input sandbox files
    if (job.inputsandbox):
        for file in job.inputsandbox:
            inputbox += [ file ]
    # add option files
    for extFile in job.backend.extOutFile:
        try:
            shutil.copy(extFile,tmpdir)
        except IOError:
            os.makedirs(tmpdir)
            shutil.copy(extFile,tmpdir)
    # fill the archive
    for opt_file in inputbox:
        try:
            shutil.copy(opt_file.name,tmpdir)
        except IOError:
            os.makedirs(tmpdir)
            shutil.copy(opt_file.name,tmpdir)

    # now tar it up again
    inpw = job.getInputWorkspace()
    rc, output = commands.getstatusoutput('tar czf %s -C %s .' % (inpw.getPath(sources),tmpdir))
    if rc:
        logger.error('Packing sources failed with status %d',rc)
        logger.error(output)
        raise ApplicationConfigurationError(None,'Packing sources failed.')
    shutil.rmtree(tmpdir)

    # upload sources
    logger.debug('Uploading source tarball ...')
    try:
        cwd = os.getcwd()
        os.chdir(inpw.getPath())
        rc, output = Client.putFile(sources)
        if output != 'True':
            logger.error('Uploading sources %s failed. Status = %d', sources, rc)
            logger.error(output)
            raise ApplicationConfigurationError(None,'Uploading archive failed')
    finally:
        os.chdir(cwd)

    # Use Panda's brokerage
##    if job.inputdata and len(app.sites)>0:
##        # update cloud, use inputdata's
##        from dq2.info.TiersOfATLAS import whichCloud,ToACache
##        inclouds=[]
##        for site in app.sites:
##            cloudSite=whichCloud(app.sites[0])
##            if cloudSite not in inclouds:
##                inclouds.append(cloudSite)
##        # now converting inclouds content into proper brokering stuff.
##        outclouds=[]
##        for cloudSite in inclouds:
##            for cloudID, eachCloud in ToACache.dbcloud.iteritems():
##                if cloudSite==eachCloud:
##                    cloud=cloudID
##                    outclouds.append(cloud)
##                    break
##        print outclouds
##        # finally, matching with user's wishes
##        if len(outclouds)>0:
##            if not job.backend.requirements.cloud: # no user wish, update
##                job.backend.requirements.cloud=outclouds[0]
##            else:
##                try:
##                    assert job.backend.requirements.cloud in outclouds
##                except:
##                    raise ApplicationConfigurationError(None,'Input dataset not available in target cloud %s. Please try any of the following %s' % (job.backend.requirements.cloud, str(outclouds)))

    from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
    runPandaBrokerage(job)
    if job.backend.site == 'AUTO':
        raise ApplicationConfigurationError(None,'site is still AUTO after brokerage!')

    # output dataset preparation and registration
    try:
        outDsLocation = Client.PandaSites[job.backend.site]['ddm']
    except:
        raise ApplicationConfigurationError(None,"Could not extract output dataset location from job.backend.site value: %s. Aborting" % job.backend.site)
    if not app.dryrun:
        for outtype in app.outputpaths.keys():
            dset = string.replace(app.outputpaths[outtype],"/",".")
            dset = dset[1:]
            # dataset registration must be done only once.
            print "registering output dataset %s at %s" % (dset,outDsLocation)
            try:
                Client.addDataset(dset,False,location=outDsLocation)
                dq2_set_dataset_lifetime(dset, location=outDsLocation)
            except:
                raise ApplicationConfigurationError(None,"Fail to create output dataset %s. Aborting" % dset)
        # extend registration to build job lib dataset:
        print "registering output dataset %s at %s" % (self.libDataset,outDsLocation)
        try:
            Client.addDataset(self.libDataset,False,location=outDsLocation)
            dq2_set_dataset_lifetime(self.libDataset, outDsLocation)
        except:
            raise ApplicationConfigurationError(None,"Fail to create output dataset %s. Aborting" % self.libDataset)

    ###
    cacheVer = "-AtlasProduction_" + str(app.prod_release)

    logger.debug("master job submit?")
    self.outsite = job.backend.site
    if app.se_name and app.se_name != "none" and not self.outsite:
        self.outsite = app.se_name

    # create build job
    jspec = JobSpec()
    jspec.jobDefinitionID   = job.id
    jspec.jobName           = commands.getoutput('uuidgen 2> /dev/null')
    jspec.AtlasRelease      = 'Atlas-%s' % app.atlas_rel
    jspec.homepackage       = 'AnalysisTransforms'+cacheVer #+nightVer
    # common base to Athena and AthenaMC jobs: buildJob is a pilot job which takes care of all inputs for the real jobs (in prepare())
    jspec.transformation    = '%s/buildJob-00-00-03' % Client.baseURLSUB
    jspec.destinationDBlock = self.libDataset
    jspec.destinationSE     = job.backend.site
    jspec.prodSourceLabel   = 'panda'
    jspec.assignedPriority  = 2000
    jspec.computingSite     = job.backend.site
    jspec.cloud             = job.backend.requirements.cloud
    # jspec.jobParameters   = self.args not known yet
    jspec.jobParameters     = '-o %s' % (self.library)
    if app.userarea:
        print app.userarea
        jspec.jobParameters += ' -i %s' % (os.path.basename(app.userarea))
    else:
        jspec.jobParameters += ' -i %s' % (sources)
    jspec.cmtConfig         = AthenaUtils.getCmtConfig(athenaVer=app.atlas_rel)

    matchURL = re.search('(http.*://[^/]+)/',Client.baseURLSSL)
    if matchURL:
        jspec.jobParameters += ' --sourceURL %s' % matchURL.group(1)

    fout = FileSpec()
    fout.lfn  = self.library
    fout.type = 'output'
    fout.dataset = self.libDataset
    fout.destinationDBlock = self.libDataset
    jspec.addFile(fout)

    flog = FileSpec()
    flog.lfn = '%s.log.tgz' % self.libDataset
    flog.type = 'log'
    flog.dataset = self.libDataset
    flog.destinationDBlock = self.libDataset
    jspec.addFile(flog)

    #print "MASTER JOB DETAILS:",jspec.jobParameters

    return jspec
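
# Illustrative sketch, not part of the handler above: the tar-and-upload step of
# master_prepare isolated into a standalone helper. The 'tar czf' command and the
# Client.putFile call (which must be issued from the directory containing the file
# and returns (status, output) with output == 'True' on success) mirror the code
# above; the helper name, its arguments and the use of RuntimeError instead of
# ApplicationConfigurationError are assumptions made for this example only.
def _example_pack_and_upload_sources(tmpdir, tarball_path):
    """Create tarball_path from the contents of tmpdir and upload it to Panda."""
    import os
    import commands
    from pandatools import Client
    # pack everything that was staged into tmpdir
    rc, output = commands.getstatusoutput('tar czf %s -C %s .' % (tarball_path, tmpdir))
    if rc:
        raise RuntimeError('Packing sources failed: %s' % output)
    cwd = os.getcwd()
    try:
        # putFile is called with a bare filename, so change into the tarball directory first
        os.chdir(os.path.dirname(tarball_path))
        rc, output = Client.putFile(os.path.basename(tarball_path))
        if output != 'True':
            raise RuntimeError('Uploading %s failed (status=%s): %s' % (tarball_path, rc, output))
    finally:
        os.chdir(cwd)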
def master_prepare(self,app,appconfig): '''Prepare the master job''' from pandatools import Client from pandatools import MiscUtils from pandatools import AthenaUtils from pandatools import PsubUtils # create a random number for this submission to allow multiple use of containers self.rndSubNum = random.randint(1111,9999) job = app._getParent() logger.debug('AthenaJediRTHandler master_prepare called for %s', job.getFQID('.')) if app.useRootCoreNoBuild: logger.info('Athena.useRootCoreNoBuild is True, setting Panda.nobuild=True.') job.backend.nobuild = True if job.backend.bexec and job.backend.nobuild: raise ApplicationConfigurationError("Contradicting options: job.backend.bexec and job.backend.nobuild are both enabled.") if job.backend.requirements.rootver != '' and job.backend.nobuild: raise ApplicationConfigurationError("Contradicting options: job.backend.requirements.rootver given and job.backend.nobuild are enabled.") # Switch on compilation flag if bexec is set or libds is empty if job.backend.bexec != '' or not job.backend.nobuild: app.athena_compile = True for sj in job.subjobs: sj.application.athena_compile = True logger.info('"job.backend.nobuild=False" or "job.backend.bexec" is set - Panda build job is enabled.') if job.backend.nobuild: app.athena_compile = False for sj in job.subjobs: sj.application.athena_compile = False logger.info('"job.backend.nobuild=True" or "--nobuild" chosen - Panda build job is switched off.') # check for auto datri if job.outputdata.location != '': if not PsubUtils.checkDestSE(job.outputdata.location,job.outputdata.datasetname,False): raise ApplicationConfigurationError("Problems with outputdata.location setting '%s'" % job.outputdata.location) # validate application if not app.atlas_release and not job.backend.requirements.rootver and not app.atlas_exetype in [ 'EXE' ]: raise ApplicationConfigurationError("application.atlas_release is not set. Did you run application.prepare()") self.dbrelease = app.atlas_dbrelease if self.dbrelease != '' and self.dbrelease != 'LATEST' and self.dbrelease.find(':') == -1: raise ApplicationConfigurationError("ERROR : invalid argument for DB Release. Must be 'LATEST' or 'DatasetName:FileName'") self.runConfig = AthenaUtils.ConfigAttr(app.atlas_run_config) for k in self.runConfig.keys(): self.runConfig[k]=AthenaUtils.ConfigAttr(self.runConfig[k]) if not app.atlas_run_dir: raise ApplicationConfigurationError("application.atlas_run_dir is not set. Did you run application.prepare()") self.rundirectory = app.atlas_run_dir self.cacheVer = '' if app.atlas_project and app.atlas_production: self.cacheVer = "-" + app.atlas_project + "_" + app.atlas_production # handle different atlas_exetypes self.job_options = '' if app.atlas_exetype == 'TRF': self.job_options += ' '.join([os.path.basename(fopt.name) for fopt in app.option_file]) #if not job.outputdata.outputdata: # raise ApplicationConfigurationError("job.outputdata.outputdata is required for atlas_exetype in ['PYARA','ARES','TRF','ROOT','EXE' ] and Panda backend") #raise ApplicationConfigurationError("Sorry TRF on Panda backend not yet supported") if app.options: self.job_options += ' %s ' % app.options elif app.atlas_exetype == 'ATHENA': if len(app.atlas_environment) > 0 and app.atlas_environment[0].find('DBRELEASE_OVERRIDE')==-1: logger.warning("Passing of environment variables to Athena using Panda not supported. 
Ignoring atlas_environment setting.") if job.outputdata.outputdata: raise ApplicationConfigurationError("job.outputdata.outputdata must be empty if atlas_exetype='ATHENA' and Panda backend is used (outputs are auto-detected)") if app.options: if app.options.startswith('-c'): self.job_options += ' %s ' % app.options else: self.job_options += ' -c %s ' % app.options logger.warning('The value of j.application.options has been prepended with " -c " ') logger.warning('Please make sure to use proper quotes for the values of j.application.options !') self.job_options += ' '.join([os.path.basename(fopt.name) for fopt in app.option_file]) # check for TAG compression if 'subcoll.tar.gz' in app.append_to_user_area: self.job_options = ' uncompress.py ' + self.job_options elif app.atlas_exetype in ['PYARA','ARES','ROOT','EXE']: #if not job.outputdata.outputdata: # raise ApplicationConfigurationError("job.outputdata.outputdata is required for atlas_exetype in ['PYARA','ARES','TRF','ROOT','EXE' ] and Panda backend") self.job_options += ' '.join([os.path.basename(fopt.name) for fopt in app.option_file]) # sort out environment variables env_str = "" if len(app.atlas_environment) > 0: for env_var in app.atlas_environment: env_str += "export %s ; " % env_var else: env_str = "" # below fixes issue with runGen -- job_options are executed by os.system when dbrelease is used, and by the shell otherwise ## - REMOVED FIX DUE TO CHANGE IN PILOT - MWS 8/11/11 if job.backend.requirements.usecommainputtxt: input_str = '/bin/echo %IN > input.txt; cat input.txt; ' else: input_str = '/bin/echo %IN | sed \'s/,/\\\n/g\' > input.txt; cat input.txt; ' if app.atlas_exetype == 'PYARA': self.job_options = env_str + input_str + ' python ' + self.job_options elif app.atlas_exetype == 'ARES': self.job_options = env_str + input_str + ' athena.py ' + self.job_options elif app.atlas_exetype == 'ROOT': self.job_options = env_str + input_str + ' root -b -q ' + self.job_options elif app.atlas_exetype == 'EXE': self.job_options = env_str + input_str + self.job_options if app.options: self.job_options += ' %s ' % app.options if self.job_options == '': raise ApplicationConfigurationError("No Job Options found!") logger.info('Running job options: %s'%self.job_options) # validate dbrelease if self.dbrelease != "LATEST": self.dbrFiles,self.dbrDsList = getDBDatasets(self.job_options,'',self.dbrelease) # handle the output dataset if job.outputdata: if job.outputdata._name != 'DQ2OutputDataset': raise ApplicationConfigurationError('Panda backend supports only DQ2OutputDataset') else: logger.info('Adding missing DQ2OutputDataset') job.outputdata = DQ2OutputDataset() # validate the output dataset name (and make it a container) job.outputdata.datasetname,outlfn = dq2outputdatasetname(job.outputdata.datasetname, job.id, job.outputdata.isGroupDS, job.outputdata.groupname) if not job.outputdata.datasetname.endswith('/'): job.outputdata.datasetname+='/' # add extOutFiles self.extOutFile = [] for tmpName in job.outputdata.outputdata: if tmpName != '': self.extOutFile.append(tmpName) for tmpName in job.backend.extOutFile: if tmpName != '': self.extOutFile.append(tmpName) # use the shared area if possible tmp_user_area_name = app.user_area.name if app.is_prepared is not True: from Ganga.Utility.files import expandfilename shared_path = os.path.join(expandfilename(getConfig('Configuration')['gangadir']),'shared',getConfig('Configuration')['user']) tmp_user_area_name = 
os.path.join(os.path.join(shared_path,app.is_prepared.name),os.path.basename(app.user_area.name)) # Add inputsandbox to user_area if job.inputsandbox: logger.warning("Submitting Panda job with inputsandbox. This may slow the submission slightly.") if tmp_user_area_name: inpw = os.path.dirname(tmp_user_area_name) self.inputsandbox = os.path.join(inpw, 'sources.%s.tar' % commands.getoutput('uuidgen 2> /dev/null')) else: inpw = job.getInputWorkspace() self.inputsandbox = inpw.getPath('sources.%s.tar' % commands.getoutput('uuidgen 2> /dev/null')) if tmp_user_area_name: rc, output = commands.getstatusoutput('cp %s %s.gz' % (tmp_user_area_name, self.inputsandbox)) if rc: logger.error('Copying user_area failed with status %d',rc) logger.error(output) raise ApplicationConfigurationError('Packing inputsandbox failed.') rc, output = commands.getstatusoutput('gunzip %s.gz' % (self.inputsandbox)) if rc: logger.error('Unzipping user_area failed with status %d',rc) logger.error(output) raise ApplicationConfigurationError('Packing inputsandbox failed.') for fname in [os.path.abspath(f.name) for f in job.inputsandbox]: fname.rstrip(os.sep) path = os.path.dirname(fname) fn = os.path.basename(fname) #app.atlas_run_dir # get Athena versions rc, out = AthenaUtils.getAthenaVer() # failed if not rc: #raise ApplicationConfigurationError('CMT could not parse correct environment ! \n Did you start/setup ganga in the run/ or cmt/ subdirectory of your athena analysis package ?') logger.warning("CMT could not parse correct environment for inputsandbox - will use the atlas_run_dir as default") # as we don't have to be in the run dir now, create a copy of the run_dir directory structure and use that input_dir = os.path.dirname(self.inputsandbox) run_path = "%s/sbx_tree/%s" % (input_dir, app.atlas_run_dir) rc, output = commands.getstatusoutput("mkdir -p %s" % run_path) if not rc: # copy this sandbox file rc, output = commands.getstatusoutput("cp %s %s" % (fname, run_path)) if not rc: path = os.path.join(input_dir, 'sbx_tree') fn = os.path.join(app.atlas_run_dir, fn) else: raise ApplicationConfigurationError("Couldn't copy file %s to recreate run_dir for input sandbox" % fname) else: raise ApplicationConfigurationError("Couldn't create directory structure to match run_dir %s for input sandbox" % run_path) else: userarea = out['workArea'] # strip the path from the filename if present in the userarea ua = os.path.abspath(userarea) if ua in path: fn = fname[len(ua)+1:] path = ua rc, output = commands.getstatusoutput('tar -h -r -f %s -C %s %s' % (self.inputsandbox, path, fn)) if rc: logger.error('Packing inputsandbox failed with status %d',rc) logger.error(output) raise ApplicationConfigurationError('Packing inputsandbox failed.') # remove sandbox tree if created if "sbx_tree" in os.listdir(os.path.dirname(self.inputsandbox)): rc, output = commands.getstatusoutput("rm -r %s/sbx_tree" % os.path.dirname(self.inputsandbox)) if rc: raise ApplicationConfigurationError("Couldn't remove directory structure used for input sandbox") rc, output = commands.getstatusoutput('gzip %s' % (self.inputsandbox)) if rc: logger.error('Packing inputsandbox failed with status %d',rc) logger.error(output) raise ApplicationConfigurationError('Packing inputsandbox failed.') self.inputsandbox += ".gz" else: self.inputsandbox = tmp_user_area_name # job name jobName = 'ganga.%s' % MiscUtils.wrappedUuidGen() # make task taskParamMap = {} # Enforce that outputdataset name ends with / for container if not job.outputdata.datasetname.endswith('/'): 
job.outputdata.datasetname = job.outputdata.datasetname + '/' taskParamMap['taskName'] = job.outputdata.datasetname taskParamMap['uniqueTaskName'] = True taskParamMap['vo'] = 'atlas' taskParamMap['architecture'] = AthenaUtils.getCmtConfig(athenaVer=app.atlas_release, cmtConfig=app.atlas_cmtconfig) if app.atlas_release: taskParamMap['transUses'] = 'Atlas-%s' % app.atlas_release else: taskParamMap['transUses'] = '' taskParamMap['transHome'] = 'AnalysisTransforms'+self.cacheVer#+nightVer configSys = getConfig('System') gangaver = configSys['GANGA_VERSION'].lower() if not gangaver: gangaver = "ganga" if app.atlas_exetype in ["ATHENA", "TRF"]: taskParamMap['processingType'] = '{0}-jedi-athena'.format(gangaver) else: taskParamMap['processingType'] = '{0}-jedi-run'.format(gangaver) #if options.eventPickEvtList != '': # taskParamMap['processingType'] += '-evp' taskParamMap['prodSourceLabel'] = 'user' if job.backend.site != 'AUTO': taskParamMap['cloud'] = Client.PandaSites[job.backend.site]['cloud'] taskParamMap['site'] = job.backend.site elif job.backend.requirements.cloud != None and not job.backend.requirements.anyCloud: taskParamMap['cloud'] = job.backend.requirements.cloud if job.backend.requirements.excluded_sites != []: taskParamMap['excludedSite'] = expandExcludedSiteList( job ) # if only a single site specifed, don't set includedSite #if job.backend.site != 'AUTO': # taskParamMap['includedSite'] = job.backend.site #taskParamMap['cliParams'] = fullExecString if job.backend.requirements.noEmail: taskParamMap['noEmail'] = True if job.backend.requirements.skipScout: taskParamMap['skipScout'] = True if not app.atlas_exetype in ["ATHENA", "TRF"]: taskParamMap['nMaxFilesPerJob'] = job.backend.requirements.maxNFilesPerJob if job.backend.requirements.disableAutoRetry: taskParamMap['disableAutoRetry'] = 1 # source URL matchURL = re.search("(http.*://[^/]+)/",Client.baseURLCSRVSSL) if matchURL != None: taskParamMap['sourceURL'] = matchURL.group(1) # dataset names outDatasetName = job.outputdata.datasetname logDatasetName = re.sub('/$','.log/',job.outputdata.datasetname) # log taskParamMap['log'] = {'dataset': logDatasetName, 'container': logDatasetName, 'type':'template', 'param_type':'log', 'value':'{0}.${{SN}}.log.tgz'.format(logDatasetName[:-1]) } # job parameters if app.atlas_exetype in ["ATHENA", "TRF"]: taskParamMap['jobParameters'] = [ {'type':'constant', 'value': ' --sourceURL ${SURL}', }, ] else: taskParamMap['jobParameters'] = [ {'type':'constant', 'value': '-j "" --sourceURL ${SURL}', }, ] taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '-r {0}'.format(self.rundirectory), }, ] # Add the --trf option to jobParameters if required if app.atlas_exetype == "TRF": taskParamMap['jobParameters'] += [{'type': 'constant', 'value': '--trf'}] # output # output files outMap = {} if app.atlas_exetype in ["ATHENA", "TRF"]: outMap, tmpParamList = AthenaUtils.convertConfToOutput(self.runConfig, self.extOutFile, job.outputdata.datasetname, destination=job.outputdata.location) taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '-o "%s" ' % outMap }, ] taskParamMap['jobParameters'] += tmpParamList else: if job.outputdata.outputdata: for tmpLFN in job.outputdata.outputdata: if len(job.outputdata.datasetname.split('.')) > 2: lfn = '{0}.{1}'.format(*job.outputdata.datasetname.split('.')[:2]) else: lfn = job.outputdata.datasetname[:-1] lfn += '.$JOBSETID._${{SN/P}}.{0}'.format(tmpLFN) dataset = '{0}_{1}/'.format(job.outputdata.datasetname[:-1],tmpLFN) taskParamMap['jobParameters'] += 
MiscUtils.makeJediJobParam(lfn,dataset,'output',hidden=True, destination=job.outputdata.location) outMap[tmpLFN] = lfn taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '-o "{0}"'.format(str(outMap)), }, ] if app.atlas_exetype in ["ATHENA"]: # jobO parameter tmpJobO = self.job_options # replace full-path jobOs for tmpFullName,tmpLocalName in AthenaUtils.fullPathJobOs.iteritems(): tmpJobO = re.sub(tmpFullName,tmpLocalName,tmpJobO) # modify one-liner for G4 random seeds if self.runConfig.other.G4RandomSeeds > 0: if app.options != '': tmpJobO = re.sub('-c "%s" ' % app.options, '-c "%s;from G4AtlasApps.SimFlags import SimFlags;SimFlags.SeedsG4=${RNDMSEED}" ' \ % app.options,tmpJobO) else: tmpJobO = '-c "from G4AtlasApps.SimFlags import SimFlags;SimFlags.SeedsG4=${RNDMSEED}" ' dictItem = {'type':'template', 'param_type':'number', 'value':'${RNDMSEED}', 'hidden':True, 'offset':self.runConfig.other.G4RandomSeeds, } taskParamMap['jobParameters'] += [dictItem] elif app.atlas_exetype in ["TRF"]: # replace parameters for TRF tmpJobO = self.job_options # output : basenames are in outMap['IROOT'] trough extOutFile tmpOutMap = [] for tmpName,tmpLFN in outMap['IROOT']: tmpJobO = tmpJobO.replace('%OUT.' + tmpName,tmpName) # replace DBR tmpJobO = re.sub('%DB=[^ \'\";]+','${DBR}',tmpJobO) if app.atlas_exetype in ["TRF"]: taskParamMap['useLocalIO'] = 1 # build if job.backend.nobuild: taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '-a {0}'.format(os.path.basename(self.inputsandbox)), }, ] else: taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '-l ${LIB}', }, ] # # input if job.inputdata and job.inputdata._name == 'DQ2Dataset': if job.backend.requirements.nFilesPerJob > 0 and job.inputdata.number_of_files == 0 and job.backend.requirements.split > 0: job.inputdata.number_of_files = job.backend.requirements.nFilesPerJob * job.backend.requirements.split if job.inputdata and job.inputdata._name == 'DQ2Dataset' and job.inputdata.number_of_files != 0: taskParamMap['nFiles'] = job.inputdata.number_of_files elif job.backend.requirements.nFilesPerJob > 0 and job.backend.requirements.split > 0: # pathena does this for some reason even if there is no input files taskParamMap['nFiles'] = job.backend.requirements.nFilesPerJob * job.backend.requirements.split if job.backend.requirements.nFilesPerJob > 0: taskParamMap['nFilesPerJob'] = job.backend.requirements.nFilesPerJob if job.backend.requirements.nEventsPerFile > 0: taskParamMap['nEventsPerFile'] = job.backend.requirements.nEventsPerFile if not job.backend.requirements.nGBPerJob in [ 0,'MAX']: try: if job.backend.requirements.nGBPerJob != 'MAX': job.backend.requirments.nGBPerJob = int(job.backend.requirements.nGBPerJob) except: logger.error("nGBPerJob must be an integer or MAX") # check negative if job.backend.requirements.nGBPerJob <= 0: logger.error("nGBPerJob must be positive") # don't set MAX since it is the defalt on the server side if not job.backend.requirements.nGBPerJob in [-1,'MAX']: taskParamMap['nGBPerJob'] = job.backend.requirements.nGBPerJob if app.atlas_exetype in ["ATHENA", "TRF"]: inputMap = {} if job.inputdata and job.inputdata._name == 'DQ2Dataset': tmpDict = {'type':'template', 'param_type':'input', 'value':'-i "${IN/T}"', 'dataset': ','.join(job.inputdata.dataset), 'expand':True, 'exclude':'\.log\.tgz(\.\d+)*$', } #if options.inputType != '': # tmpDict['include'] = options.inputType taskParamMap['jobParameters'].append(tmpDict) taskParamMap['dsForIN'] = ','.join(job.inputdata.dataset) inputMap['IN'] = 
','.join(job.inputdata.dataset) else: # no input taskParamMap['noInput'] = True if job.backend.requirements.split > 0: taskParamMap['nEvents'] = job.backend.requirements.split else: taskParamMap['nEvents'] = 1 taskParamMap['nEventsPerJob'] = 1 taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '-i "[]"', }, ] else: if job.inputdata and job.inputdata._name == 'DQ2Dataset': tmpDict = {'type':'template', 'param_type':'input', 'value':'-i "${IN/T}"', 'dataset': ','.join(job.inputdata.dataset), 'expand':True, 'exclude':'\.log\.tgz(\.\d+)*$', } #if options.nSkipFiles != 0: # tmpDict['offset'] = options.nSkipFiles taskParamMap['jobParameters'].append(tmpDict) taskParamMap['dsForIN'] = ','.join(job.inputdata.dataset) else: # no input taskParamMap['noInput'] = True if job.backend.requirements.split > 0: taskParamMap['nEvents'] = job.backend.requirements.split else: taskParamMap['nEvents'] = 1 taskParamMap['nEventsPerJob'] = 1 # param for DBR if self.dbrelease != '': dbrDS = self.dbrelease.split(':')[0] # change LATEST to DBR_LATEST if dbrDS == 'LATEST': dbrDS = 'DBR_LATEST' dictItem = {'type':'template', 'param_type':'input', 'value':'--dbrFile=${DBR}', 'dataset':dbrDS, } taskParamMap['jobParameters'] += [dictItem] # no expansion #if options.notExpandDBR: #dictItem = {'type':'constant', # 'value':'--noExpandDBR', # } #taskParamMap['jobParameters'] += [dictItem] # secondary FIXME disabled self.secondaryDSs = {} if self.secondaryDSs != {}: inMap = {} streamNames = [] for tmpDsName,tmpMap in self.secondaryDSs.iteritems(): # make template item streamName = tmpMap['streamName'] dictItem = MiscUtils.makeJediJobParam('${'+streamName+'}',tmpDsName,'input',hidden=True, expand=True,include=tmpMap['pattern'],offset=tmpMap['nSkip'], nFilesPerJob=tmpMap['nFiles']) taskParamMap['jobParameters'] += dictItem inMap[streamName] = 'tmp_'+streamName streamNames.append(streamName) # make constant item strInMap = str(inMap) # set placeholders for streamName in streamNames: strInMap = strInMap.replace("'tmp_"+streamName+"'",'${'+streamName+'/T}') dictItem = {'type':'constant', 'value':'--inMap "%s"' % strInMap, } taskParamMap['jobParameters'] += [dictItem] # misc jobParameters = '' # use Athena packages if app.atlas_exetype == 'ARES' or (app.atlas_exetype in ['PYARA','ROOT','EXE'] and app.useAthenaPackages): jobParameters += "--useAthenaPackages " # use RootCore if app.useRootCore or app.useRootCoreNoBuild: jobParameters += "--useRootCore " # use mana if app.useMana: jobParameters += "--useMana " if app.atlas_release != "": jobParameters += "--manaVer %s " % app.atlas_release # root if app.atlas_exetype in ['PYARA','ROOT','EXE'] and job.backend.requirements.rootver != '': rootver = re.sub('/','.', job.backend.requirements.rootver) jobParameters += "--rootVer %s " % rootver # write input to txt #if options.writeInputToTxt != '': # jobParameters += "--writeInputToTxt %s " % options.writeInputToTxt # debug parameters #if options.queueData != '': # jobParameters += "--overwriteQueuedata=%s " % options.queueData # JEM #if options.enableJEM: # jobParameters += "--enable-jem " # if options.configJEM != '': # jobParameters += "--jem-config %s " % options.configJEM # set task param if jobParameters != '': taskParamMap['jobParameters'] += [ {'type':'constant', 'value': jobParameters, }, ] # force stage-in if job.backend.accessmode == "LocalIO": taskParamMap['useLocalIO'] = 1 # set jobO parameter if app.atlas_exetype in ["ATHENA", "TRF"]: taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '-j "', 
'padding':False, }, ] taskParamMap['jobParameters'] += PsubUtils.convertParamStrToJediParam(tmpJobO,inputMap,job.outputdata.datasetname[:-1], True,False) taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '"', }, ] else: taskParamMap['jobParameters'] += [ {'type':'constant', 'value': '-p "{0}"'.format(urllib.quote(self.job_options)), }, ] # build step if not job.backend.nobuild: jobParameters = '-i ${IN} -o ${OUT} --sourceURL ${SURL} ' if job.backend.bexec != '': jobParameters += ' --bexec "%s" ' % urllib.quote(job.backend.bexec) if app.atlas_exetype == 'ARES' or (app.atlas_exetype in ['PYARA','ROOT','EXE'] and app.useAthenaPackages): # use Athena packages jobParameters += "--useAthenaPackages " # use RootCore if app.useRootCore or app.useRootCoreNoBuild: jobParameters += "--useRootCore " # run directory if app.atlas_exetype in ['PYARA','ARES','ROOT','EXE']: jobParameters += '-r {0} '.format(self.rundirectory) # no compile #if options.noCompile: # jobParameters += "--noCompile " # use mana if app.useMana: jobParameters += "--useMana " if app.atlas_release != "": jobParameters += "--manaVer %s " % app.atlas_release # root if app.atlas_exetype in ['PYARA','ROOT','EXE'] and job.backend.requirements.rootver != '': rootver = re.sub('/','.', job.backend.requirements.rootver) jobParameters += "--rootVer %s " % rootver # cmt config if app.atlas_exetype in ['PYARA','ARES','ROOT','EXE']: if not app.atlas_cmtconfig in ['','NULL',None]: jobParameters += " --cmtConfig %s " % app.atlas_cmtconfig #cmtConfig = AthenaUtils.getCmtConfig(athenaVer=app.atlas_release, cmtConfig=app.atlas_cmtconfig) #if cmtConfig: # jobParameters += "--cmtConfig %s " % cmtConfig # debug parameters #if options.queueData != '': # jobParameters += "--overwriteQueuedata=%s " % options.queueData # set task param taskParamMap['buildSpec'] = { 'prodSourceLabel':'panda', 'archiveName':os.path.basename(self.inputsandbox), 'jobParameters':jobParameters, } # enable merging if job.backend.requirements.enableMerge: jobParameters = '-r {0} '.format(self.rundirectory) if 'exec' in job.backend.requirements.configMerge and job.backend.requirements.configMerge['exec'] != '': jobParameters += '-j "{0}" '.format(job.backend.requirements.configMerge['exec']) if not job.backend.nobuild: jobParameters += '-l ${LIB} ' else: jobParameters += '-a {0} '.format(os.path.basename(self.inputsandbox)) jobParameters += "--sourceURL ${SURL} " jobParameters += '${TRN_OUTPUT:OUTPUT} ${TRN_LOG:LOG}' taskParamMap['mergeSpec'] = {} taskParamMap['mergeSpec']['useLocalIO'] = 1 taskParamMap['mergeSpec']['jobParameters'] = jobParameters taskParamMap['mergeOutput'] = True # Selected by Jedi #if not app.atlas_exetype in ['PYARA','ROOT','EXE']: # taskParamMap['transPath'] = 'http://atlpan.web.cern.ch/atlpan/runAthena-00-00-12' logger.debug(taskParamMap) # upload sources if self.inputsandbox and not job.backend.libds: uploadSources(os.path.dirname(self.inputsandbox),os.path.basename(self.inputsandbox)) if not self.inputsandbox == tmp_user_area_name: logger.info('Removing source tarball %s ...' % self.inputsandbox ) os.remove(self.inputsandbox) return taskParamMap
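
# Illustrative sketch, not part of the handler above: the JEDI master_prepare
# returns a taskParamMap dict that is later sent to the Panda/JEDI server. The
# keys below are the ones filled in by the code above; all values (user name,
# dataset names, release, site) are hypothetical and only show the expected shape.
_EXAMPLE_TASK_PARAM_MAP = {
    'taskName': 'user.someuser.test.12345/',            # hypothetical output container
    'uniqueTaskName': True,
    'vo': 'atlas',
    'architecture': 'x86_64-slc6-gcc47-opt',            # from AthenaUtils.getCmtConfig()
    'transUses': 'Atlas-17.2.7',                        # hypothetical Athena release
    'transHome': 'AnalysisTransforms-AtlasProduction_17.2.7.4',
    'processingType': 'ganga-jedi-athena',
    'prodSourceLabel': 'user',
    'log': {'dataset': 'user.someuser.test.12345.log/',
            'container': 'user.someuser.test.12345.log/',
            'type': 'template',
            'param_type': 'log',
            'value': 'user.someuser.test.12345.log.${SN}.log.tgz'},
    'jobParameters': [{'type': 'constant', 'value': ' --sourceURL ${SURL}'},
                      {'type': 'constant', 'value': '-r .'}],   # run directory, hypothetical
}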
def prepare(self,app,appconfig,appmasterconfig,jobmasterconfig):
    '''prepare the subjob specific configuration'''

    # PandaTools
    from pandatools import Client
    from pandatools import AthenaUtils
    from taskbuffer.JobSpec import JobSpec
    from taskbuffer.FileSpec import FileSpec

    job = app._getParent()
    logger.debug('AthenaMCPandaRTHandler prepare called for %s', job.getFQID('.'))

    try:
        assert self.outsite
    except:
        logger.error("outsite not set. Aborting")
        raise Exception()

    job.backend.site = self.outsite
    job.backend.actualCE = self.outsite
    cloud = job._getRoot().backend.requirements.cloud
    job.backend.requirements.cloud = cloud

    # now just filling the job from AthenaMC data
    jspec = JobSpec()
    jspec.jobDefinitionID = job._getRoot().id
    jspec.jobName         = commands.getoutput('uuidgen 2> /dev/null')
    jspec.AtlasRelease    = 'Atlas-%s' % app.atlas_rel
    if app.transform_archive:
        jspec.homepackage = 'AnalysisTransforms'+app.transform_archive
    elif app.prod_release:
        jspec.homepackage = 'AnalysisTransforms-AtlasProduction_'+str(app.prod_release)
    jspec.transformation  = '%s/runAthena-00-00-11' % Client.baseURLSUB

    #---->???? prodDBlock and destinationDBlock when facing several input / output datasets?
    jspec.prodDBlock = 'NULL'
    if job.inputdata and len(app.inputfiles)>0 and app.inputfiles[0] in app.dsetmap:
        jspec.prodDBlock = app.dsetmap[app.inputfiles[0]]

    # How to specify jspec.destinationDBlock when more than one type of output is available? Panda prod jobs seem to specify only the last output dataset
    outdset = ""
    for type in ["EVNT","RDO","HITS","AOD","ESD","NTUP"]:
        if type in app.outputpaths.keys():
            outdset = string.replace(app.outputpaths[type],"/",".")
            outdset = outdset[1:-1]
            break
    if not outdset:
        try:
            assert len(app.outputpaths.keys())>0
        except:
            logger.error("app.outputpaths is empty: check your output datasets")
            raise
        type = app.outputpaths.keys()[0]
        outdset = string.replace(app.outputpaths[type],"/",".")
        outdset = outdset[1:-1]

    jspec.destinationDBlock = outdset
    jspec.destinationSE = self.outsite
    jspec.prodSourceLabel = 'user'
    jspec.assignedPriority = 1000
    jspec.cloud = cloud
    # memory
    if job.backend.requirements.memory != -1:
        jspec.minRamCount = job.backend.requirements.memory
    jspec.computingSite = self.outsite
    jspec.cmtConfig = AthenaUtils.getCmtConfig(athenaVer=app.atlas_rel)

    # library (source files)
    flib = FileSpec()
    flib.lfn            = self.library
    # flib.GUID         =
    flib.type           = 'input'
    # flib.status       =
    flib.dataset        = self.libDataset
    flib.dispatchDBlock = self.libDataset
    jspec.addFile(flib)

    # input files FIXME: many more input types
    for lfn in app.inputfiles:
        useguid = app.turls[lfn].replace("guid:","")
        finp = FileSpec()
        finp.lfn             = lfn
        finp.GUID            = useguid
        finp.dataset         = app.dsetmap[lfn]
        finp.prodDBlock      = app.dsetmap[lfn]
        finp.prodDBlockToken = 'local'
        finp.dispatchDBlock  = app.dsetmap[lfn]
        finp.type            = 'input'
        finp.status          = 'ready'
        jspec.addFile(finp)
    # add dbfiles if any:
    for lfn in app.dbfiles:
        useguid = app.dbturls[lfn].replace("guid:","")
        finp = FileSpec()
        finp.lfn             = lfn
        finp.GUID            = useguid
        finp.dataset         = app.dsetmap[lfn]
        finp.prodDBlock      = app.dsetmap[lfn]
        finp.prodDBlockToken = 'local'
        finp.dispatchDBlock  = app.dsetmap[lfn]
        finp.type            = 'input'
        finp.status          = 'ready'
        jspec.addFile(finp)
    # then minbias files
    for lfn in app.mbfiles:
        useguid = app.minbias_turls[lfn].replace("guid:","")
        finp = FileSpec()
        finp.lfn             = lfn
        finp.GUID            = useguid
        finp.dataset         = app.dsetmap[lfn]
        finp.prodDBlock      = app.dsetmap[lfn]
        finp.prodDBlockToken = 'local'
        finp.dispatchDBlock  = app.dsetmap[lfn]
        finp.type            = 'input'
        finp.status          = 'ready'
        jspec.addFile(finp)
    # then cavern files
    for lfn in app.cavernfiles:
        useguid = app.cavern_turls[lfn].replace("guid:","")
        finp = FileSpec()
        finp.lfn             = lfn
        finp.GUID            = useguid
        finp.dataset         = app.dsetmap[lfn]
        finp.prodDBlock      = app.dsetmap[lfn]
        finp.prodDBlockToken = 'local'
        finp.dispatchDBlock  = app.dsetmap[lfn]
        finp.type            = 'input'
        finp.status          = 'ready'
        jspec.addFile(finp)

    # output files( this includes the logfiles)
    # Output files
    jidtag = ""
    job = app._getParent() # Returns job or subjob object
    if job._getRoot().subjobs:
        jidtag = job._getRoot().id
    else:
        jidtag = "%d" % job.id
    outfiles = app.subjobsOutfiles[job.id]
    pandaOutfiles = {}
    for type in outfiles.keys():
        pandaOutfiles[type] = outfiles[type]+"."+str(jidtag)
        if type == "LOG":
            pandaOutfiles[type] += ".tgz"
    #print pandaOutfiles

    for outtype in pandaOutfiles.keys():
        fout = FileSpec()
        dset = string.replace(app.outputpaths[outtype],"/",".")
        dset = dset[1:-1]
        fout.dataset = dset
        fout.lfn = pandaOutfiles[outtype]
        fout.type              = 'output'
        # fout.destinationDBlock = jspec.destinationDBlock
        fout.destinationDBlock = fout.dataset
        fout.destinationSE     = jspec.destinationSE
        if outtype == 'LOG':
            fout.type = 'log'
            fout.destinationDBlock = fout.dataset
            fout.destinationSE     = job.backend.site
        jspec.addFile(fout)

    # job parameters
    param = '-l %s ' % self.library # user tarball.
    # use corruption checker
    if job.backend.requirements.corCheck:
        param += '--corCheck '
    # disable to skip missing files
    if job.backend.requirements.notSkipMissing:
        param += '--notSkipMissing '
    # transform parameters
    # need to update arglist with final output file name...
    newArgs = []
    if app.mode == "evgen":
        app.args[3] = app.args[3]+" -t "
    if app.verbosity:
        app.args[3] = app.args[3]+" -l %s " % app.verbosity
    for arg in app.args[3:]:
        for type in outfiles.keys():
            if arg.find(outfiles[type])>-1:
                arg = arg.replace(outfiles[type],pandaOutfiles[type])
        newArgs.append(arg)
    arglist = string.join(newArgs," ")
    # print "Arglist:",arglist

    param += ' -r ./ '
    param += ' -j "%s"' % urllib.quote(arglist)

    allinfiles = app.inputfiles+app.dbfiles
    # Input files.
    param += ' -i "%s" ' % allinfiles
    if len(app.mbfiles)>0:
        param += ' -m "%s" ' % app.mbfiles
    if len(app.cavernfiles)>0:
        param += ' -n "%s" ' % app.cavernfiles
    # param += '-m "[]" ' #%minList FIXME
    # param += '-n "[]" ' #%cavList FIXME

    del pandaOutfiles["LOG"] # logfiles do not appear in IROOT block, and this one is not needed anymore...
    param += ' -o "{\'IROOT\':%s }"' % str(pandaOutfiles.items())

    # source URL
    matchURL = re.search("(http.*://[^/]+)/",Client.baseURLSSL)
    if matchURL != None:
        param += " --sourceURL %s " % matchURL.group(1)
    param += " --trf"

    jspec.jobParameters = param
    jspec.metadata = "--trf \"%s\"" % arglist

    #print "SUBJOB DETAILS:",jspec.values()
    if app.dryrun:
        print "job.application.dryrun activated, printing out job parameters"
        print jspec.values()
        return
    return jspec
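
# Illustrative sketch, not part of the handler above: the FileSpec/JobSpec pattern
# that prepare() repeats for every input category, reduced to one input and one
# output file. The attribute names mirror the code above; the LFNs and dataset
# names are hypothetical.
def _example_jobspec():
    from taskbuffer.JobSpec import JobSpec
    from taskbuffer.FileSpec import FileSpec
    jspec = JobSpec()
    jspec.jobName = 'example-job'                 # hypothetical job name
    # one input file, attached to its dispatch block
    finp = FileSpec()
    finp.lfn             = 'EVNT.000001.pool.root'   # hypothetical input LFN
    finp.dataset         = 'mc.example.EVNT'          # hypothetical dataset
    finp.prodDBlock      = finp.dataset
    finp.prodDBlockToken = 'local'
    finp.dispatchDBlock  = finp.dataset
    finp.type            = 'input'
    finp.status          = 'ready'
    jspec.addFile(finp)
    # one output file, routed to its destination block
    fout = FileSpec()
    fout.lfn               = 'HITS.000001.pool.root'  # hypothetical output LFN
    fout.dataset           = 'mc.example.HITS'
    fout.destinationDBlock = fout.dataset
    fout.type              = 'output'
    jspec.addFile(fout)
    return jspec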
def retry(self,JobsetID,newSite=False,newOpts={},noSubmit=False,ignoreDuplication=False,useJobsetID=False,retryBuild=False,reproduceFiles=[],unsetRetryID=False):
    # get logger
    tmpLog = PLogger.getPandaLogger()
    # check proxy
    self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(self.gridPassPhrase,False,self.verbose,useCache=True)
    # force update just in case
    self.status(JobsetID,True)
    # set an empty map since mutable default value is used
    if newOpts == {}:
        newOpts = {}
    # get jobset
    newJobsetID = -1
    jobList = self.getJobIDsWithSetID(JobsetID)
    if jobList == None:
        # works only for jobsetID
        if useJobsetID:
            return
        # works with jobID
        isJobset = False
        jobList = [JobsetID]
    else:
        isJobset = True
        tmpMsg = "ID=%s is composed of JobID=" % JobsetID
        for tmpJobID in jobList:
            tmpMsg += '%s,' % tmpJobID
        tmpMsg = tmpMsg[:-1]
        tmpLog.info(tmpMsg)
    for JobID in jobList:
        # get job info from local repository
        localJob = self.getJobInfo(JobID)
        if localJob == None:
            tmpLog.warning("JobID=%s not found in local repository. Synchronization may be needed" % JobID)
            return None
        # for JEDI
        if localJob.isJEDI():
            status,out = Client.retryTask(localJob.jediTaskID,verbose=self.verbose,properErrorCode=True,newParams=newOpts)
            if status != 0:
                tmpLog.error(status)
                tmpLog.error(out)
                tmpLog.error("Failed to retry TaskID=%s" % localJob.jediTaskID)
                return False
            tmpStat,tmpDiag = out
            if (not tmpStat in [0,True] and newOpts == {}) or (newOpts != {} and tmpStat != 3):
                tmpLog.error(tmpDiag)
                tmpLog.error("Failed to retry TaskID=%s" % localJob.jediTaskID)
                return False
            tmpLog.info(tmpDiag)
            continue
        # skip running job
        if localJob.dbStatus != 'frozen':
            tmpLog.info('Retry failed subjobs in running jobId=%s' % JobID)
            status,out = Client.retryFailedJobsInActive(JobID,verbose=self.verbose)
            if status != 0:
                tmpLog.error(status)
                tmpLog.error(out)
                tmpLog.error("Failed to retry JobID=%s" % JobID)
            else:
                job = self.status(JobID)
            if isJobset:
                continue
            else:
                return
        # skip already retried
        if localJob.retryID != '0':
            if isJobset:
                tmpLog.info('Skip JobID=%s since already retried by JobID=%s JobsetID=%s' % \
                            (JobID,localJob.retryID,localJob.retryJobsetID))
                continue
            else:
                tmpLog.warning('This job was already retried by JobID=%s' % localJob.retryID)
                return
        # check status of buildJob
        if not retryBuild and not localJob.buildStatus in ['','finished']:
            tmpMsgStr = 'Cannot retry since status of buildJob %s is %s (!= finished). ' \
                        % (localJob.PandaID.split(',')[0],localJob.buildStatus)
            tmpMsgStr += 'Please execute %s with the same input/output datasets (or containers). ' % localJob.jobType
            tmpMsgStr += 'It will run only on failed/cancelled/unused input files '
            tmpMsgStr += 'and append output files to the output dataset container. '
            tmpMsgStr += 'Or you may set retryBuild=True in pbook.retry() '
            tmpLog.warning(tmpMsgStr)
            if isJobset:
                continue
            else:
                return
        # check opts for newSite
        if newSite or newOpts != {}:
            if not localJob.outDS.endswith('/') and not newOpts.has_key('outDS') and not newOpts.has_key('--outDS'):
                tmpLog.warning('You need to specify --outDS in newOpts to retry at new site unless container is used as output')
                return
        # get list of failed jobs
        pandaIDs   = localJob.PandaID.split(',')
        statusList = localJob.jobStatus.split(',')
        jobList = []
        for idx in range(len(pandaIDs)):
            # check status unless reproduce files
            if reproduceFiles == [] and not statusList[idx] in ['failed','cancelled']:
                continue
            jobList.append(pandaIDs[idx])
        # no failed job
        if jobList == []:
            if isJobset:
                tmpLog.info('Skip JobID=%s since no failed jobs' % JobID)
                continue
            else:
                tmpLog.info('No failed jobs to be retried for JobID=%s' % JobID)
                return
        # get full job spec
        tmpLog.info("Retrying JobID=%s ..." % JobID)
        tmpLog.info("Getting job info")
        idxJL  = 0
        nQuery = 500
        pandaJobs = []
        while idxJL < len(jobList):
            # avoid burst query
            tmpLog.info(" %5s/%s" % (idxJL,len(jobList)))
            status,oTmp = Client.getFullJobStatus(jobList[idxJL:idxJL+nQuery],verbose=self.verbose)
            if status != 0:
                tmpLog.error(status)
                tmpLog.error(oTmp)
                tmpLog.error("Cannot get job info from Panda server")
                return
            pandaJobs += oTmp
            idxJL += nQuery
            time.sleep(1)
        tmpLog.info(" %5s/%s" % (len(jobList),len(jobList)))
        # get PandaIDs to reproduce files
        if reproduceFiles != []:
            # change wildcard to .* for regexp
            reproduceFilePatt = []
            for tmpReproduceFile in reproduceFiles:
                if '*' in tmpReproduceFile:
                    tmpReproduceFile = tmpReproduceFile.replace('*','.*')
                reproduceFilePatt.append(tmpReproduceFile)
            # get list of jobs which produced interesting files
            tmpJobList = []
            tmpPandaJobs = []
            for tmpPandaJob in pandaJobs:
                # check names
                tmpMatchFlag = False
                for tmpFile in tmpPandaJob.Files:
                    if tmpFile.type == 'output' and tmpFile.status == 'ready':
                        for tmpReproduceFile in reproduceFilePatt:
                            # normal matching
                            if tmpReproduceFile == tmpFile.lfn:
                                tmpMatchFlag = True
                                break
                            # wild card
                            if '*' in tmpReproduceFile and \
                               re.search('^'+tmpReproduceFile,tmpFile.lfn) != None:
                                tmpMatchFlag = True
                                break
                    if tmpMatchFlag:
                        break
                # append
                if tmpMatchFlag:
                    tmpJobList.append(tmpPandaJob.PandaID)
                    tmpPandaJobs.append(tmpPandaJob)
            # use new list
            jobList = tmpJobList
            pandaJobs = tmpPandaJobs
            if jobList == []:
                tmpLog.info("No jobs to reproduce files : Jobs in JobID=%s didn't produce lost files" % JobID)
                continue
        # jobdefID
        newJobdefID = PsubUtils.readJobDefID()
        # reset some parameters
        retryJobs    = []
        retrySite    = None
        retryElement = None
        retryDestSE  = None
        outDsName    = None
        shadowList   = []
        oldLibDS     = None
        newLibDS     = None
        newLibTgz    = None
        rebroMap     = {}
        for idx in range(len(jobList)):
            job = pandaJobs[idx]
            # skip expired
            if job == None:
                tmpLog.warning("Could not retry jobs older than 30 days : JobID=%s (PandaID=%s) expired" \
                               % (JobID,jobList[idx]))
                return
            # skip jobs reassigned by rebrokerage
            if (job.jobStatus == 'cancelled' and job.taskBufferErrorCode in [105,'105']) or \
               (job.jobStatus == 'failed' and job.taskBufferErrorCode in [106,'106']):
                # extract JobIDs of reassigned jobs
                tmpM = re.search('JobsetID=(\d+) JobID=(\d+)',job.taskBufferErrorDiag)
                if tmpM != None:
                    tmpRebKey = (tmpM.group(1),tmpM.group(2))
                    if not rebroMap.has_key(tmpRebKey):
                        rebroMap[tmpRebKey] = 0
                    # count # of reassigned jobs
                    rebroMap[tmpRebKey] += 1
                continue
            # get shadow list
            if (not ignoreDuplication) and outDsName == None and job.prodSourceLabel == 'user':
                # look for dataset for log since it doesn't have suffix even when --individualOutDS is used
                for tmpFile in job.Files:
                    if tmpFile.type == 'log':
                        outDsName = tmpFile.dataset
                        break
                # output dataset was not found
                if outDsName == None:
                    tmpLog.error("Could not get output dataset name for JobID=%s (PandaID=%s)" \
                                 % (JobID,job.PandaID))
                    return
                # get files in shadow
                if outDsName.endswith('/'):
                    shadowList = Client.getFilesInShadowDataset(outDsName,Client.suffixShadow,self.verbose)
                else:
                    # disable duplication check mainly for old overlay jobs since non-signal files are wrongly skipped
                    #shadowList = Client.getFilesInShadowDatasetOld(outDsName,Client.suffixShadow,self.verbose)
                    pass
            # unify sitename
            if retrySite == None:
                retrySite    = job.computingSite
                retryElement = job.computingElement
                retryDestSE  = job.destinationSE
            # reset
            job.jobStatus = None
            job.commandToPilot = None
            job.startTime = None
            job.endTime = None
            job.attemptNr = 1+job.attemptNr
            for attr in job._attributes:
                if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'):
                    setattr(job,attr,None)
            job.transExitCode = None
            job.computingSite = retrySite
            job.computingElement = retryElement
            job.destinationSE = retryDestSE
            job.dispatchDBlock = None
            if not unsetRetryID:
                job.jobExecutionID = JobID
            job.jobDefinitionID = newJobdefID
            job.parentID = job.PandaID
            if job.jobsetID != ['NULL',None,-1]:
                if not unsetRetryID:
                    job.sourceSite = job.jobsetID
                job.jobsetID = newJobsetID
            skipInputList = []
            numUsedFiles = 0
            # loop over all files
            for file in job.Files:
                file.rowID = None
                if file.type == 'input':
                    # protection against wrong sync which doesn't update buildStatus correctly
                    if not retryBuild and file.lfn.endswith('.lib.tgz') and file.GUID == 'NULL':
                        tmpLog.warning('GUID for %s is unknown. Cannot retry when corresponding buildJob failed' \
                                       % file.lfn)
                        return
                    if not retryBuild or not file.lfn.endswith('.lib.tgz'):
                        file.status = 'ready'
                    # set new lib dataset
                    if retryBuild and file.lfn.endswith('.lib.tgz'):
                        if newLibTgz != None:
                            file.lfn = newLibTgz
                            file.dataset = newLibDS
                            file.dispatchDBlock = newLibDS
                    # check with shadow for non lib.tgz/DBR
                    tmpDbrMatch = re.search('^DBRelease-.*\.tar\.gz$',file.lfn)
                    if tmpDbrMatch == None and not file.lfn.endswith('.lib.tgz'):
                        if file.lfn in shadowList:
                            skipInputList.append(file)
                        else:
                            numUsedFiles += 1
                elif file.type in ('output','log'):
                    file.destinationSE = retryDestSE
                    file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock)
                    # add retry num
                    if file.dataset.endswith('/') or job.prodSourceLabel == 'panda':
                        oldOutDsName = file.destinationDBlock
                        retryDsPatt = '_r'
                        if reproduceFiles != []:
                            retryDsPatt = '_rp'
                        retryMatch = re.search(retryDsPatt+'(\d+)$',file.destinationDBlock)
                        if retryMatch == None:
                            file.destinationDBlock += (retryDsPatt+'1')
                        else:
                            tmpDestinationDBlock = re.sub(retryDsPatt+'(\d+)$','',file.destinationDBlock)
                            file.destinationDBlock = tmpDestinationDBlock + retryDsPatt + '%d' % (1+int(retryMatch.group(1)))
                        if job.processingType == 'usermerge':
                            job.jobParameters = job.jobParameters.replace(' %s ' % oldOutDsName,
                                                                          ' %s ' % file.destinationDBlock)
                        # use new dataset name for buildXYZ
                        if job.prodSourceLabel == 'panda':
                            if file.lfn.endswith('.lib.tgz'):
                                # get new libDS and lib.tgz names
                                oldLibDS = file.dataset
                                file.dataset = file.destinationDBlock
                                newLibDS = file.dataset
                                file.lfn = re.sub(oldLibDS,newLibDS,file.lfn)
                                newLibTgz = file.lfn
                            else:
                                file.dataset = file.destinationDBlock
                    # add attempt nr
                    oldName = file.lfn
                    if job.prodSourceLabel == 'panda' and file.lfn.endswith('.lib.tgz'):
                        continue
                    else:
                        # append attempt number at the tail
                        file.lfn = re.sub("\.\d+$","",file.lfn)
                        file.lfn = "%s.%d" % (file.lfn,job.attemptNr)
                    newName = file.lfn
                    # modify jobParameters
                    job.jobParameters = re.sub("'%s'" % oldName ,"'%s'" % newName,
                                               job.jobParameters)
                    # look for output in trf
                    oldGenelicName = re.sub('\.\d+$','',oldName)
                    match = re.search(oldGenelicName+'(\.\d+)*(%20|")',job.jobParameters)
                    if match != None:
                        job.jobParameters = job.jobParameters.replace(match.group(0),newName+match.group(2))
            # change lib.tgz name
            if retryBuild and newLibDS != None:
                job.jobParameters = re.sub(oldLibDS,newLibDS,job.jobParameters)
                # change destinationDBlock
                if job.prodSourceLabel == 'panda':
                    job.destinationDBlock = newLibDS
            # all files are used by others
            if numUsedFiles == 0 and skipInputList != []:
                continue
            # remove skipped files
            strSkipped = ''
            for tmpFile in skipInputList:
                strSkipped += '%s,' % tmpFile.lfn
                job.Files.remove(tmpFile)
            strSkipped = strSkipped[:-1]
            # modify jobpar
            if strSkipped != '':
                optionToSkipFiles = '--skipInputByRetry'
                if not optionToSkipFiles in job.jobParameters:
                    # just append
                    job.jobParameters += "%s=%s " % (optionToSkipFiles,strSkipped)
                else:
                    # extract already skipped files
                    tmpMatch = re.search("(%s=[^ ]+)" % optionToSkipFiles,job.jobParameters)
                    if tmpMatch == None:
                        tmpLog.error("Failed to extract arg of %s for PandaID=%s" \
                                     % (optionToSkipFiles,job.PandaID))
                        return
                    # replace
                    job.jobParameters = re.sub(tmpMatch.group(1),"%s,%s" % (tmpMatch.group(1),strSkipped),
                                               job.jobParameters)
            if self.verbose:
                tmpLog.debug(job.jobParameters)
            # append
            retryJobs.append(job)
        # info on rebrokerage
        if rebroMap != {}:
            for tmpRebKey,tmpRebNumJobs in rebroMap.iteritems():
                tmpRebSetID,tmpRebJobID = tmpRebKey
                tmpLog.info('Skip %s jobs since JobID=%s JobsetID=%s already reassigned them to another site' % \
                            (tmpRebNumJobs,tmpRebJobID,tmpRebSetID))
            if retryJobs == []:
                tmpLog.info("No more jobs to be retried for JobID=%s" % JobID)
                if isJobset:
                    continue
                else:
                    return
        # all input files were or are being used by other jobs
        if retryJobs == []:
            tmpLog.info('All input files were or are being used by other jobs for the same output. No jobs to be retried. If you need to ignore duplication check (e.g., using the same EVNT file for multiple simulation subjobs), set ignoreDuplication=True. i.e. retry(123,ignoreDuplication=True)')
            if isJobset:
                continue
            else:
                return
        # check voms role
        if not retryJobs[0].workingGroup in ['NULL',None,'']:
            # VOMS role was used
            if not "--workingGroup" in job.metadata:
                # extract voms roles from metadata
                match = re.search("--voms( |=)[ \"]*([^ \"]+)",job.metadata)
                if match != None:
                    vomsRoles = match.group(2)
                else:
                    vomsRoles = "atlas:/atlas/%s/Role=production" % retryJobs[0].workingGroup
            # regenerate proxy with VOMS roles
            try:
                tmpLog.info("Checking proxy role to resubmit %s jobs" % retryJobs[0].workingGroup)
                self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(self.gridPassPhrase,False,self.verbose,
                                                                             vomsRoles,useCache=True)
            except:
                tmpLog.error("Failed to generate a proxy with %s" % vomsRoles)
                return
        # check runtime env for new site submission
        if (newSite or newOpts != {}):
            if retryJobs[0].processingType == 'pathena' or '--useAthenaPackages' in retryJobs[0].metadata:
                from pandatools import AthenaUtils
                stA,retA = AthenaUtils.getAthenaVer()
                if not stA:
                    tmpLog.error("Failed to get Athena rel/cache version in current runtime env")
                    return
                athenaVer = retA['athenaVer']
                cacheVer  = retA['cacheVer']
                nightVer  = retA['nightVer']
                wrongSetup = False
                if retryJobs[0].AtlasRelease != 'Atlas-%s' % athenaVer:
                    wrongSetup = True
                    errMsg = "Current Athena version Atlas-%s is inconsistent with the previous submission %s. " % (athenaVer,retryJobs[0].AtlasRelease)
                elif retryJobs[0].homepackage != 'AnalysisTransforms'+cacheVer+nightVer:
                    wrongSetup = True
                    errMsg = "Current cache version %s is inconsistent with the previous submission. " % cacheVer.replace('-','').replace('_','-')
                if wrongSetup:
                    errMsg += 'You need to have the same runtime env as before since all job spec need to be re-created to send jobs to a new site. '
                    errMsg += 'Please setup Athena correctly and restart pbook'
                    tmpLog.error(errMsg)
                    return
        # test mode
        if noSubmit:
            continue
        # invoke pathena/prun to send job to new site
        if (newSite or newOpts != {}) and retryJobs[0].processingType != 'usermerge':
            # set parent jobID and jobsetID
            newOpts['provenanceID'] = retryJobs[0].jobExecutionID
            newOpts['panda_parentJobsetID'] = retryJobs[0].sourceSite
            tmpLog.info("Constructing job spec again to be sent to another site ...")
            comStat = PsubUtils.execWithModifiedParams(retryJobs,newOpts,self.verbose,newSite)
            if comStat == 0:
                # update database
                time.sleep(2)
                self.sync()
            else:
                tmpLog.error("Failed to submit jobs to Panda server")
            return
        # register datasets
        tmpOutDsLocation = Client.PandaSites[retryJobs[-1].computingSite]['ddm']
        addedDataset = []
        shadowDSname = None
        for tmpFile in retryJobs[-1].Files:
            if tmpFile.type in ['output','log'] and tmpFile.dataset.endswith('/'):
                # add shadow
                """ removed shadow
                if shadowDSname == None and tmpFile.type == 'log':
                    shadowDSname = "%s%s" % (tmpFile.destinationDBlock,Client.suffixShadow)
                    Client.addDataset(shadowDSname,self.verbose)
                """
                # add datasets
                if not tmpFile.destinationDBlock in addedDataset:
                    # create dataset
                    Client.addDataset(tmpFile.destinationDBlock,self.verbose,location=tmpOutDsLocation,dsCheck=False)
                    # add to container
                    Client.addDatasetsToContainer(tmpFile.dataset,[tmpFile.destinationDBlock],self.verbose)
                    # append
                    addedDataset.append(tmpFile.destinationDBlock)
        # register libDS
        if retryBuild and newLibDS != None:
            Client.addDataset(newLibDS,self.verbose,location=tmpOutDsLocation,dsCheck=False)
        # submit
        tmpLog.info("Submitting job ...")
        status,out = Client.submitJobs(retryJobs,verbose=self.verbose)
        if out == None or status != 0:
            tmpLog.error(status)
            tmpLog.error(out)
            tmpLog.error("Failed to submit jobs to Panda server")
            return
        # update database
        pandaIDstatus = {}
        newJobID = None
        for items in out:
            # get newJobID
            if newJobID == None:
                newJobID = items[1]
            # check PandaID
            PandaID = items[0]
            if PandaID == 'NULL':
                tmpLog.error("Panda server returned wrong IDs. It may have a temporary problem")
                return
            # set newJobsetID
            if newJobsetID in [None,-1]:
                newJobsetID = items[2]['jobsetID']
            # dummy status
            pandaIDstatus[PandaID] = ('defined','NULL')
        # set retry ID
        if not unsetRetryID:
            localJob.retryID = newJobID
            if not newJobsetID in [None,-1,'NULL']:
                localJob.retryJobsetID = newJobsetID
            try:
                PdbUtils.updateJobDB(localJob,self.verbose)
            except:
                tmpLog.error("Failed to set retryID for JobID=%s" % JobID)
                return
        # set new parameters
        newLocalJob = PdbUtils.convertPtoD(retryJobs,pandaIDstatus)
        newLocalJob.JobID = newJobID
        if not newJobsetID in [None,-1,'NULL']:
            newLocalJob.groupID = newJobsetID
        newLocalJob.creationTime = datetime.datetime.utcnow()
        # insert to DB
        try:
            PdbUtils.insertJobDB(newLocalJob,self.verbose)
        except:
            tmpLog.error("Failed to insert JobID=%s to local repository" % newJobID)
            return
        # write new jobdefID
        PsubUtils.writeJobDefID(newJobID)
        # done
        tmpMsg = 'Done. New JobID=%s' % newJobID
        if not newJobsetID in [None,-1,'NULL']:
            tmpMsg += " JobsetID=%s" % newJobsetID
        tmpLog.info(tmpMsg)
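
# Illustrative usage sketch, not part of pbook itself: how the retry() method above
# is typically driven from an interactive pbook session. The IDs are made up and the
# newOpts key shown ('--site') is an assumption; only the keyword arguments themselves
# come from the signature above.
#
#   >>> retry(123)                                   # retry failed subjobs of JobID/JobsetID 123
#   >>> retry(123, newSite=True,
#   ...       newOpts={'--site': 'ANALY_SOMESITE'})  # rebuild the job spec and resubmit elsewhere
#   >>> retry(123, retryBuild=True)                  # also re-run the build job with a new libDS
#   >>> retry(123, ignoreDuplication=True)           # skip the shadow-dataset duplication check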
set includedSite #if job.backend.site != 'AUTO': # taskParamMap['includedSite'] = job.backend.site #taskParamMap['cliParams'] = fullExecString if job.backend.requirements.noEmail: taskParamMap['noEmail'] = True if job.backend.requirements.skipScout: taskParamMap['skipScout'] = True if not app.atlas_exetype in ["ATHENA", "TRF"]: taskParamMap[ 'nMaxFilesPerJob'] = job.backend.requirements.maxNFilesPerJob if job.backend.requirements.disableAutoRetry: taskParamMap['disableAutoRetry'] = 1 # source URL matchURL = re.search("(http.*://[^/]+)/", Client.baseURLCSRVSSL) if matchURL != None: taskParamMap['sourceURL'] = matchURL.group(1) # dataset names outDatasetName = job.outputdata.datasetname logDatasetName = re.sub('/$', '.log/', job.outputdata.datasetname) # log taskParamMap['log'] = { 'dataset': logDatasetName, 'container': logDatasetName, 'type': 'template', 'param_type': 'log', 'value': '{0}.${{SN}}.log.tgz'.format(logDatasetName[:-1]) } # job parameters if app.atlas_exetype in ["ATHENA", "TRF"]: taskParamMap['jobParameters'] = [ { 'type': 'constant', 'value': ' --sourceURL ${SURL}', }, ] else: taskParamMap['jobParameters'] = [ { 'type': 'constant', 'value': '-j "" --sourceURL ${SURL}', }, ] taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '-r {0}'.format(self.rundirectory), }, ] # output # output files outMap = {} if app.atlas_exetype in ["ATHENA", "TRF"]: outMap, tmpParamList = AthenaUtils.convertConfToOutput( self.runConfig, self.extOutFile, job.outputdata.datasetname, destination=job.outputdata.location) taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '-o "%s" ' % outMap }, ] taskParamMap['jobParameters'] += tmpParamList else: if job.outputdata.outputdata: for tmpLFN in job.outputdata.outputdata: if len(job.outputdata.datasetname.split('.')) > 2: lfn = '{0}.{1}'.format( *job.outputdata.datasetname.split('.')[:2]) else: lfn = job.outputdata.datasetname[:-1] lfn += '.$JOBSETID._${{SN/P}}.{0}'.format(tmpLFN) dataset = '{0}_{1}/'.format( job.outputdata.datasetname[:-1], tmpLFN) taskParamMap[ 'jobParameters'] += MiscUtils.makeJediJobParam( lfn, dataset, 'output', hidden=True, destination=job.outputdata.location) outMap[tmpLFN] = lfn taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '-o "{0}"'.format(str(outMap)), }, ] if app.atlas_exetype in ["ATHENA"]: # jobO parameter tmpJobO = self.job_options # replace full-path jobOs for tmpFullName, tmpLocalName in AthenaUtils.fullPathJobOs.iteritems( ): tmpJobO = re.sub(tmpFullName, tmpLocalName, tmpJobO) # modify one-liner for G4 random seeds if self.runConfig.other.G4RandomSeeds > 0: if app.options != '': tmpJobO = re.sub('-c "%s" ' % app.options, '-c "%s;from G4AtlasApps.SimFlags import SimFlags;SimFlags.SeedsG4=${RNDMSEED}" ' \ % app.options,tmpJobO) else: tmpJobO = '-c "from G4AtlasApps.SimFlags import SimFlags;SimFlags.SeedsG4=${RNDMSEED}" ' dictItem = { 'type': 'template', 'param_type': 'number', 'value': '${RNDMSEED}', 'hidden': True, 'offset': self.runConfig.other.G4RandomSeeds, } taskParamMap['jobParameters'] += [dictItem] elif app.atlas_exetype in ["TRF"]: # replace parameters for TRF tmpJobO = self.job_options # output : basenames are in outMap['IROOT'] trough extOutFile tmpOutMap = [] for tmpName, tmpLFN in outMap['IROOT']: tmpJobO = tmpJobO.replace('%OUT.' 
+ tmpName, tmpName) # replace DBR tmpJobO = re.sub('%DB=[^ \'\";]+', '${DBR}', tmpJobO) if app.atlas_exetype in ["TRF"]: taskParamMap['useLocalIO'] = 1 # build if job.backend.nobuild: taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '-a {0}'.format(os.path.basename(self.inputsandbox)), }, ] else: taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '-l ${LIB}', }, ] # # input if job.inputdata and job.inputdata._name == 'DQ2Dataset': if job.backend.requirements.nFilesPerJob > 0 and job.inputdata.number_of_files == 0 and job.backend.requirements.split > 0: job.inputdata.number_of_files = job.backend.requirements.nFilesPerJob * job.backend.requirements.split if job.inputdata and job.inputdata._name == 'DQ2Dataset' and job.inputdata.number_of_files != 0: taskParamMap['nFiles'] = job.inputdata.number_of_files elif job.backend.requirements.nFilesPerJob > 0 and job.backend.requirements.split > 0: # pathena does this for some reason even if there is no input files taskParamMap[ 'nFiles'] = job.backend.requirements.nFilesPerJob * job.backend.requirements.split if job.backend.requirements.nFilesPerJob > 0: taskParamMap[ 'nFilesPerJob'] = job.backend.requirements.nFilesPerJob if job.backend.requirements.nEventsPerFile > 0: taskParamMap[ 'nEventsPerFile'] = job.backend.requirements.nEventsPerFile if not job.backend.requirements.nGBPerJob in [0, 'MAX']: try: if job.backend.requirements.nGBPerJob != 'MAX': job.backend.requirments.nGBPerJob = int( job.backend.requirements.nGBPerJob) except: logger.error("nGBPerJob must be an integer or MAX") # check negative if job.backend.requirements.nGBPerJob <= 0: logger.error("nGBPerJob must be positive") # don't set MAX since it is the defalt on the server side if not job.backend.requirements.nGBPerJob in [-1, 'MAX']: taskParamMap['nGBPerJob'] = job.backend.requirements.nGBPerJob if app.atlas_exetype in ["ATHENA", "TRF"]: inputMap = {} if job.inputdata and job.inputdata._name == 'DQ2Dataset': tmpDict = { 'type': 'template', 'param_type': 'input', 'value': '-i "${IN/T}"', 'dataset': ','.join(job.inputdata.dataset), 'expand': True, 'exclude': '\.log\.tgz(\.\d+)*$', } #if options.inputType != '': # tmpDict['include'] = options.inputType taskParamMap['jobParameters'].append(tmpDict) taskParamMap['dsForIN'] = ','.join(job.inputdata.dataset) inputMap['IN'] = ','.join(job.inputdata.dataset) else: # no input taskParamMap['noInput'] = True if job.backend.requirements.split > 0: taskParamMap['nEvents'] = job.backend.requirements.split else: taskParamMap['nEvents'] = 1 taskParamMap['nEventsPerJob'] = 1 taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '-i "[]"', }, ] else: if job.inputdata and job.inputdata._name == 'DQ2Dataset': tmpDict = { 'type': 'template', 'param_type': 'input', 'value': '-i "${IN/T}"', 'dataset': ','.join(job.inputdata.dataset), 'expand': True, 'exclude': '\.log\.tgz(\.\d+)*$', } #if options.nSkipFiles != 0: # tmpDict['offset'] = options.nSkipFiles taskParamMap['jobParameters'].append(tmpDict) taskParamMap['dsForIN'] = ','.join(job.inputdata.dataset) else: # no input taskParamMap['noInput'] = True if job.backend.requirements.split > 0: taskParamMap['nEvents'] = job.backend.requirements.split else: taskParamMap['nEvents'] = 1 taskParamMap['nEventsPerJob'] = 1 # param for DBR if self.dbrelease != '': dbrDS = self.dbrelease.split(':')[0] # change LATEST to DBR_LATEST if dbrDS == 'LATEST': dbrDS = 'DBR_LATEST' dictItem = { 'type': 'template', 'param_type': 'input', 'value': '--dbrFile=${DBR}', 'dataset': dbrDS, 
} taskParamMap['jobParameters'] += [dictItem] # no expansion #if options.notExpandDBR: #dictItem = {'type':'constant', # 'value':'--noExpandDBR', # } #taskParamMap['jobParameters'] += [dictItem] # secondary FIXME disabled self.secondaryDSs = {} if self.secondaryDSs != {}: inMap = {} streamNames = [] for tmpDsName, tmpMap in self.secondaryDSs.iteritems(): # make template item streamName = tmpMap['streamName'] dictItem = MiscUtils.makeJediJobParam( '${' + streamName + '}', tmpDsName, 'input', hidden=True, expand=True, include=tmpMap['pattern'], offset=tmpMap['nSkip'], nFilesPerJob=tmpMap['nFiles']) taskParamMap['jobParameters'] += dictItem inMap[streamName] = 'tmp_' + streamName streamNames.append(streamName) # make constant item strInMap = str(inMap) # set placeholders for streamName in streamNames: strInMap = strInMap.replace("'tmp_" + streamName + "'", '${' + streamName + '/T}') dictItem = { 'type': 'constant', 'value': '--inMap "%s"' % strInMap, } taskParamMap['jobParameters'] += [dictItem] # misc jobParameters = '' # use Athena packages if app.atlas_exetype == 'ARES' or (app.atlas_exetype in ['PYARA', 'ROOT', 'EXE'] and app.useAthenaPackages): jobParameters += "--useAthenaPackages " # use RootCore if app.useRootCore or app.useRootCoreNoBuild: jobParameters += "--useRootCore " # use mana if app.useMana: jobParameters += "--useMana " if app.atlas_release != "": jobParameters += "--manaVer %s " % app.atlas_release # root if app.atlas_exetype in ['PYARA', 'ROOT', 'EXE' ] and job.backend.requirements.rootver != '': rootver = re.sub('/', '.', job.backend.requirements.rootver) jobParameters += "--rootVer %s " % rootver # write input to txt #if options.writeInputToTxt != '': # jobParameters += "--writeInputToTxt %s " % options.writeInputToTxt # debug parameters #if options.queueData != '': # jobParameters += "--overwriteQueuedata=%s " % options.queueData # JEM #if options.enableJEM: # jobParameters += "--enable-jem " # if options.configJEM != '': # jobParameters += "--jem-config %s " % options.configJEM # set task param if jobParameters != '': taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': jobParameters, }, ] # force stage-in if job.backend.accessmode == "LocalIO": taskParamMap['useLocalIO'] = 1 # set jobO parameter if app.atlas_exetype in ["ATHENA", "TRF"]: taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '-j "', 'padding': False, }, ] taskParamMap[ 'jobParameters'] += PsubUtils.convertParamStrToJediParam( tmpJobO, inputMap, job.outputdata.datasetname[:-1], True, False) taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '"', }, ] else: taskParamMap['jobParameters'] += [ { 'type': 'constant', 'value': '-p "{0}"'.format(urllib.quote(self.job_options)), }, ] # build step if not job.backend.nobuild: jobParameters = '-i ${IN} -o ${OUT} --sourceURL ${SURL} ' if job.backend.bexec != '': jobParameters += ' --bexec "%s" ' % urllib.quote( job.backend.bexec) if app.atlas_exetype == 'ARES' or (app.atlas_exetype in ['PYARA', 'ROOT', 'EXE'] and app.useAthenaPackages): # use Athena packages jobParameters += "--useAthenaPackages " # use RootCore if app.useRootCore or app.useRootCoreNoBuild: jobParameters += "--useRootCore " # run directory if app.atlas_exetype in ['PYARA', 'ARES', 'ROOT', 'EXE']: jobParameters += '-r {0} '.format(self.rundirectory) # no compile #if options.noCompile: # jobParameters += "--noCompile " # use mana if app.useMana: jobParameters += "--useMana " if app.atlas_release != "": jobParameters += "--manaVer %s " % app.atlas_release # root 
if app.atlas_exetype in [ 'PYARA', 'ROOT', 'EXE' ] and job.backend.requirements.rootver != '': rootver = re.sub('/', '.', job.backend.requirements.rootver) jobParameters += "--rootVer %s " % rootver # cmt config if app.atlas_exetype in ['PYARA', 'ARES', 'ROOT', 'EXE']: if not app.atlas_cmtconfig in ['', 'NULL', None]: jobParameters += " --cmtConfig %s " % app.atlas_cmtconfig #cmtConfig = AthenaUtils.getCmtConfig(athenaVer=app.atlas_release, cmtConfig=app.atlas_cmtconfig) #if cmtConfig: # jobParameters += "--cmtConfig %s " % cmtConfig # debug parameters #if options.queueData != '': # jobParameters += "--overwriteQueuedata=%s " % options.queueData # set task param taskParamMap['buildSpec'] = { 'prodSourceLabel': 'panda', 'archiveName': os.path.basename(self.inputsandbox), 'jobParameters': jobParameters, } # enable merging if job.backend.requirements.enableMerge: jobParameters = '-r {0} '.format(self.rundirectory) if 'exec' in job.backend.requirements.configMerge and job.backend.requirements.configMerge[ 'exec'] != '': jobParameters += '-j "{0}" '.format( job.backend.requirements.configMerge['exec']) if not job.backend.nobuild: jobParameters += '-l ${LIB} ' else: jobParameters += '-a {0} '.format( os.path.basename(self.inputsandbox)) jobParameters += "--sourceURL ${SURL} " jobParameters += '${TRN_OUTPUT:OUTPUT} ${TRN_LOG:LOG}' taskParamMap['mergeSpec'] = {} taskParamMap['mergeSpec']['useLocalIO'] = 1 taskParamMap['mergeSpec']['jobParameters'] = jobParameters taskParamMap['mergeOutput'] = True # Selected by Jedi #if not app.atlas_exetype in ['PYARA','ROOT','EXE']: # taskParamMap['transPath'] = 'http://atlpan.web.cern.ch/atlpan/runAthena-00-00-12' logger.debug(taskParamMap) # upload sources if self.inputsandbox and not job.backend.libds: uploadSources(os.path.dirname(self.inputsandbox), os.path.basename(self.inputsandbox)) if not self.inputsandbox == tmp_user_area_name: logger.info('Removing source tarball %s ...' % self.inputsandbox) os.remove(self.inputsandbox) return taskParamMap
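# --- Illustration only: not called by the handler above -------------------
# A minimal sketch of the output-naming rule applied in master_prepare() for
# the non-ATHENA/TRF exetypes: the container name is reduced to its first two
# dot-separated fields, JEDI placeholders are appended, and each output gets
# its own per-type dataset. The helper name and the sample dataset below are
# hypothetical; the string logic mirrors the code above.
def _sketch_output_lfn(datasetname, outname):
    """Return (lfn template, per-output dataset) as built in master_prepare().

    Assumes ``datasetname`` is a container name ending in '/'.
    """
    parts = datasetname.split('.')
    if len(parts) > 2:
        lfn = '{0}.{1}'.format(*parts[:2])
    else:
        lfn = datasetname[:-1]
    # $JOBSETID and ${SN/P} are substituted by JEDI at job generation time
    lfn += '.$JOBSETID._${{SN/P}}.{0}'.format(outname)
    dataset = '{0}_{1}/'.format(datasetname[:-1], outname)
    return lfn, dataset

# Example (hypothetical container name):
#   _sketch_output_lfn('user.someone.test.1/', 'hist.root')
#   -> ('user.someone.$JOBSETID._${SN/P}.hist.root',
#       'user.someone.test.1_hist.root/')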
def retry(self, JobsetID, newSite=False, newOpts={}, noSubmit=False, ignoreDuplication=True): # get logger tmpLog = PLogger.getPandaLogger() # check proxy self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy( self.gridPassPhrase, False, self.verbose) # get jobset newJobsetID = -1 jobList = self.getJobIDsWithSetID(JobsetID) if jobList == None: isJobset = False jobList = [JobsetID] else: isJobset = True tmpMsg = "JobsetID=%s is composed of JobID=" % JobsetID for tmpJobID in jobList: tmpMsg += '%s,' % tmpJobID tmpMsg = tmpMsg[:-1] tmpLog.info(tmpMsg) for JobID in jobList: # get job info from local repository localJob = self.getJobInfo(JobID) if localJob == None: tmpLog.warning( "JobID=%s not found in local repository. Synchronization may be needed" % JobID) return None # skip running job if localJob.dbStatus != 'frozen': tmpLog.warning('Cannot retry running jobs') if isJobset: continue else: return # skip already retried if localJob.retryID != '0': if isJobset: tmpLog.info('Skip JobID=%s since already retried by JobID=%s JobsetID=%s' % \ (JobID,localJob.retryID,localJob.retryJobsetID)) continue else: tmpLog.warning('This job was already retried by JobID=%s' % localJob.retryID) return # check status of buildJob if not localJob.buildStatus in ['', 'finished']: tmpMsgStr = 'Cannot retry since status of buildJob %s is %s (!= finished). ' \ % (localJob.PandaID.split(',')[0],localJob.buildStatus) tmpMsgStr += 'Please execute %s with the same input/output datasets (or containers). ' % localJob.jobType tmpMsgStr += 'It will run only on failed/cancelled/unused input files ' tmpMsgStr += 'and append output files to the output dataset container' tmpLog.warning(tmpMsgStr) if isJobset: continue else: return # check opts for newSite if newSite: if not localJob.outDS.endswith('/') and not newOpts.has_key( 'outDS') and not newOpts.has_key('--outDS'): tmpLog.warning( 'You need to specify --outDS in newOpts to retry at new site unless container is used as output' ) return # get list of failed jobs pandaIDs = localJob.PandaID.split(',') statusList = localJob.jobStatus.split(',') jobList = [] for idx in range(len(pandaIDs)): # check status if not statusList[idx] in ['failed', 'cancelled']: continue jobList.append(pandaIDs[idx]) # no failed job if jobList == []: if isJobset: tmpLog.info('Skip JobID=%s since no failed jobs' % JobID) continue else: tmpLog.info('No failed jobs to be retried for JobID=%s' % JobID) return # get full job spec tmpLog.info("Retrying JobID=%s ..." 
% JobID) tmpLog.info("Getting job info") idxJL = 0 nQuery = 500 pandaJobs = [] while idxJL < len(jobList): # avoid burst query tmpLog.info(" %5s/%s" % (idxJL, len(jobList))) status, oTmp = Client.getFullJobStatus(jobList[idxJL:idxJL + nQuery], verbose=self.verbose) if status != 0: tmpLog.error(status) tmpLog.error(oTmp) tmpLog.error("Cannot get job info from Panda server") return pandaJobs += oTmp idxJL += nQuery time.sleep(1) tmpLog.info(" %5s/%s" % (len(jobList), len(jobList))) # jobdefID newJobdefID = PsubUtils.readJobDefID() # reset some parameters retryJobs = [] retrySite = None retryElement = None retryDestSE = None outDsName = None shadowList = [] for idx in range(len(jobList)): job = pandaJobs[idx] # skip exired if job == None: tmpLog.warning("Could not retry jobs older than 30 days : JobID=%s (PandaID=%s) expired" \ % (JobID,jobList[idxJob])) return # get shadow list if (not ignoreDuplication ) and outDsName == None and job.prodSourceLabel == 'user': # look for dataset for log since it doesn't have suffix even when --individualOutDS is used for tmpFile in job.Files: if tmpFile.type == 'log': outDsName = tmpFile.dataset break # output dataset was not found if outDsName == None: tmpLog.error("Could not get output dataset name for JobID=%s (PandaID=%s)" \ % (JobID,job.PandaID)) return # get files in shadow if outDsName.endswith('/'): shadowList = Client.getFilesInShadowDataset( outDsName, Client.suffixShadow, self.verbose) else: # disable duplication check mainly for old overlay jobs since non-signal files are wrongly skipped #shadowList = Client.getFilesInShadowDatasetOld(outDsName,Client.suffixShadow,self.verbose) pass # unify sitename if retrySite == None: retrySite = job.computingSite retryElement = job.computingElement retryDestSE = job.destinationSE # reset job.jobStatus = None job.commandToPilot = None job.startTime = None job.endTime = None job.attemptNr = 1 + job.attemptNr for attr in job._attributes: if attr.endswith('ErrorCode') or attr.endswith( 'ErrorDiag'): setattr(job, attr, None) job.transExitCode = None job.computingSite = retrySite job.computingElement = retryElement job.destinationSE = retryDestSE job.dispatchDBlock = None job.jobExecutionID = JobID job.jobDefinitionID = newJobdefID job.parentID = job.PandaID if job.jobsetID != ['NULL', None, -1]: job.sourceSite = job.jobsetID job.jobsetID = newJobsetID skipInputList = [] numUsedFiles = 0 for file in job.Files: file.rowID = None if file.type == 'input': # protection against wrong sync which doesn't update buildStatus correctly if file.lfn.endswith( '.lib.tgz') and file.GUID == 'NULL': tmpLog.warning('GUID for %s is unknown. 
Cannot retry when corresponding buildJob failed' \ % file.lfn) return file.status = 'ready' # check with shadow for non lib.tgz/DBR tmpDbrMatch = re.search('^DBRelease-.*\.tar\.gz$', file.lfn) if tmpDbrMatch == None and not file.lfn.endswith( '.lib.tgz'): if file.lfn in shadowList: skipInputList.append(file) else: numUsedFiles += 1 elif file.type in ('output', 'log'): file.destinationSE = retryDestSE file.destinationDBlock = re.sub( '_sub\d+$', '', file.destinationDBlock) # add retry num if file.dataset.endswith('/'): retryMatch = re.search('_r(\d+)$', file.destinationDBlock) if retryMatch == None: file.destinationDBlock += '_r1' else: tmpDestinationDBlock = re.sub( '_r(\d+)$', '', file.destinationDBlock) file.destinationDBlock = tmpDestinationDBlock + '_r%d' % ( 1 + int(retryMatch.group(1))) # add attempt nr oldName = file.lfn file.lfn = re.sub("\.\d+$", "", file.lfn) file.lfn = "%s.%d" % (file.lfn, job.attemptNr) newName = file.lfn # modify jobParameters job.jobParameters = re.sub("'%s'" % oldName, "'%s'" % newName, job.jobParameters) # look for output in trf oldGenelicName = re.sub('\.\d+$', '', oldName) match = re.search(oldGenelicName + '(\.\d+)*(%20|")', job.jobParameters) if match != None: job.jobParameters = job.jobParameters.replace( match.group(0), newName + match.group(2)) # all files are used by others if numUsedFiles == 0 and skipInputList != []: continue # remove skipped files strSkipped = '' for tmpFile in skipInputList: strSkipped += '%s,' % tmpFile.lfn job.Files.remove(tmpFile) strSkipped = strSkipped[:-1] # modify jobpar if strSkipped != '': optionToSkipFiles = '--skipInputByRetry' if not optionToSkipFiles in job.jobParameters: # just append job.jobParameters += "%s=%s " % (optionToSkipFiles, strSkipped) else: # extract already skipped files tmpMatch = re.search("(%s=[^ ]+)", job.jobParameters) if tmpMatch == None: tmpLog.error("Failed to extract arg of %s for PandaID=%s" \ % (optionToSkipFiles,job.PandaID)) return # replace job.jobParameters = re.sub( tmpMatch.group(1), "%s,%s" % (tmpMatch.group(1), optionToSkipFiles), job.jobParameters) if self.verbose: tmpLog.debug(job.jobParameters) # append retryJobs.append(job) # all input files were or are being used by other jobs if retryJobs == []: tmpLog.info( 'All input files were or are being used by other jobs for the same output. No jobs to be retried. If you need to ignore duplication check (e.g., using the same EVNT file for multiple simulation subjobs), set ignoreDuplication=True. i.e. 
retry(123,ignoreDuplication=True)' ) return # check voms role if not retryJobs[0].workingGroup in ['NULL', None, '']: # VOMS role was used if not "--workingGroup" in job.metadata: # extract voms roles from metadata match = re.search("--voms( |=)[ \"]*([^ \"]+)", job.metadata) if match != None: vomsRoles = match.group(2) else: vomsRoles = "atlas:/atlas/%s/Role=production" % retryJobs[ 0].workingGroup # regenerate proxy with VOMS roles try: tmpLog.info("Checking proxy role to resubmit %s jobs" % retryJobs[0].workingGroup) self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy( self.gridPassPhrase, False, self.verbose, vomsRoles) except: tmpLog.error("Failed to generate a proxy with %s" % vomsRoles) return # check runtime env for new site submission if newSite: if retryJobs[ 0].processingType == 'pathena' or '--useAthenaPackages' in retryJobs[ 0].metadata: from pandatools import AthenaUtils stA, retA = AthenaUtils.getAthenaVer() if not stA: tmpLog.error( "Failed to get Athena rel/cache version in current runtime env" ) return athenaVer = retA['athenaVer'] cacheVer = retA['cacheVer'] nightVer = retA['nightVer'] wrongSetup = False if retryJobs[0].AtlasRelease != 'Atlas-%s' % athenaVer: wrongSetup = True errMsg = "Current Athena version Atlas-%s is inconsitent with the previous submission %s. " % ( athenaVer, retryJobs[0].AtlasRelease) elif retryJobs[ 0].homepackage != 'AnalysisTransforms' + cacheVer + nightVer: wrongSetup = True errMsg = "Current cache version %s is inconsitent with the previous submission. " % cacheVer.replace( '-', '').replace('_', '-') if wrongSetup: errMsg += 'You need to have the same runtime env as before since all job spec need to be re-created to send jobs to a new site. ' errMsg += 'Please setup Athena correctly and restart pbook' tmpLog.error(errMsg) return # test mode if noSubmit: continue # invoke pathena/prun to send job to new site if newSite: tmpLog.info( "Constrcuting job spec again to be sent to another site ..." ) comStat = PsubUtils.execWithModifiedParams( retryJobs, newOpts, self.verbose) if comStat == 0: # update database time.sleep(2) self.sync() else: tmpLog.error("Failed to submit jobs to Panda server") return # register datasets tmpOutDsLocation = Client.PandaSites[ retryJobs[-1].computingSite]['ddm'] addedDataset = [] shadowDSname = None for tmpFile in retryJobs[-1].Files: if tmpFile.type in ['output', 'log' ] and tmpFile.dataset.endswith('/'): # add shadow if shadowDSname == None and tmpFile.type == 'log': shadowDSname = "%s%s" % (tmpFile.destinationDBlock, Client.suffixShadow) Client.addDataset(shadowDSname, self.verbose) # add datasets if not tmpFile.destinationDBlock in addedDataset: # create dataset Client.addDataset(tmpFile.destinationDBlock, self.verbose, location=tmpOutDsLocation) # add to container Client.addDatasetsToContainer( tmpFile.dataset, [tmpFile.destinationDBlock], self.verbose) # append addedDataset.append(tmpFile.destinationDBlock) # submit tmpLog.info("Submitting job ...") status, out = Client.submitJobs(retryJobs, verbose=self.verbose) if out == None or status != 0: tmpLog.error(status) tmpLog.error(out) tmpLog.error("Failed to submit jobs to Panda server") return # update database pandaIDstatus = {} newJobID = None for items in out: # get newJobID if newJobID == None: newJobID = items[1] # check PandaID PandaID = items[0] if PandaID == 'NULL': tmpLog.error( "Panda server returned wrong IDs. 
It may have a temporary problem" ) return # set newJobsetID if newJobsetID in [None, -1]: newJobsetID = items[2]['jobsetID'] # dummy statuso pandaIDstatus[PandaID] = ('defined', 'NULL') # set retry ID localJob.retryID = newJobID if not newJobsetID in [None, -1, 'NULL']: localJob.retryJobsetID = newJobsetID try: PdbUtils.updateJobDB(localJob, self.verbose) except: tmpLog.error("Failed to set retryID for JobID=%s" % JobID) return # set new paramers newLocalJob = PdbUtils.convertPtoD(retryJobs, pandaIDstatus) newLocalJob.JobID = newJobID if not newJobsetID in [None, -1, 'NULL']: newLocalJob.groupID = newJobsetID newLocalJob.creationTime = datetime.datetime.utcnow() # insert to DB try: PdbUtils.insertJobDB(newLocalJob, self.verbose) except: tmpLog.error("Failed to insert JobID=%s to local repository" % newJobID) return # write new jobdefID PsubUtils.writeJobDefID(newJobID) # done tmpMsg = 'Done. New JobID=%s' % newJobID if not newJobsetID in [None, -1, 'NULL']: tmpMsg += " JobsetID=%s" % newJobsetID tmpLog.info(tmpMsg)
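# --- Illustration only: not called by retry() above ------------------------
# A minimal sketch of the renaming retry() applies to resubmitted output
# files: the transient '_sub<N>' suffix is stripped, the '_r<N>' retry
# counter is added or bumped (done above only when the dataset is a
# container, i.e. ends with '/'), and the LFN attempt suffix is replaced by
# the new attempt number. The helper names and sample values are hypothetical.
import re

def _sketch_retry_dblock(destinationDBlock):
    """Strip '_sub<N>' and bump the '_r<N>' retry suffix."""
    dblock = re.sub('_sub\d+$', '', destinationDBlock)
    retryMatch = re.search('_r(\d+)$', dblock)
    if retryMatch is None:
        return dblock + '_r1'
    return re.sub('_r(\d+)$', '', dblock) + '_r%d' % (1 + int(retryMatch.group(1)))

def _sketch_retry_lfn(lfn, attemptNr):
    """Replace any trailing '.<N>' attempt suffix with the new attempt number."""
    return "%s.%d" % (re.sub("\.\d+$", "", lfn), attemptNr)

# Examples (hypothetical names):
#   _sketch_retry_dblock('user.someone.test.1_hist.root_sub01')  -> '..._hist.root_r1'
#   _sketch_retry_dblock('user.someone.test.1_hist.root_r2')     -> '..._hist.root_r3'
#   _sketch_retry_lfn('user.someone.AOD.pool.root.3', 4)         -> 'user.someone.AOD.pool.root.4'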
def prepare(self, app, appconfig, appmasterconfig, jobmasterconfig):
    '''prepare the subjob specific configuration'''

    # PandaTools
    from pandatools import Client
    from pandatools import AthenaUtils
    from taskbuffer.JobSpec import JobSpec
    from taskbuffer.FileSpec import FileSpec

    job = app._getParent()
    logger.debug('AthenaMCPandaRTHandler prepare called for %s',
                 job.getFQID('.'))

    try:
        assert self.outsite
    except:
        logger.error("outsite not set. Aborting")
        raise Exception()

    job.backend.site = self.outsite
    job.backend.actualCE = self.outsite
    cloud = job._getRoot().backend.requirements.cloud
    job.backend.requirements.cloud = cloud

    # now just filling the job from AthenaMC data
    jspec = JobSpec()
    jspec.jobDefinitionID = job._getRoot().id
    jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
    jspec.AtlasRelease = 'Atlas-%s' % app.atlas_rel

    if app.transform_archive:
        jspec.homepackage = 'AnalysisTransforms' + app.transform_archive
    elif app.prod_release:
        jspec.homepackage = 'AnalysisTransforms-AtlasProduction_' + str(
            app.prod_release)
    jspec.transformation = '%s/runAthena-00-00-11' % Client.baseURLSUB

    #---->???? prodDBlock and destinationDBlock when facing several input / output datasets?
    jspec.prodDBlock = 'NULL'
    if job.inputdata and len(
            app.inputfiles) > 0 and app.inputfiles[0] in app.dsetmap:
        jspec.prodDBlock = app.dsetmap[app.inputfiles[0]]

    # How to specify jspec.destinationDBlock when more than one type of output
    # is available? Panda prod jobs seem to specify only the last output dataset.
    outdset = ""
    for type in ["EVNT", "RDO", "HITS", "AOD", "ESD", "NTUP"]:
        if type in app.outputpaths.keys():
            outdset = string.replace(app.outputpaths[type], "/", ".")
            outdset = outdset[1:-1]
            break
    if not outdset:
        try:
            assert len(app.outputpaths.keys()) > 0
        except:
            logger.error(
                "app.outputpaths is empty: check your output datasets")
            raise
        type = app.outputpaths.keys()[0]
        outdset = string.replace(app.outputpaths[type], "/", ".")
        outdset = outdset[1:-1]

    jspec.destinationDBlock = outdset
    jspec.destinationSE = self.outsite
    jspec.prodSourceLabel = 'user'
    jspec.assignedPriority = 1000
    jspec.cloud = cloud
    # memory
    if job.backend.requirements.memory != -1:
        jspec.minRamCount = job.backend.requirements.memory
    jspec.computingSite = self.outsite
    jspec.cmtConfig = AthenaUtils.getCmtConfig(athenaVer=app.atlas_rel)

    # library (source files)
    flib = FileSpec()
    flib.lfn = self.library
    # flib.GUID =
    flib.type = 'input'
    # flib.status =
    flib.dataset = self.libDataset
    flib.dispatchDBlock = self.libDataset
    jspec.addFile(flib)

    # input files FIXME: many more input types
    for lfn in app.inputfiles:
        useguid = app.turls[lfn].replace("guid:", "")
        finp = FileSpec()
        finp.lfn = lfn
        finp.GUID = useguid
        finp.dataset = app.dsetmap[lfn]
        finp.prodDBlock = app.dsetmap[lfn]
        finp.prodDBlockToken = 'local'
        finp.dispatchDBlock = app.dsetmap[lfn]
        finp.type = 'input'
        finp.status = 'ready'
        jspec.addFile(finp)
    # add dbfiles if any:
    for lfn in app.dbfiles:
        useguid = app.dbturls[lfn].replace("guid:", "")
        finp = FileSpec()
        finp.lfn = lfn
        finp.GUID = useguid
        finp.dataset = app.dsetmap[lfn]
        finp.prodDBlock = app.dsetmap[lfn]
        finp.prodDBlockToken = 'local'
        finp.dispatchDBlock = app.dsetmap[lfn]
        finp.type = 'input'
        finp.status = 'ready'
        jspec.addFile(finp)
    # then minbias files
    for lfn in app.mbfiles:
        useguid = app.minbias_turls[lfn].replace("guid:", "")
        finp = FileSpec()
        finp.lfn = lfn
        finp.GUID = useguid
        finp.dataset = app.dsetmap[lfn]
        finp.prodDBlock = app.dsetmap[lfn]
        finp.prodDBlockToken = 'local'
        finp.dispatchDBlock = app.dsetmap[lfn]
        finp.type = 'input'
        finp.status = 'ready'
        jspec.addFile(finp)
    # then cavern files
    for lfn in app.cavernfiles:
        useguid = app.cavern_turls[lfn].replace("guid:", "")
        finp = FileSpec()
        finp.lfn = lfn
        finp.GUID = useguid
        finp.dataset = app.dsetmap[lfn]
        finp.prodDBlock = app.dsetmap[lfn]
        finp.prodDBlockToken = 'local'
        finp.dispatchDBlock = app.dsetmap[lfn]
        finp.type = 'input'
        finp.status = 'ready'
        jspec.addFile(finp)

    # output files (this includes the logfiles)
    jidtag = ""
    job = app._getParent()  # Returns job or subjob object
    if job._getRoot().subjobs:
        jidtag = job._getRoot().id
    else:
        jidtag = "%d" % job.id
    outfiles = app.subjobsOutfiles[job.id]
    pandaOutfiles = {}
    for type in outfiles.keys():
        pandaOutfiles[type] = outfiles[type] + "." + str(jidtag)
        if type == "LOG":
            pandaOutfiles[type] += ".tgz"
    #print pandaOutfiles

    for outtype in pandaOutfiles.keys():
        fout = FileSpec()
        dset = string.replace(app.outputpaths[outtype], "/", ".")
        dset = dset[1:-1]
        fout.dataset = dset
        fout.lfn = pandaOutfiles[outtype]
        fout.type = 'output'
        # fout.destinationDBlock = jspec.destinationDBlock
        fout.destinationDBlock = fout.dataset
        fout.destinationSE = jspec.destinationSE
        if outtype == 'LOG':
            fout.type = 'log'
            fout.destinationDBlock = fout.dataset
            fout.destinationSE = job.backend.site
        jspec.addFile(fout)

    # job parameters
    param = '-l %s ' % self.library  # user tarball.
    # use corruption checker
    if job.backend.requirements.corCheck:
        param += '--corCheck '
    # disable to skip missing files
    if job.backend.requirements.notSkipMissing:
        param += '--notSkipMissing '

    # transform parameters
    # need to update arglist with final output file name...
    newArgs = []
    if app.mode == "evgen":
        app.args[3] = app.args[3] + " -t "
        if app.verbosity:
            app.args[3] = app.args[3] + " -l %s " % app.verbosity
    for arg in app.args[3:]:
        for type in outfiles.keys():
            if arg.find(outfiles[type]) > -1:
                arg = arg.replace(outfiles[type], pandaOutfiles[type])
        newArgs.append(arg)
    arglist = string.join(newArgs, " ")
    # print "Arglist:",arglist

    param += ' -r ./ '
    param += ' -j "%s"' % urllib.quote(arglist)
    allinfiles = app.inputfiles + app.dbfiles
    # Input files.
    param += ' -i "%s" ' % allinfiles
    if len(app.mbfiles) > 0:
        param += ' -m "%s" ' % app.mbfiles
    if len(app.cavernfiles) > 0:
        param += ' -n "%s" ' % app.cavernfiles
    # param += '-m "[]" ' #%minList FIXME
    # param += '-n "[]" ' #%cavList FIXME

    # logfiles do not appear in IROOT block, and this one is not needed anymore...
    del pandaOutfiles["LOG"]
    param += ' -o "{\'IROOT\':%s }"' % str(pandaOutfiles.items())

    # source URL
    matchURL = re.search("(http.*://[^/]+)/", Client.baseURLSSL)
    if matchURL != None:
        param += " --sourceURL %s " % matchURL.group(1)
    param += " --trf"

    jspec.jobParameters = param
    jspec.metadata = "--trf \"%s\"" % arglist

    #print "SUBJOB DETAILS:",jspec.values()
    if app.dryrun:
        print "job.application.dryrun activated, printing out job parameters"
        print jspec.values()
        return
    return jspec
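# --- Illustration only: not called by prepare() above ----------------------
# A minimal sketch of how prepare() derives the per-subjob output names and
# the '-o' transform parameter: every output gets the job id appended, the
# LOG entry additionally gets '.tgz', and the non-LOG outputs are handed to
# the transform as an 'IROOT' mapping (Python 2 dict.items() gives a plain
# list here). The helper name and sample values are hypothetical; an entry
# with key 'LOG' is assumed to be present, as in the code above.
def _sketch_iroot_param(outfiles, jidtag):
    """Return (pandaOutfiles, '-o ...' fragment) as assembled in prepare()."""
    pandaOutfiles = {}
    for ftype in outfiles.keys():
        pandaOutfiles[ftype] = outfiles[ftype] + "." + str(jidtag)
        if ftype == "LOG":
            pandaOutfiles[ftype] += ".tgz"
    # the log is shipped separately and must not appear in the IROOT block
    iroot = dict(pandaOutfiles)
    del iroot["LOG"]
    param = ' -o "{\'IROOT\':%s }"' % str(iroot.items())
    return pandaOutfiles, param

# Example (hypothetical file names):
#   _sketch_iroot_param({'EVNT': 'evgen.pool.root', 'LOG': 'evgen.log'}, 42)
#   -> ({'EVNT': 'evgen.pool.root.42', 'LOG': 'evgen.log.42.tgz'},
#       ' -o "{\'IROOT\':[(\'EVNT\', \'evgen.pool.root.42\')] }"')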