def getPandaStatus(self):
    """Poll the PanDA server for per-site job statistics and cloud status.

    For every configured (country, group) pair the per-site job statistics
    are fetched and copied into each queue's 'pandaStatus'.  Sites unknown
    to PanDA get an all-zero status so they can still bootstrap.  Finally
    the cloud status is polled.  Raises PandaStatusFailure on any non-zero
    client return code.
    """
    for country in self.config.sites.keys():
        for group in self.config.sites[country].keys():
            # country/group = None is equivalent to not specifying anything
            self.factoryMessages.info('Polling panda status for country=%s, group=%s' % (country, group,))
            # Client call returns (errorCode, perSiteStatistics) — cached under the
            # reserved 'siteStatus' key of this country/group mapping.
            error,self.config.sites[country][group]['siteStatus'] = Client.getJobStatisticsPerSite(countryGroup=country,workingGroup=group)
            if error != 0:
                raise PandaStatusFailure, 'Client.getJobStatisticsPerSite(countryGroup=%s,workingGroup=%s) error: %s' % (country, group, error)
            for siteid, queues in self.config.sites[country][group].iteritems():
                # Skip the statistics entry itself — only real site ids carry queues.
                if siteid == 'siteStatus':
                    continue
                if siteid in self.config.sites[country][group]['siteStatus']:
                    self.factoryMessages.debug('Panda status: %s (country=%s, group=%s) %s' % (siteid, country, group, self.config.sites[country][group]['siteStatus'][siteid]))
                    for queue in queues:
                        self.config.queues[queue]['pandaStatus'] = self.config.sites[country][group]['siteStatus'][siteid]
                else:
                    # If panda knows nothing, then we assume all zeros (site may be inactive)
                    self.factoryMessages.debug('Panda status for siteid %s (country=%s, group=%s) not found - setting zeros in status to allow bootstraping of site.' % (siteid, country, group))
                    for queue in queues:
                        self.config.queues[queue]['pandaStatus'] = {'transferring': 0, 'activated': 0, 'running': 0, 'assigned': 0, 'failed': 0, 'finished': 0}
    # Now poll site and cloud status to suppress pilots if a site is offline
    # Take site status out - better to use individual queue status from schedconfig
    #self.factoryMessages.info('Polling panda for site status')
    #error,self.pandaSiteStatus = Client.getSiteSpecs(siteType='all')
    #if error != 0:
    #    raise PandaStatusFailure, '''Client.getSiteSpecs(siteType='all') error: %s''' % (error)
    self.factoryMessages.info('Polling panda for cloud status')
    error,self.pandaCloudStatus = Client.getCloudSpecs()
    if error != 0:
        raise PandaStatusFailure, 'Client.getCloudSpecs() error: %s' % (error)
def eraseDispDatasets(ids):
    """Erase the dispatch (_disNNN) datasets of the given PanDA jobs.

    ids -- list of PandaIDs; only 'managed' (production) jobs outside the
    US cloud are considered, since dispatch blocks are not DQ2 datasets in US.
    """
    print "eraseDispDatasets"
    datasets = []
    # get jobs
    status,jobs = Client.getJobStatus(ids)
    if status != 0:
        # best effort: give up silently on a communication error
        return
    # gather dispDBlocks
    for job in jobs:
        # dispatchDS is not a DQ2 dataset in US
        if job.cloud == 'US':
            continue
        # erase disp datasets for production jobs only
        if job.prodSourceLabel != 'managed':
            continue
        for file in job.Files:
            if file.dispatchDBlock == 'NULL':
                continue
            # collect unique names matching the dispatch-block suffix _dis<digits>
            if (not file.dispatchDBlock in datasets) and \
               re.search('_dis\d+$',file.dispatchDBlock) != None:
                datasets.append(file.dispatchDBlock)
    # erase
    for dataset in datasets:
        print 'erase %s' % dataset
        status,out = ddm.DQ2.main('eraseDataset',dataset)
        print out
def killJobs(self, ids, code=None, verbose=False):
    """Kill jobs. Normal users can kill only their own jobs.
    People with production VOMS role can kill any jobs.
    Running jobs are killed when next heartbeat comes from the pilot.
    Set code=9 if running jobs need to be killed immediately.

    args:
        ids: the list of PandaIDs
        code: specify why the jobs are killed
            2: expire
            3: aborted
            4: expire in waiting
            7: retry by server
            8: rebrokerage
            9: force kill
            50: kill by JEDI
            91: kill user jobs with prod role
        verbose: set True to see what's going on
    returns:
        status code
            0: communication succeeded to the panda server
            255: communication failure
        the server response
    """
    import userinterface.Client as Client
    # Return the (status, output) pair so callers actually receive the
    # values documented above — previously the result was silently discarded.
    s, o = Client.killJobs(ids, code=code, verbose=verbose)
    return s, o
def eraseDispDatasets(ids):
    """Erase the dispatch (_disNNN) datasets of the given PanDA jobs.

    ids -- list of PandaIDs; only 'managed' (production) jobs outside the
    US cloud are considered, since dispatch blocks are not DQ2 datasets in US.
    """
    print "eraseDispDatasets"
    datasets = []
    # get jobs
    status, jobs = Client.getJobStatus(ids)
    if status != 0:
        # best effort: give up silently on a communication error
        return
    # gather dispDBlocks
    for job in jobs:
        # dispatchDS is not a DQ2 dataset in US
        if job.cloud == 'US':
            continue
        # erase disp datasets for production jobs only
        if job.prodSourceLabel != 'managed':
            continue
        for file in job.Files:
            if file.dispatchDBlock == 'NULL':
                continue
            # collect unique names matching the dispatch-block suffix _dis<digits>
            if (not file.dispatchDBlock in datasets) and \
               re.search('_dis\d+$',file.dispatchDBlock) != None:
                datasets.append(file.dispatchDBlock)
    # erase
    for dataset in datasets:
        print 'erase %s' % dataset
        status, out = ddm.DQ2.main('eraseDataset', dataset)
        print out
def __submit(self): s, o = Client.submitJobs(self.__joblist, srvID=self.__aSrvID) #print "S: %s" % s #print "O: %s" % o #panda_ids = json.loads(o) for x in o: print "PandaID=%s" % x[0]
def killJobs(self, ids, code=None, verbose=False):
    """Kill jobs. Normal users can kill only their own jobs.
    People with production VOMS role can kill any jobs.
    Running jobs are killed when next heartbeat comes from the pilot.
    Set code=9 if running jobs need to be killed immediately.

    args:
        ids: the list of PandaIDs
        code: specify why the jobs are killed
            2: expire
            3: aborted
            4: expire in waiting
            7: retry by server
            8: rebrokerage
            9: force kill
            50: kill by JEDI
            91: kill user jobs with prod role
        verbose: set True to see what's going on
    returns:
        status code
            0: communication succeeded to the panda server
            255: communication failure
        the server response
    """
    import userinterface.Client as Client
    # Return the (status, output) pair so callers actually receive the
    # values documented above — previously the result was silently discarded.
    s, o = Client.killJobs(ids, code=code, verbose=verbose)
    return s, o
def getJobStatus(self, ids):
    """Return a {PandaID: jobStatus} mapping for the given PanDA job IDs.

    The PanDA server is queried through userinterface.Client; when the
    response code is non-zero the error is logged and an empty mapping
    is returned.
    """
    import userinterface.Client as Client
    status, jobSpecs = Client.getJobStatus(ids)
    if status != 0:
        _logger.error('Error response code: %s %s' % (str(status), str(jobSpecs)))
        return {}
    statusMap = {}
    for jobSpec in jobSpecs:
        statusMap[jobSpec.PandaID] = jobSpec.jobStatus
    return statusMap
def uploadLog(self):
    """Upload the accumulated log to the PanDA server for this JEDI task.

    Returns a human-readable message: an error string when jediTaskID is
    missing or the upload fails, an HTML anchor when the server returns a
    URL, otherwise the raw server response.
    """
    # PEP 8: compare against the None singleton with 'is', not '=='
    if self.jediTaskID is None:
        return 'cannot find jediTaskID'
    strMsg = self.logger.dumpToString()
    s, o = Client.uploadLog(strMsg, self.jediTaskID)
    if s != 0:
        return "failed to upload log with {0}.".format(s)
    if o.startswith('http'):
        return '<a href="{0}">log</a>'.format(o)
    return o
def killJobs(jobList):
    """Ask the PanDA server to kill every job in jobList and return the
    server's per-job output.

    Client.killJobs returns (status, output); both are logged for debugging.
    """
    print 'Kill jobs'
    _logger.debug('Kill jobs')
    _logger.debug(str(jobList))
    s,o = Client.killJobs(jobList)  # Code 3 eqs. aborted status
    _logger.debug(o)
    _logger.debug(s)
    _logger.debug("---------------------")
    return o
def uploadLog(self):
    """Upload the accumulated log to the PanDA server for this JEDI task.

    Returns a human-readable message: an error string when jediTaskID is
    missing or the upload fails, an HTML anchor when the server returns a
    URL, otherwise the raw server response.
    """
    # PEP 8: compare against the None singleton with 'is', not '=='
    if self.jediTaskID is None:
        return 'cannot find jediTaskID'
    strMsg = self.logger.dumpToString()
    s, o = Client.uploadLog(strMsg, self.jediTaskID)
    if s != 0:
        return "failed to upload log with {0}.".format(s)
    if o.startswith('http'):
        return '<a href="{0}">log</a>'.format(o)
    return o
def getJobStatus(self, ids):
    """Return a {PandaID: jobStatus} dict for the given PanDA job IDs.

    On a non-zero client response code the error is logged and an empty
    dict is returned.
    """
    import userinterface.Client as Client
    s,o = Client.getJobStatus(ids)
    result = {}
    if s != 0:
        _logger.error('Error response code: %s %s' %(str(s), str(o)))
        return result
    for x in o:
        result[x.PandaID] = x.jobStatus
    return result
def killJobs(jobList):
    """Ask the PanDA server to kill every job in jobList and return the
    server's per-job output.

    Client.killJobs returns (status, output); both are logged for debugging.
    """
    print 'Kill jobs'
    _logger.debug('Kill jobs')
    _logger.debug(str(jobList))
    s, o = Client.killJobs(jobList)  # Code 3 eqs. aborted status
    _logger.debug(o)
    _logger.debug(s)
    _logger.debug("---------------------")
    return o
def submitJobs(self, jobList):
    """Submit the given job specs to the PanDA server and return the
    server output.

    Client.submitJobs returns (status, output); each output entry appears
    to be a tuple whose first element is the assigned PandaID.
    """
    print 'Submit jobs'
    _logger.debug('Submit jobs')
    s, o = Client.submitJobs(jobList)
    _logger.debug("---------------------")
    _logger.debug(s)
    for x in o:
        _logger.debug("PandaID=%s" % x[0])
    return o
def submitJobs(jobList):
    """Submit the given job specs to the PanDA server and return the
    server output.

    Client.submitJobs returns (status, output); each output entry appears
    to be a tuple whose first element is the assigned PandaID.
    """
    print 'Submit jobs'
    _logger.debug('Submit jobs')
    _logger.debug(str(jobList))
    s,o = Client.submitJobs(jobList)
    _logger.debug(o)
    _logger.debug(s)
    _logger.debug("---------------------")
    for x in o:
        _logger.debug("PandaID=%s" % x[0])
    return o
def __submit(self, name): # gets name of job # submits # returns PanDA id if name is None or name not in self.__joblist: return -1 s,o = Client.submitJobs([self.__joblist[name][0]],srvID=self.__aSrvID) #print s #print o for x in o: print "PandaID=%s" % x[0] return x[0]
def main():
    """Kill a fixed range of PanDA jobs (currently the single ID 4005758)
    and print the server response for each."""
    i = 4005758
    jobs_list = []
    # start == end, so this loop yields exactly one ID
    while i <= 4005758:
        jobs_list.append(i)
        i+= 1
    print jobs_list
    # Client.killJobs returns (status, per-job output list)
    s,o = Client.killJobs(jobs_list,srvID=aSrvID)
    for x in o:
        print x
    logger.info('done')
def getStatus(self, expectedStates):
    """Poll PanDA for the status of all generated jobs and assert that each
    one is in one of *expectedStates*.

    returns the list of job info objects from Client.getJobStatus.
    Raises AssertionError when the query fails or a job is in an
    unexpected state.
    """
    idList = [job['jobID'] for job in self.__jobList]
    print idList
    status, jobInfoList = Client.getJobStatus(idList)
    print jobInfoList
    assert status == 0, "Retrieval of job state finished with status: %s" %status
    for job in jobInfoList:
        assert job.jobStatus in expectedStates, "Recently defined job was not in states %s (PandaID: %s jobStatus: %s)" %(expectedStates, job.PandaID, job.jobStatus)
    return jobInfoList
def getStatus(self, expectedStates):
    """Poll PanDA for the status of all generated jobs and assert that each
    one is in one of *expectedStates*.

    returns the list of job info objects from Client.getJobStatus.
    Raises AssertionError when the query fails or a job is in an
    unexpected state.
    """
    pandaIDs = [entry['jobID'] for entry in self.__jobList]
    _logger.info("%s" % pandaIDs)
    status, jobInfoList = Client.getJobStatus(pandaIDs)
    _logger.info("%s" % jobInfoList)
    assert status == 0, "Retrieval of job state finished with status: %s" % status
    for jobInfo in jobInfoList:
        assert jobInfo.jobStatus in expectedStates, "Recently defined job was not in states %s (PandaID: %s jobStatus: %s)" % (
            expectedStates, jobInfo.PandaID, jobInfo.jobStatus)
    return jobInfoList
def eraseDispDatasets(ids):
    """Erase the dispatch datasets referenced by the given PanDA jobs.

    ids -- list of PandaIDs.  Collects the unique dispatchDBlock names of
    all files of the jobs and issues one DQ2 eraseDataset call per dataset.
    Gives up silently when the job query fails.
    """
    datasets = []
    # get jobs
    status,jobs = Client.getJobStatus(ids)
    if status != 0:
        return
    # gather dispDBlocks
    for job in jobs:
        for file in job.Files:
            if not file.dispatchDBlock in datasets:
                datasets.append(file.dispatchDBlock)
    # erase one dataset per call — the previous code passed the whole
    # `datasets` list on every iteration instead of the loop variable
    for dataset in datasets:
        ddm.DQ2.main(['eraseDataset',dataset])
def generateJobs(self):
    """Define __nJobs Evgen16 job specs, submit them to PanDA in one call
    and record the PandaID assigned to each entry of __jobList.

    Raises AssertionError when submission fails or the server returns
    fewer entries than were submitted.
    """
    for i in range(self.__nJobs):
        job = self.defineEvgen16Job(i)
        self.__jobList.append({'jobSpec': job, 'jobID': None})
    status, output = Client.submitJobs([job['jobSpec'] for job in self.__jobList])
    #Return from submitJobs: ret.append((job.PandaID,job.jobDefinitionID,{'jobsetID':job.jobsetID}))
    assert status == 0, "Submission of jobs finished with status: %s" %status
    assert len(self.__jobList) == len(output), "Not all jobs seem to have been submitted properly"
    for job, ids in zip(self.__jobList, output):
        jobID = ids[0]
        job['jobID'] = jobID
        print("Generated job PandaID = %s" %jobID)
    return
def generateJobs(self):
    """Define __nJobs Evgen16 job specs, submit them to PanDA in one call
    and record the PandaID assigned to each entry of __jobList.

    Raises AssertionError when submission fails or the server returns
    fewer entries than were submitted.
    """
    for i in range(self.__nJobs):
        job = self.defineEvgen16Job(i)
        self.__jobList.append({'jobSpec': job, 'jobID': None})
    status, output = Client.submitJobs(
        [job['jobSpec'] for job in self.__jobList]
    )
    #Return from submitJobs: ret.append((job.PandaID,job.jobDefinitionID,{'jobsetID':job.jobsetID}))
    assert status == 0, "Submission of jobs finished with status: %s" % status
    assert len(self.__jobList) == len(
        output), "Not all jobs seem to have been submitted properly"
    for job, ids in zip(self.__jobList, output):
        jobID = ids[0]
        job['jobID'] = jobID
        _logger.info("Generated job PandaID = %s" % jobID)
    return
def update_status():
    """Synchronise the local Job table with the current PanDA job status.

    Queries all local jobs that have a pandaid and are not in a terminal
    state, fetches their status from PanDA in one call, and updates
    attemptnr / status / modification_time where they changed.

    returns the list of local job ids that were examined.
    """
    # Method to sync PandaDB job status and local job status
    # show users jobs
    jobs = Job.query.filter(Job.pandaid.isnot(None))\
        .filter(~Job.status.in_(['finished', 'failed', 'cancelled']))\
        .all()
    ids = []
    localids = []
    for job in jobs:
        localids.append(job.id)
        ids.append(job.pandaid)
    # get status update
    if len(ids) > 0:
        _logger.debug('getJobStatus: ' + str(ids))
        s, o = Client.getJobStatus(ids)
        _logger.debug(o)
        _logger.debug(s)
        _logger.debug("---------------------")
        for job in jobs:
            if job.pandaid in ids:
                for obj in o:
                    if obj.PandaID == job.pandaid:
                        # Update attemptNr if changed
                        if job.attemptnr not in [obj.attemptNr]:
                            job.attemptnr = obj.attemptNr
                            jobs_.save(job)
                        # Update status if changed
                        if job.status != obj.jobStatus:
                            job.status = obj.jobStatus
                            job.modification_time = datetime.utcnow()
                            jobs_.save(job)
    return localids
fileOA.destinationDBlock = job.destinationDBlock fileOA.destinationSE = job.destinationSE fileOA.dataset = job.destinationDBlock fileOA.type = 'output' job.addFile(fileOA) fileOC = FileSpec() fileOC.lfn = "%s.NTUP.root" % job.jobName fileOC.destinationDBlock = job.destinationDBlock fileOC.destinationSE = job.destinationSE fileOC.dataset = job.destinationDBlock fileOC.type = 'output' job.addFile(fileOC) fileOL = FileSpec() fileOL.lfn = "%s.job.log.tgz" % job.jobName fileOL.destinationDBlock = job.destinationDBlock fileOL.destinationSE = job.destinationSE fileOL.dataset = job.destinationDBlock fileOL.type = 'log' job.addFile(fileOL) job.jobParameters="HITS.022081._[00001,00002].pool.root RDO.TMP._00001_tmp.pool.root 250 0 ATLAS-CSC-05-00-00 1 1 NONE NONE None %s AtRndmGenSvc QGSP_EMV DEFAULT NONE NONE NONE NONE NONE\n RDO.TMP._00001_tmp.pool.root %s %s %s 250 0 ATLAS-CSC-05-00-00 DEFAULT None %s NONE" % \ (fileD1.lfn,fileOE.lfn,fileOA.lfn,fileOC.lfn,fileD2.lfn) s,o = Client.submitJobs([job]) print "---------------------" print s for x in o: print "PandaID=%s" % x[0]
status,res = proxyS.querySQLS(sql,varMap) if res != None: for (id,lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) # reassign jobs.sort() if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'reassign %s' % str(jobs[iJob:iJob+nJob]) Client.reassignJobs(jobs[iJob:iJob+nJob]) iJob += nJob time.sleep(10) if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print 'kill JEDI jobs %s' % str(jediJobs[iJob:iJob+nJob]) Client.killJobs(jediJobs[iJob:iJob+nJob],codeV,keepUnmerged=options.keepUnmerged) iJob += nJob print print 'reassigned {0} jobs'.format(len(jobs+jediJobs))
allSites = [] for site in tmpSites: # _allSites may conain NULL after sort() if site == 'NULL': continue # ignore test sites if site.endswith('test') or site.endswith('Test'): continue # append allSites.append(site) # reassign jobs jobs=[] for (id,modTime) in res: if modTime < timeLimit: jobs.append(id) # reassign if len(jobs): nJob = 20 iJob = 0 while iJob < len(jobs): print 'reassignJobs(%s)' % jobs[iJob:iJob+nJob] index = random.randint(1,len(allSites)) site = allSites[int(index)-1] print 'site=%s' % site Client.reassignJobs(jobs[iJob:iJob+nJob],site) iJob += nJob time.sleep(10)
'type': 'template', 'param_type': 'output', 'token': 'ATLASDATADISK', 'value': ' --outputHitsFile={0}.${{SN}}.pool.root'.format(outDatasetName), 'dataset': outDatasetName, }, { 'type': 'constant', 'value': '--physicsList=QGSP_BERT --postInclude=RecJobTransforms/UseFrontierFallbackDBRelease.py --preInclude=SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py', }, { 'type': 'template', 'value': '--skipEvents=${SKIPEVENTS}', 'param_type': 'number', }, { 'type': 'template', 'value': '--randomSeed=${RNDMSEED}', 'param_type': 'number', }, ] taskParamMap['esmergeSpec'] = {} taskParamMap['esmergeSpec']['transPath'] = 'Merge_trf.py' taskParamMap['esmergeSpec']['jobParameters'] = "aaa bbb" print(Client.insertTaskParams(taskParamMap))
jediTaskID = int(options.tid) if True: if options.resurrectDS: sd, so = taskBuffer.querySQLS( 'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)', { ':id': jediTaskID, ':t1': 'output', ':t2': 'log' }) rc = RucioClient() for datasetName, in so: for i in range(3): try: scope, name = rucioAPI.extract_scope(datasetName) rc.get_did(scope, name) break except DataIdentifierNotFound: print 'resurrect {0}'.format(datasetName) rc.resurrect([{'scope': scope, 'name': name}]) try: rc.set_metadata(scope, name, 'lifetime', None) except: pass print Client.reloadInput(jediTaskID)[-1] print 'done for jediTaskID={0}'.format(jediTaskID) else: print 'failed'
def putFile(req,file): if not Protocol.isSecure(req): return False if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']: return False _logger.debug("putFile : start %s %s" % (req.subprocess_env['SSL_CLIENT_S_DN'],file.filename)) # size check fullSizeLimit = 768*1024*1024 if not file.filename.startswith('sources.'): noBuild = True sizeLimit = 100*1024*1024 else: noBuild = False sizeLimit = fullSizeLimit # get file size contentLength = 0 try: contentLength = long(req.headers_in["content-length"]) except: if req.headers_in.has_key("content-length"): _logger.error("cannot get CL : %s" % req.headers_in["content-length"]) else: _logger.error("no CL") _logger.debug("size %s" % contentLength) if contentLength > sizeLimit: errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength,sizeLimit) if noBuild: errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit" else: errStr += " Please remove redundant files from your workarea" _logger.error(errStr) _logger.debug("putFile : end") return errStr try: fileFullPath = '%s/%s' % (panda_config.cache_dir,file.filename.split('/')[-1]) # avoid overwriting if os.path.exists(fileFullPath): # touch os.utime(fileFullPath,None) # send error message errStr = "ERROR : Cannot overwrite file" _logger.debug('putFile : cannot overwrite file %s' % file.filename) _logger.debug("putFile : end") return errStr # write fo = open(fileFullPath,'wb') fileContent = file.file.read() fo.write(fileContent) fo.close() except: errStr = "ERROR : Cannot write file" _logger.error(errStr) _logger.debug("putFile : end") return errStr # checksum try: # decode Footer footer = fileContent[-8:] checkSum,isize = struct.unpack("II",footer) _logger.debug("CRC from gzip Footer %s" % checkSum) except: # calculate on the fly """ import zlib checkSum = zlib.adler32(fileContent) & 0xFFFFFFFF """ # use None to avoid delay for now checkSum = None _logger.debug("CRC calculated %s" % checkSum) # file 
size fileSize = len(fileContent) # user name username = cleanUserID(req.subprocess_env['SSL_CLIENT_S_DN']) _logger.debug("putFile : written dn=%s file=%s size=%s crc=%s" % \ (username,file.filename,fileSize,checkSum)) # put file info to DB statClient,outClient = Client.insertSandboxFileInfo(username,file.filename, fileSize,checkSum) if statClient != 0 or outClient.startswith("ERROR"): _logger.error("putFile : failed to put sandbox to DB with %s %s" % (statClient,outClient)) #_logger.debug("putFile : end") #return "ERROR : Cannot insert sandbox to DB" else: _logger.debug("putFile : inserted sandbox to DB with %s" % outClient) # store to cassandra if hasattr(panda_config,'cacheUseCassandra') and panda_config.cacheUseCassandra == True: try: # time-stamp timeNow = datetime.datetime.utcnow() creationTime = timeNow.strftime('%Y-%m-%d %H:%M:%S') # user name username = req.subprocess_env['SSL_CLIENT_S_DN'] username = username.replace('/CN=proxy','') username = username.replace('/CN=limited proxy','') # file size fileSize = len(fileContent) # key fileKeyName = file.filename.split('/')[-1] sizeCheckSum = '%s:%s' % (fileSize,checkSum) # insert to cassandra import pycassa pool = pycassa.ConnectionPool(panda_config.cacheKeySpace) filefamily = pycassa.ColumnFamily(pool,panda_config.cacheFileTable) # avoid overwriting gotoNextCassa = True if filefamily.get_count(fileKeyName) > 0: # touch touchFlag = touchFileCassa(filefamily,fileKeyName,timeNow) if touchFlag: gotoNextCassa = False # send error message errStr = "ERROR : Cannot overwrite file in Cassandra" _logger.error(errStr) if not panda_config.cacheIgnoreCassandraError: _logger.debug("putFile : end") return errStr # check uniqueness with size and checksum if gotoNextCassa: try: uniqExp = pycassa.index.create_index_expression('uniqID',sizeCheckSum) userExp = pycassa.index.create_index_expression('user',username) tmpClause = pycassa.index.create_index_clause([uniqExp,userExp]) tmpResults = 
filefamily.get_indexed_slices(tmpClause,columns=['creationTime']) for oldFileKeyName,tmpDict in tmpResults: _logger.debug('The same size and chksum %s found in old:%s and new:%s' % \ (sizeCheckSum,oldFileKeyName,fileKeyName)) # touch touchFlag = touchFileCassa(filefamily,oldFileKeyName,timeNow) if touchFlag: # make alias _logger.debug('Making alias %s->%s' % (fileKeyName,oldFileKeyName)) insertWithRetryCassa(filefamily,fileKeyName, {'alias':oldFileKeyName, 'creationTime':creationTime, 'nSplit':0, }, 'putFile : make alias for %s' % file.filename ) # set time touchFileCassa(filefamily,fileKeyName,timeNow) _logger.debug("putFile : end") return True except: gotoNextCassa = False errType,errValue = sys.exc_info()[:2] errStr = "cannot make alias for %s due to %s %s" % (fileKeyName,errType,errValue) _logger.error(errStr) if not panda_config.cacheIgnoreCassandraError: _logger.debug("putFile : end") return errStr # insert new record if gotoNextCassa: splitIdx = 0 splitSize = 5 * 1024 * 1024 nSplit,tmpMod = divmod(len(fileContent),splitSize) if tmpMod != 0: nSplit += 1 _logger.debug('Inserting %s with %s blocks' % (fileKeyName,nSplit)) for splitIdx in range(nSplit): # split to small chunks since cassandra is not good at large files tmpFileContent = fileContent[splitSize*splitIdx:splitSize*(splitIdx+1)] tmpFileKeyName = fileKeyName tmpAttMap = {'file':tmpFileContent, 'user':username, 'creationTime':creationTime, } if splitIdx == 0: tmpAttMap['size'] = fileSize tmpAttMap['nSplit'] = nSplit tmpAttMap['uniqID'] = sizeCheckSum tmpAttMap['checkSum'] = str(checkSum) else: tmpFileKeyName += '_%s' % splitIdx tmpAttMap['size'] = 0 tmpAttMap['nSplit'] = 0 # insert with retry insertWithRetryCassa(filefamily,tmpFileKeyName,tmpAttMap, 'putFile : insert %s' % file.filename) # set time touchFileCassa(filefamily,fileKeyName,timeNow) except: errType,errValue = sys.exc_info()[:2] errStr = "cannot put %s into Cassandra due to %s %s" % (fileKeyName,errType,errValue) _logger.error(errStr) # send 
error message errStr = "ERROR : " + errStr if not panda_config.cacheIgnoreCassandraError: _logger.debug("putFile : end") return errStr _logger.debug("putFile : %s end" % file.filename) return True
jobsMap[prio] = [] if not id in jobsMap[prio]: jobsMap[prio].append(id) # order by PandaID and currentPriority jobs = [] prioList = jobsMap.keys() prioList.sort() for prio in prioList: # reverse order by PandaID to kill newer jobs ids = jobsMap[prio] ids.sort() ids.reverse() jobs += ids if options.maxJobs != None: jobs = jobs[:int(options.maxJobs)] print 'The number of jobs with priorities below %s : %s' % (args[0], len(jobs)) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'kill %s' % str(jobs[iJob:iJob + nJob]) if options.forceKill: Client.killJobs(jobs[iJob:iJob + nJob], 9) else: Client.killJobs(jobs[iJob:iJob + nJob]) iJob += nJob time.sleep(1)
if options.prodSourceLabel != None: varMap[':src3'] = options.prodSourceLabel srcSQL += ',:src3' srcSQL += ')' jobs = [] tables = ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4'] for table in tables: sql = "SELECT PandaID FROM %s WHERE prodUserName=:prodUserName AND prodSourceLabel IN %s " % (table,srcSQL) if options.jobID != None: sql += "AND jobDefinitionID=:jobDefinitionID " if not options.jobsetID in (None,'all'): sql += "AND jobsetID=:jobsetID " sql += "ORDER BY PandaID " status,res = proxyS.querySQLS(sql,varMap) if res != None: for id, in res: if not id in jobs: jobs.append(id) if len(jobs): iJob = 0 nJob = 1000 while iJob < len(jobs): subJobs = jobs[iJob:iJob+nJob] print "kill %s %s/%s" % (str(subJobs),iJob,len(jobs)) Client.killJobs(subJobs,code=9) iJob += nJob else: print "no job was killed"
#job.transformation = 'http://pandawms.org/pandawms-jobcache/lsst-trf.sh' job.transformation = 'http://pandawms.org/pandawms-jobcache/lsst-trf-phosim332.sh' job.destinationDBlock = datasetName #job.destinationSE = destName job.destinationSE = 'local' job.currentPriority = 1000 #job.prodSourceLabel = 'ptest' #job.prodSourceLabel = 'panda' #job.prodSourceLabel = 'ptest' #job.prodSourceLabel = 'test' #job.prodSourceLabel = 'ptest' ### 2014-01-27 #job.prodSourceLabel = 'user' job.prodSourceLabel = 'panda' job.computingSite = site job.jobParameters = "" job.VO = "lsst" fileOL = FileSpec() fileOL.lfn = "%s.job.log.tgz" % job.jobName fileOL.destinationDBlock = job.destinationDBlock fileOL.destinationSE = job.destinationSE fileOL.dataset = job.destinationDBlock fileOL.type = 'log' job.addFile(fileOL) s, o = Client.submitJobs([job], srvID=aSrvID) print s for x in o: print "PandaID=%s" % x[0]
help='kill user jobs using a production role') options, args = optP.parse_args() aSrvID = None codeV = None useMailAsIDV = False if options.forceKill: codeV = 9 elif options.killUserJobs: codeV = 91 if options.killOwnProdJobs: useMailAsIDV = True if len(args) == 1: Client.killJobs([args[0]], code=codeV, useMailAsID=useMailAsIDV, keepUnmerged=options.keepUnmerged) else: startID = int(args[0]) endID = int(args[1]) if startID > endID: print '%d is less than %d' % (endID, startID) sys.exit(1) Client.killJobs(range(startID, endID + 1), code=codeV, useMailAsID=useMailAsIDV, keepUnmerged=options.keepUnmerged)
sql = "SELECT PandaID,lockedby FROM ATLAS_PANDA.jobsActive4 " sql += "WHERE jobStatus=:jobStatus AND computingSite=:computingSite AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel ORDER BY PandaID" status,res = proxyS.querySQLS(sql,varMap) print "got {0} jobs".format(len(res)) jobs = [] jediJobs = [] if res != None: for (id,lockedby) in res: if lockedby == 'jedi': jediJobs.append(id) else: jobs.append(id) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'reassign %s' % str(jobs[iJob:iJob+nJob]) Client.reassignJobs(jobs[iJob:iJob+nJob]) iJob += nJob if len(jediJobs) != 0: nJob = 100 iJob = 0 while iJob < len(jediJobs): print 'kill JEDI jobs %s' % str(jediJobs[iJob:iJob+nJob]) Client.killJobs(jediJobs[iJob:iJob+nJob],51) iJob += nJob
# password from config import panda_config passwd = panda_config.dbpasswd cloud = sys.argv[1] # instantiate DB proxies proxyS = DBProxy() proxyS.connect(panda_config.dbhost,panda_config.dbpasswd,panda_config.dbuser,panda_config.dbname) while True: # get PandaIDs res = proxyS.querySQL("SELECT PandaID FROM jobsWaiting4 WHERE cloud='%s' ORDER BY PandaID" % cloud) # escape if len(res) == 0: break # convert to list jobs = [] for id, in res: jobs.append(id) # reassign nJob = 300 iJob = 0 while iJob < len(jobs): print 'killJobs(%s)' % jobs[iJob:iJob+nJob] Client.killJobs(jobs[iJob:iJob+nJob]) iJob += nJob time.sleep(60)
from userinterface.Client import baseURLSSL from taskbuffer.TaskBuffer import taskBuffer from brokerage.SiteMapper import SiteMapper from config import panda_config # instantiate TB taskBuffer.init(panda_config.dbhost, panda_config.dbpasswd, nDBConnection=1) # instantiate sitemapper siteMapper = SiteMapper(taskBuffer) import httplib import commands id = sys.argv[1] s, o = Client.getJobStatus([id]) if s != 0: print "failed to get job with:%s" % s sys.exit(0) job = o[0] if job == None: print "got None" sys.exit(0) xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?> <!-- ATLAS file meta-data catalog --> <!DOCTYPE POOLFILECATALOG SYSTEM "InMemory"> <POOLFILECATALOG>
# Command-line helper: kill a single JEDI task.
# Usage: <script> <jediTaskID>
import time
import sys
import optparse

import userinterface.Client as Client

aSrvID = None

from taskbuffer.OraDBProxy import DBProxy
# password
from config import panda_config

optP = optparse.OptionParser(conflict_handler="resolve")
options, args = optP.parse_args()
# NOTE(review): args[0] is passed as a string — presumably Client.killTask
# accepts that; confirm whether an int conversion is expected.
jediTaskID = args[0]
s, o = Client.killTask(jediTaskID)
print o
jobSpec) if not fileCheckInJEDI: jobSpec.jobStatus = 'closed' jobSpec.jobSubStatus = 'cojumbo_wrong' jobSpec.taskBufferErrorCode = taskbuffer.ErrorCode.EC_EventServiceInconsistentIn taskBuffer.archiveJobs([jobSpec], False, True) tmpLog.debug("kill {0} co-jumbo jobs in Waiting".format( len(coJumboTokill))) if len(coJumboTokill) > 0: jediJobs = list(coJumboTokill) nJob = 100 iJob = 0 while iJob < len(jediJobs): tmpLog.debug(' killing %s' % str(jediJobs[iJob:iJob + nJob])) Client.killJobs(jediJobs[iJob:iJob + nJob], 51, keepUnmerged=True) iJob += nJob except: errStr = traceback.format_exc() tmpLog.error(errStr) tmpLog.debug("Fork session") # thread for fork class ForkThr(threading.Thread): def __init__(self, fileName): threading.Thread.__init__(self) self.fileName = fileName
from django.db.models import Q from prodsys.models import Task, Job print 'Getting tasks with status send and running' tasks_list = Task.objects.all().filter(Q(status='send') | Q(status='running')) print 'Got list of %s tasks' % len(tasks_list) for t in tasks_list: print 'Getting jobs with status send and running for task with status send and running' jobs_list = list( Job.objects.filter( task=t).filter(Q(status='sent') | Q(status='running')).values_list( 'panda_id', flat=True)[:50]) print 'Got list of %s jobs' % len(jobs_list) print 'Sending request to PanDA server' s, o = Client.getJobStatus(jobs_list, None) if s == 0: for x in o: print 'Getting job for PandaID=%s' % x.PandaID j_update = Job.objects.get(panda_id=x.PandaID) if j_update.status != x.jobStatus: today = datetime.datetime.today() if x.jobStatus == 'running' or x.jobStatus == 'finished' or x.jobStatus == 'failed': print 'Going to update status of job %s from %s to %s' % ( j_update.file, j_update.status, x.jobStatus) j_update.status = x.jobStatus j_update.date_updated = today if x.jobStatus == 'finished': j_update.status_merging_mdst = 'ready'
# Command-line helper: reassign one PanDA job or an inclusive ID range.
# Usage: <script> <PandaID>  |  <script> <startID> <endID>
import sys
import userinterface.Client as Client

if len(sys.argv) == 2:
    # single job (passed through as a string, as the Client call accepts it)
    Client.reassignJobs([sys.argv[1]])
else:
    startID = int(sys.argv[1])
    endID = int(sys.argv[2])
    if startID > endID:
        print '%d is less than %d' % (endID,startID)
        sys.exit(1)
    # endID is inclusive
    Client.reassignJobs(range(startID,endID+1))
#job.cloud = "UK" job.taskID = i file = FileSpec() file.lfn = "%s.evgen.pool.root" % job.jobName file.destinationDBlock = job.destinationDBlock file.destinationSE = job.destinationSE file.dataset = job.destinationDBlock #file.destinationDBlockToken = 'ATLASDATADISK' file.type = 'output' job.addFile(file) fileOL = FileSpec() fileOL.lfn = "%s.job.log.tgz" % job.jobName fileOL.destinationDBlock = job.destinationDBlock fileOL.destinationSE = job.destinationSE fileOL.dataset = job.destinationDBlock fileOL.type = 'log' job.addFile(fileOL) job.jobParameters="7087 0 500000 1 DC3.007087.singlepart_fwdgamma_etaplus_E500.py %s NONE NONE NONE" % file.lfn jobList.append(job) for i in range(1): #s,o = Client.submitJobs(jobList) s,outS = Client.runTaskAssignment(jobList) print "---------------------" print s for tmpOut in outS: print tmpOut
job.prodDBlock = 'pandatest.000003.dd.input' job.destinationDBlock = 'panda.destDB.%s' % commands.getoutput('/usr/bin/uuidgen') job.destinationSE = 'BNL_SE' ids = {'pandatest.000003.dd.input._00028.junk':'6c19e1fc-ee8c-4bae-bd4c-c9e5c73aca27', 'pandatest.000003.dd.input._00033.junk':'98f79ba1-1793-4253-aac7-bdf90a51d1ee', 'pandatest.000003.dd.input._00039.junk':'33660dd5-7cef-422a-a7fc-6c24cb10deb1'} for lfn in ids.keys(): file = FileSpec() file.lfn = lfn file.GUID = ids[file.lfn] file.dataset = 'pandatest.000003.dd.input' file.type = 'input' job.addFile(file) s,o = Client.submitJobs([job]) print "---------------------" print s print o print "---------------------" s,o = Client.getJobStatus([4934, 4766, 4767, 4768, 4769]) print s if s == 0: for job in o: if job == None: continue print job.PandaID for file in job.Files: print file.lfn,file.type print "---------------------" s,o = Client.queryPandaIDs([0])
if res != None: for (id,) in res: jobs.append(id) varMap = {} varMap[':jobStatus'] = 'waiting' varMap[':modificationTime'] = timeLimit varMap[':prodSourceLabel'] = 'managed' varMap[':taskID'] = taskid sql = "SELECT PandaID FROM ATLAS_PANDA.jobsWaiting4 WHERE jobStatus=:jobStatus AND taskID=:taskID AND modificationTime<:modificationTime AND prodSourceLabel=:prodSourceLabel " status,res = proxyS.querySQLS(sql,varMap) if res != None: for (id,) in res: jobs.append(id) # reassign jobs.sort() if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'reassign %s' % str(jobs[iJob:iJob+nJob]) Client.reassignJobs(jobs[iJob:iJob+nJob]) iJob += nJob time.sleep(10) print print 'reassigned {0} jobs'.format(len(jobs))
def putFile(req, file):
    """Receive an uploaded user sandbox file and store it.

    Writes the upload into panda_config.cache_dir, records its metadata in
    the DB via Client.insertSandboxFileInfo, and — when
    panda_config.cacheUseCassandra is set — mirrors the content into
    Cassandra, split into 5 MB chunks.

    req  -- request object exposing subprocess_env and headers_in
            (presumably mod_python — TODO confirm)
    file -- uploaded form field with .filename and .file attributes

    Returns True on success, False for insecure callers or limited
    proxies, or an "ERROR : ..." string describing the failure.
    """
    # reject non-SSL requests and limited proxies outright
    if not Protocol.isSecure(req):
        return False
    if '/CN=limited proxy' in req.subprocess_env['SSL_CLIENT_S_DN']:
        return False
    _logger.debug("putFile : start %s %s" % (req.subprocess_env['SSL_CLIENT_S_DN'], file.filename))
    # size check: source archives ('sources.*') get the full limit,
    # everything else (--noBuild/--libDS uploads) a tighter 100 MB cap
    fullSizeLimit = 768 * 1024 * 1024
    if not file.filename.startswith('sources.'):
        noBuild = True
        sizeLimit = 100 * 1024 * 1024
    else:
        noBuild = False
        sizeLimit = fullSizeLimit
    # get file size from the Content-Length header; fall back to 0 so a
    # missing/garbled header never blocks the upload
    contentLength = 0
    try:
        contentLength = long(req.headers_in["content-length"])
    except:
        if req.headers_in.has_key("content-length"):
            _logger.error("cannot get CL : %s" % req.headers_in["content-length"])
        else:
            _logger.error("no CL")
    _logger.debug("size %s" % contentLength)
    if contentLength > sizeLimit:
        errStr = "ERROR : Upload failure. Exceeded size limit %s>%s." % (contentLength, sizeLimit)
        if noBuild:
            errStr += " Please submit the job without --noBuild/--libDS since those options impose a tighter size limit"
        else:
            errStr += " Please remove redundant files from your workarea"
        _logger.error(errStr)
        _logger.debug("putFile : end")
        return errStr
    try:
        # basename only: prevents path traversal out of the cache dir
        fileFullPath = '%s/%s' % (panda_config.cache_dir, file.filename.split('/')[-1])
        # avoid overwriting
        if os.path.exists(fileFullPath):
            # touch
            os.utime(fileFullPath, None)
            # send error message
            errStr = "ERROR : Cannot overwrite file"
            _logger.debug('putFile : cannot overwrite file %s' % file.filename)
            _logger.debug("putFile : end")
            return errStr
        # write the whole upload to disk in one shot; fileContent is
        # reused below for checksumming and the Cassandra mirror
        fo = open(fileFullPath, 'wb')
        fileContent = file.file.read()
        fo.write(fileContent)
        fo.close()
    except:
        errStr = "ERROR : Cannot write file"
        _logger.error(errStr)
        _logger.debug("putFile : end")
        return errStr
    # checksum
    try:
        # decode Footer: last 8 bytes of a gzip stream are CRC32 + ISIZE
        # NOTE(review): gzip footer fields are little-endian; "II" uses
        # native byte order — verify on big-endian hosts
        footer = fileContent[-8:]
        checkSum, isize = struct.unpack("II", footer)
        _logger.debug("CRC from gzip Footer %s" % checkSum)
    except:
        # calculate on the fly
        """
        import zlib
        checkSum = zlib.adler32(fileContent) & 0xFFFFFFFF
        """
        # use None to avoid delay for now
        checkSum = None
        _logger.debug("CRC calculated %s" % checkSum)
    # file size
    fileSize = len(fileContent)
    # user name
    username = cleanUserID(req.subprocess_env['SSL_CLIENT_S_DN'])
    _logger.debug("putFile : written dn=%s file=%s size=%s crc=%s" % \
                  (username, file.filename, fileSize, checkSum))
    # put file info to DB; a DB failure is logged but deliberately does
    # not abort the upload (see commented-out return below)
    statClient, outClient = Client.insertSandboxFileInfo(username, file.filename, fileSize, checkSum)
    if statClient != 0 or outClient.startswith("ERROR"):
        _logger.error("putFile : failed to put sandbox to DB with %s %s" % (statClient, outClient))
        #_logger.debug("putFile : end")
        #return "ERROR : Cannot insert sandbox to DB"
    else:
        _logger.debug("putFile : inserted sandbox to DB with %s" % outClient)
    # store to cassandra (optional mirror; errors here are fatal only
    # when panda_config.cacheIgnoreCassandraError is unset)
    if hasattr(panda_config, 'cacheUseCassandra') and panda_config.cacheUseCassandra == True:
        try:
            # time-stamp
            timeNow = datetime.datetime.utcnow()
            creationTime = timeNow.strftime('%Y-%m-%d %H:%M:%S')
            # user name: raw DN with proxy suffixes stripped (not the
            # cleaned name used for the DB above)
            username = req.subprocess_env['SSL_CLIENT_S_DN']
            username = username.replace('/CN=proxy', '')
            username = username.replace('/CN=limited proxy', '')
            # file size
            fileSize = len(fileContent)
            # key
            fileKeyName = file.filename.split('/')[-1]
            sizeCheckSum = '%s:%s' % (fileSize, checkSum)
            # insert to cassandra
            import pycassa
            pool = pycassa.ConnectionPool(panda_config.cacheKeySpace)
            filefamily = pycassa.ColumnFamily(pool, panda_config.cacheFileTable)
            # avoid overwriting
            gotoNextCassa = True
            if filefamily.get_count(fileKeyName) > 0:
                # touch
                touchFlag = touchFileCassa(filefamily, fileKeyName, timeNow)
                if touchFlag:
                    gotoNextCassa = False
                    # send error message
                    errStr = "ERROR : Cannot overwrite file in Cassandra"
                    _logger.error(errStr)
                    if not panda_config.cacheIgnoreCassandraError:
                        _logger.debug("putFile : end")
                        return errStr
            # check uniqueness with size and checksum: identical content
            # already stored under another key becomes an alias record
            if gotoNextCassa:
                try:
                    uniqExp = pycassa.index.create_index_expression('uniqID', sizeCheckSum)
                    userExp = pycassa.index.create_index_expression('user', username)
                    tmpClause = pycassa.index.create_index_clause([uniqExp, userExp])
                    tmpResults = filefamily.get_indexed_slices(tmpClause, columns=['creationTime'])
                    for oldFileKeyName, tmpDict in tmpResults:
                        _logger.debug('The same size and chksum %s found in old:%s and new:%s' % \
                                      (sizeCheckSum, oldFileKeyName, fileKeyName))
                        # touch
                        touchFlag = touchFileCassa(filefamily, oldFileKeyName, timeNow)
                        if touchFlag:
                            # make alias
                            _logger.debug('Making alias %s->%s' % (fileKeyName, oldFileKeyName))
                            insertWithRetryCassa(filefamily, fileKeyName,
                                                 {'alias': oldFileKeyName,
                                                  'creationTime': creationTime,
                                                  'nSplit': 0,
                                                  },
                                                 'putFile : make alias for %s' % file.filename)
                            # set time
                            touchFileCassa(filefamily, fileKeyName, timeNow)
                            _logger.debug("putFile : end")
                            return True
                except:
                    gotoNextCassa = False
                    errType, errValue = sys.exc_info()[:2]
                    errStr = "cannot make alias for %s due to %s %s" % (fileKeyName, errType, errValue)
                    _logger.error(errStr)
                    if not panda_config.cacheIgnoreCassandraError:
                        _logger.debug("putFile : end")
                        return errStr
            # insert new record
            if gotoNextCassa:
                splitIdx = 0
                splitSize = 5 * 1024 * 1024
                nSplit, tmpMod = divmod(len(fileContent), splitSize)
                if tmpMod != 0:
                    nSplit += 1
                _logger.debug('Inserting %s with %s blocks' % (fileKeyName, nSplit))
                for splitIdx in range(nSplit):
                    # split to small chunks since cassandra is not good at large files
                    tmpFileContent = fileContent[splitSize * splitIdx:splitSize * (splitIdx + 1)]
                    tmpFileKeyName = fileKeyName
                    tmpAttMap = {'file': tmpFileContent,
                                 'user': username,
                                 'creationTime': creationTime,
                                 }
                    if splitIdx == 0:
                        # only the first chunk carries the real metadata;
                        # continuation chunks are keyed '<name>_<idx>'
                        tmpAttMap['size'] = fileSize
                        tmpAttMap['nSplit'] = nSplit
                        tmpAttMap['uniqID'] = sizeCheckSum
                        tmpAttMap['checkSum'] = str(checkSum)
                    else:
                        tmpFileKeyName += '_%s' % splitIdx
                        tmpAttMap['size'] = 0
                        tmpAttMap['nSplit'] = 0
                    # insert with retry
                    insertWithRetryCassa(filefamily, tmpFileKeyName, tmpAttMap,
                                         'putFile : insert %s' % file.filename)
                # set time
                touchFileCassa(filefamily, fileKeyName, timeNow)
        except:
            errType, errValue = sys.exc_info()[:2]
            errStr = "cannot put %s into Cassandra due to %s %s" % (fileKeyName, errType, errValue)
            _logger.error(errStr)
            # send error message
            errStr = "ERROR : " + errStr
            if not panda_config.cacheIgnoreCassandraError:
                _logger.debug("putFile : end")
                return errStr
    _logger.debug("putFile : %s end" % file.filename)
    return True
file.dataset = job.destinationDBlock file.type = 'output' job.addFile(file) fileOL = FileSpec() fileOL.lfn = "%s.job.log.tgz" % commands.getoutput('uuidgen') fileOL.destinationDBlock = job.destinationDBlock fileOL.destinationSE = job.destinationSE fileOL.dataset = job.destinationDBlock fileOL.type = 'log' job.addFile(fileOL) job.jobParameters = "5056 %s NONE 81000 9000 10 DC3.005056.PythiaPhotonJet2.py NONE" % file.lfn jobListE.append(job) s, o = Client.submitJobs(jobListE) print "---------------------" print s for x in o: print "PandaID=%s" % x[0] time.sleep(20) datasetNameS = 'panda.simu.%s' % commands.getoutput('uuidgen') jobListS = [] for lfn in lfnListE: job = JobSpec() job.jobDefinitionID = int(time.time()) % 10000 job.jobName = commands.getoutput('uuidgen')
files.append(tmpLFN) print print 'found {0} lost files -> {1}'.format(len(files), ','.join(files)) s,jediTaskID = taskBuffer.resetFileStatusInJEDI('',True,options.ds,files,[],options.dryRun) if options.dryRun: sys.exit(0) if s: if options.resurrectDS: sd,so = taskBuffer.querySQLS('SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)', {':id': jediTaskID, ':t1': 'output', ':t2': 'log'}) rc = RucioClient() for datasetName, in so: for i in range(3): try: scope, name = rucioAPI.extract_scope(datasetName) rc.get_did(scope, name) break except DataIdentifierNotFound: print 'resurrect {0}'.format(datasetName) rc.resurrect([{'scope': scope, 'name': name}]) try: rc.set_metadata(scope, name, 'lifetime', None) except: pass print Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1] print 'done for jediTaskID={0}'.format(jediTaskID) else: print 'failed'
def main():
    """Send staged/failed COMPASS jobs of active tasks to PanDA.

    For every Task in status 'send' or 'running', selects up to
    max_send_amount of its jobs in status 'staged' or 'failed' that are
    still below the task's retry limit, builds a PanDA JobSpec per job
    and submits it via Client.submitJobs. On success the local Job row
    gets the returned PandaID and the task is flipped 'send'->'running'.
    """
    logger.info('Getting tasks with status send and running')
    tasks_list = Task.objects.all().filter(Q(status='send') | Q(status='running'))
    #tasks_list = Task.objects.all().filter(name='dvcs2017align7_mu-')
    logger.info('Got list of %s tasks' % len(tasks_list))

    # conditions-DB servers; one is chosen at random per job below
    cdbServerArr = ['compassvm23.cern.ch', 'compassvm24.cern.ch']
    cdbServer = cdbServerArr[0]

    for t in tasks_list:
        # cap on jobs submitted per task per invocation
        max_send_amount = 1000

        logger.info('Getting jobs in status staged or failed for task %s' % t)
        jobs_list_count = Job.objects.all().filter(task=t).filter(
            attempt__lt=t.max_attempts).filter(
            Q(status='staged') | Q(status='failed')).count()
        # biggest jobs (by number_of_events) first; small backlogs are
        # sliced to their exact count, larger ones to max_send_amount
        if jobs_list_count > 50:
            jobs_list = Job.objects.all().filter(task=t).filter(
                attempt__lt=t.max_attempts).filter(
                Q(status='staged') | Q(status='failed')).order_by(
                '-number_of_events')[:max_send_amount]
        else:
            jobs_list = Job.objects.all().filter(task=t).filter(
                attempt__lt=t.max_attempts).filter(
                Q(status='staged') | Q(status='failed')).order_by(
                '-number_of_events')[:jobs_list_count]
        logger.info('Got list of %s jobs' % len(jobs_list))
        # jobs_list = Job.objects.all().filter(task=t).filter(file='/castor/cern.ch/compass/data/2017/raw/W04/cdr12116-278485.raw')

        i = 0
        for j in jobs_list:
            # defensive re-check of the retry limit (query already filters on it)
            if j.attempt >= j.task.max_attempts:
                logger.info('Number of retry attempts has reached for job %s of task %s' % (j.file, j.task.name))
                continue
            if i > max_send_amount:
                break
            logger.info('Job %s of %s' % (i, max_send_amount))
            logger.info('Going to send job %s of %s task' % (j.file, j.task.name))

            umark = commands.getoutput('uuidgen')
            datasetName = 'panda.destDB.%s' % umark
            destName = 'local' # PanDA will not try to move output data, data will be placed by pilot (based on schedconfig)

            # file names produced/consumed by the payload, derived from the
            # job's run/chunk numbers and the task's production settings
            TMPRAWFILE = j.file[j.file.rfind('/') + 1:]
            logger.info(TMPRAWFILE)
            # NOTE(review): the 'input_file' key is unused by this template
            TMPMDSTFILE = 'mDST-%(runNumber)s-%(runChunk)s-%(prodSlt)s-%(phastVer)s.root' % {
                'input_file': j.file,
                'runNumber': j.run_number,
                'runChunk': j.chunk_number,
                'prodSlt': j.task.prodslt,
                'phastVer': j.task.phastver
            }
            logger.info(TMPMDSTFILE)
            TMPHISTFILE = '%(runNumber)s-%(runChunk)s-%(prodSlt)s.root' % {
                'runNumber': j.run_number,
                'runChunk': j.chunk_number,
                'prodSlt': j.task.prodslt
            }
            logger.info(TMPHISTFILE)
            TMPRICHFILE = 'gfile_%(runNumber)s-%(runChunk)s.gfile' % {
                'runNumber': j.run_number,
                'runChunk': j.chunk_number
            }
            logger.info(TMPRICHFILE)
            EVTDUMPFILE = 'evtdump%(prodSlt)s-%(runChunk)s-%(runNumber)s.raw' % {
                'prodSlt': j.task.prodslt,
                'runNumber': j.run_number,
                'runChunk': j.chunk_number
            }
            logger.info(EVTDUMPFILE)
            STDOUTFILE = '%(prodNameOnly)s.%(runNumber)s-%(runChunk)s-%(prodSlt)s.stdout' % {
                'prodNameOnly': j.task.production,
                'runNumber': j.run_number,
                'runChunk': j.chunk_number,
                'prodSlt': j.task.prodslt
            }
            logger.info(STDOUTFILE)
            STDERRFILE = '%(prodNameOnly)s.%(runNumber)s-%(runChunk)s-%(prodSlt)s.stderr' % {
                'prodNameOnly': j.task.production,
                'runNumber': j.run_number,
                'runChunk': j.chunk_number,
                'prodSlt': j.task.prodslt
            }
            logger.info(STDERRFILE)
            PRODSOFT = j.task.soft
            logger.info(PRODSOFT)
            ProdPathAndName = j.task.home + j.task.path + j.task.soft

            # build the PanDA job specification
            job = JobSpec()
            job.VO = 'vo.compass.cern.ch'
            job.taskID = j.task.id
            job.jobDefinitionID = 0
            job.jobName = '%(prodName)s-%(fileYear)s--%(runNumber)s-%(runChunk)s-%(prodSlt)s-%(phastVer)s' % {
                'prodName': j.task.production,
                'fileYear': j.task.year,
                'runNumber': j.run_number,
                'runChunk': j.chunk_number,
                'prodSlt': j.task.prodslt,
                'phastVer': j.task.phastver
            }
            job.transformation = j.task.type # payload (can be URL as well)
            job.destinationDBlock = datasetName
            job.destinationSE = destName
            job.currentPriority = 2000
            if j.task.type == 'DDD filtering':
                job.currentPriority = 1000
            job.prodSourceLabel = 'prod_test'
            job.computingSite = j.task.site
            job.attemptNr = j.attempt + 1
            job.maxAttempt = j.task.max_attempts
            # chain resubmissions to the previous PanDA attempt
            if j.status == 'failed':
                job.parentID = j.panda_id

            # tail = bare input file name; the payload removes it at the end
            head, tail = os.path.split(j.file)
            cdbServer = cdbServerArr[random.randrange(len(cdbServerArr))]

            # logs, and all files generated during execution will be placed in log (except output file)
            #job.jobParameters='source /afs/cern.ch/project/eos/installation/compass/etc/setup.sh;export EOS_MGM_URL=root://eoscompass.cern.ch;export PATH=/afs/cern.ch/project/eos/installation/compass/bin:$PATH;ppwd=$(pwd);echo $ppwd;export TMPMDSTFILE=%(TMPMDSTFILE)s;export TMPHISTFILE=%(TMPHISTFILE)s;export TMPRICHFILE=%(TMPRICHFILE)s;coralpath=%(ProdPathAndName)s/coral;echo $coralpath;cd -P $coralpath;export coralpathsetup=$coralpath"/setup.sh";echo $coralpathsetup;source $coralpathsetup;cd $ppwd;$CORAL/../phast/coral/coral.exe %(ProdPathAndName)s/template.opt;xrdcp -np $ppwd/%(TMPMDSTFILE)s xroot://eoscompass.cern.ch//eos/compass/%(prodName)s/mDST/%(TMPMDSTFILE)s;xrdcp -np $ppwd/%(TMPHISTFILE)s xroot://eoscompass.cern.ch//eos/compass/%(prodName)s/histos/%(TMPHISTFILE)s;metadataxml=$(ls metadata-*);echo $metadataxml;cp $metadataxml $metadataxml.PAYLOAD;' % {'TMPMDSTFILE': TMPMDSTFILE, 'TMPHISTFILE': TMPHISTFILE, 'TMPRICHFILE': TMPRICHFILE, 'input_file': input_file, 'ProdPathAndName': ProdPathAndName, 'prodName': prodName}

            # payload command line: shell one-liner that sets up CORAL and
            # runs the reconstruction (or DDD filtering) for this chunk
            if j.task.type == 'test production' or j.task.type == 'mass production' or j.task.type == 'technical production':
                if j.task.site == 'BW_COMPASS_MCORE':
                    # Blue Waters: software under /scratch, input copied locally
                    job.jobParameters = 'ppwd=$(pwd);export COMPASS_SW_PREFIX=/scratch/sciteam/criedl/projectdata/;export COMPASS_SW_PATH=%(prodPath)s;export COMPASS_PROD_NAME=%(prodName)s;export TMPRAWFILE=%(TMPRAWFILE)s;export TMPMDSTFILE=%(TMPMDSTFILE)s;export TMPHISTFILE=%(TMPHISTFILE)s;export TMPRICHFILE=%(TMPRICHFILE)s;export prodSlt=%(prodSlt)s;export EVTDUMPFILE=%(EVTDUMPFILE)s;export PRODSOFT=%(PRODSOFT)s;cp %(input_file)s .;coralpath=%(ProdPathAndName)s/coral;cd -P $coralpath;export coralpathsetup=$coralpath"/setup.sh";source $coralpathsetup;cd $ppwd;$CORAL/../phast/coral/coral.exe %(ProdPathAndName)s/%(template)s;if [ ! -s testevtdump.raw ]; then echo "PanDA message: the file is empty">testevtdump.raw; fi;cp payload_stderr.txt payload_stderr.out;cp payload_stdout.txt payload_stdout.out;gzip payload_stderr.out;gzip payload_stdout.out;rm %(tail)s' % {
                        'TMPRAWFILE': TMPRAWFILE,
                        'TMPMDSTFILE': TMPMDSTFILE,
                        'TMPHISTFILE': TMPHISTFILE,
                        'TMPRICHFILE': TMPRICHFILE,
                        'PRODSOFT': PRODSOFT,
                        'input_file': j.file,
                        'ProdPathAndName': ProdPathAndName,
                        'prodPath': j.task.path,
                        'prodName': j.task.production,
                        'template': j.task.template,
                        'tail': tail,
                        'prodSlt': j.task.prodslt,
                        'EVTDUMPFILE': EVTDUMPFILE,
                        'STDOUTFILE': STDOUTFILE,
                        'STDERRFILE': STDERRFILE
                    }
                else:
                    # default sites: software on EOS, input staged from CASTOR
                    job.jobParameters = 'export EOS_MGM_URL=root://eoscompass.cern.ch;ppwd=$(pwd);export COMPASS_SW_PREFIX=/eos/experiment/compass/;export COMPASS_SW_PATH=%(prodPath)s;export COMPASS_PROD_NAME=%(prodName)s;export TMPRAWFILE=%(TMPRAWFILE)s;export TMPMDSTFILE=%(TMPMDSTFILE)s;export TMPHISTFILE=%(TMPHISTFILE)s;export TMPRICHFILE=%(TMPRICHFILE)s;export prodSlt=%(prodSlt)s;export EVTDUMPFILE=%(EVTDUMPFILE)s;export PRODSOFT=%(PRODSOFT)s;xrdcp -N -f root://castorpublic.cern.ch/%(input_file)s\?svcClass=compasscdr .;coralpath=%(ProdPathAndName)s/coral;cd -P $coralpath;export coralpathsetup=$coralpath"/setup.sh";source $coralpathsetup;cd $ppwd;export CDBSERVER=%(cdbServer)s;$CORAL/../phast/coral/coral.exe %(ProdPathAndName)s/%(template)s;if [ ! -s testevtdump.raw ]; then echo "PanDA message: the file is empty">testevtdump.raw; fi;cp payload_stderr.txt payload_stderr.out;cp payload_stdout.txt payload_stdout.out;gzip payload_stderr.out;gzip payload_stdout.out;rm %(tail)s' % {
                        'TMPRAWFILE': TMPRAWFILE,
                        'TMPMDSTFILE': TMPMDSTFILE,
                        'TMPHISTFILE': TMPHISTFILE,
                        'TMPRICHFILE': TMPRICHFILE,
                        'PRODSOFT': PRODSOFT,
                        'input_file': j.file,
                        'ProdPathAndName': ProdPathAndName,
                        'prodPath': j.task.path,
                        'prodName': j.task.production,
                        'template': j.task.template,
                        'tail': tail,
                        'prodSlt': j.task.prodslt,
                        'EVTDUMPFILE': EVTDUMPFILE,
                        'STDOUTFILE': STDOUTFILE,
                        'STDERRFILE': STDERRFILE,
                        'cdbServer': cdbServer
                    }
            if j.task.type == 'DDD filtering':
                # DDD filtering runs the ddd tool instead of coral.exe
                job.jobParameters = 'export EOS_MGM_URL=root://eoscompass.cern.ch;ppwd=$(pwd);export COMPASS_SW_PREFIX=/eos/experiment/compass/;export COMPASS_SW_PATH=%(prodPath)s;export COMPASS_PROD_NAME=%(prodName)s;export TMPRAWFILE=%(TMPRAWFILE)s;export TMPMDSTFILE=%(TMPMDSTFILE)s;export TMPHISTFILE=%(TMPHISTFILE)s;export TMPRICHFILE=%(TMPRICHFILE)s;export prodSlt=%(prodSlt)s;export EVTDUMPFILE=%(EVTDUMPFILE)s;export PRODSOFT=%(PRODSOFT)s;xrdcp -N -f root://castorpublic.cern.ch/%(input_file)s\?svcClass=compasscdr .;coralpath=%(ProdPathAndName)s/coral;cd -P $coralpath;export coralpathsetup=$coralpath"/setup.sh";source $coralpathsetup;cd $ppwd;$CORAL/src/DaqDataDecoding/examples/how-to/ddd --filter-CAL --out=testevtdump.raw %(TMPRAWFILE)s;if [ ! -s testevtdump.raw ]; then echo "PanDA message: the file is empty">testevtdump.raw; fi;cp payload_stderr.txt payload_stderr.out;cp payload_stdout.txt payload_stdout.out;gzip payload_stderr.out;gzip payload_stdout.out;rm %(tail)s' % {
                    'TMPRAWFILE': TMPRAWFILE,
                    'TMPMDSTFILE': TMPMDSTFILE,
                    'TMPHISTFILE': TMPHISTFILE,
                    'TMPRICHFILE': TMPRICHFILE,
                    'PRODSOFT': PRODSOFT,
                    'input_file': j.file,
                    'ProdPathAndName': ProdPathAndName,
                    'prodPath': j.task.path,
                    'prodName': j.task.production,
                    'template': j.task.template,
                    'tail': tail,
                    'prodSlt': j.task.prodslt,
                    'EVTDUMPFILE': EVTDUMPFILE,
                    'STDOUTFILE': STDOUTFILE,
                    'STDERRFILE': STDERRFILE
                }

            # declare the output files the pilot must register
            # fileIRaw = FileSpec()
            # fileIRaw.lfn = "%s" % (input_file)
            # fileIRaw.destinationDBlock = job.destinationDBlock
            # fileIRaw.destinationSE = job.destinationSE
            # fileIRaw.dataset = job.destinationDBlock
            # fileIRaw.type = 'input'
            # job.addFile(fileIRaw)

            fileOstdout = FileSpec()
            fileOstdout.lfn = "payload_stdout.out.gz"
            fileOstdout.destinationDBlock = job.destinationDBlock
            fileOstdout.destinationSE = job.destinationSE
            fileOstdout.dataset = job.destinationDBlock
            fileOstdout.type = 'output'
            job.addFile(fileOstdout)

            fileOstderr = FileSpec()
            fileOstderr.lfn = "payload_stderr.out.gz"
            fileOstderr.destinationDBlock = job.destinationDBlock
            fileOstderr.destinationSE = job.destinationSE
            fileOstderr.dataset = job.destinationDBlock
            fileOstderr.type = 'output'
            job.addFile(fileOstderr)

            fileOLog = FileSpec()
            fileOLog.lfn = "%(prodName)s-%(runNumber)s-%(runChunk)s-%(prodSlt)s-%(phastVer)s.job.log.tgz" % {
                'prodName': j.task.production,
                'runNumber': j.run_number,
                'runChunk': j.chunk_number,
                'prodSlt': j.task.prodslt,
                'phastVer': j.task.phastver
            }
            fileOLog.destinationDBlock = job.destinationDBlock
            fileOLog.destinationSE = job.destinationSE
            fileOLog.dataset = job.destinationDBlock
            fileOLog.type = 'log'
            job.addFile(fileOLog)

            # mDST and histogram outputs exist only for production-type tasks
            if j.task.type == 'test production' or j.task.type == 'mass production' or j.task.type == 'technical production':
                fileOmDST = FileSpec()
                fileOmDST.lfn = "%s" % (TMPMDSTFILE)
                fileOmDST.destinationDBlock = job.destinationDBlock
                fileOmDST.destinationSE = job.destinationSE
                fileOmDST.dataset = job.destinationDBlock
                fileOmDST.type = 'output'
                job.addFile(fileOmDST)

                fileOTrafdic = FileSpec()
                fileOTrafdic.lfn = "%s" % (TMPHISTFILE)
                fileOTrafdic.destinationDBlock = job.destinationDBlock
                fileOTrafdic.destinationSE = job.destinationSE
                fileOTrafdic.dataset = job.destinationDBlock
                fileOTrafdic.type = 'output'
                job.addFile(fileOTrafdic)

            # event dump is produced by both production and DDD-filtering payloads
            if j.task.type == 'test production' or j.task.type == 'mass production' or j.task.type == 'technical production' or j.task.type == 'DDD filtering':
                fileOtestevtdump = FileSpec()
                fileOtestevtdump.lfn = "testevtdump.raw"
                fileOtestevtdump.destinationDBlock = job.destinationDBlock
                fileOtestevtdump.destinationSE = job.destinationSE
                fileOtestevtdump.dataset = job.destinationDBlock
                fileOtestevtdump.type = 'output'
                job.addFile(fileOtestevtdump)

            s, o = Client.submitJobs([job], srvID=aSrvID)
            logger.info(s)
            for x in o:
                logger.info("PandaID=%s" % x[0])
                if x[0] != 0 and x[0] != 'NULL':
                    # submission accepted: record PandaID and bump attempt counter
                    j_update = Job.objects.get(id=j.id)
                    j_update.panda_id = x[0]
                    j_update.status = 'sent'
                    j_update.attempt = j_update.attempt + 1
                    j_update.date_updated = timezone.now()
                    try:
                        j_update.save()
                        logger.info('Job %s with PandaID %s updated at %s' % (j.id, x[0], timezone.now()))
                        if j_update.task.status == 'send':
                            logger.info('Going to update status of task %s from send to running' % j_update.task.name)
                            t_update = Task.objects.get(id=j_update.task.id)
                            t_update.status = 'running'
                            t_update.date_updated = timezone.now()
                            try:
                                t_update.save()
                                logger.info('Task %s updated' % t_update.name)
                            except IntegrityError as e:
                                logger.exception('Unique together catched, was not saved')
                            except DatabaseError as e:
                                logger.exception('Something went wrong while saving: %s' % e.message)
                    except IntegrityError as e:
                        logger.exception('Unique together catched, was not saved')
                    except DatabaseError as e:
                        logger.exception('Something went wrong while saving: %s' % e.message)
                else:
                    logger.info('Job %s was not added to PanDA' % j.id)
            i += 1
    logger.info('done')
default=False,help='kill jobs before next heartbeat is coming') optP.add_option('--killOwnProdJobs',action='store_const',const=True,dest='killOwnProdJobs', default=False,help='kill own production jobs without a production role') optP.add_option('--killUserJobs',action='store_const',const=True,dest='killUserJobs', default=False,help='kill user jobs using a production role') options,args = optP.parse_args() aSrvID = None codeV = None useMailAsIDV = False if options.forceKill: codeV = 9 elif options.killUserJobs: codeV = 91 if options.killOwnProdJobs: useMailAsIDV = True if len(args) == 1: Client.killJobs([args[0]],code=codeV,useMailAsID=useMailAsIDV) else: startID = int(args[0]) endID = int(args[1]) if startID > endID: print '%d is less than %d' % (endID,startID) sys.exit(1) Client.killJobs(range(startID,endID+1),code=codeV,useMailAsID=useMailAsIDV)
job.transformation = 'http://pandawms.org/pandawms-jobcache/lsst-trf-phosim332.sh' job.destinationDBlock = datasetName #job.destinationSE = destName job.destinationSE = 'local' job.currentPriority = 1000 #job.prodSourceLabel = 'ptest' #job.prodSourceLabel = 'panda' #job.prodSourceLabel = 'ptest' #job.prodSourceLabel = 'test' #job.prodSourceLabel = 'ptest' ### 2014-01-27 #job.prodSourceLabel = 'user' job.prodSourceLabel = 'panda' job.computingSite = site job.jobParameters = "" job.VO = "lsst" fileOL = FileSpec() fileOL.lfn = "%s.job.log.tgz" % job.jobName fileOL.destinationDBlock = job.destinationDBlock fileOL.destinationSE = job.destinationSE fileOL.dataset = job.destinationDBlock fileOL.type = 'log' job.addFile(fileOL) s,o = Client.submitJobs([job],srvID=aSrvID) print s for x in o: print "PandaID=%s" % x[0]
options,args = optP.parse_args() aSrvID = None codeV = None useMailAsIDV = False if options.forceKill: codeV = 9 elif options.killUserJobs: codeV = 91 else: try: codeV = int(options.codeV) except Exception: pass if options.killOwnProdJobs: useMailAsIDV = True if len(args) == 1: Client.killJobs([args[0]], code=codeV, useMailAsID=useMailAsIDV, keepUnmerged=options.keepUnmerged, jobSubStatus=options.jobSubStatus) else: startID = int(args[0]) endID = int(args[1]) if startID > endID: print '%d is less than %d' % (endID,startID) sys.exit(1) Client.killJobs(range(startID,endID+1),code=codeV,useMailAsID=useMailAsIDV,keepUnmerged=options.keepUnmerged, jobSubStatus=options.jobSubStatus)
if options.dryRun: sys.exit(0) if s: if options.resurrectDS: sd, so = taskBuffer.querySQLS( 'SELECT datasetName FROM ATLAS_PANDA.JEDI_Datasets WHERE jediTaskID=:id AND type IN (:t1,:t2)', { ':id': jediTaskID, ':t1': 'output', ':t2': 'log' }) rc = RucioClient() for datasetName, in so: for i in range(3): try: scope, name = rucioAPI.extract_scope(datasetName) rc.get_did(scope, name) break except DataIdentifierNotFound: print 'resurrect {0}'.format(datasetName) rc.resurrect([{'scope': scope, 'name': name}]) try: rc.set_metadata(scope, name, 'lifetime', None) except: pass print Client.retryTask(jediTaskID, noChildRetry=options.noChildRetry)[-1][-1] print 'done for jediTaskID={0}'.format(jediTaskID) else: print 'failed'
from taskbuffer.TaskBuffer import taskBuffer from brokerage.SiteMapper import SiteMapper from config import panda_config # instantiate TB taskBuffer.init(panda_config.dbhost,panda_config.dbpasswd,nDBConnection=1) # instantiate sitemapper siteMapper = SiteMapper(taskBuffer) import httplib import commands id = sys.argv[1] s,o = Client.getJobStatus([id]) if s != 0: print "failed to get job with:%s" % s sys.exit(0) job = o[0] if job == None: print "got None" sys.exit(0) xml = """<?xml version="1.0" encoding="UTF-8" standalone="no" ?> <!-- ATLAS file meta-data catalog --> <!DOCTYPE POOLFILECATALOG SYSTEM "InMemory"> <POOLFILECATALOG>
import sys import userinterface.Client as Client if len(sys.argv) == 2: jobDefIDs = [sys.argv[1]] else: startID = int(sys.argv[1]) endID = int(sys.argv[2]) if startID > endID: print '%d is less than %d' % (endID, startID) sys.exit(1) jobDefIDs = range(startID, endID + 1) # quesry PandaID status, ids = Client.queryPandaIDs(jobDefIDs) if status != 0: sys.exit(0) # remove None while True: if not None in ids: break ids.remove(None) # kill if len(ids) != 0: Client.killJobs(ids)
fileD.type = 'input' job.addFile(fileD) fileOE = FileSpec() fileOE.lfn = "%s.HITS.pool.root" % job.jobName fileOE.destinationDBlock = job.destinationDBlock fileOE.destinationSE = job.destinationSE fileOE.dataset = job.destinationDBlock fileOE.destinationDBlockToken = 'ATLASDATADISK' fileOE.type = 'output' job.addFile(fileOE) fileOL = FileSpec() fileOL.lfn = "%s.job.log.tgz" % job.jobName fileOL.destinationDBlock = job.destinationDBlock fileOL.destinationSE = job.destinationSE fileOL.dataset = job.destinationDBlock fileOL.destinationDBlockToken = 'ATLASDATADISK' fileOL.type = 'log' job.addFile(fileOL) job.jobParameters="%s %s NONE 1 3250 55866 ATLAS-CSC-02-01-00 55866 55866 QGSP_EMV None %s DEFAULT" % \ (fileI.lfn,fileOE.lfn,fileD.lfn) jobList.append(job) s, o = Client.submitJobs(jobList) print "---------------------" print s for x in o: print "PandaID=%s" % x[0]
jobs = [] varMap = {} varMap[':prodSourceLabel'] = 'managed' varMap[':taskID'] = args[0] varMap[':pandaIDl'] = args[1] varMap[':pandaIDu'] = args[2] sql = "SELECT PandaID FROM %s WHERE prodSourceLabel=:prodSourceLabel AND taskID=:taskID AND PandaID BETWEEN :pandaIDl AND :pandaIDu ORDER BY PandaID" for table in ['ATLAS_PANDA.jobsActive4','ATLAS_PANDA.jobsWaiting4','ATLAS_PANDA.jobsDefined4']: status,res = proxyS.querySQLS(sql % table,varMap) if res != None: for id, in res: if not id in jobs: jobs.append(id) print 'The number of jobs to be killed : %s' % len(jobs) if len(jobs): nJob = 100 iJob = 0 while iJob < len(jobs): print 'kill %s' % str(jobs[iJob:iJob+nJob]) if options.forceKill: Client.killJobs(jobs[iJob:iJob+nJob],9,useMailAsID=useMailAsIDV) else: Client.killJobs(jobs[iJob:iJob+nJob],useMailAsID=useMailAsIDV) iJob += nJob time.sleep(1)
job.currentPriority = 1000 job.prodSourceLabel = 'test' # job.prodSourceLabel = 'cloudtest' job.computingSite = site file = FileSpec() file.lfn = "%s.evgen.pool.root" % job.jobName file.destinationDBlock = job.destinationDBlock file.destinationSE = job.destinationSE file.dataset = job.destinationDBlock file.type = 'output' job.addFile(file) fileOL = FileSpec() fileOL.lfn = "%s.job.log.tgz" % job.jobName fileOL.destinationDBlock = job.destinationDBlock fileOL.destinationSE = job.destinationSE fileOL.dataset = job.destinationDBlock fileOL.type = 'log' job.addFile(fileOL) job.jobParameters="8072 0 5000 1 DC3.008072.JimmyPhotonJet1.py %s NONE NONE NONE" % file.lfn jobList.append(job) for i in range(1): s,o = Client.submitJobs(jobList) print "---------------------" print s for x in o: print "PandaID=%s" % x[0]
'dataset':dsName, }, {'type':'template', 'value':'--maxEvents=${MAXEVENTS}', 'param_type':'number', }, {'type':'template', 'param_type':'output', 'token':'ATLASDATADISK', 'value':' --outputHitsFile={0}.${{SN}}.pool.root'.format(outDatasetName), 'dataset':outDatasetName, }, {'type':'constant', 'value':'--physicsList=QGSP_BERT --postInclude=RecJobTransforms/UseFrontierFallbackDBRelease.py --preInclude=SimulationJobOptions/preInclude.FrozenShowersFCalOnly.py,SimulationJobOptions/preInclude.BeamPipeKill.py', }, {'type':'template', 'value':'--skipEvents=${SKIPEVENTS}', 'param_type':'number', }, {'type':'template', 'value':'--randomSeed=${RNDMSEED}', 'param_type':'number', }, ] taskParamMap['esmergeSpec'] = {} taskParamMap['esmergeSpec']['transPath'] = 'Merge_trf.py' taskParamMap['esmergeSpec']['jobParameters'] = "aaa bbb" print Client.insertTaskParams(taskParamMap)