Example No. 1
def addDatasetsToContainer(container, datasets):
    from pandatools import Client
    # HC datasets don't use containers
    if not configPanda['processingType'].startswith(
            'gangarobot') and not configPanda['processingType'].startswith(
                'hammercloud'):
        Client.addDatasetsToContainer(container, datasets, False)
Example No. 2
def addDatasetsToContainer(container, datasets):
    from pandatools import Client

    # HC datasets don't use containers
    if not configPanda["processingType"].startswith("gangarobot") and not configPanda["processingType"].startswith(
        "hammercloud"
    ):
        Client.addDatasetsToContainer(container, datasets, False)
Example No. 3
    def master_resubmit(self,jobs):
        '''Resubmit failed Jedi job'''
        from pandatools import Client

        jobIDs = {}
        for job in jobs: 
            jobIDs[job.backend.id] = job

        allJobIDs = jobIDs.keys()
        pandaJobIDs = {}
        for jID in allJobIDs:
            with inject_proxy(self.credential_requirements):
                status, jediTaskDict = Client.getJediTaskDetails({'jediTaskID': jID},False,True,verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                raise BackendError('Jedi','Return code %d retrieving job status information.' % status)

            # Retrieve job
            job = jobIDs[jediTaskDict['jediTaskID']]
       
            newJobsetID = -1 # get jobset
            retryJobs = [] # jspecs
            resubmittedJobs = [] # ganga jobs

            if jediTaskDict['status'] in ['failed', 'killed', 'cancelled', 'aborted', 'broken', 'finished' ]:
                retryJobs.append(job)
                resubmittedJobs.append(jID)
            #elif jediTaskDict['status'] == 'finished':
            #    pass
            else:
                logger.warning("Cannot resubmit. Jedi task %s is status %s." %(jID, jediTaskDict['status'] ))
                return False

            # submit
            if len(retryJobs)==0:
                logger.warning("No failed jobs to resubmit")
                return False

            with inject_proxy(self.credential_requirements):
                status,out = Client.retryTask(jID, verbose=False)
            if status != 0:
                logger.error(status)
                logger.error(out)
                logger.error("Failed to retry JobID=%s" % jID)                                                                                         
                return False
            tmpStat,tmpDiag = out
            if not tmpStat:
                logger.error(tmpDiag)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            logger.info(tmpDiag)
       
            job.backend.status = None
            job.backend.jobSpec = {}
            job.updateStatus('submitted')

        logger.info('Resubmission successful')
        return True
Example No. 4
def createContainer(name):
    from pandatools import Client
    # don't create containers for HC datasets
    if not configPanda['processingType'].startswith('gangarobot') and not configPanda['processingType'].startswith('hammercloud'):
        try:
            Client.createContainer(name,False)
            logger.info('Created output container %s' %name)
        except exceptions.SystemExit:
            raise BackendError('Panda','Exception in Client.createContainer %s: %s %s'%(name,sys.exc_info()[0],sys.exc_info()[1]))
Example No. 5
 def kill_processing(self, processing):
     try:
         if processing:
             from pandatools import Client
             task_id = processing.workload_id
             Client.killTask(task_id)
     except Exception as ex:
         msg = "Failed to check the processing (%s) status: %s" % (str(
             processing['processing_id']), str(ex))
         raise exceptions.IDDSException(msg)
Example No. 6
def createContainer(name):
    from pandatools import Client
    # don't create containers for HC datasets
    if not configPanda['processingType'].startswith(
            'gangarobot') and not configPanda['processingType'].startswith(
                'hammercloud'):
        try:
            Client.createContainer(name, False)
            logger.info('Created output container %s' % name)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in Client.createContainer %s: %s %s' %
                (name, sys.exc_info()[0], sys.exc_info()[1]))
Example No. 7
 def kill_processing_force(self, processing):
     try:
         if processing:
             from pandatools import Client
             proc = processing['processing_metadata']['processing']
             task_id = proc.workload_id
             # task_id = processing['processing_metadata']['task_id']
             Client.killTask(task_id)
             # Client.finishTask(task_id, soft=True)
     except Exception as ex:
         msg = "Failed to check the processing (%s) status: %s" % (str(
             processing['processing_id']), str(ex))
         raise exceptions.IDDSException(msg)
Example No. 8
def createContainer(name):
    from pandatools import Client

    # don't create containers for HC datasets
    if not configPanda["processingType"].startswith("gangarobot") and not configPanda["processingType"].startswith(
        "hammercloud"
    ):
        try:
            Client.createContainer(name, False)
            logger.info("Created output container %s" % name)
        except exceptions.SystemExit:
            raise BackendError(
                "Panda", "Exception in Client.createContainer %s: %s %s" % (name, sys.exc_info()[0], sys.exc_info()[1])
            )
Example No. 9
def ELG_jediState(sample):

    try:
        from pandatools import PandaToolsPkgInfo  # noqa: F401
    except ImportError:
        print("prun needs additional setup, try:")
        print("    lsetup panda")
        return 99

    jediTaskID = int(sample.getMetaDouble("nc_jediTaskID", 0))

    if jediTaskID < 100:
        print("Sample " + sample.name() + " does not have a jediTaskID")
        return ''

    from pandatools import Client

    taskDict = {}
    taskDict['jediTaskID'] = jediTaskID
    ret = Client.getJediTaskDetails(taskDict, False, True)
    if ret[0] != 0:
        print("Problem checking status of task %s with id %s" %
              (sample.name(), jediTaskID))
        return ''

    return ret[1]['status'].encode("ascii")
Example No. 10
    def master_kill(self):
        '''Kill jobs'''

        from pandatools import Client

        job = self.getJobObject()
        logger.debug('Killing job %s' % job.getFQID('.'))

        active_status = [
            None, 'registered', 'waiting', 'defined', 'pending', 'assigning',
            'ready', 'scouting', 'running', 'holding', 'merging', 'prepared',
            'aborting', 'finishing'
        ]
        #active_status = [ None, 'defined', 'unknown', 'assigned', 'waiting', 'activated', 'sent', 'starting', 'running', 'holding', 'transferring' ]

        if self.id and self.status in active_status:
            status, output = Client.killTask(self.id)
            if status:
                logger.error('Failed killing job (status = %d)', status)
                return False
            else:
                logger.info('Killing Jedi task %s, Server returned: %s' %
                            (self.id, output))
        else:
            logger.error(
                'Cannot kill Jedi job %s since it is not in active status',
                self.id)

        return True
Example No. 11
    def map_panda_ids(self, unregistered_job_ids, input_output_maps):
        self.logger.debug("map_panda_ids, unregistered_job_ids: %s" %
                          str(unregistered_job_ids))
        from pandatools import Client

        # updated_map_ids = []
        full_update_contents = []
        chunksize = 2000
        chunks = [
            unregistered_job_ids[i:i + chunksize]
            for i in range(0, len(unregistered_job_ids), chunksize)
        ]
        for chunk in chunks:
            jobs_list = Client.getJobStatus(chunk, verbose=0)[1]
            for job_info in jobs_list:
                if job_info and job_info.Files and len(job_info.Files) > 0:
                    for job_file in job_info.Files:
                        # if job_file.type in ['log']:
                        if job_file.type not in ['pseudo_input']:
                            continue
                        if ':' in job_file.lfn:
                            pos = job_file.lfn.find(":")
                            input_file = job_file.lfn[pos + 1:]
                            # input_file = job_file.lfn.split(':')[1]
                        else:
                            input_file = job_file.lfn
                        map_id = self.get_map_id_from_input(
                            input_output_maps, input_file)
                        if map_id:
                            update_contents = self.get_update_contents_from_map_id(
                                map_id, input_output_maps, job_info)
                            full_update_contents += update_contents
        return full_update_contents
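The example above batches PanDA job IDs into groups of 2000 before each Client.getJobStatus call, presumably to keep each request to a manageable size. A minimal, standalone sketch of that chunking idiom (illustrative only, not part of the original source):

def chunk_ids(job_ids, chunksize=2000):
    # Yield successive slices of at most `chunksize` elements.
    for i in range(0, len(job_ids), chunksize):
        yield job_ids[i:i + chunksize]

# Hypothetical input: 4500 IDs split into batches of 2000, 2000 and 500.
batches = list(chunk_ids(list(range(4500))))
assert [len(b) for b in batches] == [2000, 2000, 500]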
Example No. 12
def getDBDatasets(jobO,trf,dbrelease):
    from pandatools import Client

    # get DB datasets
    dbrFiles  = {}
    dbrDsList = []
    if trf or dbrelease != '':
        if trf:
            # parse jobO for TRF
            tmpItems = jobO.split()
        else:
            # mimic a trf parameter to reuse following algorithm
            tmpItems = ['%DB='+dbrelease]
        # look for DBRelease
        for tmpItem in tmpItems:
            match = re.search('%DB=([^:]+):(.+)$',tmpItem)
            if match:
                tmpDbrDS  = match.group(1)
                tmpDbrLFN = match.group(2)
                # get files in the dataset
                if not tmpDbrDS in dbrDsList:
                    logger.info("Querying files in dataset:%s" % tmpDbrDS)
                    try:
                        tmpList = Client.queryFilesInDataset(tmpDbrDS,False)
                    except:
                        raise ApplicationConfigurationError("ERROR : error while looking up dataset %s. Perhaps this dataset does not exist?"%tmpDbrDS)
                    # append
                    for tmpLFN,tmpVal in tmpList.iteritems():
                        dbrFiles[tmpLFN] = tmpVal
                    dbrDsList.append(tmpDbrDS)
                # check
                if tmpDbrLFN not in dbrFiles:
                    raise ApplicationConfigurationError("ERROR : %s is not in %s"%(tmpDbrLFN,tmpDbrDS))
    return dbrFiles,dbrDsList
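getDBDatasets above recognises DBRelease references of the form %DB=&lt;dataset&gt;:&lt;LFN&gt; in the transformation parameters. A standalone sketch of that token format (the dataset and file names below are hypothetical):

import re

tmpItem = '%DB=ddo.000001.Atlas.Ideal.DBRelease.v310801:DBRelease-31.8.1.tar.gz'
match = re.search('%DB=([^:]+):(.+)$', tmpItem)
assert match.group(1) == 'ddo.000001.Atlas.Ideal.DBRelease.v310801'  # dataset name
assert match.group(2) == 'DBRelease-31.8.1.tar.gz'                   # LFN inside the dataset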
Example No. 13
    def get_panda_task_id(self, processing):
        from pandatools import Client

        start_time = datetime.datetime.utcnow() - datetime.timedelta(hours=10)
        start_time = start_time.strftime('%Y-%m-%d %H:%M:%S')
        status, results = Client.getJobIDsJediTasksInTimeRange(
            start_time, task_type=self.task_type, verbose=False)
        if status != 0:
            self.logger.warn(
                "Error to poll latest tasks in last ten hours: %s, %s" %
                (status, results))
            return None

        proc = processing['processing_metadata']['processing']
        task_id = None
        for req_id in results:
            task_name = results[req_id]['taskName']
            if proc.workload_id is None and task_name == self.task_name:
                task_id = results[req_id]['jediTaskID']
                # processing['processing_metadata']['task_id'] = task_id
                # processing['processing_metadata']['workload_id'] = task_id
                proc.workload_id = task_id
                if task_id:
                    proc.submitted_at = datetime.datetime.utcnow()

        return task_id
Example No. 14
def getLatestDBReleaseCaching():
    import tempfile
    import cPickle as pickle
    from pandatools import Client
    from GangaAtlas.Lib.Credentials.ProxyHelper import getNickname

    TMPDIR = tempfile.gettempdir()
    nickname = getNickname(allowMissingNickname=False)
    DBRELCACHE = '%s/ganga.latestdbrel.%s'%(TMPDIR,nickname)

    try:
        fh = open(DBRELCACHE)
        dbrelCache = pickle.load(fh)
        fh.close()
        if dbrelCache['mtime'] > time.time() - 3600:
            logger.debug('Loading LATEST DBRelease from local cache')
            return dbrelCache['atlas_dbrelease']
        else:
            raise Exception()
    except:
        logger.debug('Updating local LATEST DBRelease cache')
        atlas_dbrelease = Client.getLatestDBRelease(False)
        dbrelCache = {}
        dbrelCache['mtime'] = time.time()
        dbrelCache['atlas_dbrelease'] = atlas_dbrelease
        fh = open(DBRELCACHE,'w')
        pickle.dump(dbrelCache,fh)
        fh.close()
        return atlas_dbrelease
Example No. 15
def getLatestDBReleaseCaching():
    import tempfile
    import cPickle as pickle
    from pandatools import Client
    from GangaAtlas.Lib.Credentials.ProxyHelper import getNickname

    TMPDIR = tempfile.gettempdir()
    nickname = getNickname(allowMissingNickname=False)
    DBRELCACHE = '%s/ganga.latestdbrel.%s' % (TMPDIR, nickname)

    try:
        fh = open(DBRELCACHE)
        dbrelCache = pickle.load(fh)
        fh.close()
        if dbrelCache['mtime'] > time.time() - 3600:
            logger.debug('Loading LATEST DBRelease from local cache')
            return dbrelCache['atlas_dbrelease']
        else:
            raise Exception()
    except:
        logger.debug('Updating local LATEST DBRelease cache')
        atlas_dbrelease = Client.getLatestDBRelease(False)
        dbrelCache = {}
        dbrelCache['mtime'] = time.time()
        dbrelCache['atlas_dbrelease'] = atlas_dbrelease
        fh = open(DBRELCACHE, 'w')
        pickle.dump(dbrelCache, fh)
        fh.close()
        return atlas_dbrelease
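getLatestDBReleaseCaching wraps Client.getLatestDBRelease in a one-hour, per-user pickle cache. A standalone sketch of the same cache pattern with the PanDA call replaced by a generic compute() callable (illustrative, not the original helper; CACHE_FILE is a made-up path):

import os
import pickle
import tempfile
import time

CACHE_FILE = os.path.join(tempfile.gettempdir(), 'example.latest.cache')

def cached_value(compute, max_age=3600):
    # Reuse the pickled value if it is younger than max_age seconds.
    try:
        with open(CACHE_FILE, 'rb') as fh:
            entry = pickle.load(fh)
        if entry['mtime'] > time.time() - max_age:
            return entry['value']
    except Exception:
        pass
    # Otherwise recompute and rewrite the cache file.
    value = compute()
    with open(CACHE_FILE, 'wb') as fh:
        pickle.dump({'mtime': time.time(), 'value': value}, fh)
    return value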
Example No. 16
 def status(self, JobID, forceUpdate=False):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
         self.gridPassPhrase, False, self.verbose)
     # get job info from local repository
     job = self.getJobInfo(JobID)
     if job == None:
         # not found
         return None
     # update if needed
     if job.dbStatus != 'frozen' or forceUpdate:
         tmpLog.info("Getting status for JobID=%s ..." % JobID)
         # get status from Panda server
         status, pandaIDstatus = Client.getPandIDsWithJobID(
             JobID, verbose=self.verbose)
         if status != 0:
             tmpLog.error("Failed to get status for JobID=%s" % JobID)
             return None
         # get one job to set computingSite which may have changed due to rebrokerage
         pandaJob = None
         if pandaIDstatus != {}:
             tmpPandaIDs = pandaIDstatus.keys()
             tmpPandaIDs.sort()
             status, tmpPandaJobs = Client.getFullJobStatus(
                 tmpPandaIDs[:1], verbose=self.verbose)
             if status != 0:
                 tmpLog.error("Failed to get PandaJobs for %s" % JobID)
                 return None
             pandaJob = tmpPandaJobs[0]
         # convert to local job spec
         job = PdbUtils.convertPtoD([],
                                    pandaIDstatus,
                                    job,
                                    pandaJobForSiteID=pandaJob)
         # update DB
         try:
             PdbUtils.updateJobDB(job, self.verbose)
         except:
             tmpLog.error("Failed to update local repository for JobID=%s" %
                          JobID)
             return None
         tmpLog.info("Updated JobID=%s" % JobID)
     # return
     return job
Example No. 17
  def convertQueueNameToDQ2Names(self, queue, test):
    from pandatools import Client
    from dq2.info.TiersOfATLAS import ToACache
    sites = []

    if 'PFT' in test.template.description:
        #Client.PandaSites = Client.getSiteSpecs('production')[1]
        Client.PandaSites = Client.getSiteSpecs('all')[1]

    for site in Client.PandaSites[queue]['setokens'].values():
      sites.append(Client.convSrmV2ID(site))
    allowed_sites = []
    for site in ToACache.sites:
      if (site not in allowed_sites
          and Client.convSrmV2ID(site) in sites
          and site.find('TAPE') == -1 and 'DISK' in site):
        allowed_sites.append(site)
    return allowed_sites
Example No. 18
def retrievePandaJobs(job, jIDs):
    '''
    methods for retrieving panda job ids of panda jobs given a jobDefId
    '''
    from pandatools import Client

    ick = False
    jstatus = ''
    num_pjobs = 0

    logger.debug("retrievePandaJobs jIDs=%s" % jIDs)

    # get status from Panda server
    rc, jobsStatus = Client.getFullJobStatus(jIDs, False)
    if rc:
        logger.error('Return code %d retrieving job status information.', rc)
        raise BackendError(
            'Jedi', 'Return code %d retrieving job status information.' % rc)

    for status in jobsStatus:
        if not status: continue

        jstatus = status.jobStatus
        if status.jobStatus == None:
            logger.warning('No panda jobs expected')
            job.backend.pandajobs = []

        elif status.jobStatus in [
                "defined", "activated", "running", "failed", "finished",
                "holding", "assigned"
        ]:
            logger.debug('Panda jobs are running')
            logger.debug("PandaID: %d" % status.PandaID)

            pjobj = JediPandaJob()
            pjobj.id = status.PandaID
            pjobj.url = 'http://panda.cern.ch/?job=%d' % status.PandaID
            pjobj.jobSpec = dict(zip(status._attributes, status.values()))
            for k in pjobj.jobSpec.keys():
                if type(pjobj.jobSpec[k]) not in [type(''), type(1)]:
                    pjobj.jobSpec[k] = str(pjobj.jobSpec[k])

            if pjobj not in job.backend.pandajobs:
                job.backend.pandajobs.append(pjobj)
            else:
                logger.debug("Panda job %s already exists locally" % pjobj.id)

            num_pjobs += 1
        else:
            logger.warning(
                "getFullJobStatus returned unsupported status %s for Panda job %s "
                % (status.jobStatus, status.PandaID))

        ick = True

    return (ick, jstatus, num_pjobs)
Example No. 19
    def poll_panda_task(self, processing):
        if 'panda_task_id' in processing['processing_metadata']:
            from pandatools import Client

            status, task_status = Client.getTaskStatus(processing.workload_id)
            if status == 0:
                return task_status
        else:
            return 'failed'
        return None
Example No. 20
    def master_submit(self, rjobs, subjobspecs, buildjobspec):
        '''Submit jobs'''

        from pandatools import Client
        from pandatools import MiscUtils

        from Ganga.Core.exceptions import IncompleteJobSubmissionError
        from Ganga.Utility.logging import log_user_exception

        job = self.getJobObject()

        # job name
        jobName = 'ganga.%s' % MiscUtils.wrappedUuidGen()

        jobspecs = {}
        if buildjobspec:
            jobspecs = buildjobspec
        else:
            jobspecs = subjobspecs

        logger.debug(jobspecs)

        # submit task
        for subjob in rjobs:
            subjob.updateStatus('submitting')

        logger.info("Submitting to Jedi ...")
        verbose = logger.isEnabledFor(10)
        status, tmpOut = Client.insertTaskParams(jobspecs, verbose)

        logger.debug(tmpOut)

        if status != 0:
            logger.error("Task submission to Jedi failed with %s " % status)
            return False
        if tmpOut[0] == False:
            logger.error("Task submission to Jedi failed %s" % tmpOut[1])
            return False
        logger.info("Task submission to Jedi suceeded with new jediTaskID=%s" %
                    tmpOut[1])

        #if buildjobspec:
        #    job.backend.buildjob = PandaBuildJob()
        #    job.backend.buildjob.id = jobids[0][0]
        #    job.backend.buildjob.url = 'http://panda.cern.ch/?job=%d'%jobids[0][0]
        #    del jobids[0]

        for subjob in rjobs:
            subjob.backend.id = tmpOut[1]
            subjob.backend.url = 'http://pandamon.cern.ch/jedi/taskinfo?days=20&task=%d' % tmpOut[
                1]
            subjob.updateStatus('submitted')
            logger.info("Panda monitor url: %s" % subjob.backend.url)

        return True
Example No. 21
    def master_submit(self,rjobs,subjobspecs,buildjobspec):
        '''Submit jobs'''
       
        from pandatools import Client
        from pandatools import MiscUtils

        from Ganga.Core import IncompleteJobSubmissionError
        from Ganga.Utility.logging import log_user_exception

        job = self.getJobObject()

        # job name
        jobName = 'ganga.%s' % MiscUtils.wrappedUuidGen()

        jobspecs = {}
        if buildjobspec:
            jobspecs = buildjobspec
        else:
            jobspecs = subjobspecs

        logger.debug(jobspecs)

        # submit task
        for subjob in rjobs:
            subjob.updateStatus('submitting')

        logger.info("Submitting to Jedi ...")
        verbose = logger.isEnabledFor(10)
        with inject_proxy(self.credential_requirements):
            status, tmpOut = Client.insertTaskParams(jobspecs, verbose)

        logger.debug(tmpOut)

        if status != 0:
            logger.error("Task submission to Jedi failed with %s " %status)
            return False
        if tmpOut[0] == False:
            logger.error("Task submission to Jedi failed %s" %tmpOut[1])
            return False
        logger.info("Task submission to Jedi suceeded with new jediTaskID=%s" %tmpOut[1])

        #if buildjobspec:
        #    job.backend.buildjob = PandaBuildJob() 
        #    job.backend.buildjob.id = jobids[0][0]
        #    job.backend.buildjob.url = 'http://panda.cern.ch/?job=%d'%jobids[0][0]
        #    del jobids[0]

        for subjob in rjobs:
            subjob.backend.id = tmpOut[1]
            subjob.backend.url = 'http://pandamon.cern.ch/jedi/taskinfo?days=20&task=%d'%tmpOut[1]
            subjob.updateStatus('submitted')
            logger.info("Panda monitor url: %s" %subjob.backend.url)

        return True
Example No. 22
	def getJobIDs(self, timestamp):
		date = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(timestamp))

		status, output = Client.getJobIDsInTimeRange(date)

		if status != 0:
			print 'Error: panda !'
			return []

		output.sort()

		return output
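getJobIDs converts the Unix timestamp into the UTC 'YYYY-MM-DD HH:MM:SS' string that it passes to Client.getJobIDsInTimeRange, presumably as the start of the time range. A standalone sketch of that conversion (the timestamp value is arbitrary):

import time

timestamp = 0  # hypothetical Unix epoch value
date = time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(timestamp))
assert date == '1970-01-01 00:00:00'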
Example No. 23
 def finish(self,JobID,soft=False):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
             self.gridPassPhrase,
             False,
             self.verbose,
             useCache=True)
     # force update just in case
     self.status(JobID,True)
     # get jobset
     jobList = self.getJobIDsWithSetID(JobID)
     if jobList == None:
         # works only for jobsetID
         if useJobsetID:
             return
         # works with jobID
         jobList = [JobID]
     else:
         tmpMsg = "ID=%s is composed of JobID=" % JobID
         for tmpJobID in jobList:
             tmpMsg += '%s,' % tmpJobID
         tmpMsg = tmpMsg[:-1]
         tmpLog.info(tmpMsg)
     for tmpJobID in jobList:    
         # get job info from local repository
         job = self.getJobInfo(tmpJobID)
         if job == None:
             tmpLog.warning("JobID=%s not found in local repository. Synchronization may be needed" % tmpJobID)            
             continue
         # skip frozen job
         if job.dbStatus == 'frozen':
             tmpLog.info('All subJobs in JobID=%s already finished/failed' % tmpJobID)
             continue
         # finish JEDI task
         tmpLog.info('Sending finishTask command ...')
         status,output = Client.finishTask(job.jediTaskID,soft,self.verbose)
         # communication error
         if status != 0:
             tmpLog.error(output)
             tmpLog.error("Failed to finish JobID=%s" % tmpJobID)
             return False
         tmpStat,tmpDiag = output
         if not tmpStat:
             tmpLog.error(tmpDiag)
             tmpLog.error("Failed to finish JobID=%s" % tmpJobID)
             return False
         tmpLog.info(tmpDiag)
     # done
     tmpLog.info('Done. TaskID=%s will be finished soon' % job.jediTaskID)
     return True
Example No. 24
 def kill(self, JobID):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
         self.gridPassPhrase, False, self.verbose)
     # get jobset
     jobList = self.getJobIDsWithSetID(JobID)
     if jobList == None:
         jobList = [JobID]
     else:
         tmpMsg = "JobsetID=%s is composed of JobID=" % JobID
         for tmpJobID in jobList:
             tmpMsg += '%s,' % tmpJobID
         tmpMsg = tmpMsg[:-1]
         tmpLog.info(tmpMsg)
     for tmpJobID in jobList:
         # get job info from local repository
         job = self.getJobInfo(tmpJobID)
         if job == None:
             tmpLog.warning(
                 "JobID=%s not found in local repository. Synchronization may be needed"
                 % tmpJobID)
             continue
         # skip frozen job
         if job.dbStatus == 'frozen':
             tmpLog.info('All subJobs in JobID=%s already finished/failed' %
                         tmpJobID)
             continue
         # get PandaID list
         killJobs = job.PandaID.split(',')
         # kill
         tmpLog.info('Sending kill command ...')
         status, output = Client.killJobs(killJobs, self.verbose)
         if status != 0:
             tmpLog.error(output)
             tmpLog.error("Failed to kill JobID=%s" % tmpJobID)
             return False
         # update database
         job.commandToPilot = 'tobekilled'
         # update DB
         try:
             PdbUtils.updateJobDB(job, self.verbose)
         except:
             tmpLog.error("Failed to update local repository for JobID=%s" %
                          tmpJobID)
             return False
         # done
         tmpLog.info('Done. JobID=%s will be killed in 30min' % tmpJobID)
     return True
Example No. 25
def retrievePandaJobs(job, jIDs):
    '''
    methods for retrieving panda job ids of panda jobs given a jobDefId
    '''
    from pandatools import Client

    ick       = False
    jstatus    = ''
    num_pjobs = 0

    logger.debug("retrievePandaJobs jIDs=%s" %jIDs)

    # get status from Panda server
    rc, jobsStatus = Client.getFullJobStatus(jIDs,False)
    if rc:
        logger.error('Return code %d retrieving job status information.',rc)
        raise BackendError('Jedi','Return code %d retrieving job status information.' % rc)

    for status in jobsStatus:
        if not status: continue

        jstatus = status.jobStatus
        if status.jobStatus == None:
            logger.warning('No panda jobs expected')
            job.backend.pandajobs = []

        elif status.jobStatus in [ "defined", "activated", "running", "failed", "finished", "holding", "assigned"]:
            logger.debug('Panda jobs are running')
            logger.debug("PandaID: %d" % status.PandaID)

            pjobj         = JediPandaJob()
            pjobj.id      = status.PandaID
            pjobj.url     = 'http://panda.cern.ch/?job=%d' % status.PandaID
            pjobj.jobSpec = dict(zip(status._attributes,status.values()))
            for k in pjobj.jobSpec.keys():
                if type(pjobj.jobSpec[k]) not in [type(''),type(1)]:
                    pjobj.jobSpec[k]=str(pjobj.jobSpec[k])

            if pjobj not in job.backend.pandajobs:
                job.backend.pandajobs.append(pjobj)
            else:
                logger.debug("Panda job %s already exists locally" % pjobj.id)
                
            num_pjobs += 1
        else:
            logger.warning("getFullJobStatus returned unsupported status %s for Panda job %s " %(status.jobStatus, status.PandaID) )
            
        ick = True

    return (ick, jstatus, num_pjobs)
Example No. 26
	def jobIsFinished(self, ID):
		L = Client.getPandIDsWithJobID(ID)[1]

		if L is None:
			result = False
		else:
			result = True

			for item in L:
				if L[item][0] != 'finished':
					result = False
					pass

		return result
Example No. 27
	def jobIsCancelled(self, ID):
		L = Client.getPandIDsWithJobID(ID)[1]

		if L is None:
			result = False
		else:
			result = False

			for item in L:
				if L[item][0] == 'cancelled':
					result = True
					pass

		return result
Example No. 28
    def submit_panda_task(self, processing):
        try:
            from pandatools import Client

            proc = processing['processing_metadata']['processing']
            task_param = proc.processing_metadata['task_param']
            return_code = Client.insertTaskParams(task_param, verbose=True)
            if return_code[0] == 0:
                return return_code[1][1]
            else:
                self.logger.warn("submit_panda_task, return_code: %s" %
                                 str(return_code))
        except Exception as ex:
            self.logger.error(ex)
            self.logger.error(traceback.format_exc())
            # raise exceptions.AgentPluginError('%s: %s' % (str(ex), traceback.format_exc()))
        return None
Example No. 29
 def debug(self,pandaID,modeOn):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
         self.gridPassPhrase,
         False,
         self.verbose,
         useCache=True)
     # set debug mode
     status,output = Client.setDebugMode(pandaID,modeOn,self.verbose)
     if status != 0:
         tmpLog.error(output)
         tmpLog.error("Failed to set debug mode for %s" % pandaID)
         return
     # done
     tmpLog.info(output)
     return
Example No. 30
  def change_site_status(self, site, new_status):
    if new_status not in ('test', 'online'):
      return False
    self.add_log('Changing %s status to %s:' % (site, new_status))
    if self.debug:
      self.add_log('DEBUG mode')
      return True
    now = int(time.time())
    if new_status == 'test':
      try:
        last_exclusion = int(Site.objects.filter(name=site)[0].getSiteOptions_for_site.filter(option_name='last_exclusion')[0].option_value)
      except:
        last_exclusion = 0
      if now - last_exclusion < 21600:
        self.add_log('%s was recently auto-excluded. Skipping...' % site)
        return False

    cmd = "curl -s -k --cert $X509_USER_PROXY 'https://panda.cern.ch:25943/server/controller/query?tpmes=setmanual&queue=%s&moduser=HammerCloud&comment=HC.Blacklist.set.manual'" % site
    self.add_log('> ' + cmd)
    if not self.debug:
      o = commands.getoutput(cmd)
      self.add_log(o)
    cmd = "curl -s -k --cert $X509_USER_PROXY 'https://panda.cern.ch:25943/server/controller/query?tpmes=set%s&queue=%s&moduser=HammerCloud&comment=HC.Blacklist.set.%s'" % (new_status, site, new_status)
    self.add_log('> ' + cmd)
    if not self.debug:
      o = commands.getoutput(cmd)
      self.add_log(o)
    if new_status == 'test':
      try:
        option = Site.objects.filter(name=site)[0].getSiteOptions_for_site.filter(option_name='last_exclusion')[0]
        option.option_value = now
      except:
        option = SiteOption()
        option.option_name = 'last_exclusion'
        option.option_value = now
        option.site = Site.objects.get(name=site)
      option.save()

    Client.PandaSites = Client.getSiteSpecs(SITETYPE)[1]
    if not self.debug and Client.PandaSites[site]['status'] != new_status:
      self.store_log('Error setting %s to %s' % (site, new_status), 'error')
      return False
     
    return True
Example No. 31
    def reactivate_processing(self, processing):
        try:
            if processing:
                from pandatools import Client
                # task_id = processing['processing_metadata']['task_id']
                proc = processing['processing_metadata']['processing']
                task_id = proc.workload_id

                # Client.retryTask(task_id)
                status, out = Client.retryTask(task_id, newParams={})
                self.logger.warn(
                    "Retry processing(%s) with task id(%s): %s, %s" %
                    (processing['processing_id'], task_id, status, out))
                # Client.reactivateTask(task_id)
                # Client.resumeTask(task_id)
        except Exception as ex:
            msg = "Failed to check the processing (%s) status: %s" % (str(
                processing['processing_id']), str(ex))
            raise exceptions.IDDSException(msg)
Example No. 32
    def submit_panda_task(self, processing):
        try:
            from pandatools import Client

            status, tmpOut = Client.insertTaskParams(self.panda_task_paramsmap,
                                                     False, True)
            if status == 0:
                tmp_status, tmp_output = tmpOut
                m = re.search("jediTaskID=(\d+)", tmp_output)  # noqa W605
                task_id = int(m.group(1))
                processing.workload_id = task_id
            else:
                self.add_errors(tmpOut)
                raise Exception(tmpOut)
        except Exception as ex:
            self.logger.error(ex)
            self.logger.error(traceback.format_exc())
            raise exceptions.IDDSException('%s: %s' %
                                           (str(ex), traceback.format_exc()))
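The example above pulls the new task ID out of the text returned by Client.insertTaskParams by searching for 'jediTaskID=&lt;number&gt;'. A standalone sketch of that extraction step (the reply string is a hypothetical stand-in for the real server output):

import re

tmp_output = 'succeeded. new jediTaskID=12345678'  # hypothetical server reply
m = re.search(r'jediTaskID=(\d+)', tmp_output)
task_id = int(m.group(1)) if m else None
assert task_id == 12345678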
Example No. 33
 def getUserJobMetadata(self, jobID, output_filename):
     job = self.getJobInfo(jobID)
     # get logger
     tmpLog = PLogger.getPandaLogger()
     if job is None:
         tmpLog.error('cannot find a task with {0}. May need to sync first'.format(jobID))
         return False
     # get metadata
     task_id = job.jediTaskID
     tmpLog.info('getting metadata')
     status, metadata = Client.getUserJobMetadata(task_id, verbose=self.verbose)
     if status != 0:
         tmpLog.error(metadata)
         tmpLog.error("Failed to get metadata")
         return False
     with open(output_filename, 'w') as f:
         json.dump(metadata, f)
     tmpLog.info('dumped to {0}'.format(output_filename))
     # return
     return True
Example No. 34
 def setMergeJobStatus(self,job,forceUpdate=False):
     # only when merge job generation is active
     if not forceUpdate and not job.activeMergeGen():
         return True
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check status of merge job generation
     status,genStauts = Client.checkMergeGenerationStatus(job.JobID,verbose=self.verbose)
     if status != 0:
         tmpLog.error(genStauts)
         tmpLog.error("Failed to check status of merge job generation for JobID=%s" % job.JobID)
         return False
     # set status
     job.mergeJobStatus = genStauts['status']
     # set merge job IDs
     if genStauts['mergeIDs'] != []:
         job.mergeJobID = ''
         for tmpID in genStauts['mergeIDs']:
             job.mergeJobID += '%s,' % tmpID
         job.mergeJobID = job.mergeJobID[:-1]
     # return
     return True
Example No. 35
 def rebrokerage(self,JobsetID,cloud):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
             self.gridPassPhrase,    
             False,
             self.verbose,
             useCache=True)
     # get jobset
     jobList = self.getJobIDsWithSetID(JobsetID)
     if jobList == None:
         jobList = [JobsetID]
     else:
         tmpMsg = "JobsetID=%s is composed of JobID=" % JobsetID
         for tmpJobID in jobList:
             tmpMsg += '%s,' % tmpJobID
         tmpMsg = tmpMsg[:-1]
         tmpLog.info(tmpMsg)
     for JobID in jobList:    
         # get job info using status
         job = self.status(JobID)
         if job == None:
             # not found
             continue
         # skip frozen job
         if job.dbStatus == 'frozen':
             tmpLog.info('All subJobs in JobID=%s already finished/failed' % JobID)
             continue
         # rebrokerage
         tmpLog.info('Sending rebrokerage request ...')
         status,output = Client.runReBrokerage(JobID,job.libDS,cloud,self.verbose)
         if status != 0:
             tmpLog.error(output)
             tmpLog.error("Failed to reassign JobID=%s" % JobID)
             return
         # done
         tmpLog.info('Done for %s' % JobID)
     return
Example No. 36
    def master_kill(self):
        '''Kill jobs'''  

        from pandatools import Client

        job = self.getJobObject()
        logger.debug('Killing job %s' % job.getFQID('.'))

        active_status = [ None, 'registered', 'waiting', 'defined', 'pending', 'assigning', 'ready', 'scouting', 'running', 'holding', 'merging', 'prepared', 'aborting', 'finishing' ]
        #active_status = [ None, 'defined', 'unknown', 'assigned', 'waiting', 'activated', 'sent', 'starting', 'running', 'holding', 'transferring' ]

        if self.id and self.status in active_status: 
            status, output = Client.killTask(self.id)
            if status:
                logger.error('Failed killing job (status = %d)',status)
                return False
            else:
                logger.info('Killing Jedi task %s, Server returned: %s' %(self.id, output))
        else:
            logger.error('Cannot kill Jedi job %s since it is not in active status', self.id)

        return True
Example No. 37
    def get_status_changed_contents(self, unterminated_job_ids,
                                    input_output_maps, panda_id_to_map_ids):
        self.logger.debug(
            "get_status_changed_contents, unterminated_job_ids: %s" %
            str(unterminated_job_ids))
        from pandatools import Client

        full_update_contents = []
        chunksize = 2000
        chunks = [
            unterminated_job_ids[i:i + chunksize]
            for i in range(0, len(unterminated_job_ids), chunksize)
        ]
        for chunk in chunks:
            jobs_list = Client.getJobStatus(chunk, verbose=0)[1]
            for job_info in jobs_list:
                panda_id = job_info.PandaID
                map_id = panda_id_to_map_ids[panda_id]
                update_contents = self.get_update_contents_from_map_id(
                    map_id, input_output_maps, job_info)
                full_update_contents += update_contents
        return full_update_contents
Example No. 38
def getDBDatasets(jobO, trf, dbrelease):
    from pandatools import Client

    # get DB datasets
    dbrFiles = {}
    dbrDsList = []
    if trf or dbrelease != '':
        if trf:
            # parse jobO for TRF
            tmpItems = jobO.split()
        else:
            # mimic a trf parameter to reuse following algorithm
            tmpItems = ['%DB=' + dbrelease]
        # look for DBRelease
        for tmpItem in tmpItems:
            match = re.search('%DB=([^:]+):(.+)$', tmpItem)
            if match:
                tmpDbrDS = match.group(1)
                tmpDbrLFN = match.group(2)
                # get files in the dataset
                if not tmpDbrDS in dbrDsList:
                    logger.info("Querying files in dataset:%s" % tmpDbrDS)
                    try:
                        tmpList = Client.queryFilesInDataset(tmpDbrDS, False)
                    except:
                        raise ApplicationConfigurationError(
                            None,
                            "ERROR : error while looking up dataset %s. Perhaps this dataset does not exist?"
                            % tmpDbrDS)
                    # append
                    for tmpLFN, tmpVal in tmpList.iteritems():
                        dbrFiles[tmpLFN] = tmpVal
                    dbrDsList.append(tmpDbrDS)
                # check
                if tmpDbrLFN not in dbrFiles:
                    raise ApplicationConfigurationError(
                        None,
                        "ERROR : %s is not in %s" % (tmpDbrLFN, tmpDbrDS))
    return dbrFiles, dbrDsList
Example No. 39
    def rebrokerage(self, JobsetID, cloud):
        # get logger
        tmpLog = PLogger.getPandaLogger()
        # check proxy
        self.gridPassPhrase, self.vomsFQAN = PsubUtils.checkGridProxy(
            self.gridPassPhrase, False, self.verbose)
        # get jobset
        jobList = self.getJobIDsWithSetID(JobsetID)
        if jobList == None:
            jobList = [JobsetID]
        else:
            tmpMsg = "JobsetID=%s is composed of JobID=" % JobsetID
            for tmpJobID in jobList:
                tmpMsg += '%s,' % tmpJobID
            tmpMsg = tmpMsg[:-1]
            tmpLog.info(tmpMsg)
        for JobID in jobList:
            # get job info using status
            job = self.status(JobID)
            if job == None:
                # not found
                continue
            # skip frozen job
            if job.dbStatus == 'frozen':
                tmpLog.info('All subJobs in JobID=%s already finished/failed' %
                            JobID)
                continue
            # rebrokerage
            tmpLog.info('Sending rebrokerage request ...')
            status, output = Client.runReBrokerage(JobID, job.libDS, cloud,
                                                   self.verbose)
            if status != 0:
                tmpLog.error(output)
                tmpLog.error("Failed to reassign JobID=%s" % JobID)
                return
            # done
            tmpLog.info('Done for %s' % JobID)
        return
Example No. 40
def ELG_jediState(sample) :

    from pandatools import PandaToolsPkgInfo
    if int(float(PandaToolsPkgInfo.release_version[2])) < 4 :
        print "Need prun with JEDI support, try:"
        print "    localSetupPandaClient currentJedi --noAthenaCheck"
        return ''

    jediTaskID = int(sample.getMetaDouble("nc_jediTaskID", 0))

    if jediTaskID < 100 :
        print "Sample " + sample.name() + " does not have a jediTaskID"
        return ''

    from pandatools import Client

    taskDict = {}
    taskDict['jediTaskID'] = jediTaskID
    ret = Client.getJediTaskDetails(taskDict, False, True)
    if ret[0] != 0 :
        print "Problem checking status of task %s with id %s" % (sample.name(), jediTaskID)
        return ''

    return ret[1]['status']
Example No. 41
def expandExcludedSiteList( job ):
    '''Expand a site list taking wildcards into account'''
                
    # first, check if there's anything to be done
    check_ddm = False
    wildcard = False
    excl_sites = []
    for s in job.backend.requirements.excluded_sites:
        if s.find("ANALY_") == -1:
            check_ddm = True

        if s.find("*") != -1:
            wildcard = True
            
        if s.find("ANALY_") != -1 and s.find("*") == -1:
            excl_sites.append(s)

    if not check_ddm and not wildcard:
        return excl_sites

    # we have either wildcards or DDM sites listed
    # First, find the allowed sites for this job and ensure no duplicates anywhere
    from pandatools import Client
    logger.info("Excluding DDM and wildcarded sites from Jedi job. Please wait....")
    orig_ddm_list = []
    new_ddm_list = []
    for s in job.inputdata.get_locations():
        if not s in orig_ddm_list:
            orig_ddm_list.append(s)
            new_ddm_list.append(s)

    orig_panda_list = []
    for s in [Client.convertDQ2toPandaID(x) for x in new_ddm_list]:        
        for s2 in Client.PandaSites.keys():
            if s2.find(s) != -1 and not s2 in orig_panda_list:
                orig_panda_list.append(s2)

    if check_ddm:
        # remove any DDM sites that are referenced, including wildcards
        for s in job.backend.requirements.excluded_sites:
            if s in orig_ddm_list:
                new_ddm_list.remove(s)

            if s.find("*") != -1:
                for s2 in orig_ddm_list:
                    if fnmatch.fnmatch(s2, s):
                        new_ddm_list.remove(s2)
                        
        # now recreate the panda list and see if any have been dropped
        new_panda_list = []
        for s in [Client.convertDQ2toPandaID(x) for x in new_ddm_list]:        
            for s2 in Client.PandaSites.keys():
                if s2.find(s) != -1 and not s2 in new_panda_list:
                    new_panda_list.append(s2)

        for s in orig_panda_list:
            if not s in new_panda_list and not s in excl_sites:
                excl_sites.append(s)
                
    if wildcard:
        # find wildcarded ANALY_* sites and exclude any that match good sites
        for s in job.backend.requirements.excluded_sites:
            if s.find("*") != -1:
                for s2 in orig_panda_list:
                    if fnmatch.fnmatch(s2, s) and not s2 in excl_sites:
                        excl_sites.append(s2)
                        
    return excl_sites
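expandExcludedSiteList relies on fnmatch to expand wildcard patterns such as ANALY_* against the known site names. A standalone sketch of that matching step (the site names and pattern below are made up):

import fnmatch

panda_sites = ['ANALY_CERN', 'ANALY_BNL_LONG', 'UKI-LT2-QMUL']
pattern = 'ANALY_*'  # hypothetical entry from excluded_sites
excluded = [s for s in panda_sites if fnmatch.fnmatch(s, pattern)]
assert excluded == ['ANALY_CERN', 'ANALY_BNL_LONG']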
Example No. 42
    def get_pick_dataset(self, verbose=False):
        
        job = self._getParent()
        if job and job.inputdata and job.inputdata.pick_event_list.name != '' and  len(job.inputdata.dataset) !=0 :
            raise ApplicationConfigurationError('Cannot use event pick list and input dataset at the same time.')

        # parameter check for event picking
        if job and job.inputdata and job.inputdata.pick_event_list.name != '':
            if job.inputdata.pick_data_type == '':
                raise ApplicationConfigurationError('Event pick data type (pick_data_type) must be specified.')

        # set X509_USER_PROXY
        from pandatools import Client
        if 'X509_USER_PROXY' not in os.environ or os.environ['X509_USER_PROXY'] == '':
            os.environ['X509_USER_PROXY'] = Client._x509()

        # setup eventLookup
        from pandatools.eventLookupClient import eventLookupClient
        elssiIF = eventLookupClient()
        # open run/event txt
        runEvtList = []
        if os.path.exists( self.pick_event_list.name ):
            logger.info("Event pick list file %s selected" % self.pick_event_list.name)
            runevttxt = open(self.pick_event_list.name)
            for line in runevttxt:
                items = line.split()
                if len(items) != 2:
                    continue
                runNr,evtNr = items
                runEvtList.append([runNr,evtNr])
                # close
            runevttxt.close()

        else:
            raise ApplicationConfigurationError('Could not read event pick list file %s.' %self.pick_event_list.name)

        # convert self.pick_data_type to Athena stream ref
        if self.pick_data_type == 'AOD':
            streamRef = 'StreamAOD_ref'
        elif self.pick_data_type == 'ESD':
            streamRef = 'StreamESD_ref'
        elif self.pick_data_type == 'RAW':
            streamRef = 'StreamRAW_ref'
        else:
            errStr  = 'invalid data type %s for event picking. ' % self.pick_data_type
            errStr += ' Must be one of AOD,ESD,RAW'
            raise ApplicationConfigurationError(errStr)
        logger.info('Getting dataset names and LFNs from ELSSI for event picking')

        # read
        guids = []
        guidRunEvtMap = {}
        runEvtGuidMap = {}
        # bulk lookup
        nEventsPerLoop = 500
        iEventsTotal = 0
        while iEventsTotal < len(runEvtList):
            tmpRunEvtList = runEvtList[iEventsTotal:iEventsTotal+nEventsPerLoop]
            iEventsTotal += nEventsPerLoop
            paramStr = 'Run, Evt: %s, Stream: %s, Dataset pattern: %s' % (tmpRunEvtList,self.pick_stream_name, self.pick_dataset_pattern)
            logger.debug(paramStr)
            logger.info('.')
            # check with ELSSI
            if self.pick_stream_name == '':
                guidListELSSI = elssiIF.doLookup(tmpRunEvtList,tokens=streamRef,amitag=self.event_pick_amitag,extract=True)
            else:
                guidListELSSI = elssiIF.doLookup(tmpRunEvtList,stream=self.pick_stream_name,tokens=streamRef,amitag=self.event_pick_amitag,extract=True)

            if guidListELSSI is None or len(guidListELSSI) == 0:
                errStr = ''
                for tmpLine in elssiIF.output:
                    errStr += tmpLine + '\n'
                errStr = "GUID was not found in ELSSI.\n" + errStr
                raise ApplicationConfigurationError(errStr)

            # check attribute
            attrNames, attrVals = guidListELSSI
            def getAttributeIndex(attr):
                for tmpIdx,tmpAttrName in enumerate(attrNames):
                    if tmpAttrName.strip() == attr:
                        return tmpIdx
                errStr = "cannot find attribute=%s in %s provided by ELSSI" % (attr,str(attrNames))
                raise ApplicationConfigurationError(errStr)
            # get index
            indexEvt = getAttributeIndex('EventNumber')
            indexRun = getAttributeIndex('RunNumber')
            indexTag = getAttributeIndex(streamRef)
            # check events
            for runNr,evtNr in tmpRunEvtList:
                paramStr = 'Run:%s Evt:%s Stream:%s' % (runNr,evtNr,self.pick_stream_name)
                # collect GUIDs
                tmpguids = []
                for attrVal in attrVals:
                    if runNr == attrVal[indexRun] and evtNr == attrVal[indexEvt]:
                        tmpGuid = attrVal[indexTag]
                        # check non existing
                        if tmpGuid == 'NOATTRIB':
                            continue
                        if not tmpGuid in tmpguids:
                            tmpguids.append(tmpGuid)
                # not found
                if tmpguids == []:
                    errStr = "no GUIDs were found in ELSSI for %s" % paramStr
                    raise ApplicationConfigurationError(errStr)
                # append
                for tmpguid in tmpguids:
                    if not tmpguid in guids:
                        guids.append(tmpguid)
                        guidRunEvtMap[tmpguid] = []
                    guidRunEvtMap[tmpguid].append((runNr,evtNr))
                runEvtGuidMap[(runNr,evtNr)] = tmpguids

        # convert to dataset names and LFNs
        dsLFNs,allDSMap = Client.listDatasetsByGUIDs(guids,self.pick_dataset_pattern,verbose)
        logger.debug(dsLFNs)
        
        # populate DQ2Dataset
        if job and job.inputdata:
            job.inputdata.dataset = []
            job.inputdata.names = []
            job.inputdata.guids = []

            # check duplication
            for runNr,evtNr in runEvtGuidMap.keys():
                tmpLFNs = []
                tmpAllDSs = {}
                for tmpguid in runEvtGuidMap[(runNr,evtNr)]:
                    if tmpguid in dsLFNs:
                        tmpLFNs.append(dsLFNs[tmpguid])
                        job.inputdata.guids.append(tmpguid)
                        job.inputdata.names.append(dsLFNs[tmpguid][1])
                        if not ((dsLFNs[tmpguid][0]) in job.inputdata.dataset):
                            job.inputdata.dataset.append(dsLFNs[tmpguid][0])
                    else:
                        tmpAllDSs[tmpguid] = allDSMap[tmpguid]
                        if tmpguid in guidRunEvtMap:
                            del guidRunEvtMap[tmpguid]
                # empty        
                if tmpLFNs == []:
                    paramStr = 'Run:%s Evt:%s Stream:%s' % (runNr,evtNr,self.pick_stream_name)                        
                    errStr = "Dataset pattern '%s' didn't pick up a file for %s\n" % (self.pick_dataset_pattern,paramStr)
                    for tmpguid,tmpAllDS in tmpAllDSs.iteritems():
                        errStr += "    GUID:%s dataset:%s\n" % (tmpguid,str(tmpAllDS))
                    raise ApplicationConfigurationError(errStr)
                # duplicated    
                if len(tmpLFNs) != 1:
                    paramStr = 'Run:%s Evt:%s Stream:%s' % (runNr,evtNr,self.pick_stream_name)            
                    errStr = "Multiple LFNs %s were found in ELSSI for %s. Please set pick_dataset_pattern and/or pick_stream_name and or event_pick_amitag correctly." %(str(tmpLFNs),paramStr)
                    raise ApplicationConfigurationError(errStr)

        
        # return
        return guidRunEvtMap
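get_pick_dataset reads the event pick list as plain text with one '&lt;run&gt; &lt;event&gt;' pair per line and silently skips anything else. A standalone sketch of that parsing step (the run/event values are invented):

lines = ['215456 12345678', '215456 12345679', 'malformed line here']
runEvtList = []
for line in lines:
    items = line.split()
    if len(items) != 2:
        continue  # skip lines that are not exactly "<run> <event>"
    runNr, evtNr = items
    runEvtList.append([runNr, evtNr])
assert runEvtList == [['215456', '12345678'], ['215456', '12345679']]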
Example No. 43
    def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):
        """Prepare the specific aspec of each subjob.
           Returns: subjobconfig list of objects understood by backends."""

        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec
        from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2_set_dataset_lifetime
        from GangaPanda.Lib.Panda.Panda import refreshPandaSpecs

        # make sure we have the correct siteType
        refreshPandaSpecs()

        job = app._getParent()
        masterjob = job._getRoot()

        logger.debug('ProdTransPandaRTHandler prepare called for %s',
                     job.getFQID('.'))

        job.backend.actualCE = job.backend.site
        job.backend.requirements.cloud = Client.PandaSites[
            job.backend.site]['cloud']

        # check that the site is in a submit-able status
        if not job.splitter or job.splitter._name != 'DQ2JobSplitter':
            allowed_sites = job.backend.list_ddm_sites()

        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
            tmpDsExist = False
            if (configPanda['processingType'].startswith('gangarobot') or
                    configPanda['processingType'].startswith('hammercloud')):
                #if Client.getDatasets(job.outputdata.datasetname):
                if getDatasets(job.outputdata.datasetname):
                    tmpDsExist = True
                    logger.info('Re-using output dataset %s' %
                                job.outputdata.datasetname)
            if not configPanda[
                    'specialHandling'] == 'ddm:rucio' and not configPanda[
                        'processingType'].startswith(
                            'gangarobot'
                        ) and not configPanda['processingType'].startswith(
                            'hammercloud') and not configPanda[
                                'processingType'].startswith('rucio_test'):
                Client.addDataset(job.outputdata.datasetname,
                                  False,
                                  location=outDsLocation,
                                  allowProdDisk=True,
                                  dsExist=tmpDsExist)
            logger.info('Output dataset %s registered at %s' %
                        (job.outputdata.datasetname, outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname, outDsLocation)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in adding dataset %s: %s %s' %
                (job.outputdata.datasetname, sys.exc_info()[0],
                 sys.exc_info()[1]))

        # JobSpec.
        jspec = JobSpec()
        jspec.currentPriority = app.priority
        jspec.jobDefinitionID = masterjob.id
        jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
        jspec.coreCount = app.core_count
        jspec.AtlasRelease = 'Atlas-%s' % app.atlas_release
        jspec.homepackage = app.home_package
        jspec.transformation = app.transformation
        jspec.destinationDBlock = job.outputdata.datasetname
        if job.outputdata.location:
            jspec.destinationSE = job.outputdata.location
        else:
            jspec.destinationSE = job.backend.site
        if job.inputdata:
            jspec.prodDBlock = job.inputdata.dataset[0]
        else:
            jspec.prodDBlock = 'NULL'
        if app.prod_source_label:
            jspec.prodSourceLabel = app.prod_source_label
        else:
            jspec.prodSourceLabel = configPanda['prodSourceLabelRun']
        jspec.processingType = configPanda['processingType']
        jspec.specialHandling = configPanda['specialHandling']
        jspec.computingSite = job.backend.site
        jspec.cloud = job.backend.requirements.cloud
        jspec.cmtConfig = app.atlas_cmtconfig
        if app.dbrelease == 'LATEST':
            try:
                latest_dbrelease = getLatestDBReleaseCaching()
            except:
                from pandatools import Client
                latest_dbrelease = Client.getLatestDBRelease()
            m = re.search('(.*):DBRelease-(.*)\.tar\.gz', latest_dbrelease)
            if m:
                self.dbrelease_dataset = m.group(1)
                self.dbrelease = m.group(2)
            else:
                raise ApplicationConfigurationError(
                    None,
                    "Error retrieving LATEST DBRelease. Try setting application.dbrelease manually."
                )
        else:
            self.dbrelease_dataset = app.dbrelease_dataset
            self.dbrelease = app.dbrelease
        jspec.jobParameters = app.job_parameters

        if self.dbrelease:
            if self.dbrelease == 'current':
                jspec.jobParameters += ' --DBRelease=current'
            else:
                if jspec.transformation.endswith(
                        "_tf.py") or jspec.transformation.endswith("_tf"):
                    jspec.jobParameters += ' --DBRelease=DBRelease-%s.tar.gz' % (
                        self.dbrelease, )
                else:
                    jspec.jobParameters += ' DBRelease=DBRelease-%s.tar.gz' % (
                        self.dbrelease, )
                dbspec = FileSpec()
                dbspec.lfn = 'DBRelease-%s.tar.gz' % self.dbrelease
                dbspec.dataset = self.dbrelease_dataset
                dbspec.prodDBlock = jspec.prodDBlock
                dbspec.type = 'input'
                jspec.addFile(dbspec)

        if job.inputdata:
            m = re.search('(.*)\.(.*)\.(.*)\.(.*)\.(.*)\.(.*)',
                          job.inputdata.dataset[0])
            if not m:
                logger.error("Error retrieving run number from dataset name")
                #raise ApplicationConfigurationError(None, "Error retrieving run number from dataset name")
                runnumber = 105200
            else:
                runnumber = int(m.group(2))
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --runNumber %d' % runnumber
            else:
                jspec.jobParameters += ' RunNumber=%d' % runnumber

        # Output files.
        randomized_lfns = []
        ilfn = 0
        for lfn, lfntype in zip(app.output_files, app.output_type):
            ofspec = FileSpec()
            if app.randomize_lfns:
                randomized_lfn = lfn + (
                    '.%s.%d.%s' %
                    (job.backend.site, int(time.time()),
                     commands.getoutput('uuidgen 2> /dev/null')[:4]))
            else:
                randomized_lfn = lfn
            ofspec.lfn = randomized_lfn
            randomized_lfns.append(randomized_lfn)
            ofspec.destinationDBlock = jspec.destinationDBlock
            ofspec.destinationSE = jspec.destinationSE
            ofspec.dataset = jspec.destinationDBlock
            ofspec.type = 'output'
            jspec.addFile(ofspec)
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --output%sFile %s' % (
                    lfntype, randomized_lfns[ilfn])
            else:
                jspec.jobParameters += ' output%sFile=%s' % (
                    lfntype, randomized_lfns[ilfn])
            ilfn = ilfn + 1

        # Input files.
        if job.inputdata:
            for guid, lfn, size, checksum, scope in zip(
                    job.inputdata.guids, job.inputdata.names,
                    job.inputdata.sizes, job.inputdata.checksums,
                    job.inputdata.scopes):
                ifspec = FileSpec()
                ifspec.lfn = lfn
                ifspec.GUID = guid
                ifspec.fsize = size
                ifspec.md5sum = checksum
                ifspec.scope = scope
                ifspec.dataset = jspec.prodDBlock
                ifspec.prodDBlock = jspec.prodDBlock
                ifspec.type = 'input'
                jspec.addFile(ifspec)
            if app.input_type:
                itype = app.input_type
            else:
                itype = m.group(5)
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --input%sFile %s' % (itype, ','.join(
                    job.inputdata.names))
            else:
                jspec.jobParameters += ' input%sFile=%s' % (itype, ','.join(
                    job.inputdata.names))

        # Log files.
        lfspec = FileSpec()
        lfspec.lfn = '%s.job.log.tgz' % jspec.jobName
        lfspec.destinationDBlock = jspec.destinationDBlock
        lfspec.destinationSE = jspec.destinationSE
        lfspec.dataset = jspec.destinationDBlock
        lfspec.type = 'log'
        jspec.addFile(lfspec)

        return jspec
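
# A minimal sketch (not part of any handler above) of how the LATEST DBRelease
# string returned by Client.getLatestDBRelease() is split into the dataset name
# and the release number, mirroring the regex used in prepare(). The helper
# name is illustrative only.
import re
from pandatools import Client

def resolve_latest_dbrelease():
    # expected form: '<dataset>:DBRelease-<version>.tar.gz'
    latest = Client.getLatestDBRelease()
    m = re.search(r'(.*):DBRelease-(.*)\.tar\.gz', latest)
    if not m:
        raise ValueError("unexpected DBRelease string: %s" % latest)
    return m.group(1), m.group(2)  # (dbrelease_dataset, dbrelease)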
Ejemplo n.º 44
0
    def master_resubmit(self, jobs):
        '''Resubmit failed Jedi job'''
        from pandatools import Client

        jobIDs = {}
        for job in jobs:
            jobIDs[job.backend.id] = job

        allJobIDs = jobIDs.keys()
        pandaJobIDs = {}
        for jID in allJobIDs:
            status, jediTaskDict = Client.getJediTaskDetails(
                {'jediTaskID': jID}, False, True, verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                raise BackendError(
                    'Jedi',
                    'Return code %d retrieving job status information.' %
                    status)

            # Retrieve job
            job = jobIDs[jediTaskDict['jediTaskID']]

            newJobsetID = -1  # get jobset
            retryJobs = []  # jspecs
            resubmittedJobs = []  # ganga jobs

            if jediTaskDict['status'] in [
                    'failed', 'killed', 'cancelled', 'aborted', 'broken',
                    'finished'
            ]:
                retryJobs.append(job)
                resubmittedJobs.append(jID)
            #elif jediTaskDict['status'] == 'finished':
            #    pass
            else:
                logger.warning("Cannot resubmit. Jedi task %s is status %s." %
                               (jID, jediTaskDict['status']))
                return False

            # submit
            if len(retryJobs) == 0:
                logger.warning("No failed jobs to resubmit")
                return False

            status, out = Client.retryTask(jID, verbose=False)
            if status != 0:
                logger.error(status)
                logger.error(out)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            tmpStat, tmpDiag = out
            if not tmpStat:
                logger.error(tmpDiag)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            logger.info(tmpDiag)

            job.backend.status = None
            job.backend.jobSpec = {}
            job.updateStatus('submitted')

        logger.info('Resubmission successful')
        return True
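
# Hedged sketch of the two-level result check around Client.retryTask() used
# in master_resubmit(): the call returns a transport status plus an
# (ok, diagnostic) pair, and both have to be good before the retry counts as
# successful. The helper name and jediTaskID argument are illustrative.
from pandatools import Client

def retry_jedi_task(jediTaskID):
    status, out = Client.retryTask(jediTaskID, verbose=False)
    if status != 0:
        # communication-level failure
        return False, str(out)
    ok, diag = out  # server-side verdict and diagnostic message
    return bool(ok), diag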
Ejemplo n.º 45
0
    def master_prepare(self, app, appconfig):
        '''Prepare the master job'''

        from pandatools import Client
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('ExecutablePandaRTHandler master_prepare called for %s',
                     job.getFQID('.'))

        # set chirp variables
        if configPanda['chirpconfig'] or configPanda['chirpserver']:
            setChirpVariables()

#       Pack inputsandbox
        inputsandbox = 'sources.%s.tar' % commands.getoutput(
            'uuidgen 2> /dev/null')
        inpw = job.getInputWorkspace()
        # add user script to inputsandbox
        if hasattr(job.application.exe, "name"):
            if not job.application.exe in job.inputsandbox:
                job.inputsandbox.append(job.application.exe)

        for fname in [f.name for f in job.inputsandbox]:
            fname = fname.rstrip(os.sep)  # drop any trailing path separator before splitting
            path = fname[:fname.rfind(os.sep)]
            f = fname[fname.rfind(os.sep) + 1:]
            rc, output = commands.getstatusoutput(
                'tar rf %s -C %s %s' % (inpw.getPath(inputsandbox), path, f))
            if rc:
                logger.error('Packing inputsandbox failed with status %d', rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    None, 'Packing inputsandbox failed.')
        if len(job.inputsandbox) > 0:
            rc, output = commands.getstatusoutput('gzip %s' %
                                                  (inpw.getPath(inputsandbox)))
            if rc:
                logger.error('Packing inputsandbox failed with status %d', rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    None, 'Packing inputsandbox failed.')
            inputsandbox += ".gz"
        else:
            inputsandbox = None

#       Upload Inputsandbox
        if inputsandbox:
            logger.debug('Uploading source tarball ...')
            uploadSources(inpw.getPath(), os.path.basename(inputsandbox))
            self.inputsandbox = inputsandbox
        else:
            self.inputsandbox = None

#       input dataset
        if job.inputdata:
            if job.inputdata._name != 'DQ2Dataset':
                raise ApplicationConfigurationError(
                    None, 'PANDA application supports only DQ2Datasets')

        # run brokerage here if not splitting
        if not job.splitter:
            from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
            runPandaBrokerage(job)
        elif job.splitter._name not in [
                'DQ2JobSplitter', 'ArgSplitter', 'ArgSplitterTask'
        ]:
            raise ApplicationConfigurationError(
                None, 'Panda splitter must be DQ2JobSplitter or ArgSplitter')

        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError(
                None, 'site is still AUTO after brokerage!')

#       output dataset
        if job.outputdata:
            if job.outputdata._name != 'DQ2OutputDataset':
                raise ApplicationConfigurationError(
                    None, 'Panda backend supports only DQ2OutputDataset')
        else:
            logger.info('Adding missing DQ2OutputDataset')
            job.outputdata = DQ2OutputDataset()

        job.outputdata.datasetname, outlfn = dq2outputdatasetname(
            job.outputdata.datasetname, job.id, job.outputdata.isGroupDS,
            job.outputdata.groupname)

        self.outDsLocation = Client.PandaSites[job.backend.site]['ddm']

        try:
            Client.addDataset(job.outputdata.datasetname,
                              False,
                              location=self.outDsLocation)
            logger.info('Output dataset %s registered at %s' %
                        (job.outputdata.datasetname, self.outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname,
                                     location=self.outDsLocation)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in Client.addDataset %s: %s %s' %
                (job.outputdata.datasetname, sys.exc_info()[0],
                 sys.exc_info()[1]))

        # handle the libds
        if job.backend.libds:
            self.libDataset = job.backend.libds
            self.fileBO = getLibFileSpecFromLibDS(self.libDataset)
            self.library = self.fileBO.lfn
        elif job.backend.bexec:
            self.libDataset = job.outputdata.datasetname + '.lib'
            self.library = '%s.tgz' % self.libDataset
            try:
                Client.addDataset(self.libDataset,
                                  False,
                                  location=self.outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset,
                                         location=self.outDsLocation)
                logger.info('Lib dataset %s registered at %s' %
                            (self.libDataset, self.outDsLocation))
            except exceptions.SystemExit:
                raise BackendError(
                    'Panda', 'Exception in Client.addDataset %s: %s %s' %
                    (self.libDataset, sys.exc_info()[0], sys.exc_info()[1]))

        # collect extOutFiles
        self.extOutFile = []
        for tmpName in job.outputdata.outputdata:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.outputsandbox:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.backend.extOutFile:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        # create build job
        if job.backend.bexec != '':
            jspec = JobSpec()
            jspec.jobDefinitionID = job.id
            jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
            jspec.transformation = '%s/buildGen-00-00-01' % Client.baseURLSUB
            if Client.isDQ2free(job.backend.site):
                jspec.destinationDBlock = '%s/%s' % (
                    job.outputdata.datasetname, self.libDataset)
                jspec.destinationSE = 'local'
            else:
                jspec.destinationDBlock = self.libDataset
                jspec.destinationSE = job.backend.site
            jspec.prodSourceLabel = configPanda['prodSourceLabelBuild']
            jspec.processingType = configPanda['processingType']
            jspec.assignedPriority = configPanda['assignedPriorityBuild']
            jspec.computingSite = job.backend.site
            jspec.cloud = job.backend.requirements.cloud
            jspec.jobParameters = '-o %s' % (self.library)
            if self.inputsandbox:
                jspec.jobParameters += ' -i %s' % (self.inputsandbox)
            else:
                raise ApplicationConfigurationError(
                    None,
                    'Executable on Panda with build job defined, but inputsandbox is empty!'
                )
            matchURL = re.search('(http.*://[^/]+)/', Client.baseURLCSRVSSL)
            if matchURL:
                jspec.jobParameters += ' --sourceURL %s ' % matchURL.group(1)
            if job.backend.bexec != '':
                jspec.jobParameters += ' --bexec "%s" ' % urllib.quote(
                    job.backend.bexec)
                jspec.jobParameters += ' -r %s ' % '.'

            fout = FileSpec()
            fout.lfn = self.library
            fout.type = 'output'
            fout.dataset = self.libDataset
            fout.destinationDBlock = self.libDataset
            jspec.addFile(fout)

            flog = FileSpec()
            flog.lfn = '%s.log.tgz' % self.libDataset
            flog.type = 'log'
            flog.dataset = self.libDataset
            flog.destinationDBlock = self.libDataset
            jspec.addFile(flog)
            return jspec
        else:
            return None
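
# Minimal sketch of the output-dataset registration pattern used in
# master_prepare(): look up the site's DDM endpoint, register the dataset
# there and set its lifetime. The helper name and arguments are illustrative.
from pandatools import Client
from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2_set_dataset_lifetime

def register_output_dataset(datasetname, site):
    location = Client.PandaSites[site]['ddm']
    Client.addDataset(datasetname, False, location=location)
    dq2_set_dataset_lifetime(datasetname, location=location)
    return location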
Ejemplo n.º 46
0
 def status(self,JobID,forceUpdate=False):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
             self.gridPassPhrase,
             False,
             self.verbose,
             useCache=True)
     # get job info from local repository
     job = self.getJobInfo(JobID)
     if job == None:
         # not found
         return None
     # update if needed
     if job.dbStatus != 'frozen' or forceUpdate:
         if not job.isJEDI():
             tmpLog.info("Getting status for JobID=%s ..." % JobID)
             # get status from Panda server
             status,pandaIDstatus = Client.getPandIDsWithJobID(JobID,verbose=self.verbose)
             if status != 0:
                 tmpLog.error("Failed to get status for ID=%s" % JobID)
                 return None
             # get one job to set computingSite which may have changed due to rebrokerage
             pandaJob = None
             if pandaIDstatus != {}:
                 tmpPandaIDs = pandaIDstatus.keys()
                 tmpPandaIDs.sort()
                 status,tmpPandaJobs = Client.getFullJobStatus(
                         tmpPandaIDs[:1],
                         verbose=self.verbose)
                 if status != 0:
                     tmpLog.error("Failed to get PandaJobs for %s" % JobID)
                     return None
                 pandaJob = tmpPandaJobs[0]
             # convert to local job spec
             job = PdbUtils.convertPtoD([],pandaIDstatus,job,pandaJobForSiteID=pandaJob)
             # check merge job generation
             status = self.setMergeJobStatus(job,forceUpdate)
             if not status:
                 return None
         else:
             tmpLog.info("Getting status for TaskID=%s ..." % job.jediTaskID)
             # get JEDI task
             status,jediTaskDict = Client.getJediTaskDetails(
                     {'jediTaskID':job.jediTaskID},
                     False,
                     True,
                     verbose=self.verbose)
             if status != 0:
                 tmpLog.error("Failed to get task details for %s" % JobID)
                 return
             # convert JEDI task
             job = PdbUtils.convertJTtoD(jediTaskDict,job)
         # update DB
         try:
             PdbUtils.updateJobDB(job,self.verbose)
         except:
             tmpLog.error("Failed to update local repository for JobID=%s" % JobID)
             return None
         if not job.isJEDI():
             tmpLog.info("Updated JobID=%s" % JobID)                        
         else:
             tmpLog.info("Updated TaskID=%s ..." % job.jediTaskID)
     # return
     return job
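
# Hedged sketch of the two-step lookup done in status() for non-JEDI jobs:
# resolve the PandaIDs belonging to a JobID, then fetch the full record of one
# of them (used above to pick up a computingSite changed by rebrokerage).
# The helper name is illustrative.
from pandatools import Client

def first_panda_job(JobID, verbose=False):
    status, pandaIDstatus = Client.getPandIDsWithJobID(JobID, verbose=verbose)
    if status != 0 or not pandaIDstatus:
        return None
    pandaIDs = sorted(pandaIDstatus.keys())
    status, pandaJobs = Client.getFullJobStatus(pandaIDs[:1], verbose=verbose)
    if status != 0:
        return None
    return pandaJobs[0]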
Ejemplo n.º 47
0
def printGUIDsWithDatasets(guids):
    """ list datasets by GUIDs
    """
    try:
        from pandatools import Client
    except ImportError:
        if os.environ.has_key('PANDA_SYS'):
            pandapath = os.environ['PANDA_SYS']
        else:
            pandapath = PandaSys
        sys.path.append(pandapath + '/lib/python2.4/site-packages')
        try:
            from pandatools import Client
        except ImportError:
            print "EventLookup failed to import PanDA client, GUID->dataset name resolution disabled"
            return False

    # instantiate curl
    curl = Client._Curl()
    curl.verbose = False
    iLookUp = 0
    guidLfnMap = {}
    checkedDSList = []
    # loop over all GUIDs
    for guid in guids.keys():
        # check existing map to avoid redundant lookup
        if guidLfnMap.has_key(guid):
            continue
        iLookUp += 1
        if iLookUp % 20 == 0:
            time.sleep(1)
        # get vuids
        url = Client.baseURLDQ2 + '/ws_content/rpc'
        data = {
            'operation': 'queryDatasetsWithFileByGUID',
            'guid': guid,
            'API': '0_3_0',
            'tuid': Client.MiscUtils.wrappedUuidGen()
        }
        status, out = curl.get(url, data, rucioAccount=True)
        if status != 0 or re.search('Exception', str(out)) != None:
            # failed
            guidLfnMap[guid] = "DQ2 query ERROR: " + str(out)
            continue
        if out == '\x00' or out == ():
            guidLfnMap[
                guid] = "DQ2.queryDatasetsWithFileByGUID() returned no results"
            continue
        tmpVUIDs = list(out)
        # get dataset name
        url = Client.baseURLDQ2 + '/ws_repository/rpc'
        data = {
            'operation': 'queryDatasetByVUIDs',
            'vuids': tmpVUIDs,
            'API': '0_3_0',
            'tuid': Client.MiscUtils.wrappedUuidGen()
        }
        status, out = curl.post(url, data, rucioAccount=True)
        if status != 0 or re.search('Exception', str(out)) != None:
            # failed
            guidLfnMap[guid] = "DQ2 query ERROR: " + str(out)
            continue
        if out == '\x00':
            guidLfnMap[guid] = "DQ2.queryDatasetByVUIDs() returned no results"
            continue
        for tmpDsName in out.keys():
            # ignore junk datasets
            if not (tmpDsName.startswith('panda') or \
                    tmpDsName.startswith('user') or \
                    tmpDsName.startswith('group') or \
                    re.search('_sub\d+$',tmpDsName) != None or \
                    re.search('_dis\d+$',tmpDsName) != None or \
                    re.search('_shadow$',tmpDsName) != None \
                    or tmpDsName in checkedDSList ):
                tmpMap = Client.queryFilesInDataset(tmpDsName)
                for tmpLFN, tmpVal in tmpMap.iteritems():
                    guidLfnMap.setdefault(tmpVal['guid'],
                                          []).append([tmpLFN, tmpDsName])
                checkedDSList.append(tmpDsName)

    for guid in guids.keys():
        print guid, guids[guid], guidLfnMap.setdefault(guid, "")
    return True
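
# Hedged restatement of the "junk dataset" filter applied above before calling
# Client.queryFilesInDataset(): panda/user/group datasets and _sub/_dis/_shadow
# replicas are skipped. The helper name is illustrative.
import re

def is_junk_dataset(name):
    return (name.startswith('panda') or name.startswith('user')
            or name.startswith('group')
            or re.search(r'_sub\d+$', name) is not None
            or re.search(r'_dis\d+$', name) is not None
            or re.search(r'_shadow$', name) is not None)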
Ejemplo n.º 48
0
def retrieveMergeJobs(job, pandaJobDefId):
    '''
    methods for retrieving panda job ids of merging jobs given a jobDefId
    '''
    from pandatools import Client

    ick = False
    status = ''
    num_mjobs = 0

    (ec, info) = Client.checkMergeGenerationStatus(pandaJobDefId)

    if ec == 0:

        try:
            status = info['status']
            mergeJobDefIds = info['mergeIDs']

            if status == 'NA':
                logger.warning('No merging jobs expected')
                job.backend.mergejobs = []

            elif status == 'generating':
                logger.debug('merging jobs are generating')
                job.backend.mergejobs = []

            elif status == 'standby':
                logger.debug('merging jobs to be created')
                job.backend.mergejobs = []

            elif status == 'generated':
                logger.debug('merging jobs are generated')

                for id in mergeJobDefIds:
                    logger.debug("merging jobDefId: %d" % id)

                    ## retrieve merging job id,status given the jobDefId
                    (ec2, mjs) = Client.getPandIDsWithJobID(id)

                    if ec2 == 0:

                        for jid, jinfo in mjs.items():
                            mjobj = PandaMergeJob()
                            mjobj.id = jid
                            #mjobj.status = jinfo[0]
                            mjobj.url = 'http://panda.cern.ch/?job=%d' % jid

                            if mjobj not in job.backend.mergejobs:
                                job.backend.mergejobs.append(mjobj)
                            else:
                                logger.debug(
                                    "merging job %s already exists locally" %
                                    mjobj.id)

                            num_mjobs += 1
                    else:
                        logger.warning(
                            "getPandIDsWithJobID returns non-zero exit code: %d"
                            % ec2)

            ick = True

        except KeyError:
            logger.error('unexpected job information: %s' % repr(info))

        except Exception as e:
            logger.error('general merge job information retrieval error')
            raise e

    else:
        logger.error(
            'checkMergeGenerationStatus returns non-zero exit code: %d' % ec)

    return (ick, status, num_mjobs)
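
# Hedged sketch of how the (ec, info) pair from
# Client.checkMergeGenerationStatus() is unpacked in retrieveMergeJobs():
# info carries a 'status' string and, once that is 'generated', the merge job
# definition IDs under 'mergeIDs'. The helper name is illustrative.
from pandatools import Client

def merge_job_def_ids(pandaJobDefId):
    ec, info = Client.checkMergeGenerationStatus(pandaJobDefId)
    if ec != 0:
        return None
    if info.get('status') != 'generated':
        return []
    return info.get('mergeIDs', [])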
Ejemplo n.º 49
0
    def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):
        """Prepare the specific aspec of each subjob.
           Returns: subjobconfig list of objects understood by backends."""

        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec
        from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2_set_dataset_lifetime
        from GangaPanda.Lib.Panda.Panda import refreshPandaSpecs
        
        # make sure we have the correct siteType
        refreshPandaSpecs()

        job = app._getParent()
        masterjob = job._getRoot()

        logger.debug('ProdTransPandaRTHandler prepare called for %s',
                     job.getFQID('.'))

        job.backend.actualCE = job.backend.site
        job.backend.requirements.cloud = Client.PandaSites[job.backend.site]['cloud']

        # check that the site is in a submit-able status
        if not job.splitter or job.splitter._name != 'DQ2JobSplitter':
            allowed_sites = job.backend.list_ddm_sites()

        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
            tmpDsExist = False
            if (configPanda['processingType'].startswith('gangarobot') or configPanda['processingType'].startswith('hammercloud')):
                #if Client.getDatasets(job.outputdata.datasetname):
                if getDatasets(job.outputdata.datasetname):
                    tmpDsExist = True
                    logger.info('Re-using output dataset %s'%job.outputdata.datasetname)
            if not configPanda['specialHandling']=='ddm:rucio' and not  configPanda['processingType'].startswith('gangarobot') and not configPanda['processingType'].startswith('hammercloud') and not configPanda['processingType'].startswith('rucio_test'):
                Client.addDataset(job.outputdata.datasetname,False,location=outDsLocation,allowProdDisk=True,dsExist=tmpDsExist)
            logger.info('Output dataset %s registered at %s'%(job.outputdata.datasetname,outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname, outDsLocation)
        except exceptions.SystemExit:
            raise BackendError('Panda','Exception in adding dataset %s: %s %s'%(job.outputdata.datasetname,sys.exc_info()[0],sys.exc_info()[1]))
        
        # JobSpec.
        jspec = JobSpec()
        jspec.currentPriority = app.priority
        jspec.jobDefinitionID = masterjob.id
        jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
        jspec.coreCount = app.core_count
        jspec.AtlasRelease = 'Atlas-%s' % app.atlas_release
        jspec.homepackage = app.home_package
        jspec.transformation = app.transformation
        jspec.destinationDBlock = job.outputdata.datasetname
        if job.outputdata.location:
            jspec.destinationSE = job.outputdata.location
        else:
            jspec.destinationSE = job.backend.site
        if job.inputdata:
            jspec.prodDBlock = job.inputdata.dataset[0]
        else:
            jspec.prodDBlock = 'NULL'
        if app.prod_source_label:
            jspec.prodSourceLabel = app.prod_source_label
        else:
            jspec.prodSourceLabel = configPanda['prodSourceLabelRun']
        jspec.processingType = configPanda['processingType']
        jspec.specialHandling = configPanda['specialHandling']
        jspec.computingSite = job.backend.site
        jspec.cloud = job.backend.requirements.cloud
        jspec.cmtConfig = app.atlas_cmtconfig
        if app.dbrelease == 'LATEST':
            try:
                latest_dbrelease = getLatestDBReleaseCaching()
            except:
                from pandatools import Client
                latest_dbrelease = Client.getLatestDBRelease()
            m = re.search('(.*):DBRelease-(.*)\.tar\.gz', latest_dbrelease)
            if m:
                self.dbrelease_dataset = m.group(1)
                self.dbrelease = m.group(2)
            else:
                raise ApplicationConfigurationError(None, "Error retrieving LATEST DBRelease. Try setting application.dbrelease manually.")
        else:
            self.dbrelease_dataset = app.dbrelease_dataset
            self.dbrelease = app.dbrelease
        jspec.jobParameters = app.job_parameters

        if self.dbrelease:
            if self.dbrelease == 'current':
                jspec.jobParameters += ' --DBRelease=current' 
            else:
                if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                    jspec.jobParameters += ' --DBRelease=DBRelease-%s.tar.gz' % (self.dbrelease,)
                else:
                    jspec.jobParameters += ' DBRelease=DBRelease-%s.tar.gz' % (self.dbrelease,)
                dbspec = FileSpec()
                dbspec.lfn = 'DBRelease-%s.tar.gz' % self.dbrelease
                dbspec.dataset = self.dbrelease_dataset
                dbspec.prodDBlock = jspec.prodDBlock
                dbspec.type = 'input'
                jspec.addFile(dbspec)

        if job.inputdata:
            m = re.search('(.*)\.(.*)\.(.*)\.(.*)\.(.*)\.(.*)',
                          job.inputdata.dataset[0])
            if not m:
                logger.error("Error retrieving run number from dataset name")
                #raise ApplicationConfigurationError(None, "Error retrieving run number from dataset name")
                runnumber = 105200
            else:
                runnumber = int(m.group(2))
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --runNumber %d' % runnumber
            else:
                jspec.jobParameters += ' RunNumber=%d' % runnumber
        
        # Output files.
        randomized_lfns = []
        ilfn = 0
        for lfn, lfntype in zip(app.output_files,app.output_type):
            ofspec = FileSpec()
            if app.randomize_lfns:
                randomized_lfn = lfn + ('.%s.%d.%s' % (job.backend.site, int(time.time()), commands.getoutput('uuidgen 2> /dev/null')[:4] ) )
            else:
                randomized_lfn = lfn
            ofspec.lfn = randomized_lfn
            randomized_lfns.append(randomized_lfn)
            ofspec.destinationDBlock = jspec.destinationDBlock
            ofspec.destinationSE = jspec.destinationSE
            ofspec.dataset = jspec.destinationDBlock
            ofspec.type = 'output'
            jspec.addFile(ofspec)
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --output%sFile %s' % (lfntype, randomized_lfns[ilfn])
            else:
                jspec.jobParameters += ' output%sFile=%s' % (lfntype, randomized_lfns[ilfn])
            ilfn=ilfn+1

        # Input files.
        if job.inputdata:
            for guid, lfn, size, checksum, scope in zip(job.inputdata.guids, job.inputdata.names, job.inputdata.sizes, job.inputdata.checksums, job.inputdata.scopes):
                ifspec = FileSpec()
                ifspec.lfn = lfn
                ifspec.GUID = guid
                ifspec.fsize = size
                ifspec.md5sum = checksum
                ifspec.scope = scope
                ifspec.dataset = jspec.prodDBlock
                ifspec.prodDBlock = jspec.prodDBlock
                ifspec.type = 'input'
                jspec.addFile(ifspec)
            if app.input_type:
                itype = app.input_type
            else:
                itype = m.group(5)
            if jspec.transformation.endswith("_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --input%sFile %s' % (itype, ','.join(job.inputdata.names))
            else:
                jspec.jobParameters += ' input%sFile=%s' % (itype, ','.join(job.inputdata.names))

        # Log files.
        lfspec = FileSpec()
        lfspec.lfn = '%s.job.log.tgz' % jspec.jobName
        lfspec.destinationDBlock = jspec.destinationDBlock
        lfspec.destinationSE  = jspec.destinationSE
        lfspec.dataset = jspec.destinationDBlock
        lfspec.type = 'log'
        jspec.addFile(lfspec)
        
        return jspec
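
# Hedged sketch of the output-LFN randomisation applied above: the site name,
# a timestamp and a short uuid fragment are appended so resubmissions do not
# produce colliding file names. commands is the Python 2 module used by these
# handlers; the helper name is illustrative.
import time
import commands

def randomize_lfn(lfn, site):
    suffix = commands.getoutput('uuidgen 2> /dev/null')[:4]
    return lfn + '.%s.%d.%s' % (site, int(time.time()), suffix)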
Ejemplo n.º 50
0
    def poll_panda_task(self, processing=None, input_output_maps=None):
        task_id = None
        try:
            from pandatools import Client

            jobs_ids = None
            if processing:
                proc = processing['processing_metadata']['processing']
                task_id = proc.workload_id
                if task_id is None:
                    task_id = self.get_panda_task_id(processing)

                if task_id:
                    # ret_ids = Client.getPandaIDsWithTaskID(task_id, verbose=False)
                    task_info = Client.getJediTaskDetails(
                        {'jediTaskID': task_id}, True, True, verbose=False)
                    self.logger.info("poll_panda_task, task_info: %s" %
                                     str(task_info))
                    if task_info[0] != 0:
                        self.logger.warn(
                            "poll_panda_task %s, error getting task status, task_info: %s"
                            % (task_id, str(task_info)))
                        return ProcessingStatus.Submitting, {}

                    task_info = task_info[1]

                    processing_status = self.get_processing_status_from_panda_status(
                        task_info["status"])

                    if processing_status in [ProcessingStatus.SubFinished]:
                        if self.retry_number < self.num_retries:
                            self.reactivate_processing(processing)
                            processing_status = ProcessingStatus.Submitted
                            self.retry_number += 1

                    jobs_ids = task_info['PandaID']
                    ret_get_registered_panda_jobids = self.get_registered_panda_jobids(
                        input_output_maps)
                    terminated_job_ids, unterminated_job_ids, map_id_without_panda_ids, panda_id_to_map_ids = ret_get_registered_panda_jobids

                    registered_job_ids = terminated_job_ids + unterminated_job_ids
                    unregistered_job_ids = []
                    for job_id in jobs_ids:
                        if job_id not in registered_job_ids:
                            unregistered_job_ids.append(job_id)

                    map_update_contents = self.map_panda_ids(
                        unregistered_job_ids, input_output_maps)
                    status_changed_update_contents = self.get_status_changed_contents(
                        unterminated_job_ids, input_output_maps,
                        panda_id_to_map_ids)
                    final_update_contents = []

                    if processing_status in [
                            ProcessingStatus.SubFinished,
                            ProcessingStatus.Finished, ProcessingStatus.Failed
                    ]:
                        if (unregistered_job_ids or unterminated_job_ids):
                            # there are still polling contents, should not terminate the task.
                            log_warn = "Processing (%s) with panda id (%s) is %s, however there are still unregistered_job_ids(%s) or unterminated_job_ids(%s)" % (
                                processing['processing_id'], task_id,
                                processing_status, str(unregistered_job_ids),
                                str(unterminated_job_ids))
                            log_warn = log_warn + ". Keep the processing status as running now."
                            self.logger.warn(log_warn)
                            processing_status = ProcessingStatus.Running
                        else:
                            final_update_contents = self.get_final_update_contents(
                                input_output_maps)
                            if final_update_contents:
                                processing_status = ProcessingStatus.Running
                    return processing_status, map_update_contents + status_changed_update_contents + final_update_contents
                else:
                    return ProcessingStatus.Failed, {}
        except Exception as ex:
            msg = "Failed to check the processing (%s) status: %s" % (str(
                processing['processing_id']), str(ex))
            self.logger.error(msg)
            self.logger.error(ex)
            self.logger.error(traceback.format_exc())
            # raise exceptions.IDDSException(msg)
        return ProcessingStatus.Submitting, []
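
# Hedged sketch of the polling call used in poll_panda_task(): the first tuple
# element is the call status, the second the task dictionary whose 'status'
# and 'PandaID' fields drive the bookkeeping above. The helper name is
# illustrative.
from pandatools import Client

def task_status_and_jobs(task_id):
    ret = Client.getJediTaskDetails({'jediTaskID': task_id}, True, True,
                                    verbose=False)
    if ret[0] != 0:
        return None, []
    task_info = ret[1]
    return task_info['status'], task_info.get('PandaID', [])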
Ejemplo n.º 51
0
 def sync(self):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     tmpLog.info("Synchronizing local repository ...")
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
             self.gridPassPhrase,
             False,
             self.verbose,
             useCache=True)
     # get nickname
     nickName = PsubUtils.getNickname()
     # set Rucio accounting
     PsubUtils.setRucioAccount(nickName,'pbook',True)
     # get JobIDs in local repository
     localJobIDs = PdbUtils.getListOfJobIDs()
     # get recent JobIDs from panda server
     syncTimeRaw = datetime.datetime.utcnow()
     syncTime = syncTimeRaw.strftime('%Y-%m-%d %H:%M:%S')
     # set sync time for the first attempt
     bookConf = BookConfig.getConfig()
     if self.restoreDB:
         # reset last_synctime to restore database 
         bookConf.last_synctime = ''
         # disable
         self.restoreDB = False
         tmpLog.info("It may take several minutes to restore local repository ...")
     if bookConf.last_synctime == '':
         bookConf.last_synctime = datetime.datetime.utcnow()-datetime.timedelta(days=180)
         bookConf.last_synctime = bookConf.last_synctime.strftime('%Y-%m-%d %H:%M:%S')
     maxTaskID = None
     while True:
         status, jediTaskDicts = Client.getJobIDsJediTasksInTimeRange(bookConf.last_synctime,
                                                                      minTaskID=maxTaskID,
                                                                      verbose=self.verbose)
         if status != 0:
             tmpLog.error("Failed to get tasks from panda server")
             return
         if len(jediTaskDicts) == 0:
             break
         tmpLog.info("Got %s tasks to be updated" % len(jediTaskDicts))
         # insert if missing
         for remoteJobID in jediTaskDicts.keys():
             taskID = jediTaskDicts[remoteJobID]['jediTaskID']
             # get max
             if maxTaskID is None or taskID > maxTaskID:
                 maxTaskID = taskID
             # check local status
             job = None
             if remoteJobID in localJobIDs:
                 # get job info from local repository
                 job = PdbUtils.readJobDB(remoteJobID, self.verbose)
                 # skip if frozen
                 if job.dbStatus == 'frozen':
                     continue
             tmpLog.info("Updating taskID=%s ..." % taskID)
             # convert JEDI task
             localJob = PdbUtils.convertJTtoD(jediTaskDicts[remoteJobID],job)
             # update database
             if not remoteJobID in localJobIDs:
                 # insert to DB
                 try:
                     PdbUtils.insertJobDB(localJob,self.verbose)
                 except:
                     tmpLog.error("Failed to insert taskID=%s to local repository" % taskID)
                     return
             else:
                 # update
                 try:
                     PdbUtils.updateJobDB(localJob,self.verbose,syncTimeRaw)
                 except:
                     tmpLog.error("Failed to update local repository for taskID=%s" % taskID)
                     return
     # update sync time
     bookConf = BookConfig.getConfig()
     bookConf.last_synctime = syncTime
     BookConfig.updateConfig(bookConf)
     self.updateTaskJobsetMap()
     tmpLog.info("Synchronization Completed")
Ejemplo n.º 52
0
    def master_updateMonitoringInformation(jobs):
        '''Monitor jobs'''
        from pandatools import Client

        #active_status = [ None, 'defined', 'unknown', 'assigned', 'waiting', 'activated', 'sent', 'starting', 'running', 'holding', 'transferring' ]

        submitting_status = []
        active_status = [
            None, 'registered', 'waiting', 'defined', 'pending', 'assigning',
            'ready', 'scouting', 'running', 'holding', 'merging', 'prepared',
            'aborting', 'finishing'
        ]

        inactive_status = ['finished', 'aborted', 'broken', 'failed', 'done']

        # Find jobs to be monitored
        jobdict = {}
        for job in jobs:
            # add a delay as Panda can be a little slow in sorting out a new Task
            if job.backend.id and job.backend.status in active_status and (
                (datetime.datetime.utcnow() -
                 job.time.timestamps["submitted"]).seconds > 120):
                jobdict[job.backend.id] = job

        logger.debug("jobdict = %s" % jobdict)

        # Monitor active Jedi tasks
        allJobIDs = jobdict.keys()
        pandaJobIDs = {}
        for jID in allJobIDs:
            status, jediTaskDict = Client.getJediTaskDetails(
                {'jediTaskID': jID}, False, True, verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                #raise BackendError('Jedi','Return code %d retrieving job status information.' % status)
                continue
            # Retrieve job
            job = jobdict[jediTaskDict['jediTaskID']]
            # Store associated Panda jobs
            if job.backend.pandajobs:
                pandaJobIDs[job.backend.id] = [
                    pj.id for pj in job.backend.pandajobs
                ]
            else:
                pandaJobIDs[
                    jediTaskDict['jediTaskID']] = jediTaskDict['PandaID']
            logger.debug("jID = %s, pandaJobIDs = %s" % (jID, pandaJobIDs))

            # Fill the output data dataset list
            if 'outDS' in jediTaskDict and jediTaskDict['outDS'] != '':
                for ds in jediTaskDict['outDS'].split(','):
                    if not ds in job.outputdata.datasetList:
                        job.outputdata.datasetList.append(ds)

            # Jedi job status has changed
            if job.backend.status != jediTaskDict['status']:
                logger.debug('Job %s has changed status from %s to %s',
                             job.getFQID('.'), job.backend.status,
                             jediTaskDict['status'])
                job.backend.status = jediTaskDict['status']
                job.backend.reason = jediTaskDict['statistics']

                # Now update Jedi job status
                if jediTaskDict['status'] in [
                        'registered', 'waiting', 'defined', 'pending',
                        'assigning', 'ready'
                ]:
                    job.updateStatus('submitted')
                elif jediTaskDict['status'] in [
                        'scouting', 'running', 'holding', 'merging', 'prepared'
                ]:
                    job.updateStatus('running')
                elif jediTaskDict['status'] in ['done']:
                    job.updateStatus('completed')
                elif jediTaskDict['status'] in ['failed', 'finished']:
                    job.updateStatus('failed')
                elif jediTaskDict['status'] in [
                        'aborted', 'broken', 'cancelled'
                ] and job.status not in ['completed', 'failed']:
                    job.updateStatus('killed')
                else:
                    logger.warning('Unexpected Jedi task status %s',
                                   jediTaskDict['status'])

            # Check if associated Panda job exist and monitor them
            if not job.backend.pandajobs:
                jdefids = pandaJobIDs[jID]
                # skip if there are no Panda jobs yet
                if not jdefids:
                    continue
                tot_num_mjobs = 0

                do_master_update = True
                ick, status, num_mjobs = retrievePandaJobs(job, jdefids)
                logger.debug('retrievePandaJobs returns: %s %s' %
                             (repr(ick), status))
                if not ick:
                    logger.debug(
                        'Panda job retrieval failure for Jedi task %s with PandaIds %s'
                        % (job.backend.id, jdefids))
                    do_master_update = False

                tot_num_mjobs += num_mjobs
                logger.debug('Job %s retrieved %d Panda jobs' %
                             (job.getFQID('.'), tot_num_mjobs))
            # Now monitor the already attached Panda jobs
            else:
                jdefids = [pj.id for pj in job.backend.pandajobs]
                rc, jobsStatus = Client.getFullJobStatus(jdefids, False)
                if rc:
                    logger.error(
                        'Return code %d retrieving job status information.',
                        rc)
                    raise BackendError(
                        'Jedi',
                        'Return code %d retrieving job status information.' %
                        rc)

                for status in jobsStatus:
                    if not status: continue

                    for pjob in job.backend.pandajobs:
                        if pjob.id == status.PandaID:
                            # skip if no status change
                            if pjob.status == status.jobStatus:
                                continue
                            # Else update job record
                            pjob.jobSpec = dict(
                                zip(status._attributes, status.values()))

                            for k in pjob.jobSpec.keys():
                                if type(pjob.jobSpec[k]) not in [
                                        type(''), type(1)
                                ]:
                                    pjob.jobSpec[k] = str(pjob.jobSpec[k])

                            logger.debug(
                                'Job %s with Panda job %s has changed status from %s to %s',
                                job.getFQID('.'), pjob.id, pjob.status,
                                status.jobStatus)
                            pjob.status = status.jobStatus
                            pjob.exitcode = str(status.transExitCode)
                            pjob.piloterrorcode = str(status.pilotErrorCode)
                            pjob.reason = ''
                            for k in pjob.jobSpec.keys():
                                if k.endswith('ErrorDiag'
                                              ) and pjob.jobSpec[k] != 'NULL':
                                    pjob.reason += '%s: %s, ' % (
                                        k, str(pjob.jobSpec[k]))
                            #if job.backend.jobSpec['transExitCode'] != 'NULL':
                            pjob.reason += 'transExitCode: %s' % pjob.jobSpec[
                                'transExitCode']

                            if status.jobStatus in [
                                    'defined', 'unknown', 'assigned',
                                    'waiting', 'activated', 'sent'
                            ]:
                                logger.debug('Panda job %s %s' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus in [
                                    'starting', 'running', 'holding',
                                    'transferring', 'merging'
                            ]:
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus in ['finished']:
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus == 'failed':
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                                # check for server side retry
                                if 'taskBufferErrorDiag' in pjob.jobSpec and pjob.jobSpec[
                                        'taskBufferErrorDiag'].find(
                                            "PandaID=") != -1:
                                    # grab the new panda ID
                                    newPandaID = long(
                                        pjob.jobSpec['taskBufferErrorDiag'].
                                        split("=")[1])
                                    pjob.id = newPandaID
                                    pjob.status = None
                                    pjob.url = 'http://panda.cern.ch/?job=%d' % newPandaID
                            elif status.jobStatus == 'cancelled' and pjob.status not in [
                                    'completed', 'failed'
                            ]:  # bug 67716
                                logger.debug('Panda job %s cancelled' %
                                             pjob.id)
                                if 'taskBufferErrorDiag' in pjob.jobSpec and "rebrokerage" in pjob.jobSpec[
                                        'taskBufferErrorDiag']:
                                    newPandaID = checkForRebrokerage(
                                        pjob.jobSpec['taskBufferErrorDiag'])
                                    logger.warning(
                                        "Subjob rebrokered by Panda server. Job %d moved to %d."
                                        % (pjob.id, newPandaID))
                                    pjob.id = newPandaID
                                    pjob.status = None
                            else:
                                logger.warning('Unexpected job status %s',
                                               status.jobStatus)
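
# Hedged restatement of the Jedi-task -> Ganga job status mapping applied in
# the monitoring loop above; any status not listed here falls through to a
# warning in the real handler. The helper name is illustrative.
def ganga_status_for(jedi_status):
    if jedi_status in ['registered', 'waiting', 'defined', 'pending',
                       'assigning', 'ready']:
        return 'submitted'
    if jedi_status in ['scouting', 'running', 'holding', 'merging', 'prepared']:
        return 'running'
    if jedi_status == 'done':
        return 'completed'
    if jedi_status in ['failed', 'finished']:
        return 'failed'
    if jedi_status in ['aborted', 'broken', 'cancelled']:
        return 'killed'
    return None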
Ejemplo n.º 53
0
# history support
pconfDir = os.path.expanduser(os.environ['PANDA_CONFIG_ROOT'])
if not os.path.exists(pconfDir):
    os.makedirs(pconfDir)

# set grid source file
globalConf = GlobalConfig.getConfig()
if globalConf.grid_src != '' and not os.environ.has_key(
        'PATHENA_GRID_SETUP_SH'):
    os.environ['PATHENA_GRID_SETUP_SH'] = globalConf.grid_src

# make tmp dir
tmpDir = tempfile.mkdtemp()

# set tmp dir in Client
Client.setGlobalTmpDir(tmpDir)

# look for PandaTools package
for path in sys.path:
    if path == '':
        path = '.'

    if os.path.exists(path) and os.path.isdir(path) and 'pandatools' in os.listdir(path) \
           and os.path.exists('%s/pandatools/__init__.py' % path):
        # make symlink for module name
        os.symlink('%s/pandatools' % path, '%s/taskbuffer' % tmpDir)
        break
sys.path = [tmpDir] + sys.path

# total time
TotalCPUTime = 0
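
# Hedged sketch combining the scratch-directory handling from this snippet and
# the next one: create a temporary directory, register it with the PanDA
# client, and make sure it is removed at exit.
import atexit
import shutil
import tempfile
from pandatools import Client

tmpDir = tempfile.mkdtemp()
Client.setGlobalTmpDir(tmpDir)
atexit.register(shutil.rmtree, tmpDir, True)  # ignore_errors=True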
Ejemplo n.º 54
0
if not PsubUtils.checkOutDsName(options.outDS,
                                distinguishedName,
                                options.official,
                                nickName,
                                verbose=options.verbose):
    tmpStr = "invalid output dataset name: %s" % options.outDS
    tmpLog.error(tmpStr)
    sys.exit(1)

# full execution string
fullExecString = PsubUtils.convSysArgv()
fullExecString += jsonExecStr

# use INTR server
if options.intrSrv:
    Client.useIntrServer()

# create tmp dir
curDir = os.getcwd()
tmpDir = os.path.join(curDir, MiscUtils.wrappedUuidGen())
os.makedirs(tmpDir)


# exit action
def _onExit(dir, del_command):
    del_command('rm -rf %s' % dir)


atexit.register(_onExit, tmpDir, MiscUtils.commands_get_output)

# sandbox
Ejemplo n.º 55
0
    def master_prepare(self,app,appmasterconfig):

        # PandaTools
        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('AthenaMCPandaRTHandler master_prepare called for %s', job.getFQID('.'))
        usertag = configDQ2['usertag']
        #usertag='user09'
        nickname = getNickname(allowMissingNickname=True)
        self.libDataset = '%s.%s.ganga.%s_%d.lib._%06d' % (usertag,nickname,commands.getoutput('hostname').split('.')[0],int(time.time()),job.id)
#        self.userprefix='%s.%s.ganga' % (usertag,gridProxy.identity())
        sources = 'sources.%s.tar.gz' % commands.getoutput('uuidgen 2> /dev/null') 
        self.library = '%s.lib.tgz' % self.libDataset

        # check DBRelease
        # if job.backend.dbRelease != '' and job.backend.dbRelease.find(':') == -1:
         #   raise ApplicationConfigurationError(None,"ERROR : invalid argument for backend.dbRelease. Must be 'DatasetName:FileName'")

#       unpack library
        logger.debug('Creating source tarball ...')        
        tmpdir = '/tmp/%s' % commands.getoutput('uuidgen 2> /dev/null')
        os.mkdir(tmpdir)

        inputbox=[]
        if os.path.exists(app.transform_archive):
            # must add a condition on size.
            inputbox += [ File(app.transform_archive) ]
        if app.evgen_job_option:
            self.evgen_job_option=app.evgen_job_option
            if os.path.exists(app.evgen_job_option):
                # locally modified job option file to add to the input sandbox
                inputbox += [ File(app.evgen_job_option) ]
                self.evgen_job_option=app.evgen_job_option.split("/")[-1]

         
#       add input sandbox files
        if (job.inputsandbox):
            for file in job.inputsandbox:
                inputbox += [ file ]
#        add option files
        for extFile in job.backend.extOutFile:
            try:
                shutil.copy(extFile,tmpdir)
            except IOError:
                os.makedirs(tmpdir)
                shutil.copy(extFile,tmpdir)
#       fill the archive
        for opt_file in inputbox:
            try:
                shutil.copy(opt_file.name,tmpdir)
            except IOError:
                os.makedirs(tmpdir)
                shutil.copy(opt_file.name,tmpdir)
#       now tar it up again

        inpw = job.getInputWorkspace()
        rc, output = commands.getstatusoutput('tar czf %s -C %s .' % (inpw.getPath(sources),tmpdir))
        if rc:
            logger.error('Packing sources failed with status %d',rc)
            logger.error(output)
            raise ApplicationConfigurationError(None,'Packing sources failed.')

        shutil.rmtree(tmpdir)

#       upload sources

        logger.debug('Uploading source tarball ...')
        try:
            cwd = os.getcwd()
            os.chdir(inpw.getPath())
            rc, output = Client.putFile(sources)
            if output != 'True':
                logger.error('Uploading sources %s failed. Status = %d', sources, rc)
                logger.error(output)
                raise ApplicationConfigurationError(None,'Uploading archive failed')
        finally:
            os.chdir(cwd)      


        # Use Panda's brokerage
##         if job.inputdata and len(app.sites)>0:
##             # update cloud, use inputdata's
##             from dq2.info.TiersOfATLAS import whichCloud,ToACache
##             inclouds=[]
##             for site in app.sites:
##                 cloudSite=whichCloud(app.sites[0])
##                 if cloudSite not in inclouds:
##                     inclouds.append(cloudSite)
##             # now converting inclouds content into proper brokering stuff.
##             outclouds=[]
##             for cloudSite in inclouds:
##                 for cloudID, eachCloud in ToACache.dbcloud.iteritems():
##                     if cloudSite==eachCloud:
##                         cloud=cloudID
##                         outclouds.append(cloud)
##                         break
                    
##             print outclouds
##             # finally, matching with user's wishes
##             if len(outclouds)>0:
##                 if not job.backend.requirements.cloud: # no user wish, update
##                     job.backend.requirements.cloud=outclouds[0]
##                 else:
##                     try:
##                         assert job.backend.requirements.cloud in outclouds
##                     except:
##                         raise ApplicationConfigurationError(None,'Input dataset not available in target cloud %s. Please try any of the following %s' % (job.backend.requirements.cloud, str(outclouds)))
                                                            
        from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
        
        runPandaBrokerage(job)
        
        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError(None,'site is still AUTO after brokerage!')

        # output dataset preparation and registration
        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
        except:
            raise ApplicationConfigurationError(None,"Could not extract output dataset location from job.backend.site value: %s. Aborting" % job.backend.site)
        if not app.dryrun:
            for outtype in app.outputpaths.keys():
                dset=string.replace(app.outputpaths[outtype],"/",".")
                dset=dset[1:]
                # dataset registration must be done only once.
                print "registering output dataset %s at %s" % (dset,outDsLocation)
                try:
                    Client.addDataset(dset,False,location=outDsLocation)
                    dq2_set_dataset_lifetime(dset, location=outDsLocation)
                except:
                    raise ApplicationConfigurationError(None,"Fail to create output dataset %s. Aborting" % dset)
            # extend registration to build job lib dataset:
            print "registering output dataset %s at %s" % (self.libDataset,outDsLocation)

            try:
                Client.addDataset(self.libDataset,False,location=outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset, outDsLocation)
            except:
                raise ApplicationConfigurationError(None,"Fail to create output dataset %s. Aborting" % self.libDataset)


        ###
        cacheVer = "-AtlasProduction_" + str(app.prod_release)
            
        logger.debug("master job submit?")
        self.outsite=job.backend.site
        if app.se_name and app.se_name != "none" and not self.outsite:
            self.outsite=app.se_name

       
        #       create build job
        jspec = JobSpec()
        jspec.jobDefinitionID   = job.id
        jspec.jobName           = commands.getoutput('uuidgen 2> /dev/null')
        jspec.AtlasRelease      = 'Atlas-%s' % app.atlas_rel
        jspec.homepackage       = 'AnalysisTransforms'+cacheVer  #+nightVer
        jspec.transformation    = '%s/buildJob-00-00-03' % Client.baseURLSUB  # common base to Athena and AthenaMC jobs: buildJob is a pilot job which takes care of all inputs for the real jobs (in prepare())
        jspec.destinationDBlock = self.libDataset
        jspec.destinationSE     = job.backend.site
        jspec.prodSourceLabel   = 'panda'
        jspec.assignedPriority  = 2000
        jspec.computingSite     = job.backend.site
        jspec.cloud             = job.backend.requirements.cloud
#        jspec.jobParameters     = self.args not known yet
        jspec.jobParameters     = '-o %s' % (self.library)
        if app.userarea:
            print app.userarea
            jspec.jobParameters     += ' -i %s' % (os.path.basename(app.userarea))
        else:
            jspec.jobParameters     += ' -i %s' % (sources)
        jspec.cmtConfig         = AthenaUtils.getCmtConfig(athenaVer=app.atlas_rel)
        
        matchURL = re.search('(http.*://[^/]+)/',Client.baseURLSSL)
        if matchURL:
            jspec.jobParameters += ' --sourceURL %s' % matchURL.group(1)

        fout = FileSpec()
        fout.lfn  = self.library
        fout.type = 'output'
        fout.dataset = self.libDataset
        fout.destinationDBlock = self.libDataset
        jspec.addFile(fout)

        flog = FileSpec()
        flog.lfn = '%s.log.tgz' % self.libDataset
        flog.type = 'log'
        flog.dataset = self.libDataset
        flog.destinationDBlock = self.libDataset
        jspec.addFile(flog)
        #print "MASTER JOB DETAILS:",jspec.jobParameters

        return jspec
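
The build JobSpec assembled above is ultimately sent to the Panda server together with the run jobs. As a minimal sketch of that final step, assuming a list of fully populated JobSpec objects and a valid grid proxy, submission and a quick inspection of the reply could look like the snippet below; the reply tuple layout follows the one unpacked in Ejemplo n.º 58 further down.

from pandatools import Client

def submit_and_report(jspecs, verbose=False):
    # jspecs: list of taskbuffer JobSpec objects, e.g. the build job plus the run jobs
    status, out = Client.submitJobs(jspecs, verbose=verbose)
    if status != 0 or out is None:
        print 'submission failed with status', status
        return None
    for items in out:
        # items[0] is the PandaID, items[1] the JobID (cf. Ejemplo n.º 58)
        print 'PandaID=%s JobID=%s' % (items[0], items[1])
    return out
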
Ejemplo n.º 56
0
    for r in res:
        ids.append(r[0])
    print 'got', len(ids), 'new pandaIDs'
    print
except cx_Oracle.DatabaseError, exc:
    error, = exc.args
    print "updatePandaWAN.py - problem in establishing connection to db"
    print "updatePandaWAN.py Oracle-Error-Code:", error.code
    print "updatePandaWAN.py Oracle-Error-Message:", error.message

print 'leaving only the first 2000 in the list.'
del ids[2000:]


from pandatools import Client
status,jobSpec = Client.getFullJobStatus(ids,False)
print 'got back from panda:', len(jobSpec)

try:
    connection = cx_Oracle.Connection(line)
    cursor = cx_Oracle.Cursor(connection)
    print 'Connection established.'
    for i in jobSpec:
        if i is None:
            print "problem..."
            continue 
        ti=i.pilotTiming.split('|')
        if len(ti)!=5:
            print i.PandaID,i.pilotTiming,ti
            #cursor.execute("DELETE FROM ATLAS_WANHCTEST.result where pandaid=:pid",{'pid':i.PandaID});
            continue
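
The fragment above expects each pilotTiming string to split into exactly five '|'-separated fields. A standalone illustration of that check, with a made-up value since the fragment does not label the fields:

# Hypothetical pilotTiming value; the meaning of the five fields is not specified by the fragment.
pilotTiming = '442|6|508|347|1'
ti = pilotTiming.split('|')
if len(ti) != 5:
    print 'unexpected pilotTiming format:', pilotTiming
else:
    timings = [int(t) for t in ti]
    print 'five timing values:', timings
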
Ejemplo n.º 57
0
def expandExcludedSiteList(job):
    '''Expand a site list taking wildcards into account'''

    # first, check if there's anything to be done
    check_ddm = False
    wildcard = False
    excl_sites = []
    for s in job.backend.requirements.excluded_sites:
        if s.find("ANALY_") == -1:
            check_ddm = True

        if s.find("*") != -1:
            wildcard = True

        if s.find("ANALY_") != -1 and s.find("*") == -1:
            excl_sites.append(s)

    if not check_ddm and not wildcard:
        return excl_sites

    # we have either wildcards or DDM sites listed
    # First, find the allowed sites for this job and ensure no duplicates anywhere
    from pandatools import Client
    logger.info(
        "Excluding DDM and wildcarded sites from Jedi job. Please wait....")
    orig_ddm_list = []
    new_ddm_list = []
    for s in job.inputdata.get_locations():
        if not s in orig_ddm_list:
            orig_ddm_list.append(s)
            new_ddm_list.append(s)

    orig_panda_list = []
    for s in [Client.convertDQ2toPandaID(x) for x in new_ddm_list]:
        for s2 in Client.PandaSites.keys():
            if s2.find(s) != -1 and not s2 in orig_panda_list:
                orig_panda_list.append(s2)

    if check_ddm:
        # remove any DDM sites that are referenced, including wildcards
        for s in job.backend.requirements.excluded_sites:
            if s in orig_ddm_list:
                new_ddm_list.remove(s)

            if s.find("*") != -1:
                for s2 in orig_ddm_list:
                    if fnmatch.fnmatch(s2, s):
                        new_ddm_list.remove(s2)

        # now recreate the panda list and see if any have been dropped
        new_panda_list = []
        for s in [Client.convertDQ2toPandaID(x) for x in new_ddm_list]:
            for s2 in Client.PandaSites.keys():
                if s2.find(s) != -1 and not s2 in new_panda_list:
                    new_panda_list.append(s2)

        for s in orig_panda_list:
            if not s in new_panda_list and not s in excl_sites:
                excl_sites.append(s)

    if wildcard:
        # find wildcarded ANALY_* sites and exclude any that match good sites
        for s in job.backend.requirements.excluded_sites:
            if s.find("*") != -1:
                for s2 in orig_panda_list:
                    if fnmatch.fnmatch(s2, s) and not s2 in excl_sites:
                        excl_sites.append(s2)

    return excl_sites
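
The wildcard handling above relies on fnmatch patterns such as ANALY_*. A self-contained illustration with hypothetical site names (not taken from Client.PandaSites):

import fnmatch

# hypothetical Panda site names and exclusion patterns, for illustration only
panda_sites = ['ANALY_CERN', 'ANALY_BNL_ATLAS_1', 'ANALY_DESY-HH']
patterns = ['ANALY_BNL*', 'ANALY_DESY*']

excl_sites = []
for site in panda_sites:
    for patt in patterns:
        if fnmatch.fnmatch(site, patt) and site not in excl_sites:
            excl_sites.append(site)

print excl_sites   # ['ANALY_BNL_ATLAS_1', 'ANALY_DESY-HH']
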
Ejemplo n.º 58
0
 def retry(self,JobsetID,newSite=False,newOpts={},noSubmit=False,ignoreDuplication=False,useJobsetID=False,retryBuild=False,reproduceFiles=[],unsetRetryID=False):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
         self.gridPassPhrase,
         False,
         self.verbose,
         useCache=True)
     # force update just in case
     self.status(JobsetID,True)
     # set an empty map since mutable default value is used
     if newOpts == {}:
         newOpts = {}
     # get jobset
     newJobsetID = -1
     jobList = self.getJobIDsWithSetID(JobsetID)
     if jobList == None:
         # works only for jobsetID
         if useJobsetID:
             return
         # works with jobID   
         isJobset = False
         jobList = [JobsetID]
     else:
         isJobset = True
         tmpMsg = "ID=%s is composed of JobID=" % JobsetID
         for tmpJobID in jobList:
             tmpMsg += '%s,' % tmpJobID
         tmpMsg = tmpMsg[:-1]
         tmpLog.info(tmpMsg)
     for JobID in jobList:    
         # get job info from local repository
         localJob = self.getJobInfo(JobID)
         if localJob == None:
             tmpLog.warning("JobID=%s not found in local repository. Synchronization may be needed" % JobID)            
             return None
         # for JEDI
         if localJob.isJEDI():
             status,out = Client.retryTask(
                     localJob.jediTaskID,
                     verbose=self.verbose,
                     properErrorCode=True,
                     newParams=newOpts)
             if status != 0:
                 tmpLog.error(status)
                 tmpLog.error(out)
                 tmpLog.error("Failed to retry TaskID=%s" % localJob.jediTaskID)
                 return False
             tmpStat,tmpDiag = out
             if (not tmpStat in [0,True] and newOpts == {}) or (newOpts != {} and tmpStat != 3):
                 tmpLog.error(tmpDiag)
                 tmpLog.error("Failed to retry TaskID=%s" % localJob.jediTaskID)
                 return False
             tmpLog.info(tmpDiag)
             continue
         # skip running job
         if localJob.dbStatus != 'frozen':
             tmpLog.info('Retry failed subjobs in running jobId=%s' % JobID)
             status,out = Client.retryFailedJobsInActive(JobID,verbose=self.verbose)
             if status != 0:
                 tmpLog.error(status)
                 tmpLog.error(out)
                 tmpLog.error("Failed to retry JobID=%s" % JobID)
             else:
                 job = self.status(JobID)
             if isJobset:
                 continue
             else:
                 return
         # skip already retried
         if localJob.retryID != '0':
             if isJobset:
                 tmpLog.info('Skip JobID=%s since already retried by JobID=%s JobsetID=%s' % \
                             (JobID,localJob.retryID,localJob.retryJobsetID))
                 continue
             else:
                 tmpLog.warning('This job was already retried by JobID=%s' % localJob.retryID)
                 return
         # check status of buildJob
         if not retryBuild and not localJob.buildStatus in ['','finished']:
             tmpMsgStr = 'Cannot retry since status of buildJob %s is %s (!= finished). ' \
                         % (localJob.PandaID.split(',')[0],localJob.buildStatus)
             tmpMsgStr += 'Please execute %s with the same input/output datasets (or containers). ' % localJob.jobType
             tmpMsgStr += 'It will run only on failed/cancelled/unused input files '
             tmpMsgStr += 'and append output files to the output dataset container. '
             tmpMsgStr += 'Or you may set retryBuild=True in pbook.retry() '                
             tmpLog.warning(tmpMsgStr)
             if isJobset:
                 continue
             else:
                 return
         # check opts for newSite
         if newSite or newOpts != {}:
             if not localJob.outDS.endswith('/') and not newOpts.has_key('outDS') and not newOpts.has_key('--outDS'):
                 tmpLog.warning('You need to specify --outDS in newOpts to retry at new site unless container is used as output')
                 return
         # get list of failed jobs
         pandaIDs  = localJob.PandaID.split(',')
         statusList= localJob.jobStatus.split(',')
         jobList = []
         for idx in range(len(pandaIDs)):
             # check status unless reproduce files
             if reproduceFiles == [] and not statusList[idx] in ['failed','cancelled']:
                 continue
             jobList.append(pandaIDs[idx])
         # no failed job
         if jobList == []:
             if isJobset:
                 tmpLog.info('Skip JobID=%s since no failed jobs' % JobID)                    
                 continue
             else:
                 tmpLog.info('No failed jobs to be retried for JobID=%s' % JobID)
                 return
         # get full job spec
         tmpLog.info("Retrying JobID=%s ..." % JobID)
         tmpLog.info("Getting job info")
         idxJL  = 0
         nQuery = 500
         pandaJobs = []
         while idxJL < len(jobList):
             # avoid burst query
             tmpLog.info(" %5s/%s" % (idxJL,len(jobList)))                
             status,oTmp = Client.getFullJobStatus(
                     jobList[idxJL:idxJL+nQuery],
                     verbose=self.verbose)
             if status != 0:
                 tmpLog.error(status)
                 tmpLog.error(oTmp)
                 tmpLog.error("Cannot get job info from Panda server")
                 return
             pandaJobs += oTmp
             idxJL += nQuery
             time.sleep(1)
         tmpLog.info(" %5s/%s" % (len(jobList),len(jobList)))
         # get PandaIDs to reproduce files
         if reproduceFiles != []:
             # change wildcard to .* for regexp
             reproduceFilePatt = []
             for tmpReproduceFile in reproduceFiles:
                 if '*' in tmpReproduceFile:
                     tmpReproduceFile = tmpReproduceFile.replace('*','.*')
                 reproduceFilePatt.append(tmpReproduceFile)
             # get list of jobs which produced interesting files    
             tmpJobList = []
             tmpPandaJobs = []
             for tmpPandaJob in pandaJobs:
                 # check names
                 tmpMatchFlag = False
                 for tmpFile in tmpPandaJob.Files:
                     if tmpFile.type == 'output' and tmpFile.status == 'ready':
                         for tmpReproduceFile in reproduceFilePatt:
                             # normal matching
                             if tmpReproduceFile == tmpFile.lfn:
                                 tmpMatchFlag = True
                                 break
                             # wild card
                             if '*' in tmpReproduceFile and \
                                re.search('^'+tmpReproduceFile,tmpFile.lfn) != None:
                                 tmpMatchFlag = True
                                 break
                         if tmpMatchFlag:
                             break
                 # append
                 if tmpMatchFlag:
                     tmpJobList.append(tmpPandaJob.PandaID)
                     tmpPandaJobs.append(tmpPandaJob)
             # use new list
             jobList = tmpJobList
             pandaJobs = tmpPandaJobs
             if jobList == []:
                 tmpLog.info("No jobs to reproduce files : Jobs in JobID=%s didn't produce lost files" % JobID)
                 continue
         # jobdefID
         newJobdefID = PsubUtils.readJobDefID()
         # reset some parameters
         retryJobs    = []
         retrySite    = None
         retryElement = None
         retryDestSE  = None
         outDsName    = None
         shadowList   = []
         oldLibDS     = None
         newLibDS     = None
         newLibTgz    = None
         rebroMap     = {}
         for idx in range(len(jobList)):
             job = pandaJobs[idx]
              # skip expired
             if job == None:
                 tmpLog.warning("Could not retry jobs older than 30 days : JobID=%s (PandaID=%s) expired" \
                                % (JobID,jobList[idxJob]))
                 return
             # skip jobs reassigned by rebrokerage
             if (job.jobStatus == 'cancelled' and job.taskBufferErrorCode in [105,'105']) or \
                    (job.jobStatus == 'failed' and job.taskBufferErrorCode in [106,'106']):
                 # extract JobIDs of reassigned jobs
                 tmpM = re.search('JobsetID=(\d+) JobID=(\d+)',job.taskBufferErrorDiag)
                 if tmpM != None:
                     tmpRebKey = (tmpM.group(1),tmpM.group(2))
                     if not rebroMap.has_key(tmpRebKey):
                         rebroMap[tmpRebKey] = 0
                     # count # of reassigned jobs
                     rebroMap[tmpRebKey] += 1
                 continue
             # get shadow list
             if (not ignoreDuplication) and outDsName == None and job.prodSourceLabel == 'user':
                 # look for dataset for log since it doesn't have suffix even when --individualOutDS is used
                 for tmpFile in job.Files:
                     if tmpFile.type == 'log':
                         outDsName = tmpFile.dataset
                         break
                 # output dataset was not found    
                 if outDsName == None:
                     tmpLog.error("Could not get output dataset name for JobID=%s (PandaID=%s)" \
                                  % (JobID,job.PandaID))
                     return
                 # get files in shadow
                 if outDsName.endswith('/'):
                     shadowList = Client.getFilesInShadowDataset(
                             outDsName,
                             Client.suffixShadow,
                             self.verbose)
                 else:
                     # disable duplication check mainly for old overlay jobs since non-signal files are wrongly skipped
                     #shadowList = Client.getFilesInShadowDatasetOld(outDsName,Client.suffixShadow,self.verbose)
                     pass
             # unify sitename
             if retrySite == None:
                 retrySite    = job.computingSite
                 retryElement = job.computingElement
                 retryDestSE  = job.destinationSE
             # reset
             job.jobStatus           = None
             job.commandToPilot      = None
             job.startTime           = None
             job.endTime             = None
             job.attemptNr           = 1+job.attemptNr
             for attr in job._attributes:
                 if attr.endswith('ErrorCode') or attr.endswith('ErrorDiag'):
                     setattr(job,attr,None)
             job.transExitCode       = None
             job.computingSite       = retrySite
             job.computingElement    = retryElement
             job.destinationSE       = retryDestSE
             job.dispatchDBlock      = None
             if not unsetRetryID:
                 job.jobExecutionID  = JobID
             job.jobDefinitionID     = newJobdefID
             job.parentID            = job.PandaID
              if not job.jobsetID in ['NULL',None,-1]:
                 if not unsetRetryID:
                     job.sourceSite  = job.jobsetID
                 job.jobsetID        = newJobsetID
             skipInputList = []
             numUsedFiles = 0
             # loop over all files    
             for file in job.Files:
                 file.rowID = None
                 if file.type == 'input':
                     # protection against wrong sync which doesn't update buildStatus correctly
                     if not retryBuild and file.lfn.endswith('.lib.tgz') and file.GUID == 'NULL':
                         tmpLog.warning('GUID for %s is unknown. Cannot retry when corresponding buildJob failed' \
                                        % file.lfn)
                         return
                     if not retryBuild or not file.lfn.endswith('.lib.tgz'):
                         file.status = 'ready'
                     # set new lib dataset    
                     if retryBuild and file.lfn.endswith('.lib.tgz'):
                         if newLibTgz != None:
                             file.lfn            = newLibTgz
                             file.dataset        = newLibDS
                             file.dispatchDBlock = newLibDS
                     # check with shadow for non lib.tgz/DBR 
                     tmpDbrMatch = re.search('^DBRelease-.*\.tar\.gz$',file.lfn)
                     if tmpDbrMatch == None and not file.lfn.endswith('.lib.tgz'):
                         if file.lfn in shadowList:
                             skipInputList.append(file)
                         else:
                             numUsedFiles += 1
                 elif file.type in ('output','log'):
                     file.destinationSE = retryDestSE
                     file.destinationDBlock = re.sub('_sub\d+$','',file.destinationDBlock)
                     # add retry num
                     if file.dataset.endswith('/') or job.prodSourceLabel == 'panda':
                         oldOutDsName = file.destinationDBlock
                         retryDsPatt = '_r'
                         if reproduceFiles != []:
                             retryDsPatt = '_rp'
                         retryMatch = re.search(retryDsPatt+'(\d+)$',file.destinationDBlock)
                         if retryMatch == None:
                             file.destinationDBlock += (retryDsPatt+'1')
                         else:
                             tmpDestinationDBlock = re.sub(retryDsPatt+'(\d+)$','',file.destinationDBlock)
                             file.destinationDBlock = tmpDestinationDBlock + retryDsPatt + '%d' % (1+int(retryMatch.group(1)))
                         if job.processingType == 'usermerge':
                             job.jobParameters = job.jobParameters.replace(' %s ' % oldOutDsName,
                                                                           ' %s ' % file.destinationDBlock)
                         # use new dataset name for buildXYZ
                         if job.prodSourceLabel == 'panda':
                             if file.lfn.endswith('.lib.tgz'):
                                 # get new libDS and lib.tgz names
                                 oldLibDS  = file.dataset
                                 file.dataset = file.destinationDBlock
                                 newLibDS = file.dataset
                                 file.lfn = re.sub(oldLibDS,newLibDS,file.lfn)
                                 newLibTgz = file.lfn
                             else:
                                 file.dataset = file.destinationDBlock                                    
                     # add attempt nr
                     oldName  = file.lfn
                     if job.prodSourceLabel == 'panda' and file.lfn.endswith('.lib.tgz'):
                         continue
                     else:
                         # append attempt number at the tail 
                         file.lfn = re.sub("\.\d+$","",file.lfn)
                         file.lfn = "%s.%d" % (file.lfn,job.attemptNr)
                     newName  = file.lfn
                     # modify jobParameters
                     job.jobParameters = re.sub("'%s'" % oldName ,"'%s'" % newName,
                                                job.jobParameters)
                     # look for output in trf
                      oldGenericName = re.sub('\.\d+$','',oldName)
                      match = re.search(oldGenericName+'(\.\d+)*(%20|")',job.jobParameters)
                     if match != None:
                         job.jobParameters = job.jobParameters.replace(match.group(0),newName+match.group(2))
             # change lib.tgz name
             if retryBuild and newLibDS != None:
                 job.jobParameters = re.sub(oldLibDS,newLibDS,job.jobParameters)
                 # change destinationDBlock
                 if job.prodSourceLabel == 'panda':
                     job.destinationDBlock = newLibDS
             # all files are used by others
             if numUsedFiles == 0 and skipInputList != []:
                 continue
             # remove skipped files
             strSkipped = ''
             for tmpFile in skipInputList:
                 strSkipped += '%s,' % tmpFile.lfn
                 job.Files.remove(tmpFile)
             strSkipped = strSkipped[:-1]
             # modify jobpar
             if strSkipped != '':
                 optionToSkipFiles = '--skipInputByRetry'
                 if not optionToSkipFiles in job.jobParameters:
                     # just append
                     job.jobParameters += "%s=%s " % (optionToSkipFiles,strSkipped)
                 else:
                     # extract already skipped files
                      tmpMatch = re.search("(%s=[^ ]+)" % optionToSkipFiles,job.jobParameters)
                     if tmpMatch == None:
                         tmpLog.error("Failed to extract arg of %s for PandaID=%s" \
                                      % (optionToSkipFiles,job.PandaID))
                         return
                     # replace
                      job.jobParameters = re.sub(tmpMatch.group(1),"%s,%s" % (tmpMatch.group(1),strSkipped),
                                                job.jobParameters)
             if self.verbose:
                 tmpLog.debug(job.jobParameters)
             # append
             retryJobs.append(job)
          # info on rebrokerage
         if rebroMap != {}:
             for tmpRebKey,tmpRebNumJobs in rebroMap.iteritems():
                 tmpRebSetID,tmpRebJobID = tmpRebKey
                 tmpLog.info('Skip %s jobs since JobID=%s JobsetID=%s already reassigned them to another site' % \
                             (tmpRebNumJobs,tmpRebJobID,tmpRebSetID))
             if retryJobs == []:
                 tmpLog.info("No more jobs to be retried for JobID=%s" % JobID)
                 if isJobset:
                     continue
                 else:
                     return
         # all input files were or are being used by other jobs
         if retryJobs == []:
             tmpLog.info('All input files were or are being used by other jobs for the same output. No jobs to be retried. If you need to ignore duplication check (e.g., using the same EVNT file for multiple simulation subjobs), set ignoreDuplication=True. i.e. retry(123,ignoreDuplication=True)')
             if isJobset:
                 continue
             else:
                 return
         # check voms role
         if not retryJobs[0].workingGroup in ['NULL',None,'']:
             # VOMS role was used 
             if not "--workingGroup" in job.metadata:
                 # extract voms roles from metadata
                 match =  re.search("--voms( |=)[ \"]*([^ \"]+)",job.metadata)
                 if match != None:
                     vomsRoles = match.group(2)
                 else:
                     vomsRoles = "atlas:/atlas/%s/Role=production" % retryJobs[0].workingGroup
             # regenerate proxy with VOMS roles
             try:
                 tmpLog.info("Checking proxy role to resubmit %s jobs" % retryJobs[0].workingGroup)
                 self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
                         self.gridPassPhrase,
                         False,
                         self.verbose,vomsRoles,
                         useCache=True)
             except:
                 tmpLog.error("Failed to generate a proxy with %s" % vomsRoles)
                 return
         # check runtime env for new site submission
         if (newSite or newOpts != {}):
             if retryJobs[0].processingType == 'pathena' or '--useAthenaPackages' in retryJobs[0].metadata:
                 from pandatools import AthenaUtils
                 stA,retA = AthenaUtils.getAthenaVer()
                 if not stA:
                     tmpLog.error("Failed to get Athena rel/cache version in current runtime env")
                     return
                 athenaVer = retA['athenaVer']
                 cacheVer  = retA['cacheVer']
                 nightVer  = retA['nightVer']
                 wrongSetup = False
                 if retryJobs[0].AtlasRelease != 'Atlas-%s' % athenaVer:
                     wrongSetup = True
                     errMsg =  "Current Athena version Atlas-%s is inconsitent with the previous submission %s. " % (athenaVer,retryJobs[0].AtlasRelease)
                 elif retryJobs[0].homepackage != 'AnalysisTransforms'+cacheVer+nightVer:
                     wrongSetup = True                        
                     errMsg =  "Current cache version %s is inconsitent with the previous submission. " % cacheVer.replace('-','').replace('_','-')
                 if wrongSetup:    
                      errMsg += 'You need to have the same runtime env as before since all job specs need to be re-created to send jobs to a new site. '
                     errMsg += 'Please setup Athena correctly and restart pbook'                        
                     tmpLog.error(errMsg)
                     return
         # test mode
         if noSubmit:
             continue
         # invoke pathena/prun to send job to new site
         if (newSite or newOpts != {}) and retryJobs[0].processingType != 'usermerge':
             # set parent jobID and jobsetID
             newOpts['provenanceID'] = retryJobs[0].jobExecutionID
             newOpts['panda_parentJobsetID'] = retryJobs[0].sourceSite
             tmpLog.info("Constructing job spec again to be sent to another site ...")
             comStat= PsubUtils.execWithModifiedParams(retryJobs,newOpts,self.verbose,newSite)
             if comStat == 0:
                 # update database
                 time.sleep(2)
                 self.sync()
             else:
                 tmpLog.error("Failed to submit jobs to Panda server")                
             return
         # register datasets
         tmpOutDsLocation = Client.PandaSites[retryJobs[-1].computingSite]['ddm']
         addedDataset = []
         shadowDSname = None
         for tmpFile in retryJobs[-1].Files:
             if tmpFile.type in ['output','log'] and tmpFile.dataset.endswith('/'):
                 # add shadow
                 """
                 removed shadow
                 if shadowDSname == None and tmpFile.type == 'log':
                     shadowDSname = "%s%s" % (tmpFile.destinationDBlock,Client.suffixShadow)
                     Client.addDataset(shadowDSname,self.verbose)
                 """    
                 # add datasets    
                 if not tmpFile.destinationDBlock in addedDataset:
                     # create dataset
                     Client.addDataset(
                             tmpFile.destinationDBlock,
                             self.verbose,
                             location=tmpOutDsLocation,
                             dsCheck=False)
                     # add to container
                     Client.addDatasetsToContainer(
                             tmpFile.dataset,
                             [tmpFile.destinationDBlock],
                             self.verbose)
                     # append
                     addedDataset.append(tmpFile.destinationDBlock)
         # register libDS
         if retryBuild and newLibDS != None:
             Client.addDataset(
                     newLibDS,
                     self.verbose,
                     location=tmpOutDsLocation,
                     dsCheck=False)
         # submit
         tmpLog.info("Submitting job ...")            
         status,out = Client.submitJobs(retryJobs,verbose=self.verbose)
         if out == None or status != 0:
             tmpLog.error(status)
             tmpLog.error(out)
             tmpLog.error("Failed to submit jobs to Panda server")
             return
         # update database
         pandaIDstatus = {}
         newJobID = None
         for items in out:
             # get newJobID
             if newJobID == None:
                 newJobID = items[1]
             # check PandaID
             PandaID = items[0]
             if PandaID == 'NULL':
                 tmpLog.error("Panda server returned wrong IDs. It may have a temporary problem")
                 return
             # set newJobsetID
             if newJobsetID in [None,-1]:
                 newJobsetID = items[2]['jobsetID']
              # dummy status
             pandaIDstatus[PandaID] = ('defined','NULL')
         # set retry ID
         if not unsetRetryID:
             localJob.retryID = newJobID
             if not newJobsetID in [None,-1,'NULL']:
                 localJob.retryJobsetID = newJobsetID
             try:
                 PdbUtils.updateJobDB(localJob,self.verbose)
             except:
                 tmpLog.error("Failed to set retryID for JobID=%s" % JobID)
                 return
          # set new parameters
         newLocalJob = PdbUtils.convertPtoD(retryJobs,pandaIDstatus)
         newLocalJob.JobID = newJobID
         if not newJobsetID in [None,-1,'NULL']:
             newLocalJob.groupID = newJobsetID
         newLocalJob.creationTime = datetime.datetime.utcnow()
         # insert to DB
         try:
             PdbUtils.insertJobDB(newLocalJob,self.verbose)
         except:
             tmpLog.error("Failed to insert JobID=%s to local repository" % newJobID)
             return
         # write new jobdefID
         PsubUtils.writeJobDefID(newJobID)
         # done
         tmpMsg = 'Done. New JobID=%s' % newJobID
         if not newJobsetID in [None,-1,'NULL']:
             tmpMsg += " JobsetID=%s" % newJobsetID
         tmpLog.info(tmpMsg)
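
For JEDI tasks the retry above reduces to a single Client.retryTask call. A minimal sketch of that call in isolation, assuming a known jediTaskID and a valid proxy; the keyword arguments mirror the ones used in the method above.

from pandatools import Client

def retry_jedi_task(jediTaskID, newOpts=None, verbose=False):
    # jediTaskID is a hypothetical task ID; newOpts corresponds to newParams above
    status, out = Client.retryTask(jediTaskID, verbose=verbose,
                                   properErrorCode=True, newParams=newOpts or {})
    if status != 0:
        # communication problem with the Panda server
        print status, out
        return False
    tmpStat, tmpDiag = out
    print tmpDiag
    return tmpStat in [0, True]
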
Ejemplo n.º 59
0
    def master_prepare(self,app,appconfig):
        '''Prepare the master job'''

        from pandatools import Client
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('ExecutablePandaRTHandler master_prepare called for %s', job.getFQID('.')) 

        # set chirp variables
        if configPanda['chirpconfig'] or configPanda['chirpserver']:
            setChirpVariables()

#       Pack inputsandbox
        inputsandbox = 'sources.%s.tar' % commands.getoutput('uuidgen 2> /dev/null') 
        inpw = job.getInputWorkspace()
        # add user script to inputsandbox
        if hasattr(job.application.exe, "name"):
            if not job.application.exe in job.inputsandbox:
                job.inputsandbox.append(job.application.exe)

        for fname in [f.name for f in job.inputsandbox]:
            fname = fname.rstrip(os.sep)
            path = fname[:fname.rfind(os.sep)]
            f = fname[fname.rfind(os.sep)+1:]
            rc, output = commands.getstatusoutput('tar rf %s -C %s %s' % (inpw.getPath(inputsandbox), path, f))
            if rc:
                logger.error('Packing inputsandbox failed with status %d',rc)
                logger.error(output)
                raise ApplicationConfigurationError('Packing inputsandbox failed.')
        if len(job.inputsandbox) > 0:
            rc, output = commands.getstatusoutput('gzip %s' % (inpw.getPath(inputsandbox)))
            if rc:
                logger.error('Packing inputsandbox failed with status %d',rc)
                logger.error(output)
                raise ApplicationConfigurationError('Packing inputsandbox failed.')
            inputsandbox += ".gz"
        else:
            inputsandbox = None

#       Upload Inputsandbox
        if inputsandbox:
            logger.debug('Uploading source tarball ...')
            uploadSources(inpw.getPath(),os.path.basename(inputsandbox))
            self.inputsandbox = inputsandbox
        else:
            self.inputsandbox = None

#       input dataset
        if job.inputdata:
            if job.inputdata._name != 'DQ2Dataset':
                raise ApplicationConfigurationError('PANDA application supports only DQ2Datasets')

        # run brokerage here if not splitting
        if not job.splitter:
            from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
            runPandaBrokerage(job)
        elif job.splitter._name not in ['DQ2JobSplitter', 'ArgSplitter', 'ArgSplitterTask']:
            raise ApplicationConfigurationError('Panda splitter must be DQ2JobSplitter, ArgSplitter or ArgSplitterTask')
        
        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError('site is still AUTO after brokerage!')

#       output dataset
        if job.outputdata:
            if job.outputdata._name != 'DQ2OutputDataset':
                raise ApplicationConfigurationError('Panda backend supports only DQ2OutputDataset')
        else:
            logger.info('Adding missing DQ2OutputDataset')
            job.outputdata = DQ2OutputDataset()

        job.outputdata.datasetname,outlfn = dq2outputdatasetname(job.outputdata.datasetname, job.id, job.outputdata.isGroupDS, job.outputdata.groupname)

        self.outDsLocation = Client.PandaSites[job.backend.site]['ddm']

        try:
            Client.addDataset(job.outputdata.datasetname,False,location=self.outDsLocation)
            logger.info('Output dataset %s registered at %s'%(job.outputdata.datasetname,self.outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname, location=self.outDsLocation)
        except exceptions.SystemExit:
            raise BackendError('Panda','Exception in Client.addDataset %s: %s %s'%(job.outputdata.datasetname,sys.exc_info()[0],sys.exc_info()[1]))

        # handle the libds
        if job.backend.libds:
            self.libDataset = job.backend.libds
            self.fileBO = getLibFileSpecFromLibDS(self.libDataset)
            self.library = self.fileBO.lfn
        elif job.backend.bexec:
            self.libDataset = job.outputdata.datasetname+'.lib'
            self.library = '%s.tgz' % self.libDataset
            try:
                Client.addDataset(self.libDataset,False,location=self.outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset, location=self.outDsLocation)
                logger.info('Lib dataset %s registered at %s'%(self.libDataset,self.outDsLocation))
            except exceptions.SystemExit:
                raise BackendError('Panda','Exception in Client.addDataset %s: %s %s'%(self.libDataset,sys.exc_info()[0],sys.exc_info()[1]))

        # collect extOutFiles
        self.extOutFile = []
        for tmpName in job.outputdata.outputdata:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.outputsandbox:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.backend.extOutFile:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        # create build job
        if job.backend.bexec != '':
            jspec = JobSpec()
            jspec.jobDefinitionID   = job.id
            jspec.jobName           = commands.getoutput('uuidgen 2> /dev/null')
            jspec.transformation    = '%s/buildGen-00-00-01' % Client.baseURLSUB
            if Client.isDQ2free(job.backend.site):
                jspec.destinationDBlock = '%s/%s' % (job.outputdata.datasetname,self.libDataset)
                jspec.destinationSE     = 'local'
            else:
                jspec.destinationDBlock = self.libDataset
                jspec.destinationSE     = job.backend.site
            jspec.prodSourceLabel   = configPanda['prodSourceLabelBuild']
            jspec.processingType    = configPanda['processingType']
            jspec.assignedPriority  = configPanda['assignedPriorityBuild']
            jspec.computingSite     = job.backend.site
            jspec.cloud             = job.backend.requirements.cloud
            jspec.jobParameters     = '-o %s' % (self.library)
            if self.inputsandbox:
                jspec.jobParameters     += ' -i %s' % (self.inputsandbox)
            else:
                raise ApplicationConfigurationError('Executable on Panda with build job defined, but inputsandbox is empty !')
            matchURL = re.search('(http.*://[^/]+)/',Client.baseURLCSRVSSL)
            if matchURL:
                jspec.jobParameters += ' --sourceURL %s ' % matchURL.group(1)
            if job.backend.bexec != '':
                jspec.jobParameters += ' --bexec "%s" ' % urllib.quote(job.backend.bexec)
                jspec.jobParameters += ' -r %s ' % '.'
                

            fout = FileSpec()
            fout.lfn  = self.library
            fout.type = 'output'
            fout.dataset = self.libDataset
            fout.destinationDBlock = self.libDataset
            jspec.addFile(fout)

            flog = FileSpec()
            flog.lfn = '%s.log.tgz' % self.libDataset
            flog.type = 'log'
            flog.dataset = self.libDataset
            flog.destinationDBlock = self.libDataset
            jspec.addFile(flog)
            return jspec
        else:
            return None
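
The handler above packs the input sandbox by shelling out to tar via commands.getstatusoutput. Purely as an illustration, a roughly equivalent packing step written with the standard tarfile module might look like the sketch below; this is not what the handler itself does.

import os
import tarfile

def pack_sandbox(archive_path, filenames):
    # add each file under its basename, similar in spirit to 'tar rf <archive> -C <dir> <file>'
    tar = tarfile.open(archive_path, 'w:gz')
    try:
        for fname in filenames:
            tar.add(fname, arcname=os.path.basename(fname.rstrip(os.sep)))
    finally:
        tar.close()
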
Ejemplo n.º 60
0
 def kill(self,JobID,useJobsetID=False):
     # get logger
     tmpLog = PLogger.getPandaLogger()
     # check proxy
     self.gridPassPhrase,self.vomsFQAN = PsubUtils.checkGridProxy(
             self.gridPassPhrase,
             False,
             self.verbose,
             useCache=True)
     # force update just in case
     self.status(JobID,True)
     # get jobset
     jobList = self.getJobIDsWithSetID(JobID)
     if jobList == None:
         # works only for jobsetID
         if useJobsetID:
             return
         # works with jobID
         jobList = [JobID]
     else:
         tmpMsg = "ID=%s is composed of JobID=" % JobID
         for tmpJobID in jobList:
             tmpMsg += '%s,' % tmpJobID
         tmpMsg = tmpMsg[:-1]
         tmpLog.info(tmpMsg)
     for tmpJobID in jobList:    
         # get job info from local repository
         job = self.getJobInfo(tmpJobID)
         if job == None:
             tmpLog.warning("JobID=%s not found in local repository. Synchronization may be needed" % tmpJobID)            
             continue
         # skip frozen job
         if job.dbStatus == 'frozen':
             tmpLog.info('All subJobs in JobID=%s already finished/failed' % tmpJobID)
             continue
         if not job.isJEDI():
             # get PandaID list
             killJobs = job.PandaID.split(',')
             # kill
             tmpLog.info('Sending kill command ...')
             status,output = Client.killJobs(killJobs,self.verbose)
             if status != 0:
                 tmpLog.error(output)
                 tmpLog.error("Failed to kill JobID=%s" % tmpJobID)
                 return False
              # mark the job as to-be-killed
              job.commandToPilot = 'tobekilled'
             # update DB
             try:
                 PdbUtils.updateJobDB(job,self.verbose)
             except:
                 tmpLog.error("Failed to update local repository for JobID=%s" % tmpJobID)
                 return False
         else:
             # kill JEDI task
             tmpLog.info('Sending killTask command ...')
             status,output = Client.killTask(job.jediTaskID,self.verbose)
             # communication error
             if status != 0:
                 tmpLog.error(output)
                 tmpLog.error("Failed to kill JobID=%s" % tmpJobID)
                 return False
             tmpStat,tmpDiag = output
             if not tmpStat:
                 tmpLog.error(tmpDiag)
                 tmpLog.error("Failed to kill JobID=%s" % tmpJobID)
                 return False
             tmpLog.info(tmpDiag)
         # done
         if job.isJEDI():
             tmpLog.info('Done. TaskID=%s will be killed in 30min' % job.jediTaskID)
         else:
             tmpLog.info('Done. JobID=%s will be killed in 30min' % tmpJobID)
     return True
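
The non-JEDI branch above boils down to a single Client.killJobs call. A minimal sketch of that call in isolation, assuming a list of PandaID strings (for example job.PandaID.split(',')) and a valid proxy:

from pandatools import Client

def kill_panda_jobs(pandaIDs, verbose=False):
    # pandaIDs: list of PandaID strings, as produced by job.PandaID.split(',') above
    status, output = Client.killJobs(pandaIDs, verbose)
    if status != 0:
        print 'kill request failed:', output
        return False
    return True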