Example #1
    def _setup_bulk_subjobs(self, dirac_ids, dirac_script):
        """
        This is the old bulk submit method which is used to construct the subjobs for a parametric job
        Args:
            dirac_ids (list): This is a list of the Dirac ids which have been created
            dirac_script (str): Name of the dirac script which contains the job jdl
        """
        with open(dirac_script, 'r') as f:
            parametric_datasets = get_parametric_datasets(f.read().split('\n'))
        if len(parametric_datasets) != len(dirac_ids):
            raise BackendError('Dirac', 'Mismatch between the number of datasets defined in the dirac API script and those returned by DIRAC')

        from Ganga.GPIDev.Lib.Job.Job import Job
        master_job = self.getJobObject()
        master_job.subjobs = []
        for i in range(len(dirac_ids)):
            j = Job()
            j.copyFrom(master_job)
            j.splitter = None
            j.backend.id = dirac_ids[i]
            j.id = i
            j.inputdata = self._setup_subjob_dataset(parametric_datasets[i])
            j.status = 'submitted'
            j.time.timenow('submitted')
            master_job.subjobs.append(j)
        return True
Example #2
 def kill(self):
     """ Kill a Dirac jobs"""
     global dirac_ganga_server
     if not self.id: return None
     dirac_cmd = 'result = DiracCommands.kill(%d)' % self.id
     result = dirac_ganga_server.execute(dirac_cmd)
     if not result_ok(result):
         raise BackendError('Dirac', 'Could not kill job: %s' % str(result))
     return result['OK']
Example #3
 def kill(self):
     """ Kill a Dirac jobs"""
     if not self.id:
         return None
     dirac_cmd = 'kill(%d)' % self.id
     result = execute(dirac_cmd)
     if not result_ok(result):
         raise BackendError('Dirac', 'Could not kill job: %s' % str(result))
     return result['OK']
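
The examples on this page repeatedly pass DIRAC return values through result_ok before trusting them. That helper is not shown here; as a minimal sketch (an assumption based on the result['OK'] and result['Value'] accesses above, not the actual Ganga implementation), it amounts to a defensive check of the DIRAC-style reply dictionary:

def result_ok(result):
    """Return True only if result looks like a successful DIRAC reply.

    Sketch only: assumes DIRAC commands return a dict such as
    {'OK': True, 'Value': ...} on success.
    """
    if not isinstance(result, dict):
        return False
    return result.get('OK', False)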
Example #4
def retrievePandaJobs(job, jIDs):
    '''
    Retrieve the Panda jobs associated with the given job definition ids (jIDs).
    '''
    from pandatools import Client

    ick = False
    jstatus = ''
    num_pjobs = 0

    logger.debug("retrievePandaJobs jIDs=%s" % jIDs)

    # get status from Panda server
    rc, jobsStatus = Client.getFullJobStatus(jIDs, False)
    if rc:
        logger.error('Return code %d retrieving job status information.', rc)
        raise BackendError(
            'Jedi', 'Return code %d retrieving job status information.' % rc)

    for status in jobsStatus:
        if not status: continue

        jstatus = status.jobStatus
        if status.jobStatus is None:
            logger.warning('No panda jobs expected')
            job.backend.pandajobs = []

        elif status.jobStatus in [
                "defined", "activated", "running", "failed", "finished",
                "holding", "assigned"
        ]:
            logger.debug('Panda jobs are running')
            logger.debug("PandaID: %d" % status.PandaID)

            pjobj = JediPandaJob()
            pjobj.id = status.PandaID
            pjobj.url = 'http://panda.cern.ch/?job=%d' % status.PandaID
            pjobj.jobSpec = dict(zip(status._attributes, status.values()))
            for k in pjobj.jobSpec.keys():
                if type(pjobj.jobSpec[k]) not in [type(''), type(1)]:
                    pjobj.jobSpec[k] = str(pjobj.jobSpec[k])

            if pjobj not in job.backend.pandajobs:
                job.backend.pandajobs.append(pjobj)
            else:
                logger.debug("Panda job %s already exists locally" % pjobj.id)

            num_pjobs += 1
        else:
            logger.warning(
                "getFullJobStatus returned unsupported status %s for Panda job %s "
                % (status.jobStatus, status.PandaID))

        ick = True

    return (ick, jstatus, num_pjobs)
Example #5
 def command(klass, cmd, soutfile=None, allowed_exit=None):
     if allowed_exit is None:
         allowed_exit = [0]
     rc, soutfile, ef = shell_cmd(cmd, soutfile, allowed_exit)
     if not ef:
         logger.error(
             'Problem submitting batch job. Maybe your chosen batch system is not available or you have configured it wrongly')
         with open(soutfile) as sout_file:
             sout_contents = sout_file.read()
         logger.error(sout_contents)
         # note: the BackendError is constructed here but not raised
         raiseable = BackendError(klass._name, 'It seems that %s commands are not installed properly: %s' % (klass._name, sout_contents))
     return rc, soutfile
Example #6
def createContainer(name):
    from pandatools import Client
    # don't create containers for HC datasets
    if not configPanda['processingType'].startswith(
            'gangarobot') and not configPanda['processingType'].startswith(
                'hammercloud'):
        try:
            Client.createContainer(name, False)
            logger.info('Created output container %s' % name)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in Client.createContainer %s: %s %s' %
                (name, sys.exc_info()[0], sys.exc_info()[1]))
Example #7
def checkForRebrokerage(string):
    import re
    matchObj = re.match(
        r'reassigned to another site by rebrokerage. new PandaID=(\d+) JobsetID=(\d+) JobID=(\d+)',
        string)
    if matchObj:
        newPandaID = long(matchObj.group(1))
        newJobsetID = long(matchObj.group(2))
        newJobID = long(matchObj.group(3))
        return newPandaID
    raise BackendError(
        'Jedi',
        'Error getting new PandaID for rebrokered job. Report to DA Help')
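
For illustration, the pattern above matches rebrokerage messages of the following shape (the sample string below is made up to fit the regular expression, not real Panda output):

if __name__ == '__main__':
    sample = ('reassigned to another site by rebrokerage. '
              'new PandaID=1234567 JobsetID=42 JobID=7')
    print(checkForRebrokerage(sample))  # -> 1234567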
Example #8
    def _submit(self):
        '''Submit the job via the Dirac server.'''
        self.id = None
        self.actualCE = None
        self.status = None
        global dirac_ganga_server
        dirac_cmd = "execfile('%s')" % self._getDiracScript()
        result = dirac_ganga_server.execute(dirac_cmd)
        err_msg = 'Error submitting job to Dirac: %s' % str(result)
        if not result_ok(result) or not result.has_key('Value'):
            logger.error(err_msg)
            raise BackendError('Dirac', err_msg)

        self.id = result['Value']
        j = self.getJobObject()
        gDiracTask.addTaskJob(self.id, j.id)
        return type(self.id) == int
Example #9
    def _common_submit(self, dirac_script):
        '''Submit the job via the Dirac server.
        Args:
            dirac_script (str): filename of the JDL which is to be submitted to DIRAC
        '''
        j = self.getJobObject()
        self.id = None
        self.actualCE = None
        self.status = None
        self.extraInfo = None
        self.statusInfo = ''
        j.been_queued = False
        dirac_cmd = """execfile(\'%s\')""" % dirac_script
        result = execute(dirac_cmd)
        # Could use the below code instead to submit on a thread
        # If submitting many then user may terminate ganga before
        # all jobs submitted
#        def submit_checker(result, job, script):
#            err_msg = 'Error submitting job to Dirac: %s' % str(result)
#            if not result_ok(result) or 'Value' not in result:
#                logger.error(err_msg)
#                raise BackendError('Dirac',err_msg)
#
#            idlist = result['Value']
#            if type(idlist) is list:
#                return job._setup_bulk_subjobs(idlist, script)
#            job.id = idlist
#        server.execute_nonblocking(dirac_cmd, callback_func=submit_checker, args=(self, dirac_script))
#        return True

        err_msg = 'Error submitting job to Dirac: %s' % str(result)
        if not result_ok(result) or 'Value' not in result:
            logger.error(err_msg)
            logger.error("\n\n===\n%s\n===\n" % dirac_script)
            logger.error("\n\n====\n")
            with open(dirac_script, 'r') as file_in:
                logger.error("%s" % file_in.read())
            logger.error("\n====\n")
            raise BackendError('Dirac', err_msg)

        idlist = result['Value']
        if type(idlist) is list:
            return self._setup_bulk_subjobs(idlist, dirac_script)

        self.id = idlist
        return type(self.id) == int
Example #10
    def _setup_bulk_subjobs(self, dirac_ids, dirac_script):
        with open(dirac_script, 'r') as f:
            parametric_datasets = get_parametric_datasets(f.read().split('\n'))
        if len(parametric_datasets) != len(dirac_ids):
            raise BackendError('Dirac', 'Mismatch between the number of datasets defined in the dirac API script and those returned by DIRAC')

        from Ganga.GPIDev.Lib.Job.Job import Job
        master_job = self.getJobObject()
        master_job.subjobs = []
        for i in range(len(dirac_ids)):
            j = Job()
            j.copyFrom(master_job)
            j.splitter = None
            j.backend.id = dirac_ids[i]
            j.id = i
            j.inputdata = self._setup_subjob_dataset(parametric_datasets[i])
            j.status = 'submitted'
            j.time.timenow('submitted')
            master_job.subjobs.append(j)
        master_job._commit()
        return True
Example #11
    def submit(self, subjobconfig, master_input_sandbox):
        """Submit a DIRAC job"""
        j = self.getJobObject()

        dirac_script = subjobconfig.script
        dirac_script.name = mangle_job_name(j)
        dirac_script.settings = self.settings
        dirac_script.dirac_opts = self.diracOpts

        sboxname = j.createPackedInputSandbox(subjobconfig.getSandboxFiles())
        script_file = self._getDiracScript()
        dirac_script.input_sandbox = [
            sboxname[0], master_input_sandbox[0], script_file
        ]
        for lfn in self.inputSandboxLFNs:
            from GangaBoss.Lib.Dataset.PhysicalFile import PhysicalFile
            if type(lfn) is PhysicalFile:
                msg = 'Dirac.inputSandboxLFNs cannot contain a PhysicalFile.'
                logger.error(msg)
                raise BackendError('Dirac', msg)
            dirac_script.input_sandbox.append('LFN:' + lfn.name)
        dirac_script.write(script_file)
        return self._submit()
Example #12
    def checkReport(self, jobDoc):

        job = self.getJobObject()

        config = Config.getConfig('Metrics')
        location = config['location']
        if not os.path.exists(location):
            raise BackendError(0,
                               "Location file %s doesn't exist." % (location))

        config = ConfigParser()
        config.read(location)

        PARAMS = [('status', 'status')]

        if config.has_section('report'):
            PARAMS += config.items('report')
        else:
            logger.warning('No report in metrics')

        for n, v in PARAMS:
            if v:
                job.backend.report[v] = jobDoc.getAttribute(v)
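
checkReport therefore expects an INI-style metrics file whose [report] section maps FJR attribute names to the keys stored in job.backend.report. A small self-contained sketch of that lookup (the file contents and option names below are hypothetical; Python 2 imports to match the code above):

from ConfigParser import ConfigParser
from StringIO import StringIO

# Hypothetical metrics file: [report] maps attribute names to report keys.
metrics_ini = """\
[report]
status = status
ExeExitCode = exit_code
"""

config = ConfigParser()
config.readfp(StringIO(metrics_ini))

PARAMS = [('status', 'status')]
if config.has_section('report'):
    PARAMS += config.items('report')
print(PARAMS)  # (attribute, report_key) pairs; note ConfigParser lowercases option names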
Example #13
    def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):

        logger.debug("Prepare")

        inputsandbox, outputsandbox = sandbox_prepare(app, appsubconfig,
                                                      appmasterconfig,
                                                      jobmasterconfig)

        job = stripProxy(app).getJobObject()

        if job.inputdata:
            if not job.splitter:
                if len(job.inputdata) > 100:
                    raise BackendError(
                        "You're submitting a job to Dirac with no splitter and more than 100 files, please add a splitter and try again!"
                    )

        outputfiles = [
            this_file for this_file in job.outputfiles
            if isType(this_file, DiracFile)
        ]

        data_str = 'import os\n'
        data_str += 'execfile(\'data.py\')\n'

        if hasattr(job, '_splitter_data'):
            data_str += job._splitter_data
        inputsandbox.append(FileBuffer('data-wrapper.py', data_str))

        input_data = []

        # Can't wait to get rid of this when people no longer specify
        # inputdata in options file
        #######################################################################
        # splitters ensure that subjobs pick up inputdata from job over that in
        # optsfiles but need to take care of unsplit jobs
        if not job.master:
            share_path = os.path.join(get_share_path(app), 'inputdata',
                                      'options_data.pkl')

            if not job.inputdata:
                if os.path.exists(share_path):
                    f = open(share_path, 'r+b')
                    job.inputdata = pickle.load(f)
                    f.close()

        #######################################################################

        # Can't wait to get rid of this when people no longer specify
        # outputsandbox or outputdata in options file
        #######################################################################
        share_path = os.path.join(get_share_path(app), 'output',
                                  'options_parser.pkl')

        if os.path.exists(share_path):
            #        if not os.path.exists(share_path):
            # raise GangaException('could not find the parser')
            f = open(share_path, 'r+b')
            parser = pickle.load(f)
            f.close()

            outbox, outdata = parser.get_output(job)

            from Ganga.GPIDev.Lib.File import FileUtils
            from Ganga.GPIDev.Base.Filters import allComponentFilters

            fileTransform = allComponentFilters['gangafiles']
            outdata_files = [
                fileTransform(this_file, None) for this_file in outdata
                if not FileUtils.doesFileExist(this_file, job.outputfiles)
            ]
            job.non_copyable_outputfiles.extend([
                output_file for output_file in outdata_files
                if not isType(output_file, DiracFile)
            ])
            outbox_files = [
                fileTransform(this_file, None) for this_file in outbox
                if not FileUtils.doesFileExist(this_file, job.outputfiles)
            ]
            job.non_copyable_outputfiles.extend([
                outbox_file for outbox_file in outbox_files
                if not isType(outbox_file, DiracFile)
            ])

            outputsandbox = [
                f.namePattern for f in job.non_copyable_outputfiles
            ]

            outputsandbox.extend([
                f.namePattern for f in job.outputfiles
                if not isType(f, DiracFile)
            ])
            outputsandbox = unique(outputsandbox)  # + outbox[:])
        #######################################################################

        input_data_dirac, parametricinput_data = dirac_inputdata(
            job.application)

        if input_data_dirac is not None:
            for f in input_data_dirac:
                if isType(f, DiracFile):
                    input_data.append(f.lfn)
                elif isType(f, str):
                    input_data.append(f)
                else:
                    raise ApplicationConfigurationError(
                        "Don't know How to handle anythig other than DiracFiles or strings to LFNs!"
                    )

        commandline = "python ./gaudipython-wrapper.py"
        if is_gaudi_child(app):
            commandline = 'gaudirun.py '
            commandline += ' '.join([str(arg) for arg in app.args])
            commandline += ' options.pkl data-wrapper.py'
        logger.debug('Command line: %s', commandline)

        gaudi_script_path = os.path.join(job.getInputWorkspace().getPath(),
                                         "gaudi-script.py")

        script_generator(
            gaudi_script_template(),
            #remove_unreplaced = False,
            outputfile_path=gaudi_script_path,
            PLATFORM=app.platform,
            COMMAND=commandline,
            XMLSUMMARYPARSING=getXMLSummaryScript()  # ,
            #OUTPUTFILESINJECTEDCODE = getWNCodeForOutputPostprocessing(job, '    ')
        )

        #logger.debug( "input_data %s" % str( input_data ) )

        # We want to propagate the ancestor depth to DIRAC when we have
        # inputdata set
        if job.inputdata is not None and isType(job.inputdata, LHCbDataset):

            # As the RT Handler we already know we have a Dirac backend
            if type(job.backend.settings) is not dict:
                raise ApplicationConfigurationError(
                    None, 'backend.settings should be a dict')

            if 'AncestorDepth' in job.backend.settings:
                ancestor_depth = job.backend.settings['AncestorDepth']
            else:
                ancestor_depth = job.inputdata.depth
        else:
            ancestor_depth = 0

        lhcbdirac_script_template = lhcbdiracAPI_script_template()

        lhcb_dirac_outputfiles = lhcbdirac_outputfile_jdl(outputfiles)

        # not necessary to use lhcbdiracAPI_script_template any more as doing our own uploads to Dirac
        # remove after Ganga6 release
        # NOTE special case for replicas: replicate string must be empty for no
        # replication
        dirac_script = script_generator(
            lhcbdirac_script_template,
            DIRAC_IMPORT=
            'from LHCbDIRAC.Interfaces.API.DiracLHCb import DiracLHCb',
            DIRAC_JOB_IMPORT=
            'from LHCbDIRAC.Interfaces.API.LHCbJob import LHCbJob',
            DIRAC_OBJECT='DiracLHCb()',
            JOB_OBJECT='LHCbJob()',
            NAME=mangle_job_name(app),
            APP_NAME=stripProxy(app).appname,
            APP_VERSION=app.version,
            APP_SCRIPT=gaudi_script_path,
            APP_LOG_FILE='Ganga_%s_%s.log' %
            (stripProxy(app).appname, app.version),
            INPUTDATA=input_data,
            PARAMETRIC_INPUTDATA=parametricinput_data,
            OUTPUT_SANDBOX=API_nullifier(outputsandbox),
            OUTPUTFILESSCRIPT=lhcb_dirac_outputfiles,
            # job.fqid,#outputdata_path,
            OUTPUT_PATH="",
            OUTPUT_SE=getConfig('DIRAC')['DiracOutputDataSE'],
            SETTINGS=diracAPI_script_settings(job.application),
            DIRAC_OPTS=job.backend.diracOpts,
            PLATFORM=app.platform,
            REPLICATE='True'
            if getConfig('DIRAC')['ReplicateOutputData'] else '',
            ANCESTOR_DEPTH=ancestor_depth,
            ## This is to be modified in the final 'submit' function in the backend
            ## The backend also handles the inputfiles DiracFiles as appropriate
            INPUT_SANDBOX='##INPUT_SANDBOX##')
        logger.debug("prepare: LHCbGaudiDiracRunTimeHandler")

        return StandardJobConfig(dirac_script,
                                 inputbox=unique(inputsandbox),
                                 outputbox=unique(outputsandbox))
Example #14
    def _internal_job_finalisation(job, updated_dirac_status):

        logger = getLogger()

        if updated_dirac_status == 'completed':
            # firstly update job to completing
            DiracBase._getStateTime(job, 'completing')
            if job.status in ['removed', 'killed']:
                return
            if (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us

            job.updateStatus('completing')
            if job.master:
                job.master.updateMasterJobStatus()

            import time
            start = time.time()
            # contact dirac for information
            job.backend.normCPUTime = execute('normCPUTime(%d)' % job.backend.id)
            getSandboxResult = execute("getOutputSandbox(%d,'%s')" % (job.backend.id, job.getOutputWorkspace().getPath()))
            file_info_dict = execute('getOutputDataInfo(%d)' % job.backend.id)
            now = time.time()
            logger.debug('Job ' + job.fqid + ' Time for Dirac metadata : ' + str(now - start))

            logger.debug('Job ' + job.fqid + ' OutputDataInfo: ' + str(file_info_dict))
            logger.debug('Job ' + job.fqid + ' OutputSandbox: ' + str(getSandboxResult))

            # Set DiracFile metadata
            wildcards = [f.namePattern for f in job.outputfiles.get(DiracFile) if regex.search(f.namePattern) is not None]

            with open(os.path.join(job.getOutputWorkspace().getPath(), getConfig('Output')['PostProcessLocationsFileName']), 'ab') as postprocesslocationsfile:
                if not hasattr(file_info_dict, 'keys'):
                    logger.error("Error understanding OutputDataInfo: %s" % str(file_info_dict))
                    from Ganga.Core.exceptions import GangaException
                    raise GangaException("Error understanding OutputDataInfo: %s" % str(file_info_dict))

                for file_name in file_info_dict.get('Value', []):
                    file_name = os.path.basename(file_name)
                    info = file_info_dict.get(file_name)
                    logger.debug("file_name: %s,\tinfo: %s" % (str(file_name), str(info)))

                    valid_wildcards = [wc for wc in wildcards if fnmatch.fnmatch(file_name, wc)]
                    if len(valid_wildcards) == 0: valid_wildcards.append('')

                    if not hasattr(info, 'get'):
                        logger.error("Error getting OutputDataInfo for: %s" % str(job.getFQID('.')))
                        logger.error("Please check the Dirac Job still exists or attempt a job.backend.reset() to try again!")
                        logger.error("Err: %s" % str(info))
                        logger.error("file_info_dict: %s" % str(file_info_dict))
                        from Ganga.Core.exceptions import GangaException
                        raise GangaException("Error getting OutputDataInfo")

                    for wc in valid_wildcards:
                        logger.debug("wildcard: %s" % str(wc))

                        DiracFileData = 'DiracFile:::%s&&%s->%s:::%s:::%s\n' % (wc,
                                                                                file_name,
                                                                                info.get('LFN', 'Error Getting LFN!'),
                                                                                str(info.get('LOCATIONS', ['NotAvailable'])),
                                                                                info.get('GUID', 'NotAvailable')
                                                                                )
                        logger.debug("DiracFileData: %s" % str(DiracFileData))
                        postprocesslocationsfile.write(DiracFileData)

            # check outputsandbox downloaded correctly
            if not result_ok(getSandboxResult):
                logger.warning('Problem retrieving outputsandbox: %s' % str(getSandboxResult))
                DiracBase._getStateTime(job, 'failed')
                if job.status in ['removed', 'killed']:
                    return
                if (job.master and job.master.status in ['removed', 'killed']):
                    return  # user changed it under us
                job.updateStatus('failed')
                if job.master:
                    job.master.updateMasterJobStatus()
                raise BackendError('Problem retrieving outputsandbox: %s' % str(getSandboxResult))

            # finally update job to completed
            DiracBase._getStateTime(job, 'completed')
            if job.status in ['removed', 'killed']:
                return
            if (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('completed')
            if job.master:
                job.master.updateMasterJobStatus()
            now = time.time()
            logger.debug('Job ' + job.fqid + ' Time for complete update : ' + str(now - start))

        elif updated_dirac_status == 'failed':
            # firstly update status to failed
            DiracBase._getStateTime(job, 'failed')
            if job.status in ['removed', 'killed']:
                return
            if (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('failed')
            if job.master:
                job.master.updateMasterJobStatus()

            # if requested try downloading outputsandbox anyway
            if getConfig('DIRAC')['failed_sandbox_download']:
                execute("getOutputSandbox(%d,'%s')" %
                        (job.backend.id, job.getOutputWorkspace().getPath()))
        else:
            logger.error("Unexpected dirac status '%s' encountered" % updated_dirac_status)
Example #15
    def _internal_job_finalisation(job, updated_dirac_status):
        """
        This method performs the main job finalisation
        Args:
            job (Job): This is the job we want to finalise
            updated_dirac_status (str): String representing the Ganga finalisation state of the job ('failed' or 'completed')
        """

        if updated_dirac_status == 'completed':
            start = time.time()
            # firstly update job to completing
            DiracBase._getStateTime(job, 'completing')
            if job.status in ['removed', 'killed']:
                return
            elif (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us

            job.updateStatus('completing')
            if job.master:
                job.master.updateMasterJobStatus()

            output_path = job.getOutputWorkspace().getPath()

            logger.info('Contacting DIRAC for job: %s' % job.fqid)
            # Contact dirac which knows about the job
            job.backend.normCPUTime, getSandboxResult, file_info_dict, completeTimeResult = execute("finished_job(%d, '%s')" % (job.backend.id, output_path))

            now = time.time()
            logger.info('%0.2fs taken to download output from DIRAC for Job %s' % ((now - start), job.fqid))

            #logger.info('Job ' + job.fqid + ' OutputDataInfo: ' + str(file_info_dict))
            #logger.info('Job ' + job.fqid + ' OutputSandbox: ' + str(getSandboxResult))
            #logger.info('Job ' + job.fqid + ' normCPUTime: ' + str(job.backend.normCPUTime))

            # Set DiracFile metadata
            wildcards = [f.namePattern for f in job.outputfiles.get(DiracFile) if regex.search(f.namePattern) is not None]

            lfn_store = os.path.join(output_path, getConfig('Output')['PostProcessLocationsFileName'])

            # Make the file on disk with a nullop...
            if not os.path.isfile(lfn_store):
                with open(lfn_store, 'w'):
                    pass

            if job.outputfiles.get(DiracFile):

                # Now we can iterate over the contents of the file without touching it
                with open(lfn_store, 'ab') as postprocesslocationsfile:
                    if not hasattr(file_info_dict, 'keys'):
                        logger.error("Error understanding OutputDataInfo: %s" % str(file_info_dict))
                        from Ganga.Core.exceptions import GangaException
                        raise GangaException("Error understanding OutputDataInfo: %s" % str(file_info_dict))

                    ## Caution: it is not clear at the moment whether this 'Value' is an LHCb-ism or a bug
                    list_of_files = file_info_dict.get('Value', file_info_dict.keys())

                    for file_name in list_of_files:
                        file_name = os.path.basename(file_name)
                        info = file_info_dict.get(file_name)
                        #logger.debug("file_name: %s,\tinfo: %s" % (str(file_name), str(info)))

                        if not hasattr(info, 'get'):
                            logger.error("Error getting OutputDataInfo for: %s" % str(job.getFQID('.')))
                            logger.error("Please check the Dirac Job still exists or attempt a job.backend.reset() to try again!")
                            logger.error("Err: %s" % str(info))
                            logger.error("file_info_dict: %s" % str(file_info_dict))
                            from Ganga.Core.exceptions import GangaException
                            raise GangaException("Error getting OutputDataInfo")

                        valid_wildcards = [wc for wc in wildcards if fnmatch.fnmatch(file_name, wc)]
                        if not valid_wildcards:
                            valid_wildcards.append('')

                        for wc in valid_wildcards:
                            #logger.debug("wildcard: %s" % str(wc))

                            DiracFileData = 'DiracFile:::%s&&%s->%s:::%s:::%s\n' % (wc,
                                                                                    file_name,
                                                                                    info.get('LFN', 'Error Getting LFN!'),
                                                                                    str(info.get('LOCATIONS', ['NotAvailable'])),
                                                                                    info.get('GUID', 'NotAvailable')
                                                                                    )
                            #logger.debug("DiracFileData: %s" % str(DiracFileData))
                            postprocesslocationsfile.write(DiracFileData)
                            postprocesslocationsfile.flush()

                logger.debug("Written: %s" % open(lfn_store, 'r').readlines())

            # check outputsandbox downloaded correctly
            if not result_ok(getSandboxResult):
                logger.warning('Problem retrieving outputsandbox: %s' % str(getSandboxResult))
                DiracBase._getStateTime(job, 'failed')
                if job.status in ['removed', 'killed']:
                    return
                elif (job.master and job.master.status in ['removed', 'killed']):
                    return  # user changed it under us
                job.updateStatus('failed')
                if job.master:
                    job.master.updateMasterJobStatus()
                raise BackendError('Problem retrieving outputsandbox: %s' % str(getSandboxResult))

            # finally update job to completed
            DiracBase._getStateTime(job, 'completed', completeTimeResult)
            if job.status in ['removed', 'killed']:
                return
            elif (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('completed')
            if job.master:
                job.master.updateMasterJobStatus()
            now = time.time()
            logger.debug('Job ' + job.fqid + ' Time for complete update : ' + str(now - start))

        elif updated_dirac_status == 'failed':
            # firstly update status to failed
            DiracBase._getStateTime(job, 'failed')
            if job.status in ['removed', 'killed']:
                return
            if (job.master and job.master.status in ['removed', 'killed']):
                return  # user changed it under us
            job.updateStatus('failed')
            if job.master:
                job.master.updateMasterJobStatus()

            # if requested try downloading outputsandbox anyway
            if configDirac['failed_sandbox_download']:
                execute("getOutputSandbox(%d,'%s')" % (job.backend.id, job.getOutputWorkspace().getPath()))
        else:
            logger.error("Job #%s Unexpected dirac status '%s' encountered" % (job.getFQID('.'), updated_dirac_status))
Example #16
    def _resubmit(self):
        """Resubmit a DIRAC job"""
        j = self.getJobObject()
        parametric = False
        script_path = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py')
        # Check old script
        if j.master is None and not os.path.exists(script_path):
            raise BackendError('Dirac', 'No "dirac-script.py" found in j.inputdir')

        if j.master is not None and not os.path.exists(script_path):
            script_path = os.path.join(
                j.master.getInputWorkspace().getPath(), 'dirac-script.py')
            if not os.path.exists(script_path):
                raise BackendError('Dirac', 'No "dirac-script.py" found in j.inputdir or j.master.inputdir')
            parametric = True

        # Read old script
        with open(script_path, 'r') as f:
            script = f.read()

        # Create new script - ##note instead of using get_parametric_dataset
        # could just use j.inputdata.
        if parametric is True:
            parametric_datasets = get_parametric_datasets(script.split('\n'))
            if j.master:
                if len(parametric_datasets) != len(j.master.subjobs):
                    raise BackendError('Dirac', 'number of parametric datasets defined in API script doesn\'t match number of master.subjobs')
            if j.inputdata and len(j.inputdata) > 0:
                _input_files = [f for f in j.inputdata if not isType(f, DiracFile)]
            else:
                _input_files = []
            if set(parametric_datasets[j.id]).symmetric_difference(set([f.namePattern for f in _input_files])):
                raise BackendError(
                    'Dirac', 'Mismatch between dirac-script and job attributes.')
            script = script.replace('.setParametricInputData(%s)' % str(parametric_datasets),
                                    '.setInputData(%s)' % str(parametric_datasets[j.id]))
            script = script.replace('%n', str(j.id))  # name

        start_user_settings = '# <-- user settings\n'
        new_script = script[
            :script.find(start_user_settings) + len(start_user_settings)]

        job_ident = get_job_ident(script.split('\n'))
        for key, value in self.settings.iteritems():
            if str(key).startswith('set'):
                _key = key[3:]
            else:
                _key = key
            if type(value) is str:
                template = '%s.set%s("%s")\n'
            else:
                template = '%s.set%s(%s)\n'
            new_script += template % (job_ident, str(_key), str(value))
        new_script += script[script.find('# user settings -->'):]

        # Save new script
        new_script_filename = os.path.join(j.getInputWorkspace().getPath(), 'dirac-script.py')
        with open(new_script_filename, 'w') as f:
            f.write(new_script)
        return self._common_submit(new_script_filename)
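
The settings loop in _resubmit above turns each backend setting into a '<job_ident>.set<Key>(value)' line appended after the user-settings marker. A standalone illustration of that mapping (the job identifier and settings values below are made up):

def render_settings(job_ident, settings):
    """Sketch of the template expansion performed in _resubmit above."""
    lines = ''
    for key, value in settings.items():
        _key = key[3:] if str(key).startswith('set') else key
        template = '%s.set%s("%s")\n' if type(value) is str else '%s.set%s(%s)\n'
        lines += template % (job_ident, str(_key), str(value))
    return lines

print(render_settings('j', {'CPUTime': 172800, 'setDestination': 'LCG.CERN.ch'}))
# prints (order may vary):
#   j.setCPUTime(172800)
#   j.setDestination("LCG.CERN.ch")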
Example #17
    def master_prepare(self, app, appconfig):
        '''Prepare the master job'''

        from pandatools import Client
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec

        job = app._getParent()
        logger.debug('ExecutablePandaRTHandler master_prepare called for %s',
                     job.getFQID('.'))

        # set chirp variables
        if configPanda['chirpconfig'] or configPanda['chirpserver']:
            setChirpVariables()

#       Pack inputsandbox
        inputsandbox = 'sources.%s.tar' % commands.getoutput(
            'uuidgen 2> /dev/null')
        inpw = job.getInputWorkspace()
        # add user script to inputsandbox
        if hasattr(job.application.exe, "name"):
            if job.application.exe not in job.inputsandbox:
                job.inputsandbox.append(job.application.exe)

        for fname in [f.name for f in job.inputsandbox]:
            fname = fname.rstrip(os.sep)
            path = fname[:fname.rfind(os.sep)]
            f = fname[fname.rfind(os.sep) + 1:]
            rc, output = commands.getstatusoutput(
                'tar rf %s -C %s %s' % (inpw.getPath(inputsandbox), path, f))
            if rc:
                logger.error('Packing inputsandbox failed with status %d', rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    None, 'Packing inputsandbox failed.')
        if len(job.inputsandbox) > 0:
            rc, output = commands.getstatusoutput('gzip %s' %
                                                  (inpw.getPath(inputsandbox)))
            if rc:
                logger.error('Packing inputsandbox failed with status %d', rc)
                logger.error(output)
                raise ApplicationConfigurationError(
                    None, 'Packing inputsandbox failed.')
            inputsandbox += ".gz"
        else:
            inputsandbox = None

#       Upload Inputsandbox
        if inputsandbox:
            logger.debug('Uploading source tarball ...')
            uploadSources(inpw.getPath(), os.path.basename(inputsandbox))
            self.inputsandbox = inputsandbox
        else:
            self.inputsandbox = None

#       input dataset
        if job.inputdata:
            if job.inputdata._name != 'DQ2Dataset':
                raise ApplicationConfigurationError(
                    None, 'PANDA application supports only DQ2Datasets')

        # run brokerage here if not splitting
        if not job.splitter:
            from GangaPanda.Lib.Panda.Panda import runPandaBrokerage
            runPandaBrokerage(job)
        elif job.splitter._name not in [
                'DQ2JobSplitter', 'ArgSplitter', 'ArgSplitterTask'
        ]:
            raise ApplicationConfigurationError(
                None, 'Panda splitter must be DQ2JobSplitter or ArgSplitter')

        if job.backend.site == 'AUTO':
            raise ApplicationConfigurationError(
                None, 'site is still AUTO after brokerage!')

#       output dataset
        if job.outputdata:
            if job.outputdata._name != 'DQ2OutputDataset':
                raise ApplicationConfigurationError(
                    None, 'Panda backend supports only DQ2OutputDataset')
        else:
            logger.info('Adding missing DQ2OutputDataset')
            job.outputdata = DQ2OutputDataset()

        job.outputdata.datasetname, outlfn = dq2outputdatasetname(
            job.outputdata.datasetname, job.id, job.outputdata.isGroupDS,
            job.outputdata.groupname)

        self.outDsLocation = Client.PandaSites[job.backend.site]['ddm']

        try:
            Client.addDataset(job.outputdata.datasetname,
                              False,
                              location=self.outDsLocation)
            logger.info('Output dataset %s registered at %s' %
                        (job.outputdata.datasetname, self.outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname,
                                     location=self.outDsLocation)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in Client.addDataset %s: %s %s' %
                (job.outputdata.datasetname, sys.exc_info()[0],
                 sys.exc_info()[1]))

        # handle the libds
        if job.backend.libds:
            self.libDataset = job.backend.libds
            self.fileBO = getLibFileSpecFromLibDS(self.libDataset)
            self.library = self.fileBO.lfn
        elif job.backend.bexec:
            self.libDataset = job.outputdata.datasetname + '.lib'
            self.library = '%s.tgz' % self.libDataset
            try:
                Client.addDataset(self.libDataset,
                                  False,
                                  location=self.outDsLocation)
                dq2_set_dataset_lifetime(self.libDataset,
                                         location=self.outDsLocation)
                logger.info('Lib dataset %s registered at %s' %
                            (self.libDataset, self.outDsLocation))
            except exceptions.SystemExit:
                raise BackendError(
                    'Panda', 'Exception in Client.addDataset %s: %s %s' %
                    (self.libDataset, sys.exc_info()[0], sys.exc_info()[1]))

        # collect extOutFiles
        self.extOutFile = []
        for tmpName in job.outputdata.outputdata:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.outputsandbox:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        for tmpName in job.backend.extOutFile:
            if tmpName != '':
                self.extOutFile.append(tmpName)

        # create build job
        if job.backend.bexec != '':
            jspec = JobSpec()
            jspec.jobDefinitionID = job.id
            jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
            jspec.transformation = '%s/buildGen-00-00-01' % Client.baseURLSUB
            if Client.isDQ2free(job.backend.site):
                jspec.destinationDBlock = '%s/%s' % (
                    job.outputdata.datasetname, self.libDataset)
                jspec.destinationSE = 'local'
            else:
                jspec.destinationDBlock = self.libDataset
                jspec.destinationSE = job.backend.site
            jspec.prodSourceLabel = configPanda['prodSourceLabelBuild']
            jspec.processingType = configPanda['processingType']
            jspec.assignedPriority = configPanda['assignedPriorityBuild']
            jspec.computingSite = job.backend.site
            jspec.cloud = job.backend.requirements.cloud
            jspec.jobParameters = '-o %s' % (self.library)
            if self.inputsandbox:
                jspec.jobParameters += ' -i %s' % (self.inputsandbox)
            else:
                raise ApplicationConfigurationError(
                    None,
                    'Executable on Panda with build job defined, but inputsandbox is empty!'
                )
            matchURL = re.search('(http.*://[^/]+)/', Client.baseURLCSRVSSL)
            if matchURL:
                jspec.jobParameters += ' --sourceURL %s ' % matchURL.group(1)
            if job.backend.bexec != '':
                jspec.jobParameters += ' --bexec "%s" ' % urllib.quote(
                    job.backend.bexec)
                jspec.jobParameters += ' -r %s ' % '.'

            fout = FileSpec()
            fout.lfn = self.library
            fout.type = 'output'
            fout.dataset = self.libDataset
            fout.destinationDBlock = self.libDataset
            jspec.addFile(fout)

            flog = FileSpec()
            flog.lfn = '%s.log.tgz' % self.libDataset
            flog.type = 'log'
            flog.dataset = self.libDataset
            flog.destinationDBlock = self.libDataset
            jspec.addFile(flog)
            return jspec
        else:
            return None
Example #18
    def parseResults(self):

        job = self.getJobObject()

        server = CRABServer()
        try:
            server.status(job)
            server.getOutput(job)
        except:
            logger.error('Could not get the output of the job.')
            # Let's not raise this yet (in case of a double call).
            # raise CRABServerError('Impossible to get the output of the job')

        workdir = job.inputdata.ui_working_dir
        index = int(job.id) + 1
        doc_path = '%s/res/crab_fjr_%d.xml' % (workdir, index)

        if not os.path.exists(doc_path):
            logger.error('FJR %s not found.' % (doc_path))
            return

        try:
            doc = parse(doc_path)
        except:
            logger.error("Could not parse document. File not present?")
            return
        status = doc.firstChild.getAttribute("Status")

        if status in ["Failed"]:
            self.postMortem(job)
            job.updateStatus('failed')
        elif status in ["Success"]:
            if job.status == 'submitting':
                job.updateStatus('submitted')
            job.updateStatus('completed')
        else:
            logger.warning("UNKNOWN PARSE STATUS: " + str(status))

        config = Config.getConfig('Metrics')
        location = config['location']
        if not os.path.exists(location):
            raise BackendError(0,
                               "Location file %s doesn't exist." % (location))

        config = ConfigParser()
        config.read(location)

        #Iterate over all them
        SECTIONS = config.sections()
        if 'report' in SECTIONS:
            SECTIONS.remove('report')

        # Only five sections work here...
        for section in SECTIONS:

            if not job.backend.fjr.has_key(section):
                job.backend.fjr[section] = {}

            performancereport = doc.getElementsByTagName(
                "PerformanceReport")[0]
            performancesummary = performancereport.getElementsByTagName(
                "PerformanceSummary")
            for pfs in performancesummary:
                if pfs.getAttribute("Metric") == section:
                    metrics = pfs.getElementsByTagName("Metric")
                    for metric in metrics:
                        name = metric.getAttribute("Name")
                        if config.has_option(section, name):
                            # Because the names use minus instead of underscore, we have to do this workaround
                            # to send them to the DB.
                            name = config.get(section, name)
                            if name:
                                job.backend.fjr[section][
                                    name] = metric.getAttribute("Value")
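
For reference, parseResults navigates a CRAB framework job report shaped roughly as below; the element lookups are taken from the code above, while the root tag name, section name and metric values are illustrative placeholders:

from xml.dom.minidom import parseString

sample_fjr = """<FrameworkJobReport Status="Success">
  <PerformanceReport>
    <PerformanceSummary Metric="SomeSection">
      <Metric Name="SomeMetric" Value="42"/>
    </PerformanceSummary>
  </PerformanceReport>
</FrameworkJobReport>"""

doc = parseString(sample_fjr)
print(doc.firstChild.getAttribute("Status"))  # Success
summary = doc.getElementsByTagName("PerformanceReport")[0] \
             .getElementsByTagName("PerformanceSummary")[0]
print(summary.getAttribute("Metric"))  # SomeSection
metric = summary.getElementsByTagName("Metric")[0]
print("%s = %s" % (metric.getAttribute("Name"), metric.getAttribute("Value")))  # SomeMetric = 42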
Example #19
    def master_updateMonitoringInformation(jobs):
        '''Monitor jobs'''
        from pandatools import Client

        #active_status = [ None, 'defined', 'unknown', 'assigned', 'waiting', 'activated', 'sent', 'starting', 'running', 'holding', 'transferring' ]

        submitting_status = []
        active_status = [
            None, 'registered', 'waiting', 'defined', 'pending', 'assigning',
            'ready', 'scouting', 'running', 'holding', 'merging', 'prepared',
            'aborting', 'finishing'
        ]

        inactive_status = ['finished', 'aborted', 'broken', 'failed', 'done']

        # Find jobs to be monitored
        jobdict = {}
        for job in jobs:
            # add a delay as Panda can be a little slow in sorting out a new Task
            if job.backend.id and job.backend.status in active_status and (
                (datetime.datetime.utcnow() -
                 job.time.timestamps["submitted"]).seconds > 120):
                jobdict[job.backend.id] = job

        logger.debug("jobdict = %s" % jobdict)

        # Monitor active Jedi tasks
        allJobIDs = jobdict.keys()
        pandaJobIDs = {}
        for jID in allJobIDs:
            status, jediTaskDict = Client.getJediTaskDetails(
                {'jediTaskID': jID}, False, True, verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                #raise BackendError('Jedi','Return code %d retrieving job status information.' % status)
                continue
            # Retrieve job
            job = jobdict[jediTaskDict['jediTaskID']]
            # Store associated Panda jobs
            if job.backend.pandajobs:
                pandaJobIDs[job.backend.id] = [
                    pj.id for pj in job.backend.pandajobs
                ]
            else:
                pandaJobIDs[
                    jediTaskDict['jediTaskID']] = jediTaskDict['PandaID']
            logger.debug("jID = %s, pandaJobIDs = %s" % (jID, pandaJobIDs))

            # Fill the output data dataset list
            if 'outDS' in jediTaskDict and jediTaskDict['outDS'] != '':
                for ds in jediTaskDict['outDS'].split(','):
                    if ds not in job.outputdata.datasetList:
                        job.outputdata.datasetList.append(ds)

            # Jedi job status has changed
            if job.backend.status != jediTaskDict['status']:
                logger.debug('Job %s has changed status from %s to %s',
                             job.getFQID('.'), job.backend.status,
                             jediTaskDict['status'])
                job.backend.status = jediTaskDict['status']
                job.backend.reason = jediTaskDict['statistics']

                # Now update Jedi job status
                if jediTaskDict['status'] in [
                        'registered', 'waiting', 'defined', 'pending',
                        'assigning', 'ready'
                ]:
                    job.updateStatus('submitted')
                elif jediTaskDict['status'] in [
                        'scouting', 'running', 'holding', 'merging', 'prepared'
                ]:
                    job.updateStatus('running')
                elif jediTaskDict['status'] in ['done']:
                    job.updateStatus('completed')
                elif jediTaskDict['status'] in ['failed', 'finished']:
                    job.updateStatus('failed')
                elif jediTaskDict['status'] in [
                        'aborted', 'broken', 'cancelled'
                ] and job.status not in ['completed', 'failed']:
                    job.updateStatus('killed')
                else:
                    logger.warning('Unexpected Jedi task status %s',
                                   jediTaskDict['status'])

            # Check if associated Panda job exist and monitor them
            if not job.backend.pandajobs:
                jdefids = pandaJobIDs[jID]
                # skip if there are no Panda jobs yet
                if not jdefids:
                    continue
                tot_num_mjobs = 0

                do_master_update = True
                ick, status, num_mjobs = retrievePandaJobs(job, jdefids)
                logger.debug('retrievePandaJobs returns: %s %s' %
                             (repr(ick), status))
                if not ick:
                    logger.debug(
                        'Panda job retrieval failure for Jedi task %s with PandaIds %s'
                        % (job.backend.id, jdefids))
                    do_master_update = False

                tot_num_mjobs += num_mjobs
                logger.debug('Job %s retrieved %d Panda jobs' %
                             (job.getFQID('.'), tot_num_mjobs))
            # Now monitor the already attached Panda jobs
            else:
                jdefids = [pj.id for pj in job.backend.pandajobs]
                rc, jobsStatus = Client.getFullJobStatus(jdefids, False)
                if rc:
                    logger.error(
                        'Return code %d retrieving job status information.',
                        rc)
                    raise BackendError(
                        'Jedi',
                        'Return code %d retrieving job status information.' %
                        rc)

                for status in jobsStatus:
                    if not status: continue

                    for pjob in job.backend.pandajobs:
                        if pjob.id == status.PandaID:
                            # skip if no status change
                            if pjob.status == status.jobStatus:
                                continue
                            # Else update job record
                            pjob.jobSpec = dict(
                                zip(status._attributes, status.values()))

                            for k in pjob.jobSpec.keys():
                                if type(pjob.jobSpec[k]) not in [
                                        type(''), type(1)
                                ]:
                                    pjob.jobSpec[k] = str(pjob.jobSpec[k])

                            logger.debug(
                                'Job %s with Panda job %s has changed status from %s to %s',
                                job.getFQID('.'), pjob.id, pjob.status,
                                status.jobStatus)
                            pjob.status = status.jobStatus
                            pjob.exitcode = str(status.transExitCode)
                            pjob.piloterrorcode = str(status.pilotErrorCode)
                            pjob.reason = ''
                            for k in pjob.jobSpec.keys():
                                if k.endswith('ErrorDiag'
                                              ) and pjob.jobSpec[k] != 'NULL':
                                    pjob.reason += '%s: %s, ' % (
                                        k, str(pjob.jobSpec[k]))
                            #if job.backend.jobSpec['transExitCode'] != 'NULL':
                            pjob.reason += 'transExitCode: %s' % pjob.jobSpec[
                                'transExitCode']

                            if status.jobStatus in [
                                    'defined', 'unknown', 'assigned',
                                    'waiting', 'activated', 'sent'
                            ]:
                                logger.debug('Panda job %s %s' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus in [
                                    'starting', 'running', 'holding',
                                    'transferring', 'merging'
                            ]:
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus in ['finished']:
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                            elif status.jobStatus == 'failed':
                                logger.debug('Panda job %s %s ' %
                                             (pjob.id, status.jobStatus))
                                # check for server side retry
                                if 'taskBufferErrorDiag' in pjob.jobSpec and pjob.jobSpec[
                                        'taskBufferErrorDiag'].find(
                                            "PandaID=") != -1:
                                    # grab the new panda ID
                                    newPandaID = long(
                                        pjob.jobSpec['taskBufferErrorDiag'].
                                        split("=")[1])
                                    pjob.id = newPandaID
                                    pjob.status = None
                                    pjob.url = 'http://panda.cern.ch/?job=%d' % newPandaID
                            elif status.jobStatus == 'cancelled' and pjob.status not in [
                                    'completed', 'failed'
                            ]:  # bug 67716
                                logger.debug('Panda job %s cancelled' %
                                             pjob.id)
                                if 'taskBufferErrorDiag' in pjob.jobSpec and "rebrokerage" in pjob.jobSpec[
                                        'taskBufferErrorDiag']:
                                    newPandaID = checkForRebrokerage(
                                        pjob.jobSpec['taskBufferErrorDiag'])
                                    logger.warning(
                                        "Subjob rebrokered by Panda server. Job %d moved to %d."
                                        % (pjob.id, newPandaID))
                                    pjob.id = newPandaID
                                    pjob.status = None
                            else:
                                logger.warning('Unexpected job status %s',
                                               status.jobStatus)
    def prepare(self, app, appsubconfig, appmasterconfig, jobmasterconfig):
        """Prepare the specific aspec of each subjob.
           Returns: subjobconfig list of objects understood by backends."""

        from pandatools import Client
        from pandatools import AthenaUtils
        from taskbuffer.JobSpec import JobSpec
        from taskbuffer.FileSpec import FileSpec
        from GangaAtlas.Lib.ATLASDataset.DQ2Dataset import dq2_set_dataset_lifetime
        from GangaPanda.Lib.Panda.Panda import refreshPandaSpecs

        # make sure we have the correct siteType
        refreshPandaSpecs()

        job = app._getParent()
        masterjob = job._getRoot()

        logger.debug('ProdTransPandaRTHandler prepare called for %s',
                     job.getFQID('.'))

        job.backend.actualCE = job.backend.site
        job.backend.requirements.cloud = Client.PandaSites[
            job.backend.site]['cloud']

        # check that the site is in a submit-able status
        if not job.splitter or job.splitter._name != 'DQ2JobSplitter':
            allowed_sites = job.backend.list_ddm_sites()

        try:
            outDsLocation = Client.PandaSites[job.backend.site]['ddm']
            tmpDsExist = False
            if (configPanda['processingType'].startswith('gangarobot') or
                    configPanda['processingType'].startswith('hammercloud')):
                #if Client.getDatasets(job.outputdata.datasetname):
                if getDatasets(job.outputdata.datasetname):
                    tmpDsExist = True
                    logger.info('Re-using output dataset %s' %
                                job.outputdata.datasetname)
            if (configPanda['specialHandling'] != 'ddm:rucio'
                    and not configPanda['processingType'].startswith('gangarobot')
                    and not configPanda['processingType'].startswith('hammercloud')
                    and not configPanda['processingType'].startswith('rucio_test')):
                Client.addDataset(job.outputdata.datasetname,
                                  False,
                                  location=outDsLocation,
                                  allowProdDisk=True,
                                  dsExist=tmpDsExist)
            logger.info('Output dataset %s registered at %s' %
                        (job.outputdata.datasetname, outDsLocation))
            dq2_set_dataset_lifetime(job.outputdata.datasetname, outDsLocation)
        except exceptions.SystemExit:
            raise BackendError(
                'Panda', 'Exception in adding dataset %s: %s %s' %
                (job.outputdata.datasetname, sys.exc_info()[0],
                 sys.exc_info()[1]))

        # JobSpec.
        jspec = JobSpec()
        jspec.currentPriority = app.priority
        jspec.jobDefinitionID = masterjob.id
        jspec.jobName = commands.getoutput('uuidgen 2> /dev/null')
        jspec.coreCount = app.core_count
        jspec.AtlasRelease = 'Atlas-%s' % app.atlas_release
        jspec.homepackage = app.home_package
        jspec.transformation = app.transformation
        jspec.destinationDBlock = job.outputdata.datasetname
        if job.outputdata.location:
            jspec.destinationSE = job.outputdata.location
        else:
            jspec.destinationSE = job.backend.site
        if job.inputdata:
            jspec.prodDBlock = job.inputdata.dataset[0]
        else:
            jspec.prodDBlock = 'NULL'
        if app.prod_source_label:
            jspec.prodSourceLabel = app.prod_source_label
        else:
            jspec.prodSourceLabel = configPanda['prodSourceLabelRun']
        jspec.processingType = configPanda['processingType']
        jspec.specialHandling = configPanda['specialHandling']
        jspec.computingSite = job.backend.site
        jspec.cloud = job.backend.requirements.cloud
        jspec.cmtConfig = app.atlas_cmtconfig
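        # resolve a symbolic 'LATEST' DBRelease into a concrete dataset/tarball pair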
        if app.dbrelease == 'LATEST':
            try:
                latest_dbrelease = getLatestDBReleaseCaching()
            except Exception:
                from pandatools import Client
                latest_dbrelease = Client.getLatestDBRelease()
            m = re.search(r'(.*):DBRelease-(.*)\.tar\.gz', latest_dbrelease)
            if m:
                self.dbrelease_dataset = m.group(1)
                self.dbrelease = m.group(2)
            else:
                raise ApplicationConfigurationError(
                    None,
                    "Error retrieving LATEST DBRelease. Try setting application.dbrelease manually."
                )
        else:
            self.dbrelease_dataset = app.dbrelease_dataset
            self.dbrelease = app.dbrelease
        jspec.jobParameters = app.job_parameters

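        # attach the DBRelease tarball as an extra input file unless the 'current' release is requested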
        if self.dbrelease:
            if self.dbrelease == 'current':
                jspec.jobParameters += ' --DBRelease=current'
            else:
                if jspec.transformation.endswith(
                        "_tf.py") or jspec.transformation.endswith("_tf"):
                    jspec.jobParameters += ' --DBRelease=DBRelease-%s.tar.gz' % (
                        self.dbrelease, )
                else:
                    jspec.jobParameters += ' DBRelease=DBRelease-%s.tar.gz' % (
                        self.dbrelease, )
                dbspec = FileSpec()
                dbspec.lfn = 'DBRelease-%s.tar.gz' % self.dbrelease
                dbspec.dataset = self.dbrelease_dataset
                dbspec.prodDBlock = jspec.prodDBlock
                dbspec.type = 'input'
                jspec.addFile(dbspec)

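        # derive the run number from the input dataset name; the usual ATLAS naming
        # (project.runNumber.streamName.prodStep.dataType.version) is assumed here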
        if job.inputdata:
            m = re.search(r'(.*)\.(.*)\.(.*)\.(.*)\.(.*)\.(.*)', job.inputdata.dataset[0])
            if not m:
                logger.error("Error retrieving run number from dataset name")
                #raise ApplicationConfigurationError(None, "Error retrieving run number from dataset name")
                runnumber = 105200  # fall back to a dummy run number
            else:
                runnumber = int(m.group(2))
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --runNumber %d' % runnumber
            else:
                jspec.jobParameters += ' RunNumber=%d' % runnumber

        # Output files.
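        # optionally salt each output LFN with the site, a timestamp and a short uuid fragment to avoid name clashes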
        randomized_lfns = []
        ilfn = 0
        for lfn, lfntype in zip(app.output_files, app.output_type):
            ofspec = FileSpec()
            if app.randomize_lfns:
                randomized_lfn = lfn + (
                    '.%s.%d.%s' %
                    (job.backend.site, int(time.time()),
                     commands.getoutput('uuidgen 2> /dev/null')[:4]))
            else:
                randomized_lfn = lfn
            ofspec.lfn = randomized_lfn
            randomized_lfns.append(randomized_lfn)
            ofspec.destinationDBlock = jspec.destinationDBlock
            ofspec.destinationSE = jspec.destinationSE
            ofspec.dataset = jspec.destinationDBlock
            ofspec.type = 'output'
            jspec.addFile(ofspec)
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --output%sFile %s' % (
                    lfntype, randomized_lfns[ilfn])
            else:
                jspec.jobParameters += ' output%sFile=%s' % (
                    lfntype, randomized_lfns[ilfn])
            ilfn = ilfn + 1

        # Input files.
        if job.inputdata:
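            # copy each input file's metadata (GUID, size, checksum, scope) into a FileSpec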
            for guid, lfn, size, checksum, scope in zip(
                    job.inputdata.guids, job.inputdata.names,
                    job.inputdata.sizes, job.inputdata.checksums,
                    job.inputdata.scopes):
                ifspec = FileSpec()
                ifspec.lfn = lfn
                ifspec.GUID = guid
                ifspec.fsize = size
                ifspec.md5sum = checksum
                ifspec.scope = scope
                ifspec.dataset = jspec.prodDBlock
                ifspec.prodDBlock = jspec.prodDBlock
                ifspec.type = 'input'
                jspec.addFile(ifspec)
            if app.input_type:
                itype = app.input_type
            else:
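                # fall back to the data type field parsed from the dataset name (assumes the regex above matched)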
                itype = m.group(5)
            if jspec.transformation.endswith(
                    "_tf.py") or jspec.transformation.endswith("_tf"):
                jspec.jobParameters += ' --input%sFile %s' % (itype, ','.join(
                    job.inputdata.names))
            else:
                jspec.jobParameters += ' input%sFile=%s' % (itype, ','.join(
                    job.inputdata.names))

        # Log files.
        lfspec = FileSpec()
        lfspec.lfn = '%s.job.log.tgz' % jspec.jobName
        lfspec.destinationDBlock = jspec.destinationDBlock
        lfspec.destinationSE = jspec.destinationSE
        lfspec.dataset = jspec.destinationDBlock
        lfspec.type = 'log'
        jspec.addFile(lfspec)

        return jspec
Exemple #21
0
    def master_resubmit(self, jobs):
        '''Resubmit failed Jedi jobs'''
        from pandatools import Client

        jobIDs = {}
        for job in jobs:
            jobIDs[job.backend.id] = job

        allJobIDs = jobIDs.keys()
        pandaJobIDs = {}
        for jID in allJobIDs:
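            # look up the Jedi task behind this backend id and ask the Panda server to retry it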
            status, jediTaskDict = Client.getJediTaskDetails(
                {'jediTaskID': jID}, False, True, verbose=False)
            if status != 0:
                logger.error("Failed to get task details for %s" % jID)
                raise BackendError(
                    'Jedi',
                    'Return code %d retrieving task details.' % status)

            # Retrieve job
            job = jobIDs[jediTaskDict['jediTaskID']]

            newJobsetID = -1  # get jobset
            retryJobs = []  # jspecs
            resubmittedJobs = []  # ganga jobs

            if jediTaskDict['status'] in [
                    'failed', 'killed', 'cancelled', 'aborted', 'broken',
                    'finished'
            ]:
                retryJobs.append(job)
                resubmittedJobs.append(jID)
            #elif jediTaskDict['status'] == 'finished':
            #    pass
            else:
                logger.warning("Cannot resubmit. Jedi task %s is status %s." %
                               (jID, jediTaskDict['status']))
                return False

            # submit
            if len(retryJobs) == 0:
                logger.warning("No failed jobs to resubmit")
                return False

            status, out = Client.retryTask(jID, verbose=False)
            if status != 0:
                logger.error(status)
                logger.error(out)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            tmpStat, tmpDiag = out
            if not tmpStat:
                logger.error(tmpDiag)
                logger.error("Failed to retry JobID=%s" % jID)
                return False
            logger.info(tmpDiag)

            job.backend.status = None
            job.backend.jobSpec = {}
            job.updateStatus('submitted')

        logger.info('Resubmission successful')
        return True